chuk-puzzles-gym 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chuk_puzzles_gym/eval.py +168 -46
- chuk_puzzles_gym/export/dataset.py +7 -1
- chuk_puzzles_gym/games/_base/game.py +123 -0
- chuk_puzzles_gym/gym_env.py +21 -5
- chuk_puzzles_gym/models/__init__.py +2 -0
- chuk_puzzles_gym/models/evaluation.py +165 -1
- chuk_puzzles_gym/server.py +34 -3
- {chuk_puzzles_gym-0.10.1.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/METADATA +124 -7
- {chuk_puzzles_gym-0.10.1.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/RECORD +12 -12
- {chuk_puzzles_gym-0.10.1.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/WHEEL +0 -0
- {chuk_puzzles_gym-0.10.1.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/entry_points.txt +0 -0
- {chuk_puzzles_gym-0.10.1.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/top_level.txt +0 -0
chuk_puzzles_gym/eval.py
CHANGED
|
@@ -25,7 +25,7 @@ import sys
|
|
|
25
25
|
import time
|
|
26
26
|
from dataclasses import dataclass, field
|
|
27
27
|
from datetime import datetime
|
|
28
|
-
from typing import TYPE_CHECKING
|
|
28
|
+
from typing import TYPE_CHECKING, Any
|
|
29
29
|
|
|
30
30
|
if TYPE_CHECKING:
|
|
31
31
|
pass
|
|
@@ -100,8 +100,36 @@ class EvaluationReport:
|
|
|
100
100
|
return 0.0
|
|
101
101
|
return sum(e.hints_used for e in self.episodes) / self.total_episodes
|
|
102
102
|
|
|
103
|
+
@property
|
|
104
|
+
def avg_backtrack_rate(self) -> float:
|
|
105
|
+
"""Average backtrack rate across episodes with reasoning metrics."""
|
|
106
|
+
with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
|
|
107
|
+
if not with_metrics:
|
|
108
|
+
return 0.0
|
|
109
|
+
return sum(e.reasoning_metrics.backtrack_rate for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
|
|
110
|
+
|
|
111
|
+
@property
|
|
112
|
+
def avg_reasoning_overhead(self) -> float:
|
|
113
|
+
"""Average reasoning overhead across episodes with reasoning metrics."""
|
|
114
|
+
with_metrics = [
|
|
115
|
+
e for e in self.episodes if e.reasoning_metrics is not None and e.reasoning_metrics.reasoning_overhead > 0
|
|
116
|
+
]
|
|
117
|
+
if not with_metrics:
|
|
118
|
+
return 0.0
|
|
119
|
+
return sum(e.reasoning_metrics.reasoning_overhead for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
|
|
120
|
+
|
|
121
|
+
@property
|
|
122
|
+
def avg_progress_steadiness(self) -> float:
|
|
123
|
+
"""Average progress steadiness across episodes with reasoning metrics."""
|
|
124
|
+
with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
|
|
125
|
+
if not with_metrics:
|
|
126
|
+
return 0.0
|
|
127
|
+
return sum(e.reasoning_metrics.progress_steadiness for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
|
|
128
|
+
|
|
103
129
|
def to_markdown(self) -> str:
|
|
104
130
|
"""Generate markdown report."""
|
|
131
|
+
has_reasoning = any(e.reasoning_metrics is not None for e in self.episodes)
|
|
132
|
+
|
|
105
133
|
lines = [
|
|
106
134
|
f"# {self.game.title()} {self.difficulty.title()} Evaluation",
|
|
107
135
|
"",
|
|
@@ -112,24 +140,78 @@ class EvaluationReport:
|
|
|
112
140
|
f"**Avg Hints:** {self.avg_hints:.1f}",
|
|
113
141
|
f"**Avg Efficiency:** {self.avg_efficiency:.1%}",
|
|
114
142
|
f"**Avg Time:** {self.avg_time_ms:.0f}ms",
|
|
115
|
-
"",
|
|
116
|
-
f"**Solver Config:** {'solver-free' if not self.solver_config.solver_allowed else f'budget={self.solver_config.hint_budget}, penalty={self.solver_config.hint_penalty}'}",
|
|
117
|
-
"",
|
|
118
|
-
"## Episode Details",
|
|
119
|
-
"",
|
|
120
|
-
"| Seed | Status | Steps | Invalid | Hints | Efficiency | Time (ms) |",
|
|
121
|
-
"|------|--------|-------|---------|-------|------------|-----------|",
|
|
122
143
|
]
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
144
|
+
|
|
145
|
+
if has_reasoning:
|
|
146
|
+
lines.extend(
|
|
147
|
+
[
|
|
148
|
+
"",
|
|
149
|
+
"### Reasoning Depth",
|
|
150
|
+
f"**Avg Backtrack Rate:** {self.avg_backtrack_rate:.1%}",
|
|
151
|
+
f"**Avg Reasoning Overhead:** {self.avg_reasoning_overhead:.2f}x",
|
|
152
|
+
f"**Avg Progress Steadiness:** {self.avg_progress_steadiness:.1%}",
|
|
153
|
+
]
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
lines.extend(
|
|
157
|
+
[
|
|
158
|
+
"",
|
|
159
|
+
f"**Solver Config:** {'solver-free' if not self.solver_config.solver_allowed else f'budget={self.solver_config.hint_budget}, penalty={self.solver_config.hint_penalty}'}",
|
|
160
|
+
"",
|
|
161
|
+
"## Episode Details",
|
|
162
|
+
"",
|
|
163
|
+
]
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
if has_reasoning:
|
|
167
|
+
lines.append(
|
|
168
|
+
"| Seed | Status | Steps | Invalid | Hints | Efficiency | Backtracks | Steadiness | Time (ms) |"
|
|
169
|
+
)
|
|
126
170
|
lines.append(
|
|
127
|
-
|
|
171
|
+
"|------|--------|-------|---------|-------|------------|------------|------------|-----------|"
|
|
128
172
|
)
|
|
173
|
+
for e in self.episodes:
|
|
174
|
+
status = "solved" if e.success else e.status.value
|
|
175
|
+
eff = f"{e.efficiency_score:.0%}" if e.success else "-"
|
|
176
|
+
bt = str(e.reasoning_metrics.backtrack_count) if e.reasoning_metrics else "-"
|
|
177
|
+
st = f"{e.reasoning_metrics.progress_steadiness:.0%}" if e.reasoning_metrics else "-"
|
|
178
|
+
lines.append(
|
|
179
|
+
f"| {e.seed} | {status} | {e.steps_taken} | {e.invalid_actions} | {e.hints_used} | {eff} | {bt} | {st} | {e.wall_time_ms} |"
|
|
180
|
+
)
|
|
181
|
+
else:
|
|
182
|
+
lines.append("| Seed | Status | Steps | Invalid | Hints | Efficiency | Time (ms) |")
|
|
183
|
+
lines.append("|------|--------|-------|---------|-------|------------|-----------|")
|
|
184
|
+
for e in self.episodes:
|
|
185
|
+
status = "solved" if e.success else e.status.value
|
|
186
|
+
eff = f"{e.efficiency_score:.0%}" if e.success else "-"
|
|
187
|
+
lines.append(
|
|
188
|
+
f"| {e.seed} | {status} | {e.steps_taken} | {e.invalid_actions} | {e.hints_used} | {eff} | {e.wall_time_ms} |"
|
|
189
|
+
)
|
|
190
|
+
|
|
129
191
|
return "\n".join(lines)
|
|
130
192
|
|
|
131
193
|
def to_json(self) -> str:
|
|
132
194
|
"""Generate JSON report."""
|
|
195
|
+
summary: dict[str, Any] = {
|
|
196
|
+
"total_episodes": self.total_episodes,
|
|
197
|
+
"solved_count": self.solved_count,
|
|
198
|
+
"solve_rate": self.solve_rate,
|
|
199
|
+
"avg_steps": self.avg_moves,
|
|
200
|
+
"avg_invalid": self.avg_invalid_moves,
|
|
201
|
+
"avg_hints": self.avg_hints,
|
|
202
|
+
"avg_efficiency": self.avg_efficiency,
|
|
203
|
+
"avg_time_ms": self.avg_time_ms,
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
# Add aggregate reasoning metrics if available
|
|
207
|
+
has_reasoning = any(e.reasoning_metrics is not None for e in self.episodes)
|
|
208
|
+
if has_reasoning:
|
|
209
|
+
summary["reasoning"] = {
|
|
210
|
+
"avg_backtrack_rate": round(self.avg_backtrack_rate, 3),
|
|
211
|
+
"avg_reasoning_overhead": round(self.avg_reasoning_overhead, 3),
|
|
212
|
+
"avg_progress_steadiness": round(self.avg_progress_steadiness, 3),
|
|
213
|
+
}
|
|
214
|
+
|
|
133
215
|
return json.dumps(
|
|
134
216
|
{
|
|
135
217
|
"game": self.game,
|
|
@@ -139,16 +221,7 @@ class EvaluationReport:
|
|
|
139
221
|
"hint_budget": self.solver_config.hint_budget,
|
|
140
222
|
"hint_penalty": self.solver_config.hint_penalty,
|
|
141
223
|
},
|
|
142
|
-
"summary":
|
|
143
|
-
"total_episodes": self.total_episodes,
|
|
144
|
-
"solved_count": self.solved_count,
|
|
145
|
-
"solve_rate": self.solve_rate,
|
|
146
|
-
"avg_steps": self.avg_moves,
|
|
147
|
-
"avg_invalid": self.avg_invalid_moves,
|
|
148
|
-
"avg_hints": self.avg_hints,
|
|
149
|
-
"avg_efficiency": self.avg_efficiency,
|
|
150
|
-
"avg_time_ms": self.avg_time_ms,
|
|
151
|
-
},
|
|
224
|
+
"summary": summary,
|
|
152
225
|
"episodes": [e.to_summary_dict() for e in self.episodes],
|
|
153
226
|
},
|
|
154
227
|
indent=2,
|
|
@@ -158,35 +231,61 @@ class EvaluationReport:
|
|
|
158
231
|
"""Generate CSV report."""
|
|
159
232
|
import io
|
|
160
233
|
|
|
234
|
+
has_reasoning = any(e.reasoning_metrics is not None for e in self.episodes)
|
|
235
|
+
|
|
161
236
|
output = io.StringIO()
|
|
162
237
|
writer = csv.writer(output)
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
writer.writerow(
|
|
238
|
+
|
|
239
|
+
header = [
|
|
240
|
+
"game",
|
|
241
|
+
"difficulty",
|
|
242
|
+
"seed",
|
|
243
|
+
"status",
|
|
244
|
+
"steps_taken",
|
|
245
|
+
"invalid_actions",
|
|
246
|
+
"hints_used",
|
|
247
|
+
"efficiency",
|
|
248
|
+
"wall_time_ms",
|
|
249
|
+
]
|
|
250
|
+
if has_reasoning:
|
|
251
|
+
header.extend(
|
|
178
252
|
[
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
e.invalid_actions,
|
|
185
|
-
e.hints_used,
|
|
186
|
-
f"{e.efficiency_score:.3f}",
|
|
187
|
-
e.wall_time_ms,
|
|
253
|
+
"backtrack_count",
|
|
254
|
+
"backtrack_rate",
|
|
255
|
+
"reasoning_overhead",
|
|
256
|
+
"progress_steadiness",
|
|
257
|
+
"error_streak_max",
|
|
188
258
|
]
|
|
189
259
|
)
|
|
260
|
+
writer.writerow(header)
|
|
261
|
+
|
|
262
|
+
for e in self.episodes:
|
|
263
|
+
row = [
|
|
264
|
+
e.game,
|
|
265
|
+
e.difficulty.value,
|
|
266
|
+
e.seed,
|
|
267
|
+
e.status.value,
|
|
268
|
+
e.steps_taken,
|
|
269
|
+
e.invalid_actions,
|
|
270
|
+
e.hints_used,
|
|
271
|
+
f"{e.efficiency_score:.3f}",
|
|
272
|
+
e.wall_time_ms,
|
|
273
|
+
]
|
|
274
|
+
if has_reasoning:
|
|
275
|
+
rm = e.reasoning_metrics
|
|
276
|
+
if rm is not None:
|
|
277
|
+
row.extend(
|
|
278
|
+
[
|
|
279
|
+
rm.backtrack_count,
|
|
280
|
+
f"{rm.backtrack_rate:.3f}",
|
|
281
|
+
f"{rm.reasoning_overhead:.3f}",
|
|
282
|
+
f"{rm.progress_steadiness:.3f}",
|
|
283
|
+
rm.error_streak_max,
|
|
284
|
+
]
|
|
285
|
+
)
|
|
286
|
+
else:
|
|
287
|
+
row.extend(["", "", "", "", ""])
|
|
288
|
+
writer.writerow(row)
|
|
190
289
|
return output.getvalue()
|
|
191
290
|
|
|
192
291
|
def print_summary(self) -> None:
|
|
@@ -206,6 +305,15 @@ class EvaluationReport:
|
|
|
206
305
|
print(f"Avg Efficiency: {self.avg_efficiency:.1%}")
|
|
207
306
|
print(f"Avg Time: {self.avg_time_ms:.0f}ms")
|
|
208
307
|
|
|
308
|
+
# Reasoning depth metrics
|
|
309
|
+
has_reasoning = any(e.reasoning_metrics is not None for e in self.episodes)
|
|
310
|
+
if has_reasoning:
|
|
311
|
+
print("-" * 40)
|
|
312
|
+
print("Reasoning Depth:")
|
|
313
|
+
print(f" Backtrack Rate: {self.avg_backtrack_rate:.1%}")
|
|
314
|
+
print(f" Reasoning Overhead: {self.avg_reasoning_overhead:.2f}x")
|
|
315
|
+
print(f" Progress Steadiness: {self.avg_progress_steadiness:.1%}")
|
|
316
|
+
|
|
209
317
|
|
|
210
318
|
async def _apply_hint(game: PuzzleGame, hint_data: tuple) -> MoveResult:
|
|
211
319
|
"""Apply a hint to the game based on game type.
|
|
@@ -433,15 +541,22 @@ async def run_episode(
|
|
|
433
541
|
# Apply the hint based on game type
|
|
434
542
|
try:
|
|
435
543
|
result = await _apply_hint(game, hint_data)
|
|
544
|
+
# Normalize hint_data to a tuple for position tracking
|
|
545
|
+
position = hint_data if isinstance(hint_data, tuple) else (hint_data,)
|
|
436
546
|
if result.success:
|
|
437
547
|
steps_taken += 1
|
|
548
|
+
# Use game's dynamic optimal_steps (reflects current state)
|
|
549
|
+
remaining = game.optimal_steps or 0
|
|
550
|
+
game.reasoning_tracker.record_valid_move(position, remaining)
|
|
438
551
|
else:
|
|
439
552
|
invalid_actions += 1
|
|
553
|
+
game.reasoning_tracker.record_invalid_move()
|
|
440
554
|
# If we get too many consecutive invalid moves, break
|
|
441
555
|
if invalid_actions > 50:
|
|
442
556
|
break
|
|
443
557
|
except (TypeError, ValueError, AttributeError, IndexError):
|
|
444
558
|
invalid_actions += 1
|
|
559
|
+
game.reasoning_tracker.record_invalid_move()
|
|
445
560
|
if invalid_actions > 50:
|
|
446
561
|
break
|
|
447
562
|
elif not use_hints:
|
|
@@ -461,6 +576,12 @@ async def run_episode(
|
|
|
461
576
|
# Get retries from game if tracked
|
|
462
577
|
retries = getattr(game, "retries", 0)
|
|
463
578
|
|
|
579
|
+
# Collect reasoning depth metrics (use pre-solve optimal_steps since
|
|
580
|
+
# the game's optimal_steps may be 0 after solving)
|
|
581
|
+
reasoning_metrics = game.reasoning_tracker.to_metrics(
|
|
582
|
+
optimal_path_length=optimal_steps if optimal_steps and optimal_steps >= 1 else None,
|
|
583
|
+
)
|
|
584
|
+
|
|
464
585
|
return EpisodeResult(
|
|
465
586
|
game=game.name,
|
|
466
587
|
difficulty=DifficultyLevel(difficulty),
|
|
@@ -475,6 +596,7 @@ async def run_episode(
|
|
|
475
596
|
retries=retries,
|
|
476
597
|
optimal_steps=optimal_steps,
|
|
477
598
|
solver_config=solver_config,
|
|
599
|
+
reasoning_metrics=reasoning_metrics,
|
|
478
600
|
)
|
|
479
601
|
|
|
480
602
|
|
|
@@ -190,6 +190,12 @@ class DatasetExporter:
|
|
|
190
190
|
if canonical:
|
|
191
191
|
gold_answer = str(canonical)
|
|
192
192
|
|
|
193
|
+
# Build reasoning tags from complexity profile
|
|
194
|
+
complexity_profile = game.complexity_profile
|
|
195
|
+
reasoning_type = complexity_profile.get("reasoning_type", "deductive")
|
|
196
|
+
search_space = complexity_profile.get("search_space", "medium")
|
|
197
|
+
tags = [domain, difficulty.value, f"reasoning:{reasoning_type}", f"search:{search_space}"]
|
|
198
|
+
|
|
193
199
|
# Create Problem using core schema
|
|
194
200
|
return Problem(
|
|
195
201
|
# Identity
|
|
@@ -214,7 +220,7 @@ class DatasetExporter:
|
|
|
214
220
|
),
|
|
215
221
|
# Metadata
|
|
216
222
|
operation_count=game.optimal_steps,
|
|
217
|
-
tags=
|
|
223
|
+
tags=tags,
|
|
218
224
|
)
|
|
219
225
|
|
|
220
226
|
@property
|
|
@@ -1,10 +1,99 @@
|
|
|
1
1
|
"""Abstract base class for all puzzle games."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import random
|
|
4
6
|
from abc import ABC, abstractmethod
|
|
5
7
|
from typing import Any
|
|
6
8
|
|
|
7
9
|
from ...models import DifficultyLevel, DifficultyProfile, MoveResult, SolverConfig
|
|
10
|
+
from ...models.evaluation import ReasoningMetrics
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ReasoningTracker:
|
|
14
|
+
"""Tracks reasoning depth metrics during puzzle gameplay.
|
|
15
|
+
|
|
16
|
+
Accumulates data about backtrack behavior, solver distance progression,
|
|
17
|
+
and error patterns. Produces a ReasoningMetrics snapshot on demand.
|
|
18
|
+
|
|
19
|
+
This is a lightweight, non-Pydantic class meant to be mutated during play.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
__slots__ = (
|
|
23
|
+
"_placed_positions",
|
|
24
|
+
"_solver_distance_trace",
|
|
25
|
+
"_backtrack_count",
|
|
26
|
+
"_consecutive_errors",
|
|
27
|
+
"_error_streaks",
|
|
28
|
+
"_max_error_streak",
|
|
29
|
+
"_total_actions",
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
def __init__(self) -> None:
|
|
33
|
+
self._placed_positions: set[tuple[Any, ...]] = set()
|
|
34
|
+
self._solver_distance_trace: list[int] = []
|
|
35
|
+
self._backtrack_count: int = 0
|
|
36
|
+
self._consecutive_errors: int = 0
|
|
37
|
+
self._error_streaks: list[int] = []
|
|
38
|
+
self._max_error_streak: int = 0
|
|
39
|
+
self._total_actions: int = 0
|
|
40
|
+
|
|
41
|
+
def record_valid_move(self, position: tuple[Any, ...], remaining_count: int) -> None:
|
|
42
|
+
"""Record a valid (successful) move.
|
|
43
|
+
|
|
44
|
+
Args:
|
|
45
|
+
position: The position/target of the move (for backtrack detection)
|
|
46
|
+
remaining_count: How many positions remain to be filled after this move
|
|
47
|
+
"""
|
|
48
|
+
self._total_actions += 1
|
|
49
|
+
|
|
50
|
+
# Detect backtrack: placing at a position already placed before
|
|
51
|
+
if position in self._placed_positions:
|
|
52
|
+
self._backtrack_count += 1
|
|
53
|
+
self._placed_positions.add(position)
|
|
54
|
+
|
|
55
|
+
self._solver_distance_trace.append(remaining_count)
|
|
56
|
+
|
|
57
|
+
# Finalize any pending error streak
|
|
58
|
+
if self._consecutive_errors > 0:
|
|
59
|
+
self._error_streaks.append(self._consecutive_errors)
|
|
60
|
+
self._consecutive_errors = 0
|
|
61
|
+
|
|
62
|
+
def record_invalid_move(self) -> None:
|
|
63
|
+
"""Record an invalid (failed) move."""
|
|
64
|
+
self._total_actions += 1
|
|
65
|
+
self._consecutive_errors += 1
|
|
66
|
+
self._max_error_streak = max(self._max_error_streak, self._consecutive_errors)
|
|
67
|
+
|
|
68
|
+
def to_metrics(self, optimal_path_length: int | None = None) -> ReasoningMetrics:
|
|
69
|
+
"""Produce a frozen ReasoningMetrics snapshot.
|
|
70
|
+
|
|
71
|
+
Args:
|
|
72
|
+
optimal_path_length: Minimum steps to solve (from solver), if known.
|
|
73
|
+
"""
|
|
74
|
+
# Finalize any pending error streak
|
|
75
|
+
error_streaks = list(self._error_streaks)
|
|
76
|
+
if self._consecutive_errors > 0:
|
|
77
|
+
error_streaks.append(self._consecutive_errors)
|
|
78
|
+
|
|
79
|
+
return ReasoningMetrics(
|
|
80
|
+
backtrack_count=self._backtrack_count,
|
|
81
|
+
solver_distance_trace=list(self._solver_distance_trace),
|
|
82
|
+
error_streak_max=self._max_error_streak,
|
|
83
|
+
error_streaks=error_streaks,
|
|
84
|
+
total_actions=self._total_actions,
|
|
85
|
+
optimal_path_length=optimal_path_length,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
def reset(self) -> None:
|
|
89
|
+
"""Reset all tracked state."""
|
|
90
|
+
self._placed_positions.clear()
|
|
91
|
+
self._solver_distance_trace.clear()
|
|
92
|
+
self._backtrack_count = 0
|
|
93
|
+
self._consecutive_errors = 0
|
|
94
|
+
self._error_streaks.clear()
|
|
95
|
+
self._max_error_streak = 0
|
|
96
|
+
self._total_actions = 0
|
|
8
97
|
|
|
9
98
|
|
|
10
99
|
class PuzzleGame(ABC):
|
|
@@ -64,6 +153,9 @@ class PuzzleGame(ABC):
|
|
|
64
153
|
self.game_started = False
|
|
65
154
|
self._last_move_position: tuple[Any, ...] | None = None # For retry detection
|
|
66
155
|
|
|
156
|
+
# Reasoning depth tracker
|
|
157
|
+
self._reasoning_tracker = ReasoningTracker()
|
|
158
|
+
|
|
67
159
|
@abstractmethod
|
|
68
160
|
async def generate_puzzle(self) -> None:
|
|
69
161
|
"""Generate a new puzzle with a unique solution.
|
|
@@ -162,8 +254,11 @@ class PuzzleGame(ABC):
|
|
|
162
254
|
"""
|
|
163
255
|
if success:
|
|
164
256
|
self.moves_made += 1
|
|
257
|
+
remaining = self._compute_remaining()
|
|
258
|
+
self._reasoning_tracker.record_valid_move(position, remaining)
|
|
165
259
|
else:
|
|
166
260
|
self.invalid_moves += 1
|
|
261
|
+
self._reasoning_tracker.record_invalid_move()
|
|
167
262
|
|
|
168
263
|
# Detect retries (same position attempted again)
|
|
169
264
|
if self._last_move_position == position:
|
|
@@ -183,6 +278,34 @@ class PuzzleGame(ABC):
|
|
|
183
278
|
self.hints_used += 1
|
|
184
279
|
return True
|
|
185
280
|
|
|
281
|
+
def _compute_remaining(self) -> int:
|
|
282
|
+
"""Compute how many positions remain to be filled.
|
|
283
|
+
|
|
284
|
+
Uses optimal_steps directly since it is typically dynamic
|
|
285
|
+
(reflects current game state, e.g. counting empty cells).
|
|
286
|
+
Override in subclasses for more accurate tracking.
|
|
287
|
+
"""
|
|
288
|
+
return self.optimal_steps or 0
|
|
289
|
+
|
|
290
|
+
def get_reasoning_metrics(self) -> ReasoningMetrics:
|
|
291
|
+
"""Get a snapshot of reasoning depth metrics for the current episode.
|
|
292
|
+
|
|
293
|
+
Returns:
|
|
294
|
+
Frozen ReasoningMetrics with all tracked data.
|
|
295
|
+
"""
|
|
296
|
+
optimal = self.optimal_steps
|
|
297
|
+
# optimal_path_length requires ge=1; treat 0 or negative as unknown
|
|
298
|
+
if optimal is not None and optimal < 1:
|
|
299
|
+
optimal = None
|
|
300
|
+
return self._reasoning_tracker.to_metrics(
|
|
301
|
+
optimal_path_length=optimal,
|
|
302
|
+
)
|
|
303
|
+
|
|
304
|
+
@property
|
|
305
|
+
def reasoning_tracker(self) -> ReasoningTracker:
|
|
306
|
+
"""Access the reasoning tracker directly."""
|
|
307
|
+
return self._reasoning_tracker
|
|
308
|
+
|
|
186
309
|
def can_use_hint(self) -> bool:
|
|
187
310
|
"""Check if hints are available without consuming one.
|
|
188
311
|
|
chuk_puzzles_gym/gym_env.py
CHANGED
|
@@ -197,6 +197,7 @@ class PuzzleEnv:
|
|
|
197
197
|
result = await self._execute_action(cmd, args)
|
|
198
198
|
except Exception as e:
|
|
199
199
|
self._game.invalid_moves += 1
|
|
200
|
+
self._game.reasoning_tracker.record_invalid_move()
|
|
200
201
|
return (
|
|
201
202
|
self._get_observation(),
|
|
202
203
|
self.reward_config["invalid_attempt"],
|
|
@@ -207,17 +208,25 @@ class PuzzleEnv:
|
|
|
207
208
|
|
|
208
209
|
self._step_count += 1
|
|
209
210
|
|
|
211
|
+
# Build position tuple from parsed args for reasoning tracker
|
|
212
|
+
position = tuple(args)
|
|
213
|
+
|
|
210
214
|
# Calculate reward
|
|
211
215
|
if result.success:
|
|
212
216
|
reward = self.reward_config["correct_placement"]
|
|
213
217
|
|
|
218
|
+
# Feed reasoning tracker
|
|
219
|
+
# optimal_steps is dynamic (reflects current state), so use it directly
|
|
220
|
+
remaining = self._game.optimal_steps or 0
|
|
221
|
+
self._game.reasoning_tracker.record_valid_move(position, remaining)
|
|
222
|
+
|
|
214
223
|
# Check for completion
|
|
215
224
|
terminated = self._game.is_complete()
|
|
216
225
|
if terminated:
|
|
217
226
|
# Add completion bonus with efficiency multiplier
|
|
218
|
-
|
|
219
|
-
if
|
|
220
|
-
efficiency = min(1.0,
|
|
227
|
+
opt = self._game.optimal_steps
|
|
228
|
+
if opt and self._game.moves_made > 0:
|
|
229
|
+
efficiency = min(1.0, opt / self._game.moves_made)
|
|
221
230
|
else:
|
|
222
231
|
efficiency = 1.0
|
|
223
232
|
reward += (
|
|
@@ -226,11 +235,12 @@ class PuzzleEnv:
|
|
|
226
235
|
else:
|
|
227
236
|
reward = self.reward_config["invalid_attempt"]
|
|
228
237
|
self._game.invalid_moves += 1
|
|
238
|
+
self._game.reasoning_tracker.record_invalid_move()
|
|
229
239
|
terminated = False
|
|
230
240
|
|
|
231
241
|
truncated = self._step_count >= self.max_steps
|
|
232
242
|
|
|
233
|
-
info = {
|
|
243
|
+
info: dict[str, Any] = {
|
|
234
244
|
"action": action_str,
|
|
235
245
|
"success": result.success,
|
|
236
246
|
"message": result.message,
|
|
@@ -239,6 +249,10 @@ class PuzzleEnv:
|
|
|
239
249
|
"hints_used": self._game.hints_used,
|
|
240
250
|
}
|
|
241
251
|
|
|
252
|
+
# Include reasoning metrics on episode end
|
|
253
|
+
if terminated or truncated:
|
|
254
|
+
info["reasoning_metrics"] = self._game.get_reasoning_metrics().to_dict()
|
|
255
|
+
|
|
242
256
|
return self._get_observation(), reward, terminated, truncated, info
|
|
243
257
|
|
|
244
258
|
async def _execute_action(self, cmd: str, args: list[str]) -> Any:
|
|
@@ -371,7 +385,7 @@ class PuzzleEnv:
|
|
|
371
385
|
if self._game is None:
|
|
372
386
|
return {"error": "no_game"}
|
|
373
387
|
|
|
374
|
-
obs = {
|
|
388
|
+
obs: dict[str, Any] = {
|
|
375
389
|
"game": self._game.name,
|
|
376
390
|
"difficulty": self._game.difficulty.value,
|
|
377
391
|
"seed": self._game.seed,
|
|
@@ -397,6 +411,7 @@ class PuzzleEnv:
|
|
|
397
411
|
return {}
|
|
398
412
|
|
|
399
413
|
profile = self._game.difficulty_profile
|
|
414
|
+
reasoning = self._game.get_reasoning_metrics()
|
|
400
415
|
return {
|
|
401
416
|
"optimal_steps": self._game.optimal_steps,
|
|
402
417
|
"difficulty_profile": {
|
|
@@ -411,6 +426,7 @@ class PuzzleEnv:
|
|
|
411
426
|
"hint_budget": self.solver_config.hint_budget,
|
|
412
427
|
"hint_penalty": self.solver_config.hint_penalty,
|
|
413
428
|
},
|
|
429
|
+
"reasoning_metrics": reasoning.to_dict(),
|
|
414
430
|
}
|
|
415
431
|
|
|
416
432
|
def render(self, mode: str = "ansi") -> str | None:
|
|
@@ -20,6 +20,7 @@ from .evaluation import (
|
|
|
20
20
|
EpisodeTracer,
|
|
21
21
|
EvaluationSummary,
|
|
22
22
|
MoveRecord,
|
|
23
|
+
ReasoningMetrics,
|
|
23
24
|
SolverConfig,
|
|
24
25
|
TraceEvent,
|
|
25
26
|
)
|
|
@@ -42,6 +43,7 @@ __all__ = [
|
|
|
42
43
|
"EpisodeTracer",
|
|
43
44
|
"EvaluationSummary",
|
|
44
45
|
"MoveRecord",
|
|
46
|
+
"ReasoningMetrics",
|
|
45
47
|
"SolverConfig",
|
|
46
48
|
"TraceEvent",
|
|
47
49
|
]
|
|
@@ -38,6 +38,132 @@ class MoveRecord(BaseModel):
|
|
|
38
38
|
timestamp_ms: int = Field(default=0, description="Milliseconds since episode start")
|
|
39
39
|
|
|
40
40
|
|
|
41
|
+
class ReasoningMetrics(BaseModel):
|
|
42
|
+
"""Reasoning depth metrics for evaluating quality of agent reasoning.
|
|
43
|
+
|
|
44
|
+
Goes beyond binary success/failure to measure *how* an agent reasons:
|
|
45
|
+
- Backtrack detection: did the agent revise previous placements?
|
|
46
|
+
- Progress tracking: how steadily did the agent make progress?
|
|
47
|
+
- Error patterns: were errors isolated or clustered in streaks?
|
|
48
|
+
- Reasoning overhead: how much wasted work relative to optimal?
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
model_config = ConfigDict(frozen=True)
|
|
52
|
+
|
|
53
|
+
# Raw tracking data
|
|
54
|
+
backtrack_count: int = Field(
|
|
55
|
+
default=0,
|
|
56
|
+
ge=0,
|
|
57
|
+
description="Times agent placed a value at a previously filled position",
|
|
58
|
+
)
|
|
59
|
+
solver_distance_trace: list[int] = Field(
|
|
60
|
+
default_factory=list,
|
|
61
|
+
description="Remaining positions to fill after each valid move",
|
|
62
|
+
)
|
|
63
|
+
error_streak_max: int = Field(
|
|
64
|
+
default=0,
|
|
65
|
+
ge=0,
|
|
66
|
+
description="Longest consecutive run of invalid moves",
|
|
67
|
+
)
|
|
68
|
+
error_streaks: list[int] = Field(
|
|
69
|
+
default_factory=list,
|
|
70
|
+
description="Lengths of each consecutive error streak",
|
|
71
|
+
)
|
|
72
|
+
total_actions: int = Field(
|
|
73
|
+
default=0,
|
|
74
|
+
ge=0,
|
|
75
|
+
description="Total actions taken (valid + invalid)",
|
|
76
|
+
)
|
|
77
|
+
optimal_path_length: int | None = Field(
|
|
78
|
+
default=None,
|
|
79
|
+
ge=1,
|
|
80
|
+
description="Minimum steps to solve (from solver)",
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
@computed_field
|
|
84
|
+
@property
|
|
85
|
+
def reasoning_overhead(self) -> float:
|
|
86
|
+
"""Ratio of total actions to optimal path length.
|
|
87
|
+
|
|
88
|
+
1.0 = perfect (no wasted actions). Higher = more wasted reasoning.
|
|
89
|
+
Returns 0.0 if optimal path length is unknown.
|
|
90
|
+
"""
|
|
91
|
+
if self.optimal_path_length is None or self.optimal_path_length == 0:
|
|
92
|
+
return 0.0
|
|
93
|
+
if self.total_actions == 0:
|
|
94
|
+
return 0.0
|
|
95
|
+
return self.total_actions / self.optimal_path_length
|
|
96
|
+
|
|
97
|
+
@computed_field
|
|
98
|
+
@property
|
|
99
|
+
def backtrack_rate(self) -> float:
|
|
100
|
+
"""Fraction of valid moves that were backtracks (revisions).
|
|
101
|
+
|
|
102
|
+
0.0 = no backtracks, 1.0 = every move was a revision.
|
|
103
|
+
"""
|
|
104
|
+
valid_moves = len(self.solver_distance_trace)
|
|
105
|
+
if valid_moves == 0:
|
|
106
|
+
return 0.0
|
|
107
|
+
return self.backtrack_count / valid_moves
|
|
108
|
+
|
|
109
|
+
@computed_field
|
|
110
|
+
@property
|
|
111
|
+
def progress_velocity(self) -> float:
|
|
112
|
+
"""Average progress per valid move (cells solved per step).
|
|
113
|
+
|
|
114
|
+
Measures how much closer to the solution each move gets.
|
|
115
|
+
1.0 = every move reduces remaining by exactly 1. Lower = backtracks/plateaus.
|
|
116
|
+
Returns 0.0 if insufficient data.
|
|
117
|
+
"""
|
|
118
|
+
trace = self.solver_distance_trace
|
|
119
|
+
if len(trace) < 2:
|
|
120
|
+
return 0.0
|
|
121
|
+
total_progress = trace[0] - trace[-1]
|
|
122
|
+
steps = len(trace) - 1
|
|
123
|
+
if steps == 0:
|
|
124
|
+
return 0.0
|
|
125
|
+
return total_progress / steps
|
|
126
|
+
|
|
127
|
+
@computed_field
|
|
128
|
+
@property
|
|
129
|
+
def progress_steadiness(self) -> float:
|
|
130
|
+
"""Measure of how monotonically progress decreased (0.0 to 1.0).
|
|
131
|
+
|
|
132
|
+
1.0 = perfectly monotonic progress (every move reduced remaining count).
|
|
133
|
+
0.0 = no monotonic progress at all.
|
|
134
|
+
"""
|
|
135
|
+
trace = self.solver_distance_trace
|
|
136
|
+
if len(trace) < 2:
|
|
137
|
+
return 1.0
|
|
138
|
+
monotonic_steps = sum(1 for i in range(1, len(trace)) if trace[i] < trace[i - 1])
|
|
139
|
+
return monotonic_steps / (len(trace) - 1)
|
|
140
|
+
|
|
141
|
+
@computed_field
|
|
142
|
+
@property
|
|
143
|
+
def avg_error_streak(self) -> float:
|
|
144
|
+
"""Average length of consecutive error streaks.
|
|
145
|
+
|
|
146
|
+
Returns 0.0 if no error streaks occurred.
|
|
147
|
+
"""
|
|
148
|
+
if not self.error_streaks:
|
|
149
|
+
return 0.0
|
|
150
|
+
return sum(self.error_streaks) / len(self.error_streaks)
|
|
151
|
+
|
|
152
|
+
def to_dict(self) -> dict[str, Any]:
|
|
153
|
+
"""Convert to flat dictionary for reporting."""
|
|
154
|
+
return {
|
|
155
|
+
"backtrack_count": self.backtrack_count,
|
|
156
|
+
"backtrack_rate": round(self.backtrack_rate, 3),
|
|
157
|
+
"reasoning_overhead": round(self.reasoning_overhead, 3),
|
|
158
|
+
"progress_velocity": round(self.progress_velocity, 3),
|
|
159
|
+
"progress_steadiness": round(self.progress_steadiness, 3),
|
|
160
|
+
"error_streak_max": self.error_streak_max,
|
|
161
|
+
"avg_error_streak": round(self.avg_error_streak, 3),
|
|
162
|
+
"total_actions": self.total_actions,
|
|
163
|
+
"optimal_path_length": self.optimal_path_length,
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
|
|
41
167
|
class EpisodeResult(BaseModel):
|
|
42
168
|
"""Complete result of a single puzzle episode with normalized metrics.
|
|
43
169
|
|
|
@@ -91,6 +217,12 @@ class EpisodeResult(BaseModel):
|
|
|
91
217
|
description="Complete move history for detailed analysis",
|
|
92
218
|
)
|
|
93
219
|
|
|
220
|
+
# Reasoning depth metrics
|
|
221
|
+
reasoning_metrics: ReasoningMetrics | None = Field(
|
|
222
|
+
default=None,
|
|
223
|
+
description="Detailed reasoning depth metrics (backtracks, progress, error patterns)",
|
|
224
|
+
)
|
|
225
|
+
|
|
94
226
|
# Computed normalized metrics
|
|
95
227
|
@computed_field
|
|
96
228
|
@property
|
|
@@ -154,7 +286,7 @@ class EpisodeResult(BaseModel):
|
|
|
154
286
|
|
|
155
287
|
def to_summary_dict(self) -> dict[str, Any]:
|
|
156
288
|
"""One-line episode summary for logging/streaming."""
|
|
157
|
-
|
|
289
|
+
d: dict[str, Any] = {
|
|
158
290
|
"game": self.game,
|
|
159
291
|
"seed": self.seed,
|
|
160
292
|
"difficulty": self.difficulty.value,
|
|
@@ -165,6 +297,9 @@ class EpisodeResult(BaseModel):
|
|
|
165
297
|
"efficiency": round(self.efficiency_score, 3),
|
|
166
298
|
"time_ms": self.wall_time_ms,
|
|
167
299
|
}
|
|
300
|
+
if self.reasoning_metrics is not None:
|
|
301
|
+
d["reasoning"] = self.reasoning_metrics.to_dict()
|
|
302
|
+
return d
|
|
168
303
|
|
|
169
304
|
def to_jsonl(self) -> str:
|
|
170
305
|
"""Single-line JSON for streaming output."""
|
|
@@ -217,6 +352,35 @@ class EvaluationSummary(BaseModel):
|
|
|
217
352
|
return 0.0
|
|
218
353
|
return sum(e.wall_time_ms for e in self.episodes) / len(self.episodes)
|
|
219
354
|
|
|
355
|
+
@computed_field
|
|
356
|
+
@property
|
|
357
|
+
def avg_backtrack_rate(self) -> float:
|
|
358
|
+
"""Average backtrack rate across episodes with reasoning metrics."""
|
|
359
|
+
with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
|
|
360
|
+
if not with_metrics:
|
|
361
|
+
return 0.0
|
|
362
|
+
return sum(e.reasoning_metrics.backtrack_rate for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
|
|
363
|
+
|
|
364
|
+
@computed_field
|
|
365
|
+
@property
|
|
366
|
+
def avg_reasoning_overhead(self) -> float:
|
|
367
|
+
"""Average reasoning overhead across episodes with reasoning metrics."""
|
|
368
|
+
with_metrics = [
|
|
369
|
+
e for e in self.episodes if e.reasoning_metrics is not None and e.reasoning_metrics.reasoning_overhead > 0
|
|
370
|
+
]
|
|
371
|
+
if not with_metrics:
|
|
372
|
+
return 0.0
|
|
373
|
+
return sum(e.reasoning_metrics.reasoning_overhead for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
|
|
374
|
+
|
|
375
|
+
@computed_field
|
|
376
|
+
@property
|
|
377
|
+
def avg_progress_steadiness(self) -> float:
|
|
378
|
+
"""Average progress steadiness across episodes with reasoning metrics."""
|
|
379
|
+
with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
|
|
380
|
+
if not with_metrics:
|
|
381
|
+
return 0.0
|
|
382
|
+
return sum(e.reasoning_metrics.progress_steadiness for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
|
|
383
|
+
|
|
220
384
|
|
|
221
385
|
class TraceEvent(BaseModel):
|
|
222
386
|
"""A single event in an episode trace for JSONL logging."""
|
chuk_puzzles_gym/server.py
CHANGED
|
@@ -63,6 +63,9 @@ class ArcadeHandler(TelnetHandler):
|
|
|
63
63
|
if not self.current_game:
|
|
64
64
|
return
|
|
65
65
|
|
|
66
|
+
# Get final reasoning metrics
|
|
67
|
+
reasoning = self.current_game.get_reasoning_metrics().to_dict()
|
|
68
|
+
|
|
66
69
|
if self.output_mode == OutputMode.JSON:
|
|
67
70
|
await self.send_json_response(
|
|
68
71
|
type="complete",
|
|
@@ -72,17 +75,27 @@ class ArcadeHandler(TelnetHandler):
|
|
|
72
75
|
invalid_moves=self.current_game.invalid_moves,
|
|
73
76
|
hints_used=self.current_game.hints_used,
|
|
74
77
|
optimal_steps=self.current_game.optimal_steps,
|
|
78
|
+
reasoning_metrics=reasoning,
|
|
75
79
|
)
|
|
76
80
|
elif self.output_mode == OutputMode.STRICT:
|
|
77
81
|
await self.send_line(
|
|
78
82
|
f"COMPLETE:{self.current_game.moves_made}:{self.current_game.invalid_moves}:"
|
|
79
|
-
f"{self.current_game.hints_used}"
|
|
83
|
+
f"{self.current_game.hints_used}:"
|
|
84
|
+
f"BT={reasoning['backtrack_count']}:"
|
|
85
|
+
f"OH={reasoning['reasoning_overhead']:.2f}:"
|
|
86
|
+
f"ST={reasoning['progress_steadiness']:.2f}"
|
|
80
87
|
)
|
|
81
88
|
else:
|
|
82
89
|
await self.send_line("\n" + "=" * 50)
|
|
83
90
|
await self.send_line("CONGRATULATIONS! YOU SOLVED IT!")
|
|
84
91
|
await self.send_line("=" * 50)
|
|
85
92
|
await self.send_line(self.current_game.get_stats())
|
|
93
|
+
await self.send_line("")
|
|
94
|
+
await self.send_line("Reasoning Depth:")
|
|
95
|
+
await self.send_line(f" Backtrack rate: {reasoning['backtrack_rate']:.0%}")
|
|
96
|
+
await self.send_line(f" Progress steadiness: {reasoning['progress_steadiness']:.0%}")
|
|
97
|
+
await self.send_line(f" Reasoning overhead: {reasoning['reasoning_overhead']:.1f}x optimal")
|
|
98
|
+
await self.send_line(f" Error streak max: {reasoning['error_streak_max']}")
|
|
86
99
|
await self.send_line("\nType 'menu' to play another game.")
|
|
87
100
|
await self.send_line("=" * 50 + "\n")
|
|
88
101
|
|
|
@@ -109,6 +122,9 @@ class ArcadeHandler(TelnetHandler):
|
|
|
109
122
|
"constraint_density": profile.constraint_density,
|
|
110
123
|
}
|
|
111
124
|
|
|
125
|
+
# Reasoning depth metrics
|
|
126
|
+
reasoning = self.current_game.get_reasoning_metrics().to_dict()
|
|
127
|
+
|
|
112
128
|
return {
|
|
113
129
|
"game": self.current_game.name,
|
|
114
130
|
"difficulty": self.current_game.difficulty.value,
|
|
@@ -120,6 +136,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
120
136
|
"optimal_steps": self.current_game.optimal_steps,
|
|
121
137
|
"is_complete": self.current_game.is_complete(),
|
|
122
138
|
"difficulty_profile": profile_dict,
|
|
139
|
+
"reasoning_metrics": reasoning,
|
|
123
140
|
"grid": grid,
|
|
124
141
|
}
|
|
125
142
|
|
|
@@ -435,9 +452,10 @@ class ArcadeHandler(TelnetHandler):
|
|
|
435
452
|
return
|
|
436
453
|
|
|
437
454
|
if cmd_enum == GameCommand.STATS:
|
|
438
|
-
# Show detailed stats including difficulty profile
|
|
455
|
+
# Show detailed stats including difficulty profile and reasoning metrics
|
|
439
456
|
profile = self.current_game.difficulty_profile
|
|
440
457
|
optimal = self.current_game.optimal_steps
|
|
458
|
+
reasoning = self.current_game.get_reasoning_metrics().to_dict()
|
|
441
459
|
|
|
442
460
|
if self.output_mode == OutputMode.JSON:
|
|
443
461
|
await self.send_json_response(
|
|
@@ -455,11 +473,15 @@ class ArcadeHandler(TelnetHandler):
|
|
|
455
473
|
"state_observability": profile.state_observability,
|
|
456
474
|
"constraint_density": profile.constraint_density,
|
|
457
475
|
},
|
|
476
|
+
reasoning_metrics=reasoning,
|
|
458
477
|
)
|
|
459
478
|
elif self.output_mode == OutputMode.STRICT:
|
|
460
479
|
await self.send_line(
|
|
461
480
|
f"STATS:{self.current_game.moves_made}:{self.current_game.invalid_moves}:"
|
|
462
|
-
f"{self.current_game.hints_used}:{optimal or 0}"
|
|
481
|
+
f"{self.current_game.hints_used}:{optimal or 0}:"
|
|
482
|
+
f"BT={reasoning['backtrack_count']}:"
|
|
483
|
+
f"OH={reasoning['reasoning_overhead']:.2f}:"
|
|
484
|
+
f"ST={reasoning['progress_steadiness']:.2f}"
|
|
463
485
|
)
|
|
464
486
|
else:
|
|
465
487
|
await self.send_line("")
|
|
@@ -482,6 +504,15 @@ class ArcadeHandler(TelnetHandler):
|
|
|
482
504
|
await self.send_line(f" Optimal steps: {optimal}")
|
|
483
505
|
await self.send_line(f" Current efficiency: {efficiency:.1%}")
|
|
484
506
|
await self.send_line("")
|
|
507
|
+
await self.send_line("Reasoning Depth:")
|
|
508
|
+
await self.send_line(f" Backtrack count: {reasoning['backtrack_count']}")
|
|
509
|
+
await self.send_line(f" Backtrack rate: {reasoning['backtrack_rate']:.0%}")
|
|
510
|
+
await self.send_line(f" Progress velocity: {reasoning['progress_velocity']:.2f} cells/step")
|
|
511
|
+
await self.send_line(f" Progress steadiness: {reasoning['progress_steadiness']:.0%}")
|
|
512
|
+
await self.send_line(f" Reasoning overhead: {reasoning['reasoning_overhead']:.1f}x optimal")
|
|
513
|
+
await self.send_line(f" Error streak max: {reasoning['error_streak_max']}")
|
|
514
|
+
await self.send_line(f" Total actions: {reasoning['total_actions']}")
|
|
515
|
+
await self.send_line("")
|
|
485
516
|
await self.send_line("Difficulty Profile:")
|
|
486
517
|
await self.send_line(f" Logic depth: {profile.logic_depth}")
|
|
487
518
|
await self.send_line(f" Branching factor: {profile.branching_factor:.1f}")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chuk-puzzles-gym
|
|
3
|
-
Version: 0.10.1
|
|
3
|
+
Version: 0.10.2
|
|
4
4
|
Summary: Multi-game puzzle gym for LLM training and benchmarking - 30 constraint puzzles with synthetic data generation
|
|
5
5
|
Author: Chris Hay
|
|
6
6
|
License: MIT
|
|
@@ -93,10 +93,17 @@ Once connected, type `help` to see available games, or `sudoku easy` to start pl
|
|
|
93
93
|
- Enable with `mode agent` command
|
|
94
94
|
- Machine-parseable grid format with clear start/end markers
|
|
95
95
|
- Compact output optimized for LLM tool integration
|
|
96
|
+
- **Reasoning Depth Metrics** - Measure *how* agents reason, not just if they succeed
|
|
97
|
+
- Backtrack detection (did the agent revise previous placements?)
|
|
98
|
+
- Progress steadiness (monotonic advance toward solution?)
|
|
99
|
+
- Error streak analysis (isolated mistakes vs. clustered confusion?)
|
|
100
|
+
- Reasoning overhead (wasted work relative to optimal path)
|
|
101
|
+
- Solver distance traces (remaining work after each valid move)
|
|
102
|
+
- Available in all paths: Gym env, eval harness, and server (telnet/WebSocket)
|
|
96
103
|
- **Evaluation Harness** (`chuk-puzzles-eval`) - Built-in benchmarking CLI
|
|
97
104
|
- Batch evaluation with configurable episodes
|
|
98
105
|
- Multiple output formats (JSON, CSV, Markdown)
|
|
99
|
-
- Metrics: moves, invalid moves, hints, solve time
|
|
106
|
+
- Metrics: moves, invalid moves, hints, solve time, reasoning depth
|
|
100
107
|
- Reproducible with deterministic seeds
|
|
101
108
|
- **Dataset Export** (`chuk-puzzles-export`) - Synthetic data generation for LLM training
|
|
102
109
|
- JSONL output with complete problem definitions and solutions
|
|
@@ -500,6 +507,7 @@ games = PuzzleEnv.available_games()
|
|
|
500
507
|
|
|
501
508
|
- **All 30 games** accessible through unified API
|
|
502
509
|
- **Configurable rewards** for correct moves, invalid attempts, completion bonuses
|
|
510
|
+
- **Reasoning depth metrics** tracking backtracks, progress steadiness, error patterns
|
|
503
511
|
- **Hint system** with optional budget limits
|
|
504
512
|
- **Solver-free mode** for pure reasoning benchmarks
|
|
505
513
|
- **Efficiency scoring** based on optimal step counts
|
|
@@ -515,8 +523,25 @@ obs = {
|
|
|
515
523
|
"moves": 5,
|
|
516
524
|
"invalid_moves": 1,
|
|
517
525
|
"hints_used": 2,
|
|
526
|
+
"hints_remaining": 98,
|
|
518
527
|
"is_complete": False,
|
|
519
|
-
"grid": [[4, 0, 8, ...], ...] # Game-specific state
|
|
528
|
+
"grid": [[4, 0, 8, ...], ...], # Game-specific state
|
|
529
|
+
"render": " | 1 2 3 | ...", # ASCII grid
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
# Info dict includes reasoning metrics and difficulty profile
|
|
533
|
+
info = {
|
|
534
|
+
"optimal_steps": 45,
|
|
535
|
+
"difficulty_profile": {"logic_depth": 2, "branching_factor": 2.0, ...},
|
|
536
|
+
"reasoning_metrics": {
|
|
537
|
+
"backtrack_count": 0,
|
|
538
|
+
"backtrack_rate": 0.0,
|
|
539
|
+
"progress_velocity": 1.0,
|
|
540
|
+
"progress_steadiness": 1.0,
|
|
541
|
+
"reasoning_overhead": 1.0,
|
|
542
|
+
"error_streak_max": 0,
|
|
543
|
+
"solver_distance_trace": [44, 43, 42, ...],
|
|
544
|
+
},
|
|
520
545
|
}
|
|
521
546
|
```
|
|
522
547
|
|
|
@@ -546,6 +571,89 @@ config = SolverConfig(hint_budget=5, hint_penalty=0.1)
|
|
|
546
571
|
env = PuzzleEnv("sudoku", solver_config=config)
|
|
547
572
|
```
|
|
548
573
|
|
|
574
|
+
## Reasoning Depth Metrics
|
|
575
|
+
|
|
576
|
+
Beyond binary success/failure, the system measures **how** an agent reasons through puzzles. These metrics are available in all interaction paths: the Gym environment, the evaluation harness, and the telnet/WebSocket server.
|
|
577
|
+
|
|
578
|
+
### Metrics
|
|
579
|
+
|
|
580
|
+
| Metric | Description | Perfect Score |
|
|
581
|
+
|--------|-------------|---------------|
|
|
582
|
+
| `backtrack_count` | Times the agent revised a previous placement | 0 |
|
|
583
|
+
| `backtrack_rate` | Fraction of valid moves that were backtracks | 0% |
|
|
584
|
+
| `progress_velocity` | Average cells solved per step | 1.0 |
|
|
585
|
+
| `progress_steadiness` | How monotonically remaining work decreases (1.0 = never stalls) | 100% |
|
|
586
|
+
| `reasoning_overhead` | Total actions / optimal path length (1.0 = no waste) | 1.0x |
|
|
587
|
+
| `error_streak_max` | Longest run of consecutive invalid moves | 0 |
|
|
588
|
+
| `avg_error_streak` | Average length of error bursts | 0.0 |
|
|
589
|
+
| `solver_distance_trace` | Remaining positions after each valid move | Monotonically decreasing |
|
|
590
|
+
|
|
591
|
+
### Usage in Gym Environment
|
|
592
|
+
|
|
593
|
+
```python
|
|
594
|
+
from chuk_puzzles_gym.gym_env import PuzzleEnv
|
|
595
|
+
|
|
596
|
+
env = PuzzleEnv("sudoku", difficulty="easy", seed=42)
|
|
597
|
+
obs, info = await env.reset()
|
|
598
|
+
|
|
599
|
+
# Reasoning metrics available in info after reset
|
|
600
|
+
print(info["reasoning_metrics"])
|
|
601
|
+
|
|
602
|
+
# ... agent plays ...
|
|
603
|
+
obs, reward, terminated, truncated, info = await env.step("place 1 1 5")
|
|
604
|
+
|
|
605
|
+
# On episode end, info includes full reasoning metrics
|
|
606
|
+
if terminated:
|
|
607
|
+
metrics = info["reasoning_metrics"]
|
|
608
|
+
print(f"Backtrack rate: {metrics['backtrack_rate']:.0%}")
|
|
609
|
+
print(f"Overhead: {metrics['reasoning_overhead']:.1f}x")
|
|
610
|
+
print(f"Steadiness: {metrics['progress_steadiness']:.0%}")
|
|
611
|
+
```
|
|
612
|
+
|
|
613
|
+
### Usage in Server (Telnet/WebSocket)
|
|
614
|
+
|
|
615
|
+
Reasoning metrics are included automatically in server output:
|
|
616
|
+
|
|
617
|
+
- **JSON mode**: `reasoning_metrics` dict in every state response and completion message
|
|
618
|
+
- **STRICT mode**: `BT=`, `OH=`, `ST=` fields appended to STATS and COMPLETE messages
|
|
619
|
+
- **Normal mode**: "Reasoning Depth" section shown on completion and in `stats` command
|
|
620
|
+
|
|
621
|
+
```
|
|
622
|
+
> mode json
|
|
623
|
+
> place 1 1 5
|
|
624
|
+
{"type":"result","success":true,...,"state":{...,"reasoning_metrics":{"backtrack_count":0,...}}}
|
|
625
|
+
|
|
626
|
+
> stats
|
|
627
|
+
{"type":"stats",...,"reasoning_metrics":{"backtrack_count":0,"backtrack_rate":0.0,...}}
|
|
628
|
+
```
|
|
629
|
+
|
|
630
|
+
### Usage in Evaluation Harness
|
|
631
|
+
|
|
632
|
+
```bash
|
|
633
|
+
# Reasoning metrics included in all output formats
|
|
634
|
+
chuk-puzzles-eval sudoku -d easy -n 10 -o json
|
|
635
|
+
```
|
|
636
|
+
|
|
637
|
+
```python
|
|
638
|
+
from chuk_puzzles_gym.eval import evaluate_game
|
|
639
|
+
|
|
640
|
+
report = await evaluate_game("sudoku", difficulty="easy", episodes=10)
|
|
641
|
+
report.print_summary() # Includes "Reasoning Depth" section
|
|
642
|
+
|
|
643
|
+
# Aggregate metrics
|
|
644
|
+
print(f"Avg backtrack rate: {report.avg_backtrack_rate:.0%}")
|
|
645
|
+
print(f"Avg overhead: {report.avg_reasoning_overhead:.1f}x")
|
|
646
|
+
print(f"Avg steadiness: {report.avg_progress_steadiness:.0%}")
|
|
647
|
+
```
|
|
648
|
+
|
|
649
|
+
### What the Metrics Reveal
|
|
650
|
+
|
|
651
|
+
A **perfect solver** shows: 0 backtracks, 1.0x overhead, 100% steadiness, 1.0 velocity.
|
|
652
|
+
|
|
653
|
+
A **struggling agent** shows: high backtrack rate (revising decisions), error streaks (clustered confusion), low steadiness (stalling progress), and high overhead (wasted work).
|
|
654
|
+
|
|
655
|
+
These patterns are visible even when two agents both eventually solve a puzzle — the metrics expose the **quality of the reasoning path**, not just the outcome.
|
|
656
|
+
|
|
549
657
|
## Evaluation Harness
|
|
550
658
|
|
|
551
659
|
The project includes a built-in **evaluation harness** for benchmarking puzzle-solving agents:
|
|
@@ -604,6 +712,12 @@ Avg Time: 12ms
|
|
|
604
712
|
| `hints_used` | Number of hints requested |
|
|
605
713
|
| `wall_time_ms` | Time to solve in milliseconds |
|
|
606
714
|
| `seed` | Puzzle seed for reproducibility |
|
|
715
|
+
| `backtrack_count` | Times agent revised a previous placement |
|
|
716
|
+
| `backtrack_rate` | Fraction of valid moves that were backtracks |
|
|
717
|
+
| `progress_steadiness` | How monotonically progress advances (1.0 = perfect) |
|
|
718
|
+
| `reasoning_overhead` | Total actions / optimal path (1.0 = no waste) |
|
|
719
|
+
| `error_streak_max` | Longest run of consecutive invalid moves |
|
|
720
|
+
| `progress_velocity` | Average cells solved per step |
|
|
607
721
|
|
|
608
722
|
## Dataset Export
|
|
609
723
|
|
|
@@ -1194,12 +1308,13 @@ chuk-puzzles-gym/
|
|
|
1194
1308
|
│ │ ├── base.py # GridPosition, MoveResult
|
|
1195
1309
|
│ │ ├── config.py # Base GameConfig
|
|
1196
1310
|
│ │ ├── enums.py # DifficultyLevel, GameCommand, etc.
|
|
1311
|
+
│ │ ├── evaluation.py # ReasoningMetrics, EpisodeResult, EvaluationSummary
|
|
1197
1312
|
│ │ └── games.py # Game-specific models (Cage, Task, etc.)
|
|
1198
1313
|
│ └── games/ # Self-contained game modules
|
|
1199
1314
|
│ ├── __init__.py # AVAILABLE_GAMES registry
|
|
1200
1315
|
│ ├── _base/ # Base classes
|
|
1201
1316
|
│ │ ├── __init__.py
|
|
1202
|
-
│ │ ├── game.py # PuzzleGame ABC
|
|
1317
|
+
│ │ ├── game.py # PuzzleGame ABC + ReasoningTracker
|
|
1203
1318
|
│ │ └── commands.py # GameCommandHandler ABC
|
|
1204
1319
|
│ ├── sudoku/ # Example game module
|
|
1205
1320
|
│ │ ├── __init__.py # Exports SudokuGame
|
|
@@ -1226,6 +1341,7 @@ chuk-puzzles-gym/
|
|
|
1226
1341
|
│ ├── example_graph_coloring.py # Graph Coloring game logic demo
|
|
1227
1342
|
│ ├── example_cryptarithmetic.py# Cryptarithmetic game logic demo
|
|
1228
1343
|
│ ├── example_rush_hour.py # Rush Hour game logic demo
|
|
1344
|
+
│ ├── example_reasoning_metrics.py # Reasoning depth metrics demo
|
|
1229
1345
|
│ └── README.md # Example usage guide
|
|
1230
1346
|
├── .github/workflows/ # CI/CD workflows
|
|
1231
1347
|
├── pyproject.toml # Modern Python project config
|
|
@@ -1465,9 +1581,10 @@ See [ROADMAP.md](ROADMAP.md) for the full development roadmap.
|
|
|
1465
1581
|
### Highlights
|
|
1466
1582
|
|
|
1467
1583
|
**Benchmarking & Metrics**
|
|
1468
|
-
- Puzzle complexity metrics (constraint count, variable count, branching factor)
|
|
1469
|
-
- Episode model for tracking game sessions
|
|
1470
|
-
-
|
|
1584
|
+
- ~~Puzzle complexity metrics~~ (implemented: constraint count, variable count, branching factor)
|
|
1585
|
+
- ~~Episode model for tracking game sessions~~ (implemented: EpisodeResult with ReasoningMetrics)
|
|
1586
|
+
- ~~Reasoning depth metrics~~ (implemented: backtrack detection, progress steadiness, error patterns)
|
|
1587
|
+
- ~~Trace logging for offline analysis~~ (implemented: solver distance traces in all output paths)
|
|
1471
1588
|
|
|
1472
1589
|
**Agent Evaluation Tools**
|
|
1473
1590
|
- Batch evaluation harness CLI
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
chuk_puzzles_gym/__init__.py,sha256=zh2sc6QFKrtAmMLee7vlHgXuOBoB5CjSldlKFjZTVVE,521
|
|
2
2
|
chuk_puzzles_gym/constants.py,sha256=58pKdvwoaB4PF1AK4b7mLNf_Y_YFyFassd1hYH1IUNE,280
|
|
3
|
-
chuk_puzzles_gym/eval.py,sha256
|
|
4
|
-
chuk_puzzles_gym/gym_env.py,sha256=
|
|
5
|
-
chuk_puzzles_gym/server.py,sha256=
|
|
3
|
+
chuk_puzzles_gym/eval.py,sha256=-ku_pshSMG5RIu-p4MdS9ju4kduyKjvTn8Q99y_UO_E,31830
|
|
4
|
+
chuk_puzzles_gym/gym_env.py,sha256=V2Eg1CFXKceR6vWTvAzvfanXvZL24STbw3YP8-cjkk0,18074
|
|
5
|
+
chuk_puzzles_gym/server.py,sha256=SWfuBO4wtm_4Ri8l5hbQmvMF7ZN4Q42Wt66neFp5-nQ,48055
|
|
6
6
|
chuk_puzzles_gym/export/__init__.py,sha256=TTXBRR5CBBCL04r1iXMzxib9oOIDTC4npxy2_L1xc2A,366
|
|
7
|
-
chuk_puzzles_gym/export/dataset.py,sha256=
|
|
7
|
+
chuk_puzzles_gym/export/dataset.py,sha256=bza7iCfp4POz0gCcoSRF_hTRZmuAD-59DyrrHiqo4ac,11335
|
|
8
8
|
chuk_puzzles_gym/games/__init__.py,sha256=zByuxje5uVWQ4wBoGHUooHkAg5cgCljrCCXkyOLxLzo,3403
|
|
9
9
|
chuk_puzzles_gym/games/_base/__init__.py,sha256=oNjoMvOVDb010ooyGxAfXBrOqmw1BAGavmaxf44tmz0,188
|
|
10
10
|
chuk_puzzles_gym/games/_base/commands.py,sha256=tY0kxk08D8nPr_C_awo8qDUhkL6EHA59KnWiLlYnloY,2381
|
|
11
|
-
chuk_puzzles_gym/games/_base/game.py,sha256
|
|
11
|
+
chuk_puzzles_gym/games/_base/game.py,sha256=Jwfjj4qazgaWLQLNTghfMuydy-D3KrOuUmpCM9kpjlU,15711
|
|
12
12
|
chuk_puzzles_gym/games/binary/__init__.py,sha256=Pphgj0kcvHUgkM0Mq89GsWPt-Bg6DobDLi7cqliOywk,156
|
|
13
13
|
chuk_puzzles_gym/games/binary/config.py,sha256=Iw8Wax1856aqaz1KvDC69Qou6z8gxIWr5rSAI0MGnWg,812
|
|
14
14
|
chuk_puzzles_gym/games/binary/game.py,sha256=lRBweQIdzyRZm_jMPItZ1VAzAcsEEbxvGqjGwAlTTy0,16359
|
|
@@ -118,17 +118,17 @@ chuk_puzzles_gym/games/sudoku/game.py,sha256=35vB5x-KIs5z2b-CDV-dq5kifmVkoEkbLOx
|
|
|
118
118
|
chuk_puzzles_gym/games/tents/__init__.py,sha256=iVxsZg7Juz3iHXTK8mfJZniFcMNnmAd2h2RjxR2TH40,133
|
|
119
119
|
chuk_puzzles_gym/games/tents/config.py,sha256=gSi5epG5va8-a4ZQv5ekcFDkWQSYOSheX2j4FIs_I8Q,914
|
|
120
120
|
chuk_puzzles_gym/games/tents/game.py,sha256=JGPLYvIosCwjJYhi0FCtA3YUFsgQsD9L_BEArHSOPFM,15802
|
|
121
|
-
chuk_puzzles_gym/models/__init__.py,sha256=
|
|
121
|
+
chuk_puzzles_gym/models/__init__.py,sha256=6SQn3zEcalTl-9VqKbSwvmWaYkRMuGKUkfiC25c9-h8,976
|
|
122
122
|
chuk_puzzles_gym/models/base.py,sha256=L7Zug9jUXJCOhD3wKJp0ppJZNTgroDQwdYMjvAaVVqc,1156
|
|
123
123
|
chuk_puzzles_gym/models/config.py,sha256=12UkPlEEFzN1k9ZfJClpVqkp7E11MWriZVAH2RkfEM4,301
|
|
124
124
|
chuk_puzzles_gym/models/enums.py,sha256=xmHv0OK2zKcxpfhJP3huuXhDnnX0BDLCwWfpR9ZuraQ,2342
|
|
125
|
-
chuk_puzzles_gym/models/evaluation.py,sha256=
|
|
125
|
+
chuk_puzzles_gym/models/evaluation.py,sha256=b2ldWPih-lo2jy59pWincjv9qZuF6PsZd42LPZsZzLc,22162
|
|
126
126
|
chuk_puzzles_gym/models/games.py,sha256=rnEW_Sl9xuZtvlBXBZfab34HrIhtUEiBdUSs_nvh10o,442
|
|
127
127
|
chuk_puzzles_gym/trace/__init__.py,sha256=8JHaHxbTDhT9kv4e2e5Px4dCWuXY49OXmvzkMS4nKfw,273
|
|
128
128
|
chuk_puzzles_gym/trace/generator.py,sha256=4pks0d_asoDE15QjM2VuzgFWTV1fZke_gHH2lVF8KVQ,34058
|
|
129
129
|
chuk_puzzles_gym/utils/__init__.py,sha256=1AKPfRjT9YlBxxcA7qdKcvKBXdHJzfGtUWansrb_2VE,149
|
|
130
|
-
chuk_puzzles_gym-0.10.
|
|
131
|
-
chuk_puzzles_gym-0.10.
|
|
132
|
-
chuk_puzzles_gym-0.10.
|
|
133
|
-
chuk_puzzles_gym-0.10.
|
|
134
|
-
chuk_puzzles_gym-0.10.
|
|
130
|
+
chuk_puzzles_gym-0.10.2.dist-info/METADATA,sha256=adaIAGmTJQj7wES0bqZEETQ5pbQQJ9OrswxhNZayits,55140
|
|
131
|
+
chuk_puzzles_gym-0.10.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
132
|
+
chuk_puzzles_gym-0.10.2.dist-info/entry_points.txt,sha256=tJGHiH8wjkBev2SPNuXOLFkaXE76sW9ZFIMQw4pUj5E,181
|
|
133
|
+
chuk_puzzles_gym-0.10.2.dist-info/top_level.txt,sha256=H3z9wKGl7CV1BPlO6t5lEtok6WW9rwGr5C1Dr3Kqx28,17
|
|
134
|
+
chuk_puzzles_gym-0.10.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|