chuk-puzzles-gym 0.10__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chuk_puzzles_gym/eval.py +168 -46
- chuk_puzzles_gym/export/dataset.py +7 -1
- chuk_puzzles_gym/games/_base/game.py +123 -0
- chuk_puzzles_gym/games/binary/game.py +2 -0
- chuk_puzzles_gym/games/bridges/game.py +2 -0
- chuk_puzzles_gym/games/cryptarithmetic/game.py +5 -0
- chuk_puzzles_gym/games/einstein/game.py +2 -0
- chuk_puzzles_gym/games/fillomino/game.py +2 -0
- chuk_puzzles_gym/games/futoshiki/game.py +2 -0
- chuk_puzzles_gym/games/graph_coloring/commands.py +20 -3
- chuk_puzzles_gym/games/graph_coloring/game.py +8 -1
- chuk_puzzles_gym/games/hidato/game.py +2 -0
- chuk_puzzles_gym/games/hitori/game.py +2 -0
- chuk_puzzles_gym/games/kakuro/game.py +2 -0
- chuk_puzzles_gym/games/kenken/game.py +2 -0
- chuk_puzzles_gym/games/killer_sudoku/game.py +2 -0
- chuk_puzzles_gym/games/knapsack/game.py +2 -0
- chuk_puzzles_gym/games/lights_out/game.py +2 -0
- chuk_puzzles_gym/games/logic_grid/game.py +2 -0
- chuk_puzzles_gym/games/mastermind/game.py +2 -0
- chuk_puzzles_gym/games/minesweeper/game.py +2 -0
- chuk_puzzles_gym/games/nonogram/game.py +2 -0
- chuk_puzzles_gym/games/nqueens/game.py +5 -0
- chuk_puzzles_gym/games/numberlink/game.py +6 -0
- chuk_puzzles_gym/games/nurikabe/game.py +2 -0
- chuk_puzzles_gym/games/rush_hour/game.py +4 -0
- chuk_puzzles_gym/games/scheduler/game.py +2 -0
- chuk_puzzles_gym/games/shikaku/game.py +2 -0
- chuk_puzzles_gym/games/skyscrapers/game.py +5 -0
- chuk_puzzles_gym/games/slitherlink/game.py +2 -0
- chuk_puzzles_gym/games/sokoban/game.py +2 -0
- chuk_puzzles_gym/games/star_battle/game.py +2 -0
- chuk_puzzles_gym/games/sudoku/game.py +2 -0
- chuk_puzzles_gym/games/tents/game.py +2 -0
- chuk_puzzles_gym/gym_env.py +21 -5
- chuk_puzzles_gym/models/__init__.py +2 -0
- chuk_puzzles_gym/models/evaluation.py +165 -1
- chuk_puzzles_gym/server.py +51 -72
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/METADATA +124 -7
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/RECORD +43 -43
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/WHEEL +0 -0
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/entry_points.txt +0 -0
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/top_level.txt +0 -0
chuk_puzzles_gym/games/lights_out/game.py
CHANGED

@@ -173,6 +173,8 @@ class LightsOutGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None if puzzle is complete
         """
+        if not self.can_use_hint():
+            return None
         # Find a cell in the solution that should be pressed
         for row in range(self.size):
             for col in range(self.size):
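The two-line `can_use_hint()` guard above is the same change applied to every `get_hint()` hunk that follows; the gate itself lives in `chuk_puzzles_gym/games/_base/game.py` (+123 lines in this release, not shown in this excerpt). A minimal sketch of how such a budget gate typically works — the class and attribute names below are illustrative assumptions, not the actual base-class API:

```python
# Illustrative sketch only; the real logic lives in games/_base/game.py.
class HintBudgetSketch:
    def __init__(self, hint_budget: int | None = 3) -> None:
        self.hint_budget = hint_budget  # None means "no limit" in this sketch
        self.hints_used = 0

    def can_use_hint(self) -> bool:
        """True while the configured hint budget is not exhausted."""
        return self.hint_budget is None or self.hints_used < self.hint_budget

    def use_hint(self) -> None:
        """Record one hint consumption."""
        self.hints_used += 1
```

With a gate like this, `get_hint()` simply returns `None` once the budget is spent instead of continuing to reveal solution cells.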
chuk_puzzles_gym/games/logic_grid/game.py
CHANGED

@@ -235,6 +235,8 @@ class LogicGridGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None if puzzle is complete
         """
+        if not self.can_use_hint():
+            return None
         # Find a connection that hasn't been marked
         for person in self.categories.person:
             attrs = self.solution[person]
chuk_puzzles_gym/games/nonogram/game.py
CHANGED

@@ -192,6 +192,8 @@ class NonogramGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None if puzzle is complete
         """
+        if not self.can_use_hint():
+            return None
         unknown_cells = [(r, c) for r in range(self.size) for c in range(self.size) if self.grid[r][c] == -1]
         if not unknown_cells:
             return None
chuk_puzzles_gym/games/nqueens/game.py
CHANGED

@@ -296,6 +296,11 @@ class NQueensGame(PuzzleGame):

         return "\n".join(lines)

+    def get_stats(self) -> str:
+        """Get current game statistics."""
+        placed = sum(1 for r in range(self.size) for c in range(self.size) if self.grid[r][c] == 1)
+        return f"Moves: {self.moves_made} | Queens: {placed}/{self.size} | Board: {self.size}x{self.size} | Seed: {self.seed}"
+
     def get_rules(self) -> str:
         return (
             f"N-QUEENS ({self.size}x{self.size})\n"
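Several games gain a one-line `get_stats()` in this release (N-Queens here, plus Numberlink, Rush Hour, and Skyscrapers below). For illustration, the string it produces looks like this — the values are invented:

```python
# Invented values, just to show the new get_stats() output format.
moves_made, placed, size, seed = 5, 3, 8, 42
print(f"Moves: {moves_made} | Queens: {placed}/{size} | Board: {size}x{size} | Seed: {seed}")
# Moves: 5 | Queens: 3/8 | Board: 8x8 | Seed: 42
```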
chuk_puzzles_gym/games/numberlink/game.py
CHANGED

@@ -317,6 +317,12 @@ class NumberlinkGame(PuzzleGame):

         return "\n".join(lines)

+    def get_stats(self) -> str:
+        """Get current game statistics."""
+        filled = sum(1 for r in range(self.size) for c in range(self.size) if self.grid[r][c] != 0)
+        total = self.size * self.size
+        return f"Moves: {self.moves_made} | Filled: {filled}/{total} | Pairs: {self.num_pairs} | Seed: {self.seed}"
+
     def get_rules(self) -> str:
         return (
             f"NUMBERLINK ({self.size}x{self.size}, {self.num_pairs} pairs)\n"
chuk_puzzles_gym/games/nurikabe/game.py
CHANGED

@@ -479,6 +479,8 @@ class NurikabeGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None
         """
+        if not self.can_use_hint():
+            return None
         # Find a cell that differs from solution
         for row in range(self.size):
             for col in range(self.size):
chuk_puzzles_gym/games/rush_hour/game.py
CHANGED

@@ -454,6 +454,10 @@ class RushHourGame(PuzzleGame):

         return "\n".join(lines)

+    def get_stats(self) -> str:
+        """Get current game statistics."""
+        return f"Moves: {self.moves_made} | Vehicles: {len(self.vehicles)} | Grid: {self.size}x{self.size} | Seed: {self.seed}"
+
     def get_rules(self) -> str:
         return (
             f"RUSH HOUR ({self.size}x{self.size})\n"
chuk_puzzles_gym/games/scheduler/game.py
CHANGED

@@ -316,6 +316,8 @@ class SchedulerGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None
         """
+        if not self.can_use_hint():
+            return None
         # Find an unscheduled task that's in the optimal solution
         for task_id in range(self.num_tasks):
             if task_id not in self.schedule and task_id in self.optimal_schedule:
chuk_puzzles_gym/games/shikaku/game.py
CHANGED

@@ -327,6 +327,8 @@ class ShikakuGame(PuzzleGame):

     async def get_hint(self) -> tuple[Any, str] | None:
         """Get a hint for the next move."""
+        if not self.can_use_hint():
+            return None
         # Find a rectangle from the solution that hasn't been placed yet
         solution_rects: dict[int, list[tuple[int, int]]] = {}
         for r in range(self.size):
chuk_puzzles_gym/games/skyscrapers/game.py
CHANGED

@@ -255,6 +255,11 @@ class SkyscrapersGame(PuzzleGame):

         return "\n".join(lines)

+    def get_stats(self) -> str:
+        """Get current game statistics."""
+        empty = sum(1 for r in range(self.size) for c in range(self.size) if self.grid[r][c] == 0)
+        return f"Moves: {self.moves_made} | Empty cells: {empty} | Grid: {self.size}x{self.size} | Seed: {self.seed}"
+
     def get_rules(self) -> str:
         return (
             f"SKYSCRAPERS ({self.size}x{self.size})\n"
chuk_puzzles_gym/games/slitherlink/game.py
CHANGED

@@ -272,6 +272,8 @@ class SlitherlinkGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None
         """
+        if not self.can_use_hint():
+            return None
         # Find an edge that's in the solution but not set by player
         for row in range(self.size + 1):
             for col in range(self.size):
chuk_puzzles_gym/games/star_battle/game.py
CHANGED

@@ -301,6 +301,8 @@ class StarBattleGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None if puzzle is complete
         """
+        if not self.can_use_hint():
+            return None
         # Find a star location from solution that hasn't been placed
         for r in range(self.size):
             for c in range(self.size):
chuk_puzzles_gym/games/sudoku/game.py
CHANGED

@@ -249,6 +249,8 @@ class SudokuGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None if puzzle is complete
         """
+        if not self.can_use_hint():
+            return None
         empty_cells = [(r, c) for r in range(9) for c in range(9) if self.grid[r][c] == 0]
         if not empty_cells:
             return None
chuk_puzzles_gym/games/tents/game.py
CHANGED

@@ -326,6 +326,8 @@ class TentsGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None if puzzle is complete
         """
+        if not self.can_use_hint():
+            return None
         # Find a tent location from solution that hasn't been placed
         for r in range(self.size):
             for c in range(self.size):
chuk_puzzles_gym/gym_env.py
CHANGED
@@ -197,6 +197,7 @@ class PuzzleEnv:
             result = await self._execute_action(cmd, args)
         except Exception as e:
             self._game.invalid_moves += 1
+            self._game.reasoning_tracker.record_invalid_move()
             return (
                 self._get_observation(),
                 self.reward_config["invalid_attempt"],
@@ -207,17 +208,25 @@ class PuzzleEnv:

         self._step_count += 1

+        # Build position tuple from parsed args for reasoning tracker
+        position = tuple(args)
+
         # Calculate reward
         if result.success:
             reward = self.reward_config["correct_placement"]

+            # Feed reasoning tracker
+            # optimal_steps is dynamic (reflects current state), so use it directly
+            remaining = self._game.optimal_steps or 0
+            self._game.reasoning_tracker.record_valid_move(position, remaining)
+
             # Check for completion
             terminated = self._game.is_complete()
             if terminated:
                 # Add completion bonus with efficiency multiplier
-
-                if
-                    efficiency = min(1.0,
+                opt = self._game.optimal_steps
+                if opt and self._game.moves_made > 0:
+                    efficiency = min(1.0, opt / self._game.moves_made)
                 else:
                     efficiency = 1.0
                 reward += (
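The completion branch now guards against a missing solver estimate and a zero move count before computing the efficiency multiplier. A small worked example of the new logic, with invented numbers:

```python
# Worked example of the rewritten efficiency multiplier (numbers invented).
optimal_steps = 20   # solver's minimum step count; may be None/0 if unknown
moves_made = 25      # steps the agent actually took

if optimal_steps and moves_made > 0:
    efficiency = min(1.0, optimal_steps / moves_made)  # 20 / 25 -> 0.8
else:
    efficiency = 1.0                                   # unknown optimum: no penalty

print(efficiency)  # 0.8; an agent at or under the optimum gets the full 1.0
```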
@@ -226,11 +235,12 @@ class PuzzleEnv:
         else:
             reward = self.reward_config["invalid_attempt"]
             self._game.invalid_moves += 1
+            self._game.reasoning_tracker.record_invalid_move()
             terminated = False

         truncated = self._step_count >= self.max_steps

-        info = {
+        info: dict[str, Any] = {
             "action": action_str,
             "success": result.success,
             "message": result.message,
@@ -239,6 +249,10 @@ class PuzzleEnv:
             "hints_used": self._game.hints_used,
         }

+        # Include reasoning metrics on episode end
+        if terminated or truncated:
+            info["reasoning_metrics"] = self._game.get_reasoning_metrics().to_dict()
+
         return self._get_observation(), reward, terminated, truncated, info

     async def _execute_action(self, cmd: str, args: list[str]) -> Any:
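`reasoning_metrics` is only attached to `info` on the final step of an episode (terminated or truncated). A hedged sketch of reading it from a driver loop — the `reset()`/`step(action_str)` call pattern is assumed from the surrounding code, not from documented API:

```python
# Sketch, assuming PuzzleEnv exposes async reset() and step(action_str) as used above.
async def run_episode(env, actions: list[str]) -> dict | None:
    """Drive one episode and return the reasoning metrics if the episode ended."""
    await env.reset()
    for action in actions:
        obs, reward, terminated, truncated, info = await env.step(action)
        if terminated or truncated:
            return info.get("reasoning_metrics")  # dict produced by to_dict()
    return None  # episode neither solved nor truncated within the given actions
```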
@@ -371,7 +385,7 @@ class PuzzleEnv:
         if self._game is None:
             return {"error": "no_game"}

-        obs = {
+        obs: dict[str, Any] = {
             "game": self._game.name,
             "difficulty": self._game.difficulty.value,
             "seed": self._game.seed,
@@ -397,6 +411,7 @@ class PuzzleEnv:
             return {}

         profile = self._game.difficulty_profile
+        reasoning = self._game.get_reasoning_metrics()
         return {
             "optimal_steps": self._game.optimal_steps,
             "difficulty_profile": {
@@ -411,6 +426,7 @@ class PuzzleEnv:
                 "hint_budget": self.solver_config.hint_budget,
                 "hint_penalty": self.solver_config.hint_penalty,
             },
+            "reasoning_metrics": reasoning.to_dict(),
         }

     def render(self, mode: str = "ansi") -> str | None:
chuk_puzzles_gym/models/__init__.py
CHANGED

@@ -20,6 +20,7 @@ from .evaluation import (
     EpisodeTracer,
     EvaluationSummary,
     MoveRecord,
+    ReasoningMetrics,
     SolverConfig,
     TraceEvent,
 )
@@ -42,6 +43,7 @@ __all__ = [
     "EpisodeTracer",
     "EvaluationSummary",
     "MoveRecord",
+    "ReasoningMetrics",
     "SolverConfig",
     "TraceEvent",
 ]
chuk_puzzles_gym/models/evaluation.py
CHANGED

@@ -38,6 +38,132 @@ class MoveRecord(BaseModel):
     timestamp_ms: int = Field(default=0, description="Milliseconds since episode start")


+class ReasoningMetrics(BaseModel):
+    """Reasoning depth metrics for evaluating quality of agent reasoning.
+
+    Goes beyond binary success/failure to measure *how* an agent reasons:
+    - Backtrack detection: did the agent revise previous placements?
+    - Progress tracking: how steadily did the agent make progress?
+    - Error patterns: were errors isolated or clustered in streaks?
+    - Reasoning overhead: how much wasted work relative to optimal?
+    """
+
+    model_config = ConfigDict(frozen=True)
+
+    # Raw tracking data
+    backtrack_count: int = Field(
+        default=0,
+        ge=0,
+        description="Times agent placed a value at a previously filled position",
+    )
+    solver_distance_trace: list[int] = Field(
+        default_factory=list,
+        description="Remaining positions to fill after each valid move",
+    )
+    error_streak_max: int = Field(
+        default=0,
+        ge=0,
+        description="Longest consecutive run of invalid moves",
+    )
+    error_streaks: list[int] = Field(
+        default_factory=list,
+        description="Lengths of each consecutive error streak",
+    )
+    total_actions: int = Field(
+        default=0,
+        ge=0,
+        description="Total actions taken (valid + invalid)",
+    )
+    optimal_path_length: int | None = Field(
+        default=None,
+        ge=1,
+        description="Minimum steps to solve (from solver)",
+    )
+
+    @computed_field
+    @property
+    def reasoning_overhead(self) -> float:
+        """Ratio of total actions to optimal path length.
+
+        1.0 = perfect (no wasted actions). Higher = more wasted reasoning.
+        Returns 0.0 if optimal path length is unknown.
+        """
+        if self.optimal_path_length is None or self.optimal_path_length == 0:
+            return 0.0
+        if self.total_actions == 0:
+            return 0.0
+        return self.total_actions / self.optimal_path_length
+
+    @computed_field
+    @property
+    def backtrack_rate(self) -> float:
+        """Fraction of valid moves that were backtracks (revisions).
+
+        0.0 = no backtracks, 1.0 = every move was a revision.
+        """
+        valid_moves = len(self.solver_distance_trace)
+        if valid_moves == 0:
+            return 0.0
+        return self.backtrack_count / valid_moves
+
+    @computed_field
+    @property
+    def progress_velocity(self) -> float:
+        """Average progress per valid move (cells solved per step).
+
+        Measures how much closer to the solution each move gets.
+        1.0 = every move reduces remaining by exactly 1. Lower = backtracks/plateaus.
+        Returns 0.0 if insufficient data.
+        """
+        trace = self.solver_distance_trace
+        if len(trace) < 2:
+            return 0.0
+        total_progress = trace[0] - trace[-1]
+        steps = len(trace) - 1
+        if steps == 0:
+            return 0.0
+        return total_progress / steps
+
+    @computed_field
+    @property
+    def progress_steadiness(self) -> float:
+        """Measure of how monotonically progress decreased (0.0 to 1.0).
+
+        1.0 = perfectly monotonic progress (every move reduced remaining count).
+        0.0 = no monotonic progress at all.
+        """
+        trace = self.solver_distance_trace
+        if len(trace) < 2:
+            return 1.0
+        monotonic_steps = sum(1 for i in range(1, len(trace)) if trace[i] < trace[i - 1])
+        return monotonic_steps / (len(trace) - 1)
+
+    @computed_field
+    @property
+    def avg_error_streak(self) -> float:
+        """Average length of consecutive error streaks.
+
+        Returns 0.0 if no error streaks occurred.
+        """
+        if not self.error_streaks:
+            return 0.0
+        return sum(self.error_streaks) / len(self.error_streaks)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to flat dictionary for reporting."""
+        return {
+            "backtrack_count": self.backtrack_count,
+            "backtrack_rate": round(self.backtrack_rate, 3),
+            "reasoning_overhead": round(self.reasoning_overhead, 3),
+            "progress_velocity": round(self.progress_velocity, 3),
+            "progress_steadiness": round(self.progress_steadiness, 3),
+            "error_streak_max": self.error_streak_max,
+            "avg_error_streak": round(self.avg_error_streak, 3),
+            "total_actions": self.total_actions,
+            "optimal_path_length": self.optimal_path_length,
+        }
+
+
 class EpisodeResult(BaseModel):
     """Complete result of a single puzzle episode with normalized metrics.

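`ReasoningMetrics` is re-exported from `chuk_puzzles_gym.models` (see the `__init__.py` hunk above), so the computed fields can be exercised directly. A worked example with invented trace data:

```python
# Worked example of the new computed fields (all values invented).
from chuk_puzzles_gym.models import ReasoningMetrics

m = ReasoningMetrics(
    backtrack_count=1,
    solver_distance_trace=[5, 4, 3, 3, 2, 1, 0],  # remaining cells after each of 7 valid moves
    error_streak_max=2,
    error_streaks=[2, 1],
    total_actions=10,        # 7 valid moves + 3 invalid attempts
    optimal_path_length=5,
)

print(m.reasoning_overhead)   # 10 / 5 = 2.0
print(m.backtrack_rate)       # 1 / 7 ≈ 0.143
print(m.progress_velocity)    # (5 - 0) / 6 ≈ 0.833
print(m.progress_steadiness)  # 5 strictly-decreasing steps out of 6 ≈ 0.833
print(m.avg_error_streak)     # (2 + 1) / 2 = 1.5
print(m.to_dict())            # flat dict with the rounded values above
```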
@@ -91,6 +217,12 @@ class EpisodeResult(BaseModel):
         description="Complete move history for detailed analysis",
     )

+    # Reasoning depth metrics
+    reasoning_metrics: ReasoningMetrics | None = Field(
+        default=None,
+        description="Detailed reasoning depth metrics (backtracks, progress, error patterns)",
+    )
+
     # Computed normalized metrics
     @computed_field
     @property
@@ -154,7 +286,7 @@ class EpisodeResult(BaseModel):

     def to_summary_dict(self) -> dict[str, Any]:
         """One-line episode summary for logging/streaming."""
-
+        d: dict[str, Any] = {
             "game": self.game,
             "seed": self.seed,
             "difficulty": self.difficulty.value,
@@ -165,6 +297,9 @@ class EpisodeResult(BaseModel):
             "efficiency": round(self.efficiency_score, 3),
             "time_ms": self.wall_time_ms,
         }
+        if self.reasoning_metrics is not None:
+            d["reasoning"] = self.reasoning_metrics.to_dict()
+        return d

     def to_jsonl(self) -> str:
         """Single-line JSON for streaming output."""
@@ -217,6 +352,35 @@ class EvaluationSummary(BaseModel):
             return 0.0
         return sum(e.wall_time_ms for e in self.episodes) / len(self.episodes)

+    @computed_field
+    @property
+    def avg_backtrack_rate(self) -> float:
+        """Average backtrack rate across episodes with reasoning metrics."""
+        with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
+        if not with_metrics:
+            return 0.0
+        return sum(e.reasoning_metrics.backtrack_rate for e in with_metrics) / len(with_metrics)  # type: ignore[union-attr]
+
+    @computed_field
+    @property
+    def avg_reasoning_overhead(self) -> float:
+        """Average reasoning overhead across episodes with reasoning metrics."""
+        with_metrics = [
+            e for e in self.episodes if e.reasoning_metrics is not None and e.reasoning_metrics.reasoning_overhead > 0
+        ]
+        if not with_metrics:
+            return 0.0
+        return sum(e.reasoning_metrics.reasoning_overhead for e in with_metrics) / len(with_metrics)  # type: ignore[union-attr]
+
+    @computed_field
+    @property
+    def avg_progress_steadiness(self) -> float:
+        """Average progress steadiness across episodes with reasoning metrics."""
+        with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
+        if not with_metrics:
+            return 0.0
+        return sum(e.reasoning_metrics.progress_steadiness for e in with_metrics) / len(with_metrics)  # type: ignore[union-attr]
+

 class TraceEvent(BaseModel):
     """A single event in an episode trace for JSONL logging."""
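Note the asymmetry in the new aggregates: `avg_reasoning_overhead` skips episodes whose overhead is 0.0 (no known optimal path), while `avg_backtrack_rate` and `avg_progress_steadiness` average over every episode that carries reasoning metrics. A quick illustration with invented per-episode values:

```python
# Invented per-episode values to illustrate the aggregation semantics.
overheads = [1.8, 0.0, 2.2]           # 0.0 means the optimal path length was unknown
backtrack_rates = [0.10, 0.00, 0.25]

known = [o for o in overheads if o > 0]
avg_reasoning_overhead = sum(known) / len(known)                  # (1.8 + 2.2) / 2 = 2.0
avg_backtrack_rate = sum(backtrack_rates) / len(backtrack_rates)  # 0.35 / 3 ≈ 0.117

print(round(avg_reasoning_overhead, 3), round(avg_backtrack_rate, 3))  # 2.0 0.117
```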