chuk_puzzles_gym-0.10-py3-none-any.whl → chuk_puzzles_gym-0.10.2-py3-none-any.whl
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- chuk_puzzles_gym/eval.py +168 -46
- chuk_puzzles_gym/export/dataset.py +7 -1
- chuk_puzzles_gym/games/_base/game.py +123 -0
- chuk_puzzles_gym/games/binary/game.py +2 -0
- chuk_puzzles_gym/games/bridges/game.py +2 -0
- chuk_puzzles_gym/games/cryptarithmetic/game.py +5 -0
- chuk_puzzles_gym/games/einstein/game.py +2 -0
- chuk_puzzles_gym/games/fillomino/game.py +2 -0
- chuk_puzzles_gym/games/futoshiki/game.py +2 -0
- chuk_puzzles_gym/games/graph_coloring/commands.py +20 -3
- chuk_puzzles_gym/games/graph_coloring/game.py +8 -1
- chuk_puzzles_gym/games/hidato/game.py +2 -0
- chuk_puzzles_gym/games/hitori/game.py +2 -0
- chuk_puzzles_gym/games/kakuro/game.py +2 -0
- chuk_puzzles_gym/games/kenken/game.py +2 -0
- chuk_puzzles_gym/games/killer_sudoku/game.py +2 -0
- chuk_puzzles_gym/games/knapsack/game.py +2 -0
- chuk_puzzles_gym/games/lights_out/game.py +2 -0
- chuk_puzzles_gym/games/logic_grid/game.py +2 -0
- chuk_puzzles_gym/games/mastermind/game.py +2 -0
- chuk_puzzles_gym/games/minesweeper/game.py +2 -0
- chuk_puzzles_gym/games/nonogram/game.py +2 -0
- chuk_puzzles_gym/games/nqueens/game.py +5 -0
- chuk_puzzles_gym/games/numberlink/game.py +6 -0
- chuk_puzzles_gym/games/nurikabe/game.py +2 -0
- chuk_puzzles_gym/games/rush_hour/game.py +4 -0
- chuk_puzzles_gym/games/scheduler/game.py +2 -0
- chuk_puzzles_gym/games/shikaku/game.py +2 -0
- chuk_puzzles_gym/games/skyscrapers/game.py +5 -0
- chuk_puzzles_gym/games/slitherlink/game.py +2 -0
- chuk_puzzles_gym/games/sokoban/game.py +2 -0
- chuk_puzzles_gym/games/star_battle/game.py +2 -0
- chuk_puzzles_gym/games/sudoku/game.py +2 -0
- chuk_puzzles_gym/games/tents/game.py +2 -0
- chuk_puzzles_gym/gym_env.py +21 -5
- chuk_puzzles_gym/models/__init__.py +2 -0
- chuk_puzzles_gym/models/evaluation.py +165 -1
- chuk_puzzles_gym/server.py +51 -72
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/METADATA +124 -7
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/RECORD +43 -43
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/WHEEL +0 -0
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/entry_points.txt +0 -0
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/top_level.txt +0 -0
chuk_puzzles_gym/eval.py
CHANGED
|
@@ -25,7 +25,7 @@ import sys
|
|
|
25
25
|
import time
|
|
26
26
|
from dataclasses import dataclass, field
|
|
27
27
|
from datetime import datetime
|
|
28
|
-
from typing import TYPE_CHECKING
|
|
28
|
+
from typing import TYPE_CHECKING, Any
|
|
29
29
|
|
|
30
30
|
if TYPE_CHECKING:
|
|
31
31
|
pass
|
|
@@ -100,8 +100,36 @@ class EvaluationReport:
|
|
|
100
100
|
return 0.0
|
|
101
101
|
return sum(e.hints_used for e in self.episodes) / self.total_episodes
|
|
102
102
|
|
|
103
|
+
@property
|
|
104
|
+
def avg_backtrack_rate(self) -> float:
|
|
105
|
+
"""Average backtrack rate across episodes with reasoning metrics."""
|
|
106
|
+
with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
|
|
107
|
+
if not with_metrics:
|
|
108
|
+
return 0.0
|
|
109
|
+
return sum(e.reasoning_metrics.backtrack_rate for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
|
|
110
|
+
|
|
111
|
+
@property
|
|
112
|
+
def avg_reasoning_overhead(self) -> float:
|
|
113
|
+
"""Average reasoning overhead across episodes with reasoning metrics."""
|
|
114
|
+
with_metrics = [
|
|
115
|
+
e for e in self.episodes if e.reasoning_metrics is not None and e.reasoning_metrics.reasoning_overhead > 0
|
|
116
|
+
]
|
|
117
|
+
if not with_metrics:
|
|
118
|
+
return 0.0
|
|
119
|
+
return sum(e.reasoning_metrics.reasoning_overhead for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
|
|
120
|
+
|
|
121
|
+
@property
|
|
122
|
+
def avg_progress_steadiness(self) -> float:
|
|
123
|
+
"""Average progress steadiness across episodes with reasoning metrics."""
|
|
124
|
+
with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
|
|
125
|
+
if not with_metrics:
|
|
126
|
+
return 0.0
|
|
127
|
+
return sum(e.reasoning_metrics.progress_steadiness for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
|
|
128
|
+
|
|
103
129
|
def to_markdown(self) -> str:
|
|
104
130
|
"""Generate markdown report."""
|
|
131
|
+
has_reasoning = any(e.reasoning_metrics is not None for e in self.episodes)
|
|
132
|
+
|
|
105
133
|
lines = [
|
|
106
134
|
f"# {self.game.title()} {self.difficulty.title()} Evaluation",
|
|
107
135
|
"",
|
|
@@ -112,24 +140,78 @@ class EvaluationReport:
|
|
|
112
140
|
f"**Avg Hints:** {self.avg_hints:.1f}",
|
|
113
141
|
f"**Avg Efficiency:** {self.avg_efficiency:.1%}",
|
|
114
142
|
f"**Avg Time:** {self.avg_time_ms:.0f}ms",
|
|
115
|
-
"",
|
|
116
|
-
f"**Solver Config:** {'solver-free' if not self.solver_config.solver_allowed else f'budget={self.solver_config.hint_budget}, penalty={self.solver_config.hint_penalty}'}",
|
|
117
|
-
"",
|
|
118
|
-
"## Episode Details",
|
|
119
|
-
"",
|
|
120
|
-
"| Seed | Status | Steps | Invalid | Hints | Efficiency | Time (ms) |",
|
|
121
|
-
"|------|--------|-------|---------|-------|------------|-----------|",
|
|
122
143
|
]
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
144
|
+
|
|
145
|
+
if has_reasoning:
|
|
146
|
+
lines.extend(
|
|
147
|
+
[
|
|
148
|
+
"",
|
|
149
|
+
"### Reasoning Depth",
|
|
150
|
+
f"**Avg Backtrack Rate:** {self.avg_backtrack_rate:.1%}",
|
|
151
|
+
f"**Avg Reasoning Overhead:** {self.avg_reasoning_overhead:.2f}x",
|
|
152
|
+
f"**Avg Progress Steadiness:** {self.avg_progress_steadiness:.1%}",
|
|
153
|
+
]
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
lines.extend(
|
|
157
|
+
[
|
|
158
|
+
"",
|
|
159
|
+
f"**Solver Config:** {'solver-free' if not self.solver_config.solver_allowed else f'budget={self.solver_config.hint_budget}, penalty={self.solver_config.hint_penalty}'}",
|
|
160
|
+
"",
|
|
161
|
+
"## Episode Details",
|
|
162
|
+
"",
|
|
163
|
+
]
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
if has_reasoning:
|
|
167
|
+
lines.append(
|
|
168
|
+
"| Seed | Status | Steps | Invalid | Hints | Efficiency | Backtracks | Steadiness | Time (ms) |"
|
|
169
|
+
)
|
|
126
170
|
lines.append(
|
|
127
|
-
|
|
171
|
+
"|------|--------|-------|---------|-------|------------|------------|------------|-----------|"
|
|
128
172
|
)
|
|
173
|
+
for e in self.episodes:
|
|
174
|
+
status = "solved" if e.success else e.status.value
|
|
175
|
+
eff = f"{e.efficiency_score:.0%}" if e.success else "-"
|
|
176
|
+
bt = str(e.reasoning_metrics.backtrack_count) if e.reasoning_metrics else "-"
|
|
177
|
+
st = f"{e.reasoning_metrics.progress_steadiness:.0%}" if e.reasoning_metrics else "-"
|
|
178
|
+
lines.append(
|
|
179
|
+
f"| {e.seed} | {status} | {e.steps_taken} | {e.invalid_actions} | {e.hints_used} | {eff} | {bt} | {st} | {e.wall_time_ms} |"
|
|
180
|
+
)
|
|
181
|
+
else:
|
|
182
|
+
lines.append("| Seed | Status | Steps | Invalid | Hints | Efficiency | Time (ms) |")
|
|
183
|
+
lines.append("|------|--------|-------|---------|-------|------------|-----------|")
|
|
184
|
+
for e in self.episodes:
|
|
185
|
+
status = "solved" if e.success else e.status.value
|
|
186
|
+
eff = f"{e.efficiency_score:.0%}" if e.success else "-"
|
|
187
|
+
lines.append(
|
|
188
|
+
f"| {e.seed} | {status} | {e.steps_taken} | {e.invalid_actions} | {e.hints_used} | {eff} | {e.wall_time_ms} |"
|
|
189
|
+
)
|
|
190
|
+
|
|
129
191
|
return "\n".join(lines)
|
|
130
192
|
|
|
131
193
|
def to_json(self) -> str:
|
|
132
194
|
"""Generate JSON report."""
|
|
195
|
+
summary: dict[str, Any] = {
|
|
196
|
+
"total_episodes": self.total_episodes,
|
|
197
|
+
"solved_count": self.solved_count,
|
|
198
|
+
"solve_rate": self.solve_rate,
|
|
199
|
+
"avg_steps": self.avg_moves,
|
|
200
|
+
"avg_invalid": self.avg_invalid_moves,
|
|
201
|
+
"avg_hints": self.avg_hints,
|
|
202
|
+
"avg_efficiency": self.avg_efficiency,
|
|
203
|
+
"avg_time_ms": self.avg_time_ms,
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
# Add aggregate reasoning metrics if available
|
|
207
|
+
has_reasoning = any(e.reasoning_metrics is not None for e in self.episodes)
|
|
208
|
+
if has_reasoning:
|
|
209
|
+
summary["reasoning"] = {
|
|
210
|
+
"avg_backtrack_rate": round(self.avg_backtrack_rate, 3),
|
|
211
|
+
"avg_reasoning_overhead": round(self.avg_reasoning_overhead, 3),
|
|
212
|
+
"avg_progress_steadiness": round(self.avg_progress_steadiness, 3),
|
|
213
|
+
}
|
|
214
|
+
|
|
133
215
|
return json.dumps(
|
|
134
216
|
{
|
|
135
217
|
"game": self.game,
|
|
@@ -139,16 +221,7 @@ class EvaluationReport:
|
|
|
139
221
|
"hint_budget": self.solver_config.hint_budget,
|
|
140
222
|
"hint_penalty": self.solver_config.hint_penalty,
|
|
141
223
|
},
|
|
142
|
-
"summary":
|
|
143
|
-
"total_episodes": self.total_episodes,
|
|
144
|
-
"solved_count": self.solved_count,
|
|
145
|
-
"solve_rate": self.solve_rate,
|
|
146
|
-
"avg_steps": self.avg_moves,
|
|
147
|
-
"avg_invalid": self.avg_invalid_moves,
|
|
148
|
-
"avg_hints": self.avg_hints,
|
|
149
|
-
"avg_efficiency": self.avg_efficiency,
|
|
150
|
-
"avg_time_ms": self.avg_time_ms,
|
|
151
|
-
},
|
|
224
|
+
"summary": summary,
|
|
152
225
|
"episodes": [e.to_summary_dict() for e in self.episodes],
|
|
153
226
|
},
|
|
154
227
|
indent=2,
|
|
@@ -158,35 +231,61 @@ class EvaluationReport:
|
|
|
158
231
|
"""Generate CSV report."""
|
|
159
232
|
import io
|
|
160
233
|
|
|
234
|
+
has_reasoning = any(e.reasoning_metrics is not None for e in self.episodes)
|
|
235
|
+
|
|
161
236
|
output = io.StringIO()
|
|
162
237
|
writer = csv.writer(output)
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
writer.writerow(
|
|
238
|
+
|
|
239
|
+
header = [
|
|
240
|
+
"game",
|
|
241
|
+
"difficulty",
|
|
242
|
+
"seed",
|
|
243
|
+
"status",
|
|
244
|
+
"steps_taken",
|
|
245
|
+
"invalid_actions",
|
|
246
|
+
"hints_used",
|
|
247
|
+
"efficiency",
|
|
248
|
+
"wall_time_ms",
|
|
249
|
+
]
|
|
250
|
+
if has_reasoning:
|
|
251
|
+
header.extend(
|
|
178
252
|
[
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
e.invalid_actions,
|
|
185
|
-
e.hints_used,
|
|
186
|
-
f"{e.efficiency_score:.3f}",
|
|
187
|
-
e.wall_time_ms,
|
|
253
|
+
"backtrack_count",
|
|
254
|
+
"backtrack_rate",
|
|
255
|
+
"reasoning_overhead",
|
|
256
|
+
"progress_steadiness",
|
|
257
|
+
"error_streak_max",
|
|
188
258
|
]
|
|
189
259
|
)
|
|
260
|
+
writer.writerow(header)
|
|
261
|
+
|
|
262
|
+
for e in self.episodes:
|
|
263
|
+
row = [
|
|
264
|
+
e.game,
|
|
265
|
+
e.difficulty.value,
|
|
266
|
+
e.seed,
|
|
267
|
+
e.status.value,
|
|
268
|
+
e.steps_taken,
|
|
269
|
+
e.invalid_actions,
|
|
270
|
+
e.hints_used,
|
|
271
|
+
f"{e.efficiency_score:.3f}",
|
|
272
|
+
e.wall_time_ms,
|
|
273
|
+
]
|
|
274
|
+
if has_reasoning:
|
|
275
|
+
rm = e.reasoning_metrics
|
|
276
|
+
if rm is not None:
|
|
277
|
+
row.extend(
|
|
278
|
+
[
|
|
279
|
+
rm.backtrack_count,
|
|
280
|
+
f"{rm.backtrack_rate:.3f}",
|
|
281
|
+
f"{rm.reasoning_overhead:.3f}",
|
|
282
|
+
f"{rm.progress_steadiness:.3f}",
|
|
283
|
+
rm.error_streak_max,
|
|
284
|
+
]
|
|
285
|
+
)
|
|
286
|
+
else:
|
|
287
|
+
row.extend(["", "", "", "", ""])
|
|
288
|
+
writer.writerow(row)
|
|
190
289
|
return output.getvalue()
|
|
191
290
|
|
|
192
291
|
def print_summary(self) -> None:
|
|
@@ -206,6 +305,15 @@ class EvaluationReport:
|
|
|
206
305
|
print(f"Avg Efficiency: {self.avg_efficiency:.1%}")
|
|
207
306
|
print(f"Avg Time: {self.avg_time_ms:.0f}ms")
|
|
208
307
|
|
|
308
|
+
# Reasoning depth metrics
|
|
309
|
+
has_reasoning = any(e.reasoning_metrics is not None for e in self.episodes)
|
|
310
|
+
if has_reasoning:
|
|
311
|
+
print("-" * 40)
|
|
312
|
+
print("Reasoning Depth:")
|
|
313
|
+
print(f" Backtrack Rate: {self.avg_backtrack_rate:.1%}")
|
|
314
|
+
print(f" Reasoning Overhead: {self.avg_reasoning_overhead:.2f}x")
|
|
315
|
+
print(f" Progress Steadiness: {self.avg_progress_steadiness:.1%}")
|
|
316
|
+
|
|
209
317
|
|
|
210
318
|
async def _apply_hint(game: PuzzleGame, hint_data: tuple) -> MoveResult:
|
|
211
319
|
"""Apply a hint to the game based on game type.
|
|
@@ -433,15 +541,22 @@ async def run_episode(
|
|
|
433
541
|
# Apply the hint based on game type
|
|
434
542
|
try:
|
|
435
543
|
result = await _apply_hint(game, hint_data)
|
|
544
|
+
# Normalize hint_data to a tuple for position tracking
|
|
545
|
+
position = hint_data if isinstance(hint_data, tuple) else (hint_data,)
|
|
436
546
|
if result.success:
|
|
437
547
|
steps_taken += 1
|
|
548
|
+
# Use game's dynamic optimal_steps (reflects current state)
|
|
549
|
+
remaining = game.optimal_steps or 0
|
|
550
|
+
game.reasoning_tracker.record_valid_move(position, remaining)
|
|
438
551
|
else:
|
|
439
552
|
invalid_actions += 1
|
|
553
|
+
game.reasoning_tracker.record_invalid_move()
|
|
440
554
|
# If we get too many consecutive invalid moves, break
|
|
441
555
|
if invalid_actions > 50:
|
|
442
556
|
break
|
|
443
557
|
except (TypeError, ValueError, AttributeError, IndexError):
|
|
444
558
|
invalid_actions += 1
|
|
559
|
+
game.reasoning_tracker.record_invalid_move()
|
|
445
560
|
if invalid_actions > 50:
|
|
446
561
|
break
|
|
447
562
|
elif not use_hints:
|
|
@@ -461,6 +576,12 @@ async def run_episode(
|
|
|
461
576
|
# Get retries from game if tracked
|
|
462
577
|
retries = getattr(game, "retries", 0)
|
|
463
578
|
|
|
579
|
+
# Collect reasoning depth metrics (use pre-solve optimal_steps since
|
|
580
|
+
# the game's optimal_steps may be 0 after solving)
|
|
581
|
+
reasoning_metrics = game.reasoning_tracker.to_metrics(
|
|
582
|
+
optimal_path_length=optimal_steps if optimal_steps and optimal_steps >= 1 else None,
|
|
583
|
+
)
|
|
584
|
+
|
|
464
585
|
return EpisodeResult(
|
|
465
586
|
game=game.name,
|
|
466
587
|
difficulty=DifficultyLevel(difficulty),
|
|
@@ -475,6 +596,7 @@ async def run_episode(
|
|
|
475
596
|
retries=retries,
|
|
476
597
|
optimal_steps=optimal_steps,
|
|
477
598
|
solver_config=solver_config,
|
|
599
|
+
reasoning_metrics=reasoning_metrics,
|
|
478
600
|
)
|
|
479
601
|
|
|
480
602
|
|
|
@@ -190,6 +190,12 @@ class DatasetExporter:
|
|
|
190
190
|
if canonical:
|
|
191
191
|
gold_answer = str(canonical)
|
|
192
192
|
|
|
193
|
+
# Build reasoning tags from complexity profile
|
|
194
|
+
complexity_profile = game.complexity_profile
|
|
195
|
+
reasoning_type = complexity_profile.get("reasoning_type", "deductive")
|
|
196
|
+
search_space = complexity_profile.get("search_space", "medium")
|
|
197
|
+
tags = [domain, difficulty.value, f"reasoning:{reasoning_type}", f"search:{search_space}"]
|
|
198
|
+
|
|
193
199
|
# Create Problem using core schema
|
|
194
200
|
return Problem(
|
|
195
201
|
# Identity
|
|
@@ -214,7 +220,7 @@ class DatasetExporter:
|
|
|
214
220
|
),
|
|
215
221
|
# Metadata
|
|
216
222
|
operation_count=game.optimal_steps,
|
|
217
|
-
tags=
|
|
223
|
+
tags=tags,
|
|
218
224
|
)
|
|
219
225
|
|
|
220
226
|
@property
|
|
@@ -1,10 +1,99 @@
|
|
|
1
1
|
"""Abstract base class for all puzzle games."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import random
|
|
4
6
|
from abc import ABC, abstractmethod
|
|
5
7
|
from typing import Any
|
|
6
8
|
|
|
7
9
|
from ...models import DifficultyLevel, DifficultyProfile, MoveResult, SolverConfig
|
|
10
|
+
from ...models.evaluation import ReasoningMetrics
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ReasoningTracker:
|
|
14
|
+
"""Tracks reasoning depth metrics during puzzle gameplay.
|
|
15
|
+
|
|
16
|
+
Accumulates data about backtrack behavior, solver distance progression,
|
|
17
|
+
and error patterns. Produces a ReasoningMetrics snapshot on demand.
|
|
18
|
+
|
|
19
|
+
This is a lightweight, non-Pydantic class meant to be mutated during play.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
__slots__ = (
|
|
23
|
+
"_placed_positions",
|
|
24
|
+
"_solver_distance_trace",
|
|
25
|
+
"_backtrack_count",
|
|
26
|
+
"_consecutive_errors",
|
|
27
|
+
"_error_streaks",
|
|
28
|
+
"_max_error_streak",
|
|
29
|
+
"_total_actions",
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
def __init__(self) -> None:
|
|
33
|
+
self._placed_positions: set[tuple[Any, ...]] = set()
|
|
34
|
+
self._solver_distance_trace: list[int] = []
|
|
35
|
+
self._backtrack_count: int = 0
|
|
36
|
+
self._consecutive_errors: int = 0
|
|
37
|
+
self._error_streaks: list[int] = []
|
|
38
|
+
self._max_error_streak: int = 0
|
|
39
|
+
self._total_actions: int = 0
|
|
40
|
+
|
|
41
|
+
def record_valid_move(self, position: tuple[Any, ...], remaining_count: int) -> None:
|
|
42
|
+
"""Record a valid (successful) move.
|
|
43
|
+
|
|
44
|
+
Args:
|
|
45
|
+
position: The position/target of the move (for backtrack detection)
|
|
46
|
+
remaining_count: How many positions remain to be filled after this move
|
|
47
|
+
"""
|
|
48
|
+
self._total_actions += 1
|
|
49
|
+
|
|
50
|
+
# Detect backtrack: placing at a position already placed before
|
|
51
|
+
if position in self._placed_positions:
|
|
52
|
+
self._backtrack_count += 1
|
|
53
|
+
self._placed_positions.add(position)
|
|
54
|
+
|
|
55
|
+
self._solver_distance_trace.append(remaining_count)
|
|
56
|
+
|
|
57
|
+
# Finalize any pending error streak
|
|
58
|
+
if self._consecutive_errors > 0:
|
|
59
|
+
self._error_streaks.append(self._consecutive_errors)
|
|
60
|
+
self._consecutive_errors = 0
|
|
61
|
+
|
|
62
|
+
def record_invalid_move(self) -> None:
|
|
63
|
+
"""Record an invalid (failed) move."""
|
|
64
|
+
self._total_actions += 1
|
|
65
|
+
self._consecutive_errors += 1
|
|
66
|
+
self._max_error_streak = max(self._max_error_streak, self._consecutive_errors)
|
|
67
|
+
|
|
68
|
+
def to_metrics(self, optimal_path_length: int | None = None) -> ReasoningMetrics:
|
|
69
|
+
"""Produce a frozen ReasoningMetrics snapshot.
|
|
70
|
+
|
|
71
|
+
Args:
|
|
72
|
+
optimal_path_length: Minimum steps to solve (from solver), if known.
|
|
73
|
+
"""
|
|
74
|
+
# Finalize any pending error streak
|
|
75
|
+
error_streaks = list(self._error_streaks)
|
|
76
|
+
if self._consecutive_errors > 0:
|
|
77
|
+
error_streaks.append(self._consecutive_errors)
|
|
78
|
+
|
|
79
|
+
return ReasoningMetrics(
|
|
80
|
+
backtrack_count=self._backtrack_count,
|
|
81
|
+
solver_distance_trace=list(self._solver_distance_trace),
|
|
82
|
+
error_streak_max=self._max_error_streak,
|
|
83
|
+
error_streaks=error_streaks,
|
|
84
|
+
total_actions=self._total_actions,
|
|
85
|
+
optimal_path_length=optimal_path_length,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
def reset(self) -> None:
|
|
89
|
+
"""Reset all tracked state."""
|
|
90
|
+
self._placed_positions.clear()
|
|
91
|
+
self._solver_distance_trace.clear()
|
|
92
|
+
self._backtrack_count = 0
|
|
93
|
+
self._consecutive_errors = 0
|
|
94
|
+
self._error_streaks.clear()
|
|
95
|
+
self._max_error_streak = 0
|
|
96
|
+
self._total_actions = 0
|
|
8
97
|
|
|
9
98
|
|
|
10
99
|
class PuzzleGame(ABC):
|
|
@@ -64,6 +153,9 @@ class PuzzleGame(ABC):
|
|
|
64
153
|
self.game_started = False
|
|
65
154
|
self._last_move_position: tuple[Any, ...] | None = None # For retry detection
|
|
66
155
|
|
|
156
|
+
# Reasoning depth tracker
|
|
157
|
+
self._reasoning_tracker = ReasoningTracker()
|
|
158
|
+
|
|
67
159
|
@abstractmethod
|
|
68
160
|
async def generate_puzzle(self) -> None:
|
|
69
161
|
"""Generate a new puzzle with a unique solution.
|
|
@@ -162,8 +254,11 @@ class PuzzleGame(ABC):
|
|
|
162
254
|
"""
|
|
163
255
|
if success:
|
|
164
256
|
self.moves_made += 1
|
|
257
|
+
remaining = self._compute_remaining()
|
|
258
|
+
self._reasoning_tracker.record_valid_move(position, remaining)
|
|
165
259
|
else:
|
|
166
260
|
self.invalid_moves += 1
|
|
261
|
+
self._reasoning_tracker.record_invalid_move()
|
|
167
262
|
|
|
168
263
|
# Detect retries (same position attempted again)
|
|
169
264
|
if self._last_move_position == position:
|
|
@@ -183,6 +278,34 @@ class PuzzleGame(ABC):
|
|
|
183
278
|
self.hints_used += 1
|
|
184
279
|
return True
|
|
185
280
|
|
|
281
|
+
def _compute_remaining(self) -> int:
|
|
282
|
+
"""Compute how many positions remain to be filled.
|
|
283
|
+
|
|
284
|
+
Uses optimal_steps directly since it is typically dynamic
|
|
285
|
+
(reflects current game state, e.g. counting empty cells).
|
|
286
|
+
Override in subclasses for more accurate tracking.
|
|
287
|
+
"""
|
|
288
|
+
return self.optimal_steps or 0
|
|
289
|
+
|
|
290
|
+
def get_reasoning_metrics(self) -> ReasoningMetrics:
|
|
291
|
+
"""Get a snapshot of reasoning depth metrics for the current episode.
|
|
292
|
+
|
|
293
|
+
Returns:
|
|
294
|
+
Frozen ReasoningMetrics with all tracked data.
|
|
295
|
+
"""
|
|
296
|
+
optimal = self.optimal_steps
|
|
297
|
+
# optimal_path_length requires ge=1; treat 0 or negative as unknown
|
|
298
|
+
if optimal is not None and optimal < 1:
|
|
299
|
+
optimal = None
|
|
300
|
+
return self._reasoning_tracker.to_metrics(
|
|
301
|
+
optimal_path_length=optimal,
|
|
302
|
+
)
|
|
303
|
+
|
|
304
|
+
@property
|
|
305
|
+
def reasoning_tracker(self) -> ReasoningTracker:
|
|
306
|
+
"""Access the reasoning tracker directly."""
|
|
307
|
+
return self._reasoning_tracker
|
|
308
|
+
|
|
186
309
|
def can_use_hint(self) -> bool:
|
|
187
310
|
"""Check if hints are available without consuming one.
|
|
188
311
|
|
|
@@ -358,6 +358,8 @@ class BinaryPuzzleGame(PuzzleGame):
|
|
|
358
358
|
Returns:
|
|
359
359
|
Tuple of (hint_data, hint_message) or None if puzzle is complete
|
|
360
360
|
"""
|
|
361
|
+
if not self.can_use_hint():
|
|
362
|
+
return None
|
|
361
363
|
empty_cells = [(r, c) for r in range(self.size) for c in range(self.size) if self.grid[r][c] == -1]
|
|
362
364
|
if not empty_cells:
|
|
363
365
|
return None
|
|
@@ -415,6 +415,8 @@ class BridgesGame(PuzzleGame):
|
|
|
415
415
|
|
|
416
416
|
async def get_hint(self) -> tuple[Any, str] | None:
|
|
417
417
|
"""Get a hint for the next move."""
|
|
418
|
+
if not self.can_use_hint():
|
|
419
|
+
return None
|
|
418
420
|
# Find a bridge in the solution that's not yet placed correctly
|
|
419
421
|
for bridge_key, solution_count in self.solution.items():
|
|
420
422
|
current_count = self.bridges.get(bridge_key, 0)
|
|
@@ -362,6 +362,11 @@ class CryptarithmeticGame(PuzzleGame):
|
|
|
362
362
|
|
|
363
363
|
return "\n".join(lines)
|
|
364
364
|
|
|
365
|
+
def get_stats(self) -> str:
|
|
366
|
+
"""Get current game statistics."""
|
|
367
|
+
assigned = sum(1 for v in self.player_mapping.values() if v is not None)
|
|
368
|
+
return f"Moves: {self.moves_made} | Assigned: {assigned}/{len(self.letters)} | Seed: {self.seed}"
|
|
369
|
+
|
|
365
370
|
def get_rules(self) -> str:
|
|
366
371
|
return (
|
|
367
372
|
"CRYPTARITHMETIC\n"
|
|
@@ -281,6 +281,8 @@ class EinsteinGame(PuzzleGame):
|
|
|
281
281
|
Returns:
|
|
282
282
|
Tuple of (hint_data, hint_message) or None
|
|
283
283
|
"""
|
|
284
|
+
if not self.can_use_hint():
|
|
285
|
+
return None
|
|
284
286
|
# Find first unassigned attribute in solution
|
|
285
287
|
for i in range(self.num_houses):
|
|
286
288
|
for attr in ATTRIBUTES:
|
|
@@ -435,6 +435,8 @@ class FillominoGame(PuzzleGame):
|
|
|
435
435
|
Returns:
|
|
436
436
|
Tuple of (hint_data, hint_message) or None if puzzle is complete
|
|
437
437
|
"""
|
|
438
|
+
if not self.can_use_hint():
|
|
439
|
+
return None
|
|
438
440
|
# Find an empty cell
|
|
439
441
|
for r in range(self.size):
|
|
440
442
|
for c in range(self.size):
|
|
@@ -288,6 +288,8 @@ class FutoshikiGame(PuzzleGame):
|
|
|
288
288
|
Returns:
|
|
289
289
|
Tuple of (hint_data, hint_message) or None if puzzle is complete
|
|
290
290
|
"""
|
|
291
|
+
if not self.can_use_hint():
|
|
292
|
+
return None
|
|
291
293
|
empty_cells = [(r, c) for r in range(self.size) for c in range(self.size) if self.grid[r][c] == 0]
|
|
292
294
|
if not empty_cells:
|
|
293
295
|
return None
|
|
@@ -4,10 +4,14 @@ from typing import TYPE_CHECKING
|
|
|
4
4
|
|
|
5
5
|
from ...models import GameCommand, MoveResult
|
|
6
6
|
from .._base import CommandResult, GameCommandHandler
|
|
7
|
+
from .game import COLOR_NAMES
|
|
7
8
|
|
|
8
9
|
if TYPE_CHECKING:
|
|
9
10
|
from .game import GraphColoringGame
|
|
10
11
|
|
|
12
|
+
# Build a lookup from lowercase color name to color number
|
|
13
|
+
_COLOR_NAME_TO_NUM = {name.lower(): i + 1 for i, name in enumerate(COLOR_NAMES)}
|
|
14
|
+
|
|
11
15
|
|
|
12
16
|
class GraphColoringCommandHandler(GameCommandHandler):
|
|
13
17
|
"""Handles commands for Graph Coloring game."""
|
|
@@ -36,6 +40,16 @@ class GraphColoringCommandHandler(GameCommandHandler):
|
|
|
36
40
|
else:
|
|
37
41
|
return self.error_result(f"Unknown command: {cmd}")
|
|
38
42
|
|
|
43
|
+
def _parse_color(self, value: str) -> int | None:
|
|
44
|
+
"""Parse a color argument as either an integer or a color name."""
|
|
45
|
+
# Try integer first
|
|
46
|
+
try:
|
|
47
|
+
return int(value)
|
|
48
|
+
except ValueError:
|
|
49
|
+
pass
|
|
50
|
+
# Try color name lookup
|
|
51
|
+
return _COLOR_NAME_TO_NUM.get(value.lower())
|
|
52
|
+
|
|
39
53
|
async def _handle_place(self, args: list[str]) -> CommandResult:
|
|
40
54
|
"""Handle the PLACE command: place <node> <color>."""
|
|
41
55
|
if len(args) != 2:
|
|
@@ -45,10 +59,13 @@ class GraphColoringCommandHandler(GameCommandHandler):
|
|
|
45
59
|
)
|
|
46
60
|
|
|
47
61
|
node = self.parse_int(args[0], "node")
|
|
48
|
-
color = self.
|
|
62
|
+
color = self._parse_color(args[1])
|
|
49
63
|
|
|
50
|
-
if node is None
|
|
51
|
-
return self.error_result("Node
|
|
64
|
+
if node is None:
|
|
65
|
+
return self.error_result("Node must be an integer.")
|
|
66
|
+
if color is None:
|
|
67
|
+
valid = ", ".join(f"{i + 1}={COLOR_NAMES[i]}" for i in range(self.game.num_colors))
|
|
68
|
+
return self.error_result(f"Invalid color. Use a number or name: {valid}")
|
|
52
69
|
|
|
53
70
|
result = await self.game.validate_move(node, color)
|
|
54
71
|
|
|
@@ -289,6 +289,11 @@ class GraphColoringGame(PuzzleGame):
|
|
|
289
289
|
|
|
290
290
|
return "\n".join(lines)
|
|
291
291
|
|
|
292
|
+
def get_stats(self) -> str:
|
|
293
|
+
"""Get current game statistics."""
|
|
294
|
+
colored = sum(1 for n in range(1, self.num_nodes + 1) if self.coloring.get(n, 0) > 0)
|
|
295
|
+
return f"Moves: {self.moves_made} | Colored: {colored}/{self.num_nodes} | Edges: {len(self.edges)} | Seed: {self.seed}"
|
|
296
|
+
|
|
292
297
|
def get_rules(self) -> str:
|
|
293
298
|
return (
|
|
294
299
|
f"GRAPH COLORING ({self.num_nodes} nodes, {self.num_colors} colors)\n"
|
|
@@ -298,9 +303,11 @@ class GraphColoringGame(PuzzleGame):
|
|
|
298
303
|
)
|
|
299
304
|
|
|
300
305
|
def get_commands(self) -> str:
|
|
306
|
+
color_map = ", ".join(f"{i + 1}={COLOR_NAMES[i]}" for i in range(self.num_colors))
|
|
301
307
|
return (
|
|
302
308
|
"Commands:\n"
|
|
303
|
-
f" place <node> <color> - Color a node (
|
|
309
|
+
f" place <node> <color> - Color a node (number or name)\n"
|
|
310
|
+
f" Colors: {color_map}\n"
|
|
304
311
|
" clear <node> - Remove color from a node\n"
|
|
305
312
|
" hint - Get a hint\n"
|
|
306
313
|
" check - Check if solved\n"
|
|
@@ -302,6 +302,8 @@ class HidatoGame(PuzzleGame):
|
|
|
302
302
|
Returns:
|
|
303
303
|
Tuple of (hint_data, hint_message) or None if puzzle is complete
|
|
304
304
|
"""
|
|
305
|
+
if not self.can_use_hint():
|
|
306
|
+
return None
|
|
305
307
|
# Find an empty cell
|
|
306
308
|
empty_cells = [(r, c) for r in range(self.size) for c in range(self.size) if self.grid[r][c] == 0]
|
|
307
309
|
if not empty_cells:
|
|
@@ -395,6 +395,8 @@ class HitoriGame(PuzzleGame):
|
|
|
395
395
|
|
|
396
396
|
async def get_hint(self) -> tuple[Any, str] | None:
|
|
397
397
|
"""Get a hint for the next move."""
|
|
398
|
+
if not self.can_use_hint():
|
|
399
|
+
return None
|
|
398
400
|
# Find a cell that should be shaded but isn't, or vice versa
|
|
399
401
|
for r in range(self.size):
|
|
400
402
|
for c in range(self.size):
|
|
@@ -366,6 +366,8 @@ class KenKenGame(PuzzleGame):
|
|
|
366
366
|
Returns:
|
|
367
367
|
Tuple of (hint_data, hint_message) or None if puzzle is complete
|
|
368
368
|
"""
|
|
369
|
+
if not self.can_use_hint():
|
|
370
|
+
return None
|
|
369
371
|
empty_cells = [(r, c) for r in range(self.size) for c in range(self.size) if self.grid[r][c] == 0]
|
|
370
372
|
if not empty_cells:
|
|
371
373
|
return None
|
|
@@ -395,6 +395,8 @@ class KillerSudokuGame(PuzzleGame):
|
|
|
395
395
|
Returns:
|
|
396
396
|
Tuple of (hint_data, hint_message) or None if puzzle is complete
|
|
397
397
|
"""
|
|
398
|
+
if not self.can_use_hint():
|
|
399
|
+
return None
|
|
398
400
|
empty_cells = [(r, c) for r in range(9) for c in range(9) if self.grid[r][c] == 0]
|
|
399
401
|
if not empty_cells:
|
|
400
402
|
return None
|
|
@@ -255,6 +255,8 @@ class KnapsackGame(PuzzleGame):
|
|
|
255
255
|
Returns:
|
|
256
256
|
Tuple of (hint_data, hint_message) or None
|
|
257
257
|
"""
|
|
258
|
+
if not self.can_use_hint():
|
|
259
|
+
return None
|
|
258
260
|
# Suggest selecting an item that's in the optimal solution but not selected
|
|
259
261
|
for i in range(len(self.items)):
|
|
260
262
|
if self.optimal_selection[i] and not self.selection[i]:
|