chuk-puzzles-gym 0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112):
  1. chuk_puzzles_gym/__init__.py +19 -0
  2. chuk_puzzles_gym/constants.py +9 -0
  3. chuk_puzzles_gym/eval.py +763 -0
  4. chuk_puzzles_gym/export/__init__.py +20 -0
  5. chuk_puzzles_gym/export/dataset.py +376 -0
  6. chuk_puzzles_gym/games/__init__.py +94 -0
  7. chuk_puzzles_gym/games/_base/__init__.py +6 -0
  8. chuk_puzzles_gym/games/_base/commands.py +91 -0
  9. chuk_puzzles_gym/games/_base/game.py +337 -0
  10. chuk_puzzles_gym/games/binary/__init__.py +6 -0
  11. chuk_puzzles_gym/games/binary/config.py +23 -0
  12. chuk_puzzles_gym/games/binary/game.py +434 -0
  13. chuk_puzzles_gym/games/bridges/__init__.py +6 -0
  14. chuk_puzzles_gym/games/bridges/config.py +24 -0
  15. chuk_puzzles_gym/games/bridges/game.py +489 -0
  16. chuk_puzzles_gym/games/einstein/__init__.py +6 -0
  17. chuk_puzzles_gym/games/einstein/config.py +23 -0
  18. chuk_puzzles_gym/games/einstein/constants.py +13 -0
  19. chuk_puzzles_gym/games/einstein/game.py +366 -0
  20. chuk_puzzles_gym/games/einstein/models.py +35 -0
  21. chuk_puzzles_gym/games/fillomino/__init__.py +6 -0
  22. chuk_puzzles_gym/games/fillomino/config.py +24 -0
  23. chuk_puzzles_gym/games/fillomino/game.py +516 -0
  24. chuk_puzzles_gym/games/futoshiki/__init__.py +6 -0
  25. chuk_puzzles_gym/games/futoshiki/config.py +23 -0
  26. chuk_puzzles_gym/games/futoshiki/game.py +391 -0
  27. chuk_puzzles_gym/games/hidato/__init__.py +6 -0
  28. chuk_puzzles_gym/games/hidato/config.py +24 -0
  29. chuk_puzzles_gym/games/hidato/game.py +403 -0
  30. chuk_puzzles_gym/games/hitori/__init__.py +6 -0
  31. chuk_puzzles_gym/games/hitori/config.py +23 -0
  32. chuk_puzzles_gym/games/hitori/game.py +451 -0
  33. chuk_puzzles_gym/games/kakuro/__init__.py +6 -0
  34. chuk_puzzles_gym/games/kakuro/config.py +24 -0
  35. chuk_puzzles_gym/games/kakuro/game.py +399 -0
  36. chuk_puzzles_gym/games/kenken/__init__.py +6 -0
  37. chuk_puzzles_gym/games/kenken/config.py +24 -0
  38. chuk_puzzles_gym/games/kenken/enums.py +13 -0
  39. chuk_puzzles_gym/games/kenken/game.py +486 -0
  40. chuk_puzzles_gym/games/kenken/models.py +15 -0
  41. chuk_puzzles_gym/games/killer_sudoku/__init__.py +6 -0
  42. chuk_puzzles_gym/games/killer_sudoku/config.py +23 -0
  43. chuk_puzzles_gym/games/killer_sudoku/game.py +502 -0
  44. chuk_puzzles_gym/games/killer_sudoku/models.py +15 -0
  45. chuk_puzzles_gym/games/knapsack/__init__.py +6 -0
  46. chuk_puzzles_gym/games/knapsack/config.py +24 -0
  47. chuk_puzzles_gym/games/knapsack/enums.py +10 -0
  48. chuk_puzzles_gym/games/knapsack/game.py +340 -0
  49. chuk_puzzles_gym/games/knapsack/models.py +13 -0
  50. chuk_puzzles_gym/games/lights_out/__init__.py +6 -0
  51. chuk_puzzles_gym/games/lights_out/config.py +24 -0
  52. chuk_puzzles_gym/games/lights_out/game.py +249 -0
  53. chuk_puzzles_gym/games/logic_grid/__init__.py +6 -0
  54. chuk_puzzles_gym/games/logic_grid/config.py +24 -0
  55. chuk_puzzles_gym/games/logic_grid/constants.py +12 -0
  56. chuk_puzzles_gym/games/logic_grid/game.py +333 -0
  57. chuk_puzzles_gym/games/logic_grid/models.py +24 -0
  58. chuk_puzzles_gym/games/mastermind/__init__.py +6 -0
  59. chuk_puzzles_gym/games/mastermind/config.py +25 -0
  60. chuk_puzzles_gym/games/mastermind/game.py +297 -0
  61. chuk_puzzles_gym/games/minesweeper/__init__.py +6 -0
  62. chuk_puzzles_gym/games/minesweeper/config.py +24 -0
  63. chuk_puzzles_gym/games/minesweeper/enums.py +12 -0
  64. chuk_puzzles_gym/games/minesweeper/game.py +432 -0
  65. chuk_puzzles_gym/games/nonogram/__init__.py +6 -0
  66. chuk_puzzles_gym/games/nonogram/config.py +23 -0
  67. chuk_puzzles_gym/games/nonogram/game.py +296 -0
  68. chuk_puzzles_gym/games/nurikabe/__init__.py +6 -0
  69. chuk_puzzles_gym/games/nurikabe/config.py +24 -0
  70. chuk_puzzles_gym/games/nurikabe/enums.py +14 -0
  71. chuk_puzzles_gym/games/nurikabe/game.py +586 -0
  72. chuk_puzzles_gym/games/scheduler/__init__.py +6 -0
  73. chuk_puzzles_gym/games/scheduler/config.py +25 -0
  74. chuk_puzzles_gym/games/scheduler/constants.py +15 -0
  75. chuk_puzzles_gym/games/scheduler/enums.py +10 -0
  76. chuk_puzzles_gym/games/scheduler/game.py +431 -0
  77. chuk_puzzles_gym/games/scheduler/models.py +14 -0
  78. chuk_puzzles_gym/games/shikaku/__init__.py +6 -0
  79. chuk_puzzles_gym/games/shikaku/config.py +24 -0
  80. chuk_puzzles_gym/games/shikaku/game.py +419 -0
  81. chuk_puzzles_gym/games/slitherlink/__init__.py +6 -0
  82. chuk_puzzles_gym/games/slitherlink/config.py +23 -0
  83. chuk_puzzles_gym/games/slitherlink/game.py +386 -0
  84. chuk_puzzles_gym/games/sokoban/__init__.py +6 -0
  85. chuk_puzzles_gym/games/sokoban/config.py +24 -0
  86. chuk_puzzles_gym/games/sokoban/game.py +671 -0
  87. chuk_puzzles_gym/games/star_battle/__init__.py +6 -0
  88. chuk_puzzles_gym/games/star_battle/config.py +24 -0
  89. chuk_puzzles_gym/games/star_battle/game.py +390 -0
  90. chuk_puzzles_gym/games/sudoku/__init__.py +7 -0
  91. chuk_puzzles_gym/games/sudoku/commands.py +96 -0
  92. chuk_puzzles_gym/games/sudoku/config.py +22 -0
  93. chuk_puzzles_gym/games/sudoku/game.py +328 -0
  94. chuk_puzzles_gym/games/tents/__init__.py +6 -0
  95. chuk_puzzles_gym/games/tents/config.py +24 -0
  96. chuk_puzzles_gym/games/tents/game.py +416 -0
  97. chuk_puzzles_gym/gym_env.py +465 -0
  98. chuk_puzzles_gym/models/__init__.py +47 -0
  99. chuk_puzzles_gym/models/base.py +30 -0
  100. chuk_puzzles_gym/models/config.py +11 -0
  101. chuk_puzzles_gym/models/enums.py +104 -0
  102. chuk_puzzles_gym/models/evaluation.py +487 -0
  103. chuk_puzzles_gym/models/games.py +12 -0
  104. chuk_puzzles_gym/server.py +1171 -0
  105. chuk_puzzles_gym/trace/__init__.py +10 -0
  106. chuk_puzzles_gym/trace/generator.py +726 -0
  107. chuk_puzzles_gym/utils/__init__.py +4 -0
  108. chuk_puzzles_gym-0.9.dist-info/METADATA +1471 -0
  109. chuk_puzzles_gym-0.9.dist-info/RECORD +112 -0
  110. chuk_puzzles_gym-0.9.dist-info/WHEEL +5 -0
  111. chuk_puzzles_gym-0.9.dist-info/entry_points.txt +4 -0
  112. chuk_puzzles_gym-0.9.dist-info/top_level.txt +1 -0
@@ -0,0 +1,763 @@
1
+ """
2
+ Evaluation harness for puzzle-arcade-server.
3
+
4
+ Run benchmarks against puzzle games and collect metrics.
5
+
6
+ Usage:
7
+ puzzle-arcade-eval sudoku --difficulty medium --episodes 10
8
+ puzzle-arcade-eval --all --difficulty easy --episodes 5
9
+ puzzle-arcade-eval kenken --seeds 1,2,3,4,5
10
+
11
+ # Solver-free mode (pure model reasoning)
12
+ puzzle-arcade-eval sudoku --solver-free
13
+
14
+ # Solver-assisted with budget
15
+ puzzle-arcade-eval sudoku --hint-budget 10 --hint-penalty 0.1
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import asyncio
22
+ import csv
23
+ import json
24
+ import sys
25
+ import time
26
+ from dataclasses import dataclass, field
27
+ from datetime import datetime
28
+ from typing import TYPE_CHECKING
29
+
30
+ if TYPE_CHECKING:
31
+ pass
32
+
33
+ from .games import AVAILABLE_GAMES
34
+ from .games._base import PuzzleGame
35
+ from .models import (
36
+ DifficultyLevel,
37
+ EpisodeResult,
38
+ EpisodeStatus,
39
+ MoveResult,
40
+ SolverConfig,
41
+ )
42
+
43
+
44
@dataclass
class EvaluationReport:
    """Collected episode results for one game/difficulty evaluation run.

    Summary statistics are derived from ``episodes`` on demand, and the
    report can render itself as markdown, JSON, CSV, or plain text.
    Intended as a backwards-compatible companion to the Pydantic
    ``EvaluationSummary`` model (defined elsewhere, not visible here).
    """

    game: str
    difficulty: str
    solver_config: SolverConfig = field(default_factory=SolverConfig)
    episodes: list[EpisodeResult] = field(default_factory=list)

    def _episode_mean(self, attribute: str) -> float:
        """Return the mean of ``attribute`` over all episodes (0.0 if none)."""
        if not self.episodes:
            return 0.0
        total = sum(getattr(ep, attribute) for ep in self.episodes)
        return total / self.total_episodes

    @property
    def total_episodes(self) -> int:
        """Number of episodes recorded in this report."""
        return len(self.episodes)

    @property
    def solved_count(self) -> int:
        """Number of episodes whose ``success`` flag is truthy."""
        return len([ep for ep in self.episodes if ep.success])

    @property
    def solve_rate(self) -> float:
        """Fraction of episodes solved; 0.0 when there are no episodes."""
        return self.solved_count / self.total_episodes if self.episodes else 0.0

    @property
    def avg_moves(self) -> float:
        """Mean ``steps_taken`` per episode."""
        return self._episode_mean("steps_taken")

    @property
    def avg_invalid_moves(self) -> float:
        """Mean ``invalid_actions`` per episode."""
        return self._episode_mean("invalid_actions")

    @property
    def avg_time_ms(self) -> float:
        """Mean ``wall_time_ms`` per episode."""
        return self._episode_mean("wall_time_ms")

    @property
    def avg_efficiency(self) -> float:
        """Mean efficiency score, taken over solved episodes only."""
        scores = [ep.efficiency_score for ep in self.episodes if ep.success]
        return sum(scores) / len(scores) if scores else 0.0

    @property
    def avg_hints(self) -> float:
        """Mean ``hints_used`` per episode."""
        return self._episode_mean("hints_used")

    def to_markdown(self) -> str:
        """Render the summary and a per-episode table as markdown text."""
        cfg = self.solver_config
        if cfg.solver_allowed:
            solver_text = f"budget={cfg.hint_budget}, penalty={cfg.hint_penalty}"
        else:
            solver_text = "solver-free"

        def render_row(ep) -> str:
            # Unsolved episodes show their raw status and no efficiency value.
            if ep.success:
                status, eff = "solved", f"{ep.efficiency_score:.0%}"
            else:
                status, eff = ep.status.value, "-"
            return (
                f"| {ep.seed} | {status} | {ep.steps_taken} | {ep.invalid_actions} "
                f"| {ep.hints_used} | {eff} | {ep.wall_time_ms} |"
            )

        document = [
            f"# {self.game.title()} {self.difficulty.title()} Evaluation",
            "",
            f"**Episodes:** {self.total_episodes}",
            f"**Solved:** {self.solved_count}/{self.total_episodes} ({self.solve_rate:.1%})",
            f"**Avg Steps:** {self.avg_moves:.1f}",
            f"**Avg Invalid:** {self.avg_invalid_moves:.1f}",
            f"**Avg Hints:** {self.avg_hints:.1f}",
            f"**Avg Efficiency:** {self.avg_efficiency:.1%}",
            f"**Avg Time:** {self.avg_time_ms:.0f}ms",
            "",
            f"**Solver Config:** {solver_text}",
            "",
            "## Episode Details",
            "",
            "| Seed | Status | Steps | Invalid | Hints | Efficiency | Time (ms) |",
            "|------|--------|-------|---------|-------|------------|-----------|",
        ]
        document.extend(render_row(ep) for ep in self.episodes)
        return "\n".join(document)

    def to_json(self) -> str:
        """Render the report (config, summary, episodes) as indented JSON."""
        cfg = self.solver_config
        solver_section = {
            "solver_allowed": cfg.solver_allowed,
            "hint_budget": cfg.hint_budget,
            "hint_penalty": cfg.hint_penalty,
        }
        summary_section = {
            "total_episodes": self.total_episodes,
            "solved_count": self.solved_count,
            "solve_rate": self.solve_rate,
            "avg_steps": self.avg_moves,
            "avg_invalid": self.avg_invalid_moves,
            "avg_hints": self.avg_hints,
            "avg_efficiency": self.avg_efficiency,
            "avg_time_ms": self.avg_time_ms,
        }
        payload = {
            "game": self.game,
            "difficulty": self.difficulty,
            "solver_config": solver_section,
            "summary": summary_section,
            "episodes": [ep.to_summary_dict() for ep in self.episodes],
        }
        return json.dumps(payload, indent=2)

    def to_csv(self) -> str:
        """Render one CSV row per episode, preceded by a header row."""
        import io

        columns = (
            "game",
            "difficulty",
            "seed",
            "status",
            "steps_taken",
            "invalid_actions",
            "hints_used",
            "efficiency",
            "wall_time_ms",
        )
        buffer = io.StringIO()
        writer = csv.writer(buffer)
        writer.writerow(columns)
        writer.writerows(
            (
                ep.game,
                ep.difficulty.value,
                ep.seed,
                ep.status.value,
                ep.steps_taken,
                ep.invalid_actions,
                ep.hints_used,
                f"{ep.efficiency_score:.3f}",
                ep.wall_time_ms,
            )
            for ep in self.episodes
        )
        return buffer.getvalue()

    def print_summary(self) -> None:
        """Write a human-readable summary of the run to stdout."""
        if self.solver_config.solver_allowed:
            solver_mode = f"solver-assisted (budget={self.solver_config.hint_budget})"
        else:
            solver_mode = "solver-free"

        summary_lines = [
            f"\n{self.game.title()} {self.difficulty.title()} Evaluation ({self.total_episodes} episodes)",
            "=" * 60,
            f"Mode: {solver_mode}",
            f"Solved: {self.solved_count}/{self.total_episodes} ({self.solve_rate:.1%})",
            f"Avg Steps: {self.avg_moves:.1f}",
            f"Avg Invalid: {self.avg_invalid_moves:.1f}",
            f"Avg Hints: {self.avg_hints:.1f}",
            f"Avg Efficiency: {self.avg_efficiency:.1%}",
            f"Avg Time: {self.avg_time_ms:.0f}ms",
        ]
        print("\n".join(summary_lines))
208
+
209
+
210
async def _apply_hint(game: PuzzleGame, hint_data: tuple) -> MoveResult:
    """Translate a hint into a ``game.validate_move`` call and await it.

    The game is identified by ``game.name.lower()``. Hint shapes handled:

    - Grid number games (sudoku, kenken, kakuro, killer sudoku, futoshiki,
      binary puzzle, nonogram, hidato, fillomino): ``(row, col, value)``
    - Hitori, Tents and Trees: ``(row, col, action)``
    - Einstein: ``(person, category, value)``
    - Task Scheduler: ``(task_id, worker, start_time)``
    - Lights Out: ``(row, col)``
    - Knapsack: ``(action, item_index)``
    - Bridges: ``(r1, c1, r2, c2, count)``
    - Shikaku, Slitherlink: ``(r1, c1, r2, c2)``
    - Star Battle: ``(row, col)``, applied with the ``"place"`` action
    - Logic Grid: ``(person, category, value)``, applied as
      ``("person", person, category, value, True)``
    - Mastermind: the complete guess, unpacked as positional arguments
    - Minesweeper: ``(row, col[, action])``, action defaults to ``"reveal"``
    - Nurikabe: ``(row, col[, state])``, state defaults to ``"sea"``
    - Sokoban: a direction, either bare or wrapped as ``(direction,)``

    A hint that is too short for its game, or that belongs to an unlisted
    game, falls through to a generic call: tuples of two or more elements
    are unpacked, anything else is passed as a single argument.

    Args:
        game: Game instance exposing ``name`` and an awaitable ``validate_move``.
        hint_data: Hint payload as produced by the game's hint method.

    Returns:
        Whatever ``game.validate_move`` returns for the translated move.
    """
    game_name = game.name.lower()

    # Games whose hint maps 1:1 onto the first N positional arguments of
    # validate_move; the value is N.
    passthrough_arity = {
        "sudoku": 3,
        "kenken": 3,
        "kakuro": 3,
        "killer sudoku": 3,
        "futoshiki": 3,
        "binary puzzle": 3,
        "nonogram": 3,
        "hidato": 3,
        "fillomino": 3,
        "hitori": 3,
        "tents and trees": 3,
        "einstein's puzzle": 3,
        "einstein": 3,
        "task scheduler": 3,
        "lights out": 2,
        "knapsack": 2,
        "bridges": 5,
        "shikaku": 4,
        "slitherlink": 4,
    }
    arity = passthrough_arity.get(game_name)
    if arity is not None and len(hint_data) >= arity:
        return await game.validate_move(*hint_data[:arity])

    # Star Battle hints carry only the cell; the action is always "place".
    if game_name == "star battle" and len(hint_data) >= 2:
        return await game.validate_move(hint_data[0], hint_data[1], "place")

    # Logic Grid: connect the person to category=value, i.e.
    # validate_move(cat1, val1, cat2, val2, state).
    if game_name == "logic grid" and len(hint_data) >= 3:
        person, category, value = hint_data[0], hint_data[1], hint_data[2]
        return await game.validate_move("person", person, category, value, True)

    # Mastermind: the hint is the full code, passed as the guess.
    if game_name == "mastermind":
        return await game.validate_move(*hint_data)

    # Minesweeper / Nurikabe: optional third element, with a per-game default.
    default_third = {"minesweeper": "reveal", "nurikabe": "sea"}
    if game_name in default_third and len(hint_data) >= 2:
        third = hint_data[2] if len(hint_data) > 2 else default_third[game_name]
        return await game.validate_move(hint_data[0], hint_data[1], third)

    # Sokoban: validate_move takes a single direction. Unwrap a
    # (direction,) sequence; pass a bare direction through unchanged.
    # (Previously both arms of the conditional returned hint_data, so a
    # wrapped direction was forwarded as a tuple.)
    if game_name == "sokoban" and hint_data:
        if isinstance(hint_data, (tuple, list)):
            direction = hint_data[0]
        else:
            direction = hint_data
        return await game.validate_move(direction)

    # Generic fallback - unpack multi-element tuples as positional arguments.
    if isinstance(hint_data, tuple) and len(hint_data) >= 2:
        return await game.validate_move(*hint_data)

    # Single value fallback.
    return await game.validate_move(hint_data)
351
+
352
+
353
async def run_episode(
    game_class: type[PuzzleGame],
    difficulty: str,
    seed: int,
    solver_config: SolverConfig | None = None,
    use_hints: bool = True,
    max_moves: int = 1000,
    timeout_sec: float = 30.0,
) -> EpisodeResult:
    """Play one generated puzzle by repeatedly requesting and applying hints.

    The loop stops when the game reports completion, ``max_moves`` successful
    steps have been taken, the wall-clock budget is exceeded, hints are
    disabled or no longer permitted by the game, the game returns no hint,
    or more than 50 hint applications have failed.

    Args:
        game_class: The puzzle game class to instantiate
        difficulty: Difficulty level (easy, medium, hard)
        seed: Random seed passed to the game constructor
        solver_config: Configuration for solver/hint usage (default-constructed
            when omitted)
        use_hints: Whether to use hints for auto-solving
        max_moves: Maximum successful moves before giving up
        timeout_sec: Maximum elapsed seconds before the episode is marked as
            timed out

    Returns:
        EpisodeResult carrying the collected counters, timing and final status
    """
    solver_config = solver_config or SolverConfig()
    game = game_class(difficulty=difficulty, seed=seed, solver_config=solver_config)
    await game.generate_puzzle()

    # Captured up front so it can be reported alongside the result.
    optimal_steps = game.optimal_steps

    started_at = datetime.now()
    start_time = time.perf_counter()

    steps_taken = 0
    invalid_actions = 0
    hints_used = 0
    status = EpisodeStatus.FAILED

    while steps_taken < max_moves and not game.is_complete():
        # Wall-clock budget check comes first on every iteration.
        if time.perf_counter() - start_time > timeout_sec:
            status = EpisodeStatus.TIMEOUT
            break

        # Nothing can be done automatically when hints are turned off or
        # the game no longer allows one (e.g. budget exhausted).
        if not (use_hints and game.can_use_hint()):
            break

        hint_result = await game.get_hint()
        if hint_result is None:
            # No hint available, puzzle might be complete or stuck
            break

        # Hints arrive as a (hint_data, hint_message) pair.
        hint_payload, _unused_message = hint_result

        # Let the game account for the hint before it is applied.
        game.record_hint()
        hints_used += 1

        try:
            outcome = await _apply_hint(game, hint_payload)
            move_ok = bool(outcome.success)
        except (TypeError, ValueError, AttributeError, IndexError):
            move_ok = False

        if move_ok:
            steps_taken += 1
            continue

        invalid_actions += 1
        # Bail out once failed applications pile up.
        if invalid_actions > 50:
            break

    end_time = time.perf_counter()
    ended_at = datetime.now()
    wall_time_ms = int((end_time - start_time) * 1000)

    if game.is_complete():
        status = EpisodeStatus.SOLVED

    # Retries are only reported when the game tracks them.
    retries = getattr(game, "retries", 0)

    return EpisodeResult(
        game=game.name,
        difficulty=DifficultyLevel(difficulty),
        seed=seed,
        started_at=started_at,
        ended_at=ended_at,
        wall_time_ms=wall_time_ms,
        status=status,
        steps_taken=steps_taken,
        invalid_actions=invalid_actions,
        hints_used=hints_used,
        retries=retries,
        optimal_steps=optimal_steps,
        solver_config=solver_config,
    )
458
+
459
+
460
async def evaluate_game(
    game_name: str,
    difficulty: str = "easy",
    episodes: int = 10,
    seeds: list[int] | None = None,
    solver_config: SolverConfig | None = None,
    use_hints: bool = True,
    max_moves: int = 1000,
    verbose: bool = False,
) -> EvaluationReport:
    """Run a batch of episodes for one game and collect them in a report.

    Args:
        game_name: Key of the game in ``AVAILABLE_GAMES``
        difficulty: Difficulty level (easy, medium, hard)
        episodes: How many random seeds to draw when ``seeds`` is None
        seeds: Explicit seeds to run, one episode each; when given,
            ``episodes`` is not consulted
        solver_config: Configuration for solver/hint usage
        use_hints: Whether to use hints for auto-solving
        max_moves: Maximum moves per episode
        verbose: Print a progress line for every episode

    Returns:
        EvaluationReport holding one result per seed, in seed order

    Raises:
        ValueError: If ``game_name`` is not a key of ``AVAILABLE_GAMES``
    """
    if game_name not in AVAILABLE_GAMES:
        raise ValueError(f"Unknown game: {game_name}. Available: {list(AVAILABLE_GAMES.keys())}")

    solver_config = solver_config or SolverConfig()
    game_class = AVAILABLE_GAMES[game_name]
    report = EvaluationReport(game=game_name, difficulty=difficulty, solver_config=solver_config)

    if seeds is None:
        # No explicit seeds: draw one random seed per requested episode.
        import random

        seeds = [random.randint(1, 2**31 - 1) for _ in range(episodes)]

    planned = len(seeds)
    for ordinal, seed in enumerate(seeds, start=1):
        if verbose:
            print(f"  Running episode {ordinal}/{planned} (seed={seed})...", end=" ", flush=True)

        result = await run_episode(
            game_class=game_class,  # type: ignore[type-abstract]
            difficulty=difficulty,
            seed=seed,
            solver_config=solver_config,
            use_hints=use_hints,
            max_moves=max_moves,
        )
        report.episodes.append(result)

        if not verbose:
            continue

        # Finish the progress line started above with the episode outcome.
        if result.success:
            status = "solved"
            eff = f", eff={result.efficiency_score:.0%}"
        else:
            status = result.status.value
            eff = ""
        print(f"{status} ({result.steps_taken} steps{eff}, {result.wall_time_ms}ms)")

    return report
518
+
519
+
520
async def evaluate_all_games(
    difficulty: str = "easy",
    episodes: int = 5,
    solver_config: SolverConfig | None = None,
    use_hints: bool = True,
    verbose: bool = False,
    max_moves: int = 1000,
) -> dict[str, EvaluationReport]:
    """Run evaluation for every game in ``AVAILABLE_GAMES``, in name order.

    Evaluation is best-effort: a game whose evaluation raises is left out of
    the result. The failure is always reported on stderr, so a missing game
    is never silent and stdout output (json/csv) stays clean.

    Args:
        difficulty: Difficulty level for all games
        episodes: Number of episodes per game
        solver_config: Configuration for solver/hint usage
        use_hints: Whether to use hints for auto-solving
        verbose: Print progress during evaluation
        max_moves: Maximum moves per episode, forwarded to ``evaluate_game``

    Returns:
        Dict mapping game names to EvaluationReports (failed games omitted)
    """
    reports: dict[str, EvaluationReport] = {}

    for game_name in sorted(AVAILABLE_GAMES):
        if verbose:
            print(f"\nEvaluating {game_name}...")

        try:
            reports[game_name] = await evaluate_game(
                game_name=game_name,
                difficulty=difficulty,
                episodes=episodes,
                solver_config=solver_config,
                use_hints=use_hints,
                max_moves=max_moves,
                verbose=verbose,
            )
        except Exception as e:
            # Deliberately broad: one broken game must not abort the whole
            # sweep. Report it regardless of verbosity so it is not lost.
            print(f"  Error evaluating {game_name}: {e}", file=sys.stderr)

    return reports
560
+
561
+
562
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command line arguments for the evaluation CLI.

    Args:
        argv: Argument strings to parse. When None (the default), argparse
            reads the process command line, as before.

    Returns:
        Namespace with attributes ``game``, ``all``, ``difficulty``,
        ``episodes``, ``seeds`` (raw comma-separated string or None),
        ``output``, ``max_moves``, ``verbose``, ``list_games``,
        ``solver_free``, ``hint_budget`` and ``hint_penalty``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments (including a
            ``--hint-penalty`` outside 0.0-1.0) or when help is requested.
    """
    parser = argparse.ArgumentParser(
        description="Puzzle Arcade Evaluation Harness",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  puzzle-arcade-eval sudoku --difficulty medium --episodes 10
  puzzle-arcade-eval --all --difficulty easy --episodes 5
  puzzle-arcade-eval kenken --seeds 1,2,3,4,5 --output json
  puzzle-arcade-eval sudoku --output csv > results.csv

  # Solver configuration
  puzzle-arcade-eval sudoku --solver-free # Pure model reasoning
  puzzle-arcade-eval sudoku --hint-budget 10 # Limited hints
  puzzle-arcade-eval sudoku --hint-penalty 0.1 # Penalize hint usage
""",
    )

    parser.add_argument(
        "game",
        nargs="?",
        help="Game to evaluate (e.g., sudoku, kenken). Use --all for all games.",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Evaluate all available games",
    )
    parser.add_argument(
        "-d",
        "--difficulty",
        choices=["easy", "medium", "hard"],
        default="easy",
        help="Difficulty level (default: easy)",
    )
    parser.add_argument(
        "-n",
        "--episodes",
        type=int,
        default=10,
        help="Number of episodes to run (default: 10)",
    )
    parser.add_argument(
        "--seeds",
        type=str,
        help="Comma-separated list of seeds to use (e.g., 1,2,3,4,5)",
    )
    parser.add_argument(
        "-o",
        "--output",
        choices=["text", "json", "csv", "markdown", "jsonl"],
        default="text",
        help="Output format (default: text)",
    )
    parser.add_argument(
        "--max-moves",
        type=int,
        default=1000,
        help="Maximum moves per episode (default: 1000)",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Verbose output",
    )
    parser.add_argument(
        "--list-games",
        action="store_true",
        help="List all available games and exit",
    )

    # Solver configuration arguments
    solver_group = parser.add_argument_group("solver configuration")
    solver_group.add_argument(
        "--solver-free",
        action="store_true",
        help="Disable solver hints (pure model reasoning mode)",
    )
    solver_group.add_argument(
        "--hint-budget",
        type=int,
        default=100,
        help="Maximum number of hints allowed (default: 100)",
    )
    solver_group.add_argument(
        "--hint-penalty",
        # Enforce the 0.0-1.0 range promised by the help text.
        type=_hint_penalty,
        default=0.0,
        help="Score penalty per hint used, 0.0-1.0 (default: 0.0)",
    )

    return parser.parse_args(argv)


def _hint_penalty(text: str) -> float:
    """argparse ``type`` callback: parse a float in the closed range 0.0-1.0."""
    try:
        value = float(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid float value: {text!r}") from None
    # The chained comparison is also False for NaN, which is rejected too.
    if not 0.0 <= value <= 1.0:
        raise argparse.ArgumentTypeError(f"must be between 0.0 and 1.0, got {text}")
    return value
656
+
657
+
658
def main() -> None:
    """Main entry point for the evaluation CLI.

    Parses the command line, runs either a single-game or an all-games
    evaluation, and writes the report to stdout in the requested format.
    Diagnostics (usage errors, warnings) go to stderr so that redirected
    json/csv output stays machine-readable.

    Exits with status 1 when no game is selected, the game is unknown, or
    ``--seeds`` is not a comma-separated list of integers.

    Note: with ``--all``, ``--seeds`` and ``--max-moves`` are not applied;
    each game draws its own random seeds.
    """
    args = parse_args()

    if args.list_games:
        print("Available games:")
        for name in sorted(AVAILABLE_GAMES.keys()):
            game = AVAILABLE_GAMES[name]("easy")  # type: ignore[abstract]
            print(f"  {name:20} - {game.description}")
        return

    if not args.game and not args.all:
        print("Error: Please specify a game or use --all", file=sys.stderr)
        print("Use --list-games to see available games", file=sys.stderr)
        sys.exit(1)

    # Reject an unknown game up front instead of surfacing a traceback later.
    if not args.all and args.game not in AVAILABLE_GAMES:
        print(f"Error: Unknown game: {args.game}", file=sys.stderr)
        print("Use --list-games to see available games", file=sys.stderr)
        sys.exit(1)

    # Parse seeds if provided; blank entries (e.g. a trailing comma) are skipped.
    seeds = None
    if args.seeds:
        try:
            seeds = [int(token) for token in (part.strip() for part in args.seeds.split(",")) if token]
        except ValueError:
            seeds = []
        if not seeds:
            print(
                f"Error: --seeds must be a comma-separated list of integers, got {args.seeds!r}",
                file=sys.stderr,
            )
            sys.exit(1)
        if args.all:
            print("Warning: --seeds is ignored when --all is used", file=sys.stderr)

    # Build solver configuration
    if args.solver_free:
        solver_config = SolverConfig.solver_free()
    else:
        solver_config = SolverConfig(
            solver_allowed=True,
            hint_budget=args.hint_budget,
            hint_penalty=args.hint_penalty,
        )

    # Run evaluation
    if args.all:
        reports = asyncio.run(
            evaluate_all_games(
                difficulty=args.difficulty,
                episodes=args.episodes,
                solver_config=solver_config,
                verbose=args.verbose,
            )
        )

        # Output results
        if args.output == "json":
            print(
                json.dumps(
                    {name: json.loads(r.to_json()) for name, r in reports.items()},
                    indent=2,
                )
            )
        elif args.output == "jsonl":
            # Stream one-line JSON per episode
            for report in reports.values():
                for episode in report.episodes:
                    print(episode.to_jsonl())
        elif args.output == "csv":
            # Concatenate the per-game CSVs, keeping only the first header row.
            for index, report in enumerate(reports.values()):
                csv_out = report.to_csv()
                if index:
                    _header, _sep, csv_out = csv_out.partition("\n")
                print(csv_out, end="")
        elif args.output == "markdown":
            for report in reports.values():
                print(report.to_markdown())
                print("\n---\n")
        else:
            print("\n" + "=" * 60)
            print("PUZZLE ARCADE EVALUATION SUMMARY")
            print("=" * 60)
            for report in reports.values():
                report.print_summary()
    else:
        report = asyncio.run(
            evaluate_game(
                game_name=args.game,
                difficulty=args.difficulty,
                episodes=args.episodes,
                seeds=seeds,
                solver_config=solver_config,
                max_moves=args.max_moves,
                verbose=args.verbose,
            )
        )

        # Output results
        if args.output == "json":
            print(report.to_json())
        elif args.output == "jsonl":
            for episode in report.episodes:
                print(episode.to_jsonl())
        elif args.output == "csv":
            # The CSV text already ends with a line terminator; do not add
            # another one (matches the --all branch).
            print(report.to_csv(), end="")
        elif args.output == "markdown":
            print(report.to_markdown())
        else:
            report.print_summary()
760
+
761
+
762
+ if __name__ == "__main__":
763
+ main()