chuk-puzzles-gym 0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chuk_puzzles_gym/__init__.py +19 -0
- chuk_puzzles_gym/constants.py +9 -0
- chuk_puzzles_gym/eval.py +763 -0
- chuk_puzzles_gym/export/__init__.py +20 -0
- chuk_puzzles_gym/export/dataset.py +376 -0
- chuk_puzzles_gym/games/__init__.py +94 -0
- chuk_puzzles_gym/games/_base/__init__.py +6 -0
- chuk_puzzles_gym/games/_base/commands.py +91 -0
- chuk_puzzles_gym/games/_base/game.py +337 -0
- chuk_puzzles_gym/games/binary/__init__.py +6 -0
- chuk_puzzles_gym/games/binary/config.py +23 -0
- chuk_puzzles_gym/games/binary/game.py +434 -0
- chuk_puzzles_gym/games/bridges/__init__.py +6 -0
- chuk_puzzles_gym/games/bridges/config.py +24 -0
- chuk_puzzles_gym/games/bridges/game.py +489 -0
- chuk_puzzles_gym/games/einstein/__init__.py +6 -0
- chuk_puzzles_gym/games/einstein/config.py +23 -0
- chuk_puzzles_gym/games/einstein/constants.py +13 -0
- chuk_puzzles_gym/games/einstein/game.py +366 -0
- chuk_puzzles_gym/games/einstein/models.py +35 -0
- chuk_puzzles_gym/games/fillomino/__init__.py +6 -0
- chuk_puzzles_gym/games/fillomino/config.py +24 -0
- chuk_puzzles_gym/games/fillomino/game.py +516 -0
- chuk_puzzles_gym/games/futoshiki/__init__.py +6 -0
- chuk_puzzles_gym/games/futoshiki/config.py +23 -0
- chuk_puzzles_gym/games/futoshiki/game.py +391 -0
- chuk_puzzles_gym/games/hidato/__init__.py +6 -0
- chuk_puzzles_gym/games/hidato/config.py +24 -0
- chuk_puzzles_gym/games/hidato/game.py +403 -0
- chuk_puzzles_gym/games/hitori/__init__.py +6 -0
- chuk_puzzles_gym/games/hitori/config.py +23 -0
- chuk_puzzles_gym/games/hitori/game.py +451 -0
- chuk_puzzles_gym/games/kakuro/__init__.py +6 -0
- chuk_puzzles_gym/games/kakuro/config.py +24 -0
- chuk_puzzles_gym/games/kakuro/game.py +399 -0
- chuk_puzzles_gym/games/kenken/__init__.py +6 -0
- chuk_puzzles_gym/games/kenken/config.py +24 -0
- chuk_puzzles_gym/games/kenken/enums.py +13 -0
- chuk_puzzles_gym/games/kenken/game.py +486 -0
- chuk_puzzles_gym/games/kenken/models.py +15 -0
- chuk_puzzles_gym/games/killer_sudoku/__init__.py +6 -0
- chuk_puzzles_gym/games/killer_sudoku/config.py +23 -0
- chuk_puzzles_gym/games/killer_sudoku/game.py +502 -0
- chuk_puzzles_gym/games/killer_sudoku/models.py +15 -0
- chuk_puzzles_gym/games/knapsack/__init__.py +6 -0
- chuk_puzzles_gym/games/knapsack/config.py +24 -0
- chuk_puzzles_gym/games/knapsack/enums.py +10 -0
- chuk_puzzles_gym/games/knapsack/game.py +340 -0
- chuk_puzzles_gym/games/knapsack/models.py +13 -0
- chuk_puzzles_gym/games/lights_out/__init__.py +6 -0
- chuk_puzzles_gym/games/lights_out/config.py +24 -0
- chuk_puzzles_gym/games/lights_out/game.py +249 -0
- chuk_puzzles_gym/games/logic_grid/__init__.py +6 -0
- chuk_puzzles_gym/games/logic_grid/config.py +24 -0
- chuk_puzzles_gym/games/logic_grid/constants.py +12 -0
- chuk_puzzles_gym/games/logic_grid/game.py +333 -0
- chuk_puzzles_gym/games/logic_grid/models.py +24 -0
- chuk_puzzles_gym/games/mastermind/__init__.py +6 -0
- chuk_puzzles_gym/games/mastermind/config.py +25 -0
- chuk_puzzles_gym/games/mastermind/game.py +297 -0
- chuk_puzzles_gym/games/minesweeper/__init__.py +6 -0
- chuk_puzzles_gym/games/minesweeper/config.py +24 -0
- chuk_puzzles_gym/games/minesweeper/enums.py +12 -0
- chuk_puzzles_gym/games/minesweeper/game.py +432 -0
- chuk_puzzles_gym/games/nonogram/__init__.py +6 -0
- chuk_puzzles_gym/games/nonogram/config.py +23 -0
- chuk_puzzles_gym/games/nonogram/game.py +296 -0
- chuk_puzzles_gym/games/nurikabe/__init__.py +6 -0
- chuk_puzzles_gym/games/nurikabe/config.py +24 -0
- chuk_puzzles_gym/games/nurikabe/enums.py +14 -0
- chuk_puzzles_gym/games/nurikabe/game.py +586 -0
- chuk_puzzles_gym/games/scheduler/__init__.py +6 -0
- chuk_puzzles_gym/games/scheduler/config.py +25 -0
- chuk_puzzles_gym/games/scheduler/constants.py +15 -0
- chuk_puzzles_gym/games/scheduler/enums.py +10 -0
- chuk_puzzles_gym/games/scheduler/game.py +431 -0
- chuk_puzzles_gym/games/scheduler/models.py +14 -0
- chuk_puzzles_gym/games/shikaku/__init__.py +6 -0
- chuk_puzzles_gym/games/shikaku/config.py +24 -0
- chuk_puzzles_gym/games/shikaku/game.py +419 -0
- chuk_puzzles_gym/games/slitherlink/__init__.py +6 -0
- chuk_puzzles_gym/games/slitherlink/config.py +23 -0
- chuk_puzzles_gym/games/slitherlink/game.py +386 -0
- chuk_puzzles_gym/games/sokoban/__init__.py +6 -0
- chuk_puzzles_gym/games/sokoban/config.py +24 -0
- chuk_puzzles_gym/games/sokoban/game.py +671 -0
- chuk_puzzles_gym/games/star_battle/__init__.py +6 -0
- chuk_puzzles_gym/games/star_battle/config.py +24 -0
- chuk_puzzles_gym/games/star_battle/game.py +390 -0
- chuk_puzzles_gym/games/sudoku/__init__.py +7 -0
- chuk_puzzles_gym/games/sudoku/commands.py +96 -0
- chuk_puzzles_gym/games/sudoku/config.py +22 -0
- chuk_puzzles_gym/games/sudoku/game.py +328 -0
- chuk_puzzles_gym/games/tents/__init__.py +6 -0
- chuk_puzzles_gym/games/tents/config.py +24 -0
- chuk_puzzles_gym/games/tents/game.py +416 -0
- chuk_puzzles_gym/gym_env.py +465 -0
- chuk_puzzles_gym/models/__init__.py +47 -0
- chuk_puzzles_gym/models/base.py +30 -0
- chuk_puzzles_gym/models/config.py +11 -0
- chuk_puzzles_gym/models/enums.py +104 -0
- chuk_puzzles_gym/models/evaluation.py +487 -0
- chuk_puzzles_gym/models/games.py +12 -0
- chuk_puzzles_gym/server.py +1171 -0
- chuk_puzzles_gym/trace/__init__.py +10 -0
- chuk_puzzles_gym/trace/generator.py +726 -0
- chuk_puzzles_gym/utils/__init__.py +4 -0
- chuk_puzzles_gym-0.9.dist-info/METADATA +1471 -0
- chuk_puzzles_gym-0.9.dist-info/RECORD +112 -0
- chuk_puzzles_gym-0.9.dist-info/WHEEL +5 -0
- chuk_puzzles_gym-0.9.dist-info/entry_points.txt +4 -0
- chuk_puzzles_gym-0.9.dist-info/top_level.txt +1 -0
chuk_puzzles_gym/eval.py
ADDED
|
@@ -0,0 +1,763 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Evaluation harness for puzzle-arcade-server.
|
|
3
|
+
|
|
4
|
+
Run benchmarks against puzzle games and collect metrics.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
puzzle-arcade-eval sudoku --difficulty medium --episodes 10
|
|
8
|
+
puzzle-arcade-eval --all --difficulty easy --episodes 5
|
|
9
|
+
puzzle-arcade-eval kenken --seeds 1,2,3,4,5
|
|
10
|
+
|
|
11
|
+
# Solver-free mode (pure model reasoning)
|
|
12
|
+
puzzle-arcade-eval sudoku --solver-free
|
|
13
|
+
|
|
14
|
+
# Solver-assisted with budget
|
|
15
|
+
puzzle-arcade-eval sudoku --hint-budget 10 --hint-penalty 0.1
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import asyncio
|
|
22
|
+
import csv
|
|
23
|
+
import json
|
|
24
|
+
import sys
|
|
25
|
+
import time
|
|
26
|
+
from dataclasses import dataclass, field
|
|
27
|
+
from datetime import datetime
|
|
28
|
+
from typing import TYPE_CHECKING
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
pass
|
|
32
|
+
|
|
33
|
+
from .games import AVAILABLE_GAMES
|
|
34
|
+
from .games._base import PuzzleGame
|
|
35
|
+
from .models import (
|
|
36
|
+
DifficultyLevel,
|
|
37
|
+
EpisodeResult,
|
|
38
|
+
EpisodeStatus,
|
|
39
|
+
MoveResult,
|
|
40
|
+
SolverConfig,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class EvaluationReport:
    """Collected results of an evaluation run for one game.

    Stores the per-episode results and the solver configuration used,
    exposes aggregate statistics as read-only properties, and renders the
    run as markdown, JSON, CSV, or a plain-text summary.
    """

    game: str
    difficulty: str
    solver_config: SolverConfig = field(default_factory=SolverConfig)
    episodes: list[EpisodeResult] = field(default_factory=list)

    def _mean_of(self, attribute: str) -> float:
        """Average the named episode attribute over all episodes (0.0 if none)."""
        if not self.episodes:
            return 0.0
        total = sum(getattr(episode, attribute) for episode in self.episodes)
        return total / self.total_episodes

    @property
    def total_episodes(self) -> int:
        """Number of recorded episodes."""
        return len(self.episodes)

    @property
    def solved_count(self) -> int:
        """Number of episodes whose ``success`` flag is truthy."""
        return sum(1 for episode in self.episodes if episode.success)

    @property
    def solve_rate(self) -> float:
        """Fraction of episodes solved, or 0.0 when there are no episodes."""
        if not self.episodes:
            return 0.0
        return self.solved_count / self.total_episodes

    @property
    def avg_moves(self) -> float:
        """Mean ``steps_taken`` per episode."""
        return self._mean_of("steps_taken")

    @property
    def avg_invalid_moves(self) -> float:
        """Mean ``invalid_actions`` per episode."""
        return self._mean_of("invalid_actions")

    @property
    def avg_time_ms(self) -> float:
        """Mean ``wall_time_ms`` per episode."""
        return self._mean_of("wall_time_ms")

    @property
    def avg_efficiency(self) -> float:
        """Mean ``efficiency_score`` over solved episodes only (0.0 if none solved)."""
        solved_episodes = [episode for episode in self.episodes if episode.success]
        if not solved_episodes:
            return 0.0
        return sum(episode.efficiency_score for episode in solved_episodes) / len(solved_episodes)

    @property
    def avg_hints(self) -> float:
        """Mean ``hints_used`` per episode."""
        return self._mean_of("hints_used")

    def to_markdown(self) -> str:
        """Render the summary and a per-episode table as markdown text."""
        if self.solver_config.solver_allowed:
            solver_text = (
                f"budget={self.solver_config.hint_budget}, penalty={self.solver_config.hint_penalty}"
            )
        else:
            solver_text = "solver-free"

        rendered = [
            f"# {self.game.title()} {self.difficulty.title()} Evaluation",
            "",
            f"**Episodes:** {self.total_episodes}",
            f"**Solved:** {self.solved_count}/{self.total_episodes} ({self.solve_rate:.1%})",
            f"**Avg Steps:** {self.avg_moves:.1f}",
            f"**Avg Invalid:** {self.avg_invalid_moves:.1f}",
            f"**Avg Hints:** {self.avg_hints:.1f}",
            f"**Avg Efficiency:** {self.avg_efficiency:.1%}",
            f"**Avg Time:** {self.avg_time_ms:.0f}ms",
            "",
            f"**Solver Config:** {solver_text}",
            "",
            "## Episode Details",
            "",
            "| Seed | Status | Steps | Invalid | Hints | Efficiency | Time (ms) |",
            "|------|--------|-------|---------|-------|------------|-----------|",
        ]

        for episode in self.episodes:
            # Efficiency is only shown for solved episodes.
            if episode.success:
                status_text = "solved"
                efficiency_text = f"{episode.efficiency_score:.0%}"
            else:
                status_text = episode.status.value
                efficiency_text = "-"
            rendered.append(
                f"| {episode.seed} | {status_text} | {episode.steps_taken} | {episode.invalid_actions} | {episode.hints_used} | {efficiency_text} | {episode.wall_time_ms} |"
            )

        return "\n".join(rendered)

    def to_json(self) -> str:
        """Render the configuration, aggregate summary, and episodes as indented JSON."""
        solver_section = {
            "solver_allowed": self.solver_config.solver_allowed,
            "hint_budget": self.solver_config.hint_budget,
            "hint_penalty": self.solver_config.hint_penalty,
        }
        summary_section = {
            "total_episodes": self.total_episodes,
            "solved_count": self.solved_count,
            "solve_rate": self.solve_rate,
            "avg_steps": self.avg_moves,
            "avg_invalid": self.avg_invalid_moves,
            "avg_hints": self.avg_hints,
            "avg_efficiency": self.avg_efficiency,
            "avg_time_ms": self.avg_time_ms,
        }
        payload = {
            "game": self.game,
            "difficulty": self.difficulty,
            "solver_config": solver_section,
            "summary": summary_section,
            "episodes": [episode.to_summary_dict() for episode in self.episodes],
        }
        return json.dumps(payload, indent=2)

    def to_csv(self) -> str:
        """Render one CSV row per episode, preceded by a header row."""
        import io

        buffer = io.StringIO()
        csv_writer = csv.writer(buffer)
        csv_writer.writerow(
            [
                "game",
                "difficulty",
                "seed",
                "status",
                "steps_taken",
                "invalid_actions",
                "hints_used",
                "efficiency",
                "wall_time_ms",
            ]
        )
        csv_writer.writerows(
            [
                episode.game,
                episode.difficulty.value,
                episode.seed,
                episode.status.value,
                episode.steps_taken,
                episode.invalid_actions,
                episode.hints_used,
                f"{episode.efficiency_score:.3f}",
                episode.wall_time_ms,
            ]
            for episode in self.episodes
        )
        return buffer.getvalue()

    def print_summary(self) -> None:
        """Print the aggregate statistics to stdout, one per line."""
        if self.solver_config.solver_allowed:
            solver_mode = f"solver-assisted (budget={self.solver_config.hint_budget})"
        else:
            solver_mode = "solver-free"

        summary_lines = [
            f"\n{self.game.title()} {self.difficulty.title()} Evaluation ({self.total_episodes} episodes)",
            "=" * 60,
            f"Mode: {solver_mode}",
            f"Solved: {self.solved_count}/{self.total_episodes} ({self.solve_rate:.1%})",
            f"Avg Steps: {self.avg_moves:.1f}",
            f"Avg Invalid: {self.avg_invalid_moves:.1f}",
            f"Avg Hints: {self.avg_hints:.1f}",
            f"Avg Efficiency: {self.avg_efficiency:.1%}",
            f"Avg Time: {self.avg_time_ms:.0f}ms",
        ]
        for line in summary_lines:
            print(line)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
# Games whose hint is forwarded to ``validate_move`` as its first N elements.
_HINT_PREFIX_ARITY = {
    # (row, col, value) grid placement games
    "sudoku": 3,
    "kenken": 3,
    "kakuro": 3,
    "killer sudoku": 3,
    "futoshiki": 3,
    "binary puzzle": 3,
    "nonogram": 3,
    "hidato": 3,
    "fillomino": 3,
    # (row, col, action)
    "hitori": 3,
    "tents and trees": 3,
    # (person, category, value)
    "einstein's puzzle": 3,
    "einstein": 3,
    # (task_id, worker, start_time)
    "task scheduler": 3,
    # (row, col)
    "lights out": 2,
    # (action, item_index)
    "knapsack": 2,
    # (r1, c1, r2, c2, count)
    "bridges": 5,
    # (r1, c1, r2, c2)
    "shikaku": 4,
    "slitherlink": 4,
}

# Games whose hint is (row, col[, action]); the action defaults when absent.
_HINT_DEFAULT_ACTION = {
    "minesweeper": "reveal",
    "nurikabe": "sea",
}


async def _apply_hint(game: PuzzleGame, hint_data: tuple) -> MoveResult:
    """Translate a game's hint into a ``validate_move`` call and await it.

    The lower-cased ``game.name`` selects how the hint is unpacked:

    - Fixed-arity games (see ``_HINT_PREFIX_ARITY``): the first N hint
      elements are passed positionally.
    - Star Battle: ``(row, col)`` is applied with the action ``"place"``.
    - Logic Grid: ``(person, category, value)`` becomes
      ``("person", person, category, value, True)``.
    - Mastermind: the whole hint is unpacked as the guess.
    - Minesweeper / Nurikabe: ``(row, col[, action])`` with a default action.
    - Sokoban: a direction, given either bare or wrapped in a tuple/list.

    A hint that is too short for its game, or a game that is not listed,
    falls through to the generic fallbacks at the end.

    Args:
        game: The puzzle instance; only ``name`` and ``validate_move`` are used.
        hint_data: The hint payload produced by the game's hint provider.

    Returns:
        Whatever ``game.validate_move`` returns for the translated arguments.
    """
    game_name = game.name.lower()

    if game_name == "mastermind":
        # The hint is the complete code; validate_move takes it unpacked.
        return await game.validate_move(*hint_data)

    if game_name == "sokoban" and hint_data:
        # Fix: the original expression returned hint_data on both sides of the
        # conditional, so a ``(direction,)`` hint was passed on as a tuple.
        # Unwrap tuple/list hints; pass anything else (e.g. a str) unchanged.
        if isinstance(hint_data, (tuple, list)):
            direction = hint_data[0]
        else:
            direction = hint_data
        return await game.validate_move(direction)

    arity = _HINT_PREFIX_ARITY.get(game_name)
    if arity is not None and len(hint_data) >= arity:
        leading = [hint_data[index] for index in range(arity)]
        return await game.validate_move(*leading)

    if game_name == "star battle" and len(hint_data) >= 2:
        return await game.validate_move(hint_data[0], hint_data[1], "place")

    if game_name == "logic grid" and len(hint_data) >= 3:
        person, category, value = hint_data[0], hint_data[1], hint_data[2]
        # Link the person to category=value.
        return await game.validate_move("person", person, category, value, True)

    default_action = _HINT_DEFAULT_ACTION.get(game_name)
    if default_action is not None and len(hint_data) >= 2:
        action = hint_data[2] if len(hint_data) > 2 else default_action
        return await game.validate_move(hint_data[0], hint_data[1], action)

    # Generic fallback: unpack multi-element tuples positionally.
    if isinstance(hint_data, tuple) and len(hint_data) >= 2:
        return await game.validate_move(*hint_data)

    # Last resort: hand the hint over as a single argument.
    return await game.validate_move(hint_data)
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
async def run_episode(
    game_class: type[PuzzleGame],
    difficulty: str,
    seed: int,
    solver_config: SolverConfig | None = None,
    use_hints: bool = True,
    max_moves: int = 1000,
    timeout_sec: float = 30.0,
) -> EpisodeResult:
    """Play one puzzle to completion by repeatedly applying the game's own hints.

    Args:
        game_class: Puzzle class to instantiate for this episode.
        difficulty: Difficulty name; also converted to ``DifficultyLevel``
            for the result.
        seed: Seed handed to the game constructor.
        solver_config: Solver/hint settings; a default ``SolverConfig`` is
            used when omitted.
        use_hints: When False the loop exits immediately without any move.
        max_moves: Upper bound on successful moves.
        timeout_sec: Wall-clock budget, checked at the top of each iteration.

    Returns:
        An ``EpisodeResult`` holding timing, counters, and the final status.
    """
    solver_config = solver_config or SolverConfig()
    game = game_class(difficulty=difficulty, seed=seed, solver_config=solver_config)
    await game.generate_puzzle()

    # Captured up front; used later for the efficiency calculation.
    optimal_steps = game.optimal_steps

    started_at = datetime.now()
    start_time = time.perf_counter()

    invalid_limit = 50  # cumulative over the episode, not consecutive
    steps_taken = 0
    invalid_actions = 0
    hints_used = 0
    status = EpisodeStatus.FAILED

    while steps_taken < max_moves and not game.is_complete():
        if time.perf_counter() - start_time > timeout_sec:
            status = EpisodeStatus.TIMEOUT
            break

        # Stop when hints are disabled or the game refuses further hints.
        if not (use_hints and game.can_use_hint()):
            break

        hint_result = await game.get_hint()
        if hint_result is None:
            # Nothing to suggest: the puzzle is complete or stuck.
            break

        # A hint is a (hint_data, hint_message) pair; only the data is used.
        hint_data, _hint_message = hint_result

        # Count the hint both on the game (budget tracking) and locally.
        game.record_hint()
        hints_used += 1

        try:
            outcome = await _apply_hint(game, hint_data)
            applied = outcome.success
        except (TypeError, ValueError, AttributeError, IndexError):
            # A hint that cannot be translated counts as an invalid action.
            applied = False

        if applied:
            steps_taken += 1
            continue

        invalid_actions += 1
        if invalid_actions > invalid_limit:
            break

    end_time = time.perf_counter()
    ended_at = datetime.now()
    wall_time_ms = int((end_time - start_time) * 1000)

    if game.is_complete():
        status = EpisodeStatus.SOLVED

    return EpisodeResult(
        game=game.name,
        difficulty=DifficultyLevel(difficulty),
        seed=seed,
        started_at=started_at,
        ended_at=ended_at,
        wall_time_ms=wall_time_ms,
        status=status,
        steps_taken=steps_taken,
        invalid_actions=invalid_actions,
        hints_used=hints_used,
        # Games that do not track retries report zero.
        retries=getattr(game, "retries", 0),
        optimal_steps=optimal_steps,
        solver_config=solver_config,
    )
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
async def evaluate_game(
    game_name: str,
    difficulty: str = "easy",
    episodes: int = 10,
    seeds: list[int] | None = None,
    solver_config: SolverConfig | None = None,
    use_hints: bool = True,
    max_moves: int = 1000,
    verbose: bool = False,
) -> EvaluationReport:
    """Run a series of episodes for one game and gather them into a report.

    Args:
        game_name: Key into ``AVAILABLE_GAMES``.
        difficulty: Difficulty name passed to every episode.
        episodes: How many random seeds to draw; ignored when ``seeds`` is given.
        seeds: Explicit seeds, one episode per seed.
        solver_config: Solver/hint settings; defaults to ``SolverConfig()``.
        use_hints: Forwarded to ``run_episode``.
        max_moves: Forwarded to ``run_episode``.
        verbose: Print a progress line per episode.

    Returns:
        An ``EvaluationReport`` containing one result per seed.

    Raises:
        ValueError: If ``game_name`` is not in ``AVAILABLE_GAMES``.
    """
    if game_name not in AVAILABLE_GAMES:
        raise ValueError(f"Unknown game: {game_name}. Available: {list(AVAILABLE_GAMES.keys())}")

    solver_config = solver_config or SolverConfig()
    game_class = AVAILABLE_GAMES[game_name]
    report = EvaluationReport(game=game_name, difficulty=difficulty, solver_config=solver_config)

    if seeds is None:
        import random

        # No seeds supplied: draw one random seed per requested episode.
        seeds = [random.randint(1, 2**31 - 1) for _ in range(episodes)]

    episode_total = len(seeds)
    for episode_number, seed in enumerate(seeds, start=1):
        if verbose:
            print(f" Running episode {episode_number}/{episode_total} (seed={seed})...", end=" ", flush=True)

        # The per-episode timeout is left at run_episode's default.
        result = await run_episode(
            game_class=game_class,  # type: ignore[type-abstract]
            difficulty=difficulty,
            seed=seed,
            solver_config=solver_config,
            use_hints=use_hints,
            max_moves=max_moves,
        )
        report.episodes.append(result)

        if not verbose:
            continue

        if result.success:
            status = "solved"
            eff = f", eff={result.efficiency_score:.0%}"
        else:
            status = result.status.value
            eff = ""
        print(f"{status} ({result.steps_taken} steps{eff}, {result.wall_time_ms}ms)")

    return report
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
async def evaluate_all_games(
    difficulty: str = "easy",
    episodes: int = 5,
    solver_config: SolverConfig | None = None,
    use_hints: bool = True,
    verbose: bool = False,
) -> dict[str, EvaluationReport]:
    """Evaluate every game in ``AVAILABLE_GAMES``, in sorted name order.

    Evaluation is best-effort: a game whose evaluation raises is skipped
    and left out of the result, and the remaining games still run. The
    failure is always reported, on stdout when ``verbose`` is set and on
    stderr otherwise, so a missing game is never silent.

    Args:
        difficulty: Difficulty name applied to every game.
        episodes: Number of episodes per game.
        solver_config: Solver/hint settings forwarded to each game.
        use_hints: Forwarded to ``evaluate_game``.
        verbose: Print progress for each game and episode.

    Returns:
        Mapping of game name to its report, for games that evaluated cleanly.
    """
    reports: dict[str, EvaluationReport] = {}

    for game_name in sorted(AVAILABLE_GAMES):
        if verbose:
            print(f"\nEvaluating {game_name}...")

        try:
            reports[game_name] = await evaluate_game(
                game_name=game_name,
                difficulty=difficulty,
                episodes=episodes,
                solver_config=solver_config,
                use_hints=use_hints,
                verbose=verbose,
            )
        except Exception as exc:  # deliberate boundary: one bad game must not abort the run
            if verbose:
                print(f" Error: {exc}")
            else:
                # Previously swallowed silently; stderr keeps stdout output
                # (JSON/CSV/markdown) clean while still surfacing the failure.
                print(f"Warning: skipping {game_name}: {exc}", file=sys.stderr)

    return reports
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def parse_args() -> argparse.Namespace:
    """Build the evaluation CLI parser and parse the process arguments.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    usage_examples = """
Examples:
puzzle-arcade-eval sudoku --difficulty medium --episodes 10
puzzle-arcade-eval --all --difficulty easy --episodes 5
puzzle-arcade-eval kenken --seeds 1,2,3,4,5 --output json
puzzle-arcade-eval sudoku --output csv > results.csv

# Solver configuration
puzzle-arcade-eval sudoku --solver-free # Pure model reasoning
puzzle-arcade-eval sudoku --hint-budget 10 # Limited hints
puzzle-arcade-eval sudoku --hint-penalty 0.1 # Penalize hint usage
"""
    parser = argparse.ArgumentParser(
        description="Puzzle Arcade Evaluation Harness",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # (flags, keyword options) pairs, registered in help-output order.
    general_options = [
        (
            ("game",),
            {"nargs": "?", "help": "Game to evaluate (e.g., sudoku, kenken). Use --all for all games."},
        ),
        (("--all",), {"action": "store_true", "help": "Evaluate all available games"}),
        (
            ("-d", "--difficulty"),
            {
                "choices": ["easy", "medium", "hard"],
                "default": "easy",
                "help": "Difficulty level (default: easy)",
            },
        ),
        (
            ("-n", "--episodes"),
            {"type": int, "default": 10, "help": "Number of episodes to run (default: 10)"},
        ),
        (
            ("--seeds",),
            {"type": str, "help": "Comma-separated list of seeds to use (e.g., 1,2,3,4,5)"},
        ),
        (
            ("-o", "--output"),
            {
                "choices": ["text", "json", "csv", "markdown", "jsonl"],
                "default": "text",
                "help": "Output format (default: text)",
            },
        ),
        (
            ("--max-moves",),
            {"type": int, "default": 1000, "help": "Maximum moves per episode (default: 1000)"},
        ),
        (("-v", "--verbose"), {"action": "store_true", "help": "Verbose output"}),
        (("--list-games",), {"action": "store_true", "help": "List all available games and exit"}),
    ]
    for flags, options in general_options:
        parser.add_argument(*flags, **options)

    # Solver options get their own section in the help output.
    solver_group = parser.add_argument_group("solver configuration")
    solver_options = [
        (
            ("--solver-free",),
            {"action": "store_true", "help": "Disable solver hints (pure model reasoning mode)"},
        ),
        (
            ("--hint-budget",),
            {"type": int, "default": 100, "help": "Maximum number of hints allowed (default: 100)"},
        ),
        (
            ("--hint-penalty",),
            {"type": float, "default": 0.0, "help": "Score penalty per hint used, 0.0-1.0 (default: 0.0)"},
        ),
    ]
    for flags, options in solver_options:
        solver_group.add_argument(*flags, **options)

    return parser.parse_args()
|
|
656
|
+
|
|
657
|
+
|
|
658
|
+
def _print_all_reports(reports: dict[str, EvaluationReport], output_format: str) -> None:
    """Write the reports of an ``--all`` run to stdout in the chosen format."""
    if output_format == "json":
        combined = {name: json.loads(report.to_json()) for name, report in reports.items()}
        print(json.dumps(combined, indent=2))
    elif output_format == "jsonl":
        # One JSON line per episode, across all games.
        for report in reports.values():
            for episode in report.episodes:
                print(episode.to_jsonl())
    elif output_format == "csv":
        # Concatenate the per-game CSVs, keeping only the first header row.
        for position, report in enumerate(reports.values()):
            csv_out = report.to_csv()
            if position == 0:
                print(csv_out, end="")
            else:
                print(csv_out.partition("\n")[2], end="")
    elif output_format == "markdown":
        for report in reports.values():
            print(report.to_markdown())
            print("\n---\n")
    else:
        print("\n" + "=" * 60)
        print("PUZZLE ARCADE EVALUATION SUMMARY")
        print("=" * 60)
        for report in reports.values():
            report.print_summary()


def _print_single_report(report: EvaluationReport, output_format: str) -> None:
    """Write one game's report to stdout in the chosen format."""
    if output_format == "json":
        print(report.to_json())
    elif output_format == "jsonl":
        for episode in report.episodes:
            print(episode.to_jsonl())
    elif output_format == "csv":
        print(report.to_csv())
    elif output_format == "markdown":
        print(report.to_markdown())
    else:
        report.print_summary()


def main() -> None:
    """Entry point of the evaluation CLI.

    Parses the arguments, handles ``--list-games``, builds the solver
    configuration, runs either every game (``--all``) or the named game,
    and prints the result in the requested format.

    Exits with status 1 when no game is selected, when ``--seeds`` is not
    a comma-separated list of integers, or when the named game is unknown.
    """
    args = parse_args()

    if args.list_games:
        print("Available games:")
        for name in sorted(AVAILABLE_GAMES.keys()):
            # Instantiated only to read its description.
            game = AVAILABLE_GAMES[name]("easy")  # type: ignore[abstract]
            print(f" {name:20} - {game.description}")
        return

    if not args.game and not args.all:
        print("Error: Please specify a game or use --all")
        print("Use --list-games to see available games")
        sys.exit(1)

    # Parse seeds if provided. A malformed value used to surface as a raw
    # ValueError traceback; report it as a usage error instead.
    seeds = None
    if args.seeds:
        try:
            seeds = [int(token.strip()) for token in args.seeds.split(",")]
        except ValueError:
            print(
                f"Error: --seeds must be a comma-separated list of integers, got {args.seeds!r}",
                file=sys.stderr,
            )
            sys.exit(1)

    if args.solver_free:
        solver_config = SolverConfig.solver_free()
    else:
        solver_config = SolverConfig(
            solver_allowed=True,
            hint_budget=args.hint_budget,
            hint_penalty=args.hint_penalty,
        )

    if args.all:
        # NOTE: --seeds and --max-moves are not forwarded in --all mode.
        reports = asyncio.run(
            evaluate_all_games(
                difficulty=args.difficulty,
                episodes=args.episodes,
                solver_config=solver_config,
                verbose=args.verbose,
            )
        )
        _print_all_reports(reports, args.output)
        return

    # Check the name up front so a typo yields a clear message rather than
    # the ValueError traceback raised from inside evaluate_game.
    if args.game not in AVAILABLE_GAMES:
        print(f"Error: Unknown game: {args.game}", file=sys.stderr)
        print("Use --list-games to see available games", file=sys.stderr)
        sys.exit(1)

    report = asyncio.run(
        evaluate_game(
            game_name=args.game,
            difficulty=args.difficulty,
            episodes=args.episodes,
            seeds=seeds,
            solver_config=solver_config,
            max_moves=args.max_moves,
            verbose=args.verbose,
        )
    )
    _print_single_report(report, args.output)
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
if __name__ == "__main__":
|
|
763
|
+
main()
|