chuk-puzzles-gym 0.10__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. chuk_puzzles_gym/eval.py +168 -46
  2. chuk_puzzles_gym/export/dataset.py +7 -1
  3. chuk_puzzles_gym/games/_base/game.py +123 -0
  4. chuk_puzzles_gym/games/binary/game.py +2 -0
  5. chuk_puzzles_gym/games/bridges/game.py +2 -0
  6. chuk_puzzles_gym/games/cryptarithmetic/game.py +5 -0
  7. chuk_puzzles_gym/games/einstein/game.py +2 -0
  8. chuk_puzzles_gym/games/fillomino/game.py +2 -0
  9. chuk_puzzles_gym/games/futoshiki/game.py +2 -0
  10. chuk_puzzles_gym/games/graph_coloring/commands.py +20 -3
  11. chuk_puzzles_gym/games/graph_coloring/game.py +8 -1
  12. chuk_puzzles_gym/games/hidato/game.py +2 -0
  13. chuk_puzzles_gym/games/hitori/game.py +2 -0
  14. chuk_puzzles_gym/games/kakuro/game.py +2 -0
  15. chuk_puzzles_gym/games/kenken/game.py +2 -0
  16. chuk_puzzles_gym/games/killer_sudoku/game.py +2 -0
  17. chuk_puzzles_gym/games/knapsack/game.py +2 -0
  18. chuk_puzzles_gym/games/lights_out/game.py +2 -0
  19. chuk_puzzles_gym/games/logic_grid/game.py +2 -0
  20. chuk_puzzles_gym/games/mastermind/game.py +2 -0
  21. chuk_puzzles_gym/games/minesweeper/game.py +2 -0
  22. chuk_puzzles_gym/games/nonogram/game.py +2 -0
  23. chuk_puzzles_gym/games/nqueens/game.py +5 -0
  24. chuk_puzzles_gym/games/numberlink/game.py +6 -0
  25. chuk_puzzles_gym/games/nurikabe/game.py +2 -0
  26. chuk_puzzles_gym/games/rush_hour/game.py +4 -0
  27. chuk_puzzles_gym/games/scheduler/game.py +2 -0
  28. chuk_puzzles_gym/games/shikaku/game.py +2 -0
  29. chuk_puzzles_gym/games/skyscrapers/game.py +5 -0
  30. chuk_puzzles_gym/games/slitherlink/game.py +2 -0
  31. chuk_puzzles_gym/games/sokoban/game.py +2 -0
  32. chuk_puzzles_gym/games/star_battle/game.py +2 -0
  33. chuk_puzzles_gym/games/sudoku/game.py +2 -0
  34. chuk_puzzles_gym/games/tents/game.py +2 -0
  35. chuk_puzzles_gym/gym_env.py +21 -5
  36. chuk_puzzles_gym/models/__init__.py +2 -0
  37. chuk_puzzles_gym/models/evaluation.py +165 -1
  38. chuk_puzzles_gym/server.py +51 -72
  39. {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/METADATA +124 -7
  40. {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/RECORD +43 -43
  41. {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/WHEEL +0 -0
  42. {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/entry_points.txt +0 -0
  43. {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/top_level.txt +0 -0
@@ -173,6 +173,8 @@ class LightsOutGame(PuzzleGame):
173
173
  Returns:
174
174
  Tuple of (hint_data, hint_message) or None if puzzle is complete
175
175
  """
176
+ if not self.can_use_hint():
177
+ return None
176
178
  # Find a cell in the solution that should be pressed
177
179
  for row in range(self.size):
178
180
  for col in range(self.size):
@@ -235,6 +235,8 @@ class LogicGridGame(PuzzleGame):
235
235
  Returns:
236
236
  Tuple of (hint_data, hint_message) or None if puzzle is complete
237
237
  """
238
+ if not self.can_use_hint():
239
+ return None
238
240
  # Find a connection that hasn't been marked
239
241
  for person in self.categories.person:
240
242
  attrs = self.solution[person]
@@ -211,6 +211,8 @@ class MastermindGame(PuzzleGame):
211
211
  Returns:
212
212
  Tuple of (hint_data, hint_message) or None if no hints available
213
213
  """
214
+ if not self.can_use_hint():
215
+ return None
214
216
  if self.is_complete():
215
217
  return None
216
218
 
@@ -310,6 +310,8 @@ class MinesweeperGame(PuzzleGame):
310
310
  Returns:
311
311
  Tuple of (hint_data, hint_message) or None
312
312
  """
313
+ if not self.can_use_hint():
314
+ return None
313
315
  if self.game_over:
314
316
  return None
315
317
 
@@ -192,6 +192,8 @@ class NonogramGame(PuzzleGame):
192
192
  Returns:
193
193
  Tuple of (hint_data, hint_message) or None if puzzle is complete
194
194
  """
195
+ if not self.can_use_hint():
196
+ return None
195
197
  unknown_cells = [(r, c) for r in range(self.size) for c in range(self.size) if self.grid[r][c] == -1]
196
198
  if not unknown_cells:
197
199
  return None
@@ -296,6 +296,11 @@ class NQueensGame(PuzzleGame):
296
296
 
297
297
  return "\n".join(lines)
298
298
 
299
+ def get_stats(self) -> str:
300
+ """Get current game statistics."""
301
+ placed = sum(1 for r in range(self.size) for c in range(self.size) if self.grid[r][c] == 1)
302
+ return f"Moves: {self.moves_made} | Queens: {placed}/{self.size} | Board: {self.size}x{self.size} | Seed: {self.seed}"
303
+
299
304
  def get_rules(self) -> str:
300
305
  return (
301
306
  f"N-QUEENS ({self.size}x{self.size})\n"
@@ -317,6 +317,12 @@ class NumberlinkGame(PuzzleGame):
317
317
 
318
318
  return "\n".join(lines)
319
319
 
320
+ def get_stats(self) -> str:
321
+ """Get current game statistics."""
322
+ filled = sum(1 for r in range(self.size) for c in range(self.size) if self.grid[r][c] != 0)
323
+ total = self.size * self.size
324
+ return f"Moves: {self.moves_made} | Filled: {filled}/{total} | Pairs: {self.num_pairs} | Seed: {self.seed}"
325
+
320
326
  def get_rules(self) -> str:
321
327
  return (
322
328
  f"NUMBERLINK ({self.size}x{self.size}, {self.num_pairs} pairs)\n"
@@ -479,6 +479,8 @@ class NurikabeGame(PuzzleGame):
479
479
  Returns:
480
480
  Tuple of (hint_data, hint_message) or None
481
481
  """
482
+ if not self.can_use_hint():
483
+ return None
482
484
  # Find a cell that differs from solution
483
485
  for row in range(self.size):
484
486
  for col in range(self.size):
@@ -454,6 +454,10 @@ class RushHourGame(PuzzleGame):
454
454
 
455
455
  return "\n".join(lines)
456
456
 
457
+ def get_stats(self) -> str:
458
+ """Get current game statistics."""
459
+ return f"Moves: {self.moves_made} | Vehicles: {len(self.vehicles)} | Grid: {self.size}x{self.size} | Seed: {self.seed}"
460
+
457
461
  def get_rules(self) -> str:
458
462
  return (
459
463
  f"RUSH HOUR ({self.size}x{self.size})\n"
@@ -316,6 +316,8 @@ class SchedulerGame(PuzzleGame):
316
316
  Returns:
317
317
  Tuple of (hint_data, hint_message) or None
318
318
  """
319
+ if not self.can_use_hint():
320
+ return None
319
321
  # Find an unscheduled task that's in the optimal solution
320
322
  for task_id in range(self.num_tasks):
321
323
  if task_id not in self.schedule and task_id in self.optimal_schedule:
@@ -327,6 +327,8 @@ class ShikakuGame(PuzzleGame):
327
327
 
328
328
  async def get_hint(self) -> tuple[Any, str] | None:
329
329
  """Get a hint for the next move."""
330
+ if not self.can_use_hint():
331
+ return None
330
332
  # Find a rectangle from the solution that hasn't been placed yet
331
333
  solution_rects: dict[int, list[tuple[int, int]]] = {}
332
334
  for r in range(self.size):
@@ -255,6 +255,11 @@ class SkyscrapersGame(PuzzleGame):
255
255
 
256
256
  return "\n".join(lines)
257
257
 
258
+ def get_stats(self) -> str:
259
+ """Get current game statistics."""
260
+ empty = sum(1 for r in range(self.size) for c in range(self.size) if self.grid[r][c] == 0)
261
+ return f"Moves: {self.moves_made} | Empty cells: {empty} | Grid: {self.size}x{self.size} | Seed: {self.seed}"
262
+
258
263
  def get_rules(self) -> str:
259
264
  return (
260
265
  f"SKYSCRAPERS ({self.size}x{self.size})\n"
@@ -272,6 +272,8 @@ class SlitherlinkGame(PuzzleGame):
272
272
  Returns:
273
273
  Tuple of (hint_data, hint_message) or None
274
274
  """
275
+ if not self.can_use_hint():
276
+ return None
275
277
  # Find an edge that's in the solution but not set by player
276
278
  for row in range(self.size + 1):
277
279
  for col in range(self.size):
@@ -499,6 +499,8 @@ class SokobanGame(PuzzleGame):
499
499
  Returns:
500
500
  Tuple of (hint_data, hint_message) or None
501
501
  """
502
+ if not self.can_use_hint():
503
+ return None
502
504
  if self.is_complete():
503
505
  return None
504
506
 
@@ -301,6 +301,8 @@ class StarBattleGame(PuzzleGame):
301
301
  Returns:
302
302
  Tuple of (hint_data, hint_message) or None if puzzle is complete
303
303
  """
304
+ if not self.can_use_hint():
305
+ return None
304
306
  # Find a star location from solution that hasn't been placed
305
307
  for r in range(self.size):
306
308
  for c in range(self.size):
@@ -249,6 +249,8 @@ class SudokuGame(PuzzleGame):
249
249
  Returns:
250
250
  Tuple of (hint_data, hint_message) or None if puzzle is complete
251
251
  """
252
+ if not self.can_use_hint():
253
+ return None
252
254
  empty_cells = [(r, c) for r in range(9) for c in range(9) if self.grid[r][c] == 0]
253
255
  if not empty_cells:
254
256
  return None
@@ -326,6 +326,8 @@ class TentsGame(PuzzleGame):
326
326
  Returns:
327
327
  Tuple of (hint_data, hint_message) or None if puzzle is complete
328
328
  """
329
+ if not self.can_use_hint():
330
+ return None
329
331
  # Find a tent location from solution that hasn't been placed
330
332
  for r in range(self.size):
331
333
  for c in range(self.size):
@@ -197,6 +197,7 @@ class PuzzleEnv:
197
197
  result = await self._execute_action(cmd, args)
198
198
  except Exception as e:
199
199
  self._game.invalid_moves += 1
200
+ self._game.reasoning_tracker.record_invalid_move()
200
201
  return (
201
202
  self._get_observation(),
202
203
  self.reward_config["invalid_attempt"],
@@ -207,17 +208,25 @@ class PuzzleEnv:
207
208
 
208
209
  self._step_count += 1
209
210
 
211
+ # Build position tuple from parsed args for reasoning tracker
212
+ position = tuple(args)
213
+
210
214
  # Calculate reward
211
215
  if result.success:
212
216
  reward = self.reward_config["correct_placement"]
213
217
 
218
+ # Feed reasoning tracker
219
+ # optimal_steps is dynamic (reflects current state), so use it directly
220
+ remaining = self._game.optimal_steps or 0
221
+ self._game.reasoning_tracker.record_valid_move(position, remaining)
222
+
214
223
  # Check for completion
215
224
  terminated = self._game.is_complete()
216
225
  if terminated:
217
226
  # Add completion bonus with efficiency multiplier
218
- optimal = self._game.optimal_steps
219
- if optimal and self._game.moves_made > 0:
220
- efficiency = min(1.0, optimal / self._game.moves_made)
227
+ opt = self._game.optimal_steps
228
+ if opt and self._game.moves_made > 0:
229
+ efficiency = min(1.0, opt / self._game.moves_made)
221
230
  else:
222
231
  efficiency = 1.0
223
232
  reward += (
@@ -226,11 +235,12 @@ class PuzzleEnv:
226
235
  else:
227
236
  reward = self.reward_config["invalid_attempt"]
228
237
  self._game.invalid_moves += 1
238
+ self._game.reasoning_tracker.record_invalid_move()
229
239
  terminated = False
230
240
 
231
241
  truncated = self._step_count >= self.max_steps
232
242
 
233
- info = {
243
+ info: dict[str, Any] = {
234
244
  "action": action_str,
235
245
  "success": result.success,
236
246
  "message": result.message,
@@ -239,6 +249,10 @@ class PuzzleEnv:
239
249
  "hints_used": self._game.hints_used,
240
250
  }
241
251
 
252
+ # Include reasoning metrics on episode end
253
+ if terminated or truncated:
254
+ info["reasoning_metrics"] = self._game.get_reasoning_metrics().to_dict()
255
+
242
256
  return self._get_observation(), reward, terminated, truncated, info
243
257
 
244
258
  async def _execute_action(self, cmd: str, args: list[str]) -> Any:
@@ -371,7 +385,7 @@ class PuzzleEnv:
371
385
  if self._game is None:
372
386
  return {"error": "no_game"}
373
387
 
374
- obs = {
388
+ obs: dict[str, Any] = {
375
389
  "game": self._game.name,
376
390
  "difficulty": self._game.difficulty.value,
377
391
  "seed": self._game.seed,
@@ -397,6 +411,7 @@ class PuzzleEnv:
397
411
  return {}
398
412
 
399
413
  profile = self._game.difficulty_profile
414
+ reasoning = self._game.get_reasoning_metrics()
400
415
  return {
401
416
  "optimal_steps": self._game.optimal_steps,
402
417
  "difficulty_profile": {
@@ -411,6 +426,7 @@ class PuzzleEnv:
411
426
  "hint_budget": self.solver_config.hint_budget,
412
427
  "hint_penalty": self.solver_config.hint_penalty,
413
428
  },
429
+ "reasoning_metrics": reasoning.to_dict(),
414
430
  }
415
431
 
416
432
  def render(self, mode: str = "ansi") -> str | None:
@@ -20,6 +20,7 @@ from .evaluation import (
20
20
  EpisodeTracer,
21
21
  EvaluationSummary,
22
22
  MoveRecord,
23
+ ReasoningMetrics,
23
24
  SolverConfig,
24
25
  TraceEvent,
25
26
  )
@@ -42,6 +43,7 @@ __all__ = [
42
43
  "EpisodeTracer",
43
44
  "EvaluationSummary",
44
45
  "MoveRecord",
46
+ "ReasoningMetrics",
45
47
  "SolverConfig",
46
48
  "TraceEvent",
47
49
  ]
@@ -38,6 +38,132 @@ class MoveRecord(BaseModel):
38
38
  timestamp_ms: int = Field(default=0, description="Milliseconds since episode start")
39
39
 
40
40
 
41
+ class ReasoningMetrics(BaseModel):
42
+ """Reasoning depth metrics for evaluating quality of agent reasoning.
43
+
44
+ Goes beyond binary success/failure to measure *how* an agent reasons:
45
+ - Backtrack detection: did the agent revise previous placements?
46
+ - Progress tracking: how steadily did the agent make progress?
47
+ - Error patterns: were errors isolated or clustered in streaks?
48
+ - Reasoning overhead: how much wasted work relative to optimal?
49
+ """
50
+
51
+ model_config = ConfigDict(frozen=True)
52
+
53
+ # Raw tracking data
54
+ backtrack_count: int = Field(
55
+ default=0,
56
+ ge=0,
57
+ description="Times agent placed a value at a previously filled position",
58
+ )
59
+ solver_distance_trace: list[int] = Field(
60
+ default_factory=list,
61
+ description="Remaining positions to fill after each valid move",
62
+ )
63
+ error_streak_max: int = Field(
64
+ default=0,
65
+ ge=0,
66
+ description="Longest consecutive run of invalid moves",
67
+ )
68
+ error_streaks: list[int] = Field(
69
+ default_factory=list,
70
+ description="Lengths of each consecutive error streak",
71
+ )
72
+ total_actions: int = Field(
73
+ default=0,
74
+ ge=0,
75
+ description="Total actions taken (valid + invalid)",
76
+ )
77
+ optimal_path_length: int | None = Field(
78
+ default=None,
79
+ ge=1,
80
+ description="Minimum steps to solve (from solver)",
81
+ )
82
+
83
+ @computed_field
84
+ @property
85
+ def reasoning_overhead(self) -> float:
86
+ """Ratio of total actions to optimal path length.
87
+
88
+ 1.0 = perfect (no wasted actions). Higher = more wasted reasoning.
89
+ Returns 0.0 if optimal path length is unknown.
90
+ """
91
+ if self.optimal_path_length is None or self.optimal_path_length == 0:
92
+ return 0.0
93
+ if self.total_actions == 0:
94
+ return 0.0
95
+ return self.total_actions / self.optimal_path_length
96
+
97
+ @computed_field
98
+ @property
99
+ def backtrack_rate(self) -> float:
100
+ """Fraction of valid moves that were backtracks (revisions).
101
+
102
+ 0.0 = no backtracks, 1.0 = every move was a revision.
103
+ """
104
+ valid_moves = len(self.solver_distance_trace)
105
+ if valid_moves == 0:
106
+ return 0.0
107
+ return self.backtrack_count / valid_moves
108
+
109
+ @computed_field
110
+ @property
111
+ def progress_velocity(self) -> float:
112
+ """Average progress per valid move (cells solved per step).
113
+
114
+ Measures how much closer to the solution each move gets.
115
+ 1.0 = every move reduces remaining by exactly 1. Lower = backtracks/plateaus.
116
+ Returns 0.0 if insufficient data.
117
+ """
118
+ trace = self.solver_distance_trace
119
+ if len(trace) < 2:
120
+ return 0.0
121
+ total_progress = trace[0] - trace[-1]
122
+ steps = len(trace) - 1
123
+ if steps == 0:
124
+ return 0.0
125
+ return total_progress / steps
126
+
127
+ @computed_field
128
+ @property
129
+ def progress_steadiness(self) -> float:
130
+ """Measure of how monotonically progress decreased (0.0 to 1.0).
131
+
132
+ 1.0 = perfectly monotonic progress (every move reduced remaining count).
133
+ 0.0 = no monotonic progress at all.
134
+ """
135
+ trace = self.solver_distance_trace
136
+ if len(trace) < 2:
137
+ return 1.0
138
+ monotonic_steps = sum(1 for i in range(1, len(trace)) if trace[i] < trace[i - 1])
139
+ return monotonic_steps / (len(trace) - 1)
140
+
141
+ @computed_field
142
+ @property
143
+ def avg_error_streak(self) -> float:
144
+ """Average length of consecutive error streaks.
145
+
146
+ Returns 0.0 if no error streaks occurred.
147
+ """
148
+ if not self.error_streaks:
149
+ return 0.0
150
+ return sum(self.error_streaks) / len(self.error_streaks)
151
+
152
+ def to_dict(self) -> dict[str, Any]:
153
+ """Convert to flat dictionary for reporting."""
154
+ return {
155
+ "backtrack_count": self.backtrack_count,
156
+ "backtrack_rate": round(self.backtrack_rate, 3),
157
+ "reasoning_overhead": round(self.reasoning_overhead, 3),
158
+ "progress_velocity": round(self.progress_velocity, 3),
159
+ "progress_steadiness": round(self.progress_steadiness, 3),
160
+ "error_streak_max": self.error_streak_max,
161
+ "avg_error_streak": round(self.avg_error_streak, 3),
162
+ "total_actions": self.total_actions,
163
+ "optimal_path_length": self.optimal_path_length,
164
+ }
165
+
166
+
41
167
  class EpisodeResult(BaseModel):
42
168
  """Complete result of a single puzzle episode with normalized metrics.
43
169
 
@@ -91,6 +217,12 @@ class EpisodeResult(BaseModel):
91
217
  description="Complete move history for detailed analysis",
92
218
  )
93
219
 
220
+ # Reasoning depth metrics
221
+ reasoning_metrics: ReasoningMetrics | None = Field(
222
+ default=None,
223
+ description="Detailed reasoning depth metrics (backtracks, progress, error patterns)",
224
+ )
225
+
94
226
  # Computed normalized metrics
95
227
  @computed_field
96
228
  @property
@@ -154,7 +286,7 @@ class EpisodeResult(BaseModel):
154
286
 
155
287
  def to_summary_dict(self) -> dict[str, Any]:
156
288
  """One-line episode summary for logging/streaming."""
157
- return {
289
+ d: dict[str, Any] = {
158
290
  "game": self.game,
159
291
  "seed": self.seed,
160
292
  "difficulty": self.difficulty.value,
@@ -165,6 +297,9 @@ class EpisodeResult(BaseModel):
165
297
  "efficiency": round(self.efficiency_score, 3),
166
298
  "time_ms": self.wall_time_ms,
167
299
  }
300
+ if self.reasoning_metrics is not None:
301
+ d["reasoning"] = self.reasoning_metrics.to_dict()
302
+ return d
168
303
 
169
304
  def to_jsonl(self) -> str:
170
305
  """Single-line JSON for streaming output."""
@@ -217,6 +352,35 @@ class EvaluationSummary(BaseModel):
217
352
  return 0.0
218
353
  return sum(e.wall_time_ms for e in self.episodes) / len(self.episodes)
219
354
 
355
+ @computed_field
356
+ @property
357
+ def avg_backtrack_rate(self) -> float:
358
+ """Average backtrack rate across episodes with reasoning metrics."""
359
+ with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
360
+ if not with_metrics:
361
+ return 0.0
362
+ return sum(e.reasoning_metrics.backtrack_rate for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
363
+
364
+ @computed_field
365
+ @property
366
+ def avg_reasoning_overhead(self) -> float:
367
+ """Average reasoning overhead across episodes with reasoning metrics."""
368
+ with_metrics = [
369
+ e for e in self.episodes if e.reasoning_metrics is not None and e.reasoning_metrics.reasoning_overhead > 0
370
+ ]
371
+ if not with_metrics:
372
+ return 0.0
373
+ return sum(e.reasoning_metrics.reasoning_overhead for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
374
+
375
+ @computed_field
376
+ @property
377
+ def avg_progress_steadiness(self) -> float:
378
+ """Average progress steadiness across episodes with reasoning metrics."""
379
+ with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
380
+ if not with_metrics:
381
+ return 0.0
382
+ return sum(e.reasoning_metrics.progress_steadiness for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
383
+
220
384
 
221
385
  class TraceEvent(BaseModel):
222
386
  """A single event in an episode trace for JSONL logging."""