chuk-puzzles-gym 0.10.1-py3-none-any.whl → 0.10.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chuk_puzzles_gym/eval.py CHANGED
@@ -25,7 +25,7 @@ import sys
25
25
  import time
26
26
  from dataclasses import dataclass, field
27
27
  from datetime import datetime
28
- from typing import TYPE_CHECKING
28
+ from typing import TYPE_CHECKING, Any
29
29
 
30
30
  if TYPE_CHECKING:
31
31
  pass
@@ -100,8 +100,36 @@ class EvaluationReport:
100
100
  return 0.0
101
101
  return sum(e.hints_used for e in self.episodes) / self.total_episodes
102
102
 
103
+ @property
104
+ def avg_backtrack_rate(self) -> float:
105
+ """Average backtrack rate across episodes with reasoning metrics."""
106
+ with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
107
+ if not with_metrics:
108
+ return 0.0
109
+ return sum(e.reasoning_metrics.backtrack_rate for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
110
+
111
+ @property
112
+ def avg_reasoning_overhead(self) -> float:
113
+ """Average reasoning overhead across episodes with reasoning metrics."""
114
+ with_metrics = [
115
+ e for e in self.episodes if e.reasoning_metrics is not None and e.reasoning_metrics.reasoning_overhead > 0
116
+ ]
117
+ if not with_metrics:
118
+ return 0.0
119
+ return sum(e.reasoning_metrics.reasoning_overhead for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
120
+
121
+ @property
122
+ def avg_progress_steadiness(self) -> float:
123
+ """Average progress steadiness across episodes with reasoning metrics."""
124
+ with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
125
+ if not with_metrics:
126
+ return 0.0
127
+ return sum(e.reasoning_metrics.progress_steadiness for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
128
+
103
129
  def to_markdown(self) -> str:
104
130
  """Generate markdown report."""
131
+ has_reasoning = any(e.reasoning_metrics is not None for e in self.episodes)
132
+
105
133
  lines = [
106
134
  f"# {self.game.title()} {self.difficulty.title()} Evaluation",
107
135
  "",
@@ -112,24 +140,78 @@ class EvaluationReport:
112
140
  f"**Avg Hints:** {self.avg_hints:.1f}",
113
141
  f"**Avg Efficiency:** {self.avg_efficiency:.1%}",
114
142
  f"**Avg Time:** {self.avg_time_ms:.0f}ms",
115
- "",
116
- f"**Solver Config:** {'solver-free' if not self.solver_config.solver_allowed else f'budget={self.solver_config.hint_budget}, penalty={self.solver_config.hint_penalty}'}",
117
- "",
118
- "## Episode Details",
119
- "",
120
- "| Seed | Status | Steps | Invalid | Hints | Efficiency | Time (ms) |",
121
- "|------|--------|-------|---------|-------|------------|-----------|",
122
143
  ]
123
- for e in self.episodes:
124
- status = "solved" if e.success else e.status.value
125
- eff = f"{e.efficiency_score:.0%}" if e.success else "-"
144
+
145
+ if has_reasoning:
146
+ lines.extend(
147
+ [
148
+ "",
149
+ "### Reasoning Depth",
150
+ f"**Avg Backtrack Rate:** {self.avg_backtrack_rate:.1%}",
151
+ f"**Avg Reasoning Overhead:** {self.avg_reasoning_overhead:.2f}x",
152
+ f"**Avg Progress Steadiness:** {self.avg_progress_steadiness:.1%}",
153
+ ]
154
+ )
155
+
156
+ lines.extend(
157
+ [
158
+ "",
159
+ f"**Solver Config:** {'solver-free' if not self.solver_config.solver_allowed else f'budget={self.solver_config.hint_budget}, penalty={self.solver_config.hint_penalty}'}",
160
+ "",
161
+ "## Episode Details",
162
+ "",
163
+ ]
164
+ )
165
+
166
+ if has_reasoning:
167
+ lines.append(
168
+ "| Seed | Status | Steps | Invalid | Hints | Efficiency | Backtracks | Steadiness | Time (ms) |"
169
+ )
126
170
  lines.append(
127
- f"| {e.seed} | {status} | {e.steps_taken} | {e.invalid_actions} | {e.hints_used} | {eff} | {e.wall_time_ms} |"
171
+ "|------|--------|-------|---------|-------|------------|------------|------------|-----------|"
128
172
  )
173
+ for e in self.episodes:
174
+ status = "solved" if e.success else e.status.value
175
+ eff = f"{e.efficiency_score:.0%}" if e.success else "-"
176
+ bt = str(e.reasoning_metrics.backtrack_count) if e.reasoning_metrics else "-"
177
+ st = f"{e.reasoning_metrics.progress_steadiness:.0%}" if e.reasoning_metrics else "-"
178
+ lines.append(
179
+ f"| {e.seed} | {status} | {e.steps_taken} | {e.invalid_actions} | {e.hints_used} | {eff} | {bt} | {st} | {e.wall_time_ms} |"
180
+ )
181
+ else:
182
+ lines.append("| Seed | Status | Steps | Invalid | Hints | Efficiency | Time (ms) |")
183
+ lines.append("|------|--------|-------|---------|-------|------------|-----------|")
184
+ for e in self.episodes:
185
+ status = "solved" if e.success else e.status.value
186
+ eff = f"{e.efficiency_score:.0%}" if e.success else "-"
187
+ lines.append(
188
+ f"| {e.seed} | {status} | {e.steps_taken} | {e.invalid_actions} | {e.hints_used} | {eff} | {e.wall_time_ms} |"
189
+ )
190
+
129
191
  return "\n".join(lines)
130
192
 
131
193
  def to_json(self) -> str:
132
194
  """Generate JSON report."""
195
+ summary: dict[str, Any] = {
196
+ "total_episodes": self.total_episodes,
197
+ "solved_count": self.solved_count,
198
+ "solve_rate": self.solve_rate,
199
+ "avg_steps": self.avg_moves,
200
+ "avg_invalid": self.avg_invalid_moves,
201
+ "avg_hints": self.avg_hints,
202
+ "avg_efficiency": self.avg_efficiency,
203
+ "avg_time_ms": self.avg_time_ms,
204
+ }
205
+
206
+ # Add aggregate reasoning metrics if available
207
+ has_reasoning = any(e.reasoning_metrics is not None for e in self.episodes)
208
+ if has_reasoning:
209
+ summary["reasoning"] = {
210
+ "avg_backtrack_rate": round(self.avg_backtrack_rate, 3),
211
+ "avg_reasoning_overhead": round(self.avg_reasoning_overhead, 3),
212
+ "avg_progress_steadiness": round(self.avg_progress_steadiness, 3),
213
+ }
214
+
133
215
  return json.dumps(
134
216
  {
135
217
  "game": self.game,
@@ -139,16 +221,7 @@ class EvaluationReport:
139
221
  "hint_budget": self.solver_config.hint_budget,
140
222
  "hint_penalty": self.solver_config.hint_penalty,
141
223
  },
142
- "summary": {
143
- "total_episodes": self.total_episodes,
144
- "solved_count": self.solved_count,
145
- "solve_rate": self.solve_rate,
146
- "avg_steps": self.avg_moves,
147
- "avg_invalid": self.avg_invalid_moves,
148
- "avg_hints": self.avg_hints,
149
- "avg_efficiency": self.avg_efficiency,
150
- "avg_time_ms": self.avg_time_ms,
151
- },
224
+ "summary": summary,
152
225
  "episodes": [e.to_summary_dict() for e in self.episodes],
153
226
  },
154
227
  indent=2,
@@ -158,35 +231,61 @@ class EvaluationReport:
158
231
  """Generate CSV report."""
159
232
  import io
160
233
 
234
+ has_reasoning = any(e.reasoning_metrics is not None for e in self.episodes)
235
+
161
236
  output = io.StringIO()
162
237
  writer = csv.writer(output)
163
- writer.writerow(
164
- [
165
- "game",
166
- "difficulty",
167
- "seed",
168
- "status",
169
- "steps_taken",
170
- "invalid_actions",
171
- "hints_used",
172
- "efficiency",
173
- "wall_time_ms",
174
- ]
175
- )
176
- for e in self.episodes:
177
- writer.writerow(
238
+
239
+ header = [
240
+ "game",
241
+ "difficulty",
242
+ "seed",
243
+ "status",
244
+ "steps_taken",
245
+ "invalid_actions",
246
+ "hints_used",
247
+ "efficiency",
248
+ "wall_time_ms",
249
+ ]
250
+ if has_reasoning:
251
+ header.extend(
178
252
  [
179
- e.game,
180
- e.difficulty.value,
181
- e.seed,
182
- e.status.value,
183
- e.steps_taken,
184
- e.invalid_actions,
185
- e.hints_used,
186
- f"{e.efficiency_score:.3f}",
187
- e.wall_time_ms,
253
+ "backtrack_count",
254
+ "backtrack_rate",
255
+ "reasoning_overhead",
256
+ "progress_steadiness",
257
+ "error_streak_max",
188
258
  ]
189
259
  )
260
+ writer.writerow(header)
261
+
262
+ for e in self.episodes:
263
+ row = [
264
+ e.game,
265
+ e.difficulty.value,
266
+ e.seed,
267
+ e.status.value,
268
+ e.steps_taken,
269
+ e.invalid_actions,
270
+ e.hints_used,
271
+ f"{e.efficiency_score:.3f}",
272
+ e.wall_time_ms,
273
+ ]
274
+ if has_reasoning:
275
+ rm = e.reasoning_metrics
276
+ if rm is not None:
277
+ row.extend(
278
+ [
279
+ rm.backtrack_count,
280
+ f"{rm.backtrack_rate:.3f}",
281
+ f"{rm.reasoning_overhead:.3f}",
282
+ f"{rm.progress_steadiness:.3f}",
283
+ rm.error_streak_max,
284
+ ]
285
+ )
286
+ else:
287
+ row.extend(["", "", "", "", ""])
288
+ writer.writerow(row)
190
289
  return output.getvalue()
191
290
 
192
291
  def print_summary(self) -> None:
@@ -206,6 +305,15 @@ class EvaluationReport:
206
305
  print(f"Avg Efficiency: {self.avg_efficiency:.1%}")
207
306
  print(f"Avg Time: {self.avg_time_ms:.0f}ms")
208
307
 
308
+ # Reasoning depth metrics
309
+ has_reasoning = any(e.reasoning_metrics is not None for e in self.episodes)
310
+ if has_reasoning:
311
+ print("-" * 40)
312
+ print("Reasoning Depth:")
313
+ print(f" Backtrack Rate: {self.avg_backtrack_rate:.1%}")
314
+ print(f" Reasoning Overhead: {self.avg_reasoning_overhead:.2f}x")
315
+ print(f" Progress Steadiness: {self.avg_progress_steadiness:.1%}")
316
+
209
317
 
210
318
  async def _apply_hint(game: PuzzleGame, hint_data: tuple) -> MoveResult:
211
319
  """Apply a hint to the game based on game type.
@@ -433,15 +541,22 @@ async def run_episode(
433
541
  # Apply the hint based on game type
434
542
  try:
435
543
  result = await _apply_hint(game, hint_data)
544
+ # Normalize hint_data to a tuple for position tracking
545
+ position = hint_data if isinstance(hint_data, tuple) else (hint_data,)
436
546
  if result.success:
437
547
  steps_taken += 1
548
+ # Use game's dynamic optimal_steps (reflects current state)
549
+ remaining = game.optimal_steps or 0
550
+ game.reasoning_tracker.record_valid_move(position, remaining)
438
551
  else:
439
552
  invalid_actions += 1
553
+ game.reasoning_tracker.record_invalid_move()
440
554
  # If we get too many consecutive invalid moves, break
441
555
  if invalid_actions > 50:
442
556
  break
443
557
  except (TypeError, ValueError, AttributeError, IndexError):
444
558
  invalid_actions += 1
559
+ game.reasoning_tracker.record_invalid_move()
445
560
  if invalid_actions > 50:
446
561
  break
447
562
  elif not use_hints:
@@ -461,6 +576,12 @@ async def run_episode(
461
576
  # Get retries from game if tracked
462
577
  retries = getattr(game, "retries", 0)
463
578
 
579
+ # Collect reasoning depth metrics (use pre-solve optimal_steps since
580
+ # the game's optimal_steps may be 0 after solving)
581
+ reasoning_metrics = game.reasoning_tracker.to_metrics(
582
+ optimal_path_length=optimal_steps if optimal_steps and optimal_steps >= 1 else None,
583
+ )
584
+
464
585
  return EpisodeResult(
465
586
  game=game.name,
466
587
  difficulty=DifficultyLevel(difficulty),
@@ -475,6 +596,7 @@ async def run_episode(
475
596
  retries=retries,
476
597
  optimal_steps=optimal_steps,
477
598
  solver_config=solver_config,
599
+ reasoning_metrics=reasoning_metrics,
478
600
  )
479
601
 
480
602
 
chuk_puzzles_gym/export/dataset.py CHANGED
@@ -190,6 +190,12 @@ class DatasetExporter:
190
190
  if canonical:
191
191
  gold_answer = str(canonical)
192
192
 
193
+ # Build reasoning tags from complexity profile
194
+ complexity_profile = game.complexity_profile
195
+ reasoning_type = complexity_profile.get("reasoning_type", "deductive")
196
+ search_space = complexity_profile.get("search_space", "medium")
197
+ tags = [domain, difficulty.value, f"reasoning:{reasoning_type}", f"search:{search_space}"]
198
+
193
199
  # Create Problem using core schema
194
200
  return Problem(
195
201
  # Identity
@@ -214,7 +220,7 @@ class DatasetExporter:
214
220
  ),
215
221
  # Metadata
216
222
  operation_count=game.optimal_steps,
217
- tags=[domain, difficulty.value],
223
+ tags=tags,
218
224
  )
219
225
 
220
226
  @property
chuk_puzzles_gym/games/_base/game.py CHANGED
@@ -1,10 +1,99 @@
1
1
  """Abstract base class for all puzzle games."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import random
4
6
  from abc import ABC, abstractmethod
5
7
  from typing import Any
6
8
 
7
9
  from ...models import DifficultyLevel, DifficultyProfile, MoveResult, SolverConfig
10
+ from ...models.evaluation import ReasoningMetrics
11
+
12
+
13
+ class ReasoningTracker:
14
+ """Tracks reasoning depth metrics during puzzle gameplay.
15
+
16
+ Accumulates data about backtrack behavior, solver distance progression,
17
+ and error patterns. Produces a ReasoningMetrics snapshot on demand.
18
+
19
+ This is a lightweight, non-Pydantic class meant to be mutated during play.
20
+ """
21
+
22
+ __slots__ = (
23
+ "_placed_positions",
24
+ "_solver_distance_trace",
25
+ "_backtrack_count",
26
+ "_consecutive_errors",
27
+ "_error_streaks",
28
+ "_max_error_streak",
29
+ "_total_actions",
30
+ )
31
+
32
+ def __init__(self) -> None:
33
+ self._placed_positions: set[tuple[Any, ...]] = set()
34
+ self._solver_distance_trace: list[int] = []
35
+ self._backtrack_count: int = 0
36
+ self._consecutive_errors: int = 0
37
+ self._error_streaks: list[int] = []
38
+ self._max_error_streak: int = 0
39
+ self._total_actions: int = 0
40
+
41
+ def record_valid_move(self, position: tuple[Any, ...], remaining_count: int) -> None:
42
+ """Record a valid (successful) move.
43
+
44
+ Args:
45
+ position: The position/target of the move (for backtrack detection)
46
+ remaining_count: How many positions remain to be filled after this move
47
+ """
48
+ self._total_actions += 1
49
+
50
+ # Detect backtrack: placing at a position already placed before
51
+ if position in self._placed_positions:
52
+ self._backtrack_count += 1
53
+ self._placed_positions.add(position)
54
+
55
+ self._solver_distance_trace.append(remaining_count)
56
+
57
+ # Finalize any pending error streak
58
+ if self._consecutive_errors > 0:
59
+ self._error_streaks.append(self._consecutive_errors)
60
+ self._consecutive_errors = 0
61
+
62
+ def record_invalid_move(self) -> None:
63
+ """Record an invalid (failed) move."""
64
+ self._total_actions += 1
65
+ self._consecutive_errors += 1
66
+ self._max_error_streak = max(self._max_error_streak, self._consecutive_errors)
67
+
68
+ def to_metrics(self, optimal_path_length: int | None = None) -> ReasoningMetrics:
69
+ """Produce a frozen ReasoningMetrics snapshot.
70
+
71
+ Args:
72
+ optimal_path_length: Minimum steps to solve (from solver), if known.
73
+ """
74
+ # Finalize any pending error streak
75
+ error_streaks = list(self._error_streaks)
76
+ if self._consecutive_errors > 0:
77
+ error_streaks.append(self._consecutive_errors)
78
+
79
+ return ReasoningMetrics(
80
+ backtrack_count=self._backtrack_count,
81
+ solver_distance_trace=list(self._solver_distance_trace),
82
+ error_streak_max=self._max_error_streak,
83
+ error_streaks=error_streaks,
84
+ total_actions=self._total_actions,
85
+ optimal_path_length=optimal_path_length,
86
+ )
87
+
88
+ def reset(self) -> None:
89
+ """Reset all tracked state."""
90
+ self._placed_positions.clear()
91
+ self._solver_distance_trace.clear()
92
+ self._backtrack_count = 0
93
+ self._consecutive_errors = 0
94
+ self._error_streaks.clear()
95
+ self._max_error_streak = 0
96
+ self._total_actions = 0
8
97
 
9
98
 
10
99
  class PuzzleGame(ABC):
@@ -64,6 +153,9 @@ class PuzzleGame(ABC):
64
153
  self.game_started = False
65
154
  self._last_move_position: tuple[Any, ...] | None = None # For retry detection
66
155
 
156
+ # Reasoning depth tracker
157
+ self._reasoning_tracker = ReasoningTracker()
158
+
67
159
  @abstractmethod
68
160
  async def generate_puzzle(self) -> None:
69
161
  """Generate a new puzzle with a unique solution.
@@ -162,8 +254,11 @@ class PuzzleGame(ABC):
162
254
  """
163
255
  if success:
164
256
  self.moves_made += 1
257
+ remaining = self._compute_remaining()
258
+ self._reasoning_tracker.record_valid_move(position, remaining)
165
259
  else:
166
260
  self.invalid_moves += 1
261
+ self._reasoning_tracker.record_invalid_move()
167
262
 
168
263
  # Detect retries (same position attempted again)
169
264
  if self._last_move_position == position:
@@ -183,6 +278,34 @@ class PuzzleGame(ABC):
183
278
  self.hints_used += 1
184
279
  return True
185
280
 
281
+ def _compute_remaining(self) -> int:
282
+ """Compute how many positions remain to be filled.
283
+
284
+ Uses optimal_steps directly since it is typically dynamic
285
+ (reflects current game state, e.g. counting empty cells).
286
+ Override in subclasses for more accurate tracking.
287
+ """
288
+ return self.optimal_steps or 0
289
+
290
+ def get_reasoning_metrics(self) -> ReasoningMetrics:
291
+ """Get a snapshot of reasoning depth metrics for the current episode.
292
+
293
+ Returns:
294
+ Frozen ReasoningMetrics with all tracked data.
295
+ """
296
+ optimal = self.optimal_steps
297
+ # optimal_path_length requires ge=1; treat 0 or negative as unknown
298
+ if optimal is not None and optimal < 1:
299
+ optimal = None
300
+ return self._reasoning_tracker.to_metrics(
301
+ optimal_path_length=optimal,
302
+ )
303
+
304
+ @property
305
+ def reasoning_tracker(self) -> ReasoningTracker:
306
+ """Access the reasoning tracker directly."""
307
+ return self._reasoning_tracker
308
+
186
309
  def can_use_hint(self) -> bool:
187
310
  """Check if hints are available without consuming one.
188
311
 
chuk_puzzles_gym/gym_env.py CHANGED
@@ -197,6 +197,7 @@ class PuzzleEnv:
197
197
  result = await self._execute_action(cmd, args)
198
198
  except Exception as e:
199
199
  self._game.invalid_moves += 1
200
+ self._game.reasoning_tracker.record_invalid_move()
200
201
  return (
201
202
  self._get_observation(),
202
203
  self.reward_config["invalid_attempt"],
@@ -207,17 +208,25 @@ class PuzzleEnv:
207
208
 
208
209
  self._step_count += 1
209
210
 
211
+ # Build position tuple from parsed args for reasoning tracker
212
+ position = tuple(args)
213
+
210
214
  # Calculate reward
211
215
  if result.success:
212
216
  reward = self.reward_config["correct_placement"]
213
217
 
218
+ # Feed reasoning tracker
219
+ # optimal_steps is dynamic (reflects current state), so use it directly
220
+ remaining = self._game.optimal_steps or 0
221
+ self._game.reasoning_tracker.record_valid_move(position, remaining)
222
+
214
223
  # Check for completion
215
224
  terminated = self._game.is_complete()
216
225
  if terminated:
217
226
  # Add completion bonus with efficiency multiplier
218
- optimal = self._game.optimal_steps
219
- if optimal and self._game.moves_made > 0:
220
- efficiency = min(1.0, optimal / self._game.moves_made)
227
+ opt = self._game.optimal_steps
228
+ if opt and self._game.moves_made > 0:
229
+ efficiency = min(1.0, opt / self._game.moves_made)
221
230
  else:
222
231
  efficiency = 1.0
223
232
  reward += (
@@ -226,11 +235,12 @@ class PuzzleEnv:
226
235
  else:
227
236
  reward = self.reward_config["invalid_attempt"]
228
237
  self._game.invalid_moves += 1
238
+ self._game.reasoning_tracker.record_invalid_move()
229
239
  terminated = False
230
240
 
231
241
  truncated = self._step_count >= self.max_steps
232
242
 
233
- info = {
243
+ info: dict[str, Any] = {
234
244
  "action": action_str,
235
245
  "success": result.success,
236
246
  "message": result.message,
@@ -239,6 +249,10 @@ class PuzzleEnv:
239
249
  "hints_used": self._game.hints_used,
240
250
  }
241
251
 
252
+ # Include reasoning metrics on episode end
253
+ if terminated or truncated:
254
+ info["reasoning_metrics"] = self._game.get_reasoning_metrics().to_dict()
255
+
242
256
  return self._get_observation(), reward, terminated, truncated, info
243
257
 
244
258
  async def _execute_action(self, cmd: str, args: list[str]) -> Any:
@@ -371,7 +385,7 @@ class PuzzleEnv:
371
385
  if self._game is None:
372
386
  return {"error": "no_game"}
373
387
 
374
- obs = {
388
+ obs: dict[str, Any] = {
375
389
  "game": self._game.name,
376
390
  "difficulty": self._game.difficulty.value,
377
391
  "seed": self._game.seed,
@@ -397,6 +411,7 @@ class PuzzleEnv:
397
411
  return {}
398
412
 
399
413
  profile = self._game.difficulty_profile
414
+ reasoning = self._game.get_reasoning_metrics()
400
415
  return {
401
416
  "optimal_steps": self._game.optimal_steps,
402
417
  "difficulty_profile": {
@@ -411,6 +426,7 @@ class PuzzleEnv:
411
426
  "hint_budget": self.solver_config.hint_budget,
412
427
  "hint_penalty": self.solver_config.hint_penalty,
413
428
  },
429
+ "reasoning_metrics": reasoning.to_dict(),
414
430
  }
415
431
 
416
432
  def render(self, mode: str = "ansi") -> str | None:
chuk_puzzles_gym/models/__init__.py CHANGED
@@ -20,6 +20,7 @@ from .evaluation import (
20
20
  EpisodeTracer,
21
21
  EvaluationSummary,
22
22
  MoveRecord,
23
+ ReasoningMetrics,
23
24
  SolverConfig,
24
25
  TraceEvent,
25
26
  )
@@ -42,6 +43,7 @@ __all__ = [
42
43
  "EpisodeTracer",
43
44
  "EvaluationSummary",
44
45
  "MoveRecord",
46
+ "ReasoningMetrics",
45
47
  "SolverConfig",
46
48
  "TraceEvent",
47
49
  ]
chuk_puzzles_gym/models/evaluation.py CHANGED
@@ -38,6 +38,132 @@ class MoveRecord(BaseModel):
38
38
  timestamp_ms: int = Field(default=0, description="Milliseconds since episode start")
39
39
 
40
40
 
41
+ class ReasoningMetrics(BaseModel):
42
+ """Reasoning depth metrics for evaluating quality of agent reasoning.
43
+
44
+ Goes beyond binary success/failure to measure *how* an agent reasons:
45
+ - Backtrack detection: did the agent revise previous placements?
46
+ - Progress tracking: how steadily did the agent make progress?
47
+ - Error patterns: were errors isolated or clustered in streaks?
48
+ - Reasoning overhead: how much wasted work relative to optimal?
49
+ """
50
+
51
+ model_config = ConfigDict(frozen=True)
52
+
53
+ # Raw tracking data
54
+ backtrack_count: int = Field(
55
+ default=0,
56
+ ge=0,
57
+ description="Times agent placed a value at a previously filled position",
58
+ )
59
+ solver_distance_trace: list[int] = Field(
60
+ default_factory=list,
61
+ description="Remaining positions to fill after each valid move",
62
+ )
63
+ error_streak_max: int = Field(
64
+ default=0,
65
+ ge=0,
66
+ description="Longest consecutive run of invalid moves",
67
+ )
68
+ error_streaks: list[int] = Field(
69
+ default_factory=list,
70
+ description="Lengths of each consecutive error streak",
71
+ )
72
+ total_actions: int = Field(
73
+ default=0,
74
+ ge=0,
75
+ description="Total actions taken (valid + invalid)",
76
+ )
77
+ optimal_path_length: int | None = Field(
78
+ default=None,
79
+ ge=1,
80
+ description="Minimum steps to solve (from solver)",
81
+ )
82
+
83
+ @computed_field
84
+ @property
85
+ def reasoning_overhead(self) -> float:
86
+ """Ratio of total actions to optimal path length.
87
+
88
+ 1.0 = perfect (no wasted actions). Higher = more wasted reasoning.
89
+ Returns 0.0 if optimal path length is unknown.
90
+ """
91
+ if self.optimal_path_length is None or self.optimal_path_length == 0:
92
+ return 0.0
93
+ if self.total_actions == 0:
94
+ return 0.0
95
+ return self.total_actions / self.optimal_path_length
96
+
97
+ @computed_field
98
+ @property
99
+ def backtrack_rate(self) -> float:
100
+ """Fraction of valid moves that were backtracks (revisions).
101
+
102
+ 0.0 = no backtracks, 1.0 = every move was a revision.
103
+ """
104
+ valid_moves = len(self.solver_distance_trace)
105
+ if valid_moves == 0:
106
+ return 0.0
107
+ return self.backtrack_count / valid_moves
108
+
109
+ @computed_field
110
+ @property
111
+ def progress_velocity(self) -> float:
112
+ """Average progress per valid move (cells solved per step).
113
+
114
+ Measures how much closer to the solution each move gets.
115
+ 1.0 = every move reduces remaining by exactly 1. Lower = backtracks/plateaus.
116
+ Returns 0.0 if insufficient data.
117
+ """
118
+ trace = self.solver_distance_trace
119
+ if len(trace) < 2:
120
+ return 0.0
121
+ total_progress = trace[0] - trace[-1]
122
+ steps = len(trace) - 1
123
+ if steps == 0:
124
+ return 0.0
125
+ return total_progress / steps
126
+
127
+ @computed_field
128
+ @property
129
+ def progress_steadiness(self) -> float:
130
+ """Measure of how monotonically progress decreased (0.0 to 1.0).
131
+
132
+ 1.0 = perfectly monotonic progress (every move reduced remaining count).
133
+ 0.0 = no monotonic progress at all.
134
+ """
135
+ trace = self.solver_distance_trace
136
+ if len(trace) < 2:
137
+ return 1.0
138
+ monotonic_steps = sum(1 for i in range(1, len(trace)) if trace[i] < trace[i - 1])
139
+ return monotonic_steps / (len(trace) - 1)
140
+
141
+ @computed_field
142
+ @property
143
+ def avg_error_streak(self) -> float:
144
+ """Average length of consecutive error streaks.
145
+
146
+ Returns 0.0 if no error streaks occurred.
147
+ """
148
+ if not self.error_streaks:
149
+ return 0.0
150
+ return sum(self.error_streaks) / len(self.error_streaks)
151
+
152
+ def to_dict(self) -> dict[str, Any]:
153
+ """Convert to flat dictionary for reporting."""
154
+ return {
155
+ "backtrack_count": self.backtrack_count,
156
+ "backtrack_rate": round(self.backtrack_rate, 3),
157
+ "reasoning_overhead": round(self.reasoning_overhead, 3),
158
+ "progress_velocity": round(self.progress_velocity, 3),
159
+ "progress_steadiness": round(self.progress_steadiness, 3),
160
+ "error_streak_max": self.error_streak_max,
161
+ "avg_error_streak": round(self.avg_error_streak, 3),
162
+ "total_actions": self.total_actions,
163
+ "optimal_path_length": self.optimal_path_length,
164
+ }
165
+
166
+
41
167
  class EpisodeResult(BaseModel):
42
168
  """Complete result of a single puzzle episode with normalized metrics.
43
169
 
@@ -91,6 +217,12 @@ class EpisodeResult(BaseModel):
91
217
  description="Complete move history for detailed analysis",
92
218
  )
93
219
 
220
+ # Reasoning depth metrics
221
+ reasoning_metrics: ReasoningMetrics | None = Field(
222
+ default=None,
223
+ description="Detailed reasoning depth metrics (backtracks, progress, error patterns)",
224
+ )
225
+
94
226
  # Computed normalized metrics
95
227
  @computed_field
96
228
  @property
@@ -154,7 +286,7 @@ class EpisodeResult(BaseModel):
154
286
 
155
287
  def to_summary_dict(self) -> dict[str, Any]:
156
288
  """One-line episode summary for logging/streaming."""
157
- return {
289
+ d: dict[str, Any] = {
158
290
  "game": self.game,
159
291
  "seed": self.seed,
160
292
  "difficulty": self.difficulty.value,
@@ -165,6 +297,9 @@ class EpisodeResult(BaseModel):
165
297
  "efficiency": round(self.efficiency_score, 3),
166
298
  "time_ms": self.wall_time_ms,
167
299
  }
300
+ if self.reasoning_metrics is not None:
301
+ d["reasoning"] = self.reasoning_metrics.to_dict()
302
+ return d
168
303
 
169
304
  def to_jsonl(self) -> str:
170
305
  """Single-line JSON for streaming output."""
@@ -217,6 +352,35 @@ class EvaluationSummary(BaseModel):
217
352
  return 0.0
218
353
  return sum(e.wall_time_ms for e in self.episodes) / len(self.episodes)
219
354
 
355
+ @computed_field
356
+ @property
357
+ def avg_backtrack_rate(self) -> float:
358
+ """Average backtrack rate across episodes with reasoning metrics."""
359
+ with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
360
+ if not with_metrics:
361
+ return 0.0
362
+ return sum(e.reasoning_metrics.backtrack_rate for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
363
+
364
+ @computed_field
365
+ @property
366
+ def avg_reasoning_overhead(self) -> float:
367
+ """Average reasoning overhead across episodes with reasoning metrics."""
368
+ with_metrics = [
369
+ e for e in self.episodes if e.reasoning_metrics is not None and e.reasoning_metrics.reasoning_overhead > 0
370
+ ]
371
+ if not with_metrics:
372
+ return 0.0
373
+ return sum(e.reasoning_metrics.reasoning_overhead for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
374
+
375
+ @computed_field
376
+ @property
377
+ def avg_progress_steadiness(self) -> float:
378
+ """Average progress steadiness across episodes with reasoning metrics."""
379
+ with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
380
+ if not with_metrics:
381
+ return 0.0
382
+ return sum(e.reasoning_metrics.progress_steadiness for e in with_metrics) / len(with_metrics) # type: ignore[union-attr]
383
+
220
384
 
221
385
  class TraceEvent(BaseModel):
222
386
  """A single event in an episode trace for JSONL logging."""
chuk_puzzles_gym/server.py CHANGED
@@ -63,6 +63,9 @@ class ArcadeHandler(TelnetHandler):
63
63
  if not self.current_game:
64
64
  return
65
65
 
66
+ # Get final reasoning metrics
67
+ reasoning = self.current_game.get_reasoning_metrics().to_dict()
68
+
66
69
  if self.output_mode == OutputMode.JSON:
67
70
  await self.send_json_response(
68
71
  type="complete",
@@ -72,17 +75,27 @@ class ArcadeHandler(TelnetHandler):
72
75
  invalid_moves=self.current_game.invalid_moves,
73
76
  hints_used=self.current_game.hints_used,
74
77
  optimal_steps=self.current_game.optimal_steps,
78
+ reasoning_metrics=reasoning,
75
79
  )
76
80
  elif self.output_mode == OutputMode.STRICT:
77
81
  await self.send_line(
78
82
  f"COMPLETE:{self.current_game.moves_made}:{self.current_game.invalid_moves}:"
79
- f"{self.current_game.hints_used}"
83
+ f"{self.current_game.hints_used}:"
84
+ f"BT={reasoning['backtrack_count']}:"
85
+ f"OH={reasoning['reasoning_overhead']:.2f}:"
86
+ f"ST={reasoning['progress_steadiness']:.2f}"
80
87
  )
81
88
  else:
82
89
  await self.send_line("\n" + "=" * 50)
83
90
  await self.send_line("CONGRATULATIONS! YOU SOLVED IT!")
84
91
  await self.send_line("=" * 50)
85
92
  await self.send_line(self.current_game.get_stats())
93
+ await self.send_line("")
94
+ await self.send_line("Reasoning Depth:")
95
+ await self.send_line(f" Backtrack rate: {reasoning['backtrack_rate']:.0%}")
96
+ await self.send_line(f" Progress steadiness: {reasoning['progress_steadiness']:.0%}")
97
+ await self.send_line(f" Reasoning overhead: {reasoning['reasoning_overhead']:.1f}x optimal")
98
+ await self.send_line(f" Error streak max: {reasoning['error_streak_max']}")
86
99
  await self.send_line("\nType 'menu' to play another game.")
87
100
  await self.send_line("=" * 50 + "\n")
88
101
 
@@ -109,6 +122,9 @@ class ArcadeHandler(TelnetHandler):
109
122
  "constraint_density": profile.constraint_density,
110
123
  }
111
124
 
125
+ # Reasoning depth metrics
126
+ reasoning = self.current_game.get_reasoning_metrics().to_dict()
127
+
112
128
  return {
113
129
  "game": self.current_game.name,
114
130
  "difficulty": self.current_game.difficulty.value,
@@ -120,6 +136,7 @@ class ArcadeHandler(TelnetHandler):
120
136
  "optimal_steps": self.current_game.optimal_steps,
121
137
  "is_complete": self.current_game.is_complete(),
122
138
  "difficulty_profile": profile_dict,
139
+ "reasoning_metrics": reasoning,
123
140
  "grid": grid,
124
141
  }
125
142
 
@@ -435,9 +452,10 @@ class ArcadeHandler(TelnetHandler):
435
452
  return
436
453
 
437
454
  if cmd_enum == GameCommand.STATS:
438
- # Show detailed stats including difficulty profile
455
+ # Show detailed stats including difficulty profile and reasoning metrics
439
456
  profile = self.current_game.difficulty_profile
440
457
  optimal = self.current_game.optimal_steps
458
+ reasoning = self.current_game.get_reasoning_metrics().to_dict()
441
459
 
442
460
  if self.output_mode == OutputMode.JSON:
443
461
  await self.send_json_response(
@@ -455,11 +473,15 @@ class ArcadeHandler(TelnetHandler):
455
473
  "state_observability": profile.state_observability,
456
474
  "constraint_density": profile.constraint_density,
457
475
  },
476
+ reasoning_metrics=reasoning,
458
477
  )
459
478
  elif self.output_mode == OutputMode.STRICT:
460
479
  await self.send_line(
461
480
  f"STATS:{self.current_game.moves_made}:{self.current_game.invalid_moves}:"
462
- f"{self.current_game.hints_used}:{optimal or 0}"
481
+ f"{self.current_game.hints_used}:{optimal or 0}:"
482
+ f"BT={reasoning['backtrack_count']}:"
483
+ f"OH={reasoning['reasoning_overhead']:.2f}:"
484
+ f"ST={reasoning['progress_steadiness']:.2f}"
463
485
  )
464
486
  else:
465
487
  await self.send_line("")
@@ -482,6 +504,15 @@ class ArcadeHandler(TelnetHandler):
482
504
  await self.send_line(f" Optimal steps: {optimal}")
483
505
  await self.send_line(f" Current efficiency: {efficiency:.1%}")
484
506
  await self.send_line("")
507
+ await self.send_line("Reasoning Depth:")
508
+ await self.send_line(f" Backtrack count: {reasoning['backtrack_count']}")
509
+ await self.send_line(f" Backtrack rate: {reasoning['backtrack_rate']:.0%}")
510
+ await self.send_line(f" Progress velocity: {reasoning['progress_velocity']:.2f} cells/step")
511
+ await self.send_line(f" Progress steadiness: {reasoning['progress_steadiness']:.0%}")
512
+ await self.send_line(f" Reasoning overhead: {reasoning['reasoning_overhead']:.1f}x optimal")
513
+ await self.send_line(f" Error streak max: {reasoning['error_streak_max']}")
514
+ await self.send_line(f" Total actions: {reasoning['total_actions']}")
515
+ await self.send_line("")
485
516
  await self.send_line("Difficulty Profile:")
486
517
  await self.send_line(f" Logic depth: {profile.logic_depth}")
487
518
  await self.send_line(f" Branching factor: {profile.branching_factor:.1f}")
chuk_puzzles_gym-0.10.1.dist-info/METADATA → chuk_puzzles_gym-0.10.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chuk-puzzles-gym
3
- Version: 0.10.1
3
+ Version: 0.10.2
4
4
  Summary: Multi-game puzzle gym for LLM training and benchmarking - 30 constraint puzzles with synthetic data generation
5
5
  Author: Chris Hay
6
6
  License: MIT
@@ -93,10 +93,17 @@ Once connected, type `help` to see available games, or `sudoku easy` to start pl
93
93
  - Enable with `mode agent` command
94
94
  - Machine-parseable grid format with clear start/end markers
95
95
  - Compact output optimized for LLM tool integration
96
+ - **Reasoning Depth Metrics** - Measure *how* agents reason, not just if they succeed
97
+ - Backtrack detection (did the agent revise previous placements?)
98
+ - Progress steadiness (monotonic advance toward solution?)
99
+ - Error streak analysis (isolated mistakes vs. clustered confusion?)
100
+ - Reasoning overhead (wasted work relative to optimal path)
101
+ - Solver distance traces (remaining work after each valid move)
102
+ - Available in all paths: Gym env, eval harness, and server (telnet/WebSocket)
96
103
  - **Evaluation Harness** (`chuk-puzzles-eval`) - Built-in benchmarking CLI
97
104
  - Batch evaluation with configurable episodes
98
105
  - Multiple output formats (JSON, CSV, Markdown)
99
- - Metrics: moves, invalid moves, hints, solve time
106
+ - Metrics: moves, invalid moves, hints, solve time, reasoning depth
100
107
  - Reproducible with deterministic seeds
101
108
  - **Dataset Export** (`chuk-puzzles-export`) - Synthetic data generation for LLM training
102
109
  - JSONL output with complete problem definitions and solutions
@@ -500,6 +507,7 @@ games = PuzzleEnv.available_games()
500
507
 
501
508
  - **All 30 games** accessible through unified API
502
509
  - **Configurable rewards** for correct moves, invalid attempts, completion bonuses
510
+ - **Reasoning depth metrics** tracking backtracks, progress steadiness, error patterns
503
511
  - **Hint system** with optional budget limits
504
512
  - **Solver-free mode** for pure reasoning benchmarks
505
513
  - **Efficiency scoring** based on optimal step counts
@@ -515,8 +523,25 @@ obs = {
515
523
  "moves": 5,
516
524
  "invalid_moves": 1,
517
525
  "hints_used": 2,
526
+ "hints_remaining": 98,
518
527
  "is_complete": False,
519
- "grid": [[4, 0, 8, ...], ...] # Game-specific state
528
+ "grid": [[4, 0, 8, ...], ...], # Game-specific state
529
+ "render": " | 1 2 3 | ...", # ASCII grid
530
+ }
531
+
532
+ # Info dict includes reasoning metrics and difficulty profile
533
+ info = {
534
+ "optimal_steps": 45,
535
+ "difficulty_profile": {"logic_depth": 2, "branching_factor": 2.0, ...},
536
+ "reasoning_metrics": {
537
+ "backtrack_count": 0,
538
+ "backtrack_rate": 0.0,
539
+ "progress_velocity": 1.0,
540
+ "progress_steadiness": 1.0,
541
+ "reasoning_overhead": 1.0,
542
+ "error_streak_max": 0,
543
+ "solver_distance_trace": [44, 43, 42, ...],
544
+ },
520
545
  }
521
546
  ```
522
547
 
@@ -546,6 +571,89 @@ config = SolverConfig(hint_budget=5, hint_penalty=0.1)
546
571
  env = PuzzleEnv("sudoku", solver_config=config)
547
572
  ```
548
573
 
574
+ ## Reasoning Depth Metrics
575
+
576
+ Beyond binary success/failure, the system measures **how** an agent reasons through puzzles. These metrics are available in all interaction paths: the Gym environment, the evaluation harness, and the telnet/WebSocket server.
577
+
578
+ ### Metrics
579
+
580
+ | Metric | Description | Perfect Score |
581
+ |--------|-------------|---------------|
582
+ | `backtrack_count` | Times the agent revised a previous placement | 0 |
583
+ | `backtrack_rate` | Fraction of valid moves that were backtracks | 0% |
584
+ | `progress_velocity` | Average cells solved per step | 1.0 |
585
+ | `progress_steadiness` | How monotonically remaining work decreases (1.0 = never stalls) | 100% |
586
+ | `reasoning_overhead` | Total actions / optimal path length (1.0 = no waste) | 1.0x |
587
+ | `error_streak_max` | Longest run of consecutive invalid moves | 0 |
588
+ | `avg_error_streak` | Average length of error bursts | 0.0 |
589
+ | `solver_distance_trace` | Remaining positions after each valid move | Monotonically decreasing |
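
All of the derived fields are computed from the raw tracking data listed above. A minimal sketch of those derivations, constructing a `ReasoningMetrics` directly with made-up episode numbers (the import path follows the `chuk_puzzles_gym.models` export added in this release):

```python
from chuk_puzzles_gym.models import ReasoningMetrics

# Hypothetical episode: 6 valid moves, 2 consecutive invalid moves, one revised cell.
metrics = ReasoningMetrics(
    backtrack_count=1,                         # one previously filled position was overwritten
    solver_distance_trace=[4, 3, 3, 2, 1, 0],  # cells remaining after each valid move
    error_streak_max=2,
    error_streaks=[2],
    total_actions=8,                           # 6 valid + 2 invalid
    optimal_path_length=5,
)

print(metrics.backtrack_rate)       # 1 backtrack / 6 valid moves ~= 0.167
print(metrics.reasoning_overhead)   # 8 actions / 5 optimal steps = 1.6
print(metrics.progress_velocity)    # 4 cells cleared over 5 transitions = 0.8
print(metrics.progress_steadiness)  # 4 of 5 transitions decreased = 0.8
print(metrics.to_dict())            # the flat dict used in reports and server responses
```

The same `to_dict()` payload is what surfaces as `reasoning_metrics` in the Gym `info` dict, the evaluation reports, and the server output shown below.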
590
+
591
+ ### Usage in Gym Environment
592
+
593
+ ```python
594
+ from chuk_puzzles_gym.gym_env import PuzzleEnv
595
+
596
+ env = PuzzleEnv("sudoku", difficulty="easy", seed=42)
597
+ obs, info = await env.reset()
598
+
599
+ # Reasoning metrics available in info after reset
600
+ print(info["reasoning_metrics"])
601
+
602
+ # ... agent plays ...
603
+ obs, reward, terminated, truncated, info = await env.step("place 1 1 5")
604
+
605
+ # On episode end, info includes full reasoning metrics
606
+ if terminated:
607
+ metrics = info["reasoning_metrics"]
608
+ print(f"Backtrack rate: {metrics['backtrack_rate']:.0%}")
609
+ print(f"Overhead: {metrics['reasoning_overhead']:.1f}x")
610
+ print(f"Steadiness: {metrics['progress_steadiness']:.0%}")
611
+ ```
612
+
613
+ ### Usage in Server (Telnet/WebSocket)
614
+
615
+ Reasoning metrics are included automatically in server output:
616
+
617
+ - **JSON mode**: `reasoning_metrics` dict in every state response and completion message
618
+ - **STRICT mode**: `BT=`, `OH=`, `ST=` fields appended to STATS and COMPLETE messages
619
+ - **Normal mode**: "Reasoning Depth" section shown on completion and in `stats` command
620
+
621
+ ```
622
+ > mode json
623
+ > place 1 1 5
624
+ {"type":"result","success":true,...,"state":{...,"reasoning_metrics":{"backtrack_count":0,...}}}
625
+
626
+ > stats
627
+ {"type":"stats",...,"reasoning_metrics":{"backtrack_count":0,"backtrack_rate":0.0,...}}
628
+ ```
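
In STRICT mode the same metrics ride along as extra colon-delimited fields on the `STATS` and `COMPLETE` lines (`BT=` backtrack count, `OH=` reasoning overhead, `ST=` progress steadiness). The numbers below are placeholders; the field layout follows the handler changes in this diff, with `STATS` also carrying the optimal step count before the reasoning fields:

```
STATS:12:1:0:33:BT=0:OH=1.18:ST=0.92
COMPLETE:45:1:0:BT=1:OH=1.04:ST=0.96
```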
629
+
630
+ ### Usage in Evaluation Harness
631
+
632
+ ```bash
633
+ # Reasoning metrics included in all output formats
634
+ chuk-puzzles-eval sudoku -d easy -n 10 -o json
635
+ ```
636
+
637
+ ```python
638
+ from chuk_puzzles_gym.eval import evaluate_game
639
+
640
+ report = await evaluate_game("sudoku", difficulty="easy", episodes=10)
641
+ report.print_summary() # Includes "Reasoning Depth" section
642
+
643
+ # Aggregate metrics
644
+ print(f"Avg backtrack rate: {report.avg_backtrack_rate:.0%}")
645
+ print(f"Avg overhead: {report.avg_reasoning_overhead:.1f}x")
646
+ print(f"Avg steadiness: {report.avg_progress_steadiness:.0%}")
647
+ ```
648
+
649
+ ### What the Metrics Reveal
650
+
651
+ A **perfect solver** shows: 0 backtracks, 1.0x overhead, 100% steadiness, 1.0 velocity.
652
+
653
+ A **struggling agent** shows: high backtrack rate (revising decisions), error streaks (clustered confusion), low steadiness (stalling progress), and high overhead (wasted work).
654
+
655
+ These patterns are visible even when two agents both eventually solve a puzzle — the metrics expose the **quality of the reasoning path**, not just the outcome.
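
As a concrete (hypothetical) illustration, two agents that both finish the same 5-step puzzle can look very different through this lens; the traces below are invented, but the computed fields follow the `ReasoningMetrics` definitions in this release:

```python
from chuk_puzzles_gym.models import ReasoningMetrics

clean = ReasoningMetrics(
    solver_distance_trace=[4, 3, 2, 1, 0],  # strictly decreasing: no stalls
    total_actions=5,
    optimal_path_length=5,
)
messy = ReasoningMetrics(
    backtrack_count=3,
    solver_distance_trace=[4, 4, 3, 3, 2, 1, 1, 0],  # plateaus where earlier placements were overwritten
    error_streaks=[3, 1],
    error_streak_max=3,
    total_actions=12,                        # 8 valid + 4 invalid
    optimal_path_length=5,
)

print(clean.progress_steadiness, clean.reasoning_overhead)  # 1.0 and 1.0
print(messy.progress_steadiness, messy.reasoning_overhead)  # ~0.57 and 2.4
print(messy.backtrack_rate)                                 # 3 of 8 valid moves = 0.375
```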
656
+
549
657
  ## Evaluation Harness
550
658
 
551
659
  The project includes a built-in **evaluation harness** for benchmarking puzzle-solving agents:
@@ -604,6 +712,12 @@ Avg Time: 12ms
604
712
  | `hints_used` | Number of hints requested |
605
713
  | `wall_time_ms` | Time to solve in milliseconds |
606
714
  | `seed` | Puzzle seed for reproducibility |
715
+ | `backtrack_count` | Times agent revised a previous placement |
716
+ | `backtrack_rate` | Fraction of valid moves that were backtracks |
717
+ | `progress_steadiness` | How monotonically progress advances (1.0 = perfect) |
718
+ | `reasoning_overhead` | Total actions / optimal path (1.0 = no waste) |
719
+ | `error_streak_max` | Longest run of consecutive invalid moves |
720
+ | `progress_velocity` | Average cells solved per step |
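
In the CSV output these appear as extra columns appended after the base fields. The header row, as the harness builds it in this release when reasoning metrics are present:

```
game,difficulty,seed,status,steps_taken,invalid_actions,hints_used,efficiency,wall_time_ms,backtrack_count,backtrack_rate,reasoning_overhead,progress_steadiness,error_streak_max
```

Episodes without reasoning metrics leave the extra columns blank.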
607
721
 
608
722
  ## Dataset Export
609
723
 
@@ -1194,12 +1308,13 @@ chuk-puzzles-gym/
1194
1308
  │ │ ├── base.py # GridPosition, MoveResult
1195
1309
  │ │ ├── config.py # Base GameConfig
1196
1310
  │ │ ├── enums.py # DifficultyLevel, GameCommand, etc.
1311
+ │ │ ├── evaluation.py # ReasoningMetrics, EpisodeResult, EvaluationSummary
1197
1312
  │ │ └── games.py # Game-specific models (Cage, Task, etc.)
1198
1313
  │ └── games/ # Self-contained game modules
1199
1314
  │ ├── __init__.py # AVAILABLE_GAMES registry
1200
1315
  │ ├── _base/ # Base classes
1201
1316
  │ │ ├── __init__.py
1202
- │ │ ├── game.py # PuzzleGame ABC
1317
+ │ │ ├── game.py # PuzzleGame ABC + ReasoningTracker
1203
1318
  │ │ └── commands.py # GameCommandHandler ABC
1204
1319
  │ ├── sudoku/ # Example game module
1205
1320
  │ │ ├── __init__.py # Exports SudokuGame
@@ -1226,6 +1341,7 @@ chuk-puzzles-gym/
1226
1341
  │ ├── example_graph_coloring.py # Graph Coloring game logic demo
1227
1342
  │ ├── example_cryptarithmetic.py# Cryptarithmetic game logic demo
1228
1343
  │ ├── example_rush_hour.py # Rush Hour game logic demo
1344
+ │ ├── example_reasoning_metrics.py # Reasoning depth metrics demo
1229
1345
  │ └── README.md # Example usage guide
1230
1346
  ├── .github/workflows/ # CI/CD workflows
1231
1347
  ├── pyproject.toml # Modern Python project config
@@ -1465,9 +1581,10 @@ See [ROADMAP.md](ROADMAP.md) for the full development roadmap.
1465
1581
  ### Highlights
1466
1582
 
1467
1583
  **Benchmarking & Metrics**
1468
- - Puzzle complexity metrics (constraint count, variable count, branching factor)
1469
- - Episode model for tracking game sessions
1470
- - Trace logging for offline analysis
1584
+ - ~~Puzzle complexity metrics~~ (implemented: constraint count, variable count, branching factor)
1585
+ - ~~Episode model for tracking game sessions~~ (implemented: EpisodeResult with ReasoningMetrics)
1586
+ - ~~Reasoning depth metrics~~ (implemented: backtrack detection, progress steadiness, error patterns)
1587
+ - ~~Trace logging for offline analysis~~ (implemented: solver distance traces in all output paths)
1471
1588
 
1472
1589
  **Agent Evaluation Tools**
1473
1590
  - Batch evaluation harness CLI
chuk_puzzles_gym-0.10.1.dist-info/RECORD → chuk_puzzles_gym-0.10.2.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
1
1
  chuk_puzzles_gym/__init__.py,sha256=zh2sc6QFKrtAmMLee7vlHgXuOBoB5CjSldlKFjZTVVE,521
2
2
  chuk_puzzles_gym/constants.py,sha256=58pKdvwoaB4PF1AK4b7mLNf_Y_YFyFassd1hYH1IUNE,280
3
- chuk_puzzles_gym/eval.py,sha256=jWjfQ4OaBNY2vDwcRxw1-MC27VorLNUMfRW-lQpK3Rs,26415
4
- chuk_puzzles_gym/gym_env.py,sha256=qoQZFz2Dnbl3QjTsDNHAxAx1qomU8paXVlH-SDcwlZI,17288
5
- chuk_puzzles_gym/server.py,sha256=QnG48mXd8AKDVFUULIwLWidqDvcsCkayxbvo7h_EKBg,45947
3
+ chuk_puzzles_gym/eval.py,sha256=-ku_pshSMG5RIu-p4MdS9ju4kduyKjvTn8Q99y_UO_E,31830
4
+ chuk_puzzles_gym/gym_env.py,sha256=V2Eg1CFXKceR6vWTvAzvfanXvZL24STbw3YP8-cjkk0,18074
5
+ chuk_puzzles_gym/server.py,sha256=SWfuBO4wtm_4Ri8l5hbQmvMF7ZN4Q42Wt66neFp5-nQ,48055
6
6
  chuk_puzzles_gym/export/__init__.py,sha256=TTXBRR5CBBCL04r1iXMzxib9oOIDTC4npxy2_L1xc2A,366
7
- chuk_puzzles_gym/export/dataset.py,sha256=dZMz9m4JwpZZSigvaJjIpGKIoxUWB01gXoyNCZ4o17o,10998
7
+ chuk_puzzles_gym/export/dataset.py,sha256=bza7iCfp4POz0gCcoSRF_hTRZmuAD-59DyrrHiqo4ac,11335
8
8
  chuk_puzzles_gym/games/__init__.py,sha256=zByuxje5uVWQ4wBoGHUooHkAg5cgCljrCCXkyOLxLzo,3403
9
9
  chuk_puzzles_gym/games/_base/__init__.py,sha256=oNjoMvOVDb010ooyGxAfXBrOqmw1BAGavmaxf44tmz0,188
10
10
  chuk_puzzles_gym/games/_base/commands.py,sha256=tY0kxk08D8nPr_C_awo8qDUhkL6EHA59KnWiLlYnloY,2381
11
- chuk_puzzles_gym/games/_base/game.py,sha256=-YPJOgWsb4YVz8tS3cXJYd-y-1Tyx7eh8vs3tZEXcEA,11240
11
+ chuk_puzzles_gym/games/_base/game.py,sha256=Jwfjj4qazgaWLQLNTghfMuydy-D3KrOuUmpCM9kpjlU,15711
12
12
  chuk_puzzles_gym/games/binary/__init__.py,sha256=Pphgj0kcvHUgkM0Mq89GsWPt-Bg6DobDLi7cqliOywk,156
13
13
  chuk_puzzles_gym/games/binary/config.py,sha256=Iw8Wax1856aqaz1KvDC69Qou6z8gxIWr5rSAI0MGnWg,812
14
14
  chuk_puzzles_gym/games/binary/game.py,sha256=lRBweQIdzyRZm_jMPItZ1VAzAcsEEbxvGqjGwAlTTy0,16359
@@ -118,17 +118,17 @@ chuk_puzzles_gym/games/sudoku/game.py,sha256=35vB5x-KIs5z2b-CDV-dq5kifmVkoEkbLOx
118
118
  chuk_puzzles_gym/games/tents/__init__.py,sha256=iVxsZg7Juz3iHXTK8mfJZniFcMNnmAd2h2RjxR2TH40,133
119
119
  chuk_puzzles_gym/games/tents/config.py,sha256=gSi5epG5va8-a4ZQv5ekcFDkWQSYOSheX2j4FIs_I8Q,914
120
120
  chuk_puzzles_gym/games/tents/game.py,sha256=JGPLYvIosCwjJYhi0FCtA3YUFsgQsD9L_BEArHSOPFM,15802
121
- chuk_puzzles_gym/models/__init__.py,sha256=dZzLWsyKE993o8HFfFkxTR7XjDwYK56rB-5clwW4zPg,930
121
+ chuk_puzzles_gym/models/__init__.py,sha256=6SQn3zEcalTl-9VqKbSwvmWaYkRMuGKUkfiC25c9-h8,976
122
122
  chuk_puzzles_gym/models/base.py,sha256=L7Zug9jUXJCOhD3wKJp0ppJZNTgroDQwdYMjvAaVVqc,1156
123
123
  chuk_puzzles_gym/models/config.py,sha256=12UkPlEEFzN1k9ZfJClpVqkp7E11MWriZVAH2RkfEM4,301
124
124
  chuk_puzzles_gym/models/enums.py,sha256=xmHv0OK2zKcxpfhJP3huuXhDnnX0BDLCwWfpR9ZuraQ,2342
125
- chuk_puzzles_gym/models/evaluation.py,sha256=EwFeecWtQ-wyezPE1dhpKDUH-BTdF7cDJ_W99JLoMUM,16070
125
+ chuk_puzzles_gym/models/evaluation.py,sha256=b2ldWPih-lo2jy59pWincjv9qZuF6PsZd42LPZsZzLc,22162
126
126
  chuk_puzzles_gym/models/games.py,sha256=rnEW_Sl9xuZtvlBXBZfab34HrIhtUEiBdUSs_nvh10o,442
127
127
  chuk_puzzles_gym/trace/__init__.py,sha256=8JHaHxbTDhT9kv4e2e5Px4dCWuXY49OXmvzkMS4nKfw,273
128
128
  chuk_puzzles_gym/trace/generator.py,sha256=4pks0d_asoDE15QjM2VuzgFWTV1fZke_gHH2lVF8KVQ,34058
129
129
  chuk_puzzles_gym/utils/__init__.py,sha256=1AKPfRjT9YlBxxcA7qdKcvKBXdHJzfGtUWansrb_2VE,149
130
- chuk_puzzles_gym-0.10.1.dist-info/METADATA,sha256=HD-oYiDi5OTNMOjtvxQkB9aBuOBUARAy1RcXcjf4T2I,49935
131
- chuk_puzzles_gym-0.10.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
132
- chuk_puzzles_gym-0.10.1.dist-info/entry_points.txt,sha256=tJGHiH8wjkBev2SPNuXOLFkaXE76sW9ZFIMQw4pUj5E,181
133
- chuk_puzzles_gym-0.10.1.dist-info/top_level.txt,sha256=H3z9wKGl7CV1BPlO6t5lEtok6WW9rwGr5C1Dr3Kqx28,17
134
- chuk_puzzles_gym-0.10.1.dist-info/RECORD,,
130
+ chuk_puzzles_gym-0.10.2.dist-info/METADATA,sha256=adaIAGmTJQj7wES0bqZEETQ5pbQQJ9OrswxhNZayits,55140
131
+ chuk_puzzles_gym-0.10.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
132
+ chuk_puzzles_gym-0.10.2.dist-info/entry_points.txt,sha256=tJGHiH8wjkBev2SPNuXOLFkaXE76sW9ZFIMQw4pUj5E,181
133
+ chuk_puzzles_gym-0.10.2.dist-info/top_level.txt,sha256=H3z9wKGl7CV1BPlO6t5lEtok6WW9rwGr5C1Dr3Kqx28,17
134
+ chuk_puzzles_gym-0.10.2.dist-info/RECORD,,