chuk-puzzles-gym 0.10__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chuk_puzzles_gym/eval.py +168 -46
- chuk_puzzles_gym/export/dataset.py +7 -1
- chuk_puzzles_gym/games/_base/game.py +123 -0
- chuk_puzzles_gym/games/binary/game.py +2 -0
- chuk_puzzles_gym/games/bridges/game.py +2 -0
- chuk_puzzles_gym/games/cryptarithmetic/game.py +5 -0
- chuk_puzzles_gym/games/einstein/game.py +2 -0
- chuk_puzzles_gym/games/fillomino/game.py +2 -0
- chuk_puzzles_gym/games/futoshiki/game.py +2 -0
- chuk_puzzles_gym/games/graph_coloring/commands.py +20 -3
- chuk_puzzles_gym/games/graph_coloring/game.py +8 -1
- chuk_puzzles_gym/games/hidato/game.py +2 -0
- chuk_puzzles_gym/games/hitori/game.py +2 -0
- chuk_puzzles_gym/games/kakuro/game.py +2 -0
- chuk_puzzles_gym/games/kenken/game.py +2 -0
- chuk_puzzles_gym/games/killer_sudoku/game.py +2 -0
- chuk_puzzles_gym/games/knapsack/game.py +2 -0
- chuk_puzzles_gym/games/lights_out/game.py +2 -0
- chuk_puzzles_gym/games/logic_grid/game.py +2 -0
- chuk_puzzles_gym/games/mastermind/game.py +2 -0
- chuk_puzzles_gym/games/minesweeper/game.py +2 -0
- chuk_puzzles_gym/games/nonogram/game.py +2 -0
- chuk_puzzles_gym/games/nqueens/game.py +5 -0
- chuk_puzzles_gym/games/numberlink/game.py +6 -0
- chuk_puzzles_gym/games/nurikabe/game.py +2 -0
- chuk_puzzles_gym/games/rush_hour/game.py +4 -0
- chuk_puzzles_gym/games/scheduler/game.py +2 -0
- chuk_puzzles_gym/games/shikaku/game.py +2 -0
- chuk_puzzles_gym/games/skyscrapers/game.py +5 -0
- chuk_puzzles_gym/games/slitherlink/game.py +2 -0
- chuk_puzzles_gym/games/sokoban/game.py +2 -0
- chuk_puzzles_gym/games/star_battle/game.py +2 -0
- chuk_puzzles_gym/games/sudoku/game.py +2 -0
- chuk_puzzles_gym/games/tents/game.py +2 -0
- chuk_puzzles_gym/gym_env.py +21 -5
- chuk_puzzles_gym/models/__init__.py +2 -0
- chuk_puzzles_gym/models/evaluation.py +165 -1
- chuk_puzzles_gym/server.py +51 -72
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/METADATA +124 -7
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/RECORD +43 -43
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/WHEEL +0 -0
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/entry_points.txt +0 -0
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/top_level.txt +0 -0
chuk_puzzles_gym/games/lights_out/game.py
CHANGED

@@ -173,6 +173,8 @@ class LightsOutGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None if puzzle is complete
         """
+        if not self.can_use_hint():
+            return None
         # Find a cell in the solution that should be pressed
         for row in range(self.size):
             for col in range(self.size):
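The two-line `can_use_hint()` guard above is the same change applied to every `get_hint()` hunk that follows; the gate itself lives in `chuk_puzzles_gym/games/_base/game.py` (+123 lines in this release, not shown in this excerpt). A minimal sketch of how such a budget gate typically works — the class and attribute names below are illustrative assumptions, not the actual base-class API:

```python
# Illustrative sketch only; the real logic lives in games/_base/game.py.
class HintBudgetSketch:
    def __init__(self, hint_budget: int | None = 3) -> None:
        self.hint_budget = hint_budget  # None means "no limit" in this sketch
        self.hints_used = 0

    def can_use_hint(self) -> bool:
        """True while the configured hint budget is not exhausted."""
        return self.hint_budget is None or self.hints_used < self.hint_budget

    def use_hint(self) -> None:
        """Record one hint consumption."""
        self.hints_used += 1
```

With a gate like this, `get_hint()` simply returns `None` once the budget is spent instead of continuing to reveal solution cells.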
chuk_puzzles_gym/games/logic_grid/game.py
CHANGED

@@ -235,6 +235,8 @@ class LogicGridGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None if puzzle is complete
         """
+        if not self.can_use_hint():
+            return None
         # Find a connection that hasn't been marked
         for person in self.categories.person:
             attrs = self.solution[person]
chuk_puzzles_gym/games/nonogram/game.py
CHANGED

@@ -192,6 +192,8 @@ class NonogramGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None if puzzle is complete
         """
+        if not self.can_use_hint():
+            return None
         unknown_cells = [(r, c) for r in range(self.size) for c in range(self.size) if self.grid[r][c] == -1]
         if not unknown_cells:
             return None
chuk_puzzles_gym/games/nqueens/game.py
CHANGED

@@ -296,6 +296,11 @@ class NQueensGame(PuzzleGame):

         return "\n".join(lines)

+    def get_stats(self) -> str:
+        """Get current game statistics."""
+        placed = sum(1 for r in range(self.size) for c in range(self.size) if self.grid[r][c] == 1)
+        return f"Moves: {self.moves_made} | Queens: {placed}/{self.size} | Board: {self.size}x{self.size} | Seed: {self.seed}"
+
     def get_rules(self) -> str:
         return (
             f"N-QUEENS ({self.size}x{self.size})\n"
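Several games gain a one-line `get_stats()` in this release (N-Queens here, plus Numberlink, Rush Hour, and Skyscrapers below). For illustration, the string it produces looks like this — the values are invented:

```python
# Invented values, just to show the new get_stats() output format.
moves_made, placed, size, seed = 5, 3, 8, 42
print(f"Moves: {moves_made} | Queens: {placed}/{size} | Board: {size}x{size} | Seed: {seed}")
# Moves: 5 | Queens: 3/8 | Board: 8x8 | Seed: 42
```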
chuk_puzzles_gym/games/numberlink/game.py
CHANGED

@@ -317,6 +317,12 @@ class NumberlinkGame(PuzzleGame):

         return "\n".join(lines)

+    def get_stats(self) -> str:
+        """Get current game statistics."""
+        filled = sum(1 for r in range(self.size) for c in range(self.size) if self.grid[r][c] != 0)
+        total = self.size * self.size
+        return f"Moves: {self.moves_made} | Filled: {filled}/{total} | Pairs: {self.num_pairs} | Seed: {self.seed}"
+
     def get_rules(self) -> str:
         return (
             f"NUMBERLINK ({self.size}x{self.size}, {self.num_pairs} pairs)\n"
chuk_puzzles_gym/games/nurikabe/game.py
CHANGED

@@ -479,6 +479,8 @@ class NurikabeGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None
         """
+        if not self.can_use_hint():
+            return None
         # Find a cell that differs from solution
         for row in range(self.size):
             for col in range(self.size):
chuk_puzzles_gym/games/rush_hour/game.py
CHANGED

@@ -454,6 +454,10 @@ class RushHourGame(PuzzleGame):

         return "\n".join(lines)

+    def get_stats(self) -> str:
+        """Get current game statistics."""
+        return f"Moves: {self.moves_made} | Vehicles: {len(self.vehicles)} | Grid: {self.size}x{self.size} | Seed: {self.seed}"
+
     def get_rules(self) -> str:
         return (
             f"RUSH HOUR ({self.size}x{self.size})\n"
chuk_puzzles_gym/games/scheduler/game.py
CHANGED

@@ -316,6 +316,8 @@ class SchedulerGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None
         """
+        if not self.can_use_hint():
+            return None
         # Find an unscheduled task that's in the optimal solution
         for task_id in range(self.num_tasks):
             if task_id not in self.schedule and task_id in self.optimal_schedule:
chuk_puzzles_gym/games/shikaku/game.py
CHANGED

@@ -327,6 +327,8 @@ class ShikakuGame(PuzzleGame):

     async def get_hint(self) -> tuple[Any, str] | None:
         """Get a hint for the next move."""
+        if not self.can_use_hint():
+            return None
         # Find a rectangle from the solution that hasn't been placed yet
         solution_rects: dict[int, list[tuple[int, int]]] = {}
         for r in range(self.size):
chuk_puzzles_gym/games/skyscrapers/game.py
CHANGED

@@ -255,6 +255,11 @@ class SkyscrapersGame(PuzzleGame):

         return "\n".join(lines)

+    def get_stats(self) -> str:
+        """Get current game statistics."""
+        empty = sum(1 for r in range(self.size) for c in range(self.size) if self.grid[r][c] == 0)
+        return f"Moves: {self.moves_made} | Empty cells: {empty} | Grid: {self.size}x{self.size} | Seed: {self.seed}"
+
     def get_rules(self) -> str:
         return (
             f"SKYSCRAPERS ({self.size}x{self.size})\n"
chuk_puzzles_gym/games/slitherlink/game.py
CHANGED

@@ -272,6 +272,8 @@ class SlitherlinkGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None
         """
+        if not self.can_use_hint():
+            return None
         # Find an edge that's in the solution but not set by player
         for row in range(self.size + 1):
             for col in range(self.size):
chuk_puzzles_gym/games/star_battle/game.py
CHANGED

@@ -301,6 +301,8 @@ class StarBattleGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None if puzzle is complete
         """
+        if not self.can_use_hint():
+            return None
         # Find a star location from solution that hasn't been placed
         for r in range(self.size):
             for c in range(self.size):
chuk_puzzles_gym/games/sudoku/game.py
CHANGED

@@ -249,6 +249,8 @@ class SudokuGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None if puzzle is complete
         """
+        if not self.can_use_hint():
+            return None
         empty_cells = [(r, c) for r in range(9) for c in range(9) if self.grid[r][c] == 0]
         if not empty_cells:
             return None
chuk_puzzles_gym/games/tents/game.py
CHANGED

@@ -326,6 +326,8 @@ class TentsGame(PuzzleGame):
         Returns:
             Tuple of (hint_data, hint_message) or None if puzzle is complete
         """
+        if not self.can_use_hint():
+            return None
         # Find a tent location from solution that hasn't been placed
         for r in range(self.size):
             for c in range(self.size):
chuk_puzzles_gym/gym_env.py
CHANGED
@@ -197,6 +197,7 @@ class PuzzleEnv:
             result = await self._execute_action(cmd, args)
         except Exception as e:
             self._game.invalid_moves += 1
+            self._game.reasoning_tracker.record_invalid_move()
             return (
                 self._get_observation(),
                 self.reward_config["invalid_attempt"],
@@ -207,17 +208,25 @@ class PuzzleEnv:

         self._step_count += 1

+        # Build position tuple from parsed args for reasoning tracker
+        position = tuple(args)
+
         # Calculate reward
         if result.success:
             reward = self.reward_config["correct_placement"]

+            # Feed reasoning tracker
+            # optimal_steps is dynamic (reflects current state), so use it directly
+            remaining = self._game.optimal_steps or 0
+            self._game.reasoning_tracker.record_valid_move(position, remaining)
+
             # Check for completion
             terminated = self._game.is_complete()
             if terminated:
                 # Add completion bonus with efficiency multiplier
-
-                if
-                    efficiency = min(1.0,
+                opt = self._game.optimal_steps
+                if opt and self._game.moves_made > 0:
+                    efficiency = min(1.0, opt / self._game.moves_made)
                 else:
                     efficiency = 1.0
                 reward += (
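The completion branch now guards against a missing solver estimate and a zero move count before computing the efficiency multiplier. A small worked example of the new logic, with invented numbers:

```python
# Worked example of the rewritten efficiency multiplier (numbers invented).
optimal_steps = 20   # solver's minimum step count; may be None/0 if unknown
moves_made = 25      # steps the agent actually took

if optimal_steps and moves_made > 0:
    efficiency = min(1.0, optimal_steps / moves_made)  # 20 / 25 -> 0.8
else:
    efficiency = 1.0                                   # unknown optimum: no penalty

print(efficiency)  # 0.8; an agent at or under the optimum gets the full 1.0
```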
@@ -226,11 +235,12 @@ class PuzzleEnv:
         else:
             reward = self.reward_config["invalid_attempt"]
             self._game.invalid_moves += 1
+            self._game.reasoning_tracker.record_invalid_move()
             terminated = False

         truncated = self._step_count >= self.max_steps

-        info = {
+        info: dict[str, Any] = {
             "action": action_str,
             "success": result.success,
             "message": result.message,
@@ -239,6 +249,10 @@ class PuzzleEnv:
             "hints_used": self._game.hints_used,
         }

+        # Include reasoning metrics on episode end
+        if terminated or truncated:
+            info["reasoning_metrics"] = self._game.get_reasoning_metrics().to_dict()
+
         return self._get_observation(), reward, terminated, truncated, info

     async def _execute_action(self, cmd: str, args: list[str]) -> Any:
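`reasoning_metrics` is only attached to `info` on the final step of an episode (terminated or truncated). A hedged sketch of reading it from a driver loop — the `reset()`/`step(action_str)` call pattern is assumed from the surrounding code, not from documented API:

```python
# Sketch, assuming PuzzleEnv exposes async reset() and step(action_str) as used above.
async def run_episode(env, actions: list[str]) -> dict | None:
    """Drive one episode and return the reasoning metrics if the episode ended."""
    await env.reset()
    for action in actions:
        obs, reward, terminated, truncated, info = await env.step(action)
        if terminated or truncated:
            return info.get("reasoning_metrics")  # dict produced by to_dict()
    return None  # episode neither solved nor truncated within the given actions
```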
@@ -371,7 +385,7 @@ class PuzzleEnv:
         if self._game is None:
             return {"error": "no_game"}

-        obs = {
+        obs: dict[str, Any] = {
             "game": self._game.name,
             "difficulty": self._game.difficulty.value,
             "seed": self._game.seed,
@@ -397,6 +411,7 @@ class PuzzleEnv:
             return {}

         profile = self._game.difficulty_profile
+        reasoning = self._game.get_reasoning_metrics()
         return {
             "optimal_steps": self._game.optimal_steps,
             "difficulty_profile": {
@@ -411,6 +426,7 @@ class PuzzleEnv:
                 "hint_budget": self.solver_config.hint_budget,
                 "hint_penalty": self.solver_config.hint_penalty,
             },
+            "reasoning_metrics": reasoning.to_dict(),
         }

     def render(self, mode: str = "ansi") -> str | None:
chuk_puzzles_gym/models/__init__.py
CHANGED

@@ -20,6 +20,7 @@ from .evaluation import (
     EpisodeTracer,
     EvaluationSummary,
     MoveRecord,
+    ReasoningMetrics,
     SolverConfig,
     TraceEvent,
 )
@@ -42,6 +43,7 @@ __all__ = [
     "EpisodeTracer",
     "EvaluationSummary",
     "MoveRecord",
+    "ReasoningMetrics",
     "SolverConfig",
     "TraceEvent",
 ]
chuk_puzzles_gym/models/evaluation.py
CHANGED

@@ -38,6 +38,132 @@ class MoveRecord(BaseModel):
     timestamp_ms: int = Field(default=0, description="Milliseconds since episode start")


+class ReasoningMetrics(BaseModel):
+    """Reasoning depth metrics for evaluating quality of agent reasoning.
+
+    Goes beyond binary success/failure to measure *how* an agent reasons:
+    - Backtrack detection: did the agent revise previous placements?
+    - Progress tracking: how steadily did the agent make progress?
+    - Error patterns: were errors isolated or clustered in streaks?
+    - Reasoning overhead: how much wasted work relative to optimal?
+    """
+
+    model_config = ConfigDict(frozen=True)
+
+    # Raw tracking data
+    backtrack_count: int = Field(
+        default=0,
+        ge=0,
+        description="Times agent placed a value at a previously filled position",
+    )
+    solver_distance_trace: list[int] = Field(
+        default_factory=list,
+        description="Remaining positions to fill after each valid move",
+    )
+    error_streak_max: int = Field(
+        default=0,
+        ge=0,
+        description="Longest consecutive run of invalid moves",
+    )
+    error_streaks: list[int] = Field(
+        default_factory=list,
+        description="Lengths of each consecutive error streak",
+    )
+    total_actions: int = Field(
+        default=0,
+        ge=0,
+        description="Total actions taken (valid + invalid)",
+    )
+    optimal_path_length: int | None = Field(
+        default=None,
+        ge=1,
+        description="Minimum steps to solve (from solver)",
+    )
+
+    @computed_field
+    @property
+    def reasoning_overhead(self) -> float:
+        """Ratio of total actions to optimal path length.
+
+        1.0 = perfect (no wasted actions). Higher = more wasted reasoning.
+        Returns 0.0 if optimal path length is unknown.
+        """
+        if self.optimal_path_length is None or self.optimal_path_length == 0:
+            return 0.0
+        if self.total_actions == 0:
+            return 0.0
+        return self.total_actions / self.optimal_path_length
+
+    @computed_field
+    @property
+    def backtrack_rate(self) -> float:
+        """Fraction of valid moves that were backtracks (revisions).
+
+        0.0 = no backtracks, 1.0 = every move was a revision.
+        """
+        valid_moves = len(self.solver_distance_trace)
+        if valid_moves == 0:
+            return 0.0
+        return self.backtrack_count / valid_moves
+
+    @computed_field
+    @property
+    def progress_velocity(self) -> float:
+        """Average progress per valid move (cells solved per step).
+
+        Measures how much closer to the solution each move gets.
+        1.0 = every move reduces remaining by exactly 1. Lower = backtracks/plateaus.
+        Returns 0.0 if insufficient data.
+        """
+        trace = self.solver_distance_trace
+        if len(trace) < 2:
+            return 0.0
+        total_progress = trace[0] - trace[-1]
+        steps = len(trace) - 1
+        if steps == 0:
+            return 0.0
+        return total_progress / steps
+
+    @computed_field
+    @property
+    def progress_steadiness(self) -> float:
+        """Measure of how monotonically progress decreased (0.0 to 1.0).
+
+        1.0 = perfectly monotonic progress (every move reduced remaining count).
+        0.0 = no monotonic progress at all.
+        """
+        trace = self.solver_distance_trace
+        if len(trace) < 2:
+            return 1.0
+        monotonic_steps = sum(1 for i in range(1, len(trace)) if trace[i] < trace[i - 1])
+        return monotonic_steps / (len(trace) - 1)
+
+    @computed_field
+    @property
+    def avg_error_streak(self) -> float:
+        """Average length of consecutive error streaks.
+
+        Returns 0.0 if no error streaks occurred.
+        """
+        if not self.error_streaks:
+            return 0.0
+        return sum(self.error_streaks) / len(self.error_streaks)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to flat dictionary for reporting."""
+        return {
+            "backtrack_count": self.backtrack_count,
+            "backtrack_rate": round(self.backtrack_rate, 3),
+            "reasoning_overhead": round(self.reasoning_overhead, 3),
+            "progress_velocity": round(self.progress_velocity, 3),
+            "progress_steadiness": round(self.progress_steadiness, 3),
+            "error_streak_max": self.error_streak_max,
+            "avg_error_streak": round(self.avg_error_streak, 3),
+            "total_actions": self.total_actions,
+            "optimal_path_length": self.optimal_path_length,
+        }
+
+
 class EpisodeResult(BaseModel):
     """Complete result of a single puzzle episode with normalized metrics.

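`ReasoningMetrics` is re-exported from `chuk_puzzles_gym.models` (see the `__init__.py` hunk above), so the computed fields can be exercised directly. A worked example with invented trace data:

```python
# Worked example of the new computed fields (all values invented).
from chuk_puzzles_gym.models import ReasoningMetrics

m = ReasoningMetrics(
    backtrack_count=1,
    solver_distance_trace=[5, 4, 3, 3, 2, 1, 0],  # remaining cells after each of 7 valid moves
    error_streak_max=2,
    error_streaks=[2, 1],
    total_actions=10,        # 7 valid moves + 3 invalid attempts
    optimal_path_length=5,
)

print(m.reasoning_overhead)   # 10 / 5 = 2.0
print(m.backtrack_rate)       # 1 / 7 ≈ 0.143
print(m.progress_velocity)    # (5 - 0) / 6 ≈ 0.833
print(m.progress_steadiness)  # 5 strictly-decreasing steps out of 6 ≈ 0.833
print(m.avg_error_streak)     # (2 + 1) / 2 = 1.5
print(m.to_dict())            # flat dict with the rounded values above
```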
@@ -91,6 +217,12 @@ class EpisodeResult(BaseModel):
         description="Complete move history for detailed analysis",
     )

+    # Reasoning depth metrics
+    reasoning_metrics: ReasoningMetrics | None = Field(
+        default=None,
+        description="Detailed reasoning depth metrics (backtracks, progress, error patterns)",
+    )
+
     # Computed normalized metrics
     @computed_field
     @property
@@ -154,7 +286,7 @@ class EpisodeResult(BaseModel):

     def to_summary_dict(self) -> dict[str, Any]:
         """One-line episode summary for logging/streaming."""
-
+        d: dict[str, Any] = {
             "game": self.game,
             "seed": self.seed,
             "difficulty": self.difficulty.value,
@@ -165,6 +297,9 @@ class EpisodeResult(BaseModel):
             "efficiency": round(self.efficiency_score, 3),
             "time_ms": self.wall_time_ms,
         }
+        if self.reasoning_metrics is not None:
+            d["reasoning"] = self.reasoning_metrics.to_dict()
+        return d

     def to_jsonl(self) -> str:
         """Single-line JSON for streaming output."""
@@ -217,6 +352,35 @@ class EvaluationSummary(BaseModel):
             return 0.0
         return sum(e.wall_time_ms for e in self.episodes) / len(self.episodes)

+    @computed_field
+    @property
+    def avg_backtrack_rate(self) -> float:
+        """Average backtrack rate across episodes with reasoning metrics."""
+        with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
+        if not with_metrics:
+            return 0.0
+        return sum(e.reasoning_metrics.backtrack_rate for e in with_metrics) / len(with_metrics)  # type: ignore[union-attr]
+
+    @computed_field
+    @property
+    def avg_reasoning_overhead(self) -> float:
+        """Average reasoning overhead across episodes with reasoning metrics."""
+        with_metrics = [
+            e for e in self.episodes if e.reasoning_metrics is not None and e.reasoning_metrics.reasoning_overhead > 0
+        ]
+        if not with_metrics:
+            return 0.0
+        return sum(e.reasoning_metrics.reasoning_overhead for e in with_metrics) / len(with_metrics)  # type: ignore[union-attr]
+
+    @computed_field
+    @property
+    def avg_progress_steadiness(self) -> float:
+        """Average progress steadiness across episodes with reasoning metrics."""
+        with_metrics = [e for e in self.episodes if e.reasoning_metrics is not None]
+        if not with_metrics:
+            return 0.0
+        return sum(e.reasoning_metrics.progress_steadiness for e in with_metrics) / len(with_metrics)  # type: ignore[union-attr]
+

 class TraceEvent(BaseModel):
     """A single event in an episode trace for JSONL logging."""
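Note the asymmetry in the new aggregates: `avg_reasoning_overhead` skips episodes whose overhead is 0.0 (no known optimal path), while `avg_backtrack_rate` and `avg_progress_steadiness` average over every episode that carries reasoning metrics. A quick illustration with invented per-episode values:

```python
# Invented per-episode values to illustrate the aggregation semantics.
overheads = [1.8, 0.0, 2.2]           # 0.0 means the optimal path length was unknown
backtrack_rates = [0.10, 0.00, 0.25]

known = [o for o in overheads if o > 0]
avg_reasoning_overhead = sum(known) / len(known)                  # (1.8 + 2.2) / 2 = 2.0
avg_backtrack_rate = sum(backtrack_rates) / len(backtrack_rates)  # 0.35 / 3 ≈ 0.117

print(round(avg_reasoning_overhead, 3), round(avg_backtrack_rate, 3))  # 2.0 0.117
```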