chuk-puzzles-gym 0.10.1__tar.gz → 0.10.2__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {chuk_puzzles_gym-0.10.1/src/chuk_puzzles_gym.egg-info → chuk_puzzles_gym-0.10.2}/PKG-INFO +124 -7
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/README.md +123 -6
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/pyproject.toml +1 -1
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/eval.py +168 -46
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/export/dataset.py +7 -1
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/_base/game.py +123 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/gym_env.py +21 -5
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/models/__init__.py +2 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/models/evaluation.py +165 -1
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/server.py +34 -3
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2/src/chuk_puzzles_gym.egg-info}/PKG-INFO +124 -7
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/MANIFEST.in +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/config.yaml +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/setup.cfg +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/constants.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/export/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/_base/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/_base/commands.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/binary/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/binary/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/binary/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/bridges/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/bridges/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/bridges/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/cryptarithmetic/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/cryptarithmetic/commands.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/cryptarithmetic/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/cryptarithmetic/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/einstein/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/einstein/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/einstein/constants.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/einstein/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/einstein/models.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/fillomino/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/fillomino/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/fillomino/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/futoshiki/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/futoshiki/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/futoshiki/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/graph_coloring/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/graph_coloring/commands.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/graph_coloring/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/graph_coloring/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/hidato/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/hidato/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/hidato/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/hitori/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/hitori/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/hitori/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/kakuro/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/kakuro/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/kakuro/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/kenken/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/kenken/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/kenken/enums.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/kenken/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/kenken/models.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/killer_sudoku/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/killer_sudoku/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/killer_sudoku/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/killer_sudoku/models.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/knapsack/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/knapsack/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/knapsack/enums.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/knapsack/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/knapsack/models.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/lights_out/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/lights_out/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/lights_out/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/logic_grid/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/logic_grid/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/logic_grid/constants.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/logic_grid/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/logic_grid/models.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/mastermind/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/mastermind/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/mastermind/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/minesweeper/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/minesweeper/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/minesweeper/enums.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/minesweeper/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/nonogram/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/nonogram/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/nonogram/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/nqueens/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/nqueens/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/nqueens/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/numberlink/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/numberlink/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/numberlink/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/nurikabe/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/nurikabe/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/nurikabe/enums.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/nurikabe/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/rush_hour/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/rush_hour/commands.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/rush_hour/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/rush_hour/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/rush_hour/models.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/scheduler/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/scheduler/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/scheduler/constants.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/scheduler/enums.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/scheduler/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/scheduler/models.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/shikaku/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/shikaku/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/shikaku/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/skyscrapers/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/skyscrapers/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/skyscrapers/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/slitherlink/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/slitherlink/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/slitherlink/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/sokoban/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/sokoban/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/sokoban/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/star_battle/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/star_battle/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/star_battle/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/sudoku/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/sudoku/commands.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/sudoku/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/sudoku/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/tents/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/tents/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/games/tents/game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/models/base.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/models/config.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/models/enums.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/models/games.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/trace/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/trace/generator.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym/utils/__init__.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym.egg-info/SOURCES.txt +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym.egg-info/dependency_links.txt +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym.egg-info/entry_points.txt +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym.egg-info/requires.txt +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/src/chuk_puzzles_gym.egg-info/top_level.txt +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_base_models.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_binary_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_bridges.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_command_handlers.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_cryptarithmetic_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_deterministic_seeding.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_einstein.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_eval.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_fillomino.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_futoshiki_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_game_configs.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_graph_coloring_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_gym_env.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_hidato.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_hitori.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_kakuro_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_kenken_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_killer_sudoku.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_knapsack.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_lights_out.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_logic_grid_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_mastermind.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_minesweeper.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_nonogram_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_nqueens_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_numberlink_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_nurikabe.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_puzzle_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_rush_hour_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_scheduler.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_shikaku.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_skyscrapers_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_slitherlink.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_sokoban.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_star_battle.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_sudoku_game.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_tents.py +0 -0
- {chuk_puzzles_gym-0.10.1 → chuk_puzzles_gym-0.10.2}/tests/test_trace_generator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chuk-puzzles-gym
|
|
3
|
-
Version: 0.10.1
|
|
3
|
+
Version: 0.10.2
|
|
4
4
|
Summary: Multi-game puzzle gym for LLM training and benchmarking - 30 constraint puzzles with synthetic data generation
|
|
5
5
|
Author: Chris Hay
|
|
6
6
|
License: MIT
|
|
@@ -93,10 +93,17 @@ Once connected, type `help` to see available games, or `sudoku easy` to start pl
|
|
|
93
93
|
- Enable with `mode agent` command
|
|
94
94
|
- Machine-parseable grid format with clear start/end markers
|
|
95
95
|
- Compact output optimized for LLM tool integration
|
|
96
|
+
- **Reasoning Depth Metrics** - Measure *how* agents reason, not just whether they succeed
|
|
97
|
+
- Backtrack detection (did the agent revise previous placements?)
|
|
98
|
+
- Progress steadiness (monotonic advance toward solution?)
|
|
99
|
+
- Error streak analysis (isolated mistakes vs. clustered confusion?)
|
|
100
|
+
- Reasoning overhead (wasted work relative to optimal path)
|
|
101
|
+
- Solver distance traces (remaining work after each valid move)
|
|
102
|
+
- Available in all paths: Gym env, eval harness, and server (telnet/WebSocket)
|
|
96
103
|
- **Evaluation Harness** (`chuk-puzzles-eval`) - Built-in benchmarking CLI
|
|
97
104
|
- Batch evaluation with configurable episodes
|
|
98
105
|
- Multiple output formats (JSON, CSV, Markdown)
|
|
99
|
-
- Metrics: moves, invalid moves, hints, solve time
|
|
106
|
+
- Metrics: moves, invalid moves, hints, solve time, reasoning depth
|
|
100
107
|
- Reproducible with deterministic seeds
|
|
101
108
|
- **Dataset Export** (`chuk-puzzles-export`) - Synthetic data generation for LLM training
|
|
102
109
|
- JSONL output with complete problem definitions and solutions
|
|
@@ -500,6 +507,7 @@ games = PuzzleEnv.available_games()
|
|
|
500
507
|
|
|
501
508
|
- **All 30 games** accessible through unified API
|
|
502
509
|
- **Configurable rewards** for correct moves, invalid attempts, completion bonuses
|
|
510
|
+
- **Reasoning depth metrics** tracking backtracks, progress steadiness, error patterns
|
|
503
511
|
- **Hint system** with optional budget limits
|
|
504
512
|
- **Solver-free mode** for pure reasoning benchmarks
|
|
505
513
|
- **Efficiency scoring** based on optimal step counts
|
|
@@ -515,8 +523,25 @@ obs = {
|
|
|
515
523
|
"moves": 5,
|
|
516
524
|
"invalid_moves": 1,
|
|
517
525
|
"hints_used": 2,
|
|
526
|
+
"hints_remaining": 98,
|
|
518
527
|
"is_complete": False,
|
|
519
|
-
"grid": [[4, 0, 8, ...], ...] # Game-specific state
|
|
528
|
+
"grid": [[4, 0, 8, ...], ...], # Game-specific state
|
|
529
|
+
"render": " | 1 2 3 | ...", # ASCII grid
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
# Info dict includes reasoning metrics and difficulty profile
|
|
533
|
+
info = {
|
|
534
|
+
"optimal_steps": 45,
|
|
535
|
+
"difficulty_profile": {"logic_depth": 2, "branching_factor": 2.0, ...},
|
|
536
|
+
"reasoning_metrics": {
|
|
537
|
+
"backtrack_count": 0,
|
|
538
|
+
"backtrack_rate": 0.0,
|
|
539
|
+
"progress_velocity": 1.0,
|
|
540
|
+
"progress_steadiness": 1.0,
|
|
541
|
+
"reasoning_overhead": 1.0,
|
|
542
|
+
"error_streak_max": 0,
|
|
543
|
+
"solver_distance_trace": [44, 43, 42, ...],
|
|
544
|
+
},
|
|
520
545
|
}
|
|
521
546
|
```
|
|
522
547
|
|
|
@@ -546,6 +571,89 @@ config = SolverConfig(hint_budget=5, hint_penalty=0.1)
|
|
|
546
571
|
env = PuzzleEnv("sudoku", solver_config=config)
|
|
547
572
|
```
|
|
548
573
|
|
|
574
|
+
## Reasoning Depth Metrics
|
|
575
|
+
|
|
576
|
+
Beyond binary success/failure, the system measures **how** an agent reasons through puzzles. These metrics are available in all interaction paths: the Gym environment, the evaluation harness, and the telnet/WebSocket server.
|
|
577
|
+
|
|
578
|
+
### Metrics
|
|
579
|
+
|
|
580
|
+
| Metric | Description | Perfect Score |
|
|
581
|
+
|--------|-------------|---------------|
|
|
582
|
+
| `backtrack_count` | Times the agent revised a previous placement | 0 |
|
|
583
|
+
| `backtrack_rate` | Fraction of valid moves that were backtracks | 0% |
|
|
584
|
+
| `progress_velocity` | Average cells solved per step | 1.0 |
|
|
585
|
+
| `progress_steadiness` | How monotonically remaining work decreases (1.0 = never stalls) | 100% |
|
|
586
|
+
| `reasoning_overhead` | Total actions / optimal path length (1.0 = no waste) | 1.0x |
|
|
587
|
+
| `error_streak_max` | Longest run of consecutive invalid moves | 0 |
|
|
588
|
+
| `avg_error_streak` | Average length of error bursts | 0.0 |
|
|
589
|
+
| `solver_distance_trace` | Remaining positions after each valid move | Monotonically decreasing |
|
|
590
|
+
|
|
591
|
+
### Usage in Gym Environment
|
|
592
|
+
|
|
593
|
+
```python
|
|
594
|
+
from chuk_puzzles_gym.gym_env import PuzzleEnv
|
|
595
|
+
|
|
596
|
+
env = PuzzleEnv("sudoku", difficulty="easy", seed=42)
|
|
597
|
+
obs, info = await env.reset()
|
|
598
|
+
|
|
599
|
+
# Reasoning metrics available in info after reset
|
|
600
|
+
print(info["reasoning_metrics"])
|
|
601
|
+
|
|
602
|
+
# ... agent plays ...
|
|
603
|
+
obs, reward, terminated, truncated, info = await env.step("place 1 1 5")
|
|
604
|
+
|
|
605
|
+
# On episode end, info includes full reasoning metrics
|
|
606
|
+
if terminated:
|
|
607
|
+
metrics = info["reasoning_metrics"]
|
|
608
|
+
print(f"Backtrack rate: {metrics['backtrack_rate']:.0%}")
|
|
609
|
+
print(f"Overhead: {metrics['reasoning_overhead']:.1f}x")
|
|
610
|
+
print(f"Steadiness: {metrics['progress_steadiness']:.0%}")
|
|
611
|
+
```
|
|
612
|
+
|
|
613
|
+
### Usage in Server (Telnet/WebSocket)
|
|
614
|
+
|
|
615
|
+
Reasoning metrics are included automatically in server output:
|
|
616
|
+
|
|
617
|
+
- **JSON mode**: `reasoning_metrics` dict in every state response and completion message
|
|
618
|
+
- **STRICT mode**: `BT=`, `OH=`, `ST=` fields appended to STATS and COMPLETE messages
|
|
619
|
+
- **Normal mode**: "Reasoning Depth" section shown on completion and in `stats` command
|
|
620
|
+
|
|
621
|
+
```
|
|
622
|
+
> mode json
|
|
623
|
+
> place 1 1 5
|
|
624
|
+
{"type":"result","success":true,...,"state":{...,"reasoning_metrics":{"backtrack_count":0,...}}}
|
|
625
|
+
|
|
626
|
+
> stats
|
|
627
|
+
{"type":"stats",...,"reasoning_metrics":{"backtrack_count":0,"backtrack_rate":0.0,...}}
|
|
628
|
+
```
|
|
629
|
+
|
|
630
|
+
### Usage in Evaluation Harness
|
|
631
|
+
|
|
632
|
+
```bash
|
|
633
|
+
# Reasoning metrics included in all output formats
|
|
634
|
+
chuk-puzzles-eval sudoku -d easy -n 10 -o json
|
|
635
|
+
```
|
|
636
|
+
|
|
637
|
+
```python
|
|
638
|
+
from chuk_puzzles_gym.eval import evaluate_game
|
|
639
|
+
|
|
640
|
+
report = await evaluate_game("sudoku", difficulty="easy", episodes=10)
|
|
641
|
+
report.print_summary() # Includes "Reasoning Depth" section
|
|
642
|
+
|
|
643
|
+
# Aggregate metrics
|
|
644
|
+
print(f"Avg backtrack rate: {report.avg_backtrack_rate:.0%}")
|
|
645
|
+
print(f"Avg overhead: {report.avg_reasoning_overhead:.1f}x")
|
|
646
|
+
print(f"Avg steadiness: {report.avg_progress_steadiness:.0%}")
|
|
647
|
+
```
|
|
648
|
+
|
|
649
|
+
### What the Metrics Reveal
|
|
650
|
+
|
|
651
|
+
A **perfect solver** shows: 0 backtracks, 1.0x overhead, 100% steadiness, 1.0 velocity.
|
|
652
|
+
|
|
653
|
+
A **struggling agent** shows: high backtrack rate (revising decisions), error streaks (clustered confusion), low steadiness (stalling progress), and high overhead (wasted work).
|
|
654
|
+
|
|
655
|
+
These patterns are visible even when two agents both eventually solve a puzzle — the metrics expose the **quality of the reasoning path**, not just the outcome.
|
|
656
|
+
|
|
549
657
|
## Evaluation Harness
|
|
550
658
|
|
|
551
659
|
The project includes a built-in **evaluation harness** for benchmarking puzzle-solving agents:
|
|
@@ -604,6 +712,12 @@ Avg Time: 12ms
|
|
|
604
712
|
| `hints_used` | Number of hints requested |
|
|
605
713
|
| `wall_time_ms` | Time to solve in milliseconds |
|
|
606
714
|
| `seed` | Puzzle seed for reproducibility |
|
|
715
|
+
| `backtrack_count` | Times agent revised a previous placement |
|
|
716
|
+
| `backtrack_rate` | Fraction of valid moves that were backtracks |
|
|
717
|
+
| `progress_steadiness` | How monotonically progress advances (1.0 = perfect) |
|
|
718
|
+
| `reasoning_overhead` | Total actions / optimal path (1.0 = no waste) |
|
|
719
|
+
| `error_streak_max` | Longest run of consecutive invalid moves |
|
|
720
|
+
| `progress_velocity` | Average cells solved per step |
|
|
607
721
|
|
|
608
722
|
## Dataset Export
|
|
609
723
|
|
|
@@ -1194,12 +1308,13 @@ chuk-puzzles-gym/
|
|
|
1194
1308
|
│ │ ├── base.py # GridPosition, MoveResult
|
|
1195
1309
|
│ │ ├── config.py # Base GameConfig
|
|
1196
1310
|
│ │ ├── enums.py # DifficultyLevel, GameCommand, etc.
|
|
1311
|
+
│ │ ├── evaluation.py # ReasoningMetrics, EpisodeResult, EvaluationSummary
|
|
1197
1312
|
│ │ └── games.py # Game-specific models (Cage, Task, etc.)
|
|
1198
1313
|
│ └── games/ # Self-contained game modules
|
|
1199
1314
|
│ ├── __init__.py # AVAILABLE_GAMES registry
|
|
1200
1315
|
│ ├── _base/ # Base classes
|
|
1201
1316
|
│ │ ├── __init__.py
|
|
1202
|
-
│ │ ├── game.py # PuzzleGame ABC
|
|
1317
|
+
│ │ ├── game.py # PuzzleGame ABC + ReasoningTracker
|
|
1203
1318
|
│ │ └── commands.py # GameCommandHandler ABC
|
|
1204
1319
|
│ ├── sudoku/ # Example game module
|
|
1205
1320
|
│ │ ├── __init__.py # Exports SudokuGame
|
|
@@ -1226,6 +1341,7 @@ chuk-puzzles-gym/
|
|
|
1226
1341
|
│ ├── example_graph_coloring.py # Graph Coloring game logic demo
|
|
1227
1342
|
│ ├── example_cryptarithmetic.py# Cryptarithmetic game logic demo
|
|
1228
1343
|
│ ├── example_rush_hour.py # Rush Hour game logic demo
|
|
1344
|
+
│ ├── example_reasoning_metrics.py # Reasoning depth metrics demo
|
|
1229
1345
|
│ └── README.md # Example usage guide
|
|
1230
1346
|
├── .github/workflows/ # CI/CD workflows
|
|
1231
1347
|
├── pyproject.toml # Modern Python project config
|
|
@@ -1465,9 +1581,10 @@ See [ROADMAP.md](ROADMAP.md) for the full development roadmap.
|
|
|
1465
1581
|
### Highlights
|
|
1466
1582
|
|
|
1467
1583
|
**Benchmarking & Metrics**
|
|
1468
|
-
- Puzzle complexity metrics (constraint count, variable count, branching factor)
|
|
1469
|
-
- Episode model for tracking game sessions
|
|
1470
|
-
- Trace logging for offline analysis
|
|
1584
|
+
- ~~Puzzle complexity metrics~~ (implemented: constraint count, variable count, branching factor)
|
|
1585
|
+
- ~~Episode model for tracking game sessions~~ (implemented: EpisodeResult with ReasoningMetrics)
|
|
1586
|
+
- ~~Reasoning depth metrics~~ (implemented: backtrack detection, progress steadiness, error patterns)
|
|
1587
|
+
- ~~Trace logging for offline analysis~~ (implemented: solver distance traces in all output paths)
|
|
1471
1588
|
|
|
1472
1589
|
**Agent Evaluation Tools**
|
|
1473
1590
|
- Batch evaluation harness CLI
|
|
@@ -62,10 +62,17 @@ Once connected, type `help` to see available games, or `sudoku easy` to start pl
|
|
|
62
62
|
- Enable with `mode agent` command
|
|
63
63
|
- Machine-parseable grid format with clear start/end markers
|
|
64
64
|
- Compact output optimized for LLM tool integration
|
|
65
|
+
- **Reasoning Depth Metrics** - Measure *how* agents reason, not just whether they succeed
|
|
66
|
+
- Backtrack detection (did the agent revise previous placements?)
|
|
67
|
+
- Progress steadiness (monotonic advance toward solution?)
|
|
68
|
+
- Error streak analysis (isolated mistakes vs. clustered confusion?)
|
|
69
|
+
- Reasoning overhead (wasted work relative to optimal path)
|
|
70
|
+
- Solver distance traces (remaining work after each valid move)
|
|
71
|
+
- Available in all paths: Gym env, eval harness, and server (telnet/WebSocket)
|
|
65
72
|
- **Evaluation Harness** (`chuk-puzzles-eval`) - Built-in benchmarking CLI
|
|
66
73
|
- Batch evaluation with configurable episodes
|
|
67
74
|
- Multiple output formats (JSON, CSV, Markdown)
|
|
68
|
-
- Metrics: moves, invalid moves, hints, solve time
|
|
75
|
+
- Metrics: moves, invalid moves, hints, solve time, reasoning depth
|
|
69
76
|
- Reproducible with deterministic seeds
|
|
70
77
|
- **Dataset Export** (`chuk-puzzles-export`) - Synthetic data generation for LLM training
|
|
71
78
|
- JSONL output with complete problem definitions and solutions
|
|
@@ -469,6 +476,7 @@ games = PuzzleEnv.available_games()
|
|
|
469
476
|
|
|
470
477
|
- **All 30 games** accessible through unified API
|
|
471
478
|
- **Configurable rewards** for correct moves, invalid attempts, completion bonuses
|
|
479
|
+
- **Reasoning depth metrics** tracking backtracks, progress steadiness, error patterns
|
|
472
480
|
- **Hint system** with optional budget limits
|
|
473
481
|
- **Solver-free mode** for pure reasoning benchmarks
|
|
474
482
|
- **Efficiency scoring** based on optimal step counts
|
|
@@ -484,8 +492,25 @@ obs = {
|
|
|
484
492
|
"moves": 5,
|
|
485
493
|
"invalid_moves": 1,
|
|
486
494
|
"hints_used": 2,
|
|
495
|
+
"hints_remaining": 98,
|
|
487
496
|
"is_complete": False,
|
|
488
|
-
"grid": [[4, 0, 8, ...], ...] # Game-specific state
|
|
497
|
+
"grid": [[4, 0, 8, ...], ...], # Game-specific state
|
|
498
|
+
"render": " | 1 2 3 | ...", # ASCII grid
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
# Info dict includes reasoning metrics and difficulty profile
|
|
502
|
+
info = {
|
|
503
|
+
"optimal_steps": 45,
|
|
504
|
+
"difficulty_profile": {"logic_depth": 2, "branching_factor": 2.0, ...},
|
|
505
|
+
"reasoning_metrics": {
|
|
506
|
+
"backtrack_count": 0,
|
|
507
|
+
"backtrack_rate": 0.0,
|
|
508
|
+
"progress_velocity": 1.0,
|
|
509
|
+
"progress_steadiness": 1.0,
|
|
510
|
+
"reasoning_overhead": 1.0,
|
|
511
|
+
"error_streak_max": 0,
|
|
512
|
+
"solver_distance_trace": [44, 43, 42, ...],
|
|
513
|
+
},
|
|
489
514
|
}
|
|
490
515
|
```
|
|
491
516
|
|
|
@@ -515,6 +540,89 @@ config = SolverConfig(hint_budget=5, hint_penalty=0.1)
|
|
|
515
540
|
env = PuzzleEnv("sudoku", solver_config=config)
|
|
516
541
|
```
|
|
517
542
|
|
|
543
|
+
## Reasoning Depth Metrics
|
|
544
|
+
|
|
545
|
+
Beyond binary success/failure, the system measures **how** an agent reasons through puzzles. These metrics are available in all interaction paths: the Gym environment, the evaluation harness, and the telnet/WebSocket server.
|
|
546
|
+
|
|
547
|
+
### Metrics
|
|
548
|
+
|
|
549
|
+
| Metric | Description | Perfect Score |
|
|
550
|
+
|--------|-------------|---------------|
|
|
551
|
+
| `backtrack_count` | Times the agent revised a previous placement | 0 |
|
|
552
|
+
| `backtrack_rate` | Fraction of valid moves that were backtracks | 0% |
|
|
553
|
+
| `progress_velocity` | Average cells solved per step | 1.0 |
|
|
554
|
+
| `progress_steadiness` | How monotonically remaining work decreases (1.0 = never stalls) | 100% |
|
|
555
|
+
| `reasoning_overhead` | Total actions / optimal path length (1.0 = no waste) | 1.0x |
|
|
556
|
+
| `error_streak_max` | Longest run of consecutive invalid moves | 0 |
|
|
557
|
+
| `avg_error_streak` | Average length of error bursts | 0.0 |
|
|
558
|
+
| `solver_distance_trace` | Remaining positions after each valid move | Monotonically decreasing |
|
|
559
|
+
|
|
560
|
+
### Usage in Gym Environment
|
|
561
|
+
|
|
562
|
+
```python
|
|
563
|
+
from chuk_puzzles_gym.gym_env import PuzzleEnv
|
|
564
|
+
|
|
565
|
+
env = PuzzleEnv("sudoku", difficulty="easy", seed=42)
|
|
566
|
+
obs, info = await env.reset()
|
|
567
|
+
|
|
568
|
+
# Reasoning metrics available in info after reset
|
|
569
|
+
print(info["reasoning_metrics"])
|
|
570
|
+
|
|
571
|
+
# ... agent plays ...
|
|
572
|
+
obs, reward, terminated, truncated, info = await env.step("place 1 1 5")
|
|
573
|
+
|
|
574
|
+
# On episode end, info includes full reasoning metrics
|
|
575
|
+
if terminated:
|
|
576
|
+
metrics = info["reasoning_metrics"]
|
|
577
|
+
print(f"Backtrack rate: {metrics['backtrack_rate']:.0%}")
|
|
578
|
+
print(f"Overhead: {metrics['reasoning_overhead']:.1f}x")
|
|
579
|
+
print(f"Steadiness: {metrics['progress_steadiness']:.0%}")
|
|
580
|
+
```
|
|
581
|
+
|
|
582
|
+
### Usage in Server (Telnet/WebSocket)
|
|
583
|
+
|
|
584
|
+
Reasoning metrics are included automatically in server output:
|
|
585
|
+
|
|
586
|
+
- **JSON mode**: `reasoning_metrics` dict in every state response and completion message
|
|
587
|
+
- **STRICT mode**: `BT=`, `OH=`, `ST=` fields appended to STATS and COMPLETE messages
|
|
588
|
+
- **Normal mode**: "Reasoning Depth" section shown on completion and in `stats` command
|
|
589
|
+
|
|
590
|
+
```
|
|
591
|
+
> mode json
|
|
592
|
+
> place 1 1 5
|
|
593
|
+
{"type":"result","success":true,...,"state":{...,"reasoning_metrics":{"backtrack_count":0,...}}}
|
|
594
|
+
|
|
595
|
+
> stats
|
|
596
|
+
{"type":"stats",...,"reasoning_metrics":{"backtrack_count":0,"backtrack_rate":0.0,...}}
|
|
597
|
+
```
|
|
598
|
+
|
|
599
|
+
### Usage in Evaluation Harness
|
|
600
|
+
|
|
601
|
+
```bash
|
|
602
|
+
# Reasoning metrics included in all output formats
|
|
603
|
+
chuk-puzzles-eval sudoku -d easy -n 10 -o json
|
|
604
|
+
```
|
|
605
|
+
|
|
606
|
+
```python
|
|
607
|
+
from chuk_puzzles_gym.eval import evaluate_game
|
|
608
|
+
|
|
609
|
+
report = await evaluate_game("sudoku", difficulty="easy", episodes=10)
|
|
610
|
+
report.print_summary() # Includes "Reasoning Depth" section
|
|
611
|
+
|
|
612
|
+
# Aggregate metrics
|
|
613
|
+
print(f"Avg backtrack rate: {report.avg_backtrack_rate:.0%}")
|
|
614
|
+
print(f"Avg overhead: {report.avg_reasoning_overhead:.1f}x")
|
|
615
|
+
print(f"Avg steadiness: {report.avg_progress_steadiness:.0%}")
|
|
616
|
+
```
|
|
617
|
+
|
|
618
|
+
### What the Metrics Reveal
|
|
619
|
+
|
|
620
|
+
A **perfect solver** shows: 0 backtracks, 1.0x overhead, 100% steadiness, 1.0 velocity.
|
|
621
|
+
|
|
622
|
+
A **struggling agent** shows: high backtrack rate (revising decisions), error streaks (clustered confusion), low steadiness (stalling progress), and high overhead (wasted work).
|
|
623
|
+
|
|
624
|
+
These patterns are visible even when two agents both eventually solve a puzzle — the metrics expose the **quality of the reasoning path**, not just the outcome.
|
|
625
|
+
|
|
518
626
|
## Evaluation Harness
|
|
519
627
|
|
|
520
628
|
The project includes a built-in **evaluation harness** for benchmarking puzzle-solving agents:
|
|
@@ -573,6 +681,12 @@ Avg Time: 12ms
|
|
|
573
681
|
| `hints_used` | Number of hints requested |
|
|
574
682
|
| `wall_time_ms` | Time to solve in milliseconds |
|
|
575
683
|
| `seed` | Puzzle seed for reproducibility |
|
|
684
|
+
| `backtrack_count` | Times agent revised a previous placement |
|
|
685
|
+
| `backtrack_rate` | Fraction of valid moves that were backtracks |
|
|
686
|
+
| `progress_steadiness` | How monotonically progress advances (1.0 = perfect) |
|
|
687
|
+
| `reasoning_overhead` | Total actions / optimal path (1.0 = no waste) |
|
|
688
|
+
| `error_streak_max` | Longest run of consecutive invalid moves |
|
|
689
|
+
| `progress_velocity` | Average cells solved per step |
|
|
576
690
|
|
|
577
691
|
## Dataset Export
|
|
578
692
|
|
|
@@ -1163,12 +1277,13 @@ chuk-puzzles-gym/
|
|
|
1163
1277
|
│ │ ├── base.py # GridPosition, MoveResult
|
|
1164
1278
|
│ │ ├── config.py # Base GameConfig
|
|
1165
1279
|
│ │ ├── enums.py # DifficultyLevel, GameCommand, etc.
|
|
1280
|
+
│ │ ├── evaluation.py # ReasoningMetrics, EpisodeResult, EvaluationSummary
|
|
1166
1281
|
│ │ └── games.py # Game-specific models (Cage, Task, etc.)
|
|
1167
1282
|
│ └── games/ # Self-contained game modules
|
|
1168
1283
|
│ ├── __init__.py # AVAILABLE_GAMES registry
|
|
1169
1284
|
│ ├── _base/ # Base classes
|
|
1170
1285
|
│ │ ├── __init__.py
|
|
1171
|
-
│ │ ├── game.py # PuzzleGame ABC
|
|
1286
|
+
│ │ ├── game.py # PuzzleGame ABC + ReasoningTracker
|
|
1172
1287
|
│ │ └── commands.py # GameCommandHandler ABC
|
|
1173
1288
|
│ ├── sudoku/ # Example game module
|
|
1174
1289
|
│ │ ├── __init__.py # Exports SudokuGame
|
|
@@ -1195,6 +1310,7 @@ chuk-puzzles-gym/
|
|
|
1195
1310
|
│ ├── example_graph_coloring.py # Graph Coloring game logic demo
|
|
1196
1311
|
│ ├── example_cryptarithmetic.py# Cryptarithmetic game logic demo
|
|
1197
1312
|
│ ├── example_rush_hour.py # Rush Hour game logic demo
|
|
1313
|
+
│ ├── example_reasoning_metrics.py # Reasoning depth metrics demo
|
|
1198
1314
|
│ └── README.md # Example usage guide
|
|
1199
1315
|
├── .github/workflows/ # CI/CD workflows
|
|
1200
1316
|
├── pyproject.toml # Modern Python project config
|
|
@@ -1434,9 +1550,10 @@ See [ROADMAP.md](ROADMAP.md) for the full development roadmap.
|
|
|
1434
1550
|
### Highlights
|
|
1435
1551
|
|
|
1436
1552
|
**Benchmarking & Metrics**
|
|
1437
|
-
- Puzzle complexity metrics (constraint count, variable count, branching factor)
|
|
1438
|
-
- Episode model for tracking game sessions
|
|
1439
|
-
-
|
|
1553
|
+
- ~~Puzzle complexity metrics~~ (implemented: constraint count, variable count, branching factor)
|
|
1554
|
+
- ~~Episode model for tracking game sessions~~ (implemented: EpisodeResult with ReasoningMetrics)
|
|
1555
|
+
- ~~Reasoning depth metrics~~ (implemented: backtrack detection, progress steadiness, error patterns)
|
|
1556
|
+
- ~~Trace logging for offline analysis~~ (implemented: solver distance traces in all output paths)
|
|
1440
1557
|
|
|
1441
1558
|
**Agent Evaluation Tools**
|
|
1442
1559
|
- Batch evaluation harness CLI
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "chuk-puzzles-gym"
|
|
7
|
-
version = "0.10.1"
|
|
7
|
+
version = "0.10.2"
|
|
8
8
|
description = "Multi-game puzzle gym for LLM training and benchmarking - 30 constraint puzzles with synthetic data generation"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.11"
|