chuk-puzzles-gym 0.10__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chuk_puzzles_gym/eval.py +168 -46
- chuk_puzzles_gym/export/dataset.py +7 -1
- chuk_puzzles_gym/games/_base/game.py +123 -0
- chuk_puzzles_gym/games/binary/game.py +2 -0
- chuk_puzzles_gym/games/bridges/game.py +2 -0
- chuk_puzzles_gym/games/cryptarithmetic/game.py +5 -0
- chuk_puzzles_gym/games/einstein/game.py +2 -0
- chuk_puzzles_gym/games/fillomino/game.py +2 -0
- chuk_puzzles_gym/games/futoshiki/game.py +2 -0
- chuk_puzzles_gym/games/graph_coloring/commands.py +20 -3
- chuk_puzzles_gym/games/graph_coloring/game.py +8 -1
- chuk_puzzles_gym/games/hidato/game.py +2 -0
- chuk_puzzles_gym/games/hitori/game.py +2 -0
- chuk_puzzles_gym/games/kakuro/game.py +2 -0
- chuk_puzzles_gym/games/kenken/game.py +2 -0
- chuk_puzzles_gym/games/killer_sudoku/game.py +2 -0
- chuk_puzzles_gym/games/knapsack/game.py +2 -0
- chuk_puzzles_gym/games/lights_out/game.py +2 -0
- chuk_puzzles_gym/games/logic_grid/game.py +2 -0
- chuk_puzzles_gym/games/mastermind/game.py +2 -0
- chuk_puzzles_gym/games/minesweeper/game.py +2 -0
- chuk_puzzles_gym/games/nonogram/game.py +2 -0
- chuk_puzzles_gym/games/nqueens/game.py +5 -0
- chuk_puzzles_gym/games/numberlink/game.py +6 -0
- chuk_puzzles_gym/games/nurikabe/game.py +2 -0
- chuk_puzzles_gym/games/rush_hour/game.py +4 -0
- chuk_puzzles_gym/games/scheduler/game.py +2 -0
- chuk_puzzles_gym/games/shikaku/game.py +2 -0
- chuk_puzzles_gym/games/skyscrapers/game.py +5 -0
- chuk_puzzles_gym/games/slitherlink/game.py +2 -0
- chuk_puzzles_gym/games/sokoban/game.py +2 -0
- chuk_puzzles_gym/games/star_battle/game.py +2 -0
- chuk_puzzles_gym/games/sudoku/game.py +2 -0
- chuk_puzzles_gym/games/tents/game.py +2 -0
- chuk_puzzles_gym/gym_env.py +21 -5
- chuk_puzzles_gym/models/__init__.py +2 -0
- chuk_puzzles_gym/models/evaluation.py +165 -1
- chuk_puzzles_gym/server.py +51 -72
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/METADATA +124 -7
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/RECORD +43 -43
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/WHEEL +0 -0
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/entry_points.txt +0 -0
- {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/top_level.txt +0 -0
chuk_puzzles_gym/server.py
CHANGED
|
@@ -63,6 +63,9 @@ class ArcadeHandler(TelnetHandler):
|
|
|
63
63
|
if not self.current_game:
|
|
64
64
|
return
|
|
65
65
|
|
|
66
|
+
# Get final reasoning metrics
|
|
67
|
+
reasoning = self.current_game.get_reasoning_metrics().to_dict()
|
|
68
|
+
|
|
66
69
|
if self.output_mode == OutputMode.JSON:
|
|
67
70
|
await self.send_json_response(
|
|
68
71
|
type="complete",
|
|
@@ -72,17 +75,27 @@ class ArcadeHandler(TelnetHandler):
|
|
|
72
75
|
invalid_moves=self.current_game.invalid_moves,
|
|
73
76
|
hints_used=self.current_game.hints_used,
|
|
74
77
|
optimal_steps=self.current_game.optimal_steps,
|
|
78
|
+
reasoning_metrics=reasoning,
|
|
75
79
|
)
|
|
76
80
|
elif self.output_mode == OutputMode.STRICT:
|
|
77
81
|
await self.send_line(
|
|
78
82
|
f"COMPLETE:{self.current_game.moves_made}:{self.current_game.invalid_moves}:"
|
|
79
|
-
f"{self.current_game.hints_used}"
|
|
83
|
+
f"{self.current_game.hints_used}:"
|
|
84
|
+
f"BT={reasoning['backtrack_count']}:"
|
|
85
|
+
f"OH={reasoning['reasoning_overhead']:.2f}:"
|
|
86
|
+
f"ST={reasoning['progress_steadiness']:.2f}"
|
|
80
87
|
)
|
|
81
88
|
else:
|
|
82
89
|
await self.send_line("\n" + "=" * 50)
|
|
83
90
|
await self.send_line("CONGRATULATIONS! YOU SOLVED IT!")
|
|
84
91
|
await self.send_line("=" * 50)
|
|
85
92
|
await self.send_line(self.current_game.get_stats())
|
|
93
|
+
await self.send_line("")
|
|
94
|
+
await self.send_line("Reasoning Depth:")
|
|
95
|
+
await self.send_line(f" Backtrack rate: {reasoning['backtrack_rate']:.0%}")
|
|
96
|
+
await self.send_line(f" Progress steadiness: {reasoning['progress_steadiness']:.0%}")
|
|
97
|
+
await self.send_line(f" Reasoning overhead: {reasoning['reasoning_overhead']:.1f}x optimal")
|
|
98
|
+
await self.send_line(f" Error streak max: {reasoning['error_streak_max']}")
|
|
86
99
|
await self.send_line("\nType 'menu' to play another game.")
|
|
87
100
|
await self.send_line("=" * 50 + "\n")
|
|
88
101
|
|
|
@@ -109,6 +122,9 @@ class ArcadeHandler(TelnetHandler):
|
|
|
109
122
|
"constraint_density": profile.constraint_density,
|
|
110
123
|
}
|
|
111
124
|
|
|
125
|
+
# Reasoning depth metrics
|
|
126
|
+
reasoning = self.current_game.get_reasoning_metrics().to_dict()
|
|
127
|
+
|
|
112
128
|
return {
|
|
113
129
|
"game": self.current_game.name,
|
|
114
130
|
"difficulty": self.current_game.difficulty.value,
|
|
@@ -120,6 +136,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
120
136
|
"optimal_steps": self.current_game.optimal_steps,
|
|
121
137
|
"is_complete": self.current_game.is_complete(),
|
|
122
138
|
"difficulty_profile": profile_dict,
|
|
139
|
+
"reasoning_metrics": reasoning,
|
|
123
140
|
"grid": grid,
|
|
124
141
|
}
|
|
125
142
|
|
|
@@ -435,9 +452,10 @@ class ArcadeHandler(TelnetHandler):
|
|
|
435
452
|
return
|
|
436
453
|
|
|
437
454
|
if cmd_enum == GameCommand.STATS:
|
|
438
|
-
# Show detailed stats including difficulty profile
|
|
455
|
+
# Show detailed stats including difficulty profile and reasoning metrics
|
|
439
456
|
profile = self.current_game.difficulty_profile
|
|
440
457
|
optimal = self.current_game.optimal_steps
|
|
458
|
+
reasoning = self.current_game.get_reasoning_metrics().to_dict()
|
|
441
459
|
|
|
442
460
|
if self.output_mode == OutputMode.JSON:
|
|
443
461
|
await self.send_json_response(
|
|
@@ -455,11 +473,15 @@ class ArcadeHandler(TelnetHandler):
|
|
|
455
473
|
"state_observability": profile.state_observability,
|
|
456
474
|
"constraint_density": profile.constraint_density,
|
|
457
475
|
},
|
|
476
|
+
reasoning_metrics=reasoning,
|
|
458
477
|
)
|
|
459
478
|
elif self.output_mode == OutputMode.STRICT:
|
|
460
479
|
await self.send_line(
|
|
461
480
|
f"STATS:{self.current_game.moves_made}:{self.current_game.invalid_moves}:"
|
|
462
|
-
f"{self.current_game.hints_used}:{optimal or 0}"
|
|
481
|
+
f"{self.current_game.hints_used}:{optimal or 0}:"
|
|
482
|
+
f"BT={reasoning['backtrack_count']}:"
|
|
483
|
+
f"OH={reasoning['reasoning_overhead']:.2f}:"
|
|
484
|
+
f"ST={reasoning['progress_steadiness']:.2f}"
|
|
463
485
|
)
|
|
464
486
|
else:
|
|
465
487
|
await self.send_line("")
|
|
@@ -482,6 +504,15 @@ class ArcadeHandler(TelnetHandler):
|
|
|
482
504
|
await self.send_line(f" Optimal steps: {optimal}")
|
|
483
505
|
await self.send_line(f" Current efficiency: {efficiency:.1%}")
|
|
484
506
|
await self.send_line("")
|
|
507
|
+
await self.send_line("Reasoning Depth:")
|
|
508
|
+
await self.send_line(f" Backtrack count: {reasoning['backtrack_count']}")
|
|
509
|
+
await self.send_line(f" Backtrack rate: {reasoning['backtrack_rate']:.0%}")
|
|
510
|
+
await self.send_line(f" Progress velocity: {reasoning['progress_velocity']:.2f} cells/step")
|
|
511
|
+
await self.send_line(f" Progress steadiness: {reasoning['progress_steadiness']:.0%}")
|
|
512
|
+
await self.send_line(f" Reasoning overhead: {reasoning['reasoning_overhead']:.1f}x optimal")
|
|
513
|
+
await self.send_line(f" Error streak max: {reasoning['error_streak_max']}")
|
|
514
|
+
await self.send_line(f" Total actions: {reasoning['total_actions']}")
|
|
515
|
+
await self.send_line("")
|
|
485
516
|
await self.send_line("Difficulty Profile:")
|
|
486
517
|
await self.send_line(f" Logic depth: {profile.logic_depth}")
|
|
487
518
|
await self.send_line(f" Branching factor: {profile.branching_factor:.1f}")
|
|
@@ -599,10 +630,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
599
630
|
if self.game_handler and cmd_enum in self.game_handler.supported_commands:
|
|
600
631
|
result = await self.game_handler.handle_command(cmd_enum, parts[1:])
|
|
601
632
|
|
|
602
|
-
# Track invalid moves
|
|
603
|
-
if not result.result.success:
|
|
604
|
-
self.current_game.invalid_moves += 1
|
|
605
|
-
|
|
606
633
|
# Send result based on output mode
|
|
607
634
|
code = "OK" if result.result.success else "INVALID"
|
|
608
635
|
await self.send_result(result.result.success, result.result.message, code)
|
|
@@ -626,9 +653,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
626
653
|
num = int(parts[3])
|
|
627
654
|
|
|
628
655
|
result = await self.current_game.validate_move(row, col, num)
|
|
629
|
-
|
|
630
|
-
if not result.success:
|
|
631
|
-
self.current_game.invalid_moves += 1
|
|
656
|
+
self.current_game.record_move((row, col), result.success)
|
|
632
657
|
|
|
633
658
|
await self.send_result(result.success, result.message, "PLACED" if result.success else "INVALID_MOVE")
|
|
634
659
|
|
|
@@ -639,7 +664,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
639
664
|
await self.send_game_complete()
|
|
640
665
|
|
|
641
666
|
except ValueError:
|
|
642
|
-
self.current_game.invalid_moves += 1
|
|
643
667
|
await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
|
|
644
668
|
return
|
|
645
669
|
|
|
@@ -653,9 +677,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
653
677
|
col = int(parts[2])
|
|
654
678
|
|
|
655
679
|
result = await self.current_game.validate_move(row, col, 0)
|
|
656
|
-
|
|
657
|
-
if not result.success:
|
|
658
|
-
self.current_game.invalid_moves += 1
|
|
680
|
+
self.current_game.record_move((row, col), result.success)
|
|
659
681
|
|
|
660
682
|
await self.send_result(result.success, result.message, "CLEARED" if result.success else "INVALID_CLEAR")
|
|
661
683
|
|
|
@@ -663,7 +685,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
663
685
|
await self.display_puzzle()
|
|
664
686
|
|
|
665
687
|
except ValueError:
|
|
666
|
-
self.current_game.invalid_moves += 1
|
|
667
688
|
await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
|
|
668
689
|
return
|
|
669
690
|
|
|
@@ -693,9 +714,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
693
714
|
col = int(parts[2])
|
|
694
715
|
|
|
695
716
|
result = await self.current_game.validate_move(row, col)
|
|
696
|
-
|
|
697
|
-
if not result.success:
|
|
698
|
-
self.current_game.invalid_moves += 1
|
|
717
|
+
self.current_game.record_move((row, col), result.success)
|
|
699
718
|
|
|
700
719
|
await self.send_result(result.success, result.message, "PRESSED" if result.success else "INVALID_PRESS")
|
|
701
720
|
|
|
@@ -706,7 +725,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
706
725
|
await self.send_game_complete()
|
|
707
726
|
|
|
708
727
|
except ValueError:
|
|
709
|
-
self.current_game.invalid_moves += 1
|
|
710
728
|
await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
|
|
711
729
|
return
|
|
712
730
|
|
|
@@ -718,9 +736,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
718
736
|
|
|
719
737
|
cat1, val1, cat2, val2 = parts[1], parts[2], parts[3], parts[4]
|
|
720
738
|
result = await self.current_game.validate_move(cat1, val1, cat2, val2, True)
|
|
721
|
-
|
|
722
|
-
if not result.success:
|
|
723
|
-
self.current_game.invalid_moves += 1
|
|
739
|
+
self.current_game.record_move((cat1, val1, cat2, val2), result.success)
|
|
724
740
|
|
|
725
741
|
await self.send_result(result.success, result.message, "CONNECTED" if result.success else "INVALID_CONNECT")
|
|
726
742
|
if result.success:
|
|
@@ -736,9 +752,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
736
752
|
|
|
737
753
|
cat1, val1, cat2, val2 = parts[1], parts[2], parts[3], parts[4]
|
|
738
754
|
result = await self.current_game.validate_move(cat1, val1, cat2, val2, False)
|
|
739
|
-
|
|
740
|
-
if not result.success:
|
|
741
|
-
self.current_game.invalid_moves += 1
|
|
755
|
+
self.current_game.record_move((cat1, val1, cat2, val2), result.success)
|
|
742
756
|
|
|
743
757
|
await self.send_result(result.success, result.message, "EXCLUDED" if result.success else "INVALID_EXCLUDE")
|
|
744
758
|
if result.success:
|
|
@@ -758,9 +772,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
758
772
|
col = int(parts[2])
|
|
759
773
|
|
|
760
774
|
result = await self.current_game.validate_move("reveal", row, col)
|
|
761
|
-
|
|
762
|
-
if not result.success:
|
|
763
|
-
self.current_game.invalid_moves += 1
|
|
775
|
+
self.current_game.record_move((row, col), result.success)
|
|
764
776
|
|
|
765
777
|
await self.send_result(
|
|
766
778
|
result.success, result.message, "REVEALED" if result.success else "INVALID_REVEAL"
|
|
@@ -783,7 +795,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
783
795
|
await self.send_line("=" * 50 + "\n")
|
|
784
796
|
|
|
785
797
|
except ValueError:
|
|
786
|
-
self.current_game.invalid_moves += 1
|
|
787
798
|
await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
|
|
788
799
|
return
|
|
789
800
|
|
|
@@ -797,9 +808,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
797
808
|
col = int(parts[2])
|
|
798
809
|
|
|
799
810
|
result = await self.current_game.validate_move("flag", row, col)
|
|
800
|
-
|
|
801
|
-
if not result.success:
|
|
802
|
-
self.current_game.invalid_moves += 1
|
|
811
|
+
self.current_game.record_move((row, col), result.success)
|
|
803
812
|
|
|
804
813
|
await self.send_result(result.success, result.message, "FLAGGED" if result.success else "INVALID_FLAG")
|
|
805
814
|
|
|
@@ -807,7 +816,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
807
816
|
await self.display_puzzle()
|
|
808
817
|
|
|
809
818
|
except ValueError:
|
|
810
|
-
self.current_game.invalid_moves += 1
|
|
811
819
|
await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
|
|
812
820
|
return
|
|
813
821
|
|
|
@@ -824,9 +832,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
824
832
|
state = int(parts[4])
|
|
825
833
|
|
|
826
834
|
result = await self.current_game.validate_move(edge_type, row, col, state)
|
|
827
|
-
|
|
828
|
-
if not result.success:
|
|
829
|
-
self.current_game.invalid_moves += 1
|
|
835
|
+
self.current_game.record_move((edge_type, row, col), result.success)
|
|
830
836
|
|
|
831
837
|
await self.send_result(result.success, result.message, "SET" if result.success else "INVALID_SET")
|
|
832
838
|
|
|
@@ -837,7 +843,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
837
843
|
await self.send_game_complete()
|
|
838
844
|
|
|
839
845
|
except ValueError:
|
|
840
|
-
self.current_game.invalid_moves += 1
|
|
841
846
|
await self.send_result(False, "Invalid input. Use numbers only for row, col, state.", "PARSE_ERROR")
|
|
842
847
|
return
|
|
843
848
|
|
|
@@ -851,9 +856,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
851
856
|
guess = [int(p) for p in parts[1:]]
|
|
852
857
|
|
|
853
858
|
result = await self.current_game.validate_move(*guess)
|
|
854
|
-
|
|
855
|
-
if not result.success:
|
|
856
|
-
self.current_game.invalid_moves += 1
|
|
859
|
+
self.current_game.record_move(tuple(guess), result.success)
|
|
857
860
|
|
|
858
861
|
await self.send_result(result.success, result.message, "GUESSED" if result.success else "INVALID_GUESS")
|
|
859
862
|
|
|
@@ -874,7 +877,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
874
877
|
await self.send_line("=" * 50 + "\n")
|
|
875
878
|
|
|
876
879
|
except ValueError:
|
|
877
|
-
self.current_game.invalid_moves += 1
|
|
878
880
|
await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
|
|
879
881
|
return
|
|
880
882
|
|
|
@@ -888,9 +890,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
888
890
|
item_index = int(parts[1])
|
|
889
891
|
|
|
890
892
|
result = await self.current_game.validate_move("select", item_index)
|
|
891
|
-
|
|
892
|
-
if not result.success:
|
|
893
|
-
self.current_game.invalid_moves += 1
|
|
893
|
+
self.current_game.record_move((item_index,), result.success)
|
|
894
894
|
|
|
895
895
|
await self.send_result(
|
|
896
896
|
result.success, result.message, "SELECTED" if result.success else "INVALID_SELECT"
|
|
@@ -900,7 +900,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
900
900
|
await self.display_puzzle()
|
|
901
901
|
|
|
902
902
|
except ValueError:
|
|
903
|
-
self.current_game.invalid_moves += 1
|
|
904
903
|
await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
|
|
905
904
|
return
|
|
906
905
|
|
|
@@ -913,9 +912,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
913
912
|
item_index = int(parts[1])
|
|
914
913
|
|
|
915
914
|
result = await self.current_game.validate_move("deselect", item_index)
|
|
916
|
-
|
|
917
|
-
if not result.success:
|
|
918
|
-
self.current_game.invalid_moves += 1
|
|
915
|
+
self.current_game.record_move((item_index,), result.success)
|
|
919
916
|
|
|
920
917
|
await self.send_result(
|
|
921
918
|
result.success, result.message, "DESELECTED" if result.success else "INVALID_DESELECT"
|
|
@@ -925,7 +922,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
925
922
|
await self.display_puzzle()
|
|
926
923
|
|
|
927
924
|
except ValueError:
|
|
928
|
-
self.current_game.invalid_moves += 1
|
|
929
925
|
await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
|
|
930
926
|
return
|
|
931
927
|
|
|
@@ -941,9 +937,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
941
937
|
color = parts[3].lower()
|
|
942
938
|
|
|
943
939
|
result = await self.current_game.validate_move(row, col, color)
|
|
944
|
-
|
|
945
|
-
if not result.success:
|
|
946
|
-
self.current_game.invalid_moves += 1
|
|
940
|
+
self.current_game.record_move((row, col), result.success)
|
|
947
941
|
|
|
948
942
|
await self.send_result(result.success, result.message, "MARKED" if result.success else "INVALID_MARK")
|
|
949
943
|
|
|
@@ -954,7 +948,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
954
948
|
await self.send_game_complete()
|
|
955
949
|
|
|
956
950
|
except ValueError:
|
|
957
|
-
self.current_game.invalid_moves += 1
|
|
958
951
|
await self.send_result(False, "Invalid input. Row and col must be numbers.", "PARSE_ERROR")
|
|
959
952
|
return
|
|
960
953
|
|
|
@@ -969,9 +962,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
969
962
|
col = int(parts[2])
|
|
970
963
|
|
|
971
964
|
result = await self.current_game.validate_move(row, col, "shade")
|
|
972
|
-
|
|
973
|
-
if not result.success:
|
|
974
|
-
self.current_game.invalid_moves += 1
|
|
965
|
+
self.current_game.record_move((row, col), result.success)
|
|
975
966
|
|
|
976
967
|
await self.send_result(result.success, result.message, "SHADED" if result.success else "INVALID_SHADE")
|
|
977
968
|
|
|
@@ -982,7 +973,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
982
973
|
await self.send_game_complete()
|
|
983
974
|
|
|
984
975
|
except ValueError:
|
|
985
|
-
self.current_game.invalid_moves += 1
|
|
986
976
|
await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
|
|
987
977
|
return
|
|
988
978
|
|
|
@@ -1000,9 +990,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
1000
990
|
count = int(parts[5])
|
|
1001
991
|
|
|
1002
992
|
result = await self.current_game.validate_move(r1, c1, r2, c2, count)
|
|
1003
|
-
|
|
1004
|
-
if not result.success:
|
|
1005
|
-
self.current_game.invalid_moves += 1
|
|
993
|
+
self.current_game.record_move((r1, c1, r2, c2), result.success)
|
|
1006
994
|
|
|
1007
995
|
await self.send_result(
|
|
1008
996
|
result.success, result.message, "BRIDGED" if result.success else "INVALID_BRIDGE"
|
|
@@ -1015,7 +1003,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
1015
1003
|
await self.send_game_complete()
|
|
1016
1004
|
|
|
1017
1005
|
except ValueError:
|
|
1018
|
-
self.current_game.invalid_moves += 1
|
|
1019
1006
|
await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
|
|
1020
1007
|
return
|
|
1021
1008
|
|
|
@@ -1028,9 +1015,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
1028
1015
|
direction = parts[1].lower()
|
|
1029
1016
|
|
|
1030
1017
|
result = await self.current_game.validate_move(direction)
|
|
1031
|
-
|
|
1032
|
-
if not result.success:
|
|
1033
|
-
self.current_game.invalid_moves += 1
|
|
1018
|
+
self.current_game.record_move((direction,), result.success)
|
|
1034
1019
|
|
|
1035
1020
|
await self.send_result(result.success, result.message, "MOVED" if result.success else "INVALID_MOVE")
|
|
1036
1021
|
|
|
@@ -1054,9 +1039,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
1054
1039
|
start_time = int(parts[3])
|
|
1055
1040
|
|
|
1056
1041
|
result = await self.current_game.validate_move(task_id, worker_id, start_time)
|
|
1057
|
-
|
|
1058
|
-
if not result.success:
|
|
1059
|
-
self.current_game.invalid_moves += 1
|
|
1042
|
+
self.current_game.record_move((task_id,), result.success)
|
|
1060
1043
|
|
|
1061
1044
|
await self.send_result(
|
|
1062
1045
|
result.success, result.message, "ASSIGNED" if result.success else "INVALID_ASSIGN"
|
|
@@ -1068,7 +1051,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
1068
1051
|
await self.send_game_complete()
|
|
1069
1052
|
|
|
1070
1053
|
except ValueError:
|
|
1071
|
-
self.current_game.invalid_moves += 1
|
|
1072
1054
|
await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
|
|
1073
1055
|
return
|
|
1074
1056
|
|
|
@@ -1081,9 +1063,7 @@ class ArcadeHandler(TelnetHandler):
|
|
|
1081
1063
|
task_id = int(parts[1])
|
|
1082
1064
|
|
|
1083
1065
|
result = await self.current_game.validate_move(task_id, 0, -1)
|
|
1084
|
-
|
|
1085
|
-
if not result.success:
|
|
1086
|
-
self.current_game.invalid_moves += 1
|
|
1066
|
+
self.current_game.record_move((task_id,), result.success)
|
|
1087
1067
|
|
|
1088
1068
|
await self.send_result(
|
|
1089
1069
|
result.success, result.message, "UNASSIGNED" if result.success else "INVALID_UNASSIGN"
|
|
@@ -1093,7 +1073,6 @@ class ArcadeHandler(TelnetHandler):
|
|
|
1093
1073
|
await self.display_puzzle()
|
|
1094
1074
|
|
|
1095
1075
|
except ValueError:
|
|
1096
|
-
self.current_game.invalid_moves += 1
|
|
1097
1076
|
await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
|
|
1098
1077
|
return
|
|
1099
1078
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chuk-puzzles-gym
|
|
3
|
-
Version: 0.10
|
|
3
|
+
Version: 0.10.2
|
|
4
4
|
Summary: Multi-game puzzle gym for LLM training and benchmarking - 30 constraint puzzles with synthetic data generation
|
|
5
5
|
Author: Chris Hay
|
|
6
6
|
License: MIT
|
|
@@ -93,10 +93,17 @@ Once connected, type `help` to see available games, or `sudoku easy` to start pl
|
|
|
93
93
|
- Enable with `mode agent` command
|
|
94
94
|
- Machine-parseable grid format with clear start/end markers
|
|
95
95
|
- Compact output optimized for LLM tool integration
|
|
96
|
+
- **Reasoning Depth Metrics** - Measure *how* agents reason, not just if they succeed
|
|
97
|
+
- Backtrack detection (did the agent revise previous placements?)
|
|
98
|
+
- Progress steadiness (monotonic advance toward solution?)
|
|
99
|
+
- Error streak analysis (isolated mistakes vs. clustered confusion?)
|
|
100
|
+
- Reasoning overhead (wasted work relative to optimal path)
|
|
101
|
+
- Solver distance traces (remaining work after each valid move)
|
|
102
|
+
- Available in all paths: Gym env, eval harness, and server (telnet/WebSocket)
|
|
96
103
|
- **Evaluation Harness** (`chuk-puzzles-eval`) - Built-in benchmarking CLI
|
|
97
104
|
- Batch evaluation with configurable episodes
|
|
98
105
|
- Multiple output formats (JSON, CSV, Markdown)
|
|
99
|
-
- Metrics: moves, invalid moves, hints, solve time
|
|
106
|
+
- Metrics: moves, invalid moves, hints, solve time, reasoning depth
|
|
100
107
|
- Reproducible with deterministic seeds
|
|
101
108
|
- **Dataset Export** (`chuk-puzzles-export`) - Synthetic data generation for LLM training
|
|
102
109
|
- JSONL output with complete problem definitions and solutions
|
|
@@ -500,6 +507,7 @@ games = PuzzleEnv.available_games()
|
|
|
500
507
|
|
|
501
508
|
- **All 30 games** accessible through unified API
|
|
502
509
|
- **Configurable rewards** for correct moves, invalid attempts, completion bonuses
|
|
510
|
+
- **Reasoning depth metrics** tracking backtracks, progress steadiness, error patterns
|
|
503
511
|
- **Hint system** with optional budget limits
|
|
504
512
|
- **Solver-free mode** for pure reasoning benchmarks
|
|
505
513
|
- **Efficiency scoring** based on optimal step counts
|
|
@@ -515,8 +523,25 @@ obs = {
|
|
|
515
523
|
"moves": 5,
|
|
516
524
|
"invalid_moves": 1,
|
|
517
525
|
"hints_used": 2,
|
|
526
|
+
"hints_remaining": 98,
|
|
518
527
|
"is_complete": False,
|
|
519
|
-
"grid": [[4, 0, 8, ...], ...] # Game-specific state
|
|
528
|
+
"grid": [[4, 0, 8, ...], ...], # Game-specific state
|
|
529
|
+
"render": " | 1 2 3 | ...", # ASCII grid
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
# Info dict includes reasoning metrics and difficulty profile
|
|
533
|
+
info = {
|
|
534
|
+
"optimal_steps": 45,
|
|
535
|
+
"difficulty_profile": {"logic_depth": 2, "branching_factor": 2.0, ...},
|
|
536
|
+
"reasoning_metrics": {
|
|
537
|
+
"backtrack_count": 0,
|
|
538
|
+
"backtrack_rate": 0.0,
|
|
539
|
+
"progress_velocity": 1.0,
|
|
540
|
+
"progress_steadiness": 1.0,
|
|
541
|
+
"reasoning_overhead": 1.0,
|
|
542
|
+
"error_streak_max": 0,
|
|
543
|
+
"solver_distance_trace": [44, 43, 42, ...],
|
|
544
|
+
},
|
|
520
545
|
}
|
|
521
546
|
```
|
|
522
547
|
|
|
@@ -546,6 +571,89 @@ config = SolverConfig(hint_budget=5, hint_penalty=0.1)
|
|
|
546
571
|
env = PuzzleEnv("sudoku", solver_config=config)
|
|
547
572
|
```
|
|
548
573
|
|
|
574
|
+
## Reasoning Depth Metrics
|
|
575
|
+
|
|
576
|
+
Beyond binary success/failure, the system measures **how** an agent reasons through puzzles. These metrics are available in all interaction paths: the Gym environment, the evaluation harness, and the telnet/WebSocket server.
|
|
577
|
+
|
|
578
|
+
### Metrics
|
|
579
|
+
|
|
580
|
+
| Metric | Description | Perfect Score |
|
|
581
|
+
|--------|-------------|---------------|
|
|
582
|
+
| `backtrack_count` | Times the agent revised a previous placement | 0 |
|
|
583
|
+
| `backtrack_rate` | Fraction of valid moves that were backtracks | 0% |
|
|
584
|
+
| `progress_velocity` | Average cells solved per step | 1.0 |
|
|
585
|
+
| `progress_steadiness` | How monotonically remaining work decreases (1.0 = never stalls) | 100% |
|
|
586
|
+
| `reasoning_overhead` | Total actions / optimal path length (1.0 = no waste) | 1.0x |
|
|
587
|
+
| `error_streak_max` | Longest run of consecutive invalid moves | 0 |
|
|
588
|
+
| `avg_error_streak` | Average length of error bursts | 0.0 |
|
|
589
|
+
| `solver_distance_trace` | Remaining positions after each valid move | Monotonically decreasing |
|
|
590
|
+
|
|
591
|
+
### Usage in Gym Environment
|
|
592
|
+
|
|
593
|
+
```python
|
|
594
|
+
from chuk_puzzles_gym.gym_env import PuzzleEnv
|
|
595
|
+
|
|
596
|
+
env = PuzzleEnv("sudoku", difficulty="easy", seed=42)
|
|
597
|
+
obs, info = await env.reset()
|
|
598
|
+
|
|
599
|
+
# Reasoning metrics available in info after reset
|
|
600
|
+
print(info["reasoning_metrics"])
|
|
601
|
+
|
|
602
|
+
# ... agent plays ...
|
|
603
|
+
obs, reward, terminated, truncated, info = await env.step("place 1 1 5")
|
|
604
|
+
|
|
605
|
+
# On episode end, info includes full reasoning metrics
|
|
606
|
+
if terminated:
|
|
607
|
+
metrics = info["reasoning_metrics"]
|
|
608
|
+
print(f"Backtrack rate: {metrics['backtrack_rate']:.0%}")
|
|
609
|
+
print(f"Overhead: {metrics['reasoning_overhead']:.1f}x")
|
|
610
|
+
print(f"Steadiness: {metrics['progress_steadiness']:.0%}")
|
|
611
|
+
```
|
|
612
|
+
|
|
613
|
+
### Usage in Server (Telnet/WebSocket)
|
|
614
|
+
|
|
615
|
+
Reasoning metrics are included automatically in server output:
|
|
616
|
+
|
|
617
|
+
- **JSON mode**: `reasoning_metrics` dict in every state response and completion message
|
|
618
|
+
- **STRICT mode**: `BT=`, `OH=`, `ST=` fields appended to STATS and COMPLETE messages
|
|
619
|
+
- **Normal mode**: "Reasoning Depth" section shown on completion and in `stats` command
|
|
620
|
+
|
|
621
|
+
```
|
|
622
|
+
> mode json
|
|
623
|
+
> place 1 1 5
|
|
624
|
+
{"type":"result","success":true,...,"state":{...,"reasoning_metrics":{"backtrack_count":0,...}}}
|
|
625
|
+
|
|
626
|
+
> stats
|
|
627
|
+
{"type":"stats",...,"reasoning_metrics":{"backtrack_count":0,"backtrack_rate":0.0,...}}
|
|
628
|
+
```
|
|
629
|
+
|
|
630
|
+
### Usage in Evaluation Harness
|
|
631
|
+
|
|
632
|
+
```bash
|
|
633
|
+
# Reasoning metrics included in all output formats
|
|
634
|
+
chuk-puzzles-eval sudoku -d easy -n 10 -o json
|
|
635
|
+
```
|
|
636
|
+
|
|
637
|
+
```python
|
|
638
|
+
from chuk_puzzles_gym.eval import evaluate_game
|
|
639
|
+
|
|
640
|
+
report = await evaluate_game("sudoku", difficulty="easy", episodes=10)
|
|
641
|
+
report.print_summary() # Includes "Reasoning Depth" section
|
|
642
|
+
|
|
643
|
+
# Aggregate metrics
|
|
644
|
+
print(f"Avg backtrack rate: {report.avg_backtrack_rate:.0%}")
|
|
645
|
+
print(f"Avg overhead: {report.avg_reasoning_overhead:.1f}x")
|
|
646
|
+
print(f"Avg steadiness: {report.avg_progress_steadiness:.0%}")
|
|
647
|
+
```
|
|
648
|
+
|
|
649
|
+
### What the Metrics Reveal
|
|
650
|
+
|
|
651
|
+
A **perfect solver** shows: 0 backtracks, 1.0x overhead, 100% steadiness, 1.0 velocity.
|
|
652
|
+
|
|
653
|
+
A **struggling agent** shows: high backtrack rate (revising decisions), error streaks (clustered confusion), low steadiness (stalling progress), and high overhead (wasted work).
|
|
654
|
+
|
|
655
|
+
These patterns are visible even when two agents both eventually solve a puzzle — the metrics expose the **quality of the reasoning path**, not just the outcome.
|
|
656
|
+
|
|
549
657
|
## Evaluation Harness
|
|
550
658
|
|
|
551
659
|
The project includes a built-in **evaluation harness** for benchmarking puzzle-solving agents:
|
|
@@ -604,6 +712,12 @@ Avg Time: 12ms
|
|
|
604
712
|
| `hints_used` | Number of hints requested |
|
|
605
713
|
| `wall_time_ms` | Time to solve in milliseconds |
|
|
606
714
|
| `seed` | Puzzle seed for reproducibility |
|
|
715
|
+
| `backtrack_count` | Times agent revised a previous placement |
|
|
716
|
+
| `backtrack_rate` | Fraction of valid moves that were backtracks |
|
|
717
|
+
| `progress_steadiness` | How monotonically progress advances (1.0 = perfect) |
|
|
718
|
+
| `reasoning_overhead` | Total actions / optimal path (1.0 = no waste) |
|
|
719
|
+
| `error_streak_max` | Longest run of consecutive invalid moves |
|
|
720
|
+
| `progress_velocity` | Average cells solved per step |
|
|
607
721
|
|
|
608
722
|
## Dataset Export
|
|
609
723
|
|
|
@@ -1194,12 +1308,13 @@ chuk-puzzles-gym/
|
|
|
1194
1308
|
│ │ ├── base.py # GridPosition, MoveResult
|
|
1195
1309
|
│ │ ├── config.py # Base GameConfig
|
|
1196
1310
|
│ │ ├── enums.py # DifficultyLevel, GameCommand, etc.
|
|
1311
|
+
│ │ ├── evaluation.py # ReasoningMetrics, EpisodeResult, EvaluationSummary
|
|
1197
1312
|
│ │ └── games.py # Game-specific models (Cage, Task, etc.)
|
|
1198
1313
|
│ └── games/ # Self-contained game modules
|
|
1199
1314
|
│ ├── __init__.py # AVAILABLE_GAMES registry
|
|
1200
1315
|
│ ├── _base/ # Base classes
|
|
1201
1316
|
│ │ ├── __init__.py
|
|
1202
|
-
│ │ ├── game.py # PuzzleGame ABC
|
|
1317
|
+
│ │ ├── game.py # PuzzleGame ABC + ReasoningTracker
|
|
1203
1318
|
│ │ └── commands.py # GameCommandHandler ABC
|
|
1204
1319
|
│ ├── sudoku/ # Example game module
|
|
1205
1320
|
│ │ ├── __init__.py # Exports SudokuGame
|
|
@@ -1226,6 +1341,7 @@ chuk-puzzles-gym/
|
|
|
1226
1341
|
│ ├── example_graph_coloring.py # Graph Coloring game logic demo
|
|
1227
1342
|
│ ├── example_cryptarithmetic.py# Cryptarithmetic game logic demo
|
|
1228
1343
|
│ ├── example_rush_hour.py # Rush Hour game logic demo
|
|
1344
|
+
│ ├── example_reasoning_metrics.py # Reasoning depth metrics demo
|
|
1229
1345
|
│ └── README.md # Example usage guide
|
|
1230
1346
|
├── .github/workflows/ # CI/CD workflows
|
|
1231
1347
|
├── pyproject.toml # Modern Python project config
|
|
@@ -1465,9 +1581,10 @@ See [ROADMAP.md](ROADMAP.md) for the full development roadmap.
|
|
|
1465
1581
|
### Highlights
|
|
1466
1582
|
|
|
1467
1583
|
**Benchmarking & Metrics**
|
|
1468
|
-
- Puzzle complexity metrics (constraint count, variable count, branching factor)
|
|
1469
|
-
- Episode model for tracking game sessions
|
|
1470
|
-
-
|
|
1584
|
+
- ~~Puzzle complexity metrics~~ (implemented: constraint count, variable count, branching factor)
|
|
1585
|
+
- ~~Episode model for tracking game sessions~~ (implemented: EpisodeResult with ReasoningMetrics)
|
|
1586
|
+
- ~~Reasoning depth metrics~~ (implemented: backtrack detection, progress steadiness, error patterns)
|
|
1587
|
+
- ~~Trace logging for offline analysis~~ (implemented: solver distance traces in all output paths)
|
|
1471
1588
|
|
|
1472
1589
|
**Agent Evaluation Tools**
|
|
1473
1590
|
- Batch evaluation harness CLI
|