chuk-puzzles-gym 0.10__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. chuk_puzzles_gym/eval.py +168 -46
  2. chuk_puzzles_gym/export/dataset.py +7 -1
  3. chuk_puzzles_gym/games/_base/game.py +123 -0
  4. chuk_puzzles_gym/games/binary/game.py +2 -0
  5. chuk_puzzles_gym/games/bridges/game.py +2 -0
  6. chuk_puzzles_gym/games/cryptarithmetic/game.py +5 -0
  7. chuk_puzzles_gym/games/einstein/game.py +2 -0
  8. chuk_puzzles_gym/games/fillomino/game.py +2 -0
  9. chuk_puzzles_gym/games/futoshiki/game.py +2 -0
  10. chuk_puzzles_gym/games/graph_coloring/commands.py +20 -3
  11. chuk_puzzles_gym/games/graph_coloring/game.py +8 -1
  12. chuk_puzzles_gym/games/hidato/game.py +2 -0
  13. chuk_puzzles_gym/games/hitori/game.py +2 -0
  14. chuk_puzzles_gym/games/kakuro/game.py +2 -0
  15. chuk_puzzles_gym/games/kenken/game.py +2 -0
  16. chuk_puzzles_gym/games/killer_sudoku/game.py +2 -0
  17. chuk_puzzles_gym/games/knapsack/game.py +2 -0
  18. chuk_puzzles_gym/games/lights_out/game.py +2 -0
  19. chuk_puzzles_gym/games/logic_grid/game.py +2 -0
  20. chuk_puzzles_gym/games/mastermind/game.py +2 -0
  21. chuk_puzzles_gym/games/minesweeper/game.py +2 -0
  22. chuk_puzzles_gym/games/nonogram/game.py +2 -0
  23. chuk_puzzles_gym/games/nqueens/game.py +5 -0
  24. chuk_puzzles_gym/games/numberlink/game.py +6 -0
  25. chuk_puzzles_gym/games/nurikabe/game.py +2 -0
  26. chuk_puzzles_gym/games/rush_hour/game.py +4 -0
  27. chuk_puzzles_gym/games/scheduler/game.py +2 -0
  28. chuk_puzzles_gym/games/shikaku/game.py +2 -0
  29. chuk_puzzles_gym/games/skyscrapers/game.py +5 -0
  30. chuk_puzzles_gym/games/slitherlink/game.py +2 -0
  31. chuk_puzzles_gym/games/sokoban/game.py +2 -0
  32. chuk_puzzles_gym/games/star_battle/game.py +2 -0
  33. chuk_puzzles_gym/games/sudoku/game.py +2 -0
  34. chuk_puzzles_gym/games/tents/game.py +2 -0
  35. chuk_puzzles_gym/gym_env.py +21 -5
  36. chuk_puzzles_gym/models/__init__.py +2 -0
  37. chuk_puzzles_gym/models/evaluation.py +165 -1
  38. chuk_puzzles_gym/server.py +51 -72
  39. {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/METADATA +124 -7
  40. {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/RECORD +43 -43
  41. {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/WHEEL +0 -0
  42. {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/entry_points.txt +0 -0
  43. {chuk_puzzles_gym-0.10.dist-info → chuk_puzzles_gym-0.10.2.dist-info}/top_level.txt +0 -0
@@ -63,6 +63,9 @@ class ArcadeHandler(TelnetHandler):
63
63
  if not self.current_game:
64
64
  return
65
65
 
66
+ # Get final reasoning metrics
67
+ reasoning = self.current_game.get_reasoning_metrics().to_dict()
68
+
66
69
  if self.output_mode == OutputMode.JSON:
67
70
  await self.send_json_response(
68
71
  type="complete",
@@ -72,17 +75,27 @@ class ArcadeHandler(TelnetHandler):
72
75
  invalid_moves=self.current_game.invalid_moves,
73
76
  hints_used=self.current_game.hints_used,
74
77
  optimal_steps=self.current_game.optimal_steps,
78
+ reasoning_metrics=reasoning,
75
79
  )
76
80
  elif self.output_mode == OutputMode.STRICT:
77
81
  await self.send_line(
78
82
  f"COMPLETE:{self.current_game.moves_made}:{self.current_game.invalid_moves}:"
79
- f"{self.current_game.hints_used}"
83
+ f"{self.current_game.hints_used}:"
84
+ f"BT={reasoning['backtrack_count']}:"
85
+ f"OH={reasoning['reasoning_overhead']:.2f}:"
86
+ f"ST={reasoning['progress_steadiness']:.2f}"
80
87
  )
81
88
  else:
82
89
  await self.send_line("\n" + "=" * 50)
83
90
  await self.send_line("CONGRATULATIONS! YOU SOLVED IT!")
84
91
  await self.send_line("=" * 50)
85
92
  await self.send_line(self.current_game.get_stats())
93
+ await self.send_line("")
94
+ await self.send_line("Reasoning Depth:")
95
+ await self.send_line(f" Backtrack rate: {reasoning['backtrack_rate']:.0%}")
96
+ await self.send_line(f" Progress steadiness: {reasoning['progress_steadiness']:.0%}")
97
+ await self.send_line(f" Reasoning overhead: {reasoning['reasoning_overhead']:.1f}x optimal")
98
+ await self.send_line(f" Error streak max: {reasoning['error_streak_max']}")
86
99
  await self.send_line("\nType 'menu' to play another game.")
87
100
  await self.send_line("=" * 50 + "\n")
88
101
 
@@ -109,6 +122,9 @@ class ArcadeHandler(TelnetHandler):
109
122
  "constraint_density": profile.constraint_density,
110
123
  }
111
124
 
125
+ # Reasoning depth metrics
126
+ reasoning = self.current_game.get_reasoning_metrics().to_dict()
127
+
112
128
  return {
113
129
  "game": self.current_game.name,
114
130
  "difficulty": self.current_game.difficulty.value,
@@ -120,6 +136,7 @@ class ArcadeHandler(TelnetHandler):
120
136
  "optimal_steps": self.current_game.optimal_steps,
121
137
  "is_complete": self.current_game.is_complete(),
122
138
  "difficulty_profile": profile_dict,
139
+ "reasoning_metrics": reasoning,
123
140
  "grid": grid,
124
141
  }
125
142
 
@@ -435,9 +452,10 @@ class ArcadeHandler(TelnetHandler):
435
452
  return
436
453
 
437
454
  if cmd_enum == GameCommand.STATS:
438
- # Show detailed stats including difficulty profile
455
+ # Show detailed stats including difficulty profile and reasoning metrics
439
456
  profile = self.current_game.difficulty_profile
440
457
  optimal = self.current_game.optimal_steps
458
+ reasoning = self.current_game.get_reasoning_metrics().to_dict()
441
459
 
442
460
  if self.output_mode == OutputMode.JSON:
443
461
  await self.send_json_response(
@@ -455,11 +473,15 @@ class ArcadeHandler(TelnetHandler):
455
473
  "state_observability": profile.state_observability,
456
474
  "constraint_density": profile.constraint_density,
457
475
  },
476
+ reasoning_metrics=reasoning,
458
477
  )
459
478
  elif self.output_mode == OutputMode.STRICT:
460
479
  await self.send_line(
461
480
  f"STATS:{self.current_game.moves_made}:{self.current_game.invalid_moves}:"
462
- f"{self.current_game.hints_used}:{optimal or 0}"
481
+ f"{self.current_game.hints_used}:{optimal or 0}:"
482
+ f"BT={reasoning['backtrack_count']}:"
483
+ f"OH={reasoning['reasoning_overhead']:.2f}:"
484
+ f"ST={reasoning['progress_steadiness']:.2f}"
463
485
  )
464
486
  else:
465
487
  await self.send_line("")
@@ -482,6 +504,15 @@ class ArcadeHandler(TelnetHandler):
482
504
  await self.send_line(f" Optimal steps: {optimal}")
483
505
  await self.send_line(f" Current efficiency: {efficiency:.1%}")
484
506
  await self.send_line("")
507
+ await self.send_line("Reasoning Depth:")
508
+ await self.send_line(f" Backtrack count: {reasoning['backtrack_count']}")
509
+ await self.send_line(f" Backtrack rate: {reasoning['backtrack_rate']:.0%}")
510
+ await self.send_line(f" Progress velocity: {reasoning['progress_velocity']:.2f} cells/step")
511
+ await self.send_line(f" Progress steadiness: {reasoning['progress_steadiness']:.0%}")
512
+ await self.send_line(f" Reasoning overhead: {reasoning['reasoning_overhead']:.1f}x optimal")
513
+ await self.send_line(f" Error streak max: {reasoning['error_streak_max']}")
514
+ await self.send_line(f" Total actions: {reasoning['total_actions']}")
515
+ await self.send_line("")
485
516
  await self.send_line("Difficulty Profile:")
486
517
  await self.send_line(f" Logic depth: {profile.logic_depth}")
487
518
  await self.send_line(f" Branching factor: {profile.branching_factor:.1f}")
@@ -599,10 +630,6 @@ class ArcadeHandler(TelnetHandler):
599
630
  if self.game_handler and cmd_enum in self.game_handler.supported_commands:
600
631
  result = await self.game_handler.handle_command(cmd_enum, parts[1:])
601
632
 
602
- # Track invalid moves
603
- if not result.result.success:
604
- self.current_game.invalid_moves += 1
605
-
606
633
  # Send result based on output mode
607
634
  code = "OK" if result.result.success else "INVALID"
608
635
  await self.send_result(result.result.success, result.result.message, code)
@@ -626,9 +653,7 @@ class ArcadeHandler(TelnetHandler):
626
653
  num = int(parts[3])
627
654
 
628
655
  result = await self.current_game.validate_move(row, col, num)
629
-
630
- if not result.success:
631
- self.current_game.invalid_moves += 1
656
+ self.current_game.record_move((row, col), result.success)
632
657
 
633
658
  await self.send_result(result.success, result.message, "PLACED" if result.success else "INVALID_MOVE")
634
659
 
@@ -639,7 +664,6 @@ class ArcadeHandler(TelnetHandler):
639
664
  await self.send_game_complete()
640
665
 
641
666
  except ValueError:
642
- self.current_game.invalid_moves += 1
643
667
  await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
644
668
  return
645
669
 
@@ -653,9 +677,7 @@ class ArcadeHandler(TelnetHandler):
653
677
  col = int(parts[2])
654
678
 
655
679
  result = await self.current_game.validate_move(row, col, 0)
656
-
657
- if not result.success:
658
- self.current_game.invalid_moves += 1
680
+ self.current_game.record_move((row, col), result.success)
659
681
 
660
682
  await self.send_result(result.success, result.message, "CLEARED" if result.success else "INVALID_CLEAR")
661
683
 
@@ -663,7 +685,6 @@ class ArcadeHandler(TelnetHandler):
663
685
  await self.display_puzzle()
664
686
 
665
687
  except ValueError:
666
- self.current_game.invalid_moves += 1
667
688
  await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
668
689
  return
669
690
 
@@ -693,9 +714,7 @@ class ArcadeHandler(TelnetHandler):
693
714
  col = int(parts[2])
694
715
 
695
716
  result = await self.current_game.validate_move(row, col)
696
-
697
- if not result.success:
698
- self.current_game.invalid_moves += 1
717
+ self.current_game.record_move((row, col), result.success)
699
718
 
700
719
  await self.send_result(result.success, result.message, "PRESSED" if result.success else "INVALID_PRESS")
701
720
 
@@ -706,7 +725,6 @@ class ArcadeHandler(TelnetHandler):
706
725
  await self.send_game_complete()
707
726
 
708
727
  except ValueError:
709
- self.current_game.invalid_moves += 1
710
728
  await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
711
729
  return
712
730
 
@@ -718,9 +736,7 @@ class ArcadeHandler(TelnetHandler):
718
736
 
719
737
  cat1, val1, cat2, val2 = parts[1], parts[2], parts[3], parts[4]
720
738
  result = await self.current_game.validate_move(cat1, val1, cat2, val2, True)
721
-
722
- if not result.success:
723
- self.current_game.invalid_moves += 1
739
+ self.current_game.record_move((cat1, val1, cat2, val2), result.success)
724
740
 
725
741
  await self.send_result(result.success, result.message, "CONNECTED" if result.success else "INVALID_CONNECT")
726
742
  if result.success:
@@ -736,9 +752,7 @@ class ArcadeHandler(TelnetHandler):
736
752
 
737
753
  cat1, val1, cat2, val2 = parts[1], parts[2], parts[3], parts[4]
738
754
  result = await self.current_game.validate_move(cat1, val1, cat2, val2, False)
739
-
740
- if not result.success:
741
- self.current_game.invalid_moves += 1
755
+ self.current_game.record_move((cat1, val1, cat2, val2), result.success)
742
756
 
743
757
  await self.send_result(result.success, result.message, "EXCLUDED" if result.success else "INVALID_EXCLUDE")
744
758
  if result.success:
@@ -758,9 +772,7 @@ class ArcadeHandler(TelnetHandler):
758
772
  col = int(parts[2])
759
773
 
760
774
  result = await self.current_game.validate_move("reveal", row, col)
761
-
762
- if not result.success:
763
- self.current_game.invalid_moves += 1
775
+ self.current_game.record_move((row, col), result.success)
764
776
 
765
777
  await self.send_result(
766
778
  result.success, result.message, "REVEALED" if result.success else "INVALID_REVEAL"
@@ -783,7 +795,6 @@ class ArcadeHandler(TelnetHandler):
783
795
  await self.send_line("=" * 50 + "\n")
784
796
 
785
797
  except ValueError:
786
- self.current_game.invalid_moves += 1
787
798
  await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
788
799
  return
789
800
 
@@ -797,9 +808,7 @@ class ArcadeHandler(TelnetHandler):
797
808
  col = int(parts[2])
798
809
 
799
810
  result = await self.current_game.validate_move("flag", row, col)
800
-
801
- if not result.success:
802
- self.current_game.invalid_moves += 1
811
+ self.current_game.record_move((row, col), result.success)
803
812
 
804
813
  await self.send_result(result.success, result.message, "FLAGGED" if result.success else "INVALID_FLAG")
805
814
 
@@ -807,7 +816,6 @@ class ArcadeHandler(TelnetHandler):
807
816
  await self.display_puzzle()
808
817
 
809
818
  except ValueError:
810
- self.current_game.invalid_moves += 1
811
819
  await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
812
820
  return
813
821
 
@@ -824,9 +832,7 @@ class ArcadeHandler(TelnetHandler):
824
832
  state = int(parts[4])
825
833
 
826
834
  result = await self.current_game.validate_move(edge_type, row, col, state)
827
-
828
- if not result.success:
829
- self.current_game.invalid_moves += 1
835
+ self.current_game.record_move((edge_type, row, col), result.success)
830
836
 
831
837
  await self.send_result(result.success, result.message, "SET" if result.success else "INVALID_SET")
832
838
 
@@ -837,7 +843,6 @@ class ArcadeHandler(TelnetHandler):
837
843
  await self.send_game_complete()
838
844
 
839
845
  except ValueError:
840
- self.current_game.invalid_moves += 1
841
846
  await self.send_result(False, "Invalid input. Use numbers only for row, col, state.", "PARSE_ERROR")
842
847
  return
843
848
 
@@ -851,9 +856,7 @@ class ArcadeHandler(TelnetHandler):
851
856
  guess = [int(p) for p in parts[1:]]
852
857
 
853
858
  result = await self.current_game.validate_move(*guess)
854
-
855
- if not result.success:
856
- self.current_game.invalid_moves += 1
859
+ self.current_game.record_move(tuple(guess), result.success)
857
860
 
858
861
  await self.send_result(result.success, result.message, "GUESSED" if result.success else "INVALID_GUESS")
859
862
 
@@ -874,7 +877,6 @@ class ArcadeHandler(TelnetHandler):
874
877
  await self.send_line("=" * 50 + "\n")
875
878
 
876
879
  except ValueError:
877
- self.current_game.invalid_moves += 1
878
880
  await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
879
881
  return
880
882
 
@@ -888,9 +890,7 @@ class ArcadeHandler(TelnetHandler):
888
890
  item_index = int(parts[1])
889
891
 
890
892
  result = await self.current_game.validate_move("select", item_index)
891
-
892
- if not result.success:
893
- self.current_game.invalid_moves += 1
893
+ self.current_game.record_move((item_index,), result.success)
894
894
 
895
895
  await self.send_result(
896
896
  result.success, result.message, "SELECTED" if result.success else "INVALID_SELECT"
@@ -900,7 +900,6 @@ class ArcadeHandler(TelnetHandler):
900
900
  await self.display_puzzle()
901
901
 
902
902
  except ValueError:
903
- self.current_game.invalid_moves += 1
904
903
  await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
905
904
  return
906
905
 
@@ -913,9 +912,7 @@ class ArcadeHandler(TelnetHandler):
913
912
  item_index = int(parts[1])
914
913
 
915
914
  result = await self.current_game.validate_move("deselect", item_index)
916
-
917
- if not result.success:
918
- self.current_game.invalid_moves += 1
915
+ self.current_game.record_move((item_index,), result.success)
919
916
 
920
917
  await self.send_result(
921
918
  result.success, result.message, "DESELECTED" if result.success else "INVALID_DESELECT"
@@ -925,7 +922,6 @@ class ArcadeHandler(TelnetHandler):
925
922
  await self.display_puzzle()
926
923
 
927
924
  except ValueError:
928
- self.current_game.invalid_moves += 1
929
925
  await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
930
926
  return
931
927
 
@@ -941,9 +937,7 @@ class ArcadeHandler(TelnetHandler):
941
937
  color = parts[3].lower()
942
938
 
943
939
  result = await self.current_game.validate_move(row, col, color)
944
-
945
- if not result.success:
946
- self.current_game.invalid_moves += 1
940
+ self.current_game.record_move((row, col), result.success)
947
941
 
948
942
  await self.send_result(result.success, result.message, "MARKED" if result.success else "INVALID_MARK")
949
943
 
@@ -954,7 +948,6 @@ class ArcadeHandler(TelnetHandler):
954
948
  await self.send_game_complete()
955
949
 
956
950
  except ValueError:
957
- self.current_game.invalid_moves += 1
958
951
  await self.send_result(False, "Invalid input. Row and col must be numbers.", "PARSE_ERROR")
959
952
  return
960
953
 
@@ -969,9 +962,7 @@ class ArcadeHandler(TelnetHandler):
969
962
  col = int(parts[2])
970
963
 
971
964
  result = await self.current_game.validate_move(row, col, "shade")
972
-
973
- if not result.success:
974
- self.current_game.invalid_moves += 1
965
+ self.current_game.record_move((row, col), result.success)
975
966
 
976
967
  await self.send_result(result.success, result.message, "SHADED" if result.success else "INVALID_SHADE")
977
968
 
@@ -982,7 +973,6 @@ class ArcadeHandler(TelnetHandler):
982
973
  await self.send_game_complete()
983
974
 
984
975
  except ValueError:
985
- self.current_game.invalid_moves += 1
986
976
  await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
987
977
  return
988
978
 
@@ -1000,9 +990,7 @@ class ArcadeHandler(TelnetHandler):
1000
990
  count = int(parts[5])
1001
991
 
1002
992
  result = await self.current_game.validate_move(r1, c1, r2, c2, count)
1003
-
1004
- if not result.success:
1005
- self.current_game.invalid_moves += 1
993
+ self.current_game.record_move((r1, c1, r2, c2), result.success)
1006
994
 
1007
995
  await self.send_result(
1008
996
  result.success, result.message, "BRIDGED" if result.success else "INVALID_BRIDGE"
@@ -1015,7 +1003,6 @@ class ArcadeHandler(TelnetHandler):
1015
1003
  await self.send_game_complete()
1016
1004
 
1017
1005
  except ValueError:
1018
- self.current_game.invalid_moves += 1
1019
1006
  await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
1020
1007
  return
1021
1008
 
@@ -1028,9 +1015,7 @@ class ArcadeHandler(TelnetHandler):
1028
1015
  direction = parts[1].lower()
1029
1016
 
1030
1017
  result = await self.current_game.validate_move(direction)
1031
-
1032
- if not result.success:
1033
- self.current_game.invalid_moves += 1
1018
+ self.current_game.record_move((direction,), result.success)
1034
1019
 
1035
1020
  await self.send_result(result.success, result.message, "MOVED" if result.success else "INVALID_MOVE")
1036
1021
 
@@ -1054,9 +1039,7 @@ class ArcadeHandler(TelnetHandler):
1054
1039
  start_time = int(parts[3])
1055
1040
 
1056
1041
  result = await self.current_game.validate_move(task_id, worker_id, start_time)
1057
-
1058
- if not result.success:
1059
- self.current_game.invalid_moves += 1
1042
+ self.current_game.record_move((task_id,), result.success)
1060
1043
 
1061
1044
  await self.send_result(
1062
1045
  result.success, result.message, "ASSIGNED" if result.success else "INVALID_ASSIGN"
@@ -1068,7 +1051,6 @@ class ArcadeHandler(TelnetHandler):
1068
1051
  await self.send_game_complete()
1069
1052
 
1070
1053
  except ValueError:
1071
- self.current_game.invalid_moves += 1
1072
1054
  await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
1073
1055
  return
1074
1056
 
@@ -1081,9 +1063,7 @@ class ArcadeHandler(TelnetHandler):
1081
1063
  task_id = int(parts[1])
1082
1064
 
1083
1065
  result = await self.current_game.validate_move(task_id, 0, -1)
1084
-
1085
- if not result.success:
1086
- self.current_game.invalid_moves += 1
1066
+ self.current_game.record_move((task_id,), result.success)
1087
1067
 
1088
1068
  await self.send_result(
1089
1069
  result.success, result.message, "UNASSIGNED" if result.success else "INVALID_UNASSIGN"
@@ -1093,7 +1073,6 @@ class ArcadeHandler(TelnetHandler):
1093
1073
  await self.display_puzzle()
1094
1074
 
1095
1075
  except ValueError:
1096
- self.current_game.invalid_moves += 1
1097
1076
  await self.send_result(False, "Invalid input. Use numbers only.", "PARSE_ERROR")
1098
1077
  return
1099
1078
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chuk-puzzles-gym
3
- Version: 0.10
3
+ Version: 0.10.2
4
4
  Summary: Multi-game puzzle gym for LLM training and benchmarking - 30 constraint puzzles with synthetic data generation
5
5
  Author: Chris Hay
6
6
  License: MIT
@@ -93,10 +93,17 @@ Once connected, type `help` to see available games, or `sudoku easy` to start pl
93
93
  - Enable with `mode agent` command
94
94
  - Machine-parseable grid format with clear start/end markers
95
95
  - Compact output optimized for LLM tool integration
96
+ - **Reasoning Depth Metrics** - Measure *how* agents reason, not just whether they succeed
97
+ - Backtrack detection (did the agent revise previous placements?)
98
+ - Progress steadiness (does the agent advance monotonically toward the solution?)
99
+ - Error streak analysis (isolated mistakes vs. clustered confusion?)
100
+ - Reasoning overhead (wasted work relative to optimal path)
101
+ - Solver distance traces (remaining work after each valid move)
102
+ - Available in all paths: Gym env, eval harness, and server (telnet/WebSocket)
96
103
  - **Evaluation Harness** (`chuk-puzzles-eval`) - Built-in benchmarking CLI
97
104
  - Batch evaluation with configurable episodes
98
105
  - Multiple output formats (JSON, CSV, Markdown)
99
- - Metrics: moves, invalid moves, hints, solve time
106
+ - Metrics: moves, invalid moves, hints, solve time, reasoning depth
100
107
  - Reproducible with deterministic seeds
101
108
  - **Dataset Export** (`chuk-puzzles-export`) - Synthetic data generation for LLM training
102
109
  - JSONL output with complete problem definitions and solutions
@@ -500,6 +507,7 @@ games = PuzzleEnv.available_games()
500
507
 
501
508
  - **All 30 games** accessible through unified API
502
509
  - **Configurable rewards** for correct moves, invalid attempts, completion bonuses
510
+ - **Reasoning depth metrics** tracking backtracks, progress steadiness, error patterns
503
511
  - **Hint system** with optional budget limits
504
512
  - **Solver-free mode** for pure reasoning benchmarks
505
513
  - **Efficiency scoring** based on optimal step counts
@@ -515,8 +523,25 @@ obs = {
515
523
  "moves": 5,
516
524
  "invalid_moves": 1,
517
525
  "hints_used": 2,
526
+ "hints_remaining": 98,
518
527
  "is_complete": False,
519
- "grid": [[4, 0, 8, ...], ...] # Game-specific state
528
+ "grid": [[4, 0, 8, ...], ...], # Game-specific state
529
+ "render": " | 1 2 3 | ...", # ASCII grid
530
+ }
531
+
532
+ # Info dict includes reasoning metrics and difficulty profile
533
+ info = {
534
+ "optimal_steps": 45,
535
+ "difficulty_profile": {"logic_depth": 2, "branching_factor": 2.0, ...},
536
+ "reasoning_metrics": {
537
+ "backtrack_count": 0,
538
+ "backtrack_rate": 0.0,
539
+ "progress_velocity": 1.0,
540
+ "progress_steadiness": 1.0,
541
+ "reasoning_overhead": 1.0,
542
+ "error_streak_max": 0,
543
+ "solver_distance_trace": [44, 43, 42, ...],
544
+ },
520
545
  }
521
546
  ```
522
547
 
@@ -546,6 +571,89 @@ config = SolverConfig(hint_budget=5, hint_penalty=0.1)
546
571
  env = PuzzleEnv("sudoku", solver_config=config)
547
572
  ```
548
573
 
574
+ ## Reasoning Depth Metrics
575
+
576
+ Beyond binary success/failure, the system measures **how** an agent reasons through puzzles. These metrics are available in all interaction paths: the Gym environment, the evaluation harness, and the telnet/WebSocket server.
577
+
578
+ ### Metrics
579
+
580
+ | Metric | Description | Perfect Score |
581
+ |--------|-------------|---------------|
582
+ | `backtrack_count` | Times the agent revised a previous placement | 0 |
583
+ | `backtrack_rate` | Fraction of valid moves that were backtracks | 0% |
584
+ | `progress_velocity` | Average cells solved per step | 1.0 |
585
+ | `progress_steadiness` | How monotonically remaining work decreases (1.0 = never stalls) | 100% |
586
+ | `reasoning_overhead` | Total actions / optimal path length (1.0 = no waste) | 1.0x |
587
+ | `error_streak_max` | Longest run of consecutive invalid moves | 0 |
588
+ | `avg_error_streak` | Average length of error bursts | 0.0 |
589
+ | `solver_distance_trace` | Remaining positions after each valid move | Monotonically decreasing |
590
+
591
+ ### Usage in Gym Environment
592
+
593
+ ```python
594
+ from chuk_puzzles_gym.gym_env import PuzzleEnv
595
+
596
+ env = PuzzleEnv("sudoku", difficulty="easy", seed=42)
597
+ obs, info = await env.reset()
598
+
599
+ # Reasoning metrics available in info after reset
600
+ print(info["reasoning_metrics"])
601
+
602
+ # ... agent plays ...
603
+ obs, reward, terminated, truncated, info = await env.step("place 1 1 5")
604
+
605
+ # On episode end, info includes full reasoning metrics
606
+ if terminated:
607
+ metrics = info["reasoning_metrics"]
608
+ print(f"Backtrack rate: {metrics['backtrack_rate']:.0%}")
609
+ print(f"Overhead: {metrics['reasoning_overhead']:.1f}x")
610
+ print(f"Steadiness: {metrics['progress_steadiness']:.0%}")
611
+ ```
612
+
613
+ ### Usage in Server (Telnet/WebSocket)
614
+
615
+ Reasoning metrics are included automatically in server output:
616
+
617
+ - **JSON mode**: `reasoning_metrics` dict in every state response and completion message
618
+ - **STRICT mode**: `BT=`, `OH=`, `ST=` fields appended to STATS and COMPLETE messages
619
+ - **Normal mode**: "Reasoning Depth" section shown on completion and in the output of the `stats` command
620
+
621
+ ```
622
+ > mode json
623
+ > place 1 1 5
624
+ {"type":"result","success":true,...,"state":{...,"reasoning_metrics":{"backtrack_count":0,...}}}
625
+
626
+ > stats
627
+ {"type":"stats",...,"reasoning_metrics":{"backtrack_count":0,"backtrack_rate":0.0,...}}
628
+ ```
629
+
630
+ ### Usage in Evaluation Harness
631
+
632
+ ```bash
633
+ # Reasoning metrics included in all output formats
634
+ chuk-puzzles-eval sudoku -d easy -n 10 -o json
635
+ ```
636
+
637
+ ```python
638
+ from chuk_puzzles_gym.eval import evaluate_game
639
+
640
+ report = await evaluate_game("sudoku", difficulty="easy", episodes=10)
641
+ report.print_summary() # Includes "Reasoning Depth" section
642
+
643
+ # Aggregate metrics
644
+ print(f"Avg backtrack rate: {report.avg_backtrack_rate:.0%}")
645
+ print(f"Avg overhead: {report.avg_reasoning_overhead:.1f}x")
646
+ print(f"Avg steadiness: {report.avg_progress_steadiness:.0%}")
647
+ ```
648
+
649
+ ### What the Metrics Reveal
650
+
651
+ A **perfect solver** shows: 0 backtracks, 1.0x overhead, 100% steadiness, 1.0 velocity.
652
+
653
+ A **struggling agent** shows: high backtrack rate (revising decisions), error streaks (clustered confusion), low steadiness (stalling progress), and high overhead (wasted work).
654
+
655
+ These patterns are visible even when two agents both eventually solve a puzzle — the metrics expose the **quality of the reasoning path**, not just the outcome.
656
+
549
657
  ## Evaluation Harness
550
658
 
551
659
  The project includes a built-in **evaluation harness** for benchmarking puzzle-solving agents:
@@ -604,6 +712,12 @@ Avg Time: 12ms
604
712
  | `hints_used` | Number of hints requested |
605
713
  | `wall_time_ms` | Time to solve in milliseconds |
606
714
  | `seed` | Puzzle seed for reproducibility |
715
+ | `backtrack_count` | Times agent revised a previous placement |
716
+ | `backtrack_rate` | Fraction of valid moves that were backtracks |
717
+ | `progress_steadiness` | How monotonically progress advances (1.0 = perfect) |
718
+ | `reasoning_overhead` | Total actions / optimal path (1.0 = no waste) |
719
+ | `error_streak_max` | Longest run of consecutive invalid moves |
720
+ | `progress_velocity` | Average cells solved per step |
607
721
 
608
722
  ## Dataset Export
609
723
 
@@ -1194,12 +1308,13 @@ chuk-puzzles-gym/
1194
1308
  │ │ ├── base.py # GridPosition, MoveResult
1195
1309
  │ │ ├── config.py # Base GameConfig
1196
1310
  │ │ ├── enums.py # DifficultyLevel, GameCommand, etc.
1311
+ │ │ ├── evaluation.py # ReasoningMetrics, EpisodeResult, EvaluationSummary
1197
1312
  │ │ └── games.py # Game-specific models (Cage, Task, etc.)
1198
1313
  │ └── games/ # Self-contained game modules
1199
1314
  │ ├── __init__.py # AVAILABLE_GAMES registry
1200
1315
  │ ├── _base/ # Base classes
1201
1316
  │ │ ├── __init__.py
1202
- │ │ ├── game.py # PuzzleGame ABC
1317
+ │ │ ├── game.py # PuzzleGame ABC + ReasoningTracker
1203
1318
  │ │ └── commands.py # GameCommandHandler ABC
1204
1319
  │ ├── sudoku/ # Example game module
1205
1320
  │ │ ├── __init__.py # Exports SudokuGame
@@ -1226,6 +1341,7 @@ chuk-puzzles-gym/
1226
1341
  │ ├── example_graph_coloring.py # Graph Coloring game logic demo
1227
1342
  │ ├── example_cryptarithmetic.py# Cryptarithmetic game logic demo
1228
1343
  │ ├── example_rush_hour.py # Rush Hour game logic demo
1344
+ │ ├── example_reasoning_metrics.py # Reasoning depth metrics demo
1229
1345
  │ └── README.md # Example usage guide
1230
1346
  ├── .github/workflows/ # CI/CD workflows
1231
1347
  ├── pyproject.toml # Modern Python project config
@@ -1465,9 +1581,10 @@ See [ROADMAP.md](ROADMAP.md) for the full development roadmap.
1465
1581
  ### Highlights
1466
1582
 
1467
1583
  **Benchmarking & Metrics**
1468
- - Puzzle complexity metrics (constraint count, variable count, branching factor)
1469
- - Episode model for tracking game sessions
1470
- - Trace logging for offline analysis
1584
+ - ~~Puzzle complexity metrics~~ (implemented: constraint count, variable count, branching factor)
1585
+ - ~~Episode model for tracking game sessions~~ (implemented: EpisodeResult with ReasoningMetrics)
1586
+ - ~~Reasoning depth metrics~~ (implemented: backtrack detection, progress steadiness, error patterns)
1587
+ - ~~Trace logging for offline analysis~~ (implemented: solver distance traces in all output paths)
1471
1588
 
1472
1589
  **Agent Evaluation Tools**
1473
1590
  - Batch evaluation harness CLI