synth-ai 0.2.4.dev7__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (154) hide show
  1. synth_ai/__init__.py +1 -1
  2. synth_ai/cli/__init__.py +6 -0
  3. synth_ai/cli/balance.py +3 -15
  4. synth_ai/cli/demo.py +68 -9
  5. synth_ai/cli/rl_demo.py +137 -0
  6. synth_ai/cli/root.py +65 -0
  7. synth_ai/config/base_url.py +47 -0
  8. synth_ai/demos/core/__init__.py +1 -0
  9. synth_ai/demos/core/cli.py +621 -0
  10. synth_ai/demos/demo_task_apps/__init__.py +1 -0
  11. synth_ai/demos/demo_task_apps/core.py +374 -0
  12. synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
  13. synth_ai/demos/demo_task_apps/math/app.py +37 -0
  14. synth_ai/demos/demo_task_apps/math/config.toml +44 -0
  15. synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
  16. synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
  17. synth_ai/environments/examples/bandit/__init__.py +33 -0
  18. synth_ai/environments/examples/bandit/engine.py +294 -0
  19. synth_ai/environments/examples/bandit/environment.py +194 -0
  20. synth_ai/environments/examples/bandit/taskset.py +200 -0
  21. synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
  22. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
  23. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
  24. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
  25. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
  26. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
  27. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
  28. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
  29. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
  30. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
  31. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
  32. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
  33. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
  34. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
  35. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
  36. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
  37. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
  38. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
  39. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
  40. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
  41. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
  42. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
  43. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
  44. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
  45. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
  46. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
  47. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
  48. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
  49. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
  50. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
  51. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
  52. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
  53. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
  54. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
  55. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
  56. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
  57. synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
  58. synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
  59. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
  60. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
  61. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
  62. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
  63. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
  64. synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
  65. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
  66. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
  67. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
  68. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
  69. synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
  70. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
  71. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
  72. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
  73. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
  74. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
  75. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
  76. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
  77. synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
  78. synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
  79. synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
  80. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
  81. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
  82. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
  83. synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
  84. synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
  85. synth_ai/environments/examples/crafter_classic/environment.py +41 -2
  86. synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
  87. synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
  88. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
  89. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
  90. synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
  91. synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
  92. synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
  93. synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
  94. synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
  95. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
  96. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
  97. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
  98. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
  99. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
  100. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
  101. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
  102. synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
  103. synth_ai/environments/examples/red/units/__init__.py +1 -0
  104. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
  105. synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
  106. synth_ai/environments/service/app.py +8 -0
  107. synth_ai/http.py +102 -0
  108. synth_ai/inference/__init__.py +7 -0
  109. synth_ai/inference/client.py +20 -0
  110. synth_ai/install_sqld.sh +40 -0
  111. synth_ai/jobs/client.py +246 -0
  112. synth_ai/learning/__init__.py +24 -0
  113. synth_ai/learning/client.py +149 -0
  114. synth_ai/learning/config.py +43 -0
  115. synth_ai/learning/constants.py +29 -0
  116. synth_ai/learning/ft_client.py +59 -0
  117. synth_ai/learning/health.py +43 -0
  118. synth_ai/learning/jobs.py +205 -0
  119. synth_ai/learning/rl_client.py +256 -0
  120. synth_ai/learning/sse.py +58 -0
  121. synth_ai/learning/validators.py +48 -0
  122. synth_ai/lm/core/main_v3.py +13 -0
  123. synth_ai/lm/core/synth_models.py +48 -0
  124. synth_ai/lm/core/vendor_clients.py +9 -6
  125. synth_ai/lm/vendors/core/openai_api.py +31 -3
  126. synth_ai/lm/vendors/openai_standard.py +45 -14
  127. synth_ai/lm/vendors/supported/custom_endpoint.py +12 -2
  128. synth_ai/lm/vendors/synth_client.py +372 -28
  129. synth_ai/rl/__init__.py +30 -0
  130. synth_ai/rl/contracts.py +32 -0
  131. synth_ai/rl/env_keys.py +137 -0
  132. synth_ai/rl/secrets.py +19 -0
  133. synth_ai/scripts/verify_rewards.py +100 -0
  134. synth_ai/task/__init__.py +10 -0
  135. synth_ai/task/contracts.py +120 -0
  136. synth_ai/task/health.py +28 -0
  137. synth_ai/task/validators.py +12 -0
  138. synth_ai/tracing_v3/hooks.py +3 -1
  139. synth_ai/tracing_v3/session_tracer.py +123 -2
  140. synth_ai/tracing_v3/turso/manager.py +218 -0
  141. synth_ai/tracing_v3/turso/models.py +53 -0
  142. synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
  143. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +147 -30
  144. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
  145. synth_ai/tui/__init__.py +0 -1
  146. synth_ai/tui/__main__.py +0 -13
  147. synth_ai/tui/cli/__init__.py +0 -1
  148. synth_ai/tui/cli/query_experiments.py +0 -164
  149. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  150. synth_ai/tui/dashboard.py +0 -340
  151. synth_ai-0.2.4.dev7.dist-info/METADATA +0 -193
  152. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
  153. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
  154. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,464 @@
1
+ """
2
+ MCTS Implementation Guide for Crafter Environment
3
+ ================================================
4
+
5
+ Based on analysis of Pokemon RED and Sokoban MCTS implementations, here's how we could
6
+ implement Monte Carlo Tree Search for the Crafter environment:
7
+
8
+ 1. **State Serialization & Tree Storage**
9
+ - Use FilesystemSnapshotStore and TrajectoryTreeStore (like RED)
10
+ - Serialize Crafter environment state using pickle + gzip compression
11
+ - Each tree node stores a complete environment snapshot
12
+ - Child nodes are created by taking actions from parent states
13
+
14
+ 2. **Action Space**
15
+ - Crafter has 17 discrete actions: noop, move(4), do, sleep, place_stone, place_table,
16
+ place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe,
17
+ make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword
18
+ - Much larger than Pokemon's 8 buttons, so may need action filtering
19
+
20
+ 3. **Heuristic Evaluation Function**
21
+ Key metrics to consider for Crafter:
22
+ - Achievements unlocked (primary objective)
23
+ - Health, hunger, thirst levels (survival)
24
+ - Inventory contents (resources, tools, weapons)
25
+ - Distance to nearest resources (exploration bonus)
26
+ - Day/night cycle position
27
+ - Proximity to dangers (zombies, skeletons)
28
+
29
+ Example scoring:
30
+ ```python
31
+ score = 0.0
32
+ score += len(achievements) * 100 # Major reward for achievements
33
+ score += health * 5
34
+ score += (9 - hunger) * 2 # Lower hunger is better
35
+ score += (9 - thirst) * 2 # Lower thirst is better
36
+ score += inventory_value() # Value of items
37
+ score -= day_count * 0.1 # Slight penalty for time
38
+ ```
39
+
40
+ 4. **Terminal State Detection**
41
+ - All achievements unlocked (win)
42
+ - Player health reaches 0 (loss)
43
+ - Maximum steps/days reached (timeout)
44
+
45
+ 5. **Rollout Policy**
46
+ - Random actions work poorly in Crafter due to survival mechanics
47
+ - Consider biased rollouts:
48
+ * Prioritize "do" action near resources
49
+ * Avoid moving into danger zones
50
+ * Seek food/water when low
51
+ - Alternatively, use a simple policy network
52
+
53
+ 6. **MCTS Algorithm Structure**
54
+ ```python
55
+ async def crafter_mcts_plan(tree, root_id, rollouts_per_action=10, max_depth=20):
56
+ plan = []
57
+ node_id = root_id
58
+
59
+ for depth in range(max_depth):
60
+ # Load environment from node
61
+ env = deserialize_crafter_env(tree.load_snapshot_blob(node_id))
62
+
63
+ if is_terminal(env):
64
+ break
65
+
66
+ # Evaluate each action
67
+ q_values = {}
68
+ for action in CRAFTER_ACTIONS:
69
+ # Expand node if needed
70
+ child_id = expand_if_needed(tree, node_id, action, env)
71
+
72
+ # Run rollouts from child state
73
+ scores = []
74
+ for _ in range(rollouts_per_action):
75
+ score = await rollout(child_env, max_steps=50)
76
+ scores.append(score)
77
+
78
+ q_values[action] = np.mean(scores)
79
+
80
+ # Select best action
81
+ best_action = max(q_values, key=q_values.get)
82
+ plan.append(best_action)
83
+ node_id = get_child_for_action(tree, node_id, best_action)
84
+
85
+ return plan
86
+ ```
87
+
88
+ 7. **Optimizations for Crafter**
89
+ - **Action Filtering**: Not all 17 actions are valid at every state
90
+ * Can't craft without resources
91
+ * Can't place objects without them in inventory
92
+ * Filter invalid actions before expansion
93
+
94
+ - **Progressive Widening**: Start with core actions (move, do),
95
+ gradually add crafting actions as resources are collected
96
+
97
+ - **Domain Knowledge Integration**:
98
+ * Prioritize water/food collection when low
99
+ * Seek shelter at night
100
+ * Maintain tool progression (wood → stone → iron)
101
+
102
+ 8. **Challenges Specific to Crafter**
103
+ - Long-horizon planning required (achievements need multi-step sequences)
104
+ - Day/night cycle adds urgency
105
+ - Resource management is critical
106
+ - Combat situations need quick responses
107
+ - Much richer state space than Pokemon RED movement
108
+
109
+ 9. **Implementation Steps**
110
+ 1. Create serialization methods for CrafterEnvironment
111
+ 2. Implement heuristic scoring based on game state
112
+ 3. Build MCTS planner with Crafter-specific optimizations
113
+ 4. Add action filtering and validity checking
114
+ 5. Test on simple achievement sequences first
115
+ 6. Gradually increase complexity to full achievement set
116
+
117
+ The key insight from RED's implementation is that MCTS works well for exploration
118
+ and planning in environments with discrete actions and clear progress metrics.
119
+ Crafter's achievement system provides natural waypoints for the search tree.
120
+ """
121
+
122
+ # ============================================================================
123
+ # CRAFTER ENVIRONMENT SERIALIZATION TESTS
124
+ # ============================================================================
125
+
126
+ import asyncio
127
+ import gzip
128
+ import pickle
129
+ import tempfile
130
+ from pathlib import Path
131
+ from typing import Dict, Any
132
+
133
+ import pytest
134
+
135
+ from synth_ai.environments.examples.crafter_classic.environment import CrafterClassicEnvironment
136
+ from synth_ai.environments.examples.crafter_classic.taskset import CrafterTaskInstance
137
+ from synth_ai.environments.environment.tools import EnvToolCall
138
+ from synth_ai.environments.reproducibility.tree import FilesystemSnapshotStore, TrajectoryTreeStore
139
+
140
+
141
@pytest.mark.asyncio
async def test_basic_crafter_serialization():
    """Check that an engine snapshot restores the state it was taken from.

    The test snapshots a freshly initialized environment, advances the live
    environment by three ``interact`` steps, asserts that the live state moved
    on, and then rebuilds an environment from the first snapshot and compares
    its public state (position, inventory, achievements) with the public state
    captured before any step was taken.
    """
    from synth_ai.environments.tasks.core import Impetus, Intent
    from synth_ai.environments.examples.crafter_classic.taskset import CrafterTaskInstanceMetadata
    from uuid import uuid4

    # Build a task instance with a fixed seed.
    metadata = CrafterTaskInstanceMetadata(
        difficulty="easy",
        seed=42,
        num_trees_radius=5,
        num_cows_radius=2,
        num_hostiles_radius=0,
    )
    task = CrafterTaskInstance(
        id=uuid4(),
        impetus=Impetus(instructions="Test serialization"),
        intent=Intent(rubric={"goal": "Test"}, gold_trajectories=None, gold_state_diff={}),
        metadata=metadata,
        is_reproducible=True,
        initial_engine_snapshot=None,
    )

    # Create and initialize the environment.
    env = CrafterClassicEnvironment(task)
    await env.initialize()

    # Capture the starting point, both as a snapshot and as public state.
    initial_snapshot = await env._serialize_engine()
    initial_pub = env.engine._get_public_state_from_env()

    # Advance the live environment so that it diverges from the snapshot.
    actions = [
        EnvToolCall(tool="interact", args={"action": 5}),  # do
        EnvToolCall(tool="interact", args={"action": 2}),  # right
        EnvToolCall(tool="interact", args={"action": 5}),  # do
    ]
    for action in actions:
        await env.step(action)

    after_pub = env.engine._get_public_state_from_env()
    after_snapshot = await env._serialize_engine()

    # The live environment must have changed, otherwise the restore check
    # below would pass trivially.
    assert (
        initial_pub.player_position != after_pub.player_position
        or initial_pub.inventory != after_pub.inventory
    )
    assert initial_snapshot.total_reward_snapshot != after_snapshot.total_reward_snapshot

    # Rebuild an environment from the *initial* snapshot.
    restored_env = await CrafterClassicEnvironment._deserialize_engine(initial_snapshot, task)
    restored_pub = restored_env.engine._get_public_state_from_env()

    # The restored environment must match the state captured at the start.
    assert restored_pub.player_position == initial_pub.player_position
    assert restored_pub.inventory == initial_pub.inventory
    assert restored_pub.achievements_status == initial_pub.achievements_status

    # The check mark was previously emitted as mis-decoded bytes.
    print("✓ Basic serialization test passed")
201
+
202
+
203
@pytest.mark.asyncio
async def test_crafter_tree_storage():
    """Store Crafter snapshots in a trajectory tree and reload every child.

    A root snapshot is written to a ``TrajectoryTreeStore`` backed by a
    temporary directory. For each of six actions the root is reloaded, one
    step is taken, and the resulting snapshot is attached as a child of the
    root. The test then asserts that the root has six children and that each
    child blob can be turned back into an environment that accepts one more
    step.
    """
    from synth_ai.environments.tasks.core import Impetus, Intent
    from synth_ai.environments.examples.crafter_classic.taskset import CrafterTaskInstanceMetadata
    from uuid import uuid4

    metadata = CrafterTaskInstanceMetadata(
        difficulty="medium",
        seed=123,
        num_trees_radius=3,
        num_cows_radius=1,
        num_hostiles_radius=1,
    )
    task = CrafterTaskInstance(
        id=uuid4(),
        impetus=Impetus(instructions="Test tree storage"),
        intent=Intent(rubric={"goal": "Test"}, gold_trajectories=None, gold_state_diff={}),
        metadata=metadata,
        is_reproducible=True,
        initial_engine_snapshot=None,
    )
    env = CrafterClassicEnvironment(task)
    await env.initialize()

    with tempfile.TemporaryDirectory() as tmpdir:
        snap_store_path = Path(tmpdir) / "crafter_tree"
        tree = TrajectoryTreeStore(FilesystemSnapshotStore(snap_store_path))

        # Snapshots are stored as gzip-compressed pickles.
        root_snapshot = await env._serialize_engine()
        root_id = tree.add_root(gzip.compress(pickle.dumps(root_snapshot)))

        action_rewards = {}
        for action_idx in [0, 1, 2, 3, 4, 5]:  # noop, move directions, do
            # Every branch starts from the stored root, not from the previous
            # branch. The blob was pickled by this test a few lines above, so
            # unpickling it here does not touch untrusted data.
            root_env_snapshot = pickle.loads(gzip.decompress(tree.load_snapshot_blob(root_id)))
            branch_env = await CrafterClassicEnvironment._deserialize_engine(
                root_env_snapshot, task
            )

            obs = await branch_env.step(EnvToolCall(tool="interact", args={"action": action_idx}))
            step_reward = obs.get("reward_last_step", 0.0)

            child_snapshot = await branch_env._serialize_engine()
            tree.add_child(
                root_id,
                gzip.compress(pickle.dumps(child_snapshot)),
                action=action_idx,
                reward=step_reward,
                terminated=obs.get("terminated", False),
                info={"total_reward": obs.get("total_reward", 0.0)},
            )
            action_rewards[action_idx] = step_reward

        # One child per explored action.
        children = tree.get_children(root_id)
        assert len(children) == 6

        # Every stored child must be loadable and steppable.
        for child_id in children:
            child_snapshot = pickle.loads(gzip.decompress(tree.load_snapshot_blob(child_id)))
            child_env = await CrafterClassicEnvironment._deserialize_engine(child_snapshot, task)
            await child_env.step(EnvToolCall(tool="interact", args={"action": 0}))

        # The check mark was previously emitted as mis-decoded bytes.
        print(f"✓ Tree storage test passed - stored {len(children)} child states")
        print(f" Action rewards: {action_rewards}")
276
+
277
+
278
@pytest.mark.asyncio
async def test_crafter_state_consistency():
    """Check that a snapshot round trip preserves public and private state.

    After a short action sequence the environment is serialized and
    deserialized. The restored environment is compared with the original on
    player position, inventory, achievements, step count, internal player
    stats and total reward, and is then stepped once to confirm that the step
    counter advances by exactly one.
    """
    from synth_ai.environments.tasks.core import Impetus, Intent
    from synth_ai.environments.examples.crafter_classic.taskset import CrafterTaskInstanceMetadata
    from uuid import uuid4

    metadata = CrafterTaskInstanceMetadata(
        difficulty="medium",
        seed=777,
        num_trees_radius=4,
        num_cows_radius=2,
        num_hostiles_radius=1,
    )
    task = CrafterTaskInstance(
        id=uuid4(),
        impetus=Impetus(instructions="Test consistency"),
        intent=Intent(rubric={"goal": "Test"}, gold_trajectories=None, gold_state_diff={}),
        metadata=metadata,
        is_reproducible=True,
        initial_engine_snapshot=None,
    )
    env = CrafterClassicEnvironment(task)
    await env.initialize()

    # Perform various actions to create a non-trivial state.
    # NOTE(review): the labels below are the original author's. Upstream
    # Crafter numbers 1 as move_left and 10 as place_plant (make_wood_pickaxe
    # is 11) - confirm which mapping this environment uses.
    action_sequence = [
        5,  # do (gather resource)
        2,  # right
        5,  # do
        1,  # up
        5,  # do
        10,  # make_wood_pickaxe (if resources available)
    ]

    for action_idx in action_sequence:
        try:
            await env.step(EnvToolCall(tool="interact", args={"action": action_idx}))
        except Exception as exc:
            # Best effort: an action may be rejected when resources are
            # missing. Catch Exception rather than everything, so that
            # KeyboardInterrupt and task cancellation still propagate, and
            # report the skip instead of hiding it.
            print(f" Skipped action {action_idx}: {exc!r}")

    # Capture the state that the snapshot is expected to reproduce.
    original_pub = env.engine._get_public_state_from_env()
    original_priv = env.engine._get_private_state_from_env(0, False, False)
    original_total_reward = env.engine._total_reward

    # Round trip through the snapshot.
    snapshot = await env._serialize_engine()
    new_env = await CrafterClassicEnvironment._deserialize_engine(snapshot, task)
    restored_pub = new_env.engine._get_public_state_from_env()
    restored_priv = new_env.engine._get_private_state_from_env(0, False, False)

    # Public state consistency.
    assert restored_pub.player_position == original_pub.player_position
    assert restored_pub.inventory == original_pub.inventory
    assert restored_pub.achievements_status == original_pub.achievements_status
    assert restored_pub.num_steps_taken == original_pub.num_steps_taken

    # Private state consistency.
    assert restored_priv.player_internal_stats == original_priv.player_internal_stats
    assert new_env.engine._total_reward == original_total_reward

    # The restored environment must be playable: one more step, one more count.
    before_step = restored_pub.num_steps_taken
    await new_env.step(EnvToolCall(tool="interact", args={"action": 0}))
    after_pub = new_env.engine._get_public_state_from_env()
    assert after_pub.num_steps_taken == before_step + 1

    # The check mark was previously emitted as mis-decoded bytes.
    print("✓ State consistency test passed")
349
+
350
+
351
@pytest.mark.asyncio
async def test_crafter_mcts_ready_serialization():
    """Exercise the restore-step-score loop an MCTS planner would use.

    A root snapshot is stored in a ``TrajectoryTreeStore``. For each of six
    actions the root is reloaded, the action is applied, and the resulting
    state is scored with a local heuristic. The action with the highest score
    is reported. The test makes no assertions; it passes when the whole
    sequence completes without raising.
    """
    from synth_ai.environments.tasks.core import Impetus, Intent
    from synth_ai.environments.examples.crafter_classic.taskset import CrafterTaskInstanceMetadata
    from uuid import uuid4

    metadata = CrafterTaskInstanceMetadata(
        difficulty="easy",
        seed=999,
        num_trees_radius=6,
        num_cows_radius=3,
        num_hostiles_radius=0,
    )
    task = CrafterTaskInstance(
        id=uuid4(),
        impetus=Impetus(instructions="Test MCTS"),
        intent=Intent(rubric={"goal": "Test"}, gold_trajectories=None, gold_state_diff={}),
        metadata=metadata,
        is_reproducible=True,
        initial_engine_snapshot=None,
    )

    def heuristic_score(env: CrafterClassicEnvironment) -> float:
        """Return a scalar score for the current state of ``env``.

        The score is a base of 10, plus 100 per unlocked achievement, plus
        weighted survival stats, plus a per-item inventory value (pickaxes 20,
        swords 15, everything else 2).
        """
        pub = env.engine._get_public_state_from_env()
        priv = env.engine._get_private_state_from_env(0, False, False)

        score = 10.0  # Base score

        # Achievement bonus
        achievements_unlocked = sum(1 for v in pub.achievements_status.values() if v)
        score += achievements_unlocked * 100.0

        # Survival metrics. Missing keys default to 0.
        # NOTE(review): hunger and thirst are scored as "lower is better", as
        # in the original - confirm that matches the engine's meaning of
        # "_hunger" and "_thirst".
        stats = priv.player_internal_stats
        health = stats.get("health", 0)
        hunger = stats.get("_hunger", 0)
        thirst = stats.get("_thirst", 0)

        score += health * 5.0
        score += (9 - hunger) * 2.0
        score += (9 - thirst) * 2.0

        # Inventory value
        for item, count in pub.inventory.items():
            if "pickaxe" in item:
                score += count * 20.0
            elif "sword" in item:
                score += count * 15.0
            else:
                score += count * 2.0

        return score

    # Initialize the environment and score the starting state.
    env = CrafterClassicEnvironment(task)
    await env.initialize()
    initial_score = heuristic_score(env)

    # Simulate MCTS-style exploration.
    with tempfile.TemporaryDirectory() as tmpdir:
        snap_store = FilesystemSnapshotStore(Path(tmpdir) / "mcts_test")
        tree = TrajectoryTreeStore(snap_store)

        # Snapshots are stored as gzip-compressed pickles.
        root_snapshot = await env._serialize_engine()
        root_id = tree.add_root(gzip.compress(pickle.dumps(root_snapshot)))

        action_scores: Dict[int, float] = {}
        for action_idx in [0, 1, 2, 3, 4, 5]:
            # Reload the root for every action so that all actions are scored
            # from the same starting state. The blob was pickled by this test,
            # so unpickling it does not touch untrusted data.
            reloaded_root = pickle.loads(gzip.decompress(tree.load_snapshot_blob(root_id)))
            test_env = await CrafterClassicEnvironment._deserialize_engine(reloaded_root, task)

            await test_env.step(EnvToolCall(tool="interact", args={"action": action_idx}))
            action_scores[action_idx] = heuristic_score(test_env)

        # Highest score wins; on ties max() keeps the first key it sees.
        best_action = max(action_scores, key=action_scores.get)

        # The check mark was previously emitted as mis-decoded bytes.
        print("✓ MCTS-ready serialization test passed")
        print(f" Initial score: {initial_score:.2f}")
        print(f" Action scores: {action_scores}")
        print(f" Best action: {best_action} (score: {action_scores[best_action]:.2f})")
441
+
442
+
443
+ # Run all tests
444
if __name__ == "__main__":

    async def main():
        """Run the four serialization tests in order, outside of pytest."""
        print("Running Crafter Serialization Tests...")
        print("=" * 50)

        tests = (
            test_basic_crafter_serialization,
            test_crafter_tree_storage,
            test_crafter_state_consistency,
            test_crafter_mcts_ready_serialization,
        )
        for test in tests:
            await test()
            print()  # blank line between test reports

        print("=" * 50)
        # The emoji was previously emitted as mis-decoded bytes.
        print("🎉 All tests passed!")

    asyncio.run(main())
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Browse existing Crafter evaluations and launch viewer for a selected run.
4
+ """
5
+
6
+ import argparse
7
+ import json
8
+ from pathlib import Path
9
+ from datetime import datetime
10
+ import asyncio
11
+ from tabulate import tabulate
12
+
13
+ from src.synth_env.examples.crafter_classic.agent_demos.full_enchilada import (
14
+ set_current_eval_dir,
15
+ app,
16
+ )
17
+ from fastapi.staticfiles import StaticFiles
18
+ import uvicorn
19
+
20
+
21
def list_evaluations(evals_dir: Path = Path("src/evals/crafter")):
    """List all available evaluations with summary info.

    Scans ``evals_dir`` for ``run_*`` directories that contain an
    ``evaluation_summary.json`` file and builds one record per run. Runs are
    returned in descending order of directory name.

    A run whose summary cannot be read, is not valid JSON, or lacks the
    expected fields is reported on stdout and skipped, so a single broken run
    does not prevent the remaining evaluations from being listed.

    Args:
        evals_dir: Base directory holding the ``run_*`` evaluation folders.

    Returns:
        A list of dicts with the keys ``run_id``, ``timestamp``, ``models``,
        ``difficulties``, ``num_trajectories`` and ``path``. The list is empty
        (and a message is printed) when ``evals_dir`` does not exist.
    """
    if not evals_dir.exists():
        print(f"No evaluations found at {evals_dir}")
        return []

    evaluations = []
    for run_dir in sorted(evals_dir.glob("run_*"), reverse=True):
        if not run_dir.is_dir():
            continue

        summary_file = run_dir / "evaluation_summary.json"
        if not summary_file.exists():
            continue

        try:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(summary_file, "r", encoding="utf-8") as f:
                summary = json.load(f)

            metadata = summary["evaluation_metadata"]
            eval_info = {
                "run_id": run_dir.name,
                "timestamp": metadata["timestamp"],
                "models": ", ".join(summary["models_evaluated"]),
                "difficulties": ", ".join(summary["difficulties_evaluated"]),
                "num_trajectories": metadata["num_trajectories"],
                "path": run_dir,
            }
        except (OSError, ValueError, KeyError, TypeError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # KeyError/TypeError cover summaries with an unexpected structure.
            print(f"Skipping {run_dir.name}: unreadable evaluation summary ({exc!r})")
            continue

        evaluations.append(eval_info)

    return evaluations
46
+
47
+
48
async def view_evaluation(eval_dir: Path):
    """Serve the viewer that belongs to one evaluation run.

    Prints a message and returns without serving when ``eval_dir`` or its
    ``viewer`` subdirectory does not exist. Otherwise the run is registered
    through ``set_current_eval_dir``, the viewer directory is mounted on
    ``app`` at ``/`` as static files, and a uvicorn server bound to
    ``0.0.0.0:8000`` is awaited until it stops.

    Args:
        eval_dir: Directory of the evaluation run to display.
    """
    viewer_dir = eval_dir / "viewer"

    # Guard clauses: report the first missing prerequisite and bail out.
    problem = None
    if not eval_dir.exists():
        problem = f"Evaluation directory not found: {eval_dir}"
    elif not viewer_dir.exists():
        problem = f"Viewer files not found in {eval_dir}"
    if problem is not None:
        print(problem)
        return

    banner = (
        f"\nšŸ“ Viewing evaluation: {eval_dir}",
        "🌐 Launching viewer at http://localhost:8000",
        " Press Ctrl+C to stop the viewer",
    )
    print("\n".join(banner))

    # Tell the viewer app which run it should expose.
    set_current_eval_dir(eval_dir)

    # Expose the run's viewer directory as static files at the site root.
    static_files = StaticFiles(directory=str(viewer_dir), html=True)
    app.mount("/", static_files, name="viewer")

    # Serve until the server stops.
    server_config = uvicorn.Config(app, host="0.0.0.0", port=8000, log_level="error")
    await uvicorn.Server(server_config).serve()
73
+
74
+
75
def _format_timestamp(raw_timestamp):
    """Return an ISO-format timestamp as ``YYYY-MM-DD HH:MM:SS``, else the value unchanged."""
    try:
        return datetime.fromisoformat(raw_timestamp).strftime("%Y-%m-%d %H:%M:%S")
    except (TypeError, ValueError):
        # Not a string or not ISO formatted: display whatever was stored.
        return raw_timestamp


def _print_evaluation_table(evaluations):
    """Print a numbered grid table with one row per evaluation record."""
    table_data = [
        [
            number,
            eval_info["run_id"],
            _format_timestamp(eval_info["timestamp"]),
            eval_info["models"],
            eval_info["difficulties"],
            eval_info["num_trajectories"],
        ]
        for number, eval_info in enumerate(evaluations, start=1)
    ]
    headers = ["#", "Run ID", "Timestamp", "Models", "Difficulties", "Trajectories"]
    print(tabulate(table_data, headers=headers, tablefmt="grid"))


async def main():
    """Command-line entry point for browsing Crafter evaluations.

    Options:
        --eval-dir: Base directory for evaluations (default ``src/evals/crafter``).
        --run-id:   View this specific run directory under ``--eval-dir``.
        --latest:   View the first evaluation returned by ``list_evaluations``.

    Returns immediately when ``list_evaluations`` yields nothing. ``--run-id``
    takes precedence over ``--latest``. With neither option, a table of the
    evaluations is printed and the user is prompted for a number (or ``q`` to
    quit).
    """
    parser = argparse.ArgumentParser(description="Browse Crafter evaluations")
    parser.add_argument(
        "--eval-dir",
        type=str,
        default="src/evals/crafter",
        help="Base directory for evaluations",
    )
    parser.add_argument(
        "--run-id", type=str, help="Specific run ID to view (e.g., run_20240115_143022)"
    )
    parser.add_argument("--latest", action="store_true", help="View the latest evaluation")

    args = parser.parse_args()
    evals_dir = Path(args.eval_dir)

    # List evaluations
    evaluations = list_evaluations(evals_dir)
    if not evaluations:
        return

    # View specific run
    if args.run_id:
        await view_evaluation(evals_dir / args.run_id)
        return

    # View latest
    if args.latest:
        await view_evaluation(evaluations[0]["path"])
        return

    # Interactive mode: display the table of evaluations and ask the user to select.
    print("\nšŸ“Š Available Crafter Evaluations:")
    _print_evaluation_table(evaluations)

    print("\nEnter the number of the evaluation to view (or 'q' to quit): ", end="")
    choice = input().strip()
    if choice.lower() == "q":
        return

    # Only the conversion is guarded, so an error raised while the viewer is
    # running is not misreported as bad user input.
    try:
        idx = int(choice) - 1
    except ValueError:
        print("Invalid input")
        return

    if 0 <= idx < len(evaluations):
        await view_evaluation(evaluations[idx]["path"])
    else:
        print("Invalid selection")


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to run Crafter evaluation using the standardized eval framework
4
+ """
5
+
6
+ import asyncio
7
+ import toml
8
+ from pathlib import Path
9
+ from src.synth_env.examples.crafter_classic.agent_demos.eval_framework import (
10
+ CrafterEvalFramework,
11
+ run_crafter_eval,
12
+ )
13
+
14
+
15
async def main():
    """Run the Crafter evaluation described by ``eval_config.toml``.

    The configuration file is looked up next to this script. Its
    ``[evaluation]`` table supplies ``models``, ``difficulties``,
    ``max_turns`` and ``trajectories_per_condition``, which are echoed to
    stdout and then passed to ``run_crafter_eval``.

    Returns:
        Whatever ``run_crafter_eval`` returns.

    Raises:
        FileNotFoundError: If ``eval_config.toml`` is not present.
    """
    # Locate and load the configuration
    config_path = Path(__file__).with_name("eval_config.toml")
    if not config_path.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    settings = toml.load(config_path)["evaluation"]
    model_names = settings["models"]
    difficulty_levels = settings["difficulties"]
    turn_limit = settings["max_turns"]
    trajectories_per_condition = settings["trajectories_per_condition"]

    # Echo the run parameters
    rule = "=" * 60
    print("šŸŽÆ Crafter Multi-Action Model Comparison (Eval Framework)")
    print(rule)
    print(f"Models: {', '.join(model_names)}")
    print(f"Difficulties: {', '.join(difficulty_levels)}")
    print(f"Max turns: {turn_limit}")
    print(f"Trajectories per condition: {trajectories_per_condition}")
    print(rule)

    # Delegate the actual evaluation to the framework
    results = await run_crafter_eval(
        model_names=model_names,
        difficulties=difficulty_levels,
        num_trajectories=trajectories_per_condition,
        max_turns=turn_limit,
    )

    # Closing banner once the evaluation has returned
    print("\nšŸ† Evaluation completed!")
    return results


if __name__ == "__main__":
    asyncio.run(main())