synth-ai 0.2.4.dev7__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- synth_ai/__init__.py +1 -1
- synth_ai/cli/__init__.py +6 -0
- synth_ai/cli/balance.py +3 -15
- synth_ai/cli/demo.py +68 -9
- synth_ai/cli/rl_demo.py +137 -0
- synth_ai/cli/root.py +65 -0
- synth_ai/config/base_url.py +47 -0
- synth_ai/demos/core/__init__.py +1 -0
- synth_ai/demos/core/cli.py +621 -0
- synth_ai/demos/demo_task_apps/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/core.py +374 -0
- synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/math/app.py +37 -0
- synth_ai/demos/demo_task_apps/math/config.toml +44 -0
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
- synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
- synth_ai/environments/examples/bandit/__init__.py +33 -0
- synth_ai/environments/examples/bandit/engine.py +294 -0
- synth_ai/environments/examples/bandit/environment.py +194 -0
- synth_ai/environments/examples/bandit/taskset.py +200 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
- synth_ai/environments/examples/crafter_classic/environment.py +41 -2
- synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
- synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
- synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
- synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
- synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
- synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
- synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/red/units/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
- synth_ai/environments/service/app.py +8 -0
- synth_ai/http.py +102 -0
- synth_ai/inference/__init__.py +7 -0
- synth_ai/inference/client.py +20 -0
- synth_ai/install_sqld.sh +40 -0
- synth_ai/jobs/client.py +246 -0
- synth_ai/learning/__init__.py +24 -0
- synth_ai/learning/client.py +149 -0
- synth_ai/learning/config.py +43 -0
- synth_ai/learning/constants.py +29 -0
- synth_ai/learning/ft_client.py +59 -0
- synth_ai/learning/health.py +43 -0
- synth_ai/learning/jobs.py +205 -0
- synth_ai/learning/rl_client.py +256 -0
- synth_ai/learning/sse.py +58 -0
- synth_ai/learning/validators.py +48 -0
- synth_ai/lm/core/main_v3.py +13 -0
- synth_ai/lm/core/synth_models.py +48 -0
- synth_ai/lm/core/vendor_clients.py +9 -6
- synth_ai/lm/vendors/core/openai_api.py +31 -3
- synth_ai/lm/vendors/openai_standard.py +45 -14
- synth_ai/lm/vendors/supported/custom_endpoint.py +12 -2
- synth_ai/lm/vendors/synth_client.py +372 -28
- synth_ai/rl/__init__.py +30 -0
- synth_ai/rl/contracts.py +32 -0
- synth_ai/rl/env_keys.py +137 -0
- synth_ai/rl/secrets.py +19 -0
- synth_ai/scripts/verify_rewards.py +100 -0
- synth_ai/task/__init__.py +10 -0
- synth_ai/task/contracts.py +120 -0
- synth_ai/task/health.py +28 -0
- synth_ai/task/validators.py +12 -0
- synth_ai/tracing_v3/hooks.py +3 -1
- synth_ai/tracing_v3/session_tracer.py +123 -2
- synth_ai/tracing_v3/turso/manager.py +218 -0
- synth_ai/tracing_v3/turso/models.py +53 -0
- synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +147 -30
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
- synth_ai/tui/__init__.py +0 -1
- synth_ai/tui/__main__.py +0 -13
- synth_ai/tui/cli/__init__.py +0 -1
- synth_ai/tui/cli/query_experiments.py +0 -164
- synth_ai/tui/cli/query_experiments_v3.py +0 -164
- synth_ai/tui/dashboard.py +0 -340
- synth_ai-0.2.4.dev7.dist-info/METADATA +0 -193
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,464 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MCTS Implementation Guide for Crafter Environment
|
|
3
|
+
================================================
|
|
4
|
+
|
|
5
|
+
Based on analysis of Pokemon RED and Sokoban MCTS implementations, here's how we could
|
|
6
|
+
implement Monte Carlo Tree Search for the Crafter environment:
|
|
7
|
+
|
|
8
|
+
1. **State Serialization & Tree Storage**
|
|
9
|
+
- Use FilesystemSnapshotStore and TrajectoryTreeStore (like RED)
|
|
10
|
+
- Serialize Crafter environment state using pickle + gzip compression
|
|
11
|
+
- Each tree node stores a complete environment snapshot
|
|
12
|
+
- Child nodes are created by taking actions from parent states
|
|
13
|
+
|
|
14
|
+
2. **Action Space**
|
|
15
|
+
- Crafter has 17 discrete actions: noop, move(4), do, sleep, place_stone, place_table,
|
|
16
|
+
place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe,
|
|
17
|
+
make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword
|
|
18
|
+
- Much larger than Pokemon's 8 buttons, so may need action filtering
|
|
19
|
+
|
|
20
|
+
3. **Heuristic Evaluation Function**
|
|
21
|
+
Key metrics to consider for Crafter:
|
|
22
|
+
- Achievements unlocked (primary objective)
|
|
23
|
+
- Health, hunger, thirst levels (survival)
|
|
24
|
+
- Inventory contents (resources, tools, weapons)
|
|
25
|
+
- Distance to nearest resources (exploration bonus)
|
|
26
|
+
- Day/night cycle position
|
|
27
|
+
- Proximity to dangers (zombies, skeletons)
|
|
28
|
+
|
|
29
|
+
Example scoring:
|
|
30
|
+
```python
|
|
31
|
+
score = 0.0
|
|
32
|
+
score += len(achievements) * 100 # Major reward for achievements
|
|
33
|
+
score += health * 5
|
|
34
|
+
score += (9 - hunger) * 2 # Lower hunger is better
|
|
35
|
+
score += (9 - thirst) * 2 # Lower thirst is better
|
|
36
|
+
score += inventory_value() # Value of items
|
|
37
|
+
score -= day_count * 0.1 # Slight penalty for time
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
4. **Terminal State Detection**
|
|
41
|
+
- All achievements unlocked (win)
|
|
42
|
+
- Player health reaches 0 (loss)
|
|
43
|
+
- Maximum steps/days reached (timeout)
|
|
44
|
+
|
|
45
|
+
5. **Rollout Policy**
|
|
46
|
+
- Random actions work poorly in Crafter due to survival mechanics
|
|
47
|
+
- Consider biased rollouts:
|
|
48
|
+
* Prioritize "do" action near resources
|
|
49
|
+
* Avoid moving into danger zones
|
|
50
|
+
* Seek food/water when low
|
|
51
|
+
- Alternatively, use a simple policy network
|
|
52
|
+
|
|
53
|
+
6. **MCTS Algorithm Structure**
|
|
54
|
+
```python
|
|
55
|
+
async def crafter_mcts_plan(tree, root_id, rollouts_per_action=10, max_depth=20):
|
|
56
|
+
plan = []
|
|
57
|
+
node_id = root_id
|
|
58
|
+
|
|
59
|
+
for depth in range(max_depth):
|
|
60
|
+
# Load environment from node
|
|
61
|
+
env = deserialize_crafter_env(tree.load_snapshot_blob(node_id))
|
|
62
|
+
|
|
63
|
+
if is_terminal(env):
|
|
64
|
+
break
|
|
65
|
+
|
|
66
|
+
# Evaluate each action
|
|
67
|
+
q_values = {}
|
|
68
|
+
for action in CRAFTER_ACTIONS:
|
|
69
|
+
# Expand node if needed
|
|
70
|
+
child_id = expand_if_needed(tree, node_id, action, env)
|
|
71
|
+
|
|
72
|
+
# Run rollouts from child state
|
|
73
|
+
scores = []
|
|
74
|
+
for _ in range(rollouts_per_action):
|
|
75
|
+
score = await rollout(deserialize_crafter_env(tree.load_snapshot_blob(child_id)), max_steps=50)
|
|
76
|
+
scores.append(score)
|
|
77
|
+
|
|
78
|
+
q_values[action] = np.mean(scores)
|
|
79
|
+
|
|
80
|
+
# Select best action
|
|
81
|
+
best_action = max(q_values, key=q_values.get)
|
|
82
|
+
plan.append(best_action)
|
|
83
|
+
node_id = get_child_for_action(tree, node_id, best_action)
|
|
84
|
+
|
|
85
|
+
return plan
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
7. **Optimizations for Crafter**
|
|
89
|
+
- **Action Filtering**: Not all 17 actions are valid at every state
|
|
90
|
+
* Can't craft without resources
|
|
91
|
+
* Can't place objects without them in inventory
|
|
92
|
+
* Filter invalid actions before expansion
|
|
93
|
+
|
|
94
|
+
- **Progressive Widening**: Start with core actions (move, do),
|
|
95
|
+
gradually add crafting actions as resources are collected
|
|
96
|
+
|
|
97
|
+
- **Domain Knowledge Integration**:
|
|
98
|
+
* Prioritize water/food collection when low
|
|
99
|
+
* Seek shelter at night
|
|
100
|
+
* Maintain tool progression (wood → stone → iron)
|
|
101
|
+
|
|
102
|
+
8. **Challenges Specific to Crafter**
|
|
103
|
+
- Long-horizon planning required (achievements need multi-step sequences)
|
|
104
|
+
- Day/night cycle adds urgency
|
|
105
|
+
- Resource management is critical
|
|
106
|
+
- Combat situations need quick responses
|
|
107
|
+
- Much richer state space than Pokemon RED movement
|
|
108
|
+
|
|
109
|
+
9. **Implementation Steps**
|
|
110
|
+
1. Create serialization methods for CrafterEnvironment
|
|
111
|
+
2. Implement heuristic scoring based on game state
|
|
112
|
+
3. Build MCTS planner with Crafter-specific optimizations
|
|
113
|
+
4. Add action filtering and validity checking
|
|
114
|
+
5. Test on simple achievement sequences first
|
|
115
|
+
6. Gradually increase complexity to full achievement set
|
|
116
|
+
|
|
117
|
+
The key insight from RED's implementation is that MCTS works well for exploration
|
|
118
|
+
and planning in environments with discrete actions and clear progress metrics.
|
|
119
|
+
Crafter's achievement system provides natural waypoints for the search tree.
|
|
120
|
+
"""
|
|
121
|
+
|
|
122
|
+
# ============================================================================
|
|
123
|
+
# CRAFTER ENVIRONMENT SERIALIZATION TESTS
|
|
124
|
+
# ============================================================================
|
|
125
|
+
|
|
126
|
+
import asyncio
|
|
127
|
+
import gzip
|
|
128
|
+
import pickle
|
|
129
|
+
import tempfile
|
|
130
|
+
from pathlib import Path
|
|
131
|
+
from typing import Dict, Any
|
|
132
|
+
|
|
133
|
+
import pytest
|
|
134
|
+
|
|
135
|
+
from synth_ai.environments.examples.crafter_classic.environment import CrafterClassicEnvironment
|
|
136
|
+
from synth_ai.environments.examples.crafter_classic.taskset import CrafterTaskInstance
|
|
137
|
+
from synth_ai.environments.environment.tools import EnvToolCall
|
|
138
|
+
from synth_ai.environments.reproducibility.tree import FilesystemSnapshotStore, TrajectoryTreeStore
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@pytest.mark.asyncio
async def test_basic_crafter_serialization():
    """Snapshot a fresh Crafter env, mutate it, then restore the snapshot.

    The test asserts that stepping the environment changes its observable
    state and that deserializing the initial snapshot reproduces the initial
    position, inventory and achievement status.
    """
    from synth_ai.environments.tasks.core import Impetus, Intent
    from synth_ai.environments.examples.crafter_classic.taskset import CrafterTaskInstanceMetadata
    from uuid import uuid4

    # Build the task instance the environment is constructed from.
    task_metadata = CrafterTaskInstanceMetadata(
        difficulty="easy",
        seed=42,
        num_trees_radius=5,
        num_cows_radius=2,
        num_hostiles_radius=0,
    )
    task = CrafterTaskInstance(
        id=uuid4(),
        impetus=Impetus(instructions="Test serialization"),
        intent=Intent(rubric={"goal": "Test"}, gold_trajectories=None, gold_state_diff={}),
        metadata=task_metadata,
        is_reproducible=True,
        initial_engine_snapshot=None,
    )

    env = CrafterClassicEnvironment(task)
    await env.initialize()

    # Capture the pristine state before any step is taken.
    initial_snapshot = await env._serialize_engine()
    initial_pub = env.engine._get_public_state_from_env()

    # Drive the environment: do, right, do.
    tool_calls = [
        EnvToolCall(tool="interact", args={"action": action_idx})
        for action_idx in (5, 2, 5)
    ]
    for tool_call in tool_calls:
        await env.step(tool_call)

    # Capture the state reached after the actions.
    after_pub = env.engine._get_public_state_from_env()
    after_snapshot = await env._serialize_engine()

    # Something observable must have changed, and so must the reward snapshot.
    assert (
        initial_pub.player_position != after_pub.player_position
        or initial_pub.inventory != after_pub.inventory
    )
    assert initial_snapshot.total_reward_snapshot != after_snapshot.total_reward_snapshot

    # Rebuild an environment from the initial snapshot and compare it field by field.
    restored_env = await CrafterClassicEnvironment._deserialize_engine(initial_snapshot, task)
    restored_pub = restored_env.engine._get_public_state_from_env()

    assert restored_pub.player_position == initial_pub.player_position
    assert restored_pub.inventory == initial_pub.inventory
    assert restored_pub.achievements_status == initial_pub.achievements_status

    print("ā Basic serialization test passed")
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
@pytest.mark.asyncio
async def test_crafter_tree_storage():
    """Store one child snapshot per action under a root node and reload each.

    Each of the six actions is applied to a fresh copy of the root state; the
    resulting engine snapshot is pickled, gzip-compressed and attached to the
    root in a ``TrajectoryTreeStore``. Every stored child must then be
    loadable and steppable.
    """
    from synth_ai.environments.tasks.core import Impetus, Intent
    from synth_ai.environments.examples.crafter_classic.taskset import CrafterTaskInstanceMetadata
    from uuid import uuid4

    task_metadata = CrafterTaskInstanceMetadata(
        difficulty="medium",
        seed=123,
        num_trees_radius=3,
        num_cows_radius=1,
        num_hostiles_radius=1,
    )
    task = CrafterTaskInstance(
        id=uuid4(),
        impetus=Impetus(instructions="Test tree storage"),
        intent=Intent(rubric={"goal": "Test"}, gold_trajectories=None, gold_state_diff={}),
        metadata=task_metadata,
        is_reproducible=True,
        initial_engine_snapshot=None,
    )
    env = CrafterClassicEnvironment(task)
    await env.initialize()

    with tempfile.TemporaryDirectory() as tmpdir:
        # The tree lives in a throwaway directory for the duration of the test.
        store = FilesystemSnapshotStore(Path(tmpdir) / "crafter_tree")
        tree = TrajectoryTreeStore(store)

        # Root node: the freshly initialized engine.
        root_engine_snapshot = await env._serialize_engine()
        root_id = tree.add_root(gzip.compress(pickle.dumps(root_engine_snapshot)))

        # One child per action: noop, the four move directions, do.
        action_rewards = {}
        for action_idx in range(6):
            # Always branch from the stored root so the children are siblings.
            stored_root = pickle.loads(gzip.decompress(tree.load_snapshot_blob(root_id)))
            env = await CrafterClassicEnvironment._deserialize_engine(stored_root, task)

            obs = await env.step(EnvToolCall(tool="interact", args={"action": action_idx}))

            branch_snapshot = await env._serialize_engine()
            branch_blob = gzip.compress(pickle.dumps(branch_snapshot))
            step_reward = obs.get("reward_last_step", 0.0)
            tree.add_child(
                root_id,
                branch_blob,
                action=action_idx,
                reward=step_reward,
                terminated=obs.get("terminated", False),
                info={"total_reward": obs.get("total_reward", 0.0)},
            )
            action_rewards[action_idx] = step_reward

        # The root must now have exactly one child per explored action.
        children = tree.get_children(root_id)
        assert len(children) == 6

        # Every child blob must deserialize into an environment that can keep stepping.
        for child_id in children:
            stored_child = pickle.loads(gzip.decompress(tree.load_snapshot_blob(child_id)))
            child_env = await CrafterClassicEnvironment._deserialize_engine(stored_child, task)
            await child_env.step(EnvToolCall(tool="interact", args={"action": 0}))

    print(f"ā Tree storage test passed - stored {len(children)} child states")
    print(f" Action rewards: {action_rewards}")
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
@pytest.mark.asyncio
async def test_crafter_state_consistency():
    """Test that serialization preserves all important state components.

    The environment is stepped through a short action sequence, serialized,
    and deserialized into a new environment. Public state (position,
    inventory, achievements, step count), private player stats and the
    engine's running total reward must all match, and the restored
    environment must still advance its step counter when stepped.
    """
    from synth_ai.environments.tasks.core import Impetus, Intent
    from synth_ai.environments.examples.crafter_classic.taskset import CrafterTaskInstanceMetadata
    from uuid import uuid4

    metadata = CrafterTaskInstanceMetadata(
        difficulty="medium",
        seed=777,
        num_trees_radius=4,
        num_cows_radius=2,
        num_hostiles_radius=1,
    )
    task = CrafterTaskInstance(
        id=uuid4(),
        impetus=Impetus(instructions="Test consistency"),
        intent=Intent(rubric={"goal": "Test"}, gold_trajectories=None, gold_state_diff={}),
        metadata=metadata,
        is_reproducible=True,
        initial_engine_snapshot=None,
    )
    env = CrafterClassicEnvironment(task)
    await env.initialize()

    # Perform various actions to create a complex state.
    # NOTE(review): the per-index labels below are the original author's;
    # verify them against the environment's action mapping.
    action_sequence = [
        5,  # do (gather resource)
        2,  # right
        5,  # do
        1,  # up
        5,  # do
        10,  # make_wood_pickaxe (if resources available)
    ]

    for action_idx in action_sequence:
        try:
            await env.step(EnvToolCall(tool="interact", args={"action": action_idx}))
        except Exception:
            # Best effort: some actions may fail if resources are not available.
            # Only Exception is caught so that KeyboardInterrupt, SystemExit and
            # asyncio.CancelledError still propagate instead of being swallowed.
            pass

    # Get current state details.
    original_pub = env.engine._get_public_state_from_env()
    original_priv = env.engine._get_private_state_from_env(0, False, False)
    original_total_reward = env.engine._total_reward

    # Serialize.
    snapshot = await env._serialize_engine()

    # Create new environment and deserialize.
    new_env = await CrafterClassicEnvironment._deserialize_engine(snapshot, task)
    restored_pub = new_env.engine._get_public_state_from_env()
    restored_priv = new_env.engine._get_private_state_from_env(0, False, False)

    # Check public state consistency.
    assert restored_pub.player_position == original_pub.player_position
    assert restored_pub.inventory == original_pub.inventory
    assert restored_pub.achievements_status == original_pub.achievements_status
    assert restored_pub.num_steps_taken == original_pub.num_steps_taken

    # Check private state consistency.
    assert restored_priv.player_internal_stats == original_priv.player_internal_stats
    assert new_env.engine._total_reward == original_total_reward

    # Verify we can continue playing from restored state.
    before_step = restored_pub.num_steps_taken
    await new_env.step(EnvToolCall(tool="interact", args={"action": 0}))
    after_pub = new_env.engine._get_public_state_from_env()
    assert after_pub.num_steps_taken == before_step + 1

    print("ā State consistency test passed")
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
@pytest.mark.asyncio
async def test_crafter_mcts_ready_serialization():
    """Exercise the snapshot/restore/score loop an MCTS planner would run.

    A root snapshot is stored in a trajectory tree; each of six actions is
    applied to a fresh restore of that root and the resulting state is scored
    with a simple heuristic. The best-scoring action is reported.
    """
    from synth_ai.environments.tasks.core import Impetus, Intent
    from synth_ai.environments.examples.crafter_classic.taskset import CrafterTaskInstanceMetadata
    from uuid import uuid4

    task_metadata = CrafterTaskInstanceMetadata(
        difficulty="easy",
        seed=999,
        num_trees_radius=6,
        num_cows_radius=3,
        num_hostiles_radius=0,
    )
    task = CrafterTaskInstance(
        id=uuid4(),
        impetus=Impetus(instructions="Test MCTS"),
        intent=Intent(rubric={"goal": "Test"}, gold_trajectories=None, gold_state_diff={}),
        metadata=task_metadata,
        is_reproducible=True,
        initial_engine_snapshot=None,
    )

    def heuristic_score(env: CrafterClassicEnvironment) -> float:
        """Score a state from achievements, survival stats and inventory."""
        public_state = env.engine._get_public_state_from_env()
        private_state = env.engine._get_private_state_from_env(0, False, False)
        stats = private_state.player_internal_stats

        # Base score.
        score = 10.0

        # Each unlocked achievement is worth 100 points.
        unlocked = len([flag for flag in public_state.achievements_status.values() if flag])
        score += unlocked * 100.0

        # Survival: more health is better; lower hunger/thirst readings are better.
        score += stats.get("health", 0) * 5.0
        score += (9 - stats.get("_hunger", 0)) * 2.0
        score += (9 - stats.get("_thirst", 0)) * 2.0

        # Inventory: tools and weapons outweigh generic items.
        for item_name, quantity in public_state.inventory.items():
            if "pickaxe" in item_name:
                weight = 20.0
            elif "sword" in item_name:
                weight = 15.0
            else:
                weight = 2.0
            score += quantity * weight

        return score

    env = CrafterClassicEnvironment(task)
    await env.initialize()
    initial_score = heuristic_score(env)

    with tempfile.TemporaryDirectory() as tmpdir:
        tree = TrajectoryTreeStore(FilesystemSnapshotStore(Path(tmpdir) / "mcts_test"))

        # Root node of the search tree.
        root_engine_snapshot = await env._serialize_engine()
        root_id = tree.add_root(gzip.compress(pickle.dumps(root_engine_snapshot)))

        action_scores: Dict[int, float] = {}
        for action_idx in range(6):
            # Restore from the stored root so every action starts from the same state.
            stored_root = pickle.loads(gzip.decompress(tree.load_snapshot_blob(root_id)))
            branch_env = await CrafterClassicEnvironment._deserialize_engine(stored_root, task)

            await branch_env.step(EnvToolCall(tool="interact", args={"action": action_idx}))
            action_scores[action_idx] = heuristic_score(branch_env)

    # Highest heuristic score wins; ties resolve to the first action inserted.
    best_action = max(action_scores, key=action_scores.get)

    print("ā MCTS-ready serialization test passed")
    print(f" Initial score: {initial_score:.2f}")
    print(f" Action scores: {action_scores}")
    print(f" Best action: {best_action} (score: {action_scores[best_action]:.2f})")
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
# Run all tests
|
|
444
|
+
if __name__ == "__main__":

    async def main():
        """Run each serialization test in order, printing a blank line after each."""
        separator = "=" * 50
        print("Running Crafter Serialization Tests...")
        print(separator)

        test_coroutines = (
            test_basic_crafter_serialization,
            test_crafter_tree_storage,
            test_crafter_state_consistency,
            test_crafter_mcts_ready_serialization,
        )
        for run_test in test_coroutines:
            await run_test()
            print()

        print(separator)
        print("š All tests passed!")

    asyncio.run(main())
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Browse existing Crafter evaluations and launch viewer for a selected run.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
import asyncio
|
|
11
|
+
from tabulate import tabulate
|
|
12
|
+
|
|
13
|
+
from src.synth_env.examples.crafter_classic.agent_demos.full_enchilada import (
|
|
14
|
+
set_current_eval_dir,
|
|
15
|
+
app,
|
|
16
|
+
)
|
|
17
|
+
from fastapi.staticfiles import StaticFiles
|
|
18
|
+
import uvicorn
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def list_evaluations(evals_dir: Path = Path("src/evals/crafter")):
    """List all available evaluations with summary info.

    Scans ``evals_dir`` for ``run_*`` sub-directories containing an
    ``evaluation_summary.json`` file and builds one entry per run. Runs are
    returned in reverse-sorted path order (newest first when the run ID embeds
    a timestamp such as ``run_20240115_143022``).

    A run whose summary cannot be read, is not valid JSON, or lacks the
    expected fields is skipped with a printed notice instead of aborting the
    whole listing, so one interrupted or corrupted run does not make every
    other run unbrowsable.

    Args:
        evals_dir: Base directory that holds the ``run_*`` directories.

    Returns:
        A list of dicts with the keys ``run_id``, ``timestamp``, ``models``,
        ``difficulties``, ``num_trajectories`` and ``path``. The list is empty
        when ``evals_dir`` does not exist or holds no usable summaries.
    """
    if not evals_dir.exists():
        print(f"No evaluations found at {evals_dir}")
        return []

    evaluations = []
    for run_dir in sorted(evals_dir.glob("run_*"), reverse=True):
        summary_file = run_dir / "evaluation_summary.json"
        if not run_dir.is_dir() or not summary_file.exists():
            continue

        try:
            with open(summary_file, "r", encoding="utf-8") as f:
                summary = json.load(f)

            metadata = summary["evaluation_metadata"]
            eval_info = {
                "run_id": run_dir.name,
                "timestamp": metadata["timestamp"],
                "models": ", ".join(summary["models_evaluated"]),
                "difficulties": ", ".join(summary["difficulties_evaluated"]),
                "num_trajectories": metadata["num_trajectories"],
                "path": run_dir,
            }
        except (OSError, ValueError, KeyError, TypeError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # KeyError/TypeError cover summaries with an unexpected layout.
            print(f"Skipping {run_dir.name}: unreadable evaluation summary ({exc!r})")
            continue

        evaluations.append(eval_info)

    return evaluations
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
async def view_evaluation(eval_dir: Path):
    """Launch viewer for a specific evaluation.

    Prints a message and returns without serving when ``eval_dir`` or its
    ``viewer`` sub-directory is missing. Otherwise registers ``eval_dir`` via
    ``set_current_eval_dir``, mounts the viewer directory as static files at
    ``/`` on the imported ``app`` and serves it with uvicorn until the server
    stops.

    Args:
        eval_dir: Directory of the evaluation run to display.
    """
    if not eval_dir.exists():
        print(f"Evaluation directory not found: {eval_dir}")
        return

    static_root = eval_dir / "viewer"
    if not static_root.exists():
        print(f"Viewer files not found in {eval_dir}")
        return

    for banner_line in (
        f"\nš Viewing evaluation: {eval_dir}",
        "š Launching viewer at http://localhost:8000",
        "   Press Ctrl+C to stop the viewer",
    ):
        print(banner_line)

    # Tell the viewer app which evaluation is active before exposing its files.
    set_current_eval_dir(eval_dir)
    app.mount("/", StaticFiles(directory=str(static_root), html=True), name="viewer")

    # NOTE(review): the server binds to 0.0.0.0 (all interfaces) although the
    # banner advertises localhost — confirm that network exposure is intended.
    await uvicorn.Server(
        uvicorn.Config(app, host="0.0.0.0", port=8000, log_level="error")
    ).serve()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
async def main():
    """Command-line entry point for browsing Crafter evaluations.

    Options:
        --eval-dir: Base directory for evaluations (default ``src/evals/crafter``).
        --run-id:   View the run with this ID inside ``--eval-dir``.
        --latest:   View the first entry returned by ``list_evaluations``.

    With neither ``--run-id`` nor ``--latest`` the available runs are printed
    as a table and the user is asked to pick one by number. The function
    returns without doing anything when ``list_evaluations`` yields no runs.
    """
    parser = argparse.ArgumentParser(description="Browse Crafter evaluations")
    parser.add_argument(
        "--eval-dir",
        type=str,
        default="src/evals/crafter",
        help="Base directory for evaluations",
    )
    parser.add_argument(
        "--run-id", type=str, help="Specific run ID to view (e.g., run_20240115_143022)"
    )
    parser.add_argument("--latest", action="store_true", help="View the latest evaluation")

    args = parser.parse_args()
    evals_dir = Path(args.eval_dir)

    # List evaluations
    evaluations = list_evaluations(evals_dir)

    if not evaluations:
        return

    # Display table of evaluations
    if not args.run_id and not args.latest:
        print("\nš Available Crafter Evaluations:")
        table_data = []
        for number, eval_info in enumerate(evaluations, start=1):
            # Parse timestamp for cleaner display; fall back to the raw value
            # when it is not an ISO-8601 string. Only parsing failures are
            # caught so Ctrl+C and SystemExit still propagate.
            try:
                ts = datetime.fromisoformat(eval_info["timestamp"])
                ts_str = ts.strftime("%Y-%m-%d %H:%M:%S")
            except (ValueError, TypeError):
                ts_str = eval_info["timestamp"]

            table_data.append(
                [
                    number,
                    eval_info["run_id"],
                    ts_str,
                    eval_info["models"],
                    eval_info["difficulties"],
                    eval_info["num_trajectories"],
                ]
            )

        headers = ["#", "Run ID", "Timestamp", "Models", "Difficulties", "Trajectories"]
        print(tabulate(table_data, headers=headers, tablefmt="grid"))

        # Ask user to select
        print("\nEnter the number of the evaluation to view (or 'q' to quit): ", end="")
        choice = input().strip()

        if choice.lower() == "q":
            return

        try:
            idx = int(choice) - 1
        except ValueError:
            print("Invalid input")
            return

        # The table is numbered from 1, so idx is the zero-based list position.
        if 0 <= idx < len(evaluations):
            await view_evaluation(evaluations[idx]["path"])
        else:
            print("Invalid selection")

    # View specific run
    elif args.run_id:
        await view_evaluation(evals_dir / args.run_id)

    # View latest (list_evaluations returns runs in reverse-sorted order)
    elif args.latest and evaluations:
        await view_evaluation(evaluations[0]["path"])


if __name__ == "__main__":
    asyncio.run(main())
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Script to run Crafter evaluation using the standardized eval framework
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import asyncio
|
|
7
|
+
import toml
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from src.synth_env.examples.crafter_classic.agent_demos.eval_framework import (
|
|
10
|
+
CrafterEvalFramework,
|
|
11
|
+
run_crafter_eval,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
async def main():
    """Run the Crafter evaluation configured in ``eval_config.toml``.

    The configuration file is looked up next to this script. Its
    ``[evaluation]`` table supplies ``models``, ``difficulties``,
    ``max_turns`` and ``trajectories_per_condition``, which are echoed to
    stdout and forwarded to ``run_crafter_eval``.

    Returns:
        Whatever ``run_crafter_eval`` returns.

    Raises:
        FileNotFoundError: If ``eval_config.toml`` does not exist.
    """
    config_path = Path(__file__).parent / "eval_config.toml"
    if not config_path.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    settings = toml.load(config_path)["evaluation"]
    models = settings["models"]
    difficulties = settings["difficulties"]
    max_turns = settings["max_turns"]
    n_trajectories = settings["trajectories_per_condition"]

    rule = "=" * 60
    for line in (
        "šÆ Crafter Multi-Action Model Comparison (Eval Framework)",
        rule,
        f"Models: {', '.join(models)}",
        f"Difficulties: {', '.join(difficulties)}",
        f"Max turns: {max_turns}",
        f"Trajectories per condition: {n_trajectories}",
        rule,
    ):
        print(line)

    # The framework prints its own detailed reports while it runs.
    results = await run_crafter_eval(
        model_names=models,
        difficulties=difficulties,
        num_trajectories=n_trajectories,
        max_turns=max_turns,
    )

    print("\nš Evaluation completed!")
    return results


if __name__ == "__main__":
    asyncio.run(main())
|