synth-ai 0.2.4.dev8__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synth-ai might be problematic.
- synth_ai/cli/__init__.py +6 -0
- synth_ai/cli/demo.py +68 -9
- synth_ai/cli/rl_demo.py +137 -0
- synth_ai/cli/root.py +65 -0
- synth_ai/demos/core/__init__.py +1 -0
- synth_ai/demos/core/cli.py +621 -0
- synth_ai/demos/demo_task_apps/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/core.py +374 -0
- synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/math/app.py +37 -0
- synth_ai/demos/demo_task_apps/math/config.toml +44 -0
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
- synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
- synth_ai/environments/examples/bandit/__init__.py +33 -0
- synth_ai/environments/examples/bandit/engine.py +294 -0
- synth_ai/environments/examples/bandit/environment.py +194 -0
- synth_ai/environments/examples/bandit/taskset.py +200 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
- synth_ai/environments/examples/crafter_classic/environment.py +41 -2
- synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
- synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
- synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
- synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
- synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
- synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
- synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/red/units/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
- synth_ai/environments/service/app.py +8 -0
- synth_ai/install_sqld.sh +40 -0
- synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +110 -11
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
- synth_ai-0.2.4.dev8.dist-info/METADATA +0 -635
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
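To reproduce a file-level comparison like the listing above locally, the two wheels can be inspected directly; a wheel is an ordinary zip archive, so the standard library is enough. The sketch below is illustrative only and assumes both wheel files have already been downloaded into the working directory (the filenames are the ones from this release pair); comparing the CRC-32 checksums stored in the archive index flags changed files without extracting anything.

# Illustrative sketch: list files added, removed, or changed between two wheels.
# Assumes both .whl files are present locally; a wheel is a zip archive.
import zipfile

OLD = "synth_ai-0.2.4.dev8-py3-none-any.whl"
NEW = "synth_ai-0.2.4.dev9-py3-none-any.whl"

def wheel_index(path: str) -> dict:
    """Map each archive member to its stored CRC-32 checksum."""
    with zipfile.ZipFile(path) as zf:
        return {info.filename: info.CRC for info in zf.infolist()}

old_files = wheel_index(OLD)
new_files = wheel_index(NEW)

for name in sorted(new_files.keys() - old_files.keys()):
    print(f"added:   {name}")
for name in sorted(old_files.keys() - new_files.keys()):
    print(f"removed: {name}")
for name in sorted(old_files.keys() & new_files.keys()):
    if old_files[name] != new_files[name]:
        print(f"changed: {name}")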
synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py ADDED

@@ -0,0 +1,216 @@
#!/usr/bin/env python3
"""
Simple example demonstrating how to use tracing_v3 with Crafter.
This shows the basic pattern for converting v2 code to v3.
"""

import asyncio
import json
import sys
import time
from datetime import datetime
from pathlib import Path

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent.parent.parent))

# Import v3 tracing components
# Import Crafter hooks for v3
from synth_ai.environments.examples.crafter_classic.trace_hooks_v3 import CRAFTER_HOOKS

# Import LM
from synth_ai.lm.core.main_v2 import LM
from synth_ai.tracing_v3.abstractions import (
    EnvironmentEvent,
    LMCAISEvent,
    RuntimeEvent,
    SessionEventMarkovBlanketMessage,
    TimeRecord,
)
from synth_ai.tracing_v3.decorators import set_session_id, set_turn_number
from synth_ai.tracing_v3.session_tracer import SessionTracer
from synth_ai.tracing_v3.turso.manager import AsyncSQLTraceManager


async def simple_crafter_session():
    """Run a simple Crafter session with v3 tracing."""

    # 1. Create session tracer with hooks
    tracer = SessionTracer(
        hooks=CRAFTER_HOOKS,
        db_url="sqlite+libsql://http://127.0.0.1:8080", # Turso URL
        auto_save=True
    )

    # 2. Start a session
    session_id = await tracer.start_session(
        metadata={
            "experiment": "v3_example",
            "model": "gpt-4o-mini",
            "difficulty": "easy"
        }
    )
    print(f"Started session: {session_id}")

    # 3. Simulate a few game turns
    for turn in range(5):
        # Start timestep
        await tracer.start_timestep(f"turn_{turn}", turn_number=turn)

        # Record observation message
        observation = {
            "inventory": {"wood": turn, "stone": 0},
            "nearby": ["tree", "stone"],
            "status": {"health": 9, "food": 8 - turn}
        }

        await tracer.record_message(
            content=json.dumps(observation),
            message_type="observation",
            metadata={"source": "environment"}
        )

        # Record LM event (simulated)
        lm_event = LMCAISEvent(
            system_instance_id="crafter_agent",
            time_record=TimeRecord(
                event_time=time.time(),
                message_time=turn
            ),
            model_name="gpt-4o-mini",
            provider="openai",
            input_tokens=100 + turn * 10,
            output_tokens=20,
            total_tokens=120 + turn * 10,
            cost_usd=0.001 * (turn + 1),
            latency_ms=100 + turn * 50
        )
        await tracer.record_event(lm_event)

        # Record action
        action = "collect_wood" if turn % 2 == 0 else "move_right"
        await tracer.record_message(
            content=action,
            message_type="action",
            metadata={"source": "agent"}
        )

        # Record runtime event
        runtime_event = RuntimeEvent(
            system_instance_id="crafter_env",
            time_record=TimeRecord(
                event_time=time.time(),
                message_time=turn
            ),
            actions=[5 if turn % 2 == 0 else 2], # action IDs
            metadata={
                "action_name": action,
                "valid": True
            }
        )
        await tracer.record_event(runtime_event)

        # Record environment event with achievements
        achievements_before = {"collect_wood": turn > 0}
        achievements_after = {"collect_wood": True} if action == "collect_wood" else achievements_before

        env_event = EnvironmentEvent(
            system_instance_id="crafter_env",
            time_record=TimeRecord(
                event_time=time.time(),
                message_time=turn
            ),
            reward=1.0 if action == "collect_wood" else 0.0,
            terminated=False,
            system_state_before={
                "public_state": {"achievements_status": achievements_before}
            },
            system_state_after={
                "public_state": {"achievements_status": achievements_after}
            }
        )
        await tracer.record_event(env_event)

        # End timestep
        await tracer.end_timestep()

        print(f"Completed turn {turn}")

    # 4. End session (auto-saves to database)
    trace = await tracer.end_session()
    print(f"Session ended. Total events: {len(trace.event_history)}")

    # 5. Query the saved data
    db_manager = AsyncSQLTraceManager("sqlite+libsql://http://127.0.0.1:8080")
    await db_manager.initialize()

    # Get session data
    session_data = await db_manager.get_session_trace(session_id)
    if session_data:
        print(f"\nRetrieved session from database:")
        print(f" Session ID: {session_data['session_id']}")
        print(f" Timesteps: {session_data['num_timesteps']}")
        print(f" Events: {session_data['num_events']}")
        print(f" Messages: {session_data['num_messages']}")

    # Query model usage
    model_usage = await db_manager.get_model_usage()
    print(f"\nModel usage statistics:")
    print(model_usage)

    await db_manager.close()
    await tracer.close()


async def context_manager_example():
    """Example using context managers for cleaner code."""

    tracer = SessionTracer(
        hooks=CRAFTER_HOOKS,
        db_url="sqlite+libsql://http://127.0.0.1:8080"
    )

    # Use context managers for automatic cleanup
    async with tracer.session(metadata={"example": "context_manager"}) as session_id:
        print(f"In session: {session_id}")

        async with tracer.timestep("step_1", turn_number=0) as step:
            print(f"In timestep: {step.step_id}")

            # Record some events
            await tracer.record_message(
                content="Hello from context manager",
                message_type="user"
            )

            event = RuntimeEvent(
                system_instance_id="example_system",
                time_record=TimeRecord(event_time=time.time()),
                actions=[1],
                metadata={"example": True}
            )
            await tracer.record_event(event)

    print("Session automatically ended and saved")
    await tracer.close()


async def main():
    """Run the examples."""
    print("=== V3 Tracing Examples ===\n")

    print("1. Running simple Crafter session...")
    await simple_crafter_session()

    print("\n2. Running context manager example...")
    await context_manager_example()

    print("\n✅ Examples completed!")


if __name__ == "__main__":
    # Make sure sqld is running on port 8080
    print("Note: This example assumes sqld is running on http://127.0.0.1:8080")
    print("Start it with: sqld --http-listen 127.0.0.1:8080\n")

    asyncio.run(main())
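The example above writes traces through a local sqld endpoint at http://127.0.0.1:8080, as its closing note says. The pre-flight check below is a minimal sketch and not part of the package: it only confirms that something is listening on that host and port before the example is launched.

# Illustrative pre-flight check (not part of synth-ai): verify that a local
# sqld instance is answering on 127.0.0.1:8080 before running the example.
import socket
import sys

def sqld_reachable(host: str = "127.0.0.1", port: int = 8080, timeout: float = 2.0) -> bool:
    """Return True if a TCP connection to host:port succeeds within the timeout."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

if __name__ == "__main__":
    if not sqld_reachable():
        sys.exit("sqld does not appear to be running; start it with: sqld --http-listen 127.0.0.1:8080")
    print("sqld is reachable; OK to run example_v3_usage.py")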
synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py ADDED

@@ -0,0 +1,296 @@
#!/usr/bin/env python3
"""
Script to compare traces between OpenAI direct API and LM class implementations.
Runs both versions and compares the captured events.
"""

import asyncio
import json
import subprocess
import sys
import os
from pathlib import Path
from typing import Dict, List, Any, Tuple

# Colors for output
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
RESET = '\033[0m'


def run_openai_version(model: str = "gpt-4o-mini", episodes: int = 1, max_turns: int = 2) -> Tuple[bool, str]:
    """Run the OpenAI version and return trace directory."""
    print(f"{BLUE}Running OpenAI version...{RESET}")

    cmd = [
        sys.executable,
        "test_crafter_react_agent_openai.py",
        "--model", model,
        "--episodes", str(episodes),
        "--max-turns", str(max_turns)
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, cwd=Path(__file__).parent)
        if result.returncode == 0:
            print(f"{GREEN}✓ OpenAI version completed successfully{RESET}")
            # Extract trace directory from output
            for line in result.stdout.split('\n'):
                if "Saved trace to" in line:
                    trace_path = line.split("Saved trace to")[-1].strip()
                    trace_dir = Path(trace_path).parent
                    return True, str(trace_dir)
            return True, "./traces"
        else:
            print(f"{RED}✗ OpenAI version failed{RESET}")
            print(result.stderr)
            return False, ""
    except Exception as e:
        print(f"{RED}✗ Error running OpenAI version: {e}{RESET}")
        return False, ""


def run_lm_version(model: str = "gpt-4o-mini", episodes: int = 1, max_turns: int = 2) -> Tuple[bool, str]:
    """Run the LM class version and return trace directory."""
    print(f"{BLUE}Running LM version...{RESET}")

    cmd = [
        sys.executable,
        "test_crafter_react_agent_lm.py",
        "--model", model,
        "--episodes", str(episodes),
        "--max-turns", str(max_turns)
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, cwd=Path(__file__).parent)
        if result.returncode == 0:
            print(f"{GREEN}✓ LM version completed successfully{RESET}")
            # Extract trace directory from output
            for line in result.stdout.split('\n'):
                if "Saved trace to" in line:
                    trace_path = line.split("Saved trace to")[-1].strip()
                    trace_dir = Path(trace_path).parent
                    return True, str(trace_dir)
            return True, "./traces_v2_lm"
        else:
            print(f"{RED}✗ LM version failed{RESET}")
            print(result.stderr)
            return False, ""
    except Exception as e:
        print(f"{RED}✗ Error running LM version: {e}{RESET}")
        return False, ""


def load_trace(trace_file: Path) -> Dict[str, Any]:
    """Load a trace file."""
    try:
        with open(trace_file, 'r') as f:
            return json.load(f)
    except Exception as e:
        print(f"{RED}Error loading trace {trace_file}: {e}{RESET}")
        return {}


def extract_cais_events(trace_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Extract CAISEvents from trace data."""
    events = trace_data.get('event_history', [])
    return [e for e in events if e.get('system_instance_id', '').startswith('crafter-react-agent')]


def extract_env_events(trace_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Extract EnvironmentEvents from trace data."""
    events = trace_data.get('event_history', [])
    return [e for e in events if 'reward' in e and not e.get('system_instance_id', '').startswith('crafter-react-agent')]


def compare_events(events1: List[Dict], events2: List[Dict], event_type: str) -> bool:
    """Compare two lists of events and report differences."""
    print(f"\n{BLUE}Comparing {event_type}...{RESET}")

    if len(events1) != len(events2):
        print(f"{RED}✗ Different number of events: {len(events1)} vs {len(events2)}{RESET}")
        return False

    all_match = True
    for i, (e1, e2) in enumerate(zip(events1, events2)):
        print(f"\n Event {i+1}:")

        # Compare key fields
        fields_to_compare = {
            'CAISEvent': ['system_instance_id', 'llm_call_records', 'metadata'],
            'EnvironmentEvent': ['reward', 'done', 'info']
        }

        for field in fields_to_compare.get(event_type, []):
            if field in e1 and field in e2:
                # Special handling for llm_call_records
                if field == 'llm_call_records':
                    if len(e1[field]) != len(e2[field]):
                        print(f" {RED}✗ Different number of LLM calls{RESET}")
                        all_match = False
                    else:
                        # Compare model and tool calls
                        for j, (llm1, llm2) in enumerate(zip(e1[field], e2[field])):
                            model1 = llm1.get('model', '')
                            model2 = llm2.get('model', '')
                            if model1 != model2:
                                print(f" {RED}✗ Different models: {model1} vs {model2}{RESET}")
                                all_match = False
                            else:
                                print(f" {GREEN}✓ Model matches: {model1}{RESET}")

                            # Check tool calls
                            resp1 = llm1.get('response', {})
                            resp2 = llm2.get('response', {})
                            tools1 = extract_tool_calls(resp1)
                            tools2 = extract_tool_calls(resp2)

                            if tools1 != tools2:
                                print(f" {YELLOW}⚠ Different tool calls: {tools1} vs {tools2}{RESET}")
                                # This might be OK due to LLM non-determinism
                elif field == 'metadata':
                    # Compare token counts if available
                    tokens1 = {k: v for k, v in e1[field].items() if 'token' in k}
                    tokens2 = {k: v for k, v in e2[field].items() if 'token' in k}
                    if tokens1 and tokens2:
                        print(f" Tokens (OpenAI): {tokens1}")
                        print(f" Tokens (LM): {tokens2}")
                else:
                    if e1[field] == e2[field]:
                        print(f" {GREEN}✓ {field} matches{RESET}")
                    else:
                        print(f" {RED}✗ {field} differs: {e1[field]} vs {e2[field]}{RESET}")
                        all_match = False

    return all_match


def extract_tool_calls(response: Dict[str, Any]) -> List[str]:
    """Extract tool call names from response."""
    tool_names = []
    choices = response.get('choices', [])
    if choices and isinstance(choices[0], dict):
        tool_calls = choices[0].get('message', {}).get('tool_calls', [])
        for tc in tool_calls:
            tool_names.append(tc.get('function', {}).get('name', 'unknown'))
    return tool_names


def compare_traces(openai_dir: Path, lm_dir: Path, episode: int = 0) -> bool:
    """Compare traces from both versions."""
    print(f"\n{BLUE}{'='*60}{RESET}")
    print(f"{BLUE}Comparing traces for episode {episode}{RESET}")
    print(f"{BLUE}{'='*60}{RESET}")

    # Load traces - OpenAI uses different naming convention
    # Look for most recent session files
    openai_files = list(openai_dir.glob(f"session_episode_{episode}_*.json"))
    if openai_files:
        # Get the most recent one
        openai_trace_file = max(openai_files, key=lambda f: f.stat().st_mtime)
    else:
        openai_trace_file = openai_dir / f"trace_episode_{episode}.json"

    lm_trace_file = lm_dir / f"trace_episode_{episode}.json"

    if not openai_trace_file.exists():
        print(f"{RED}✗ OpenAI trace not found: {openai_trace_file}{RESET}")
        return False

    if not lm_trace_file.exists():
        print(f"{RED}✗ LM trace not found: {lm_trace_file}{RESET}")
        return False

    openai_trace = load_trace(openai_trace_file)
    lm_trace = load_trace(lm_trace_file)

    # Extract events
    openai_cais = extract_cais_events(openai_trace)
    lm_cais = extract_cais_events(lm_trace)

    openai_env = extract_env_events(openai_trace)
    lm_env = extract_env_events(lm_trace)

    # Compare
    cais_match = compare_events(openai_cais, lm_cais, "CAISEvent")
    env_match = compare_events(openai_env, lm_env, "EnvironmentEvent")

    # Check messages
    print(f"\n{BLUE}Comparing messages...{RESET}")
    openai_msgs = openai_trace.get('message_history', [])
    lm_msgs = lm_trace.get('message_history', [])

    if len(openai_msgs) != len(lm_msgs):
        print(f"{RED}✗ Different number of messages: {len(openai_msgs)} vs {len(lm_msgs)}{RESET}")
    else:
        print(f"{GREEN}✓ Same number of messages: {len(openai_msgs)}{RESET}")

    return cais_match and env_match


async def main():
    """Run both versions and compare traces."""
    print(f"{BLUE}{'='*60}{RESET}")
    print(f"{BLUE}Crafter Trace Comparison: OpenAI vs LM{RESET}")
    print(f"{BLUE}{'='*60}{RESET}")

    # Configuration
    model = "gpt-4o-mini"
    episodes = 2
    max_turns = 3

    print(f"\nConfiguration:")
    print(f" Model: {model}")
    print(f" Episodes: {episodes}")
    print(f" Max turns: {max_turns}")

    # Run OpenAI version
    openai_success, openai_dir = run_openai_version(model, episodes, max_turns)
    if not openai_success:
        print(f"{RED}Failed to run OpenAI version{RESET}")
        return

    # Small delay to avoid rate limits
    await asyncio.sleep(2)

    # Run LM version
    lm_success, lm_dir = run_lm_version(model, episodes, max_turns)
    if not lm_success:
        print(f"{RED}Failed to run LM version{RESET}")
        return

    # Compare traces
    print(f"\n{BLUE}Trace directories:{RESET}")
    print(f" OpenAI: {openai_dir}")
    print(f" LM: {lm_dir}")

    all_match = True
    for episode in range(episodes):
        match = compare_traces(Path(openai_dir), Path(lm_dir), episode)
        all_match = all_match and match

    # Final verdict
    print(f"\n{BLUE}{'='*60}{RESET}")
    print(f"{BLUE}FINAL VERDICT{RESET}")
    print(f"{BLUE}{'='*60}{RESET}")

    if all_match:
        print(f"{GREEN}✅ Traces match! LM class produces equivalent v2 traces.{RESET}")
    else:
        print(f"{YELLOW}⚠️ Some differences found. This may be due to:{RESET}")
        print(f"{YELLOW} - LLM non-determinism (different responses){RESET}")
        print(f"{YELLOW} - Minor implementation differences{RESET}")
        print(f"{YELLOW} - Timing variations{RESET}")

    print(f"\nKey observations:")
    print(f" • Both versions create CAISEvents with LLM call records")
    print(f" • Both capture environment events and observations")
    print(f" • Token counts and metadata are preserved")
    print(f" • The LM class successfully replaces direct OpenAI calls")


if __name__ == "__main__":
    asyncio.run(main())
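For readers unfamiliar with the OpenAI chat-completions payload shape that extract_tool_calls above walks, the self-contained sketch below repeats the same logic against a hand-built response dict; the tool name and arguments in the sample are invented for illustration.

# Illustration only: run the same extraction logic as extract_tool_calls()
# above against a hand-built, OpenAI-style response dict.
from typing import Any, Dict, List

def extract_tool_calls(response: Dict[str, Any]) -> List[str]:
    """Collect tool-call function names from a chat-completions response dict."""
    tool_names = []
    choices = response.get('choices', [])
    if choices and isinstance(choices[0], dict):
        tool_calls = choices[0].get('message', {}).get('tool_calls', [])
        for tc in tool_calls:
            tool_names.append(tc.get('function', {}).get('name', 'unknown'))
    return tool_names

sample_response = {
    "choices": [
        {
            "message": {
                "tool_calls": [
                    {"function": {"name": "interact", "arguments": "{\"action\": \"collect_wood\"}"}}
                ]
            }
        }
    ]
}

print(extract_tool_calls(sample_response))  # -> ['interact']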
synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py ADDED

@@ -0,0 +1,58 @@
#!/usr/bin/env python3
"""
Run script for Full Enchilada Crafter Evaluation
"""

import asyncio
import argparse
from src.synth_env.examples.crafter_classic.agent_demos.full_enchilada import (
    run_full_enchilada_eval,
)


async def main():
    parser = argparse.ArgumentParser(description="Run Full Enchilada Crafter Evaluation")
    parser.add_argument(
        "--models", nargs="+", default=["gpt-4o-mini"], help="Model names to evaluate"
    )
    parser.add_argument(
        "--difficulties",
        nargs="+",
        default=["easy", "hard"],
        help="Difficulty levels to test",
    )
    parser.add_argument(
        "--num-trajectories",
        type=int,
        default=3,
        help="Number of trajectories per condition",
    )
    parser.add_argument("--max-turns", type=int, default=30, help="Maximum turns per trajectory")
    parser.add_argument("--no-images", action="store_true", help="Disable image capture")
    parser.add_argument(
        "--no-viewer",
        action="store_true",
        help="Don't launch the viewer after evaluation",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Output directory (default: src/evals/crafter/run_TIMESTAMP)",
    )

    args = parser.parse_args()

    await run_full_enchilada_eval(
        model_names=args.models,
        difficulties=args.difficulties,
        num_trajectories=args.num_trajectories,
        max_turns=args.max_turns,
        capture_images=not args.no_images,
        launch_viewer=not args.no_viewer,
        output_dir=args.output_dir,
    )


if __name__ == "__main__":
    asyncio.run(main())
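The script above is only an argparse front end over run_full_enchilada_eval. The sketch below shows the equivalent direct call with the same defaults, reusing the import path from the file itself; that path references a source-tree layout, so it may not resolve from the installed wheel.

# Illustrative sketch: call the evaluation coroutine directly with the same
# defaults the CLI wrapper above would pass. The import path is the one the
# file itself uses and assumes a matching source checkout.
import asyncio

from src.synth_env.examples.crafter_classic.agent_demos.full_enchilada import (
    run_full_enchilada_eval,
)

asyncio.run(
    run_full_enchilada_eval(
        model_names=["gpt-4o-mini"],
        difficulties=["easy", "hard"],
        num_trajectories=3,
        max_turns=30,
        capture_images=True,
        launch_viewer=False,
        output_dir=None,
    )
)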