synth-ai 0.2.4.dev8__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (111) hide show
  1. synth_ai/cli/__init__.py +6 -0
  2. synth_ai/cli/demo.py +68 -9
  3. synth_ai/cli/rl_demo.py +137 -0
  4. synth_ai/cli/root.py +65 -0
  5. synth_ai/demos/core/__init__.py +1 -0
  6. synth_ai/demos/core/cli.py +621 -0
  7. synth_ai/demos/demo_task_apps/__init__.py +1 -0
  8. synth_ai/demos/demo_task_apps/core.py +374 -0
  9. synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
  10. synth_ai/demos/demo_task_apps/math/app.py +37 -0
  11. synth_ai/demos/demo_task_apps/math/config.toml +44 -0
  12. synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
  13. synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
  14. synth_ai/environments/examples/bandit/__init__.py +33 -0
  15. synth_ai/environments/examples/bandit/engine.py +294 -0
  16. synth_ai/environments/examples/bandit/environment.py +194 -0
  17. synth_ai/environments/examples/bandit/taskset.py +200 -0
  18. synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
  19. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
  20. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
  21. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
  22. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
  23. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
  24. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
  25. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
  26. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
  27. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
  28. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
  29. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
  30. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
  31. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
  32. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
  33. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
  34. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
  35. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
  36. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
  37. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
  38. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
  39. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
  40. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
  41. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
  42. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
  43. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
  44. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
  45. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
  46. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
  47. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
  48. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
  49. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
  50. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
  51. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
  52. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
  53. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
  54. synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
  55. synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
  56. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
  57. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
  58. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
  59. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
  60. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
  61. synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
  62. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
  63. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
  64. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
  65. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
  66. synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
  67. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
  68. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
  69. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
  70. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
  71. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
  72. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
  73. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
  74. synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
  75. synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
  76. synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
  77. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
  78. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
  79. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
  80. synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
  81. synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
  82. synth_ai/environments/examples/crafter_classic/environment.py +41 -2
  83. synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
  84. synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
  85. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
  86. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
  87. synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
  88. synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
  89. synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
  90. synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
  91. synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
  92. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
  93. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
  94. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
  95. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
  96. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
  97. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
  98. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
  99. synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
  100. synth_ai/environments/examples/red/units/__init__.py +1 -0
  101. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
  102. synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
  103. synth_ai/environments/service/app.py +8 -0
  104. synth_ai/install_sqld.sh +40 -0
  105. synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
  106. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +110 -11
  107. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
  108. synth_ai-0.2.4.dev8.dist-info/METADATA +0 -635
  109. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
  110. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
  111. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,216 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple example demonstrating how to use tracing_v3 with Crafter.
4
+ This shows the basic pattern for converting v2 code to v3.
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import sys
10
+ import time
11
+ from datetime import datetime
12
+ from pathlib import Path
13
+
14
+ # Add parent directory to path
15
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent.parent.parent))
16
+
17
+ # Import v3 tracing components
18
+ # Import Crafter hooks for v3
19
+ from synth_ai.environments.examples.crafter_classic.trace_hooks_v3 import CRAFTER_HOOKS
20
+
21
+ # Import LM
22
+ from synth_ai.lm.core.main_v2 import LM
23
+ from synth_ai.tracing_v3.abstractions import (
24
+ EnvironmentEvent,
25
+ LMCAISEvent,
26
+ RuntimeEvent,
27
+ SessionEventMarkovBlanketMessage,
28
+ TimeRecord,
29
+ )
30
+ from synth_ai.tracing_v3.decorators import set_session_id, set_turn_number
31
+ from synth_ai.tracing_v3.session_tracer import SessionTracer
32
+ from synth_ai.tracing_v3.turso.manager import AsyncSQLTraceManager
33
+
34
+
35
async def simple_crafter_session():
    """Run a simple Crafter session with v3 tracing.

    Simulates five game turns. Each turn records, in order, an observation
    message, a simulated LM event, an action message, a runtime event and an
    environment event on a ``SessionTracer``. After the session ends, the
    stored session and the model-usage statistics are read back through an
    ``AsyncSQLTraceManager`` and printed.

    The tracer and the database manager are closed in ``finally`` blocks so
    that ``close()`` is still awaited when a step fails part-way through.
    """
    # Single definition of the database URL shared by the tracer and the reader.
    db_url = "sqlite+libsql://http://127.0.0.1:8080"  # Turso URL

    # 1. Create session tracer with hooks
    tracer = SessionTracer(
        hooks=CRAFTER_HOOKS,
        db_url=db_url,
        auto_save=True,
    )

    try:
        # 2. Start a session
        session_id = await tracer.start_session(
            metadata={
                "experiment": "v3_example",
                "model": "gpt-4o-mini",
                "difficulty": "easy",
            }
        )
        print(f"Started session: {session_id}")

        # 3. Simulate a few game turns
        for turn in range(5):
            # Start timestep
            await tracer.start_timestep(f"turn_{turn}", turn_number=turn)

            # Record observation message
            observation = {
                "inventory": {"wood": turn, "stone": 0},
                "nearby": ["tree", "stone"],
                "status": {"health": 9, "food": 8 - turn},
            }

            await tracer.record_message(
                content=json.dumps(observation),
                message_type="observation",
                metadata={"source": "environment"},
            )

            # Record LM event (simulated)
            lm_event = LMCAISEvent(
                system_instance_id="crafter_agent",
                time_record=TimeRecord(
                    event_time=time.time(),
                    message_time=turn,
                ),
                model_name="gpt-4o-mini",
                provider="openai",
                input_tokens=100 + turn * 10,
                output_tokens=20,
                total_tokens=120 + turn * 10,
                cost_usd=0.001 * (turn + 1),
                latency_ms=100 + turn * 50,
            )
            await tracer.record_event(lm_event)

            # Record action: even turns collect wood, odd turns move right
            action = "collect_wood" if turn % 2 == 0 else "move_right"
            await tracer.record_message(
                content=action,
                message_type="action",
                metadata={"source": "agent"},
            )

            # Record runtime event
            runtime_event = RuntimeEvent(
                system_instance_id="crafter_env",
                time_record=TimeRecord(
                    event_time=time.time(),
                    message_time=turn,
                ),
                actions=[5 if turn % 2 == 0 else 2],  # action IDs
                metadata={
                    "action_name": action,
                    "valid": True,
                },
            )
            await tracer.record_event(runtime_event)

            # Record environment event with achievements
            achievements_before = {"collect_wood": turn > 0}
            achievements_after = {"collect_wood": True} if action == "collect_wood" else achievements_before

            env_event = EnvironmentEvent(
                system_instance_id="crafter_env",
                time_record=TimeRecord(
                    event_time=time.time(),
                    message_time=turn,
                ),
                reward=1.0 if action == "collect_wood" else 0.0,
                terminated=False,
                system_state_before={
                    "public_state": {"achievements_status": achievements_before}
                },
                system_state_after={
                    "public_state": {"achievements_status": achievements_after}
                },
            )
            await tracer.record_event(env_event)

            # End timestep
            await tracer.end_timestep()

            print(f"Completed turn {turn}")

        # 4. End session (the tracer was created with auto_save=True)
        trace = await tracer.end_session()
        print(f"Session ended. Total events: {len(trace.event_history)}")

        # 5. Query the saved data
        db_manager = AsyncSQLTraceManager(db_url)
        await db_manager.initialize()
        try:
            # Get session data
            session_data = await db_manager.get_session_trace(session_id)
            if session_data:
                print("\nRetrieved session from database:")
                print(f" Session ID: {session_data['session_id']}")
                print(f" Timesteps: {session_data['num_timesteps']}")
                print(f" Events: {session_data['num_events']}")
                print(f" Messages: {session_data['num_messages']}")

            # Query model usage
            model_usage = await db_manager.get_model_usage()
            print("\nModel usage statistics:")
            print(model_usage)
        finally:
            # Close the reader before the tracer, as the original code did.
            await db_manager.close()
    finally:
        await tracer.close()
163
+
164
+
165
async def context_manager_example():
    """Example using context managers for cleaner code.

    Opens a session and a single timestep through the tracer's async context
    managers, records one message and one runtime event inside the timestep,
    and then closes the tracer. ``tracer.close()`` is awaited in a ``finally``
    block so it still runs when something inside the session raises.
    """
    tracer = SessionTracer(
        hooks=CRAFTER_HOOKS,
        db_url="sqlite+libsql://http://127.0.0.1:8080",
    )

    try:
        # Use context managers for automatic cleanup
        async with tracer.session(metadata={"example": "context_manager"}) as session_id:
            print(f"In session: {session_id}")

            async with tracer.timestep("step_1", turn_number=0) as step:
                print(f"In timestep: {step.step_id}")

                # Record some events
                await tracer.record_message(
                    content="Hello from context manager",
                    message_type="user",
                )

                event = RuntimeEvent(
                    system_instance_id="example_system",
                    time_record=TimeRecord(event_time=time.time()),
                    actions=[1],
                    metadata={"example": True},
                )
                await tracer.record_event(event)

        print("Session automatically ended and saved")
    finally:
        await tracer.close()
196
+
197
+
198
async def main():
    """Run both tracing examples one after the other, announcing each."""
    print("=== V3 Tracing Examples ===\n")

    # (banner, coroutine function) pairs, executed strictly in order.
    examples = (
        ("1. Running simple Crafter session...", simple_crafter_session),
        ("\n2. Running context manager example...", context_manager_example),
    )
    for banner, run_example in examples:
        print(banner)
        await run_example()

    print("\n✅ Examples completed!")
209
+
210
+
211
if __name__ == "__main__":
    # The examples connect to sqld on port 8080; remind the user before running.
    for notice in (
        "Note: This example assumes sqld is running on http://127.0.0.1:8080",
        "Start it with: sqld --http-listen 127.0.0.1:8080\n",
    ):
        print(notice)

    asyncio.run(main())
@@ -0,0 +1,296 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to compare traces between OpenAI direct API and LM class implementations.
4
+ Runs both versions and compares the captured events.
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import subprocess
10
+ import sys
11
+ import os
12
+ from pathlib import Path
13
+ from typing import Dict, List, Any, Tuple
14
+
15
+ # Colors for output
16
+ GREEN = '\033[92m'
17
+ RED = '\033[91m'
18
+ YELLOW = '\033[93m'
19
+ BLUE = '\033[94m'
20
+ RESET = '\033[0m'
21
+
22
+
23
def run_openai_version(model: str = "gpt-4o-mini", episodes: int = 1, max_turns: int = 2) -> Tuple[bool, str]:
    """Run the OpenAI version in a subprocess and return its trace directory.

    Args:
        model: Model name passed to the child script via ``--model``.
        episodes: Value passed via ``--episodes``.
        max_turns: Value passed via ``--max-turns``.

    Returns:
        ``(True, trace_dir)`` when the child exits with status 0, where
        ``trace_dir`` is the parent of the path on the first stdout line
        containing "Saved trace to", or the ``traces`` directory when no such
        line is printed. ``(False, "")`` when the child fails or cannot be
        started.

    The child runs with this script's directory as its working directory, so
    a relative trace path is resolved against that directory. Returning it
    unresolved would make the later lookup depend on the caller's cwd.
    """
    print(f"{BLUE}Running OpenAI version...{RESET}")

    script_dir = Path(__file__).resolve().parent

    cmd = [
        sys.executable,
        "test_crafter_react_agent_openai.py",
        "--model", model,
        "--episodes", str(episodes),
        "--max-turns", str(max_turns),
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, cwd=script_dir)
    except Exception as e:
        print(f"{RED}✗ Error running OpenAI version: {e}{RESET}")
        return False, ""

    if result.returncode != 0:
        print(f"{RED}✗ OpenAI version failed{RESET}")
        print(result.stderr)
        return False, ""

    print(f"{GREEN}✓ OpenAI version completed successfully{RESET}")

    # Extract trace directory from output; fall back to the default location.
    trace_dir = Path("./traces")
    for line in result.stdout.splitlines():
        if "Saved trace to" in line:
            trace_path = line.split("Saved trace to")[-1].strip()
            trace_dir = Path(trace_path).parent
            break

    # Joining leaves an absolute trace_dir untouched and anchors a relative one.
    return True, str(script_dir / trace_dir)
53
+
54
+
55
def run_lm_version(model: str = "gpt-4o-mini", episodes: int = 1, max_turns: int = 2) -> Tuple[bool, str]:
    """Run the LM class version in a subprocess and return its trace directory.

    Args:
        model: Model name passed to the child script via ``--model``.
        episodes: Value passed via ``--episodes``.
        max_turns: Value passed via ``--max-turns``.

    Returns:
        ``(True, trace_dir)`` when the child exits with status 0, where
        ``trace_dir`` is the parent of the path on the first stdout line
        containing "Saved trace to", or the ``traces_v2_lm`` directory when no
        such line is printed. ``(False, "")`` when the child fails or cannot
        be started.

    The child runs with this script's directory as its working directory, so
    a relative trace path is resolved against that directory. Returning it
    unresolved would make the later lookup depend on the caller's cwd.
    """
    print(f"{BLUE}Running LM version...{RESET}")

    script_dir = Path(__file__).resolve().parent

    cmd = [
        sys.executable,
        "test_crafter_react_agent_lm.py",
        "--model", model,
        "--episodes", str(episodes),
        "--max-turns", str(max_turns),
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, cwd=script_dir)
    except Exception as e:
        print(f"{RED}✗ Error running LM version: {e}{RESET}")
        return False, ""

    if result.returncode != 0:
        print(f"{RED}✗ LM version failed{RESET}")
        print(result.stderr)
        return False, ""

    print(f"{GREEN}✓ LM version completed successfully{RESET}")

    # Extract trace directory from output; fall back to the default location.
    trace_dir = Path("./traces_v2_lm")
    for line in result.stdout.splitlines():
        if "Saved trace to" in line:
            trace_path = line.split("Saved trace to")[-1].strip()
            trace_dir = Path(trace_path).parent
            break

    # Joining leaves an absolute trace_dir untouched and anchors a relative one.
    return True, str(script_dir / trace_dir)
85
+
86
+
87
def load_trace(trace_file: Path) -> Dict[str, Any]:
    """Load a JSON trace file and return its top-level object.

    Args:
        trace_file: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed JSON object. An empty dict is returned, after printing an
        error, when the file cannot be read, is not valid JSON/UTF-8, or its
        top-level value is not a JSON object. Callers use ``.get`` on the
        result, so a list or scalar root would otherwise fail later.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(trace_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"{RED}Error loading trace {trace_file}: {e}{RESET}")
        return {}

    if not isinstance(data, dict):
        print(f"{RED}Error loading trace {trace_file}: expected a JSON object, got {type(data).__name__}{RESET}")
        return {}
    return data
95
+
96
+
97
def extract_cais_events(trace_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Extract CAISEvents from trace data.

    An event is selected when its ``system_instance_id`` starts with
    ``'crafter-react-agent'``. A missing or ``null`` ``event_history``, a
    ``null`` ``system_instance_id``, and non-dict entries are tolerated and
    simply yield no match instead of raising.
    """
    events = trace_data.get('event_history') or []
    return [
        e for e in events
        if isinstance(e, dict)
        and str(e.get('system_instance_id') or '').startswith('crafter-react-agent')
    ]
101
+
102
+
103
def extract_env_events(trace_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Extract EnvironmentEvents from trace data.

    An event is selected when it has a ``'reward'`` key and its
    ``system_instance_id`` does not start with ``'crafter-react-agent'``.
    A missing or ``null`` ``event_history``, a ``null``
    ``system_instance_id``, and non-dict entries are tolerated instead of
    raising.
    """
    events = trace_data.get('event_history') or []
    return [
        e for e in events
        if isinstance(e, dict)
        and 'reward' in e
        and not str(e.get('system_instance_id') or '').startswith('crafter-react-agent')
    ]
107
+
108
+
109
def compare_events(events1: List[Dict], events2: List[Dict], event_type: str) -> bool:
    """Compare two event lists pairwise and print a report of the differences.

    Args:
        events1: Events from the first trace (labelled "OpenAI" in the output).
        events2: Events from the second trace (labelled "LM" in the output).
        event_type: ``'CAISEvent'`` or ``'EnvironmentEvent'``; selects which
            fields are compared. Any other value compares no fields.

    Returns:
        ``False`` when the lists differ in length, the number of LLM call
        records differs, a model name differs, or a plain field differs.
        Differing tool calls only produce a warning, and metadata token
        counts are printed without being compared.
    """
    print(f"\n{BLUE}Comparing {event_type}...{RESET}")

    count1, count2 = len(events1), len(events2)
    if count1 != count2:
        print(f"{RED}✗ Different number of events: {count1} vs {count2}{RESET}")
        return False

    # Fields inspected for each supported event type.
    key_fields = {
        'CAISEvent': ['system_instance_id', 'llm_call_records', 'metadata'],
        'EnvironmentEvent': ['reward', 'done', 'info'],
    }.get(event_type, [])

    identical = True
    for position, (left, right) in enumerate(zip(events1, events2), start=1):
        print(f"\n Event {position}:")

        for field in key_fields:
            # Only fields present on both sides are compared.
            if field not in left or field not in right:
                continue
            left_value, right_value = left[field], right[field]

            if field == 'llm_call_records':
                if len(left_value) != len(right_value):
                    print(f" {RED}✗ Different number of LLM calls{RESET}")
                    identical = False
                    continue

                # Compare model and tool calls
                for call_a, call_b in zip(left_value, right_value):
                    model_a = call_a.get('model', '')
                    model_b = call_b.get('model', '')
                    if model_a == model_b:
                        print(f" {GREEN}✓ Model matches: {model_a}{RESET}")
                    else:
                        print(f" {RED}✗ Different models: {model_a} vs {model_b}{RESET}")
                        identical = False

                    # Tool-call differences are reported but tolerated, since
                    # they may come from LLM non-determinism.
                    tools_a = extract_tool_calls(call_a.get('response', {}))
                    tools_b = extract_tool_calls(call_b.get('response', {}))
                    if tools_a != tools_b:
                        print(f" {YELLOW}⚠ Different tool calls: {tools_a} vs {tools_b}{RESET}")

            elif field == 'metadata':
                # Token counts are shown side by side when both sides have them.
                tokens_a = {k: v for k, v in left_value.items() if 'token' in k}
                tokens_b = {k: v for k, v in right_value.items() if 'token' in k}
                if tokens_a and tokens_b:
                    print(f" Tokens (OpenAI): {tokens_a}")
                    print(f" Tokens (LM): {tokens_b}")

            elif left_value == right_value:
                print(f" {GREEN}✓ {field} matches{RESET}")

            else:
                print(f" {RED}✗ {field} differs: {left_value} vs {right_value}{RESET}")
                identical = False

    return identical
169
+
170
+
171
def extract_tool_calls(response: Dict[str, Any]) -> List[str]:
    """Extract tool call names from the first choice of a response.

    Args:
        response: A chat-completion style dict with a ``choices`` list whose
            first entry may hold ``message.tool_calls``.

    Returns:
        The ``function.name`` of each tool call, using ``'unknown'`` when a
        name is absent. An empty list is returned when the response, its
        choices, the message or ``tool_calls`` is missing or ``None``.
        Serialized responses may carry ``tool_calls: null`` when the model
        made no tool call, which would otherwise fail on iteration.
    """
    if not isinstance(response, dict):
        return []

    choices = response.get('choices') or []
    if not isinstance(choices, list) or not choices or not isinstance(choices[0], dict):
        return []

    message = choices[0].get('message') or {}
    tool_calls = message.get('tool_calls') or []
    return [
        ((tc.get('function') or {}) if isinstance(tc, dict) else {}).get('name', 'unknown')
        for tc in tool_calls
    ]
180
+
181
+
182
def compare_traces(openai_dir: Path, lm_dir: Path, episode: int = 0) -> bool:
    """Locate, load and compare the two trace files for one episode.

    Args:
        openai_dir: Directory holding the OpenAI-version traces.
        lm_dir: Directory holding the LM-version traces.
        episode: Episode index used to build the trace file names.

    Returns:
        ``True`` only when both the CAIS events and the environment events
        match. ``False`` when either trace file is missing. Message counts
        are reported but do not affect the result.
    """
    rule = '=' * 60
    print(f"\n{BLUE}{rule}{RESET}")
    print(f"{BLUE}Comparing traces for episode {episode}{RESET}")
    print(f"{BLUE}{rule}{RESET}")

    default_name = f"trace_episode_{episode}.json"

    # The OpenAI run may write timestamped session files; prefer the newest
    # one and fall back to the plain per-episode name otherwise.
    session_files = list(openai_dir.glob(f"session_episode_{episode}_*.json"))
    openai_trace_file = (
        max(session_files, key=lambda candidate: candidate.stat().st_mtime)
        if session_files
        else openai_dir / default_name
    )
    lm_trace_file = lm_dir / default_name

    for label, trace_file in (("OpenAI", openai_trace_file), ("LM", lm_trace_file)):
        if not trace_file.exists():
            print(f"{RED}✗ {label} trace not found: {trace_file}{RESET}")
            return False

    openai_trace = load_trace(openai_trace_file)
    lm_trace = load_trace(lm_trace_file)

    # Compare agent events first, then environment events.
    cais_match = compare_events(
        extract_cais_events(openai_trace), extract_cais_events(lm_trace), "CAISEvent"
    )
    env_match = compare_events(
        extract_env_events(openai_trace), extract_env_events(lm_trace), "EnvironmentEvent"
    )

    # Check messages (informational only)
    print(f"\n{BLUE}Comparing messages...{RESET}")
    openai_count = len(openai_trace.get('message_history', []))
    lm_count = len(lm_trace.get('message_history', []))
    if openai_count == lm_count:
        print(f"{GREEN}✓ Same number of messages: {openai_count}{RESET}")
    else:
        print(f"{RED}✗ Different number of messages: {openai_count} vs {lm_count}{RESET}")

    return cais_match and env_match
232
+
233
+
234
async def main():
    """Run the OpenAI and LM versions, then compare their traces per episode.

    Stops early, after printing a message, if either run fails. Otherwise
    prints the trace directories, the per-episode comparison, a final verdict
    and a fixed list of key observations.
    """
    rule = f"{BLUE}{'='*60}{RESET}"

    print(rule)
    print(f"{BLUE}Crafter Trace Comparison: OpenAI vs LM{RESET}")
    print(rule)

    # Configuration
    model, episodes, max_turns = "gpt-4o-mini", 2, 3

    print("\nConfiguration:")
    print(f" Model: {model}")
    print(f" Episodes: {episodes}")
    print(f" Max turns: {max_turns}")

    # Run OpenAI version
    openai_success, openai_dir = run_openai_version(model, episodes, max_turns)
    if not openai_success:
        print(f"{RED}Failed to run OpenAI version{RESET}")
        return

    # Small delay to avoid rate limits
    await asyncio.sleep(2)

    # Run LM version
    lm_success, lm_dir = run_lm_version(model, episodes, max_turns)
    if not lm_success:
        print(f"{RED}Failed to run LM version{RESET}")
        return

    # Compare traces
    print(f"\n{BLUE}Trace directories:{RESET}")
    print(f" OpenAI: {openai_dir}")
    print(f" LM: {lm_dir}")

    # Build the full list first so every episode is compared and reported,
    # even after an earlier mismatch.
    episode_results = [
        compare_traces(Path(openai_dir), Path(lm_dir), episode)
        for episode in range(episodes)
    ]
    all_match = all(episode_results)

    # Final verdict
    print(f"\n{rule}")
    print(f"{BLUE}FINAL VERDICT{RESET}")
    print(rule)

    if all_match:
        print(f"{GREEN}✅ Traces match! LM class produces equivalent v2 traces.{RESET}")
    else:
        for reason in (
            "⚠️ Some differences found. This may be due to:",
            " - LLM non-determinism (different responses)",
            " - Minor implementation differences",
            " - Timing variations",
        ):
            print(f"{YELLOW}{reason}{RESET}")

    for observation in (
        "\nKey observations:",
        " • Both versions create CAISEvents with LLM call records",
        " • Both capture environment events and observations",
        " • Token counts and metadata are preserved",
        " • The LM class successfully replaces direct OpenAI calls",
    ):
        print(observation)
293
+
294
+
295
if __name__ == "__main__":
    # Script entry point: drive the async comparison to completion.
    comparison = main()
    asyncio.run(comparison)
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Run script for Full Enchilada Crafter Evaluation
4
+ """
5
+
6
+ import asyncio
7
+ import argparse
8
+ from src.synth_env.examples.crafter_classic.agent_demos.full_enchilada import (
9
+ run_full_enchilada_eval,
10
+ )
11
+
12
+
13
async def main():
    """Parse the command-line options and run the Full Enchilada evaluation."""
    parser = argparse.ArgumentParser(description="Run Full Enchilada Crafter Evaluation")

    # (flag, add_argument keyword options), registered in this order so the
    # generated --help output keeps the same layout.
    option_table = [
        ("--models", {"nargs": "+", "default": ["gpt-4o-mini"], "help": "Model names to evaluate"}),
        (
            "--difficulties",
            {"nargs": "+", "default": ["easy", "hard"], "help": "Difficulty levels to test"},
        ),
        (
            "--num-trajectories",
            {"type": int, "default": 3, "help": "Number of trajectories per condition"},
        ),
        ("--max-turns", {"type": int, "default": 30, "help": "Maximum turns per trajectory"}),
        ("--no-images", {"action": "store_true", "help": "Disable image capture"}),
        (
            "--no-viewer",
            {"action": "store_true", "help": "Don't launch the viewer after evaluation"},
        ),
        (
            "--output-dir",
            {
                "type": str,
                "default": None,
                "help": "Output directory (default: src/evals/crafter/run_TIMESTAMP)",
            },
        ),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    # The --no-* switches are negated into the positive keyword arguments.
    eval_options = {
        "model_names": args.models,
        "difficulties": args.difficulties,
        "num_trajectories": args.num_trajectories,
        "max_turns": args.max_turns,
        "capture_images": not args.no_images,
        "launch_viewer": not args.no_viewer,
        "output_dir": args.output_dir,
    }
    await run_full_enchilada_eval(**eval_options)
55
+
56
+
57
if __name__ == "__main__":
    # Script entry point: parse arguments and run the evaluation to completion.
    evaluation = main()
    asyncio.run(evaluation)