synth-ai 0.2.4.dev8__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (111)
  1. synth_ai/cli/__init__.py +6 -0
  2. synth_ai/cli/demo.py +68 -9
  3. synth_ai/cli/rl_demo.py +137 -0
  4. synth_ai/cli/root.py +65 -0
  5. synth_ai/demos/core/__init__.py +1 -0
  6. synth_ai/demos/core/cli.py +621 -0
  7. synth_ai/demos/demo_task_apps/__init__.py +1 -0
  8. synth_ai/demos/demo_task_apps/core.py +374 -0
  9. synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
  10. synth_ai/demos/demo_task_apps/math/app.py +37 -0
  11. synth_ai/demos/demo_task_apps/math/config.toml +44 -0
  12. synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
  13. synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
  14. synth_ai/environments/examples/bandit/__init__.py +33 -0
  15. synth_ai/environments/examples/bandit/engine.py +294 -0
  16. synth_ai/environments/examples/bandit/environment.py +194 -0
  17. synth_ai/environments/examples/bandit/taskset.py +200 -0
  18. synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
  19. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
  20. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
  21. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
  22. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
  23. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
  24. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
  25. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
  26. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
  27. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
  28. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
  29. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
  30. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
  31. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
  32. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
  33. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
  34. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
  35. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
  36. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
  37. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
  38. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
  39. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
  40. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
  41. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
  42. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
  43. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
  44. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
  45. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
  46. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
  47. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
  48. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
  49. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
  50. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
  51. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
  52. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
  53. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
  54. synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
  55. synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
  56. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
  57. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
  58. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
  59. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
  60. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
  61. synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
  62. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
  63. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
  64. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
  65. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
  66. synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
  67. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
  68. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
  69. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
  70. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
  71. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
  72. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
  73. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
  74. synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
  75. synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
  76. synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
  77. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
  78. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
  79. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
  80. synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
  81. synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
  82. synth_ai/environments/examples/crafter_classic/environment.py +41 -2
  83. synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
  84. synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
  85. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
  86. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
  87. synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
  88. synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
  89. synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
  90. synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
  91. synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
  92. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
  93. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
  94. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
  95. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
  96. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
  97. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
  98. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
  99. synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
  100. synth_ai/environments/examples/red/units/__init__.py +1 -0
  101. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
  102. synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
  103. synth_ai/environments/service/app.py +8 -0
  104. synth_ai/install_sqld.sh +40 -0
  105. synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
  106. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +110 -11
  107. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
  108. synth_ai-0.2.4.dev8.dist-info/METADATA +0 -635
  109. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
  110. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
  111. {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,206 @@
1
+ #!/usr/bin/env python3
2
+ """Analyze trace data to understand why no achievements were unlocked."""
3
+
4
+ import duckdb
5
+ import json
6
+ from pathlib import Path
7
+ from collections import defaultdict, Counter
8
+
9
def _load_event(event_data):
    """Decode one ``event_data`` cell; a NULL cell becomes an empty dict."""
    if event_data is None:
        return {}
    return json.loads(event_data)


def _print_basic_statistics(conn):
    """Section 1: list tables, then session/event counts and event-type histogram."""
    print("šŸ“Š BASIC STATISTICS")
    print("=" * 50)

    # First, show available tables
    tables = conn.execute("SHOW TABLES").fetchall()
    print("Available tables:")
    for table in tables:
        print(f" - {table[0]}")

    # Count sessions
    session_count = conn.execute(
        "SELECT COUNT(DISTINCT session_id) FROM session_traces"
    ).fetchone()[0]
    print(f"\nTotal sessions: {session_count}")

    # Count events
    event_count = conn.execute("SELECT COUNT(*) FROM session_traces").fetchone()[0]
    print(f"Total events: {event_count}")

    # Check event types
    print("\nšŸ“‹ EVENT TYPE DISTRIBUTION")
    print("-" * 30)
    event_types = conn.execute("""
        SELECT event_type, COUNT(*) as count
        FROM session_traces
        GROUP BY event_type
        ORDER BY count DESC
    """).fetchall()

    for event_type, count in event_types:
        print(f"{event_type}: {count}")


def _print_agent_decisions(conn):
    """Section 2: preview the first five of up to 50 generation completions."""
    print("\nšŸ¤– AGENT DECISIONS ANALYSIS")
    print("=" * 50)

    completions = conn.execute("""
        SELECT event_data
        FROM session_traces
        WHERE event_type = 'generation_completion'
        LIMIT 50
    """).fetchall()

    if not completions:
        print("āŒ No generation completion events found!")
        return

    print(f"Found {len(completions)} generation completions (showing first 50)")

    for i, (event_data,) in enumerate(completions[:5]):
        data = _load_event(event_data)
        print(f"\n--- Completion {i+1} ---")

        response = data.get('response')
        if not response:
            print("No response found in event data")
            continue

        # 'content' may be present but null (e.g. a tool-call-only reply);
        # slicing None would raise TypeError, so only preview real content.
        content = response.get('content')
        if content is not None:
            print(f"Content preview: {content[:200]}...")
        if 'tool_calls' in response:
            print(f"Tool calls: {response['tool_calls']}")


def _print_runtime_events(conn):
    """Section 3: histogram of ``metadata.action_name`` over up to 100 runtime events."""
    print("\nšŸŽ® RUNTIME EVENTS (ACTIONS)")
    print("=" * 50)

    runtime_events = conn.execute("""
        SELECT event_data
        FROM session_traces
        WHERE event_type = 'runtime_event'
        LIMIT 100
    """).fetchall()

    if not runtime_events:
        print("āŒ No runtime events found!")
        return

    action_counter = Counter()
    for (event_data,) in runtime_events:
        # A null 'metadata' value is treated like a missing one.
        metadata = _load_event(event_data).get('metadata') or {}
        if 'action_name' in metadata:
            action_counter[metadata['action_name']] += 1

    print(f"Found {len(runtime_events)} runtime events")
    print("\nAction distribution:")
    for action, count in action_counter.most_common():
        print(f" {action}: {count}")


def _print_environment_events(conn):
    """Section 4: total reward and unlocked achievements over up to 100 environment events."""
    print("\nšŸŒ ENVIRONMENT EVENTS")
    print("=" * 50)

    env_events = conn.execute("""
        SELECT event_data
        FROM session_traces
        WHERE event_type = 'environment_event'
        LIMIT 100
    """).fetchall()

    if not env_events:
        print("āŒ No environment events found!")
        return

    reward_sum = 0
    achievements_found = set()

    for (event_data,) in env_events:
        data = _load_event(event_data)

        # Check rewards (a null reward counts as 0)
        reward_sum += data.get('reward') or 0

        # Check for achievements in state; null intermediate objects are
        # treated as empty instead of raising on the membership test.
        state = data.get('system_state_after') or {}
        public_state = state.get('public_state') or {}
        achievements = public_state.get('achievements') or {}
        for ach, unlocked in achievements.items():
            if unlocked:
                achievements_found.add(ach)

    print(f"Found {len(env_events)} environment events")
    print(f"Total reward across all events: {reward_sum}")
    print(f"Achievements found: {achievements_found if achievements_found else 'None'}")


def _print_error_check(conn):
    """Section 5: dump the first three of up to 10 events whose payload mentions 'error'."""
    print("\nāš ļø ERROR CHECK")
    print("=" * 50)

    # Look for error messages in events
    error_events = conn.execute("""
        SELECT event_type, event_data
        FROM session_traces
        WHERE event_data LIKE '%error%' OR event_data LIKE '%Error%'
        LIMIT 10
    """).fetchall()

    if not error_events:
        print("No obvious errors found in events")
        return

    print(f"Found {len(error_events)} events with potential errors:")
    for event_type, event_data in error_events[:3]:
        print(f"\n{event_type}:")
        data = json.loads(event_data)
        print(json.dumps(data, indent=2)[:500])


def _print_sample_episode(conn):
    """Section 6: walk the first 20 events (by ``created_at``) of one session."""
    print("\nšŸ“– SAMPLE EPISODE FLOW")
    print("=" * 50)

    # Get events from first session
    first_session = conn.execute(
        "SELECT DISTINCT session_id FROM session_traces LIMIT 1"
    ).fetchone()
    if not first_session:
        return

    session_id = first_session[0]
    print(f"Analyzing session: {session_id}")

    session_events = conn.execute("""
        SELECT event_type, event_data, created_at
        FROM session_traces
        WHERE session_id = ?
        ORDER BY created_at
        LIMIT 20
    """, [session_id]).fetchall()

    print(f"\nFirst 20 events in session:")
    for i, (event_type, event_data, created_at) in enumerate(session_events):
        data = _load_event(event_data)
        print(f"\n{i+1}. {event_type} at {created_at}")

        # Show relevant info based on event type
        if event_type == 'generation_completion':
            # Same null-response guard as the completion preview section.
            response = data.get('response') or {}
            if 'tool_calls' in response:
                print(f" Tool calls: {response['tool_calls']}")
        elif event_type == 'runtime_event':
            if 'metadata' in data:
                metadata = data['metadata'] or {}
                print(f" Action: {metadata.get('action_name', 'Unknown')}")
        elif event_type == 'environment_event':
            if 'reward' in data:
                print(f" Reward: {data['reward']}")


def analyze_traces(db_path: str):
    """Print a diagnostic report for the trace database at ``db_path``.

    Opens the DuckDB file read-only and prints six sections to stdout, all
    queried from the ``session_traces`` table: basic statistics, agent
    decisions, runtime events, environment events, an error scan and a sample
    episode flow. Nothing is returned.

    Args:
        db_path: Path to the DuckDB database file.

    The connection is closed even if a query or a JSON decode fails partway
    through the report.
    """
    conn = duckdb.connect(db_path, read_only=True)
    try:
        print("šŸ” Analyzing trace data...\n")

        _print_basic_statistics(conn)
        _print_agent_decisions(conn)
        _print_runtime_events(conn)
        _print_environment_events(conn)
        _print_error_check(conn)
        _print_sample_episode(conn)
    finally:
        conn.close()
197
+
198
if __name__ == "__main__":
    # Script entry point: analyze the default trace database, or, when it is
    # missing, list every traces.duckdb found below the current directory.
    db_path = "./traces_v2_synth/traces.duckdb"
    if not Path(db_path).exists():
        print(f"āŒ Database not found at {db_path}")
        print("Available databases:")
        for candidate in Path(".").glob("**/traces.duckdb"):
            print(f" - {candidate}")
    else:
        analyze_traces(db_path)
@@ -0,0 +1,49 @@
1
+ #!/usr/bin/env python3
2
+ """Check DuckDB schema to understand table structure."""
3
+
4
+ import duckdb
5
+ from pathlib import Path
6
+
7
def check_schema(db_path: str):
    """Print the tables of a DuckDB file plus schema and sample rows of key tables.

    The database is opened read-only. Every table is listed; then, for each of
    ``session_traces``, ``events``, ``messages`` and ``session_timesteps`` that
    exists, the column names/types and the first two rows are printed.

    Args:
        db_path: Path to the DuckDB database file.
    """
    conn = duckdb.connect(db_path, read_only=True)

    print("šŸ” Checking database schema...\n")

    # Enumerate every table in the database.
    tables = conn.execute("SHOW TABLES").fetchall()
    print("šŸ“‹ Tables in database:")
    for entry in tables:
        print(f" - {entry[0]}")

    existing = [entry[0] for entry in tables]

    # Only these tables get a detailed dump, and only when present.
    for table_name in ('session_traces', 'events', 'messages', 'session_timesteps'):
        if table_name not in existing:
            continue

        print(f"\nšŸ“Š Schema for {table_name}:")
        print("-" * 50)
        described = conn.execute(f"DESCRIBE {table_name}").fetchall()
        # DESCRIBE rows are unpacked as six fields; only the first two are shown.
        for col_name, col_type, _f3, _f4, _f5, _f6 in described:
            print(f" {col_name}: {col_type}")

        print(f"\nšŸ“„ Sample data from {table_name} (first 2 rows):")
        rows = conn.execute(f"SELECT * FROM {table_name} LIMIT 2").fetchall()
        if not rows:
            print(" (No data)")
            continue

        # A zero-row query is enough to read the column names off the cursor.
        header_cursor = conn.execute(f"SELECT * FROM {table_name} LIMIT 0")
        cols = [column[0] for column in header_cursor.description]
        print(f" Columns: {cols}")
        for row in rows:
            print(f" {row}")

    conn.close()
43
+
44
if __name__ == "__main__":
    # Script entry point: inspect the default trace database when it exists.
    db_path = "./traces_v2_synth/traces.duckdb"
    if not Path(db_path).exists():
        print(f"āŒ Database not found at {db_path}")
    else:
        check_schema(db_path)
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python3
2
+ """Check the latest results JSON file."""
3
+
4
+ import json
5
+ import glob
6
+ from pathlib import Path
7
+
8
# Summarize the newest crafter_lm_synth_results_*.json in the current directory:
# headline metrics, a per-episode breakdown, then overall statistics.

# Find the latest results file
result_files = glob.glob("crafter_lm_synth_results_*.json")
if not result_files:
    print("No result files found")
    # `exit()` is injected by the `site` module for interactive use and is
    # absent under `python -S`; raising SystemExit always works.
    raise SystemExit(1)

# Get the most recent file (by modification time)
latest_file = max(result_files, key=lambda f: Path(f).stat().st_mtime)
print(f"šŸ“Š Checking latest results: {latest_file}\n")

# JSON text is UTF-8; do not depend on the platform's locale encoding.
with open(latest_file, encoding="utf-8") as f:
    data = json.load(f)

# Extract key metrics
total_episodes = data.get('total_episodes', 0)
total_steps = data.get('total_steps', 0)
model = data.get('model', 'unknown')

print(f"Model: {model}")
print(f"Episodes: {total_episodes}")
print(f"Total Steps: {total_steps}")

# Check episode results
episodes = data.get('episodes', [])
if episodes:
    print(f"\nšŸ“Š Episode Summary:")
    for i, ep in enumerate(episodes):
        steps = ep.get('steps', 0)
        reward = ep.get('total_reward', 0)
        # A null list is treated like a missing one.
        achievements = ep.get('achievements_unlocked') or []

        print(f"\nEpisode {i}:")
        print(f" Steps: {steps}")
        print(f" Reward: {reward}")
        print(f" Achievements: {len(achievements)}")
        if achievements:
            print(f" - {', '.join(achievements[:5])}")
            if len(achievements) > 5:
                print(f" ... and {len(achievements) - 5} more")

        # Check inventory at end: keep positive counts, excluding the four
        # keys below. Non-numeric values are skipped rather than compared.
        inventory = ep.get('final_inventory') or {}
        non_zero = {
            k: v
            for k, v in inventory.items()
            if k not in ('health', 'food', 'drink', 'energy')
            and isinstance(v, (int, float))
            and v > 0
        }
        if non_zero:
            print(f" Final inventory: {non_zero}")

        # Sample actions
        if ep.get('action_history'):
            print(f" Sample actions: {ep['action_history'][:5]}")

# Overall statistics (null rewards/achievement lists count as 0 / empty)
avg_reward = (
    sum(ep.get('total_reward') or 0 for ep in episodes) / len(episodes)
    if episodes
    else 0
)
total_achievements = sum(len(ep.get('achievements_unlocked') or []) for ep in episodes)

print(f"\nšŸ“Š Overall Statistics:")
print(f"Average reward per episode: {avg_reward:.2f}")
print(f"Total achievements unlocked: {total_achievements}")
@@ -0,0 +1,88 @@
1
+ #!/usr/bin/env python3
2
+ """Debug why agent is not using multiple actions."""
3
+
4
+ import json
5
+ import sys
6
+ from pathlib import Path
7
+
8
import ast

# Read the run output from the file named on the command line, or else from
# the most recently modified crafter_run*.log in the current directory.
if len(sys.argv) > 1:
    with open(sys.argv[1], encoding="utf-8", errors="replace") as f:
        output = f.read()
else:
    # The pattern also matches the plain `crafter_run.log` that the hint below
    # tells the user to create (the previous `crafter_run_*.log` did not).
    log_files = list(Path(".").glob("crafter_run*.log"))
    if not log_files:
        print("No log files found. Run with: python test_crafter_react_agent_lm_synth.py --model 'Qwen/Qwen2.5-14B-Instruct' --episodes 1 --max-steps 10 --verbose 2>&1 | tee crafter_run.log")
        # `exit()` comes from the `site` module and is meant for interactive
        # sessions; SystemExit is always available.
        raise SystemExit(1)

    latest_log = max(log_files, key=lambda f: f.stat().st_mtime)
    with open(latest_log, encoding="utf-8", errors="replace") as f:
        output = f.read()

log_lines = output.split('\n')

# Parse tool calls: a "Turn ... Tool Call:" line opens a record and the next
# "Actions:" line attaches the parsed action list to the latest record.
# NOTE(review): the marker strings below must match the logger's output
# byte-for-byte; they are left exactly as found.
tool_calls = []
single_action_count = 0
multi_action_count = 0

for line in log_lines:
    if "šŸ”§ Turn" in line and "Tool Call:" in line:
        turn_info = {"turn": line}
        tool_calls.append(turn_info)
    elif "Actions:" in line and tool_calls:
        actions_str = line.strip().split("Actions:")[1].strip()
        try:
            # Parse the action list. literal_eval only accepts Python
            # literals, so arbitrary expressions in a log file are rejected
            # instead of executed (the previous code used eval()).
            actions = ast.literal_eval(actions_str) if actions_str else []
            tool_calls[-1]["actions"] = actions
            tool_calls[-1]["action_count"] = len(actions)

            if len(actions) == 1:
                single_action_count += 1
            elif len(actions) > 1:
                multi_action_count += 1
        except (ValueError, SyntaxError, TypeError):
            # Malformed literal, or a literal without a length (e.g. an int).
            tool_calls[-1]["actions"] = "parse_error"
            tool_calls[-1]["action_count"] = 0

total_actions = sum(tc.get('action_count', 0) for tc in tool_calls)
average_actions = total_actions / len(tool_calls) if tool_calls else 0

print("šŸ” AGENT ACTION ANALYSIS\n")
print(f"Total tool calls: {len(tool_calls)}")
print(f"Single action calls: {single_action_count}")
print(f"Multi-action calls: {multi_action_count}")
print(f"Average actions per call: {average_actions:.2f}")

# Show distribution of actions-per-call
action_counts = {}
for tc in tool_calls:
    count = tc.get('action_count', 0)
    action_counts[count] = action_counts.get(count, 0) + 1

print("\nAction count distribution:")
for count in sorted(action_counts):
    print(f" {count} actions: {action_counts[count]} times")

# Show examples of multi-action calls
print("\nšŸ“‹ Multi-action examples:")
multi_examples = [tc for tc in tool_calls if tc.get('action_count', 0) > 1]
for example in multi_examples[:5]:
    print(f" {example['turn']}")
    print(f" Actions: {example['actions']}")

# Check for response parsing issues
print("\nšŸ” Response preview analysis:")
response_previews = [
    line.split("preview:")[1].strip()
    for line in log_lines
    if "šŸ“ Raw response preview:" in line
]

if response_previews:
    print(f"Found {len(response_previews)} response previews")
    # Check if the first few responses mention multiple actions
    multi_action_mentions = 0
    for preview in response_previews[:5]:
        if any(word in preview.lower() for word in ['multiple', 'sequence', 'then', 'after']):
            multi_action_mentions += 1
            print(f" - {preview[:100]}...")

    print(f"\nResponses mentioning sequences: {multi_action_mentions}/{len(response_previews[:5])}")
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env python3
2
+ """Quick check of recent traces without locking the database."""
3
+
4
+ import sqlite3
5
+ import json
6
+ from pathlib import Path
7
+ from collections import Counter
8
+
9
# Try to read the trace database through the sqlite3 module, and fall back to
# the JSON session dumps when that fails.
# NOTE(review): DuckDB's storage format is not the SQLite file format, so for
# a real .duckdb file the query below is expected to raise and the JSON
# fallback becomes the effective code path — confirm against an actual file.
db_path = "./traces_v2_synth/traces.duckdb"

if Path(db_path).exists():
    try:
        # Open read-only via a URI so the file is never modified.
        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        try:
            cursor = conn.cursor()

            print("šŸ” Quick trace analysis...\n")

            # Get recent events
            cursor.execute("""
                SELECT event_type, metadata
                FROM events
                ORDER BY id DESC
                LIMIT 100
            """)

            events = cursor.fetchall()
            print(f"Found {len(events)} recent events\n")

            # Count event types
            event_types = Counter(e[0] for e in events)
            print("Event type distribution:")
            for etype, count in event_types.most_common():
                print(f" {etype}: {count}")

            # Check for actions
            print("\nšŸŽ® Recent actions:")
            action_count = 0
            action_types = Counter()

            for event_type, metadata_str in events:
                if not metadata_str or event_type != 'runtime':
                    continue
                try:
                    metadata = json.loads(metadata_str)
                    if 'action_name' in metadata:
                        action_types[metadata['action_name']] += 1
                        action_count += 1
                        # Only echo the first ten actions individually.
                        if action_count <= 10:
                            print(f" - {metadata['action_name']}")
                except (ValueError, TypeError):
                    # Best effort: skip rows whose metadata is not valid JSON
                    # or not a mapping, without hiding unrelated failures.
                    continue

            if action_types:
                print(f"\nAction summary:")
                for action, count in action_types.most_common():
                    print(f" {action}: {count}")
        finally:
            # Release the handle on the failure path as well.
            conn.close()

    except Exception as e:
        print(f"Error: {e}")
        print("\nTrying alternative analysis...")

        # If we can't read the DB, check for any JSON trace files
        trace_files = list(Path("./traces_v2_synth").glob("session_*.json"))
        if trace_files:
            print(f"Found {len(trace_files)} JSON trace files")
            latest = max(trace_files, key=lambda f: f.stat().st_mtime)
            print(f"Latest: {latest.name}")

            with open(latest, encoding="utf-8") as f:
                data = json.load(f)
            print(f"Session ID: {data.get('session_id', 'Unknown')}")
            print(f"Events: {len(data.get('events', []))}")
else:
    print("No trace database found")