synth-ai 0.2.4.dev8__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- synth_ai/__init__.py +1 -1
- synth_ai/cli/__init__.py +6 -0
- synth_ai/cli/demo.py +68 -9
- synth_ai/cli/rl_demo.py +137 -0
- synth_ai/cli/root.py +65 -0
- synth_ai/demos/core/__init__.py +1 -0
- synth_ai/demos/core/cli.py +685 -0
- synth_ai/demos/demo_task_apps/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/core.py +374 -0
- synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/math/app.py +37 -0
- synth_ai/demos/demo_task_apps/math/config.toml +44 -0
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
- synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
- synth_ai/environments/examples/bandit/__init__.py +33 -0
- synth_ai/environments/examples/bandit/engine.py +294 -0
- synth_ai/environments/examples/bandit/environment.py +194 -0
- synth_ai/environments/examples/bandit/taskset.py +200 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
- synth_ai/environments/examples/crafter_classic/environment.py +41 -2
- synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
- synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
- synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
- synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
- synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
- synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
- synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/red/units/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
- synth_ai/environments/service/app.py +8 -0
- synth_ai/install_sqld.sh +40 -0
- synth_ai-0.2.5.dist-info/METADATA +106 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.5.dist-info}/RECORD +111 -12
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.5.dist-info}/entry_points.txt +1 -0
- synth_ai-0.2.4.dev8.dist-info/METADATA +0 -635
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.5.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.5.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Analyze trace data to understand why no achievements were unlocked."""
|
|
3
|
+
|
|
4
|
+
import duckdb
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from collections import defaultdict, Counter
|
|
8
|
+
|
|
9
|
+
def analyze_traces(db_path: str):
|
|
10
|
+
"""Analyze trace data to identify issues."""
|
|
11
|
+
conn = duckdb.connect(db_path, read_only=True)
|
|
12
|
+
|
|
13
|
+
print("š Analyzing trace data...\n")
|
|
14
|
+
|
|
15
|
+
# 1. Check basic statistics
|
|
16
|
+
print("š BASIC STATISTICS")
|
|
17
|
+
print("=" * 50)
|
|
18
|
+
|
|
19
|
+
# First, show available tables
|
|
20
|
+
tables = conn.execute("SHOW TABLES").fetchall()
|
|
21
|
+
print("Available tables:")
|
|
22
|
+
for table in tables:
|
|
23
|
+
print(f" - {table[0]}")
|
|
24
|
+
|
|
25
|
+
# Count sessions
|
|
26
|
+
session_count = conn.execute("SELECT COUNT(DISTINCT session_id) FROM session_traces").fetchone()[0]
|
|
27
|
+
print(f"\nTotal sessions: {session_count}")
|
|
28
|
+
|
|
29
|
+
# Count events
|
|
30
|
+
event_count = conn.execute("SELECT COUNT(*) FROM session_traces").fetchone()[0]
|
|
31
|
+
print(f"Total events: {event_count}")
|
|
32
|
+
|
|
33
|
+
# Check event types
|
|
34
|
+
print("\nš EVENT TYPE DISTRIBUTION")
|
|
35
|
+
print("-" * 30)
|
|
36
|
+
event_types = conn.execute("""
|
|
37
|
+
SELECT event_type, COUNT(*) as count
|
|
38
|
+
FROM session_traces
|
|
39
|
+
GROUP BY event_type
|
|
40
|
+
ORDER BY count DESC
|
|
41
|
+
""").fetchall()
|
|
42
|
+
|
|
43
|
+
for event_type, count in event_types:
|
|
44
|
+
print(f"{event_type}: {count}")
|
|
45
|
+
|
|
46
|
+
# 2. Analyze agent decisions
|
|
47
|
+
print("\nš¤ AGENT DECISIONS ANALYSIS")
|
|
48
|
+
print("=" * 50)
|
|
49
|
+
|
|
50
|
+
# Get all generation completion events
|
|
51
|
+
completions = conn.execute("""
|
|
52
|
+
SELECT event_data
|
|
53
|
+
FROM session_traces
|
|
54
|
+
WHERE event_type = 'generation_completion'
|
|
55
|
+
LIMIT 50
|
|
56
|
+
""").fetchall()
|
|
57
|
+
|
|
58
|
+
if completions:
|
|
59
|
+
print(f"Found {len(completions)} generation completions (showing first 50)")
|
|
60
|
+
|
|
61
|
+
# Analyze first few completions
|
|
62
|
+
for i, (event_data,) in enumerate(completions[:5]):
|
|
63
|
+
data = json.loads(event_data)
|
|
64
|
+
print(f"\n--- Completion {i+1} ---")
|
|
65
|
+
|
|
66
|
+
# Extract response
|
|
67
|
+
if 'response' in data and data['response']:
|
|
68
|
+
response = data['response']
|
|
69
|
+
if 'content' in response:
|
|
70
|
+
print(f"Content preview: {response['content'][:200]}...")
|
|
71
|
+
if 'tool_calls' in response:
|
|
72
|
+
print(f"Tool calls: {response['tool_calls']}")
|
|
73
|
+
else:
|
|
74
|
+
print("No response found in event data")
|
|
75
|
+
else:
|
|
76
|
+
print("ā No generation completion events found!")
|
|
77
|
+
|
|
78
|
+
# 3. Analyze runtime events (actions taken)
|
|
79
|
+
print("\nš® RUNTIME EVENTS (ACTIONS)")
|
|
80
|
+
print("=" * 50)
|
|
81
|
+
|
|
82
|
+
runtime_events = conn.execute("""
|
|
83
|
+
SELECT event_data
|
|
84
|
+
FROM session_traces
|
|
85
|
+
WHERE event_type = 'runtime_event'
|
|
86
|
+
LIMIT 100
|
|
87
|
+
""").fetchall()
|
|
88
|
+
|
|
89
|
+
if runtime_events:
|
|
90
|
+
action_counter = Counter()
|
|
91
|
+
|
|
92
|
+
for (event_data,) in runtime_events:
|
|
93
|
+
data = json.loads(event_data)
|
|
94
|
+
if 'metadata' in data and 'action_name' in data['metadata']:
|
|
95
|
+
action_counter[data['metadata']['action_name']] += 1
|
|
96
|
+
|
|
97
|
+
print(f"Found {len(runtime_events)} runtime events")
|
|
98
|
+
print("\nAction distribution:")
|
|
99
|
+
for action, count in action_counter.most_common():
|
|
100
|
+
print(f" {action}: {count}")
|
|
101
|
+
else:
|
|
102
|
+
print("ā No runtime events found!")
|
|
103
|
+
|
|
104
|
+
# 4. Analyze environment events (results)
|
|
105
|
+
print("\nš ENVIRONMENT EVENTS")
|
|
106
|
+
print("=" * 50)
|
|
107
|
+
|
|
108
|
+
env_events = conn.execute("""
|
|
109
|
+
SELECT event_data
|
|
110
|
+
FROM session_traces
|
|
111
|
+
WHERE event_type = 'environment_event'
|
|
112
|
+
LIMIT 100
|
|
113
|
+
""").fetchall()
|
|
114
|
+
|
|
115
|
+
if env_events:
|
|
116
|
+
reward_sum = 0
|
|
117
|
+
achievements_found = []
|
|
118
|
+
|
|
119
|
+
for (event_data,) in env_events:
|
|
120
|
+
data = json.loads(event_data)
|
|
121
|
+
|
|
122
|
+
# Check rewards
|
|
123
|
+
if 'reward' in data:
|
|
124
|
+
reward_sum += data['reward'] or 0
|
|
125
|
+
|
|
126
|
+
# Check for achievements in state
|
|
127
|
+
if 'system_state_after' in data:
|
|
128
|
+
state = data['system_state_after']
|
|
129
|
+
if 'public_state' in state and 'achievements' in state['public_state']:
|
|
130
|
+
achievements = state['public_state']['achievements']
|
|
131
|
+
for ach, unlocked in achievements.items():
|
|
132
|
+
if unlocked:
|
|
133
|
+
achievements_found.append(ach)
|
|
134
|
+
|
|
135
|
+
print(f"Found {len(env_events)} environment events")
|
|
136
|
+
print(f"Total reward across all events: {reward_sum}")
|
|
137
|
+
print(f"Achievements found: {set(achievements_found) if achievements_found else 'None'}")
|
|
138
|
+
else:
|
|
139
|
+
print("ā No environment events found!")
|
|
140
|
+
|
|
141
|
+
# 5. Check for errors
|
|
142
|
+
print("\nā ļø ERROR CHECK")
|
|
143
|
+
print("=" * 50)
|
|
144
|
+
|
|
145
|
+
# Look for error messages in events
|
|
146
|
+
error_events = conn.execute("""
|
|
147
|
+
SELECT event_type, event_data
|
|
148
|
+
FROM session_traces
|
|
149
|
+
WHERE event_data LIKE '%error%' OR event_data LIKE '%Error%'
|
|
150
|
+
LIMIT 10
|
|
151
|
+
""").fetchall()
|
|
152
|
+
|
|
153
|
+
if error_events:
|
|
154
|
+
print(f"Found {len(error_events)} events with potential errors:")
|
|
155
|
+
for event_type, event_data in error_events[:3]:
|
|
156
|
+
print(f"\n{event_type}:")
|
|
157
|
+
data = json.loads(event_data)
|
|
158
|
+
print(json.dumps(data, indent=2)[:500])
|
|
159
|
+
else:
|
|
160
|
+
print("No obvious errors found in events")
|
|
161
|
+
|
|
162
|
+
# 6. Sample a full episode flow
|
|
163
|
+
print("\nš SAMPLE EPISODE FLOW")
|
|
164
|
+
print("=" * 50)
|
|
165
|
+
|
|
166
|
+
# Get events from first session
|
|
167
|
+
first_session = conn.execute("SELECT DISTINCT session_id FROM session_traces LIMIT 1").fetchone()
|
|
168
|
+
if first_session:
|
|
169
|
+
session_id = first_session[0]
|
|
170
|
+
print(f"Analyzing session: {session_id}")
|
|
171
|
+
|
|
172
|
+
session_events = conn.execute("""
|
|
173
|
+
SELECT event_type, event_data, created_at
|
|
174
|
+
FROM session_traces
|
|
175
|
+
WHERE session_id = ?
|
|
176
|
+
ORDER BY created_at
|
|
177
|
+
LIMIT 20
|
|
178
|
+
""", [session_id]).fetchall()
|
|
179
|
+
|
|
180
|
+
print(f"\nFirst 20 events in session:")
|
|
181
|
+
for i, (event_type, event_data, created_at) in enumerate(session_events):
|
|
182
|
+
data = json.loads(event_data)
|
|
183
|
+
print(f"\n{i+1}. {event_type} at {created_at}")
|
|
184
|
+
|
|
185
|
+
# Show relevant info based on event type
|
|
186
|
+
if event_type == 'generation_completion':
|
|
187
|
+
if 'response' in data and 'tool_calls' in data['response']:
|
|
188
|
+
print(f" Tool calls: {data['response']['tool_calls']}")
|
|
189
|
+
elif event_type == 'runtime_event':
|
|
190
|
+
if 'metadata' in data:
|
|
191
|
+
print(f" Action: {data['metadata'].get('action_name', 'Unknown')}")
|
|
192
|
+
elif event_type == 'environment_event':
|
|
193
|
+
if 'reward' in data:
|
|
194
|
+
print(f" Reward: {data['reward']}")
|
|
195
|
+
|
|
196
|
+
conn.close()
|
|
197
|
+
|
|
198
|
+
if __name__ == "__main__":
|
|
199
|
+
db_path = "./traces_v2_synth/traces.duckdb"
|
|
200
|
+
if Path(db_path).exists():
|
|
201
|
+
analyze_traces(db_path)
|
|
202
|
+
else:
|
|
203
|
+
print(f"ā Database not found at {db_path}")
|
|
204
|
+
print("Available databases:")
|
|
205
|
+
for db in Path(".").glob("**/traces.duckdb"):
|
|
206
|
+
print(f" - {db}")
|
synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Check DuckDB schema to understand table structure."""
|
|
3
|
+
|
|
4
|
+
import duckdb
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
def check_schema(db_path: str):
|
|
8
|
+
"""Check database schema."""
|
|
9
|
+
conn = duckdb.connect(db_path, read_only=True)
|
|
10
|
+
|
|
11
|
+
print("š Checking database schema...\n")
|
|
12
|
+
|
|
13
|
+
# Get all tables
|
|
14
|
+
tables = conn.execute("SHOW TABLES").fetchall()
|
|
15
|
+
print("š Tables in database:")
|
|
16
|
+
for table in tables:
|
|
17
|
+
print(f" - {table[0]}")
|
|
18
|
+
|
|
19
|
+
# Check schema of key tables
|
|
20
|
+
key_tables = ['session_traces', 'events', 'messages', 'session_timesteps']
|
|
21
|
+
|
|
22
|
+
for table_name in key_tables:
|
|
23
|
+
if any(t[0] == table_name for t in tables):
|
|
24
|
+
print(f"\nš Schema for {table_name}:")
|
|
25
|
+
print("-" * 50)
|
|
26
|
+
schema = conn.execute(f"DESCRIBE {table_name}").fetchall()
|
|
27
|
+
for col_name, col_type, _, _, _, _ in schema:
|
|
28
|
+
print(f" {col_name}: {col_type}")
|
|
29
|
+
|
|
30
|
+
# Show sample data
|
|
31
|
+
print(f"\nš Sample data from {table_name} (first 2 rows):")
|
|
32
|
+
sample = conn.execute(f"SELECT * FROM {table_name} LIMIT 2").fetchall()
|
|
33
|
+
if sample:
|
|
34
|
+
# Get column names
|
|
35
|
+
cols = [desc[0] for desc in conn.execute(f"SELECT * FROM {table_name} LIMIT 0").description]
|
|
36
|
+
print(f" Columns: {cols}")
|
|
37
|
+
for row in sample:
|
|
38
|
+
print(f" {row}")
|
|
39
|
+
else:
|
|
40
|
+
print(" (No data)")
|
|
41
|
+
|
|
42
|
+
conn.close()
|
|
43
|
+
|
|
44
|
+
if __name__ == "__main__":
|
|
45
|
+
db_path = "./traces_v2_synth/traces.duckdb"
|
|
46
|
+
if Path(db_path).exists():
|
|
47
|
+
check_schema(db_path)
|
|
48
|
+
else:
|
|
49
|
+
print(f"ā Database not found at {db_path}")
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Check the latest results JSON file."""
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import glob
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
# Find the latest results file
|
|
9
|
+
result_files = glob.glob("crafter_lm_synth_results_*.json")
|
|
10
|
+
if not result_files:
|
|
11
|
+
print("No result files found")
|
|
12
|
+
exit(1)
|
|
13
|
+
|
|
14
|
+
# Get the most recent file
|
|
15
|
+
latest_file = max(result_files, key=lambda f: Path(f).stat().st_mtime)
|
|
16
|
+
print(f"š Checking latest results: {latest_file}\n")
|
|
17
|
+
|
|
18
|
+
with open(latest_file) as f:
|
|
19
|
+
data = json.load(f)
|
|
20
|
+
|
|
21
|
+
# Extract key metrics
|
|
22
|
+
total_episodes = data.get('total_episodes', 0)
|
|
23
|
+
total_steps = data.get('total_steps', 0)
|
|
24
|
+
model = data.get('model', 'unknown')
|
|
25
|
+
|
|
26
|
+
print(f"Model: {model}")
|
|
27
|
+
print(f"Episodes: {total_episodes}")
|
|
28
|
+
print(f"Total Steps: {total_steps}")
|
|
29
|
+
|
|
30
|
+
# Check episode results
|
|
31
|
+
episodes = data.get('episodes', [])
|
|
32
|
+
if episodes:
|
|
33
|
+
print(f"\nš Episode Summary:")
|
|
34
|
+
for i, ep in enumerate(episodes):
|
|
35
|
+
steps = ep.get('steps', 0)
|
|
36
|
+
reward = ep.get('total_reward', 0)
|
|
37
|
+
achievements = ep.get('achievements_unlocked', [])
|
|
38
|
+
|
|
39
|
+
print(f"\nEpisode {i}:")
|
|
40
|
+
print(f" Steps: {steps}")
|
|
41
|
+
print(f" Reward: {reward}")
|
|
42
|
+
print(f" Achievements: {len(achievements)}")
|
|
43
|
+
if achievements:
|
|
44
|
+
print(f" - {', '.join(achievements[:5])}")
|
|
45
|
+
if len(achievements) > 5:
|
|
46
|
+
print(f" ... and {len(achievements) - 5} more")
|
|
47
|
+
|
|
48
|
+
# Check inventory at end
|
|
49
|
+
inventory = ep.get('final_inventory', {})
|
|
50
|
+
non_zero = {k: v for k, v in inventory.items() if v > 0 and k not in ['health', 'food', 'drink', 'energy']}
|
|
51
|
+
if non_zero:
|
|
52
|
+
print(f" Final inventory: {non_zero}")
|
|
53
|
+
|
|
54
|
+
# Sample actions
|
|
55
|
+
if 'action_history' in ep and ep['action_history']:
|
|
56
|
+
print(f" Sample actions: {ep['action_history'][:5]}")
|
|
57
|
+
|
|
58
|
+
# Overall statistics
|
|
59
|
+
avg_reward = sum(ep.get('total_reward', 0) for ep in episodes) / len(episodes) if episodes else 0
|
|
60
|
+
total_achievements = sum(len(ep.get('achievements_unlocked', [])) for ep in episodes)
|
|
61
|
+
|
|
62
|
+
print(f"\nš Overall Statistics:")
|
|
63
|
+
print(f"Average reward per episode: {avg_reward:.2f}")
|
|
64
|
+
print(f"Total achievements unlocked: {total_achievements}")
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Debug why agent is not using multiple actions."""
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
# Read the latest run output from stdin or file
|
|
9
|
+
if len(sys.argv) > 1:
|
|
10
|
+
with open(sys.argv[1]) as f:
|
|
11
|
+
output = f.read()
|
|
12
|
+
else:
|
|
13
|
+
# Check for latest log file
|
|
14
|
+
log_files = list(Path(".").glob("crafter_run_*.log"))
|
|
15
|
+
if not log_files:
|
|
16
|
+
print("No log files found. Run with: python test_crafter_react_agent_lm_synth.py --model 'Qwen/Qwen2.5-14B-Instruct' --episodes 1 --max-steps 10 --verbose 2>&1 | tee crafter_run.log")
|
|
17
|
+
exit(1)
|
|
18
|
+
|
|
19
|
+
latest_log = max(log_files, key=lambda f: f.stat().st_mtime)
|
|
20
|
+
with open(latest_log) as f:
|
|
21
|
+
output = f.read()
|
|
22
|
+
|
|
23
|
+
# Parse tool calls
|
|
24
|
+
tool_calls = []
|
|
25
|
+
single_action_count = 0
|
|
26
|
+
multi_action_count = 0
|
|
27
|
+
|
|
28
|
+
for line in output.split('\n'):
|
|
29
|
+
if "š§ Turn" in line and "Tool Call:" in line:
|
|
30
|
+
turn_info = {"turn": line}
|
|
31
|
+
tool_calls.append(turn_info)
|
|
32
|
+
elif "Actions:" in line and tool_calls:
|
|
33
|
+
actions_str = line.strip().split("Actions:")[1].strip()
|
|
34
|
+
try:
|
|
35
|
+
# Parse the action list
|
|
36
|
+
actions = eval(actions_str) if actions_str else []
|
|
37
|
+
tool_calls[-1]["actions"] = actions
|
|
38
|
+
tool_calls[-1]["action_count"] = len(actions)
|
|
39
|
+
|
|
40
|
+
if len(actions) == 1:
|
|
41
|
+
single_action_count += 1
|
|
42
|
+
elif len(actions) > 1:
|
|
43
|
+
multi_action_count += 1
|
|
44
|
+
except:
|
|
45
|
+
tool_calls[-1]["actions"] = "parse_error"
|
|
46
|
+
tool_calls[-1]["action_count"] = 0
|
|
47
|
+
|
|
48
|
+
print("š AGENT ACTION ANALYSIS\n")
|
|
49
|
+
print(f"Total tool calls: {len(tool_calls)}")
|
|
50
|
+
print(f"Single action calls: {single_action_count}")
|
|
51
|
+
print(f"Multi-action calls: {multi_action_count}")
|
|
52
|
+
print(f"Average actions per call: {sum(tc.get('action_count', 0) for tc in tool_calls) / len(tool_calls) if tool_calls else 0:.2f}")
|
|
53
|
+
|
|
54
|
+
# Show distribution
|
|
55
|
+
action_counts = {}
|
|
56
|
+
for tc in tool_calls:
|
|
57
|
+
count = tc.get('action_count', 0)
|
|
58
|
+
action_counts[count] = action_counts.get(count, 0) + 1
|
|
59
|
+
|
|
60
|
+
print("\nAction count distribution:")
|
|
61
|
+
for count in sorted(action_counts.keys()):
|
|
62
|
+
print(f" {count} actions: {action_counts[count]} times")
|
|
63
|
+
|
|
64
|
+
# Show examples of multi-action calls
|
|
65
|
+
print("\nš Multi-action examples:")
|
|
66
|
+
multi_examples = [tc for tc in tool_calls if tc.get('action_count', 0) > 1]
|
|
67
|
+
for example in multi_examples[:5]:
|
|
68
|
+
print(f" {example['turn']}")
|
|
69
|
+
print(f" Actions: {example['actions']}")
|
|
70
|
+
|
|
71
|
+
# Check for response parsing issues
|
|
72
|
+
print("\nš Response preview analysis:")
|
|
73
|
+
response_previews = []
|
|
74
|
+
for line in output.split('\n'):
|
|
75
|
+
if "š Raw response preview:" in line:
|
|
76
|
+
preview = line.split("preview:")[1].strip()
|
|
77
|
+
response_previews.append(preview)
|
|
78
|
+
|
|
79
|
+
if response_previews:
|
|
80
|
+
print(f"Found {len(response_previews)} response previews")
|
|
81
|
+
# Check if responses mention multiple actions
|
|
82
|
+
multi_action_mentions = 0
|
|
83
|
+
for preview in response_previews[:5]:
|
|
84
|
+
if any(word in preview.lower() for word in ['multiple', 'sequence', 'then', 'after']):
|
|
85
|
+
multi_action_mentions += 1
|
|
86
|
+
print(f" - {preview[:100]}...")
|
|
87
|
+
|
|
88
|
+
print(f"\nResponses mentioning sequences: {multi_action_mentions}/{len(response_previews[:5])}")
|
synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Quick check of recent traces without locking the database."""
|
|
3
|
+
|
|
4
|
+
import sqlite3
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from collections import Counter
|
|
8
|
+
|
|
9
|
+
# Use SQLite interface which is more permissive with locks
|
|
10
|
+
db_path = "./traces_v2_synth/traces.duckdb"
|
|
11
|
+
|
|
12
|
+
if Path(db_path).exists():
|
|
13
|
+
try:
|
|
14
|
+
# DuckDB files can be read with SQLite in read-only mode
|
|
15
|
+
conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
|
|
16
|
+
cursor = conn.cursor()
|
|
17
|
+
|
|
18
|
+
print("š Quick trace analysis...\n")
|
|
19
|
+
|
|
20
|
+
# Get recent events
|
|
21
|
+
cursor.execute("""
|
|
22
|
+
SELECT event_type, metadata
|
|
23
|
+
FROM events
|
|
24
|
+
ORDER BY id DESC
|
|
25
|
+
LIMIT 100
|
|
26
|
+
""")
|
|
27
|
+
|
|
28
|
+
events = cursor.fetchall()
|
|
29
|
+
print(f"Found {len(events)} recent events\n")
|
|
30
|
+
|
|
31
|
+
# Count event types
|
|
32
|
+
event_types = Counter([e[0] for e in events])
|
|
33
|
+
print("Event type distribution:")
|
|
34
|
+
for etype, count in event_types.most_common():
|
|
35
|
+
print(f" {etype}: {count}")
|
|
36
|
+
|
|
37
|
+
# Check for actions
|
|
38
|
+
print("\nš® Recent actions:")
|
|
39
|
+
action_count = 0
|
|
40
|
+
action_types = Counter()
|
|
41
|
+
|
|
42
|
+
for event_type, metadata_str in events:
|
|
43
|
+
if metadata_str and event_type == 'runtime':
|
|
44
|
+
try:
|
|
45
|
+
metadata = json.loads(metadata_str)
|
|
46
|
+
if 'action_name' in metadata:
|
|
47
|
+
action_types[metadata['action_name']] += 1
|
|
48
|
+
action_count += 1
|
|
49
|
+
if action_count <= 10:
|
|
50
|
+
print(f" - {metadata['action_name']}")
|
|
51
|
+
except:
|
|
52
|
+
pass
|
|
53
|
+
|
|
54
|
+
if action_types:
|
|
55
|
+
print(f"\nAction summary:")
|
|
56
|
+
for action, count in action_types.most_common():
|
|
57
|
+
print(f" {action}: {count}")
|
|
58
|
+
|
|
59
|
+
conn.close()
|
|
60
|
+
|
|
61
|
+
except Exception as e:
|
|
62
|
+
print(f"Error: {e}")
|
|
63
|
+
print("\nTrying alternative analysis...")
|
|
64
|
+
|
|
65
|
+
# If we can't read the DB, check for any JSON trace files
|
|
66
|
+
trace_files = list(Path("./traces_v2_synth").glob("session_*.json"))
|
|
67
|
+
if trace_files:
|
|
68
|
+
print(f"Found {len(trace_files)} JSON trace files")
|
|
69
|
+
latest = max(trace_files, key=lambda f: f.stat().st_mtime)
|
|
70
|
+
print(f"Latest: {latest.name}")
|
|
71
|
+
|
|
72
|
+
with open(latest) as f:
|
|
73
|
+
data = json.load(f)
|
|
74
|
+
print(f"Session ID: {data.get('session_id', 'Unknown')}")
|
|
75
|
+
print(f"Events: {len(data.get('events', []))}")
|
|
76
|
+
else:
|
|
77
|
+
print("No trace database found")
|