synth-ai 0.2.4.dev7__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- synth_ai/__init__.py +1 -1
- synth_ai/cli/__init__.py +6 -0
- synth_ai/cli/balance.py +3 -15
- synth_ai/cli/demo.py +68 -9
- synth_ai/cli/rl_demo.py +137 -0
- synth_ai/cli/root.py +65 -0
- synth_ai/config/base_url.py +47 -0
- synth_ai/demos/core/__init__.py +1 -0
- synth_ai/demos/core/cli.py +621 -0
- synth_ai/demos/demo_task_apps/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/core.py +374 -0
- synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/math/app.py +37 -0
- synth_ai/demos/demo_task_apps/math/config.toml +44 -0
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
- synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
- synth_ai/environments/examples/bandit/__init__.py +33 -0
- synth_ai/environments/examples/bandit/engine.py +294 -0
- synth_ai/environments/examples/bandit/environment.py +194 -0
- synth_ai/environments/examples/bandit/taskset.py +200 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
- synth_ai/environments/examples/crafter_classic/environment.py +41 -2
- synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
- synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
- synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
- synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
- synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
- synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
- synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/red/units/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
- synth_ai/environments/service/app.py +8 -0
- synth_ai/http.py +102 -0
- synth_ai/inference/__init__.py +7 -0
- synth_ai/inference/client.py +20 -0
- synth_ai/install_sqld.sh +40 -0
- synth_ai/jobs/client.py +246 -0
- synth_ai/learning/__init__.py +24 -0
- synth_ai/learning/client.py +149 -0
- synth_ai/learning/config.py +43 -0
- synth_ai/learning/constants.py +29 -0
- synth_ai/learning/ft_client.py +59 -0
- synth_ai/learning/health.py +43 -0
- synth_ai/learning/jobs.py +205 -0
- synth_ai/learning/rl_client.py +256 -0
- synth_ai/learning/sse.py +58 -0
- synth_ai/learning/validators.py +48 -0
- synth_ai/lm/core/main_v3.py +13 -0
- synth_ai/lm/core/synth_models.py +48 -0
- synth_ai/lm/core/vendor_clients.py +9 -6
- synth_ai/lm/vendors/core/openai_api.py +31 -3
- synth_ai/lm/vendors/openai_standard.py +45 -14
- synth_ai/lm/vendors/supported/custom_endpoint.py +12 -2
- synth_ai/lm/vendors/synth_client.py +372 -28
- synth_ai/rl/__init__.py +30 -0
- synth_ai/rl/contracts.py +32 -0
- synth_ai/rl/env_keys.py +137 -0
- synth_ai/rl/secrets.py +19 -0
- synth_ai/scripts/verify_rewards.py +100 -0
- synth_ai/task/__init__.py +10 -0
- synth_ai/task/contracts.py +120 -0
- synth_ai/task/health.py +28 -0
- synth_ai/task/validators.py +12 -0
- synth_ai/tracing_v3/hooks.py +3 -1
- synth_ai/tracing_v3/session_tracer.py +123 -2
- synth_ai/tracing_v3/turso/manager.py +218 -0
- synth_ai/tracing_v3/turso/models.py +53 -0
- synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +147 -30
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
- synth_ai/tui/__init__.py +0 -1
- synth_ai/tui/__main__.py +0 -13
- synth_ai/tui/cli/__init__.py +0 -1
- synth_ai/tui/cli/query_experiments.py +0 -164
- synth_ai/tui/cli/query_experiments_v3.py +0 -164
- synth_ai/tui/dashboard.py +0 -340
- synth_ai-0.2.4.dev7.dist-info/METADATA +0 -193
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Check where hook results are being stored in the database.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import duckdb
|
|
7
|
+
import json
|
|
8
|
+
|
|
9
|
+
def check_hook_storage():
    """Print where hook results are stored for the most recent experiment.

    Connects to ``crafter_traces.duckdb`` (relative to the current working
    directory), selects the experiment with the newest ``created_at`` and
    prints three sections:

    * the metadata items attached to each of its session traces,
    * how many events of type ``'hook'`` it has, plus up to three samples,
    * the number of events per event type.

    Returns early with a message when the ``experiments`` table is empty.
    The connection is closed even if a query or a metadata parse fails.
    """
    conn = duckdb.connect("crafter_traces.duckdb")
    try:
        # Get recent experiment
        latest = conn.execute("SELECT id, name FROM experiments ORDER BY created_at DESC LIMIT 1").fetchall()
        if not latest:
            # Indexing latest[0] on an empty table would raise IndexError.
            print("No experiments found in crafter_traces.duckdb")
            return
        exp_id, exp_name = latest[0]

        print(f"š Checking hook storage for experiment: {exp_name} ({exp_id})")
        print("=" * 60)

        # Check session metadata
        print("\nš SESSION METADATA ANALYSIS:")
        sessions = conn.execute("SELECT session_id, metadata FROM session_traces WHERE experiment_id = ?", [exp_id]).fetchall()

        for session_id, metadata in sessions:
            # The column may already be decoded; only parse raw JSON text.
            metadata_list = json.loads(metadata) if isinstance(metadata, str) else metadata

            print(f"\nSession: {session_id}")
            # A NULL metadata column has nothing to enumerate.
            for i, item in enumerate(metadata_list or []):
                if not isinstance(item, dict):
                    # Only dict items carry 'metadata_type' / 'data'.
                    continue
                metadata_type = item.get('metadata_type', 'unknown')
                # 'data' may be present but null; treat that as empty.
                data = item.get('data') or {}
                print(f" Item {i}: {metadata_type}")
                print(f" Keys: {list(data.keys())}")

                # Check for hook-related data
                if 'achievements' in data:
                    achievements = data['achievements'] or {}
                    unlocked = [k for k, v in achievements.items() if v]
                    print(f" Achievements: {unlocked}")

                if 'num_achievements' in data:
                    print(f" Num achievements: {data['num_achievements']}")

        # Check if there are any hook events
        print("\nš HOOK EVENTS CHECK:")
        hook_count = conn.execute("""
            SELECT COUNT(*)
            FROM events e
            JOIN session_traces st ON e.session_id = st.session_id
            WHERE st.experiment_id = ? AND e.event_type = 'hook'
        """, [exp_id]).fetchall()[0][0]
        print(f"Hook events found: {hook_count}")

        if hook_count > 0:
            print("Sample hook events:")
            samples = conn.execute("""
                SELECT e.session_id, e.metadata
                FROM events e
                JOIN session_traces st ON e.session_id = st.session_id
                WHERE st.experiment_id = ? AND e.event_type = 'hook'
                LIMIT 3
            """, [exp_id]).fetchall()

            for session_id, metadata in samples:
                print(f" Session {session_id}: {metadata}")

        # Check for any other hook-related data
        print("\nš OTHER HOOK DATA:")
        per_type = conn.execute("""
            SELECT e.event_type, COUNT(*)
            FROM events e
            JOIN session_traces st ON e.session_id = st.session_id
            WHERE st.experiment_id = ?
            GROUP BY e.event_type
        """, [exp_id]).fetchall()

        print("Event types in experiment:")
        for event_type, count in per_type:
            print(f" {event_type}: {count}")
    finally:
        conn.close()
|
|
85
|
+
|
|
86
|
+
# Script entry point: run the report only when executed directly.
if __name__ == "__main__":
    check_hook_storage()
|
synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Check what seeds are being used in the two experiments.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import duckdb
|
|
7
|
+
import json
|
|
8
|
+
|
|
9
|
+
# Experiment IDs
|
|
10
|
+
# Maps a model label to the experiment_id of its run in crafter_traces.duckdb.
EXPERIMENTS = {
    "gpt-4.1-nano": "194a3cd2-ecd3-4081-b46d-a7883e4a86f9",
    "gpt-4.1-mini": "da74a769-b33d-4b60-ae2a-52a4b67b3f35",
}
|
|
14
|
+
|
|
15
|
+
def check_seeds():
    """Check seeds for both experiments.

    For every entry in ``EXPERIMENTS`` this reads the session traces of that
    experiment from ``crafter_traces.duckdb`` and prints, per session, the
    ``seed`` (or, failing that, the ``instance_num``) found in metadata items
    whose ``metadata_type`` is ``'SessionMetadum'``. When nothing is found, a
    second pass looks for ``instance_num`` values only.

    Sessions whose metadata cannot be parsed are reported and skipped. The
    connection is closed even if a query fails.
    """
    # Failures expected from malformed metadata: invalid JSON (ValueError),
    # non-iterable / non-container payloads (TypeError, AttributeError).
    parse_errors = (ValueError, TypeError, AttributeError)

    conn = duckdb.connect("crafter_traces.duckdb")
    try:
        for model_name, exp_id in EXPERIMENTS.items():
            print(f"\nš {model_name.upper()} EXPERIMENT SEEDS")
            print("-" * 50)

            # Get all sessions for this experiment
            query = """
                SELECT session_id, metadata
                FROM session_traces
                WHERE experiment_id = ?
                ORDER BY session_id
            """

            results = conn.execute(query, [exp_id]).fetchall()

            seeds = []
            for session_id, metadata in results:
                if not metadata:
                    continue
                try:
                    metadata_list = json.loads(metadata) if isinstance(metadata, str) else metadata

                    for meta_item in metadata_list:
                        if isinstance(meta_item, dict) and meta_item.get('metadata_type') == 'SessionMetadum':
                            data = meta_item.get('data', {})

                            # Look for seed information
                            if 'seed' in data:
                                seeds.append({
                                    'session_id': session_id,
                                    'seed': data['seed']
                                })
                            elif 'instance_num' in data:
                                # Sometimes seed is derived from instance_num
                                seeds.append({
                                    'session_id': session_id,
                                    'instance_num': data['instance_num']
                                })
                except parse_errors as e:
                    print(f"Error parsing metadata for {session_id}: {e}")

            if seeds:
                print(f"Found {len(seeds)} sessions with seed info:")
                for seed_info in seeds:
                    print(f" {seed_info}")
            else:
                print("No explicit seed information found in metadata")
                print("Checking for instance numbers...")

                # Check for instance numbers as a proxy for seeds
                # NOTE(review): the first pass already records instance_num
                # entries, so this fallback looks unable to find anything
                # new — confirm before relying on it.
                instance_nums = []
                for session_id, metadata in results:
                    if not metadata:
                        continue
                    try:
                        metadata_list = json.loads(metadata) if isinstance(metadata, str) else metadata
                        for meta_item in metadata_list:
                            if isinstance(meta_item, dict) and meta_item.get('metadata_type') == 'SessionMetadum':
                                data = meta_item.get('data', {})
                                if 'instance_num' in data:
                                    instance_nums.append(data['instance_num'])
                    except parse_errors:
                        # Same parse failure was already reported by the
                        # first pass; skip the session without repeating it.
                        continue

                if instance_nums:
                    print(f"Instance numbers found: {sorted(instance_nums)}")
                else:
                    print("No instance numbers found either")
    finally:
        conn.close()
|
|
86
|
+
|
|
87
|
+
# Script entry point: run the seed report only when executed directly.
if __name__ == "__main__":
    check_seeds()
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Compare which seeds/instances performed better in nano vs mini.
|
|
4
|
+
Shows detailed performance breakdown by instance.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import duckdb
|
|
8
|
+
import json
|
|
9
|
+
from typing import Dict, List, Any
|
|
10
|
+
|
|
11
|
+
# Experiment IDs
|
|
12
|
+
# Maps a model label to the experiment_id of its run in crafter_traces.duckdb.
EXPERIMENTS = {
    "gpt-4.1-nano": "194a3cd2-ecd3-4081-b46d-a7883e4a86f9",
    "gpt-4.1-mini": "da74a769-b33d-4b60-ae2a-52a4b67b3f35",
}
|
|
16
|
+
|
|
17
|
+
def get_instance_performance(conn, experiment_id: str) -> Dict[int, Dict[str, Any]]:
    """Get performance data for each instance in an experiment.

    Args:
        conn: Connection-like object; only
            ``conn.execute(sql, params).fetchall()`` is used.
        experiment_id: Value matched against ``session_traces.experiment_id``.

    Returns:
        A dict keyed by the ``instance_num`` found in the session metadata.
        Each value has the keys ``session_id``, ``num_achievements``,
        ``unlocked_achievements``, ``total_reward``, ``rollout_length``,
        ``terminated`` and ``all_achievements``. Sessions without metadata
        or without an ``instance_num`` are omitted; if two sessions share an
        ``instance_num`` the later row wins.

    Metadata fields that are missing *or* JSON ``null`` fall back to their
    defaults (``0``, ``0.0``, ``False``, ``{}``) so callers can compare and
    format the values without checking for ``None``. Sessions whose metadata
    cannot be parsed are reported on stdout and skipped.
    """
    def _or_default(value, default):
        # JSON null arrives as None; callers expect a usable number/bool.
        return default if value is None else value

    query = """
        SELECT session_id, metadata
        FROM session_traces
        WHERE experiment_id = ?
        ORDER BY session_id
    """

    results = conn.execute(query, [experiment_id]).fetchall()

    instance_data = {}

    for session_id, metadata in results:
        if not metadata:
            continue
        try:
            # The column may already be decoded; only parse raw JSON text.
            metadata_list = json.loads(metadata) if isinstance(metadata, str) else metadata

            # Payloads of every SessionMetadum item, with null data as {}.
            payloads = [
                meta_item.get('data') or {}
                for meta_item in metadata_list
                if isinstance(meta_item, dict) and meta_item.get('metadata_type') == 'SessionMetadum'
            ]

            # First payload that names the instance identifies the session.
            instance_num = next(
                (data['instance_num'] for data in payloads if 'instance_num' in data),
                None,
            )
            if instance_num is None:
                continue

            # First payload carrying achievement data supplies the stats.
            stats = next((data for data in payloads if 'achievements' in data), {})
            achievements = _or_default(stats.get('achievements'), {})
            unlocked_achievements = [ach for ach, unlocked in achievements.items() if unlocked]

            instance_data[instance_num] = {
                'session_id': session_id,
                'num_achievements': _or_default(stats.get('num_achievements'), 0),
                'unlocked_achievements': unlocked_achievements,
                'total_reward': _or_default(stats.get('total_reward'), 0.0),
                'rollout_length': _or_default(stats.get('rollout_length'), 0),
                'terminated': _or_default(stats.get('terminated'), False),
                'all_achievements': achievements
            }
        except (ValueError, TypeError, AttributeError) as e:
            # Invalid JSON or an unexpected metadata shape: skip the session.
            print(f"Error parsing metadata for {session_id}: {e}")

    return instance_data
|
|
79
|
+
|
|
80
|
+
def compare_instance_performance():
    """Compare performance between nano and mini for each instance.

    Loads per-instance results for the ``gpt-4.1-nano`` and ``gpt-4.1-mini``
    experiments from ``crafter_traces.duckdb`` and prints, for instances
    1-10: the achievement count and unlocked achievements of each model, the
    winner per instance, win/tie totals, total achievements per model and a
    per-achievement tally of how many instances unlocked it.

    An instance missing from either experiment counts as 0 achievements
    everywhere, including the totals. The database connection is closed as
    soon as both result sets are loaded, even if loading fails.
    """
    conn = duckdb.connect("crafter_traces.duckdb")
    try:
        # Get performance data for both experiments
        nano_data = get_instance_performance(conn, EXPERIMENTS["gpt-4.1-nano"])
        mini_data = get_instance_performance(conn, EXPERIMENTS["gpt-4.1-mini"])
    finally:
        conn.close()

    instance_nums = range(1, 11)  # Instances 1-10

    print("š INSTANCE-BY-INSTANCE PERFORMANCE COMPARISON")
    print("=" * 80)

    # Compare each instance
    nano_wins = []
    mini_wins = []
    ties = []

    for instance_num in instance_nums:
        nano_perf = nano_data.get(instance_num, {})
        mini_perf = mini_data.get(instance_num, {})

        nano_achievements = nano_perf.get('num_achievements', 0)
        mini_achievements = mini_perf.get('num_achievements', 0)

        nano_unlocked = nano_perf.get('unlocked_achievements', [])
        mini_unlocked = mini_perf.get('unlocked_achievements', [])

        # NOTE(review): the label assumes seed = 42 + instance number;
        # nothing in this script verifies that mapping.
        print(f"\nš Instance {instance_num} (Seed {42 + instance_num}):")
        print(f" GPT-4.1-NANO: {nano_achievements} achievements - {nano_unlocked}")
        print(f" GPT-4.1-MINI: {mini_achievements} achievements - {mini_unlocked}")

        if nano_achievements > mini_achievements:
            nano_wins.append(instance_num)
            print(" š NANO WINS")
        elif mini_achievements > nano_achievements:
            mini_wins.append(instance_num)
            print(" š MINI WINS")
        else:
            ties.append(instance_num)
            print(" š¤ TIE")

    # Summary statistics
    print("\nš PERFORMANCE SUMMARY")
    print("=" * 50)
    print(f"NANO wins: {len(nano_wins)} instances - {nano_wins}")
    print(f"MINI wins: {len(mini_wins)} instances - {mini_wins}")
    print(f"Ties: {len(ties)} instances - {ties}")

    # Calculate total achievements by model. Use .get() like the loop above
    # so an instance absent from one experiment counts as 0 instead of
    # raising KeyError.
    total_nano_achievements = sum(
        nano_data.get(i, {}).get('num_achievements', 0) for i in instance_nums
    )
    total_mini_achievements = sum(
        mini_data.get(i, {}).get('num_achievements', 0) for i in instance_nums
    )

    print("\nš TOTAL ACHIEVEMENTS:")
    print(f" GPT-4.1-NANO: {total_nano_achievements}")
    print(f" GPT-4.1-MINI: {total_mini_achievements}")

    # Show which instances each model dominated
    print("\nšÆ INSTANCE DOMINANCE:")
    if nano_wins:
        print(f" NANO dominated: {nano_wins}")
    if mini_wins:
        print(f" MINI dominated: {mini_wins}")
    if ties:
        print(f" Tied instances: {ties}")

    # Detailed breakdown by achievement type
    print("\nš ACHIEVEMENT BREAKDOWN BY INSTANCE:")
    print("-" * 60)

    # Union of every achievement name seen in either experiment.
    all_achievements = set()
    for model_data in (nano_data, mini_data):
        for perf in model_data.values():
            all_achievements.update(perf.get('all_achievements', {}).keys())

    for achievement in sorted(all_achievements):
        # Number of instances in which each model unlocked this achievement.
        nano_count = sum(
            1 for perf in nano_data.values()
            if perf.get('all_achievements', {}).get(achievement, False)
        )
        mini_count = sum(
            1 for perf in mini_data.values()
            if perf.get('all_achievements', {}).get(achievement, False)
        )

        if mini_count > nano_count:
            leader = 'MINI'
        elif nano_count > mini_count:
            leader = 'NANO'
        else:
            leader = 'TIE'

        print(f"{achievement:20} | NANO: {nano_count:2d} | MINI: {mini_count:2d} | {leader}")
|
|
164
|
+
|
|
165
|
+
def show_detailed_instance_analysis():
    """Print a per-instance summary (achievements, reward, length) for both models.

    Reads the nano and mini experiments from ``crafter_traces.duckdb`` and,
    for instances 1-10, prints one summary line per model followed by the
    unlocked achievements when at least one model unlocked something.
    """
    conn = duckdb.connect("crafter_traces.duckdb")

    per_model = {
        "NANO": get_instance_performance(conn, EXPERIMENTS["gpt-4.1-nano"]),
        "MINI": get_instance_performance(conn, EXPERIMENTS["gpt-4.1-mini"]),
    }

    print("\nš DETAILED INSTANCE ANALYSIS")
    print("=" * 80)

    for instance_num in range(1, 11):
        # Missing instances fall back to an empty record (all defaults).
        records = [
            (label, results.get(instance_num, {}))
            for label, results in per_model.items()
        ]

        print(f"\nš Instance {instance_num} (Seed {42 + instance_num}):")
        for label, perf in records:
            count = perf.get('num_achievements', 0)
            reward = perf.get('total_reward', 0.0)
            length = perf.get('rollout_length', 0)
            print(f" {label}: {count} achievements, reward: {reward:.2f}, length: {length}")

        # Show specific achievements
        unlocked = [(label, perf.get('unlocked_achievements', [])) for label, perf in records]
        if any(names for _, names in unlocked):
            for label, names in unlocked:
                print(f" {label} unlocked: {names}")

    conn.close()
|
|
192
|
+
|
|
193
|
+
# Script entry point: print the comparison first, then the detailed view.
if __name__ == "__main__":
    for report in (compare_instance_performance, show_detailed_instance_analysis):
        report()
|