synth-ai 0.2.4.dev7__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (154) hide show
  1. synth_ai/__init__.py +1 -1
  2. synth_ai/cli/__init__.py +6 -0
  3. synth_ai/cli/balance.py +3 -15
  4. synth_ai/cli/demo.py +68 -9
  5. synth_ai/cli/rl_demo.py +137 -0
  6. synth_ai/cli/root.py +65 -0
  7. synth_ai/config/base_url.py +47 -0
  8. synth_ai/demos/core/__init__.py +1 -0
  9. synth_ai/demos/core/cli.py +621 -0
  10. synth_ai/demos/demo_task_apps/__init__.py +1 -0
  11. synth_ai/demos/demo_task_apps/core.py +374 -0
  12. synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
  13. synth_ai/demos/demo_task_apps/math/app.py +37 -0
  14. synth_ai/demos/demo_task_apps/math/config.toml +44 -0
  15. synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
  16. synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
  17. synth_ai/environments/examples/bandit/__init__.py +33 -0
  18. synth_ai/environments/examples/bandit/engine.py +294 -0
  19. synth_ai/environments/examples/bandit/environment.py +194 -0
  20. synth_ai/environments/examples/bandit/taskset.py +200 -0
  21. synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
  22. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
  23. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
  24. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
  25. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
  26. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
  27. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
  28. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
  29. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
  30. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
  31. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
  32. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
  33. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
  34. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
  35. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
  36. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
  37. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
  38. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
  39. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
  40. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
  41. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
  42. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
  43. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
  44. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
  45. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
  46. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
  47. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
  48. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
  49. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
  50. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
  51. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
  52. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
  53. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
  54. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
  55. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
  56. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
  57. synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
  58. synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
  59. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
  60. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
  61. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
  62. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
  63. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
  64. synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
  65. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
  66. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
  67. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
  68. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
  69. synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
  70. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
  71. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
  72. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
  73. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
  74. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
  75. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
  76. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
  77. synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
  78. synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
  79. synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
  80. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
  81. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
  82. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
  83. synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
  84. synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
  85. synth_ai/environments/examples/crafter_classic/environment.py +41 -2
  86. synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
  87. synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
  88. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
  89. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
  90. synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
  91. synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
  92. synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
  93. synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
  94. synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
  95. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
  96. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
  97. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
  98. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
  99. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
  100. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
  101. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
  102. synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
  103. synth_ai/environments/examples/red/units/__init__.py +1 -0
  104. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
  105. synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
  106. synth_ai/environments/service/app.py +8 -0
  107. synth_ai/http.py +102 -0
  108. synth_ai/inference/__init__.py +7 -0
  109. synth_ai/inference/client.py +20 -0
  110. synth_ai/install_sqld.sh +40 -0
  111. synth_ai/jobs/client.py +246 -0
  112. synth_ai/learning/__init__.py +24 -0
  113. synth_ai/learning/client.py +149 -0
  114. synth_ai/learning/config.py +43 -0
  115. synth_ai/learning/constants.py +29 -0
  116. synth_ai/learning/ft_client.py +59 -0
  117. synth_ai/learning/health.py +43 -0
  118. synth_ai/learning/jobs.py +205 -0
  119. synth_ai/learning/rl_client.py +256 -0
  120. synth_ai/learning/sse.py +58 -0
  121. synth_ai/learning/validators.py +48 -0
  122. synth_ai/lm/core/main_v3.py +13 -0
  123. synth_ai/lm/core/synth_models.py +48 -0
  124. synth_ai/lm/core/vendor_clients.py +9 -6
  125. synth_ai/lm/vendors/core/openai_api.py +31 -3
  126. synth_ai/lm/vendors/openai_standard.py +45 -14
  127. synth_ai/lm/vendors/supported/custom_endpoint.py +12 -2
  128. synth_ai/lm/vendors/synth_client.py +372 -28
  129. synth_ai/rl/__init__.py +30 -0
  130. synth_ai/rl/contracts.py +32 -0
  131. synth_ai/rl/env_keys.py +137 -0
  132. synth_ai/rl/secrets.py +19 -0
  133. synth_ai/scripts/verify_rewards.py +100 -0
  134. synth_ai/task/__init__.py +10 -0
  135. synth_ai/task/contracts.py +120 -0
  136. synth_ai/task/health.py +28 -0
  137. synth_ai/task/validators.py +12 -0
  138. synth_ai/tracing_v3/hooks.py +3 -1
  139. synth_ai/tracing_v3/session_tracer.py +123 -2
  140. synth_ai/tracing_v3/turso/manager.py +218 -0
  141. synth_ai/tracing_v3/turso/models.py +53 -0
  142. synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
  143. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +147 -30
  144. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
  145. synth_ai/tui/__init__.py +0 -1
  146. synth_ai/tui/__main__.py +0 -13
  147. synth_ai/tui/cli/__init__.py +0 -1
  148. synth_ai/tui/cli/query_experiments.py +0 -164
  149. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  150. synth_ai/tui/dashboard.py +0 -340
  151. synth_ai-0.2.4.dev7.dist-info/METADATA +0 -193
  152. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
  153. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
  154. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Check where hook results are being stored in the database.
4
+ """
5
+
6
+ import duckdb
7
+ import json
8
+
9
def check_hook_storage(conn=None):
    """Print where hook results are stored for the most recent experiment.

    Looks up the newest row in ``experiments`` and reports, for that
    experiment: the metadata items attached to each session, the number of
    ``hook`` events (with up to three samples), and a count per event type.

    Args:
        conn: Optional open connection exposing
            ``execute(sql, params).fetchall()``. When omitted, the local
            ``crafter_traces.duckdb`` file is opened and closed again before
            returning. A connection supplied by the caller is left open.
    """
    owns_conn = conn is None
    if owns_conn:
        conn = duckdb.connect("crafter_traces.duckdb")

    try:
        # Get recent experiment
        latest = conn.execute(
            "SELECT id, name FROM experiments ORDER BY created_at DESC LIMIT 1"
        ).fetchall()
        if not latest:
            # Indexing the first row of an empty result would raise IndexError.
            print("No experiments found in database")
            return
        exp_id, exp_name = latest[0]

        print(f"🔍 Checking hook storage for experiment: {exp_name} ({exp_id})")
        print("=" * 60)

        # Check session metadata
        print("\n📋 SESSION METADATA ANALYSIS:")
        sessions = conn.execute(
            "SELECT session_id, metadata FROM session_traces WHERE experiment_id = ?",
            [exp_id],
        ).fetchall()

        for session_id, metadata in sessions:
            print(f"\nSession: {session_id}")
            try:
                metadata_list = json.loads(metadata) if isinstance(metadata, str) else metadata
            except ValueError as e:
                print(f"Error parsing metadata for {session_id}: {e}")
                continue
            if not isinstance(metadata_list, (list, tuple)):
                # NULL or non-sequence metadata has no items to report.
                continue

            for i, item in enumerate(metadata_list):
                if not isinstance(item, dict):
                    continue
                metadata_type = item.get('metadata_type', 'unknown')
                data = item.get('data')
                if not isinstance(data, dict):
                    data = {}
                print(f" Item {i}: {metadata_type}")
                print(f" Keys: {list(data.keys())}")

                # Check for hook-related data
                achievements = data.get('achievements')
                if isinstance(achievements, dict):
                    unlocked = [k for k, v in achievements.items() if v]
                    print(f" Achievements: {unlocked}")

                if 'num_achievements' in data:
                    print(f" Num achievements: {data['num_achievements']}")

        # Check if there are any hook events
        print("\n🔍 HOOK EVENTS CHECK:")
        hook_count = conn.execute("""
            SELECT COUNT(*)
            FROM events e
            JOIN session_traces st ON e.session_id = st.session_id
            WHERE st.experiment_id = ? AND e.event_type = 'hook'
        """, [exp_id]).fetchall()[0][0]
        print(f"Hook events found: {hook_count}")

        if hook_count > 0:
            print("Sample hook events:")
            samples = conn.execute("""
                SELECT e.session_id, e.metadata
                FROM events e
                JOIN session_traces st ON e.session_id = st.session_id
                WHERE st.experiment_id = ? AND e.event_type = 'hook'
                LIMIT 3
            """, [exp_id]).fetchall()

            for session_id, metadata in samples:
                print(f" Session {session_id}: {metadata}")

        # Check for any other hook-related data
        print("\n🔍 OTHER HOOK DATA:")
        type_counts = conn.execute("""
            SELECT e.event_type, COUNT(*)
            FROM events e
            JOIN session_traces st ON e.session_id = st.session_id
            WHERE st.experiment_id = ?
            GROUP BY e.event_type
        """, [exp_id]).fetchall()

        print("Event types in experiment:")
        for event_type, count in type_counts:
            print(f" {event_type}: {count}")
    finally:
        # Close only a connection opened here, even if a query failed.
        if owns_conn:
            conn.close()
85
+
86
# Run the report only when this file is executed as a script.
if __name__ == "__main__":
    check_hook_storage()
@@ -0,0 +1,88 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Check what seeds are being used in the two experiments.
4
+ """
5
+
6
+ import duckdb
7
+ import json
8
+
9
# IDs of the two experiments to inspect, keyed by model name.
EXPERIMENTS = {
    "gpt-4.1-nano": "194a3cd2-ecd3-4081-b46d-a7883e4a86f9",
    "gpt-4.1-mini": "da74a769-b33d-4b60-ae2a-52a4b67b3f35",
}
14
+
15
def _iter_session_metadum_data(metadata):
    """Yield the ``data`` dict of each ``SessionMetadum`` entry in *metadata*."""
    metadata_list = json.loads(metadata) if isinstance(metadata, str) else metadata
    for meta_item in metadata_list:
        if isinstance(meta_item, dict) and meta_item.get('metadata_type') == 'SessionMetadum':
            data = meta_item.get('data', {})
            if isinstance(data, dict):
                yield data


def check_seeds(conn=None, experiments=None):
    """Check seeds for both experiments.

    For every experiment, prints one line per ``SessionMetadum`` entry that
    carries a ``seed`` or, failing that, an ``instance_num``.

    Args:
        conn: Optional open connection exposing
            ``execute(sql, params).fetchall()``. When omitted, the local
            ``crafter_traces.duckdb`` file is opened and closed again before
            returning. A connection supplied by the caller is left open.
        experiments: Optional mapping of model name to experiment id.
            Defaults to the module-level ``EXPERIMENTS``.
    """
    if experiments is None:
        experiments = EXPERIMENTS

    owns_conn = conn is None
    if owns_conn:
        conn = duckdb.connect("crafter_traces.duckdb")

    # Get all sessions for one experiment
    query = """
        SELECT session_id, metadata
        FROM session_traces
        WHERE experiment_id = ?
        ORDER BY session_id
    """

    try:
        for model_name, exp_id in experiments.items():
            print(f"\n🔍 {model_name.upper()} EXPERIMENT SEEDS")
            print("-" * 50)

            results = conn.execute(query, [exp_id]).fetchall()

            seeds = []
            for session_id, metadata in results:
                if not metadata:
                    continue
                try:
                    for data in _iter_session_metadum_data(metadata):
                        # Look for seed information
                        if 'seed' in data:
                            seeds.append({
                                'session_id': session_id,
                                'seed': data['seed'],
                            })
                        elif 'instance_num' in data:
                            # Sometimes seed is derived from instance_num
                            seeds.append({
                                'session_id': session_id,
                                'instance_num': data['instance_num'],
                            })
                except (TypeError, ValueError) as e:
                    # Malformed JSON or a non-iterable payload: report and move on.
                    print(f"Error parsing metadata for {session_id}: {e}")

            if seeds:
                print(f"Found {len(seeds)} sessions with seed info:")
                for seed_info in seeds:
                    print(f" {seed_info}")
            else:
                # The scan above already records every instance_num, so an
                # empty list means there are none; a second scan of the same
                # rows cannot find any. The messages are kept so the report
                # reads the same as before.
                print("No explicit seed information found in metadata")
                print("Checking for instance numbers...")
                print("No instance numbers found either")
    finally:
        # Close only a connection opened here, even if a query failed.
        if owns_conn:
            conn.close()
86
+
87
# Run the seed check only when this file is executed as a script.
if __name__ == "__main__":
    check_seeds()
@@ -0,0 +1,195 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Compare which seeds/instances performed better in nano vs mini.
4
+ Shows detailed performance breakdown by instance.
5
+ """
6
+
7
+ import duckdb
8
+ import json
9
+ from typing import Dict, List, Any
10
+
11
# IDs of the two experiments being compared, keyed by model name.
EXPERIMENTS = {
    "gpt-4.1-nano": "194a3cd2-ecd3-4081-b46d-a7883e4a86f9",
    "gpt-4.1-mini": "da74a769-b33d-4b60-ae2a-52a4b67b3f35",
}
16
+
17
def get_instance_performance(conn, experiment_id: str) -> Dict[int, Dict[str, Any]]:
    """Get performance data for each instance in an experiment.

    Args:
        conn: Open connection exposing ``execute(sql, params).fetchall()``.
        experiment_id: Value matched against ``session_traces.experiment_id``.

    Returns:
        A dict keyed by ``instance_num``. Each value holds ``session_id``,
        ``num_achievements``, ``unlocked_achievements``, ``total_reward``,
        ``rollout_length``, ``terminated`` and ``all_achievements``. Sessions
        with empty metadata or no ``instance_num`` are omitted. If several
        sessions share an ``instance_num``, the last one in ``session_id``
        order wins.
    """
    query = """
        SELECT session_id, metadata
        FROM session_traces
        WHERE experiment_id = ?
        ORDER BY session_id
    """

    results = conn.execute(query, [experiment_id]).fetchall()

    instance_data = {}

    for session_id, metadata in results:
        if not metadata:
            continue
        try:
            metadata_list = json.loads(metadata) if isinstance(metadata, str) else metadata

            # Payloads of every SessionMetadum entry; a null payload counts as empty.
            session_data = [
                meta_item.get('data') or {}
                for meta_item in metadata_list
                if isinstance(meta_item, dict)
                and meta_item.get('metadata_type') == 'SessionMetadum'
            ]

            # The first entry that carries an instance_num identifies the session.
            instance_num = next(
                (data['instance_num'] for data in session_data if 'instance_num' in data),
                None,
            )
            if instance_num is None:
                continue

            # The first entry that carries achievements supplies all the stats.
            summary = next((data for data in session_data if 'achievements' in data), {})
            # A null achievements value means "none unlocked", not a parse failure.
            achievements = summary.get('achievements') or {}

            instance_data[instance_num] = {
                'session_id': session_id,
                'num_achievements': summary.get('num_achievements', 0),
                'unlocked_achievements': [
                    ach for ach, unlocked in achievements.items() if unlocked
                ],
                'total_reward': summary.get('total_reward', 0.0),
                'rollout_length': summary.get('rollout_length', 0),
                'terminated': summary.get('terminated', False),
                'all_achievements': achievements,
            }
        except (AttributeError, KeyError, TypeError, ValueError) as e:
            # Best effort: report the malformed session and keep going.
            print(f"Error parsing metadata for {session_id}: {e}")

    return instance_data
79
+
80
def compare_instance_performance():
    """Compare performance between nano and mini for each instance.

    Loads per-instance results for both experiments from
    ``crafter_traces.duckdb`` and prints, for instances 1-10, the achievement
    counts and a winner, followed by win/tie totals, total achievements per
    model and a per-achievement count. Instances missing from an experiment
    are treated as zero achievements.
    """
    conn = duckdb.connect("crafter_traces.duckdb")
    try:
        # Get performance data for both experiments
        nano_data = get_instance_performance(conn, EXPERIMENTS["gpt-4.1-nano"])
        mini_data = get_instance_performance(conn, EXPERIMENTS["gpt-4.1-mini"])
    finally:
        # Nothing below touches the database, so release it straight away.
        conn.close()

    instance_nums = range(1, 11)  # Instances 1-10

    print("🔍 INSTANCE-BY-INSTANCE PERFORMANCE COMPARISON")
    print("=" * 80)

    # Compare each instance
    nano_wins = []
    mini_wins = []
    ties = []

    for instance_num in instance_nums:
        nano_perf = nano_data.get(instance_num, {})
        mini_perf = mini_data.get(instance_num, {})

        nano_achievements = nano_perf.get('num_achievements', 0)
        mini_achievements = mini_perf.get('num_achievements', 0)

        nano_unlocked = nano_perf.get('unlocked_achievements', [])
        mini_unlocked = mini_perf.get('unlocked_achievements', [])

        print(f"\n📊 Instance {instance_num} (Seed {42 + instance_num}):")
        print(f" GPT-4.1-NANO: {nano_achievements} achievements - {nano_unlocked}")
        print(f" GPT-4.1-MINI: {mini_achievements} achievements - {mini_unlocked}")

        if nano_achievements > mini_achievements:
            nano_wins.append(instance_num)
            print(" 🏆 NANO WINS")
        elif mini_achievements > nano_achievements:
            mini_wins.append(instance_num)
            print(" 🏆 MINI WINS")
        else:
            ties.append(instance_num)
            print(" 🤝 TIE")

    # Summary statistics
    print("\n📈 PERFORMANCE SUMMARY")
    print("=" * 50)
    print(f"NANO wins: {len(nano_wins)} instances - {nano_wins}")
    print(f"MINI wins: {len(mini_wins)} instances - {mini_wins}")
    print(f"Ties: {len(ties)} instances - {ties}")

    # Calculate total achievements by model. Use .get so an instance that is
    # absent from an experiment counts as zero instead of raising KeyError.
    total_nano_achievements = sum(
        nano_data.get(i, {}).get('num_achievements', 0) for i in instance_nums
    )
    total_mini_achievements = sum(
        mini_data.get(i, {}).get('num_achievements', 0) for i in instance_nums
    )

    print("\n🏆 TOTAL ACHIEVEMENTS:")
    print(f" GPT-4.1-NANO: {total_nano_achievements}")
    print(f" GPT-4.1-MINI: {total_mini_achievements}")

    # Show which instances each model dominated
    print("\n🎯 INSTANCE DOMINANCE:")
    if nano_wins:
        print(f" NANO dominated: {nano_wins}")
    if mini_wins:
        print(f" MINI dominated: {mini_wins}")
    if ties:
        print(f" Tied instances: {ties}")

    # Detailed breakdown by achievement type
    print("\n📊 ACHIEVEMENT BREAKDOWN BY INSTANCE:")
    print("-" * 60)

    all_achievements = set()
    for model_data in (nano_data, mini_data):
        for perf in model_data.values():
            all_achievements.update(perf.get('all_achievements', {}).keys())

    for achievement in sorted(all_achievements):
        nano_count = sum(
            1 for perf in nano_data.values()
            if perf.get('all_achievements', {}).get(achievement, False)
        )
        mini_count = sum(
            1 for perf in mini_data.values()
            if perf.get('all_achievements', {}).get(achievement, False)
        )

        if mini_count > nano_count:
            leader = 'MINI'
        elif nano_count > mini_count:
            leader = 'NANO'
        else:
            leader = 'TIE'

        print(f"{achievement:20} | NANO: {nano_count:2d} | MINI: {mini_count:2d} | {leader}")
164
+
165
def show_detailed_instance_analysis():
    """Show detailed analysis of each instance.

    Loads per-instance results for both experiments from
    ``crafter_traces.duckdb`` and prints, for instances 1-10, each model's
    achievement count, total reward and rollout length, plus the unlocked
    achievements when either model has any.
    """
    conn = duckdb.connect("crafter_traces.duckdb")
    try:
        nano_data = get_instance_performance(conn, EXPERIMENTS["gpt-4.1-nano"])
        mini_data = get_instance_performance(conn, EXPERIMENTS["gpt-4.1-mini"])
    finally:
        # Nothing below touches the database, so release it straight away.
        conn.close()

    print("\n🔍 DETAILED INSTANCE ANALYSIS")
    print("=" * 80)

    for instance_num in range(1, 11):
        nano_perf = nano_data.get(instance_num, {})
        mini_perf = mini_data.get(instance_num, {})

        print(f"\n📋 Instance {instance_num} (Seed {42 + instance_num}):")
        print(f" NANO: {nano_perf.get('num_achievements', 0)} achievements, reward: {nano_perf.get('total_reward', 0.0):.2f}, length: {nano_perf.get('rollout_length', 0)}")
        print(f" MINI: {mini_perf.get('num_achievements', 0)} achievements, reward: {mini_perf.get('total_reward', 0.0):.2f}, length: {mini_perf.get('rollout_length', 0)}")

        # Show specific achievements
        nano_achievements = nano_perf.get('unlocked_achievements', [])
        mini_achievements = mini_perf.get('unlocked_achievements', [])

        if nano_achievements or mini_achievements:
            print(f" NANO unlocked: {nano_achievements}")
            print(f" MINI unlocked: {mini_achievements}")
192
+
193
# Run both reports, comparison first, only when executed as a script.
if __name__ == "__main__":
    compare_instance_performance()
    show_detailed_instance_analysis()