synth-ai 0.2.4.dev7__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (154) hide show
  1. synth_ai/__init__.py +1 -1
  2. synth_ai/cli/__init__.py +6 -0
  3. synth_ai/cli/balance.py +3 -15
  4. synth_ai/cli/demo.py +68 -9
  5. synth_ai/cli/rl_demo.py +137 -0
  6. synth_ai/cli/root.py +65 -0
  7. synth_ai/config/base_url.py +47 -0
  8. synth_ai/demos/core/__init__.py +1 -0
  9. synth_ai/demos/core/cli.py +621 -0
  10. synth_ai/demos/demo_task_apps/__init__.py +1 -0
  11. synth_ai/demos/demo_task_apps/core.py +374 -0
  12. synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
  13. synth_ai/demos/demo_task_apps/math/app.py +37 -0
  14. synth_ai/demos/demo_task_apps/math/config.toml +44 -0
  15. synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
  16. synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
  17. synth_ai/environments/examples/bandit/__init__.py +33 -0
  18. synth_ai/environments/examples/bandit/engine.py +294 -0
  19. synth_ai/environments/examples/bandit/environment.py +194 -0
  20. synth_ai/environments/examples/bandit/taskset.py +200 -0
  21. synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
  22. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
  23. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
  24. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
  25. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
  26. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
  27. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
  28. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
  29. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
  30. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
  31. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
  32. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
  33. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
  34. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
  35. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
  36. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
  37. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
  38. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
  39. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
  40. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
  41. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
  42. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
  43. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
  44. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
  45. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
  46. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
  47. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
  48. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
  49. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
  50. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
  51. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
  52. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
  53. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
  54. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
  55. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
  56. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
  57. synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
  58. synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
  59. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
  60. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
  61. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
  62. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
  63. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
  64. synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
  65. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
  66. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
  67. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
  68. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
  69. synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
  70. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
  71. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
  72. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
  73. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
  74. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
  75. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
  76. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
  77. synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
  78. synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
  79. synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
  80. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
  81. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
  82. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
  83. synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
  84. synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
  85. synth_ai/environments/examples/crafter_classic/environment.py +41 -2
  86. synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
  87. synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
  88. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
  89. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
  90. synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
  91. synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
  92. synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
  93. synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
  94. synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
  95. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
  96. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
  97. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
  98. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
  99. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
  100. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
  101. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
  102. synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
  103. synth_ai/environments/examples/red/units/__init__.py +1 -0
  104. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
  105. synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
  106. synth_ai/environments/service/app.py +8 -0
  107. synth_ai/http.py +102 -0
  108. synth_ai/inference/__init__.py +7 -0
  109. synth_ai/inference/client.py +20 -0
  110. synth_ai/install_sqld.sh +40 -0
  111. synth_ai/jobs/client.py +246 -0
  112. synth_ai/learning/__init__.py +24 -0
  113. synth_ai/learning/client.py +149 -0
  114. synth_ai/learning/config.py +43 -0
  115. synth_ai/learning/constants.py +29 -0
  116. synth_ai/learning/ft_client.py +59 -0
  117. synth_ai/learning/health.py +43 -0
  118. synth_ai/learning/jobs.py +205 -0
  119. synth_ai/learning/rl_client.py +256 -0
  120. synth_ai/learning/sse.py +58 -0
  121. synth_ai/learning/validators.py +48 -0
  122. synth_ai/lm/core/main_v3.py +13 -0
  123. synth_ai/lm/core/synth_models.py +48 -0
  124. synth_ai/lm/core/vendor_clients.py +9 -6
  125. synth_ai/lm/vendors/core/openai_api.py +31 -3
  126. synth_ai/lm/vendors/openai_standard.py +45 -14
  127. synth_ai/lm/vendors/supported/custom_endpoint.py +12 -2
  128. synth_ai/lm/vendors/synth_client.py +372 -28
  129. synth_ai/rl/__init__.py +30 -0
  130. synth_ai/rl/contracts.py +32 -0
  131. synth_ai/rl/env_keys.py +137 -0
  132. synth_ai/rl/secrets.py +19 -0
  133. synth_ai/scripts/verify_rewards.py +100 -0
  134. synth_ai/task/__init__.py +10 -0
  135. synth_ai/task/contracts.py +120 -0
  136. synth_ai/task/health.py +28 -0
  137. synth_ai/task/validators.py +12 -0
  138. synth_ai/tracing_v3/hooks.py +3 -1
  139. synth_ai/tracing_v3/session_tracer.py +123 -2
  140. synth_ai/tracing_v3/turso/manager.py +218 -0
  141. synth_ai/tracing_v3/turso/models.py +53 -0
  142. synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
  143. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +147 -30
  144. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
  145. synth_ai/tui/__init__.py +0 -1
  146. synth_ai/tui/__main__.py +0 -13
  147. synth_ai/tui/cli/__init__.py +0 -1
  148. synth_ai/tui/cli/query_experiments.py +0 -164
  149. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  150. synth_ai/tui/dashboard.py +0 -340
  151. synth_ai-0.2.4.dev7.dist-info/METADATA +0 -193
  152. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
  153. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
  154. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,324 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Compare experiments between gpt-4o-mini and gpt-4o.
4
+ Analyzes performance differences, achievement patterns, and instance difficulty.
5
+ """
6
+
7
+ import json
8
+ from typing import Any, Dict, List
9
+
10
+ import duckdb
11
+ import pandas as pd
12
+
13
# Experiment IDs from the runs.
# Maps the model label used in report headings to the UUID of the
# experiment whose session traces are queried for that model.
EXPERIMENTS = {
    "gpt-4o-mini": "137683ed-3bd5-4bd3-9162-dae0371ddd3d",
    "gpt-4o": "207307d5-4105-4a18-bb93-89936047fa18"
}
18
+
19
def connect_to_db():
    """Open the Crafter traces DuckDB file and return the connection."""
    db_path = "synth_ai/traces/crafter_traces.duckdb"
    return duckdb.connect(db_path)
22
+
23
def get_experiment_summary(conn, experiment_id: str) -> Dict[str, Any]:
    """Look up one experiment's descriptive fields and git version.

    Args:
        conn: Open database connection exposing ``execute(sql, params)``.
        experiment_id: Identifier matched against ``experiments.id``.

    Returns:
        A dict with the keys ``experiment_id``, ``name``, ``description``,
        ``created_at``, ``branch`` and ``commit`` taken from the first
        matching row, or ``None`` when the query returns no row.
    """
    sql = """
    SELECT
        e.id,
        e.name,
        e.description,
        e.created_at,
        sv.branch,
        sv.commit
    FROM experiments e
    LEFT JOIN experimental_systems es ON e.id = es.experiment_id
    LEFT JOIN system_versions sv ON es.system_version_id = sv.id
    WHERE e.id = ?
    """

    row = conn.execute(sql, [experiment_id]).fetchone()
    if not row:
        return None

    # Key order mirrors the column order of the SELECT list above.
    field_names = (
        "experiment_id",
        "name",
        "description",
        "created_at",
        "branch",
        "commit",
    )
    return dict(zip(field_names, row))
50
+
51
def get_session_stats(conn, experiment_id: str) -> pd.DataFrame:
    """Fetch the per-session counters of one experiment as a DataFrame.

    Args:
        conn: Open database connection exposing ``execute(sql, params)``.
        experiment_id: Identifier matched against ``session_traces.experiment_id``.

    Returns:
        One row per session trace (id, creation time, timestep/event/message
        counts and metadata), ordered by creation time.
    """
    sql = """
    SELECT
        st.session_id,
        st.created_at,
        st.num_timesteps,
        st.num_events,
        st.num_messages,
        st.metadata
    FROM session_traces st
    WHERE st.experiment_id = ?
    ORDER BY st.created_at
    """

    cursor = conn.execute(sql, [experiment_id])
    return cursor.df()
67
+
68
def get_achievement_analysis(conn, experiment_id: str) -> Dict[str, Any]:
    """Collect the achievements recorded in an experiment's session metadata.

    Each session's ``metadata`` value is either a JSON string or an
    already-decoded sequence. Only entries that are dicts with
    ``metadata_type == 'SessionMetadum'`` and whose ``data`` contains an
    ``achievements`` mapping (achievement name -> truthy when unlocked) are
    used; everything else is skipped.

    Args:
        conn: Open database connection exposing ``execute(sql, params)``.
        experiment_id: Identifier matched against ``session_traces.experiment_id``.

    Returns:
        A dict with:
          * ``total_achievements``: number of unlocked achievements summed
            over all sessions (duplicates counted).
          * ``unique_achievements``: number of distinct unlocked achievements.
          * ``achievement_counts``: achievement name -> number of unlocks.
          * ``achievement_list``: flat list of every unlocked achievement.
          * ``session_achievements``: one dict per qualifying metadata entry
            with ``session_id``, ``num_achievements``,
            ``unlocked_achievements`` and ``total_achievements``.

    Sessions whose metadata cannot be parsed are reported on stdout and
    skipped; they never abort the analysis.
    """
    from collections import Counter

    # Get session traces with achievement data
    query = """
    SELECT
        st.session_id,
        st.metadata
    FROM session_traces st
    WHERE st.experiment_id = ?
    AND st.metadata IS NOT NULL
    """

    results = conn.execute(query, [experiment_id]).fetchall()

    all_achievements: List[str] = []
    session_achievements: List[Dict[str, Any]] = []

    for session_id, metadata in results:
        if not metadata:
            continue

        try:
            # Metadata may arrive as raw JSON text or as a decoded object.
            metadata_list = json.loads(metadata) if isinstance(metadata, str) else metadata

            for meta_item in metadata_list:
                if not isinstance(meta_item, dict):
                    continue
                if meta_item.get('metadata_type') != 'SessionMetadum':
                    continue

                data = meta_item.get('data', {})
                if 'achievements' not in data:
                    continue

                achievements_dict = data['achievements']
                num_achievements = data.get('num_achievements', 0)

                # Extract unlocked achievements
                unlocked = [name for name, is_unlocked in achievements_dict.items() if is_unlocked]
                all_achievements.extend(unlocked)

                session_achievements.append({
                    'session_id': session_id,
                    'num_achievements': num_achievements,
                    'unlocked_achievements': unlocked,
                    'total_achievements': len(achievements_dict)
                })
        except (ValueError, TypeError, AttributeError, KeyError) as e:
            # Malformed JSON (ValueError) or an unexpected metadata shape:
            # report it and keep going with the remaining sessions.
            print(f"Error parsing metadata for session {session_id}: {e}")

    # Count achievements; Counter keeps first-seen order, and the result is
    # converted back to a plain dict so the return type is unchanged.
    achievement_counts = dict(Counter(all_achievements))

    return {
        "total_achievements": len(all_achievements),
        "unique_achievements": len(set(all_achievements)),
        "achievement_counts": achievement_counts,
        "achievement_list": all_achievements,
        "session_achievements": session_achievements
    }
124
+
125
def get_model_usage_analysis(conn, experiment_id: str) -> pd.DataFrame:
    """Aggregate language-model call statistics for one experiment.

    Args:
        conn: Open database connection exposing ``execute(sql, params)``.
        experiment_id: Identifier matched against ``session_traces.experiment_id``.

    Returns:
        One row per (model_name, provider) pair with the call count, summed
        and averaged token counts, summed cost and average latency, computed
        over events whose ``event_type`` is ``'lm_cais'``.
    """
    sql = """
    SELECT
        e.model_name,
        e.provider,
        COUNT(*) as call_count,
        SUM(e.prompt_tokens) as total_prompt_tokens,
        SUM(e.completion_tokens) as total_completion_tokens,
        SUM(e.total_tokens) as total_tokens,
        SUM(e.cost) as total_cost,
        AVG(e.latency_ms) as avg_latency_ms,
        AVG(e.prompt_tokens) as avg_prompt_tokens,
        AVG(e.completion_tokens) as avg_completion_tokens
    FROM session_traces st
    JOIN events e ON st.session_id = e.session_id
    WHERE st.experiment_id = ?
    AND e.event_type = 'lm_cais'
    GROUP BY e.model_name, e.provider
    """

    cursor = conn.execute(sql, [experiment_id])
    return cursor.df()
147
+
148
def get_session_performance_comparison(conn) -> pd.DataFrame:
    """Summarize session volume for the two experiments in ``EXPERIMENTS``.

    Args:
        conn: Open database connection exposing ``execute(sql, params)``.

    Returns:
        One row per experiment (ordered by experiment name) with the session
        count plus the average and total number of timesteps, events and
        messages.
    """
    sql = """
    SELECT
        st.experiment_id,
        e.name as experiment_name,
        COUNT(st.session_id) as total_sessions,
        AVG(st.num_timesteps) as avg_timesteps,
        AVG(st.num_events) as avg_events,
        AVG(st.num_messages) as avg_messages,
        SUM(st.num_timesteps) as total_timesteps,
        SUM(st.num_events) as total_events,
        SUM(st.num_messages) as total_messages
    FROM session_traces st
    JOIN experiments e ON st.experiment_id = e.id
    WHERE st.experiment_id IN (?, ?)
    GROUP BY st.experiment_id, e.name
    ORDER BY e.name
    """

    # Parameter order matches the two placeholders of the IN (...) clause.
    experiment_ids = [EXPERIMENTS["gpt-4o-mini"], EXPERIMENTS["gpt-4o"]]
    cursor = conn.execute(sql, experiment_ids)
    return cursor.df()
169
+
170
def get_achievement_comparison(conn) -> pd.DataFrame:
    """Count environment events carrying metadata for both experiments.

    The query does not decode the metadata itself; it counts, per experiment,
    the ``'environment'`` events whose metadata is non-NULL and the number of
    distinct sessions those events belong to.

    Args:
        conn: Open database connection exposing ``execute(sql, params)``.

    Returns:
        One row per experiment with ``experiment_id``, ``experiment_name``,
        ``sessions_with_achievements`` and ``total_achievement_events``.
    """
    # Fixes relative to the previous query text:
    #   * experiments is joined on e.id, the key every other query in this
    #     file uses for that table (it was e.experiment_id).
    #   * the CTE no longer selects e.metadata / e.event_metadata: they were
    #     qualified with the experiments alias although the metadata filter
    #     is on the events alias, and the outer SELECT never read them.
    query = """
    WITH achievement_data AS (
        SELECT
            st.experiment_id,
            e.name as experiment_name,
            st.session_id
        FROM session_traces st
        JOIN experiments e ON st.experiment_id = e.id
        JOIN events ev ON st.session_id = ev.session_id
        WHERE st.experiment_id IN (?, ?)
        AND ev.event_type = 'environment'
        AND ev.metadata IS NOT NULL
    )
    SELECT
        experiment_id,
        experiment_name,
        COUNT(DISTINCT session_id) as sessions_with_achievements,
        COUNT(*) as total_achievement_events
    FROM achievement_data
    GROUP BY experiment_id, experiment_name
    """

    return conn.execute(query, [EXPERIMENTS["gpt-4o-mini"], EXPERIMENTS["gpt-4o"]]).df()
198
+
199
def analyze_instance_difficulty(conn) -> Dict[str, Any]:
    """Summarize session lengths for each experiment in ``EXPERIMENTS``.

    Args:
        conn: Open database connection exposing ``execute(sql, params)``.

    Returns:
        A dict keyed by experiment id. Each value holds ``total_sessions``,
        ``avg_timesteps``, ``avg_events``, ``max_timesteps``,
        ``min_timesteps`` and ``session_lengths`` (the per-session timestep
        counts as a list).
    """
    sql = """
    SELECT
        st.experiment_id,
        e.name as experiment_name,
        st.session_id,
        st.num_timesteps,
        st.num_events,
        st.metadata
    FROM session_traces st
    JOIN experiments e ON st.experiment_id = e.id
    WHERE st.experiment_id IN (?, ?)
    ORDER BY st.experiment_id, st.session_id
    """

    experiment_ids = [EXPERIMENTS["gpt-4o-mini"], EXPERIMENTS["gpt-4o"]]
    sessions = conn.execute(sql, experiment_ids).df()

    def _summarize(rows):
        # Per-experiment statistics over the rows of a single experiment.
        timesteps = rows['num_timesteps']
        return {
            "total_sessions": len(rows),
            "avg_timesteps": timesteps.mean(),
            "avg_events": rows['num_events'].mean(),
            "max_timesteps": timesteps.max(),
            "min_timesteps": timesteps.min(),
            "session_lengths": timesteps.tolist()
        }

    # Group by experiment and analyze session patterns
    return {
        exp_id: _summarize(sessions[sessions['experiment_id'] == exp_id])
        for exp_id in experiment_ids
    }
231
+
232
def _percent_change(new_value, baseline):
    """Return the change from ``baseline`` to ``new_value`` in percent.

    Returns ``None`` when the change is undefined: the baseline is zero, or
    either value is NaN (the mean of an experiment with no sessions).
    """
    # NaN is the only value that compares unequal to itself.
    if baseline != baseline or new_value != new_value or baseline == 0:
        return None
    return ((new_value - baseline) / baseline) * 100


def main():
    """Print a side-by-side report for the experiments in ``EXPERIMENTS``.

    The report covers experiment summaries, session performance,
    achievements, model usage, session-length ("instance difficulty")
    statistics and the relative change of gpt-4o over gpt-4o-mini. The
    database connection is closed even if one of the steps raises.
    """
    print("🔍 COMPARING GPT-4O-MINI vs GPT-4O EXPERIMENTS")
    print("=" * 80)

    conn = connect_to_db()
    try:
        # Get experiment summaries
        print("\n📋 EXPERIMENT SUMMARIES")
        print("-" * 40)

        for model_name, exp_id in EXPERIMENTS.items():
            summary = get_experiment_summary(conn, exp_id)
            if summary:
                # branch/commit come from LEFT JOINs and may be NULL, so the
                # commit must not be sliced unconditionally.
                commit = summary['commit']
                short_commit = commit[:8] if commit else "unknown"
                print(f"\n{model_name.upper()}:")
                print(f" Name: {summary['name']}")
                print(f" ID: {summary['experiment_id']}")
                print(f" Created: {summary['created_at']}")
                print(f" Git: {summary['branch']} @ {short_commit}")

        # Session performance comparison
        print("\n📊 SESSION PERFORMANCE COMPARISON")
        print("-" * 40)

        perf_df = get_session_performance_comparison(conn)
        print(perf_df.to_string(index=False))

        # Achievement analysis
        print("\n🏆 ACHIEVEMENT ANALYSIS")
        print("-" * 40)

        for model_name, exp_id in EXPERIMENTS.items():
            print(f"\n{model_name.upper()}:")
            achievement_data = get_achievement_analysis(conn, exp_id)
            print(f" Total Achievements: {achievement_data['total_achievements']}")
            print(f" Unique Achievements: {achievement_data['unique_achievements']}")
            print(f" Achievement Counts: {achievement_data['achievement_counts']}")

        # Model usage analysis
        print("\n💰 MODEL USAGE ANALYSIS")
        print("-" * 40)

        for model_name, exp_id in EXPERIMENTS.items():
            print(f"\n{model_name.upper()}:")
            usage_df = get_model_usage_analysis(conn, exp_id)
            if not usage_df.empty:
                print(usage_df.to_string(index=False))
            else:
                print(" No model usage data found")

        # Instance difficulty analysis
        print("\n🎯 INSTANCE DIFFICULTY ANALYSIS")
        print("-" * 40)

        difficulty_analysis = analyze_instance_difficulty(conn)

        for model_name, exp_id in EXPERIMENTS.items():
            data = difficulty_analysis[exp_id]
            print(f"\n{model_name.upper()}:")
            print(f" Total Sessions: {data['total_sessions']}")
            print(f" Avg Timesteps: {data['avg_timesteps']:.1f}")
            print(f" Avg Events: {data['avg_events']:.1f}")
            print(f" Timestep Range: {data['min_timesteps']} - {data['max_timesteps']}")

        # Performance comparison summary
        print("\n📈 PERFORMANCE COMPARISON SUMMARY")
        print("-" * 40)

        mini_data = difficulty_analysis[EXPERIMENTS["gpt-4o-mini"]]
        full_data = difficulty_analysis[EXPERIMENTS["gpt-4o"]]

        print("GPT-4O-MINI:")
        print(f" Sessions: {mini_data['total_sessions']}")
        print(f" Avg Timesteps: {mini_data['avg_timesteps']:.1f}")
        print(f" Avg Events: {mini_data['avg_events']:.1f}")

        print("\nGPT-4O:")
        print(f" Sessions: {full_data['total_sessions']}")
        print(f" Avg Timesteps: {full_data['avg_timesteps']:.1f}")
        print(f" Avg Events: {full_data['avg_events']:.1f}")

        # Calculate improvements of gpt-4o relative to the gpt-4o-mini
        # baseline; an undefined ratio is reported as "n/a".
        improvements = (
            ("Timesteps", _percent_change(full_data['avg_timesteps'], mini_data['avg_timesteps'])),
            ("Events", _percent_change(full_data['avg_events'], mini_data['avg_events'])),
        )

        print("\n📊 IMPROVEMENTS:")
        for label, change in improvements:
            rendered = f"{change:+.1f}%" if change is not None else "n/a"
            print(f" {label}: {rendered}")
    finally:
        conn.close()
322
+
323
# Run the comparison report only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()