synth-ai 0.2.4.dev8__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- synth_ai/cli/__init__.py +6 -0
- synth_ai/cli/demo.py +68 -9
- synth_ai/cli/rl_demo.py +137 -0
- synth_ai/cli/root.py +65 -0
- synth_ai/demos/core/__init__.py +1 -0
- synth_ai/demos/core/cli.py +621 -0
- synth_ai/demos/demo_task_apps/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/core.py +374 -0
- synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/math/app.py +37 -0
- synth_ai/demos/demo_task_apps/math/config.toml +44 -0
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
- synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
- synth_ai/environments/examples/bandit/__init__.py +33 -0
- synth_ai/environments/examples/bandit/engine.py +294 -0
- synth_ai/environments/examples/bandit/environment.py +194 -0
- synth_ai/environments/examples/bandit/taskset.py +200 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
- synth_ai/environments/examples/crafter_classic/environment.py +41 -2
- synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
- synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
- synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
- synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
- synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
- synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
- synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/red/units/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
- synth_ai/environments/service/app.py +8 -0
- synth_ai/install_sqld.sh +40 -0
- synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +110 -11
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
- synth_ai-0.2.4.dev8.dist-info/METADATA +0 -635
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.4.dev8.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
Compare experiments between gpt-4o-mini and gpt-4o.

Analyzes performance differences, achievement patterns, and instance difficulty.
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
from typing import Any, Dict, List
|
|
9
|
+
|
|
10
|
+
import duckdb
|
|
11
|
+
import pandas as pd
|
|
12
|
+
|
|
13
|
+
# Experiment IDs from the runs.
# Keys are the human-readable model labels used in the report output; values
# are the experiment UUIDs looked up in the `experiments` table of the trace DB.
EXPERIMENTS = {
    "gpt-4o-mini": "137683ed-3bd5-4bd3-9162-dae0371ddd3d",
    "gpt-4o": "207307d5-4105-4a18-bb93-89936047fa18"
}
|
|
18
|
+
|
|
19
|
+
def connect_to_db():
    """Open a connection to the local DuckDB trace database."""
    db_path = "synth_ai/traces/crafter_traces.duckdb"
    return duckdb.connect(db_path)
|
|
22
|
+
|
|
23
|
+
def get_experiment_summary(conn, experiment_id: str) -> Dict[str, Any]:
    """Fetch basic metadata (name, description, creation time, git version) for one experiment.

    Returns a dict of named fields, or None when no row matches the id.
    """
    query = """
    SELECT 
        e.id,
        e.name,
        e.description,
        e.created_at,
        sv.branch,
        sv.commit
    FROM experiments e
    LEFT JOIN experimental_systems es ON e.id = es.experiment_id
    LEFT JOIN system_versions sv ON es.system_version_id = sv.id
    WHERE e.id = ?
    """

    row = conn.execute(query, [experiment_id]).fetchone()
    if not row:
        return None
    # Map the positional result columns to named fields for readability downstream.
    keys = ("experiment_id", "name", "description", "created_at", "branch", "commit")
    return dict(zip(keys, row))
|
|
50
|
+
|
|
51
|
+
def get_session_stats(conn, experiment_id: str) -> pd.DataFrame:
    """Load per-session counters (timesteps, events, messages) for one experiment,
    ordered by creation time."""
    query = """
    SELECT 
        st.session_id,
        st.created_at,
        st.num_timesteps,
        st.num_events,
        st.num_messages,
        st.metadata
    FROM session_traces st
    WHERE st.experiment_id = ?
    ORDER BY st.created_at
    """

    cursor = conn.execute(query, [experiment_id])
    return cursor.df()
|
|
67
|
+
|
|
68
|
+
def get_achievement_analysis(conn, experiment_id: str) -> Dict[str, Any]:
    """Analyze achievements unlocked during an experiment's sessions.

    Reads the JSON `metadata` column of `session_traces`, extracts the
    'SessionMetadum' entries that carry an `achievements` dict, and aggregates
    per-session and overall unlock counts.

    Args:
        conn: DuckDB connection.
        experiment_id: Experiment UUID to analyze.

    Returns:
        Dict with total/unique achievement counts, a per-achievement frequency
        map, the flat list of unlocks, and per-session detail records.
    """
    query = """
    SELECT 
        st.session_id,
        st.metadata
    FROM session_traces st
    WHERE st.experiment_id = ?
    AND st.metadata IS NOT NULL
    """

    results = conn.execute(query, [experiment_id]).fetchall()

    all_achievements = []
    session_achievements = []

    for session_id, metadata in results:
        if metadata:
            try:
                # Metadata may arrive as a JSON string or an already-parsed list.
                # (Uses the module-level `json` import; the original re-imported
                # json locally, which was redundant.)
                metadata_list = json.loads(metadata) if isinstance(metadata, str) else metadata

                for meta_item in metadata_list:
                    if isinstance(meta_item, dict) and meta_item.get('metadata_type') == 'SessionMetadum':
                        data = meta_item.get('data', {})
                        if 'achievements' in data:
                            achievements_dict = data['achievements']
                            num_achievements = data.get('num_achievements', 0)

                            # Keep only achievements whose flag is truthy (unlocked).
                            unlocked = [ach for ach, flag in achievements_dict.items() if flag]
                            all_achievements.extend(unlocked)

                            session_achievements.append({
                                'session_id': session_id,
                                'num_achievements': num_achievements,
                                'unlocked_achievements': unlocked,
                                'total_achievements': len(achievements_dict)
                            })
            except Exception as e:
                # Best-effort: a malformed metadata blob should not abort the
                # whole analysis, so report it and continue with the next session.
                print(f"Error parsing metadata for session {session_id}: {e}")

    # Frequency of each achievement across all sessions.
    achievement_counts = {}
    for ach in all_achievements:
        achievement_counts[ach] = achievement_counts.get(ach, 0) + 1

    return {
        "total_achievements": len(all_achievements),
        "unique_achievements": len(set(all_achievements)),
        "achievement_counts": achievement_counts,
        "achievement_list": all_achievements,
        "session_achievements": session_achievements
    }
|
|
124
|
+
|
|
125
|
+
def get_model_usage_analysis(conn, experiment_id: str) -> pd.DataFrame:
    """Aggregate LM call counts, token totals, cost, and latency per model/provider
    for one experiment's 'lm_cais' events."""
    query = """
    SELECT 
        e.model_name,
        e.provider,
        COUNT(*) as call_count,
        SUM(e.prompt_tokens) as total_prompt_tokens,
        SUM(e.completion_tokens) as total_completion_tokens,
        SUM(e.total_tokens) as total_tokens,
        SUM(e.cost) as total_cost,
        AVG(e.latency_ms) as avg_latency_ms,
        AVG(e.prompt_tokens) as avg_prompt_tokens,
        AVG(e.completion_tokens) as avg_completion_tokens
    FROM session_traces st
    JOIN events e ON st.session_id = e.session_id
    WHERE st.experiment_id = ?
    AND e.event_type = 'lm_cais'
    GROUP BY e.model_name, e.provider
    """

    cursor = conn.execute(query, [experiment_id])
    return cursor.df()
|
|
147
|
+
|
|
148
|
+
def get_session_performance_comparison(conn) -> pd.DataFrame:
    """Aggregate per-experiment session counts and averages for both tracked models."""
    query = """
    SELECT 
        st.experiment_id,
        e.name as experiment_name,
        COUNT(st.session_id) as total_sessions,
        AVG(st.num_timesteps) as avg_timesteps,
        AVG(st.num_events) as avg_events,
        AVG(st.num_messages) as avg_messages,
        SUM(st.num_timesteps) as total_timesteps,
        SUM(st.num_events) as total_events,
        SUM(st.num_messages) as total_messages
    FROM session_traces st
    JOIN experiments e ON st.experiment_id = e.id
    WHERE st.experiment_id IN (?, ?)
    GROUP BY st.experiment_id, e.name
    ORDER BY e.name
    """

    experiment_ids = [EXPERIMENTS["gpt-4o-mini"], EXPERIMENTS["gpt-4o"]]
    return conn.execute(query, experiment_ids).df()
|
|
169
|
+
|
|
170
|
+
def get_achievement_comparison(conn) -> pd.DataFrame:
    """Compare achievement-bearing environment events between the two experiments.

    Counts, per experiment, how many distinct sessions produced 'environment'
    events carrying metadata, and how many such events there were in total.
    """
    # NOTE: the experiments join must use `e.id` (the primary key every other
    # query in this file joins on) — the original `e.experiment_id` column does
    # not exist on the experiments table. Likewise the metadata columns come
    # from the events row (alias `ev`), not from experiments, which is what the
    # `ev.metadata IS NOT NULL` filter already assumed.
    query = """
    WITH achievement_data AS (
        SELECT 
            st.experiment_id,
            e.name as experiment_name,
            st.session_id,
            ev.metadata,
            ev.event_metadata
        FROM session_traces st
        JOIN experiments e ON st.experiment_id = e.id
        JOIN events ev ON st.session_id = ev.session_id
        WHERE st.experiment_id IN (?, ?)
        AND ev.event_type = 'environment'
        AND ev.metadata IS NOT NULL
    )
    SELECT 
        experiment_id,
        experiment_name,
        COUNT(DISTINCT session_id) as sessions_with_achievements,
        COUNT(*) as total_achievement_events
    FROM achievement_data
    GROUP BY experiment_id, experiment_name
    """

    return conn.execute(query, [EXPERIMENTS["gpt-4o-mini"], EXPERIMENTS["gpt-4o"]]).df()
|
|
198
|
+
|
|
199
|
+
def analyze_instance_difficulty(conn) -> Dict[str, Any]:
    """Summarize session-length statistics per experiment as a difficulty proxy.

    Returns a dict keyed by experiment UUID with session counts, mean/min/max
    timesteps, mean event counts, and the raw list of session lengths.
    """
    query = """
    SELECT 
        st.experiment_id,
        e.name as experiment_name,
        st.session_id,
        st.num_timesteps,
        st.num_events,
        st.metadata
    FROM session_traces st
    JOIN experiments e ON st.experiment_id = e.id
    WHERE st.experiment_id IN (?, ?)
    ORDER BY st.experiment_id, st.session_id
    """

    experiment_ids = [EXPERIMENTS["gpt-4o-mini"], EXPERIMENTS["gpt-4o"]]
    df = conn.execute(query, experiment_ids).df()

    # Slice the combined result per experiment and summarize session lengths.
    analysis = {}
    for exp_id in experiment_ids:
        subset = df[df['experiment_id'] == exp_id]
        analysis[exp_id] = {
            "total_sessions": len(subset),
            "avg_timesteps": subset['num_timesteps'].mean(),
            "avg_events": subset['num_events'].mean(),
            "max_timesteps": subset['num_timesteps'].max(),
            "min_timesteps": subset['num_timesteps'].min(),
            "session_lengths": subset['num_timesteps'].tolist()
        }

    return analysis
|
|
231
|
+
|
|
232
|
+
def main():
    """Main analysis function.

    Orchestrates the full comparison report: experiment summaries, session
    performance, achievement analysis, model usage, instance difficulty, and a
    final head-to-head improvement summary. All output goes to stdout.
    """
    print("š COMPARING GPT-4O-MINI vs GPT-4O EXPERIMENTS")
    print("=" * 80)

    conn = connect_to_db()

    # Get experiment summaries
    print("\nš EXPERIMENT SUMMARIES")
    print("-" * 40)

    for model_name, exp_id in EXPERIMENTS.items():
        summary = get_experiment_summary(conn, exp_id)
        if summary:
            print(f"\n{model_name.upper()}:")
            print(f" Name: {summary['name']}")
            print(f" ID: {summary['experiment_id']}")
            print(f" Created: {summary['created_at']}")
            # NOTE(review): assumes `commit` is non-None; the LEFT JOIN in
            # get_experiment_summary can yield None here — confirm.
            print(f" Git: {summary['branch']} @ {summary['commit'][:8]}")

    # Session performance comparison
    print("\nš SESSION PERFORMANCE COMPARISON")
    print("-" * 40)

    perf_df = get_session_performance_comparison(conn)
    print(perf_df.to_string(index=False))

    # Achievement analysis
    print("\nš ACHIEVEMENT ANALYSIS")
    print("-" * 40)

    for model_name, exp_id in EXPERIMENTS.items():
        print(f"\n{model_name.upper()}:")
        achievement_data = get_achievement_analysis(conn, exp_id)
        print(f" Total Achievements: {achievement_data['total_achievements']}")
        print(f" Unique Achievements: {achievement_data['unique_achievements']}")
        print(f" Achievement Counts: {achievement_data['achievement_counts']}")

    # Model usage analysis
    print("\nš° MODEL USAGE ANALYSIS")
    print("-" * 40)

    for model_name, exp_id in EXPERIMENTS.items():
        print(f"\n{model_name.upper()}:")
        usage_df = get_model_usage_analysis(conn, exp_id)
        if not usage_df.empty:
            print(usage_df.to_string(index=False))
        else:
            print(" No model usage data found")

    # Instance difficulty analysis
    print("\nšÆ INSTANCE DIFFICULTY ANALYSIS")
    print("-" * 40)

    difficulty_analysis = analyze_instance_difficulty(conn)

    for model_name, exp_id in EXPERIMENTS.items():
        data = difficulty_analysis[exp_id]
        print(f"\n{model_name.upper()}:")
        print(f" Total Sessions: {data['total_sessions']}")
        print(f" Avg Timesteps: {data['avg_timesteps']:.1f}")
        print(f" Avg Events: {data['avg_events']:.1f}")
        print(f" Timestep Range: {data['min_timesteps']} - {data['max_timesteps']}")

    # Performance comparison summary
    print("\nš PERFORMANCE COMPARISON SUMMARY")
    print("-" * 40)

    mini_data = difficulty_analysis[EXPERIMENTS["gpt-4o-mini"]]
    full_data = difficulty_analysis[EXPERIMENTS["gpt-4o"]]

    print(f"GPT-4O-MINI:")
    print(f" Sessions: {mini_data['total_sessions']}")
    print(f" Avg Timesteps: {mini_data['avg_timesteps']:.1f}")
    print(f" Avg Events: {mini_data['avg_events']:.1f}")

    print(f"\nGPT-4O:")
    print(f" Sessions: {full_data['total_sessions']}")
    print(f" Avg Timesteps: {full_data['avg_timesteps']:.1f}")
    print(f" Avg Events: {full_data['avg_events']:.1f}")

    # Calculate improvements
    # Percentage change of gpt-4o relative to gpt-4o-mini (positive = longer runs).
    timestep_improvement = ((full_data['avg_timesteps'] - mini_data['avg_timesteps']) / mini_data['avg_timesteps']) * 100
    event_improvement = ((full_data['avg_events'] - mini_data['avg_events']) / mini_data['avg_events']) * 100

    print(f"\nš IMPROVEMENTS:")
    print(f" Timesteps: {timestep_improvement:+.1f}%")
    print(f" Events: {event_improvement:+.1f}%")

    conn.close()

if __name__ == "__main__":
    main()
|