synth-ai 0.2.9.dev3__py3-none-any.whl → 0.2.9.dev5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic.
- examples/analyze_semantic_words.sh +17 -0
- examples/common_old/backend.py +21 -0
- examples/crafter_debug_render.py +180 -0
- examples/evals_old/README.md +98 -0
- examples/evals_old/__init__.py +6 -0
- examples/evals_old/compare_models.py +1037 -0
- examples/evals_old/example_log.md +145 -0
- examples/evals_old/run_demo.sh +126 -0
- examples/evals_old/trace_analysis.py +270 -0
- examples/finetuning_old/_backup_synth_qwen/config.toml +29 -0
- examples/finetuning_old/_backup_synth_qwen/example_log.md +324 -0
- examples/finetuning_old/_backup_synth_qwen/filter_traces.py +60 -0
- examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +239 -0
- examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +109 -0
- examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +1924 -0
- examples/finetuning_old/_backup_synth_qwen/readme.md +49 -0
- examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +114 -0
- examples/finetuning_old/_backup_synth_qwen/run_demo.sh +195 -0
- examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +118 -0
- examples/finetuning_old/synth_qwen_v1/README.md +68 -0
- examples/finetuning_old/synth_qwen_v1/filter_traces.py +60 -0
- examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +239 -0
- examples/finetuning_old/synth_qwen_v1/finetune.py +46 -0
- examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +71 -0
- examples/finetuning_old/synth_qwen_v1/infer.py +37 -0
- examples/finetuning_old/synth_qwen_v1/poll.py +44 -0
- examples/finetuning_old/synth_qwen_v1/prepare_data.py +35 -0
- examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +109 -0
- examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +1932 -0
- examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +207 -0
- examples/finetuning_old/synth_qwen_v1/run_ft_job.py +232 -0
- examples/finetuning_old/synth_qwen_v1/upload_data.py +34 -0
- examples/finetuning_old/synth_qwen_v1/util.py +147 -0
- examples/rl/README.md +169 -0
- examples/rl/configs/eval_base_qwen.toml +15 -0
- examples/rl/configs/eval_rl_qwen.toml +11 -0
- examples/rl/configs/rl_from_base_qwen.toml +35 -0
- examples/rl/configs/rl_from_base_qwen17.toml +74 -0
- examples/rl/configs/rl_from_ft_qwen.toml +35 -0
- examples/rl/download_dataset.py +64 -0
- examples/rl/run_eval.py +435 -0
- examples/rl/run_rl_and_save.py +94 -0
- examples/rl/task_app/README.md +22 -0
- {synth_ai/task/apps → examples/rl/task_app}/math_single_step.py +8 -8
- examples/rl/task_app/math_task_app.py +107 -0
- examples/rl_old/task_app.py +962 -0
- examples/run_crafter_demo.sh +10 -0
- examples/warming_up_to_rl/analyze_trace_db.py +420 -0
- examples/warming_up_to_rl/configs/crafter_fft.toml +48 -0
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +54 -0
- examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +20 -0
- examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +13 -0
- examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +23 -0
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +73 -0
- examples/warming_up_to_rl/configs/rl_from_ft.toml +56 -0
- examples/warming_up_to_rl/export_trace_sft.py +541 -0
- examples/warming_up_to_rl/groq_test.py +88 -0
- examples/warming_up_to_rl/manage_secrets.py +127 -0
- examples/warming_up_to_rl/old/event_rewards.md +234 -0
- examples/warming_up_to_rl/old/notes.md +73 -0
- examples/warming_up_to_rl/readme.md +172 -0
- examples/warming_up_to_rl/run_eval.py +434 -0
- examples/warming_up_to_rl/run_fft_and_save.py +309 -0
- examples/warming_up_to_rl/run_local_rollout.py +188 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +160 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +342 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +372 -0
- examples/warming_up_to_rl/run_rl_and_save.py +101 -0
- examples/warming_up_to_rl/run_rollout_remote.py +129 -0
- examples/warming_up_to_rl/task_app/README.md +38 -0
- {synth_ai/task/apps → examples/warming_up_to_rl/task_app}/grpo_crafter.py +7 -7
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +165 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +145 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1271 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +429 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +442 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +96 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +302 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +202 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +512 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +102 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +985 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +197 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1749 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +217 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +160 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +146 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +61 -0
- synth_ai/api/train/config_finder.py +18 -18
- synth_ai/api/train/env_resolver.py +28 -1
- synth_ai/cli/task_apps.py +291 -56
- synth_ai/task/apps/__init__.py +54 -13
- {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/METADATA +1 -1
- {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/RECORD +106 -13
- {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/top_level.txt +1 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
- {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.9.dev3.dist-info → synth_ai-0.2.9.dev5.dist-info}/licenses/LICENSE +0 -0
examples/run_crafter_demo.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Run a Crafter agent demo with Gemini
+# This script demonstrates a reactive agent in the Crafter environment
+
+echo "🚀 Starting Crafter agent demo with Gemini 1.5 Flash..."
+echo "Make sure the synth-ai service is running: uvx synth-ai serve"
+echo ""
+
+uv run python -m synth_ai.environments.examples.crafter_classic.agent_demos.test_crafter_react_agent --model gemini-1.5-flash
examples/warming_up_to_rl/analyze_trace_db.py
@@ -0,0 +1,420 @@
+#!/usr/bin/env python3
+"""Summarise tracing_v3 SQLite data (models, rewards, achievements)."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sqlite3
+import sys
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Set, Tuple
+
+Row = sqlite3.Row
+
+
+def connect(db_path: Path) -> sqlite3.Connection:
+    conn = sqlite3.connect(str(db_path))
+    conn.row_factory = sqlite3.Row
+    return conn
+
+
+def fetch_model_usage(conn: sqlite3.Connection) -> list[dict[str, Any]]:
+    rows = conn.execute(
+        """
+        SELECT
+            model_name,
+            provider,
+            COUNT(*) AS calls,
+            COALESCE(SUM(total_tokens), 0) AS total_tokens,
+            COALESCE(SUM(input_tokens), 0) AS input_tokens,
+            COALESCE(SUM(output_tokens), 0) AS output_tokens,
+            COALESCE(AVG(latency_ms), 0) AS avg_latency_ms
+        FROM events
+        WHERE event_type = 'cais' AND model_name IS NOT NULL
+        GROUP BY model_name, provider
+        ORDER BY calls DESC
+        """
+    ).fetchall()
+    stats: list[dict[str, Any]] = []
+    for row in rows:
+        stats.append(
+            {
+                "model_name": row["model_name"],
+                "provider": row["provider"],
+                "calls": int(row["calls"] or 0),
+                "total_tokens": int(row["total_tokens"] or 0),
+                "input_tokens": int(row["input_tokens"] or 0),
+                "output_tokens": int(row["output_tokens"] or 0),
+                "avg_latency_ms": float(row["avg_latency_ms"] or 0.0),
+            }
+        )
+    return stats
+
+
+def _parse_json(value: Any) -> Any:
+    if value is None:
+        return None
+    if isinstance(value, (dict, list)):
+        return value
+    try:
+        return json.loads(value)
+    except Exception:
+        return None
+
+
+AchievementMap = dict[Tuple[str, int], dict[str, list[str]]]
+
+
+def fetch_achievement_data(
+    conn: sqlite3.Connection,
+) -> tuple[
+    AchievementMap,
+    Counter,
+    Counter,
+    Counter,
+    dict[str, set[str]],
+    dict[str, set[str]],
+]:
+    """Return per-turn achievement map and summary counters.
+
+    Returns:
+        achievements_map: {(session_id, turn) -> {"unique": [...], "all": [...]}}
+        unique_counts_per_session: Counter mapping session -> total unique achievements
+        achievement_name_counts: Counter mapping achievement name -> occurrences (unique)
+        achievement_size_counts: Counter mapping number of unique achievements per session -> frequency
+        session_unique_sets: {session_id -> set of unique achievement names}
+        session_final_achievements: {session_id -> set of achievements from outcome metadata}
+    """
+
+    achievements_map: AchievementMap = defaultdict(lambda: {"unique": [], "all": []})
+    session_unique_sets: dict[str, set[str]] = defaultdict(set)
+    session_final_achievements: dict[str, set[str]] = defaultdict(set)
+    achievement_name_counts: Counter = Counter()
+
+    # Unique achievements (reward_type = unique_achievement_delta)
+    rows = conn.execute(
+        """
+        SELECT er.session_id, er.reward_value, er.annotation, ev.metadata
+        FROM event_rewards er
+        JOIN events ev ON er.event_id = ev.id
+        WHERE er.reward_type = 'unique_achievement_delta' AND er.reward_value > 0
+        """
+    ).fetchall()
+    for row in rows:
+        session_id = row["session_id"]
+        annotation = _parse_json(row["annotation"]) or {}
+        metadata = _parse_json(row["metadata"]) or {}
+        turn = metadata.get("turn")
+        if turn is None:
+            continue
+        new_unique = annotation.get("new_unique") or []
+        if not isinstance(new_unique, list):
+            continue
+        if new_unique:
+            achievements_map[(session_id, int(turn))]["unique"].extend(new_unique)
+            session_unique_sets[session_id].update(new_unique)
+
+    # All achievements (reward_type = achievement_delta)
+    rows = conn.execute(
+        """
+        SELECT er.session_id, er.reward_value, er.annotation, ev.metadata
+        FROM event_rewards er
+        JOIN events ev ON er.event_id = ev.id
+        WHERE er.reward_type = 'achievement_delta' AND er.reward_value > 0
+        """
+    ).fetchall()
+    for row in rows:
+        session_id = row["session_id"]
+        annotation = _parse_json(row["annotation"]) or {}
+        metadata = _parse_json(row["metadata"]) or {}
+        turn = metadata.get("turn")
+        if turn is None:
+            continue
+        turned_true = annotation.get("turned_true") or []
+        if not isinstance(turned_true, list):
+            continue
+        if turned_true:
+            achievements_map[(session_id, int(turn))]["all"].extend(turned_true)
+
+    # Fallback to outcome rewards metadata to capture final achievements
+    rows = conn.execute(
+        """
+        SELECT session_id, reward_metadata
+        FROM outcome_rewards
+        WHERE reward_metadata IS NOT NULL
+        """
+    ).fetchall()
+    for row in rows:
+        session_id = row["session_id"]
+        metadata = _parse_json(row["reward_metadata"])
+        if not isinstance(metadata, dict):
+            continue
+        final_achievements = metadata.get("achievements") or []
+        if isinstance(final_achievements, list):
+            cleaned = [a for a in final_achievements if isinstance(a, str)]
+            session_unique_sets[session_id].update(cleaned)
+            session_final_achievements[session_id].update(cleaned)
+
+    # Build counters from the unique sets
+    unique_counts_per_session: Counter = Counter()
+    for session_id, achievement_set in session_unique_sets.items():
+        unique_counts_per_session[session_id] = len(achievement_set)
+        achievement_name_counts.update(achievement_set)
+
+    achievement_size_counts: Counter = Counter()
+    for session_id, count in unique_counts_per_session.items():
+        achievement_size_counts[count] += 1
+
+    return (
+        achievements_map,
+        unique_counts_per_session,
+        achievement_name_counts,
+        achievement_size_counts,
+        session_unique_sets,
+        session_final_achievements,
+    )
+
+
+def fetch_reward_summary(conn: sqlite3.Connection) -> tuple[dict[str, Any], list[dict[str, Any]]]:
+    """Aggregate reward information from outcome_rewards and event_rewards."""
+
+    outcome_row = conn.execute(
+        """
+        SELECT
+            COUNT(*) AS episodes,
+            COALESCE(SUM(total_reward), 0) AS total_reward,
+            COALESCE(AVG(total_reward), 0) AS avg_reward,
+            COALESCE(MIN(total_reward), 0) AS min_reward,
+            COALESCE(MAX(total_reward), 0) AS max_reward
+        FROM outcome_rewards
+        """
+    ).fetchone()
+
+    reward_breakdown_rows = conn.execute(
+        """
+        SELECT
+            reward_type,
+            COUNT(*) AS events,
+            COALESCE(SUM(reward_value), 0) AS total_value,
+            COALESCE(AVG(reward_value), 0) AS avg_value
+        FROM event_rewards
+        GROUP BY reward_type
+        ORDER BY events DESC
+        """
+    ).fetchall()
+
+    breakdown: list[dict[str, Any]] = []
+    for row in reward_breakdown_rows:
+        breakdown.append(
+            {
+                "reward_type": row["reward_type"],
+                "events": int(row["events"] or 0),
+                "total_value": float(row["total_value"] or 0.0),
+                "avg_value": float(row["avg_value"] or 0.0),
+            }
+        )
+
+    outcome = {
+        "episodes": int(outcome_row["episodes"] or 0),
+        "total_reward": float(outcome_row["total_reward"] or 0.0),
+        "avg_reward": float(outcome_row["avg_reward"] or 0.0),
+        "min_reward": float(outcome_row["min_reward"] or 0.0),
+        "max_reward": float(outcome_row["max_reward"] or 0.0),
+    }
+
+    return outcome, breakdown
+
+
+def format_model_stats(stats: list[dict[str, Any]]) -> str:
+    if not stats:
+        return "No model usage recorded."
+    lines = ["Model usage (by LLM calls):"]
+    header = f"{'Model':30} {'Provider':10} {'Calls':>7} {'Tokens (total/out)':>20} {'Avg latency ms':>15}"
+    lines.append(header)
+    lines.append("-" * len(header))
+    for item in stats:
+        lines.append(
+            f"{item['model_name'][:30]:30} "
+            f"{(item['provider'] or '')[:10]:10} "
+            f"{item['calls']:7d} "
+            f"{item['total_tokens']:10d}/{item['output_tokens']:>8d} "
+            f"{item['avg_latency_ms']:15.1f}"
+        )
+    return "\n".join(lines)
+
+
+def format_achievement_summary(
+    name_counts: Counter, size_counts: Counter
+) -> str:
+    lines = ["Unique achievements unlocked:"]
+    if name_counts:
+        top = name_counts.most_common()
+        for name, count in top:
+            lines.append(f"  {name:25} -> {count}")
+    else:
+        lines.append("  (none recorded)")
+
+    lines.append("")
+    lines.append("Sessions bucketed by unique achievement count:")
+    if size_counts:
+        for size in sorted(size_counts):
+            lines.append(f"  {size:2d} unique -> {size_counts[size]} session(s)")
+    else:
+        lines.append("  (no sessions with achievements)")
+    return "\n".join(lines)
+
+
+def format_reward_summary(outcome: dict[str, Any], breakdown: list[dict[str, Any]]) -> str:
+    lines = ["Episode outcome rewards:"]
+    episodes = outcome.get("episodes", 0)
+    if episodes:
+        lines.append(
+            f"  Episodes: {episodes} | total={outcome['total_reward']:.2f} | "
+            f"avg={outcome['avg_reward']:.2f} | min/max={outcome['min_reward']:.2f}/{outcome['max_reward']:.2f}"
+        )
+    else:
+        lines.append("  (no outcome rewards recorded)")
+
+    lines.append("")
+    lines.append("Event reward breakdown (event_rewards table):")
+    if breakdown:
+        header = f"{'Reward type':20} {'Events':>8} {'Total value':>14} {'Avg value':>12}"
+        lines.append(header)
+        lines.append("-" * len(header))
+        for row in breakdown:
+            lines.append(
+                f"{row['reward_type'][:20]:20} "
+                f"{row['events']:8d} "
+                f"{row['total_value']:14.3f} "
+                f"{row['avg_value']:12.3f}"
+            )
+    else:
+        lines.append("  (no event rewards recorded)")
+
+    return "\n".join(lines)
+
+
+def compute_model_achievement_stats(
+    conn: sqlite3.Connection, session_unique_sets: dict[str, Set[str]]
+) -> dict[str, dict[str, Any]]:
+    """Aggregate unique-achievement stats per model."""
+
+    rows = conn.execute(
+        """
+        SELECT session_id, model_name, provider, COUNT(*) AS calls
+        FROM events
+        WHERE event_type = 'cais' AND model_name IS NOT NULL
+        GROUP BY session_id, model_name, provider
+        """
+    ).fetchall()
+
+    session_models: dict[str, tuple[str, str, int]] = {}
+    for row in rows:
+        session_id = row["session_id"]
+        calls = int(row["calls"] or 0)
+        current = session_models.get(session_id)
+        if current is None or calls > current[2]:
+            session_models[session_id] = (row["model_name"], row["provider"], calls)
+
+    model_stats: dict[str, dict[str, Any]] = {}
+    for session_id, (model_name, provider, _calls) in session_models.items():
+        achievements = session_unique_sets.get(session_id, set())
+        unique_count = len(achievements)
+
+        stats = model_stats.setdefault(
+            model_name,
+            {
+                "providers": set(),
+                "sessions": 0,
+                "sessions_with_unique": 0,
+                "total_unique": 0,
+                "max_unique": 0,
+                "achievement_counts": Counter(),
+            },
+        )
+
+        stats["providers"].add(provider or "unknown")
+        stats["sessions"] += 1
+        stats["total_unique"] += unique_count
+        stats["max_unique"] = max(stats["max_unique"], unique_count)
+        if unique_count > 0:
+            stats["sessions_with_unique"] += 1
+        stats["achievement_counts"].update(achievements)
+
+    return model_stats
+
+
+def format_model_achievement_stats(model_stats: dict[str, dict[str, Any]]) -> str:
+    if not model_stats:
+        return "Achievement stats by model:\n  (no model sessions recorded)"
+
+    lines = ["Achievement stats by model:"]
+    for model_name in sorted(model_stats.keys(), key=lambda m: model_stats[m]["sessions"], reverse=True):
+        stats = model_stats[model_name]
+        providers = ", ".join(sorted(stats["providers"])) if stats["providers"] else "-"
+        sessions = stats["sessions"]
+        total_unique = stats["total_unique"]
+        avg_unique = total_unique / sessions if sessions else 0.0
+        sessions_with_unique = stats["sessions_with_unique"]
+        max_unique = stats["max_unique"]
+        lines.append(
+            f"  {model_name} (providers: {providers})\n"
+            f"    sessions={sessions} with_unique={sessions_with_unique} "
+            f"avg_unique={avg_unique:.2f} max_unique={max_unique}"
+        )
+
+        achievement_counts = stats["achievement_counts"]
+        if achievement_counts:
+            lines.append("    achievements:")
+            for name, count in sorted(
+                achievement_counts.items(), key=lambda item: item[1], reverse=True
+            ):
+                lines.append(f"      {name}: {count}")
+        else:
+            lines.append("    achievements: none")
+
+    return "\n".join(lines)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--db",
+        type=Path,
+        default=Path("traces/v3/synth_ai.db"),
+        help="Path to the tracing_v3 SQLite database",
+    )
+    args = parser.parse_args()
+
+    if not args.db.exists():
+        print(f"Database not found: {args.db}", file=sys.stderr)
+        raise SystemExit(1)
+
+    conn = connect(args.db)
+    try:
+        model_stats = fetch_model_usage(conn)
+        print(format_model_stats(model_stats))
+        print("")
+
+        (
+            _achievements_map,
+            _unique_counts_per_session,
+            name_counts,
+            size_counts,
+            session_unique_sets,
+            _session_final_achievements,
+        ) = fetch_achievement_data(conn)
+        outcome_summary, reward_breakdown = fetch_reward_summary(conn)
+
+        print(format_reward_summary(outcome_summary, reward_breakdown))
+        print("")
+        print(format_achievement_summary(name_counts, size_counts))
+        print("")
+        model_achievement_stats = compute_model_achievement_stats(conn, session_unique_sets)
+        print(format_model_achievement_stats(model_achievement_stats))
+    finally:
+        conn.close()
+
+
+if __name__ == "__main__":
+    main()
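The script assumes a tracing_v3 database with events, event_rewards, and outcome_rewards tables. Below is a minimal sketch of that shape, with a column subset inferred only from the queries above (the real schema has more columns and tables), plus the same outcome aggregate the script runs:

import sqlite3

# Hypothetical, minimal subset of the tracing_v3 schema implied by the
# queries in analyze_trace_db.py; for orientation only.
conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row
conn.executescript(
    """
    CREATE TABLE events (
        id INTEGER PRIMARY KEY, session_id TEXT, event_type TEXT,
        model_name TEXT, provider TEXT, total_tokens INTEGER,
        input_tokens INTEGER, output_tokens INTEGER, latency_ms REAL,
        metadata TEXT  -- JSON, e.g. '{"turn": 3}'
    );
    CREATE TABLE event_rewards (
        session_id TEXT, event_id INTEGER, reward_type TEXT,
        reward_value REAL, annotation TEXT  -- JSON, e.g. '{"new_unique": []}'
    );
    CREATE TABLE outcome_rewards (
        session_id TEXT, total_reward REAL, reward_metadata TEXT
    );
    INSERT INTO outcome_rewards VALUES ('sess-1', 3.0, '{"achievements": ["collect_wood"]}');
    """
)
# Same aggregate that fetch_reward_summary() runs over outcome_rewards:
row = conn.execute(
    "SELECT COUNT(*) AS episodes, COALESCE(AVG(total_reward), 0) AS avg_reward "
    "FROM outcome_rewards"
).fetchone()
print(row["episodes"], row["avg_reward"])  # 1 3.0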
examples/warming_up_to_rl/configs/crafter_fft.toml
@@ -0,0 +1,48 @@
+# Crafter Full Finetune (FFT) example on H100
+# Adjust paths and hyperparameters to your environment before running.
+
+[job]
+model = "Qwen/Qwen3-4B"  # base model to finetune
+# Path to your SFT JSONL dataset
+# You can point this to an absolute path or keep relative to this TOML
+# data = "../data/crafter_sft.jsonl"
+
+# Optional: how long to poll the job (seconds)
+poll_seconds = 1800
+
+[compute]
+# Cluster shape
+gpu_type = "H100"
+gpu_count = 4
+nodes = 1
+
+[data.topology]
+# world_size / container count (optional; inferred when omitted)
+container_count = 4
+
+[training]
+mode = "full_finetune"  # for documentation; backend decides based on metadata
+use_qlora = false
+
+[hyperparameters]
+# epochs
+n_epochs = 2
+
+# global batch shape (examples; adjust to your budget)
+world_size = 4
+sequence_length = 2048
+# provide either global_batch OR (per_device_batch × grad_accum × world_size)
+# global_batch = 512
+per_device_batch = 2
+gradient_accumulation_steps = 64
+
+# optimizer/schedule
+learning_rate = 8e-6
+warmup_ratio = 0.03
+
+[hyperparameters.parallelism]
+use_deepspeed = true
+deepspeed_stage = 3
+fsdp = false
+bf16 = true
+fp16 = false
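The batch-shape comment in this config implies global_batch = per_device_batch × gradient_accumulation_steps × world_size. A quick arithmetic check with the values above (assumed semantics; the backend's exact accounting may differ):

# Effective global batch implied by the comment in crafter_fft.toml
per_device_batch = 2
gradient_accumulation_steps = 64
world_size = 4
global_batch = per_device_batch * gradient_accumulation_steps * world_size
print(global_batch)  # 512, matching the commented-out global_batch example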
examples/warming_up_to_rl/configs/crafter_fft_4b.toml
@@ -0,0 +1,54 @@
+# FFT job config for Qwen/Qwen3-4B on Crafter SFT dataset
+
+[algorithm]
+type = "offline"
+method = "supervised_finetune"
+variety = "fft"
+
+
+[job]
+model = "Qwen/Qwen3-4B"
+# Limit training to the first 100 conversations (export a 100-row JSONL and point to it here)
+# data = "../ft_data/qwen3_32b_ach_ge3_raw_filtered.head100.jsonl"
+
+[compute]
+# Adjust as needed for your quota
+gpu_type = "H100"
+gpu_count = 1
+nodes = 1
+
+[data]
+# Optional topology metadata (left empty for now)
+topology = {}
+
+# Optional local validation dataset path (JSONL). If set, the client will upload
+# this file and wire up validation so the frontend can display val.loss.
+validation_path = "../ft_data/qwen3_32b_ach_ge3_raw_filtered.tokens_1000000_seed_123.val_2000.jsonl"
+
+[training]
+mode = "sft_offline"
+use_qlora = false
+
+# Validation settings to emit val.loss on the frontend
+[training.validation]
+enabled = true
+evaluation_strategy = "steps"
+eval_steps = 20
+save_best_model_at_end = true
+metric_for_best_model = "val.loss"
+greater_is_better = false
+
+[hyperparameters]
+# Minimal safe defaults; backend can override
+n_epochs = 1
+batch_size = 1
+gradient_accumulation_steps = 64
+sequence_length = 4096
+learning_rate = 5e-6
+warmup_ratio = 0.03
+train_kind = "fft"
+
+# Optional parallelism block example
+#[hyperparameters.parallelism]
+# tensor_parallel_size = 1
+# pipeline_parallel_size = 1
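To sanity-check a config like this before submitting a job, the standard-library tomllib (Python 3.11+) parses it directly. A self-contained sketch with the validation block inlined as a string (normally you would read the .toml file from disk):

import tomllib

cfg = tomllib.loads(
    """
    [training.validation]
    enabled = true
    evaluation_strategy = "steps"
    eval_steps = 20
    metric_for_best_model = "val.loss"
    greater_is_better = false
    """
)
val = cfg["training"]["validation"]
# Mirrors the checks a client might run before wiring up validation.
print(val["eval_steps"], val["metric_for_best_model"])  # 20 val.loss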
examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml
@@ -0,0 +1,20 @@
+# Eval config for finetuned Qwen/Qwen3-4B (FFT) via task app rollout
+
+# Required
+task_app_url = "https://synth-laboratories--grpo-crafter-task-app-final-warming--ceb5b2.modal.run"
+# Replace with your finished job id if different
+model = "fft:Qwen/Qwen3-4B:job_a53914f51dc146b5"
+policy_name = "crafter"
+# Backend inference proxy base (no /v1); server will append /v1/chat/completions
+inference_url = "https://synth-backend-dev-docker.onrender.com/api/inference"
+
+# Optional
+num_episodes = 10
+max_turns = 10
+concurrency = 10
+# difficulty = "easy"
+
+# Thinking params (optional; align with your service expectations)
+thinking_mode = "think"
+thinking_budget = 1024
+max_tokens = 1024
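The model value packs kind, base model, and job id into one colon-separated string. A hypothetical parser, assuming the "<kind>:<base>:<job_id>" layout holds in general (inferred from this single example, not a documented format):

def parse_ft_model_id(model_id: str) -> tuple[str, str, str]:
    # Assumed layout: "fft:Qwen/Qwen3-4B:job_a53914f51dc146b5".
    # split(":", 2) keeps the base intact since "Qwen/Qwen3-4B" has no colon.
    kind, base, job_id = model_id.split(":", 2)
    return kind, base, job_id

print(parse_ft_model_id("fft:Qwen/Qwen3-4B:job_a53914f51dc146b5"))
# ('fft', 'Qwen/Qwen3-4B', 'job_a53914f51dc146b5')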
examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml
@@ -0,0 +1,13 @@
+# Eval config for Groq Qwen3-32B
+# Fields mirror run_eval.py expectations
+
+# Required
+task_app_url = "https://synth-laboratories--grpo-crafter-task-app-final-warming--ceb5b2.modal.run"
+model = "qwen/qwen3-32b"
+inference_url = "https://api.groq.com/openai"
+
+# Optional
+num_episodes = 20
+max_turns = 10
+concurrency = 10
+# difficulty = "easy"
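The Groq inference_url above is an OpenAI-compatible base without /v1; per the comment in eval_fft_qwen4b.toml, the caller appends /v1/chat/completions. A small sketch of that join (assumed client behavior, mirroring that comment):

def chat_completions_url(base: str) -> str:
    # Assumed join rule: the config holds the base without /v1,
    # and the client appends the OpenAI-compatible path.
    return base.rstrip("/") + "/v1/chat/completions"

print(chat_completions_url("https://api.groq.com/openai"))
# https://api.groq.com/openai/v1/chat/completions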
examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml
@@ -0,0 +1,23 @@
+# Eval config for Synth Modal inference Qwen/Qwen3-4B via task app rollout
+
+# Required
+task_app_url = "https://synth-laboratories--grpo-crafter-task-app-final-warming--ceb5b2.modal.run"
+model = "Qwen/Qwen3-4B"
+policy_name = "crafter"
+inference_url = "https://synth-backend-dev-docker.onrender.com/api/inference"  # Modal inference base (no /v1)
+max_tokens = 1024
+thinking_mode = "think"
+thinking_budget = 1024
+
+# Optional
+num_episodes = 20
+max_turns = 10
+concurrency = 10
+# difficulty = "easy"
+
+# Notes:
+# - run_eval.py --use-rollout will detect provider=vllm for this model and
+#   fetch the vLLM base from the task app /info to use as inference_url.
+# - Ensure the task app mounts the openai-api-key secret if your vLLM gateway
+#   requires a bearer token (OPENAI_API_KEY). Otherwise it will call unauthenticated.
+
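A sketch of the /info lookup described in the notes above, using only the standard library; the response key holding the vLLM base is an assumption, so inspect the real payload before relying on it:

import json
import urllib.request

TASK_APP_URL = "https://synth-laboratories--grpo-crafter-task-app-final-warming--ceb5b2.modal.run"

# Hypothetical sketch of what run_eval.py's /info fetch might look like;
# the "inference_url" key name is assumed, not taken from the task app code.
with urllib.request.urlopen(f"{TASK_APP_URL}/info") as resp:
    info = json.load(resp)
inference_url = info.get("inference_url")  # assumed key; check the payload
print(inference_url)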
examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml
@@ -0,0 +1,73 @@
+# RL training starting from base Qwen/Qwen3-4B (TOML-only model selection)
+
+[algorithm]
+type = "online"
+method = "policy_gradient"
+variety = "gspo"
+
+
+[services]
+task_url = "https://synth-laboratories--grpo-crafter-task-app-final-warming--ceb5b2.modal.run"
+
+[compute]
+# Cluster shape for RL pipeline
+gpu_type = "H100"
+gpu_count = 8
+
+[topology]
+# Split GPUs across vLLM, training, and reference
+# Must sum to compute.gpu_count
+type = "single_node_split"
+gpus_for_vllm = 4
+gpus_for_training = 3
+gpus_for_ref = 1
+tensor_parallel = 4
+
+[vllm]
+# Serving tensor parallel size
+tensor_parallel_size = 4
+max_model_len = 8192
+
+[reference]
+# Required by trainer/runtime; ensures dedicated/scoped scoring server config exists
+placement = "dedicated"
+port = 8002
+tp = 1
+health_max_wait_s = 180
+health_interval_ms = 300
+
+[model]
+# Base model start
+base = "Qwen/Qwen3-4B"
+label = "crafter-rl-from-base"
+
+[rollout]
+max_turns = 10
+episodes_per_batch = 64
+policy_name = "crafter"
+
+[evaluation]
+# Run baseline evaluation over the first 10 seeds every 10 training iterations
+instances = 10
+every_n_iters = 10
+seeds = [
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+]
+
+[training]
+log_interval = 1
+weight_sync_interval = 1
+# Additional RL hyperparameters can go here
+
+# Stepwise rewards (Crafter decision-level)
+step_rewards_enabled = true
+step_rewards_mode = "decision_stepwise"  # "off" | "decision_stepwise" | "env_sparse"
+step_rewards_beta = 0.0
+step_rewards_indicator_lambda = 1.0
+# Optional selector for decision scalar: "unique" | "absolute" (default unique)
+event_rewards_kind = "unique"
+
+[training.weight_sync]
+enable = true
+targets = ["policy"]
+weight_sync_interval = 1
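The [topology] comment requires the three GPU pools to sum to compute.gpu_count. A tiny consistency check over the values above (the sum invariant is stated in the config itself; the tensor-parallel observation is an inference from the numbers):

# Invariant from the [topology] comment: pools must sum to compute.gpu_count.
gpu_count = 8
gpus_for_vllm, gpus_for_training, gpus_for_ref = 4, 3, 1
assert gpus_for_vllm + gpus_for_training + gpus_for_ref == gpu_count  # 4 + 3 + 1 == 8
# vllm.tensor_parallel_size = 4 equals gpus_for_vllm, which suggests a single
# TP-4 serving replica -- an inference, not something the config documents.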