synth-ai 0.2.4.dev7__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- synth_ai/__init__.py +1 -1
- synth_ai/cli/__init__.py +6 -0
- synth_ai/cli/balance.py +3 -15
- synth_ai/cli/demo.py +68 -9
- synth_ai/cli/rl_demo.py +137 -0
- synth_ai/cli/root.py +65 -0
- synth_ai/config/base_url.py +47 -0
- synth_ai/demos/core/__init__.py +1 -0
- synth_ai/demos/core/cli.py +621 -0
- synth_ai/demos/demo_task_apps/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/core.py +374 -0
- synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/math/app.py +37 -0
- synth_ai/demos/demo_task_apps/math/config.toml +44 -0
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
- synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
- synth_ai/environments/examples/bandit/__init__.py +33 -0
- synth_ai/environments/examples/bandit/engine.py +294 -0
- synth_ai/environments/examples/bandit/environment.py +194 -0
- synth_ai/environments/examples/bandit/taskset.py +200 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
- synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
- synth_ai/environments/examples/crafter_classic/environment.py +41 -2
- synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
- synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
- synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
- synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
- synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
- synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
- synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
- synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
- synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
- synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
- synth_ai/environments/examples/red/units/__init__.py +1 -0
- synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
- synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
- synth_ai/environments/service/app.py +8 -0
- synth_ai/http.py +102 -0
- synth_ai/inference/__init__.py +7 -0
- synth_ai/inference/client.py +20 -0
- synth_ai/install_sqld.sh +40 -0
- synth_ai/jobs/client.py +246 -0
- synth_ai/learning/__init__.py +24 -0
- synth_ai/learning/client.py +149 -0
- synth_ai/learning/config.py +43 -0
- synth_ai/learning/constants.py +29 -0
- synth_ai/learning/ft_client.py +59 -0
- synth_ai/learning/health.py +43 -0
- synth_ai/learning/jobs.py +205 -0
- synth_ai/learning/rl_client.py +256 -0
- synth_ai/learning/sse.py +58 -0
- synth_ai/learning/validators.py +48 -0
- synth_ai/lm/core/main_v3.py +13 -0
- synth_ai/lm/core/synth_models.py +48 -0
- synth_ai/lm/core/vendor_clients.py +9 -6
- synth_ai/lm/vendors/core/openai_api.py +31 -3
- synth_ai/lm/vendors/openai_standard.py +45 -14
- synth_ai/lm/vendors/supported/custom_endpoint.py +12 -2
- synth_ai/lm/vendors/synth_client.py +372 -28
- synth_ai/rl/__init__.py +30 -0
- synth_ai/rl/contracts.py +32 -0
- synth_ai/rl/env_keys.py +137 -0
- synth_ai/rl/secrets.py +19 -0
- synth_ai/scripts/verify_rewards.py +100 -0
- synth_ai/task/__init__.py +10 -0
- synth_ai/task/contracts.py +120 -0
- synth_ai/task/health.py +28 -0
- synth_ai/task/validators.py +12 -0
- synth_ai/tracing_v3/hooks.py +3 -1
- synth_ai/tracing_v3/session_tracer.py +123 -2
- synth_ai/tracing_v3/turso/manager.py +218 -0
- synth_ai/tracing_v3/turso/models.py +53 -0
- synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +147 -30
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
- synth_ai/tui/__init__.py +0 -1
- synth_ai/tui/__main__.py +0 -13
- synth_ai/tui/cli/__init__.py +0 -1
- synth_ai/tui/cli/query_experiments.py +0 -164
- synth_ai/tui/cli/query_experiments_v3.py +0 -164
- synth_ai/tui/dashboard.py +0 -340
- synth_ai-0.2.4.dev7.dist-info/METADATA +0 -193
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Prepare and Validate JSONL Data for Vertex AI Fine-tuning
|
|
4
|
+
=========================================================
|
|
5
|
+
This script validates and prepares JSONL data for Gemini fine-tuning on Vertex AI.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import argparse
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Dict, List, Any, Tuple
|
|
12
|
+
import sys
|
|
13
|
+
from collections import defaultdict
|
|
14
|
+
import random
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def validate_jsonl_line(line: str, line_num: int) -> Tuple[bool, str, Dict[str, Any]]:
    """Validate a single JSONL line for Vertex AI compatibility.

    Args:
        line: Raw text of one JSONL record; surrounding whitespace is ignored.
        line_num: Line number used only to prefix error messages.

    Returns:
        A ``(is_valid, message, stats)`` tuple. On success ``message`` is
        ``"OK"`` and ``stats`` contains ``num_messages``, ``roles``,
        ``token_estimate`` and ``total_chars``. On failure ``message``
        describes the first problem found and ``stats`` is an empty dict.
    """
    try:
        data = json.loads(line.strip())
    except json.JSONDecodeError as e:
        return False, f"Line {line_num}: Invalid JSON - {e}", {}

    # json.loads may legally return a list, string, number or None; indexing
    # those by "messages" would raise instead of reporting a validation error.
    if not isinstance(data, dict):
        return False, f"Line {line_num}: Top-level JSON value must be an object", {}

    # Check required structure
    if "messages" not in data:
        return False, f"Line {line_num}: Missing 'messages' field", {}

    messages = data["messages"]
    if not isinstance(messages, list):
        return False, f"Line {line_num}: 'messages' must be a list", {}

    if len(messages) < 2:
        return False, f"Line {line_num}: Need at least 2 messages (user and assistant)", {}

    # Validate message structure
    valid_roles = {"user", "assistant", "system"}
    for i, msg in enumerate(messages):
        if not isinstance(msg, dict):
            return False, f"Line {line_num}: Message {i} must be a dict", {}

        if "role" not in msg:
            return False, f"Line {line_num}: Message {i} missing 'role'", {}

        if "content" not in msg:
            return False, f"Line {line_num}: Message {i} missing 'content'", {}

        role = msg["role"]
        # The isinstance guard keeps unhashable roles (lists/dicts) from
        # raising TypeError in the set membership test.
        if not isinstance(role, str) or role not in valid_roles:
            return False, f"Line {line_num}: Message {i} invalid role '{msg['role']}'", {}

        # The character-based token estimate below only makes sense for text.
        if not isinstance(msg["content"], str):
            return False, f"Line {line_num}: Message {i} 'content' must be a string", {}

    # Check conversation flow
    if messages[-1]["role"] != "assistant":
        return False, f"Line {line_num}: Last message must be from assistant", {}

    # Calculate token estimate (rough): about four characters per token.
    total_chars = sum(len(msg["content"]) for msg in messages)
    token_estimate = total_chars // 4

    return True, "OK", {
        "num_messages": len(messages),
        "roles": [msg["role"] for msg in messages],
        "token_estimate": token_estimate,
        "total_chars": total_chars,
    }
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def analyze_jsonl_file(file_path: Path) -> Dict[str, Any]:
    """Analyze a JSONL file for Vertex AI compatibility.

    Every non-blank line is validated with ``validate_jsonl_line`` and the
    results are aggregated.

    Args:
        file_path: Path of the JSONL file to read (decoded as UTF-8).

    Returns:
        A dict with the keys ``total_lines``, ``valid_lines``,
        ``invalid_lines``, ``errors``, ``token_distribution``,
        ``message_count_distribution``, ``role_patterns`` and
        ``total_tokens_estimate``. Blank lines are not counted. ``errors``
        holds at most 11 messages followed by a ``"... (truncated)"`` marker;
        the counters always cover the whole file.
    """
    stats = {
        "total_lines": 0,
        "valid_lines": 0,
        "invalid_lines": 0,
        "errors": [],
        "token_distribution": defaultdict(int),
        "message_count_distribution": defaultdict(int),
        "role_patterns": defaultdict(int),
        "total_tokens_estimate": 0,
    }

    # Exclusive upper bounds of the token buckets; anything larger is "2000+".
    token_buckets = (
        (100, "<100"),
        (500, "100-500"),
        (1000, "500-1000"),
        (2000, "1000-2000"),
    )
    errors_truncated = False

    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            if not line.strip():
                continue

            stats["total_lines"] += 1
            is_valid, error_msg, line_stats = validate_jsonl_line(line, line_num)

            if not is_valid:
                stats["invalid_lines"] += 1
                # Cap only the recorded messages. Scanning continues so the
                # counters and token totals describe the whole file rather
                # than the prefix that preceded the 11th error.
                if not errors_truncated:
                    stats["errors"].append(error_msg)
                    if len(stats["errors"]) > 10:
                        stats["errors"].append("... (truncated)")
                        errors_truncated = True
                continue

            stats["valid_lines"] += 1
            tokens = line_stats["token_estimate"]
            stats["total_tokens_estimate"] += tokens

            # Bucket token counts
            bucket = next(
                (label for limit, label in token_buckets if tokens < limit),
                "2000+",
            )
            stats["token_distribution"][bucket] += 1

            # Message count distribution
            stats["message_count_distribution"][line_stats["num_messages"]] += 1

            # Role patterns, e.g. "user->assistant"
            stats["role_patterns"]["->".join(line_stats["roles"])] += 1

    return stats
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def create_subset(input_path: Path, output_path: Path, num_examples: int,
                  shuffle: bool = True, seed: int = 42):
    """Create a subset of the JSONL file.

    Only lines accepted by ``validate_jsonl_line`` are eligible. Reading
    finishes before the output is opened.

    Args:
        input_path: Source JSONL file (decoded as UTF-8).
        output_path: Destination file, overwritten if it exists.
        num_examples: Maximum number of examples to write.
        shuffle: When true, shuffle the valid lines before taking the subset.
        seed: Seed for the shuffle, making the selection reproducible.

    Returns:
        The number of examples actually written.

    Raises:
        ValueError: If ``num_examples`` is negative.
    """
    # A negative count would slice "all but the last k" lines, which is never
    # what a caller asking for N examples means.
    if num_examples < 0:
        raise ValueError(f"num_examples must be non-negative, got {num_examples}")

    # Read all valid lines
    valid_lines = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            if not line.strip():
                continue
            is_valid, _, _ = validate_jsonl_line(line, line_num)
            if is_valid:
                valid_lines.append(line.strip())

    # Sample subset. A private generator yields the same order as seeding the
    # module-level one, without resetting global random state for callers.
    if shuffle:
        random.Random(seed).shuffle(valid_lines)

    subset = valid_lines[:num_examples]

    # Write subset
    with open(output_path, 'w', encoding='utf-8') as f:
        for line in subset:
            f.write(line + '\n')

    print(f"✅ Created subset with {len(subset)} examples at {output_path}")
    return len(subset)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def convert_for_vertex_ai(input_path: Path, output_path: Path,
                          add_system_prompt: bool = True):
    """Convert JSONL to Vertex AI format with optional enhancements.

    Each non-blank input line is parsed, reduced to its ``messages`` field and
    written back as ``{"messages": [...]}``. Lines that cannot be parsed or do
    not have the expected shape are reported on stdout and skipped.

    Args:
        input_path: Source JSONL file (decoded as UTF-8).
        output_path: Destination file, overwritten if it exists.
        add_system_prompt: When true, prepend the built-in Crafter system
            prompt to every example whose first message is not a system one.

    Returns:
        The number of examples written.

    Raises:
        ValueError: If ``input_path`` and ``output_path`` are the same file.
    """
    # Opening the output in 'w' mode truncates it at once; if it is also the
    # input, every example would be destroyed before being read.
    if Path(input_path).resolve() == Path(output_path).resolve():
        raise ValueError("input_path and output_path must be different files")

    converted_count = 0

    system_prompt = (
        "You are an expert Crafter player. Your goal is to achieve as many objectives as possible efficiently.\n"
        "\n"
        "Key objectives: collect resources, craft tools (pickaxe → stone pickaxe → iron pickaxe), make iron sword, survive.\n"
        "\n"
        "Always think step-by-step about your current situation and plan your next action carefully."
    )

    with open(input_path, 'r', encoding='utf-8') as inf, \
            open(output_path, 'w', encoding='utf-8') as outf:
        for line in inf:
            if not line.strip():
                continue

            try:
                data = json.loads(line)
                messages = data["messages"]

                # Optionally add system prompt
                if add_system_prompt and messages[0]["role"] != "system":
                    messages = [{"role": "system", "content": system_prompt}] + messages

                # Ensure proper format: drop every key except "messages".
                serialized = json.dumps({"messages": messages})
            except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
                # Only malformed records are skipped; I/O failures are left to
                # propagate rather than being reported as bad lines.
                print(f"⚠️ Skipping line due to error: {e}")
                continue

            outf.write(serialized + '\n')
            converted_count += 1

    print(f"✅ Converted {converted_count} examples to {output_path}")
    return converted_count
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def estimate_training_cost(stats: Dict[str, Any], price_per_million: float = 4.0):
    """Estimate Vertex AI training cost.

    Args:
        stats: Mapping that provides ``total_tokens_estimate``.
        price_per_million: Price in USD charged per one million tokens.

    Returns:
        A dict with the raw token count, the count in millions and the
        estimated cost (both rounded to two decimals), and the price used.
    """
    token_count = stats["total_tokens_estimate"]
    millions = token_count / 1_000_000
    cost = millions * price_per_million

    summary = {}
    summary["total_tokens"] = token_count
    summary["total_millions"] = round(millions, 2)
    summary["estimated_cost_usd"] = round(cost, 2)
    summary["price_per_million"] = price_per_million
    return summary
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def print_analysis_report(stats: Dict[str, Any], cost_estimate: Dict[str, Any]):
    """Print a detailed analysis report.

    Args:
        stats: Aggregated statistics; reads ``valid_lines``, ``invalid_lines``,
            ``total_lines``, ``errors``, ``token_distribution`` and
            ``role_patterns``.
        cost_estimate: Cost summary; reads ``total_tokens``,
            ``total_millions``, ``estimated_cost_usd`` and
            ``price_per_million``.
    """
    print("\n" + "=" * 60)
    print("📊 VERTEX AI FINE-TUNING DATA ANALYSIS")
    print("=" * 60)

    print(f"\n✅ Valid examples: {stats['valid_lines']}")
    print(f"❌ Invalid examples: {stats['invalid_lines']}")
    print(f"📝 Total lines: {stats['total_lines']}")
    # An empty (or all-blank) file has no lines to divide by.
    if stats['total_lines']:
        print(f"✔️ Validation rate: {stats['valid_lines']/stats['total_lines']*100:.1f}%")
    else:
        print("✔️ Validation rate: n/a (no lines analyzed)")

    if stats['errors']:
        print(f"\n⚠️ First few errors:")
        for error in stats['errors'][:5]:
            print(f" - {error}")

    print(f"\n📊 Token Distribution:")
    # Plain string sorting would put "100-500" before "<100" and "2000+"
    # before "500-1000", so list the known buckets in numeric order first.
    bucket_order = ("<100", "100-500", "500-1000", "1000-2000", "2000+")
    distribution = stats['token_distribution']
    ordered_buckets = [bucket for bucket in bucket_order if bucket in distribution]
    ordered_buckets += sorted(bucket for bucket in distribution if bucket not in bucket_order)
    for bucket in ordered_buckets:
        print(f" {bucket} tokens: {distribution[bucket]} examples")

    print(f"\n💬 Message Patterns:")
    # Five most frequent role sequences, most common first.
    for pattern, count in sorted(stats['role_patterns'].items(),
                                 key=lambda x: x[1], reverse=True)[:5]:
        print(f" {pattern}: {count} examples")

    print(f"\n💰 Cost Estimate:")
    print(f" Total tokens: {cost_estimate['total_tokens']:,}")
    print(f" Token millions: {cost_estimate['total_millions']}")
    print(f" Estimated cost: ${cost_estimate['estimated_cost_usd']} USD")
    print(f" (at ${cost_estimate['price_per_million']}/million tokens)")

    print("\n📝 Recommendations:")
    if stats['valid_lines'] < 100:
        print(" ⚠️ Dataset is small. Consider generating more examples.")
    if stats['valid_lines'] > 10000:
        print(" 💡 Large dataset. Consider creating a smaller subset for initial tests.")
    if cost_estimate['estimated_cost_usd'] > 100:
        print(" 💰 High estimated cost. Consider using a subset for initial experiments.")
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def main():
    """Command-line entry point: analyze a JSONL file, then optionally
    write a subset and/or a Vertex AI formatted copy of it."""
    cli = argparse.ArgumentParser(description="Prepare and validate JSONL for Vertex AI")
    cli.add_argument("jsonl_path", type=Path, help="Path to JSONL file")
    cli.add_argument("--validate", action="store_true", help="Validate the JSONL file")
    cli.add_argument("--create-subset", type=int, help="Create subset with N examples")
    cli.add_argument("--convert", action="store_true", help="Convert to Vertex AI format")
    cli.add_argument("--add-system", action="store_true", help="Add system prompt to messages")
    cli.add_argument("--output", type=Path, help="Output path for converted/subset file")

    opts = cli.parse_args()
    source = opts.jsonl_path

    if not source.exists():
        sys.exit(f"❌ File not found: {source}")

    # The analysis always runs, even when its report is not printed.
    print(f"🔍 Analyzing {source}...")
    file_stats = analyze_jsonl_file(source)
    cost = estimate_training_cost(file_stats)

    # Show the report when asked for, or when no other action was requested.
    no_other_action = not (opts.create_subset or opts.convert)
    if opts.validate or no_other_action:
        print_analysis_report(file_stats, cost)

    # Subset, written next to the input unless --output is given.
    if opts.create_subset:
        subset_target = opts.output or source.with_name(
            f"{source.stem}_subset_{opts.create_subset}.jsonl"
        )
        create_subset(source, subset_target, opts.create_subset)

    # Conversion, likewise defaulting to a sibling of the input file.
    if opts.convert:
        vertex_target = opts.output or source.with_name(
            f"{source.stem}_vertex.jsonl"
        )
        convert_for_vertex_ai(source, vertex_target, opts.add_system)

    print("\n✅ Done!")
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|