synth-ai 0.2.4.dev7__py3-none-any.whl → 0.2.4.dev9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (154) hide show
  1. synth_ai/__init__.py +1 -1
  2. synth_ai/cli/__init__.py +6 -0
  3. synth_ai/cli/balance.py +3 -15
  4. synth_ai/cli/demo.py +68 -9
  5. synth_ai/cli/rl_demo.py +137 -0
  6. synth_ai/cli/root.py +65 -0
  7. synth_ai/config/base_url.py +47 -0
  8. synth_ai/demos/core/__init__.py +1 -0
  9. synth_ai/demos/core/cli.py +621 -0
  10. synth_ai/demos/demo_task_apps/__init__.py +1 -0
  11. synth_ai/demos/demo_task_apps/core.py +374 -0
  12. synth_ai/demos/demo_task_apps/math/__init__.py +1 -0
  13. synth_ai/demos/demo_task_apps/math/app.py +37 -0
  14. synth_ai/demos/demo_task_apps/math/config.toml +44 -0
  15. synth_ai/demos/demo_task_apps/math/deploy_modal.py +60 -0
  16. synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +22 -0
  17. synth_ai/environments/examples/bandit/__init__.py +33 -0
  18. synth_ai/environments/examples/bandit/engine.py +294 -0
  19. synth_ai/environments/examples/bandit/environment.py +194 -0
  20. synth_ai/environments/examples/bandit/taskset.py +200 -0
  21. synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +250 -0
  22. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +59 -0
  23. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +152 -0
  24. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +24 -0
  25. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +1194 -0
  26. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +56 -0
  27. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +32 -0
  28. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +724 -0
  29. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +384 -0
  30. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +53 -0
  31. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +178 -0
  32. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +222 -0
  33. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +183 -0
  34. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +210 -0
  35. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +206 -0
  36. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +49 -0
  37. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +64 -0
  38. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +88 -0
  39. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +77 -0
  40. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +324 -0
  41. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +580 -0
  42. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +362 -0
  43. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +49 -0
  44. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +332 -0
  45. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +97 -0
  46. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +217 -0
  47. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +87 -0
  48. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +88 -0
  49. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +195 -0
  50. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +400 -0
  51. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +195 -0
  52. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +56 -0
  53. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +858 -0
  54. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +52 -0
  55. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +874 -0
  56. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +1412 -0
  57. synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +216 -0
  58. synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +296 -0
  59. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +58 -0
  60. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +464 -0
  61. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +152 -0
  62. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +51 -0
  63. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +1412 -0
  64. synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +112 -0
  65. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +203 -0
  66. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +305 -0
  67. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +126 -0
  68. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +94 -0
  69. synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +142 -0
  70. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +26 -0
  71. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +984 -0
  72. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +724 -0
  73. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +386 -0
  74. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +205 -0
  75. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +150 -0
  76. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +283 -0
  77. synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +280 -0
  78. synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +456 -0
  79. synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +166 -0
  80. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +102 -0
  81. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +128 -0
  82. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +655 -0
  83. synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +202 -0
  84. synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +166 -0
  85. synth_ai/environments/examples/crafter_classic/environment.py +41 -2
  86. synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +1 -0
  87. synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +202 -0
  88. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +159 -0
  89. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +158 -0
  90. synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +71 -0
  91. synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +105 -0
  92. synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +119 -0
  93. synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +52 -0
  94. synth_ai/environments/examples/enron/units/keyword_stats.py +112 -0
  95. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +1188 -0
  96. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +48 -0
  97. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +562 -0
  98. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +221 -0
  99. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +981 -0
  100. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +74 -0
  101. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +831 -0
  102. synth_ai/environments/examples/red/agent_demos/__init__.py +1 -0
  103. synth_ai/environments/examples/red/units/__init__.py +1 -0
  104. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +899 -0
  105. synth_ai/environments/examples/sokoban/units/astar_common.py +95 -0
  106. synth_ai/environments/service/app.py +8 -0
  107. synth_ai/http.py +102 -0
  108. synth_ai/inference/__init__.py +7 -0
  109. synth_ai/inference/client.py +20 -0
  110. synth_ai/install_sqld.sh +40 -0
  111. synth_ai/jobs/client.py +246 -0
  112. synth_ai/learning/__init__.py +24 -0
  113. synth_ai/learning/client.py +149 -0
  114. synth_ai/learning/config.py +43 -0
  115. synth_ai/learning/constants.py +29 -0
  116. synth_ai/learning/ft_client.py +59 -0
  117. synth_ai/learning/health.py +43 -0
  118. synth_ai/learning/jobs.py +205 -0
  119. synth_ai/learning/rl_client.py +256 -0
  120. synth_ai/learning/sse.py +58 -0
  121. synth_ai/learning/validators.py +48 -0
  122. synth_ai/lm/core/main_v3.py +13 -0
  123. synth_ai/lm/core/synth_models.py +48 -0
  124. synth_ai/lm/core/vendor_clients.py +9 -6
  125. synth_ai/lm/vendors/core/openai_api.py +31 -3
  126. synth_ai/lm/vendors/openai_standard.py +45 -14
  127. synth_ai/lm/vendors/supported/custom_endpoint.py +12 -2
  128. synth_ai/lm/vendors/synth_client.py +372 -28
  129. synth_ai/rl/__init__.py +30 -0
  130. synth_ai/rl/contracts.py +32 -0
  131. synth_ai/rl/env_keys.py +137 -0
  132. synth_ai/rl/secrets.py +19 -0
  133. synth_ai/scripts/verify_rewards.py +100 -0
  134. synth_ai/task/__init__.py +10 -0
  135. synth_ai/task/contracts.py +120 -0
  136. synth_ai/task/health.py +28 -0
  137. synth_ai/task/validators.py +12 -0
  138. synth_ai/tracing_v3/hooks.py +3 -1
  139. synth_ai/tracing_v3/session_tracer.py +123 -2
  140. synth_ai/tracing_v3/turso/manager.py +218 -0
  141. synth_ai/tracing_v3/turso/models.py +53 -0
  142. synth_ai-0.2.4.dev9.dist-info/METADATA +91 -0
  143. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/RECORD +147 -30
  144. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/entry_points.txt +1 -0
  145. synth_ai/tui/__init__.py +0 -1
  146. synth_ai/tui/__main__.py +0 -13
  147. synth_ai/tui/cli/__init__.py +0 -1
  148. synth_ai/tui/cli/query_experiments.py +0 -164
  149. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  150. synth_ai/tui/dashboard.py +0 -340
  151. synth_ai-0.2.4.dev7.dist-info/METADATA +0 -193
  152. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/WHEEL +0 -0
  153. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/licenses/LICENSE +0 -0
  154. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,280 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Prepare and Validate JSONL Data for Vertex AI Fine-tuning
4
+ =========================================================
5
+ This script validates and prepares JSONL data for Gemini fine-tuning on Vertex AI.
6
+ """
7
+
8
+ import json
9
+ import argparse
10
+ from pathlib import Path
11
+ from typing import Dict, List, Any, Tuple
12
+ import sys
13
+ from collections import defaultdict
14
+ import random
15
+
16
+
17
def validate_jsonl_line(line: str, line_num: int) -> Tuple[bool, str, Dict[str, Any]]:
    """Validate a single JSONL line for Vertex AI compatibility.

    Args:
        line: Raw text of one JSONL record; surrounding whitespace is stripped
            before parsing.
        line_num: Line number used only to prefix error messages.

    Returns:
        A ``(is_valid, message, stats)`` tuple. On failure ``message``
        describes the first problem found and ``stats`` is an empty dict.
        On success ``message`` is ``"OK"`` and ``stats`` holds
        ``num_messages``, ``roles``, ``token_estimate`` and ``total_chars``.
    """
    try:
        data = json.loads(line.strip())
    except json.JSONDecodeError as e:
        return False, f"Line {line_num}: Invalid JSON - {e}", {}

    # A bare scalar or array is valid JSON but not a record; without this
    # guard the membership test / subscript below raises TypeError.
    if not isinstance(data, dict):
        return False, f"Line {line_num}: Top-level JSON value must be an object", {}

    # Check required structure
    if "messages" not in data:
        return False, f"Line {line_num}: Missing 'messages' field", {}

    messages = data["messages"]
    if not isinstance(messages, list):
        return False, f"Line {line_num}: 'messages' must be a list", {}

    if len(messages) < 2:
        return False, f"Line {line_num}: Need at least 2 messages (user and assistant)", {}

    # Validate message structure
    valid_roles = {"user", "assistant", "system"}
    for i, msg in enumerate(messages):
        if not isinstance(msg, dict):
            return False, f"Line {line_num}: Message {i} must be a dict", {}

        if "role" not in msg:
            return False, f"Line {line_num}: Message {i} missing 'role'", {}

        if "content" not in msg:
            return False, f"Line {line_num}: Message {i} missing 'content'", {}

        role = msg["role"]
        # The isinstance check keeps unhashable roles (lists, dicts) from
        # raising inside the set membership test.
        if not isinstance(role, str) or role not in valid_roles:
            return False, f"Line {line_num}: Message {i} invalid role '{msg['role']}'", {}

        # The character count below is only meaningful for text content, and
        # len(None) would abort the whole run.
        if not isinstance(msg["content"], str):
            return False, f"Line {line_num}: Message {i} 'content' must be a string", {}

    # Check conversation flow
    if messages[-1]["role"] != "assistant":
        return False, f"Line {line_num}: Last message must be from assistant", {}

    # Calculate token estimate (rough): about four characters per token.
    total_chars = sum(len(msg["content"]) for msg in messages)
    token_estimate = total_chars // 4

    return True, "OK", {
        "num_messages": len(messages),
        "roles": [msg["role"] for msg in messages],
        "token_estimate": token_estimate,
        "total_chars": total_chars,
    }
64
+
65
+
66
def analyze_jsonl_file(file_path: Path) -> Dict[str, Any]:
    """Analyze a JSONL file for Vertex AI compatibility.

    Every non-blank line is checked with ``validate_jsonl_line`` and the
    results are aggregated over the whole file.

    Args:
        file_path: Path to the JSONL file; it is read as UTF-8.

    Returns:
        A dict with ``total_lines``, ``valid_lines``, ``invalid_lines``,
        ``errors`` (a capped list of messages), ``token_distribution``,
        ``message_count_distribution``, ``role_patterns`` and
        ``total_tokens_estimate``.
    """
    # Only this many error messages are stored; later invalid lines are still
    # counted so the totals describe the entire file.
    max_stored_errors = 11
    # (exclusive upper bound, label) pairs for the token histogram.
    token_buckets = (
        (100, "<100"),
        (500, "100-500"),
        (1000, "500-1000"),
        (2000, "1000-2000"),
    )

    stats = {
        "total_lines": 0,
        "valid_lines": 0,
        "invalid_lines": 0,
        "errors": [],
        "token_distribution": defaultdict(int),
        "message_count_distribution": defaultdict(int),
        "role_patterns": defaultdict(int),
        "total_tokens_estimate": 0,
    }

    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            if not line.strip():
                continue

            stats["total_lines"] += 1
            is_valid, error_msg, line_stats = validate_jsonl_line(line, line_num)

            if not is_valid:
                stats["invalid_lines"] += 1
                stored = len(stats["errors"])
                if stored < max_stored_errors:
                    stats["errors"].append(error_msg)
                elif stored == max_stored_errors:
                    # Add the marker once, and only when a message was
                    # actually dropped.
                    stats["errors"].append("... (truncated)")
                continue

            stats["valid_lines"] += 1
            tokens = line_stats["token_estimate"]
            stats["total_tokens_estimate"] += tokens

            # Bucket token counts
            for upper_bound, label in token_buckets:
                if tokens < upper_bound:
                    break
            else:
                label = "2000+"
            stats["token_distribution"][label] += 1

            # Message count distribution
            stats["message_count_distribution"][line_stats["num_messages"]] += 1

            # Role patterns, e.g. "user->assistant"
            stats["role_patterns"]["->".join(line_stats["roles"])] += 1

    return stats
119
+
120
+
121
def create_subset(input_path: Path, output_path: Path, num_examples: int,
                  shuffle: bool = True, seed: int = 42):
    """Create a subset of the JSONL file.

    Only lines accepted by ``validate_jsonl_line`` are candidates.

    Args:
        input_path: Source JSONL file, read as UTF-8.
        output_path: Destination file; it is overwritten.
        num_examples: Maximum number of examples to write. Values below zero
            are treated as zero.
        shuffle: When true, shuffle the valid lines before taking the first
            ``num_examples``.
        seed: Seed for the shuffle, so the same input gives the same subset.

    Returns:
        The number of examples written.
    """
    # Read all valid lines
    valid_lines = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            if not line.strip():
                continue
            is_valid, _, _ = validate_jsonl_line(line, line_num)
            if is_valid:
                valid_lines.append(line.strip())

    # Sample subset. A private Random instance yields the same permutation as
    # seeding the module-level generator, without reseeding it for the rest of
    # the process.
    if shuffle:
        random.Random(seed).shuffle(valid_lines)

    # Clamp at zero: a negative slice bound would drop lines from the end
    # instead of selecting none.
    subset = valid_lines[:max(num_examples, 0)]

    # Write subset
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{line}\n" for line in subset)

    print(f"✅ Created subset with {len(subset)} examples at {output_path}")
    return len(subset)
147
+
148
+
149
def convert_for_vertex_ai(input_path: Path, output_path: Path,
                          add_system_prompt: bool = True):
    """Convert JSONL to Vertex AI format with optional enhancements.

    Each non-blank input line is parsed as JSON and rewritten as an object
    containing only its ``messages`` list. Lines that cannot be parsed or do
    not have the expected shape are reported and skipped.

    Args:
        input_path: Source JSONL file, read as UTF-8.
        output_path: Destination file; it is overwritten.
        add_system_prompt: When true, prepend the built-in Crafter system
            prompt to any record whose first message is not a system message.

    Returns:
        The number of records written.
    """
    converted_count = 0

    system_prompt = (
        "You are an expert Crafter player. Your goal is to achieve as many objectives as possible efficiently.\n"
        "\n"
        "Key objectives: collect resources, craft tools (pickaxe → stone pickaxe → iron pickaxe), make iron sword, survive.\n"
        "\n"
        "Always think step-by-step about your current situation and plan your next action carefully."
    )

    with open(input_path, 'r', encoding='utf-8') as inf, \
            open(output_path, 'w', encoding='utf-8') as outf:
        for line in inf:
            if not line.strip():
                continue

            # Only malformed-record errors are treated as skippable; anything
            # else (for example an I/O failure) propagates to the caller.
            try:
                data = json.loads(line)
                messages = data["messages"]

                # Optionally add system prompt
                if add_system_prompt and messages[0]["role"] != "system":
                    messages = [{"role": "system", "content": system_prompt}] + messages
            except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
                print(f"⚠️ Skipping line due to error: {e}")
                continue

            # Ensure proper format: any other top-level keys are dropped.
            outf.write(json.dumps({"messages": messages}) + '\n')
            converted_count += 1

    print(f"✅ Converted {converted_count} examples to {output_path}")
    return converted_count
184
+
185
+
186
def estimate_training_cost(stats: Dict[str, Any], price_per_million: float = 4.0):
    """Estimate Vertex AI training cost.

    Args:
        stats: Mapping that provides ``total_tokens_estimate``.
        price_per_million: Price in USD charged per one million tokens.

    Returns:
        A dict with the raw token total, the total in millions and the
        estimated cost (both rounded to two decimals), and the price used.
    """
    token_total = stats["total_tokens_estimate"]
    millions = token_total / 1_000_000

    report = {"total_tokens": token_total}
    report["total_millions"] = round(millions, 2)
    # The cost is computed from the unrounded millions figure.
    report["estimated_cost_usd"] = round(millions * price_per_million, 2)
    report["price_per_million"] = price_per_million
    return report
198
+
199
+
200
def print_analysis_report(stats: Dict[str, Any], cost_estimate: Dict[str, Any]):
    """Print a detailed analysis report.

    Args:
        stats: Aggregated statistics providing ``valid_lines``,
            ``invalid_lines``, ``total_lines``, ``errors``,
            ``token_distribution`` and ``role_patterns``.
        cost_estimate: Cost summary providing ``total_tokens``,
            ``total_millions``, ``estimated_cost_usd`` and
            ``price_per_million``.
    """
    print("\n" + "=" * 60)
    print("📊 VERTEX AI FINE-TUNING DATA ANALYSIS")
    print("=" * 60)

    total_lines = stats['total_lines']
    # An empty input has no lines to divide by; report 0% instead of raising
    # ZeroDivisionError.
    validation_rate = stats['valid_lines'] / total_lines * 100 if total_lines else 0.0

    print(f"\n✅ Valid examples: {stats['valid_lines']}")
    print(f"❌ Invalid examples: {stats['invalid_lines']}")
    print(f"📝 Total lines: {total_lines}")
    print(f"✔️ Validation rate: {validation_rate:.1f}%")

    if stats['errors']:
        print("\n⚠️ First few errors:")
        for error in stats['errors'][:5]:
            print(f" - {error}")

    # Show buckets in ascending numeric order; a plain string sort would put
    # "1000-2000" ahead of "500-1000" and "<100" last.
    bucket_order = ["<100", "100-500", "500-1000", "1000-2000", "2000+"]

    def bucket_rank(item):
        label = item[0]
        rank = bucket_order.index(label) if label in bucket_order else len(bucket_order)
        return rank, str(label)

    print("\n📊 Token Distribution:")
    for bucket, count in sorted(stats['token_distribution'].items(), key=bucket_rank):
        print(f" {bucket} tokens: {count} examples")

    print("\n💬 Message Patterns:")
    # Five most frequent role sequences.
    top_patterns = sorted(stats['role_patterns'].items(),
                          key=lambda x: x[1], reverse=True)[:5]
    for pattern, count in top_patterns:
        print(f" {pattern}: {count} examples")

    print("\n💰 Cost Estimate:")
    print(f" Total tokens: {cost_estimate['total_tokens']:,}")
    print(f" Token millions: {cost_estimate['total_millions']}")
    print(f" Estimated cost: ${cost_estimate['estimated_cost_usd']} USD")
    print(f" (at ${cost_estimate['price_per_million']}/million tokens)")

    print("\n📝 Recommendations:")
    if stats['valid_lines'] < 100:
        print(" ⚠️ Dataset is small. Consider generating more examples.")
    if stats['valid_lines'] > 10000:
        print(" 💡 Large dataset. Consider creating a smaller subset for initial tests.")
    if cost_estimate['estimated_cost_usd'] > 100:
        print(" 💰 High estimated cost. Consider using a subset for initial experiments.")
238
+
239
+
240
def main():
    """Command-line entry point.

    Always analyzes the input file first. The analysis report is printed when
    ``--validate`` is given or when neither ``--create-subset`` nor
    ``--convert`` is requested. The subset and conversion steps then run if
    requested, each writing to ``--output`` or to a file name derived from
    the input path.
    """
    parser = argparse.ArgumentParser(description="Prepare and validate JSONL for Vertex AI")
    parser.add_argument("jsonl_path", type=Path, help="Path to JSONL file")
    parser.add_argument("--validate", action="store_true", help="Validate the JSONL file")
    parser.add_argument("--create-subset", type=int, help="Create subset with N examples")
    parser.add_argument("--convert", action="store_true", help="Convert to Vertex AI format")
    parser.add_argument("--add-system", action="store_true", help="Add system prompt to messages")
    parser.add_argument("--output", type=Path, help="Output path for converted/subset file")

    args = parser.parse_args()

    # Compare against None so an explicit "--create-subset 0" is rejected
    # rather than silently treated as "option not given".
    want_subset = args.create_subset is not None
    if want_subset and args.create_subset <= 0:
        parser.error("--create-subset must be a positive integer")

    # Both steps would write to the same --output path and the conversion
    # would overwrite the subset, so refuse the combination.
    if args.output and want_subset and args.convert:
        parser.error("--output cannot be combined with both --create-subset and --convert")

    if not args.jsonl_path.exists():
        sys.exit(f"❌ File not found: {args.jsonl_path}")

    # Always run validation
    print(f"🔍 Analyzing {args.jsonl_path}...")
    stats = analyze_jsonl_file(args.jsonl_path)
    cost_estimate = estimate_training_cost(stats)

    if args.validate or not (want_subset or args.convert):
        print_analysis_report(stats, cost_estimate)

    # Create subset if requested
    if want_subset:
        output_path = args.output or args.jsonl_path.with_name(
            f"{args.jsonl_path.stem}_subset_{args.create_subset}.jsonl"
        )
        create_subset(args.jsonl_path, output_path, args.create_subset)

    # Convert if requested
    if args.convert:
        output_path = args.output or args.jsonl_path.with_name(
            f"{args.jsonl_path.stem}_vertex.jsonl"
        )
        convert_for_vertex_ai(args.jsonl_path, output_path, args.add_system)

    print("\n✅ Done!")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()