@synsci/cli-darwin-x64 1.1.55 → 1.1.57

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -76,6 +76,7 @@ Use this for standard SFT with JSONL or HuggingFace datasets.
76
76
  ### Implementation
77
77
 
78
78
  ```python
79
+ import json
79
80
  import chz
80
81
  import asyncio
81
82
  from tinker_cookbook.supervised import train
@@ -84,9 +85,12 @@ from tinker_cookbook.supervised.data import FromConversationFileBuilder
84
85
  from tinker_cookbook.renderers import TrainOnWhat
85
86
  from tinker_cookbook.model_info import get_recommended_renderer_name
86
87
  from tinker_cookbook.hyperparam_utils import get_lr
88
+ from tinker_cookbook.tokenizer_utils import get_tokenizer
87
89
 
88
90
  model_name = "Qwen/Qwen3-30B-A3B"
89
91
  renderer_name = get_recommended_renderer_name(model_name)
92
+ num_epochs = 3
93
+ data_file = "training_data.jsonl"
90
94
 
91
95
  common_config = ChatDatasetBuilderCommonConfig(
92
96
  model_name_for_tokenizer=model_name,
@@ -98,7 +102,7 @@ common_config = ChatDatasetBuilderCommonConfig(
98
102
 
99
103
  dataset_builder = FromConversationFileBuilder(
100
104
  common_config=common_config,
101
- file_path="training_data.jsonl",
105
+ file_path=data_file,
102
106
  )
103
107
 
104
108
  blueprint = chz.Blueprint(train.Config).apply({
@@ -107,12 +111,23 @@ blueprint = chz.Blueprint(train.Config).apply({
107
111
  "dataset_builder": dataset_builder,
108
112
  "learning_rate": get_lr(model_name),
109
113
  "lr_schedule": "linear",
110
- "num_epochs": 3,
114
+ "num_epochs": num_epochs,
111
115
  "lora_rank": 32,
112
116
  })
113
117
 
114
118
  config = blueprint.make()
115
119
  asyncio.run(train.main(config))
120
+
121
+ # --- Exact usage reporting (auto-captured by CLI) ---
122
+ tokenizer = get_tokenizer(model_name)
123
+ total_tokens = 0
124
+ with open(data_file) as f:
125
+ for line in f:
126
+ row = json.loads(line)
127
+ text = " ".join(m.get("content", "") for m in row.get("messages", []))
128
+ total_tokens += len(tokenizer.encode(text))
129
+ total_tokens *= num_epochs
130
+ print(f'\n[SYNSCI_USAGE] {json.dumps({"service": "tinker", "event_type": "training", "model": model_name, "tokens_used": total_tokens})}')
116
131
  ```
117
132
 
118
133
  ### Data Format
@@ -145,6 +160,7 @@ Use this for training with reward functions — math reasoning, format complianc
145
160
  ### Implementation (Cookbook)
146
161
 
147
162
  ```python
163
+ import json
148
164
  import asyncio
149
165
  import chz
150
166
  from tinker_cookbook.rl import train
@@ -153,10 +169,13 @@ from tinker_cookbook import model_info
153
169
 
154
170
  model_name = "meta-llama/Llama-3.1-8B"
155
171
  renderer_name = model_info.get_recommended_renderer_name(model_name)
172
+ batch_size = 128
173
+ group_size = 16
174
+ max_tokens = 256
156
175
 
157
176
  builder = Gsm8kDatasetBuilder(
158
- batch_size=128,
159
- group_size=16,
177
+ batch_size=batch_size,
178
+ group_size=group_size,
160
179
  renderer_name=renderer_name,
161
180
  model_name_for_tokenizer=model_name,
162
181
  )
@@ -166,11 +185,27 @@ blueprint = chz.Blueprint(train.Config).apply({
166
185
  "log_path": "/tmp/rl-run",
167
186
  "dataset_builder": builder,
168
187
  "learning_rate": 4e-5,
169
- "max_tokens": 256,
188
+ "max_tokens": max_tokens,
170
189
  })
171
190
 
172
191
  config = blueprint.make()
173
192
  asyncio.run(train.main(config))
193
+
194
+ # --- Usage reporting (auto-captured by CLI; exact when metrics logs are found, otherwise estimated) ---
195
+ # For RL: estimate from batch_size × group_size × max_tokens × num_batches
196
+ # The exact count comes from the training loop — check /tmp/rl-run for logs
197
+ import glob, os
198
+ log_files = sorted(glob.glob("/tmp/rl-run/metrics*.json"))
199
+ total_tokens = 0
200
+ for lf in log_files:
201
+ with open(lf) as f:
202
+ for line in f:
203
+ m = json.loads(line)
204
+ total_tokens += m.get("num_tokens", 0)
205
+ if total_tokens == 0:
206
+ # Fallback estimate: batch_size × group_size × max_tokens × num_batches (num_batches is assumed to be 100 here; adjust to your run)
207
+ total_tokens = batch_size * group_size * max_tokens * 100
208
+ print(f'\n[SYNSCI_USAGE] {json.dumps({"service": "tinker", "event_type": "training", "model": model_name, "tokens_used": total_tokens})}')
174
209
  ```
175
210
 
176
211
  ### Custom RL with Low-Level API
@@ -178,16 +213,20 @@ asyncio.run(train.main(config))
178
213
  For full control over sampling, reward computation, and advantage centering:
179
214
 
180
215
  ```python
216
+ import json
181
217
  import tinker
182
218
  from tinker import types
183
219
  from tinker.types.tensor_data import TensorData
184
220
  import torch
185
221
 
222
+ model_name = "meta-llama/Llama-3.1-8B"
186
223
  service_client = tinker.ServiceClient()
187
224
  training_client = service_client.create_lora_training_client(
188
- base_model="meta-llama/Llama-3.1-8B", rank=32
225
+ base_model=model_name, rank=32
189
226
  )
190
227
 
228
+ total_tokens = 0 # Track exact tokens for billing
229
+
191
230
  for batch_idx, batch_rows in enumerate(dataset):
192
231
  path = training_client.save_weights_for_sampler(name=f"{batch_idx:06d}").result().path
193
232
  sampling_client = service_client.create_sampling_client(model_path=path)
@@ -220,9 +259,15 @@ for batch_idx, batch_rows in enumerate(dataset):
220
259
  )
221
260
  datums.append(datum)
222
261
 
262
+ # Track exact token count from datums
263
+ total_tokens += sum(d.model_input.length() for d in datums)
264
+
223
265
  fwd_bwd = training_client.forward_backward(datums, loss_fn="importance_sampling")
224
266
  optim = training_client.optim_step(types.AdamParams(learning_rate=4e-5))
225
267
  fwd_bwd.result(); optim.result()
268
+
269
+ # --- Exact usage reporting (auto-captured by CLI) ---
270
+ print(f'\n[SYNSCI_USAGE] {json.dumps({"service": "tinker", "event_type": "training", "model": model_name, "tokens_used": total_tokens})}')
226
271
  ```
227
272
 
228
273
  ### Available RL Loss Functions
@@ -342,30 +387,30 @@ See [Evaluations](references/evaluations.md) for custom evaluators and LLM-as-ju
342
387
 
343
388
  ### Pre-Training Cost Estimation
344
389
 
345
- **ALWAYS estimate cost before starting Tinker training.** Load the `tinker-training-cost` skill and use its pricing tables or the bundled calculator:
390
+ **ALWAYS estimate cost before starting Tinker training.** Load the `tinker-training-cost` skill and use its pricing tables or calculate manually:
346
391
 
347
- ```bash
348
- python scripts/calculate_cost.py training_data.jsonl --model Qwen3-8B --epochs 3 --json
392
+ ```
393
+ Training Cost = (dataset_tokens_per_epoch × epochs × train_price_per_million) / 1,000,000
349
394
  ```
350
395
 
351
396
  Present the cost estimate to the user for approval before starting training.
352
397
 
353
- ### Post-Training Usage Reporting
398
+ ### Automatic Usage Reporting (Ground Truth)
354
399
 
355
- After training completes, report usage for billing:
400
+ **CRITICAL**: All training scripts MUST print a `[SYNSCI_USAGE]` line at the end. The CLI automatically captures this and reports exact billing to the dashboard.
356
401
 
357
- ```typescript
358
- // Called automatically by the CLI after Tinker training
359
- await SynSci.reportUsage({
360
- service: "tinker",
361
- event_type: "training",
362
- model: "Qwen3-8B",
363
- tokens_used: 5000000,
364
- metadata: { run_id: "my-run", epochs: 3, dataset: "training_data.jsonl" },
365
- })
402
+ ```python
403
+ # Add this at the END of every training script:
404
+ import json
405
+ print(f'\n[SYNSCI_USAGE] {json.dumps({"service": "tinker", "event_type": "training", "model": model_name, "tokens_used": total_tokens})}')
366
406
  ```
367
407
 
368
- The CLI tracks all Tinker usage and reports it to the Synthetic Sciences dashboard for billing.
408
+ How token counting works per workflow:
409
+ - **Cookbook SFT**: Tokenize dataset with `get_tokenizer(model_name)`, multiply by `num_epochs`
410
+ - **Cookbook RL**: Parse training logs for `num_tokens`; if none are found, fall back to an estimate of `batch_size × group_size × max_tokens × num_batches`
411
+ - **Low-level API**: Sum `datum.model_input.length()` across all `forward_backward()` calls
412
+
413
+ The CLI bash tool scans output for `[SYNSCI_USAGE]` markers and auto-reports to the dashboard — no manual reporting needed.
369
414
 
370
415
  ## Common Issues
371
416
 
package/bin/synsc CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@synsci/cli-darwin-x64",
3
- "version": "1.1.55",
3
+ "version": "1.1.57",
4
4
  "os": [
5
5
  "darwin"
6
6
  ],