@synsci/cli-darwin-x64 1.1.55 → 1.1.57
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/tinker/SKILL.md +66 -21
- package/bin/synsc +0 -0
- package/package.json +1 -1
|
@@ -76,6 +76,7 @@ Use this for standard SFT with JSONL or HuggingFace datasets.
|
|
|
76
76
|
### Implementation
|
|
77
77
|
|
|
78
78
|
```python
|
|
79
|
+
import json
|
|
79
80
|
import chz
|
|
80
81
|
import asyncio
|
|
81
82
|
from tinker_cookbook.supervised import train
|
|
@@ -84,9 +85,12 @@ from tinker_cookbook.supervised.data import FromConversationFileBuilder
|
|
|
84
85
|
from tinker_cookbook.renderers import TrainOnWhat
|
|
85
86
|
from tinker_cookbook.model_info import get_recommended_renderer_name
|
|
86
87
|
from tinker_cookbook.hyperparam_utils import get_lr
|
|
88
|
+
from tinker_cookbook.tokenizer_utils import get_tokenizer
|
|
87
89
|
|
|
88
90
|
model_name = "Qwen/Qwen3-30B-A3B"
|
|
89
91
|
renderer_name = get_recommended_renderer_name(model_name)
|
|
92
|
+
num_epochs = 3
|
|
93
|
+
data_file = "training_data.jsonl"
|
|
90
94
|
|
|
91
95
|
common_config = ChatDatasetBuilderCommonConfig(
|
|
92
96
|
model_name_for_tokenizer=model_name,
|
|
@@ -98,7 +102,7 @@ common_config = ChatDatasetBuilderCommonConfig(
|
|
|
98
102
|
|
|
99
103
|
dataset_builder = FromConversationFileBuilder(
|
|
100
104
|
common_config=common_config,
|
|
101
|
-
file_path=
|
|
105
|
+
file_path=data_file,
|
|
102
106
|
)
|
|
103
107
|
|
|
104
108
|
blueprint = chz.Blueprint(train.Config).apply({
|
|
@@ -107,12 +111,23 @@ blueprint = chz.Blueprint(train.Config).apply({
|
|
|
107
111
|
"dataset_builder": dataset_builder,
|
|
108
112
|
"learning_rate": get_lr(model_name),
|
|
109
113
|
"lr_schedule": "linear",
|
|
110
|
-
"num_epochs":
|
|
114
|
+
"num_epochs": num_epochs,
|
|
111
115
|
"lora_rank": 32,
|
|
112
116
|
})
|
|
113
117
|
|
|
114
118
|
config = blueprint.make()
|
|
115
119
|
asyncio.run(train.main(config))
|
|
120
|
+
|
|
121
|
+
# --- Exact usage reporting (auto-captured by CLI) ---
|
|
122
|
+
tokenizer = get_tokenizer(model_name)
|
|
123
|
+
total_tokens = 0
|
|
124
|
+
with open(data_file) as f:
|
|
125
|
+
for line in f:
|
|
126
|
+
row = json.loads(line)
|
|
127
|
+
text = " ".join(m.get("content", "") for m in row.get("messages", []))
|
|
128
|
+
total_tokens += len(tokenizer.encode(text))
|
|
129
|
+
total_tokens *= num_epochs
|
|
130
|
+
print(f'\n[SYNSCI_USAGE] {json.dumps({"service": "tinker", "event_type": "training", "model": model_name, "tokens_used": total_tokens})}')
|
|
116
131
|
```
|
|
117
132
|
|
|
118
133
|
### Data Format
|
|
@@ -145,6 +160,7 @@ Use this for training with reward functions — math reasoning, format complianc
|
|
|
145
160
|
### Implementation (Cookbook)
|
|
146
161
|
|
|
147
162
|
```python
|
|
163
|
+
import json
|
|
148
164
|
import asyncio
|
|
149
165
|
import chz
|
|
150
166
|
from tinker_cookbook.rl import train
|
|
@@ -153,10 +169,13 @@ from tinker_cookbook import model_info
|
|
|
153
169
|
|
|
154
170
|
model_name = "meta-llama/Llama-3.1-8B"
|
|
155
171
|
renderer_name = model_info.get_recommended_renderer_name(model_name)
|
|
172
|
+
batch_size = 128
|
|
173
|
+
group_size = 16
|
|
174
|
+
max_tokens = 256
|
|
156
175
|
|
|
157
176
|
builder = Gsm8kDatasetBuilder(
|
|
158
|
-
batch_size=
|
|
159
|
-
group_size=
|
|
177
|
+
batch_size=batch_size,
|
|
178
|
+
group_size=group_size,
|
|
160
179
|
renderer_name=renderer_name,
|
|
161
180
|
model_name_for_tokenizer=model_name,
|
|
162
181
|
)
|
|
@@ -166,11 +185,27 @@ blueprint = chz.Blueprint(train.Config).apply({
|
|
|
166
185
|
"log_path": "/tmp/rl-run",
|
|
167
186
|
"dataset_builder": builder,
|
|
168
187
|
"learning_rate": 4e-5,
|
|
169
|
-
"max_tokens":
|
|
188
|
+
"max_tokens": max_tokens,
|
|
170
189
|
})
|
|
171
190
|
|
|
172
191
|
config = blueprint.make()
|
|
173
192
|
asyncio.run(train.main(config))
|
|
193
|
+
|
|
194
|
+
# --- Exact usage reporting (auto-captured by CLI) ---
|
|
195
|
+
# For RL: estimate from batch_size × group_size × max_tokens × num_batches
|
|
196
|
+
# The exact count comes from the training loop — check /tmp/rl-run for logs
|
|
197
|
+
import glob, os
|
|
198
|
+
log_files = sorted(glob.glob("/tmp/rl-run/metrics*.json"))
|
|
199
|
+
total_tokens = 0
|
|
200
|
+
for lf in log_files:
|
|
201
|
+
with open(lf) as f:
|
|
202
|
+
for line in f:
|
|
203
|
+
m = json.loads(line)
|
|
204
|
+
total_tokens += m.get("num_tokens", 0)
|
|
205
|
+
if total_tokens == 0:
|
|
206
|
+
# Fallback estimate: batch_size × group_size × max_tokens × num_batches (num_batches is assumed to be 100 below — replace it with the actual batch count)
|
|
207
|
+
total_tokens = batch_size * group_size * max_tokens * 100
|
|
208
|
+
print(f'\n[SYNSCI_USAGE] {json.dumps({"service": "tinker", "event_type": "training", "model": model_name, "tokens_used": total_tokens})}')
|
|
174
209
|
```
|
|
175
210
|
|
|
176
211
|
### Custom RL with Low-Level API
|
|
@@ -178,16 +213,20 @@ asyncio.run(train.main(config))
|
|
|
178
213
|
For full control over sampling, reward computation, and advantage centering:
|
|
179
214
|
|
|
180
215
|
```python
|
|
216
|
+
import json
|
|
181
217
|
import tinker
|
|
182
218
|
from tinker import types
|
|
183
219
|
from tinker.types.tensor_data import TensorData
|
|
184
220
|
import torch
|
|
185
221
|
|
|
222
|
+
model_name = "meta-llama/Llama-3.1-8B"
|
|
186
223
|
service_client = tinker.ServiceClient()
|
|
187
224
|
training_client = service_client.create_lora_training_client(
|
|
188
|
-
base_model=
|
|
225
|
+
base_model=model_name, rank=32
|
|
189
226
|
)
|
|
190
227
|
|
|
228
|
+
total_tokens = 0 # Track exact tokens for billing
|
|
229
|
+
|
|
191
230
|
for batch_idx, batch_rows in enumerate(dataset):
|
|
192
231
|
path = training_client.save_weights_for_sampler(name=f"{batch_idx:06d}").result().path
|
|
193
232
|
sampling_client = service_client.create_sampling_client(model_path=path)
|
|
@@ -220,9 +259,15 @@ for batch_idx, batch_rows in enumerate(dataset):
|
|
|
220
259
|
)
|
|
221
260
|
datums.append(datum)
|
|
222
261
|
|
|
262
|
+
# Track exact token count from datums
|
|
263
|
+
total_tokens += sum(d.model_input.length() for d in datums)
|
|
264
|
+
|
|
223
265
|
fwd_bwd = training_client.forward_backward(datums, loss_fn="importance_sampling")
|
|
224
266
|
optim = training_client.optim_step(types.AdamParams(learning_rate=4e-5))
|
|
225
267
|
fwd_bwd.result(); optim.result()
|
|
268
|
+
|
|
269
|
+
# --- Exact usage reporting (auto-captured by CLI) ---
|
|
270
|
+
print(f'\n[SYNSCI_USAGE] {json.dumps({"service": "tinker", "event_type": "training", "model": model_name, "tokens_used": total_tokens})}')
|
|
226
271
|
```
|
|
227
272
|
|
|
228
273
|
### Available RL Loss Functions
|
|
@@ -342,30 +387,30 @@ See [Evaluations](references/evaluations.md) for custom evaluators and LLM-as-ju
|
|
|
342
387
|
|
|
343
388
|
### Pre-Training Cost Estimation
|
|
344
389
|
|
|
345
|
-
**ALWAYS estimate cost before starting Tinker training.** Load the `tinker-training-cost` skill and use its pricing tables or
|
|
390
|
+
**ALWAYS estimate cost before starting Tinker training.** Load the `tinker-training-cost` skill and use its pricing tables or calculate manually:
|
|
346
391
|
|
|
347
|
-
```
|
|
348
|
-
|
|
392
|
+
```
|
|
393
|
+
Training Cost = (total_tokens × epochs × train_price_per_million) / 1,000,000   (here total_tokens = tokens in ONE pass over the dataset, before multiplying by epochs)
|
|
349
394
|
```
|
|
350
395
|
|
|
351
396
|
Present the cost estimate to the user for approval before starting training.
|
|
352
397
|
|
|
353
|
-
###
|
|
398
|
+
### Automatic Usage Reporting (Ground Truth)
|
|
354
399
|
|
|
355
|
-
|
|
400
|
+
**CRITICAL**: All training scripts MUST print a `[SYNSCI_USAGE]` line at the end. The CLI automatically captures this and reports exact billing to the dashboard.
|
|
356
401
|
|
|
357
|
-
```
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
event_type: "training",
|
|
362
|
-
model: "Qwen3-8B",
|
|
363
|
-
tokens_used: 5000000,
|
|
364
|
-
metadata: { run_id: "my-run", epochs: 3, dataset: "training_data.jsonl" },
|
|
365
|
-
})
|
|
402
|
+
```python
|
|
403
|
+
# Add this at the END of every training script:
|
|
404
|
+
import json
|
|
405
|
+
print(f'\n[SYNSCI_USAGE] {json.dumps({"service": "tinker", "event_type": "training", "model": model_name, "tokens_used": total_tokens})}')
|
|
366
406
|
```
|
|
367
407
|
|
|
368
|
-
|
|
408
|
+
How token counting works per workflow:
|
|
409
|
+
- **Cookbook SFT**: Tokenize the message contents of the dataset with `get_tokenizer(model_name)`, then multiply the total by `num_epochs`
|
|
410
|
+
- **Cookbook RL**: Parse the training logs for `num_tokens`; if none are found, fall back to an estimate of `batch_size × group_size × max_tokens × num_batches`
|
|
411
|
+
- **Low-level API**: Sum `datum.model_input.length()` across all `forward_backward()` calls
|
|
412
|
+
|
|
413
|
+
The CLI bash tool scans output for `[SYNSCI_USAGE]` markers and auto-reports to the dashboard — no manual reporting needed.
|
|
369
414
|
|
|
370
415
|
## Common Issues
|
|
371
416
|
|
package/bin/synsc
CHANGED
|
Binary file
|