@synsci/cli-darwin-x64-baseline 1.1.73 → 1.1.75
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/hugging-face-datasets/SKILL.md +2 -2
- package/bin/skills/hugging-face-evaluation/SKILL.md +2 -2
- package/bin/skills/hugging-face-jobs/SKILL.md +1 -2
- package/bin/skills/hugging-face-model-trainer/SKILL.md +1 -2
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +1 -1
- package/bin/skills/hugging-face-tool-builder/SKILL.md +1 -1
- package/bin/skills/hugging-face-trackio/SKILL.md +2 -2
- package/bin/skills/llm-as-judge-evaluation/SKILL.md +385 -0
- package/bin/skills/llm-as-judge-evaluation/references/pairwise-comparison.md +95 -0
- package/bin/skills/llm-as-judge-evaluation/references/scoring-rubrics.md +169 -0
- package/bin/skills/model-economics/SKILL.md +238 -0
- package/bin/skills/training-data-pipeline/SKILL.md +427 -0
- package/bin/skills/training-data-pipeline/references/data-quality.md +136 -0
- package/bin/skills/training-data-pipeline/references/frontier-distillation.md +129 -0
- package/bin/skills/training-data-pipeline/references/production-data-formatting.md +126 -0
- package/bin/synsc +0 -0
- package/package.json +1 -1

@@ -0,0 +1,427 @@ package/bin/skills/training-data-pipeline/SKILL.md
---
name: training-data-pipeline
description: Build training datasets for LLM specialization from production data, frontier model distillation, and synthetic bootstrapping. Use when formatting production logs into SFT data, distilling from frontier APIs, or preparing data for fine-tuning. Covers JSONL formatting, data quality validation, deduplication, and train/eval splitting.
version: 1.0.0
author: Synthetic Sciences
license: MIT
tags: [Training Data, Data Pipeline, Fine-Tuning, Distillation, Synthetic Data, Production Data, JSONL, Data Quality]
dependencies: [datasets, transformers, openai]
---

# Training Data Pipeline

## When to Use This Skill

Use this skill when you need to:
- **Format production logs** into SFT training data (API logs, user corrections, accept/reject signals)
- **Distill from frontier models** using batch APIs (OpenAI, Anthropic) to label production inputs
- **Bootstrap synthetic data** when fewer than 1000 real examples exist
- **Validate data quality** before training (dedup, schema check, diversity metrics)
- **Split data** into train/eval sets with production data reserved for evaluation

### Three Data Paths

| Path | When to Use | Data Source | Cost |
|------|-------------|-------------|------|
| A) Production data | Have API logs or user feedback | Your own production systems | Free (already collected) |
| B) Frontier distillation | Have production inputs but no labels | OpenAI/Anthropic batch APIs | ~50% of real-time API cost |
| C) Synthetic bootstrap | < 1000 real examples | Frontier model generation | Varies by volume |

**Always prefer Path A**: production data is the moat competitors can't replicate.

## JSONL Chat Format

All training platforms (Tinker, Unsloth, TRL, Axolotl) accept this standard chat format:

```jsonl
{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is 2+2?"}, {"role": "assistant", "content": "4"}]}
{"messages": [{"role": "user", "content": "Translate to French: Hello"}, {"role": "assistant", "content": "Bonjour"}]}
```

### Format Rules
- One JSON object per line, no trailing commas
- `messages` array with `role` and `content` fields
- Roles: `system` (optional, first only), `user`, `assistant` (alternating)
- Multi-turn: alternate user/assistant pairs within a single messages array
- UTF-8 encoding, no BOM
- `assistant` messages are the training targets; everything else is context

### Platform-Specific Notes

**Tinker**: Standard chat format above. Max 32K tokens per example. System message optional.

**Unsloth/TRL**: Same format. Also accepts `{"prompt": "...", "completion": "..."}` for simple pairs. Chat format preferred for multi-turn (see the converter sketch below).

**Axolotl**: Supports multiple formats via config. Recommend `chat_template` type with standard JSONL.
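
If existing data is in prompt/completion form, a small converter brings it into the chat format above; a minimal sketch (the `prompt`/`completion` field names follow the Unsloth/TRL pairs just mentioned, and the file names are illustrative):

```python
import json

def prompt_completion_to_chat(infile, outfile, system_prompt=None):
    """Normalize {"prompt", "completion"} pairs into the standard chat format."""
    with open(infile) as f, open(outfile, "w") as out:
        for line in f:
            pair = json.loads(line)
            messages = []
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            messages.append({"role": "user", "content": pair["prompt"]})
            messages.append({"role": "assistant", "content": pair["completion"]})
            out.write(json.dumps({"messages": messages}) + "\n")
```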

## Path A: Production Data Collection

### From API Logs

If you log API requests/responses, convert them directly:

```python
import json

def api_log_to_training(log_entry):
    """Convert an API request/response log to training format."""
    messages = []

    # Add system prompt if present
    if log_entry.get("system_prompt"):
        messages.append({
            "role": "system",
            "content": log_entry["system_prompt"]
        })

    # Add the user's input
    messages.append({
        "role": "user",
        "content": log_entry["user_input"]
    })

    # Add the response (use corrected version if available)
    response = log_entry.get("corrected_response") or log_entry["api_response"]
    messages.append({
        "role": "assistant",
        "content": response
    })

    return {"messages": messages}

# Process logs
with open("api_logs.jsonl") as f, open("training_data.jsonl", "w") as out:
    for line in f:
        log = json.loads(line)
        example = api_log_to_training(log)
        out.write(json.dumps(example) + "\n")
```

### From User Corrections

User corrections (edits to model output) are the highest-quality training signal:

```python
def correction_to_training(original_input, corrected_output, system_prompt=None):
    """Convert a user correction into a training example.
    The corrected output becomes the training target."""
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": original_input})
    messages.append({"role": "assistant", "content": corrected_output})
    return {"messages": messages}
```

### From Accept/Reject Signals

If users accept or reject model outputs, use accepted outputs as positive examples:

```python
def filter_accepted(logs):
    """Keep only examples where the user accepted the output."""
    accepted = []
    for log in logs:
        if log.get("user_action") == "accepted":
            accepted.append({
                "messages": [
                    {"role": "user", "content": log["input"]},
                    {"role": "assistant", "content": log["output"]}
                ]
            })
    return accepted
```

## Path B: Frontier Distillation

Use frontier models to label your production inputs. Best when you have real inputs but no gold labels.

### OpenAI Batch API (50% discount)

```python
import json

def create_batch_file(inputs, system_prompt, model="gpt-4o"):
    """Create a batch file for OpenAI Batch API."""
    requests = []
    for i, user_input in enumerate(inputs):
        requests.append({
            "custom_id": f"request-{i}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_input}
                ],
                "max_tokens": 4096
            }
        })

    with open("batch_input.jsonl", "w") as f:
        for req in requests:
            f.write(json.dumps(req) + "\n")
    return "batch_input.jsonl"

# Submit batch
# openai api batches create -i batch_input.jsonl -e /v1/chat/completions -c 24h
```
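
The same submission can be done from the Python SDK instead of the CLI; a minimal sketch with the `openai` client:

```python
from openai import OpenAI

client = OpenAI()

def submit_batch(batch_file="batch_input.jsonl"):
    """Upload the batch input file and start a 24h batch job."""
    uploaded = client.files.create(file=open(batch_file, "rb"), purpose="batch")
    batch = client.batches.create(
        input_file_id=uploaded.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    )
    return batch.id

# Poll until finished, then download the results file:
# batch = client.batches.retrieve(batch_id)
# if batch.status == "completed":
#     client.files.content(batch.output_file_id).write_to_file("batch_results.jsonl")
```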

### Anthropic Batch API

```python
import anthropic

client = anthropic.Anthropic()

def create_anthropic_batch(inputs, system_prompt, model="claude-sonnet-4-5-20250929"):
    """Create batch request for Anthropic Message Batches API."""
    requests = []
    for i, user_input in enumerate(inputs):
        requests.append({
            "custom_id": f"request-{i}",
            "params": {
                "model": model,
                "max_tokens": 4096,
                "system": system_prompt,
                "messages": [
                    {"role": "user", "content": user_input}
                ]
            }
        })

    batch = client.messages.batches.create(requests=requests)
    return batch.id
```
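
Anthropic batches are then polled and read back through the same `client`; a minimal retrieval sketch following the Message Batches result types (`succeeded`/`errored`):

```python
import time

def collect_anthropic_results(batch_id):
    """Poll a message batch until it ends, then collect succeeded results by custom_id."""
    while client.messages.batches.retrieve(batch_id).processing_status != "ended":
        time.sleep(60)

    outputs = {}
    for entry in client.messages.batches.results(batch_id):
        if entry.result.type == "succeeded":
            outputs[entry.custom_id] = entry.result.message.content[0].text
    return outputs
```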

### Processing Batch Results

```python
import json

def batch_results_to_training(results_file, inputs, system_prompt=None):
    """Convert batch API results (OpenAI result format) into training JSONL."""
    training = []
    with open(results_file) as f:
        for line in f:
            result = json.loads(line)
            idx = int(result["custom_id"].split("-")[1])
            messages = []
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            messages.append({"role": "user", "content": inputs[idx]})
            # Extract assistant response from batch result
            content = result["response"]["body"]["choices"][0]["message"]["content"]
            messages.append({"role": "assistant", "content": content})
            training.append({"messages": messages})

    with open("distilled_training.jsonl", "w") as f:
        for example in training:
            f.write(json.dumps(example) + "\n")
    return len(training)
```

## Path C: Synthetic Bootstrapping

Generate training data from scratch when you have < 1000 real examples. Use it as a starting point, then replace it with production data as it accumulates.

### Seed Prompt Strategy

```python
import json
import openai

client = openai.OpenAI()

def generate_synthetic_examples(task_description, seed_examples, n=500, model="gpt-4o"):
    """Generate diverse synthetic training examples from seed examples."""

    meta_prompt = f"""You are generating training data for an LLM that will be fine-tuned for:
{task_description}

Here are {min(len(seed_examples), 5)} real examples of the desired behavior:
{json.dumps(seed_examples[:5], indent=2)}

Generate a NEW, diverse example. The input should cover a different scenario than
the seeds. The output should match the quality and style of the examples above.

Return JSON: {{"input": "...", "output": "..."}}"""

    examples = []
    for i in range(n):
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": meta_prompt}],
            response_format={"type": "json_object"},
            temperature=0.9,  # High temp for diversity
        )
        example = json.loads(response.choices[0].message.content)
        examples.append({
            "messages": [
                {"role": "user", "content": example["input"]},
                {"role": "assistant", "content": example["output"]}
            ]
        })
    return examples
```

### Diversity Strategies
- Vary temperature (0.7-1.0) across generation batches (see the sketch after this list)
- Use different frontier models (GPT-4o, Claude, Gemini) to reduce model-specific bias
- Seed with representative prompts from different categories/difficulty levels
- Include edge cases and adversarial examples explicitly in seed prompts
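
To apply the first two strategies, sweep model and temperature across batches; a minimal sketch reusing `generate_synthetic_examples`, assuming it is extended with a `temperature` parameter (the version above hardcodes 0.9), with an illustrative model/temperature grid:

```python
def generate_diverse_batches(task_description, seed_examples, per_batch=100):
    """Sweep teacher model and temperature across batches to diversify output.

    Assumes generate_synthetic_examples() gains a `temperature` parameter
    passed through to the API call (hardcoded to 0.9 in the version above).
    """
    examples = []
    for model in ("gpt-4o", "gpt-4o-mini"):        # illustrative model pool
        for temperature in (0.7, 0.8, 0.9, 1.0):   # illustrative grid
            examples.extend(generate_synthetic_examples(
                task_description, seed_examples,
                n=per_batch, model=model, temperature=temperature,
            ))
    return examples
```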

## Data Quality Validation

### Schema Validation

```python
import json

def validate_jsonl(filepath):
    """Validate JSONL training file format."""
    errors = []
    valid = 0
    total = 0
    with open(filepath) as f:
        for i, line in enumerate(f, 1):
            total += 1
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"Line {i}: Invalid JSON: {e}")
                continue

            if "messages" not in obj:
                errors.append(f"Line {i}: Missing 'messages' key")
                continue

            msgs = obj["messages"]
            if not isinstance(msgs, list) or len(msgs) < 2:
                errors.append(f"Line {i}: 'messages' must be a list with >= 2 entries")
                continue

            # Check roles
            has_user = any(m.get("role") == "user" for m in msgs)
            has_assistant = any(m.get("role") == "assistant" for m in msgs)
            if not has_user or not has_assistant:
                errors.append(f"Line {i}: Must have at least one user and one assistant message")
                continue

            # Only count the line as valid if every message passes field checks
            line_ok = True
            for j, msg in enumerate(msgs):
                if "role" not in msg or "content" not in msg:
                    errors.append(f"Line {i}, message {j}: Missing 'role' or 'content'")
                    line_ok = False
                elif msg["role"] not in ("system", "user", "assistant"):
                    errors.append(f"Line {i}, message {j}: Invalid role '{msg['role']}'")
                    line_ok = False

            if line_ok:
                valid += 1

    return {"valid": valid, "errors": errors, "total": total}
```

### Deduplication (MinHash)

```python
from datasketch import MinHash, MinHashLSH

def deduplicate_dataset(examples, threshold=0.8):
    """Remove near-duplicate examples using MinHash LSH."""
    lsh = MinHashLSH(threshold=threshold, num_perm=128)
    unique = []

    for i, ex in enumerate(examples):
        # Hash the assistant's response (the training target)
        text = ex["messages"][-1]["content"]
        m = MinHash(num_perm=128)
        for word in text.lower().split():
            m.update(word.encode("utf-8"))

        key = f"doc-{i}"
        if not lsh.query(m):
            lsh.insert(key, m)
            unique.append(ex)

    removed = len(examples) - len(unique)
    print(f"Removed {removed} duplicates ({removed/len(examples)*100:.1f}%)")
    return unique
```

### Diversity Metrics

```python
from collections import Counter

def distinct_n(texts, n=2):
    """Calculate distinct-n metric (ratio of unique n-grams to total n-grams)."""
    total_ngrams = Counter()
    for text in texts:
        words = text.lower().split()
        ngrams = [tuple(words[i:i+n]) for i in range(len(words)-n+1)]
        total_ngrams.update(ngrams)
    if sum(total_ngrams.values()) == 0:
        return 0.0
    return len(total_ngrams) / sum(total_ngrams.values())

def dataset_diversity_report(examples):
    """Generate diversity metrics for a training dataset."""
    responses = [ex["messages"][-1]["content"] for ex in examples]
    inputs = [m["content"] for ex in examples for m in ex["messages"] if m["role"] == "user"]

    report = {
        "total_examples": len(examples),
        "avg_response_length": sum(len(r.split()) for r in responses) / len(responses),
        "avg_input_length": sum(len(i.split()) for i in inputs) / len(inputs),
        "distinct_1": distinct_n(responses, 1),
        "distinct_2": distinct_n(responses, 2),
        "distinct_3": distinct_n(responses, 3),
    }
    return report
```

## Train/Eval Split

```python
import random

def split_dataset(examples, eval_ratio=0.1, production_indices=None):
    """Split dataset into train/eval, keeping production data in eval for ground truth.

    Args:
        examples: List of training examples
        eval_ratio: Fraction of data for evaluation (default 10%)
        production_indices: Indices of real production examples (always go to eval)
    """
    production_indices = set(production_indices or [])
    synthetic = [ex for i, ex in enumerate(examples) if i not in production_indices]
    production = [ex for i, ex in enumerate(examples) if i in production_indices]

    # Production data goes to eval (ground truth)
    eval_set = list(production)

    # Fill remaining eval budget from synthetic
    remaining_eval = max(0, int(len(examples) * eval_ratio) - len(eval_set))
    random.shuffle(synthetic)
    eval_set.extend(synthetic[:remaining_eval])
    train_set = synthetic[remaining_eval:]

    print(f"Train: {len(train_set)}, Eval: {len(eval_set)} "
          f"({len(production)} production + {len(eval_set)-len(production)} synthetic)")
    return train_set, eval_set
```

## Common Issues

| Issue | Cause | Fix |
|-------|-------|-----|
| `JSONDecodeError` | Trailing commas or malformed JSON | Run `validate_jsonl()` and fix flagged lines |
| Tokenizer mismatch | Data tokenized for wrong model | Always use target model's tokenizer for length checks |
| Training loss doesn't decrease | Data too noisy or contradictory | Filter low-quality examples, check for duplicates |
| Model repeats training data | Overfitting on small dataset | Add more diverse examples, reduce epochs |
| Data leakage | Eval examples appear in training | Use `split_dataset()` with `production_indices` |
| Encoding errors | Non-UTF-8 characters | `text.encode('utf-8', errors='replace').decode('utf-8')` |
| Examples too long | Exceeds model context | Truncate or split long conversations, check tokenizer limits (see the sketch below) |
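
For that last row, a token-count filter run before training drops oversize examples; a minimal sketch with a Hugging Face tokenizer (the 32K default mirrors the Tinker limit noted above and is an assumption for other platforms):

```python
from transformers import AutoTokenizer

def drop_overlong(examples, model_name="meta-llama/Llama-3.1-8B", max_tokens=32_000):
    """Keep only examples that fit within the target model's context window."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    kept = []
    for ex in examples:
        text = tokenizer.apply_chat_template(ex["messages"], tokenize=False)
        if len(tokenizer.encode(text)) <= max_tokens:
            kept.append(ex)
    print(f"Kept {len(kept)}/{len(examples)} within {max_tokens} tokens")
    return kept
```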

## Quick Start Checklist

1. **Identify data source**: Production logs (A), frontier distillation (B), or synthetic (C)
2. **Format to JSONL**: Standard chat format with messages array
3. **Validate**: Run `validate_jsonl()` on the output file
4. **Deduplicate**: Run MinHash dedup with 0.8 threshold
5. **Check diversity**: Run `dataset_diversity_report()`, aim for distinct-2 > 0.5
6. **Split**: 90/10 train/eval, production data in eval set
7. **Count tokens**: Verify no examples exceed the model's context window
8. **Proceed to training**: Load `tinker` or `unsloth` skill for the next step
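
Steps 3-6 compose into a single driver; a minimal sketch wiring together the helpers defined above (file names are illustrative):

```python
import json

def run_pipeline(raw_file="training_data.jsonl", production_indices=()):
    """Validate, dedup, report on, and split a formatted JSONL dataset."""
    result = validate_jsonl(raw_file)
    if result["errors"]:
        raise ValueError(f"{len(result['errors'])} invalid lines; fix before continuing")

    with open(raw_file) as f:
        examples = [json.loads(line) for line in f]

    # Remember which objects came from production before dedup changes positions
    production = {id(ex) for i, ex in enumerate(examples) if i in set(production_indices)}

    examples = deduplicate_dataset(examples, threshold=0.8)
    print(dataset_diversity_report(examples))

    prod_idx = [i for i, ex in enumerate(examples) if id(ex) in production]
    train, eval_set = split_dataset(examples, eval_ratio=0.1, production_indices=prod_idx)

    for name, split in (("train.jsonl", train), ("eval.jsonl", eval_set)):
        with open(name, "w") as f:
            for ex in split:
                f.write(json.dumps(ex) + "\n")
```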

@@ -0,0 +1,136 @@ package/bin/skills/training-data-pipeline/references/data-quality.md
# Data Quality Reference

## Overview

Data quality directly determines model quality. This reference covers validation,
filtering, and quality metrics for training data.

## Quality Dimensions

| Dimension | What It Measures | Target |
|-----------|-----------------|--------|
| Correctness | Are responses factually accurate? | Manual review sample |
| Consistency | Do similar inputs produce similar outputs? | Low variance on paraphrases |
| Completeness | Are responses thorough? | Task-dependent length targets |
| Format compliance | Do responses match required format? | 100% schema validation pass |
| Diversity | Does the dataset cover the input space? | distinct-2 > 0.5 |
| Deduplication | Are near-duplicates removed? | < 5% duplicate rate |
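
The measurable rows of this table can be enforced as a single pre-training gate; a minimal sketch, assuming the `validate_jsonl`, `distinct_n`, and `deduplicate_dataset` helpers from the main SKILL.md are in scope:

```python
def quality_gate(filepath, examples):
    """Fail fast if the dataset misses the measurable targets above."""
    checks = {}

    # Format compliance: 100% of lines must pass schema validation
    result = validate_jsonl(filepath)
    checks["format_compliance"] = len(result["errors"]) == 0

    # Diversity: distinct-2 over assistant responses must exceed 0.5
    responses = [ex["messages"][-1]["content"] for ex in examples]
    checks["diversity"] = distinct_n(responses, 2) > 0.5

    # Deduplication: MinHash dedup should remove < 5% of examples
    deduped = deduplicate_dataset(examples, threshold=0.8)
    checks["duplicate_rate"] = (len(examples) - len(deduped)) / len(examples) < 0.05

    failed = [name for name, ok in checks.items() if not ok]
    if failed:
        raise ValueError(f"Quality gate failed: {failed}")
    return checks
```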

## Automated Quality Checks

### Token Length Distribution

```python
import numpy as np
from transformers import AutoTokenizer

def token_length_analysis(examples, model_name="meta-llama/Llama-3.1-8B"):
    """Analyze token lengths to catch outliers and set training params."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    lengths = []
    for ex in examples:
        text = tokenizer.apply_chat_template(ex["messages"], tokenize=False)
        tokens = tokenizer.encode(text)
        lengths.append(len(tokens))

    lengths = np.array(lengths)
    return {
        "count": len(lengths),
        "mean": float(np.mean(lengths)),
        "median": float(np.median(lengths)),
        "p95": float(np.percentile(lengths, 95)),
        "p99": float(np.percentile(lengths, 99)),
        "max": int(np.max(lengths)),
        "recommended_max_seq_length": int(np.percentile(lengths, 99) * 1.1),
    }
```

### Response Quality Scoring

Use an LLM judge to score training examples:

```python
import json
import openai

client = openai.OpenAI()

def score_example_quality(example, criteria, model="gpt-4o-mini"):
    """Score a training example on a 1-5 scale using an LLM judge."""
    user_msg = next(m["content"] for m in example["messages"] if m["role"] == "user")
    assistant_msg = next(m["content"] for m in example["messages"] if m["role"] == "assistant")

    prompt = f"""Rate this response on a 1-5 scale for each criterion.

Input: {user_msg}
Response: {assistant_msg}

Criteria:
{criteria}

Return JSON: {{"scores": {{"criterion_name": score, ...}}, "overall": score, "reasoning": "..."}}"""

    # Call the judge and parse its JSON verdict (OpenAI client assumed, per the default model)
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
    )
    return json.loads(response.choices[0].message.content)

# Filter examples below threshold (e.g., drop anything with overall < 3)
```

## PII Detection and Redaction

```python
import re

PII_PATTERNS = {
    "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    "phone": r'\b(?:\+?1[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b',
    "ssn": r'\b\d{3}-\d{2}-\d{4}\b',
    "credit_card": r'\b(?:\d{4}[-\s]?){3}\d{4}\b',
    "ip_address": r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',
}

def redact_pii(text, patterns=PII_PATTERNS):
    """Replace PII patterns with placeholder tokens."""
    for name, pattern in patterns.items():
        text = re.sub(pattern, f"[{name.upper()}_REDACTED]", text)
    return text
```
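
To apply redaction across a dataset, run it over every message before examples are written out; a short usage sketch:

```python
def redact_example(example):
    """Run redact_pii over every message in a training example."""
    for msg in example["messages"]:
        msg["content"] = redact_pii(msg["content"])
    return example
```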

## Dataset Health Report

```python
import json

def dataset_health_report(filepath):
    """Generate a comprehensive health report for a training dataset."""
    examples = []
    with open(filepath) as f:
        for line in f:
            examples.append(json.loads(line))

    # Basic stats
    report = {
        "total_examples": len(examples),
        "avg_turns_per_example": sum(
            len(ex["messages"]) for ex in examples
        ) / len(examples),
    }

    # Role distribution
    roles = {}
    for ex in examples:
        for msg in ex["messages"]:
            roles[msg["role"]] = roles.get(msg["role"], 0) + 1
    report["role_distribution"] = roles

    # Length stats
    response_lengths = [
        len(ex["messages"][-1]["content"].split())
        for ex in examples
    ]
    report["response_word_count"] = {
        "min": min(response_lengths),
        "max": max(response_lengths),
        "mean": sum(response_lengths) / len(response_lengths),
    }

    # Empty/short responses
    short = sum(1 for l in response_lengths if l < 10)
    report["short_responses"] = f"{short} ({short/len(examples)*100:.1f}%)"

    return report
```

@@ -0,0 +1,129 @@ package/bin/skills/training-data-pipeline/references/frontier-distillation.md
# Frontier Distillation Reference

## Overview

Frontier distillation uses a large teacher model (GPT-4o, Claude, Gemini) to generate
high-quality labels for your production inputs. The student model learns to replicate
the teacher's behavior on your specific task at a fraction of the inference cost.

## When to Use

- You have production inputs but no gold-standard labels
- You want to match frontier quality on a specific task
- Volume justifies the one-time labeling cost (labels are reusable)
- Your task is narrow enough that a smaller model can learn it

## Batch API Comparison

| Provider | API | Discount | Turnaround | Max Batch |
|----------|-----|----------|------------|-----------|
| OpenAI | Batch API | 50% off | Up to 24h | 50,000 requests |
| Anthropic | Message Batches | 50% off | Up to 24h | 100,000 requests |
| Google | Batch Predict | Varies | Hours | Large |

## Distillation Prompt Design

The quality of distilled data depends on the prompt. Be explicit about format, style, and constraints.

```python
DISTILLATION_SYSTEM_PROMPT = """You are generating training data for a specialized model.

Task: {task_description}

Requirements:
- Output format: {format_spec}
- Tone: {tone}
- Length: {length_constraint}
- Must include: {required_elements}
- Must NOT include: {forbidden_elements}

Produce the highest quality response possible. This will be used as a training
target for a smaller model."""
```

### Key Principles

1. **Be explicit**: the teacher model should know exactly what format you need
2. **Include constraints**: length, format, required sections, forbidden content
3. **Match production conditions**: use the same system prompt you use in production
4. **Verify quality**: sample and manually review 50-100 examples before using all of them
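
A quick way to do that review is to dump a random sample to a file for manual inspection; a minimal sketch (the output file name is illustrative):

```python
import json
import random

def sample_for_review(examples, k=100, outfile="review_sample.jsonl"):
    """Write a random sample of distilled examples out for manual review."""
    sample = random.sample(examples, min(k, len(examples)))
    with open(outfile, "w") as f:
        for ex in sample:
            f.write(json.dumps(ex) + "\n")
    print(f"Wrote {len(sample)} examples to {outfile}; review before training")
```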

## Quality Filtering

Not all teacher outputs are good training data. Filter before training:

```python
def filter_distilled_data(examples, min_length=50, max_length=4000):
    """Filter distilled examples by quality heuristics."""
    filtered = []
    for ex in examples:
        response = ex["messages"][-1]["content"]

        # Length check (in characters)
        if len(response) < min_length or len(response) > max_length:
            continue

        # Refusal detection
        refusal_phrases = [
            "I cannot", "I'm unable to", "I don't have access",
            "As an AI", "I'm not able to"
        ]
        if any(phrase.lower() in response.lower() for phrase in refusal_phrases):
            continue

        # Format compliance (customize per task)
        # if not response.startswith("{"):  # e.g., JSON output expected
        #     continue

        filtered.append(ex)

    print(f"Kept {len(filtered)}/{len(examples)} ({len(filtered)/len(examples)*100:.1f}%)")
    return filtered
```

## Cost Estimation

```python
def estimate_distillation_cost(num_examples, avg_input_tokens, avg_output_tokens, model="gpt-4o"):
    """Estimate batch distillation cost."""
    # Batch API prices (50% of real-time)
    prices = {
        "gpt-4o": {"input": 1.25, "output": 5.00},        # per 1M tokens, batch
        "gpt-4o-mini": {"input": 0.075, "output": 0.30},  # per 1M tokens, batch
        "claude-sonnet": {"input": 1.50, "output": 7.50}, # per 1M tokens, batch
    }
    p = prices.get(model, prices["gpt-4o"])

    input_cost = (num_examples * avg_input_tokens / 1_000_000) * p["input"]
    output_cost = (num_examples * avg_output_tokens / 1_000_000) * p["output"]
    total = input_cost + output_cost

    return {
        "model": model,
        "examples": num_examples,
        "input_cost": f"${input_cost:.2f}",
        "output_cost": f"${output_cost:.2f}",
        "total_cost": f"${total:.2f}",
    }
```
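
As a usage check, labeling 10,000 inputs that average 600 input and 800 output tokens with gpt-4o at the batch prices above comes to $7.50 of input plus $40.00 of output:

```python
print(estimate_distillation_cost(10_000, 600, 800, model="gpt-4o"))
# {'model': 'gpt-4o', 'examples': 10000, 'input_cost': '$7.50',
#  'output_cost': '$40.00', 'total_cost': '$47.50'}
```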

## Multi-Model Distillation

Using multiple teacher models reduces single-model bias:

```python
def multi_teacher_distillation(inputs, system_prompt, models=None):
    """Generate labels from multiple teachers and take the majority or best."""
    models = models or ["gpt-4o", "claude-sonnet-4-5-20250929"]

    # Generate labels from each teacher. generate_labels() is a placeholder
    # for the batch helpers in SKILL.md (create_batch_file /
    # create_anthropic_batch plus result collection), returning one response
    # string per input.
    all_labels = {model: generate_labels(inputs, system_prompt, model) for model in models}

    # Strategy 1: Use the best model as primary, the others for validation
    primary = all_labels[models[0]]

    # Strategy 2: Use agreement as a quality signal
    # Keep examples where all teachers agree (highest confidence)

    return primary  # Or implement agreement filtering (see the sketch below)
```
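
Strategy 2 can be implemented as a similarity gate between teacher outputs; a minimal sketch using the stdlib's `difflib` (the 0.8 cutoff is an assumption to tune per task):

```python
from difflib import SequenceMatcher

def agreement_filter(inputs, all_labels, primary_model, min_similarity=0.8):
    """Keep (input, primary label) pairs where all teachers roughly agree."""
    kept = []
    other_models = [m for m in all_labels if m != primary_model]
    for i, user_input in enumerate(inputs):
        primary = all_labels[primary_model][i]
        if all(
            SequenceMatcher(None, primary, all_labels[m][i]).ratio() >= min_similarity
            for m in other_models
        ):
            kept.append({"messages": [
                {"role": "user", "content": user_input},
                {"role": "assistant", "content": primary},
            ]})
    return kept
```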