@synsci/cli-darwin-arm64 1.1.72 → 1.1.74
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/llm-as-judge-evaluation/SKILL.md +385 -0
- package/bin/skills/llm-as-judge-evaluation/references/pairwise-comparison.md +95 -0
- package/bin/skills/llm-as-judge-evaluation/references/scoring-rubrics.md +169 -0
- package/bin/skills/model-economics/SKILL.md +238 -0
- package/bin/skills/training-data-pipeline/SKILL.md +427 -0
- package/bin/skills/training-data-pipeline/references/data-quality.md +136 -0
- package/bin/skills/training-data-pipeline/references/frontier-distillation.md +129 -0
- package/bin/skills/training-data-pipeline/references/production-data-formatting.md +126 -0
- package/bin/synsc +0 -0
- package/package.json +1 -1
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# Data Quality Reference
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Data quality directly determines model quality. This reference covers validation,
|
|
6
|
+
filtering, and quality metrics for training data.
|
|
7
|
+
|
|
8
|
+
## Quality Dimensions
|
|
9
|
+
|
|
10
|
+
| Dimension | What It Measures | Target |
|
|
11
|
+
|-----------|-----------------|--------|
|
|
12
|
+
| Correctness | Are responses factually accurate? | Manual review sample |
|
|
13
|
+
| Consistency | Do similar inputs produce similar outputs? | Low variance on paraphrases |
|
|
14
|
+
| Completeness | Are responses thorough? | Task-dependent length targets |
|
|
15
|
+
| Format compliance | Do responses match required format? | 100% schema validation pass |
|
|
16
|
+
| Diversity | Does the dataset cover the input space? | distinct-2 > 0.5 |
|
|
17
|
+
| Deduplication | Are near-duplicates removed? | < 5% duplicate rate |
|
|
18
|
+
|
|
19
|
+
## Automated Quality Checks
|
|
20
|
+
|
|
21
|
+
### Token Length Distribution
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from transformers import AutoTokenizer
|
|
25
|
+
|
|
26
|
+
def token_length_analysis(examples, model_name="meta-llama/Llama-3.1-8B", tokenizer=None):
    """Analyze token lengths to catch outliers and set training params.

    Args:
        examples: Iterable of dicts, each holding a "messages" list that
            ``tokenizer.apply_chat_template`` can render.
        model_name: Model id passed to ``AutoTokenizer.from_pretrained`` when
            no ``tokenizer`` is supplied.
        tokenizer: Optional already-loaded tokenizer. Pass one to avoid
            reloading it on every call.

    Returns:
        Dict with "count", "mean", "median", "p95", "p99", "max" and
        "recommended_max_seq_length" (the 99th percentile plus 10% headroom,
        truncated to an int).

    Raises:
        ValueError: If ``examples`` contains no items.
    """
    import numpy as np

    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    lengths = []
    for ex in examples:
        text = tokenizer.apply_chat_template(ex["messages"], tokenize=False)
        # The rendered chat template normally already carries the model's
        # special tokens (e.g. BOS); letting encode() add them again would
        # inflate every length by the duplicated tokens.
        tokens = tokenizer.encode(text, add_special_tokens=False)
        lengths.append(len(tokens))

    if not lengths:
        # Percentiles of an empty array are undefined; fail with a clear
        # message instead of an obscure numpy error.
        raise ValueError("examples is empty; cannot compute token length statistics")

    lengths = np.array(lengths)
    p99 = float(np.percentile(lengths, 99))
    return {
        "count": len(lengths),
        "mean": float(np.mean(lengths)),
        "median": float(np.median(lengths)),
        "p95": float(np.percentile(lengths, 95)),
        "p99": p99,
        "max": int(np.max(lengths)),
        "recommended_max_seq_length": int(p99 * 1.1),
    }
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Response Quality Scoring
|
|
50
|
+
|
|
51
|
+
Use an LLM judge to score training examples:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
def score_example_quality(example, criteria, model="gpt-4o-mini"):
    """Score a training example on 1-5 scale using LLM judge.

    This is a skeleton: it builds the judge prompt from the first user and
    first assistant message, but the LLM call, response parsing and threshold
    filtering are left to the integrator, so it currently returns None.

    Args:
        example: Dict with a "messages" list of {"role", "content"} dicts.
        criteria: Text describing the scoring criteria; inserted verbatim
            into the prompt.
        model: Name of the judge model to call (unused until the call is
            implemented).

    Raises:
        ValueError: If the example has no user or no assistant message.
    """
    def first_content(role):
        # A bare next() would leak StopIteration, which is cryptic and is
        # turned into RuntimeError when raised inside a generator.
        message = next((m for m in example["messages"] if m["role"] == role), None)
        if message is None:
            raise ValueError(f"example has no {role!r} message")
        return message["content"]

    user_msg = first_content("user")
    assistant_msg = first_content("assistant")

    prompt = f"""Rate this response on a 1-5 scale for each criterion.

Input: {user_msg}
Response: {assistant_msg}

Criteria:
{criteria}

Return JSON: {{"scores": {{"criterion_name": score, ...}}, "overall": score, "reasoning": "..."}}"""

    # Call LLM and parse response
    # Filter examples below threshold (e.g., overall < 3)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## PII Detection and Redaction
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import re
|
|
77
|
+
|
|
78
|
+
# Regexes for common PII, keyed by the label used in the redaction token.
# Order matters: redact_pii applies them in dict order, so longer digit
# patterns (credit card, SSN) must run before the phone pattern. Otherwise
# the phone regex matches the middle of a separated card number
# ("4111 1111 1111 1111") and leaves the remaining digits in the text.
PII_PATTERNS = {
    # TLD class is [A-Za-z]; a "|" inside a character class is a literal pipe.
    "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
    "credit_card": r'\b(?:\d{4}[-\s]?){3}\d{4}\b',
    "ssn": r'\b\d{3}-\d{2}-\d{4}\b',
    "phone": r'\b(?:\+?1[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b',
    "ip_address": r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',
}


def redact_pii(text, patterns=PII_PATTERNS):
    """Replace PII patterns with placeholder tokens.

    Args:
        text: String to scrub.
        patterns: Mapping of label -> regex. Patterns are applied in the
            mapping's iteration order, and each match is replaced with
            "[<LABEL>_REDACTED]" (label upper-cased).

    Returns:
        The text with every match replaced.
    """
    for name, pattern in patterns.items():
        text = re.sub(pattern, f"[{name.upper()}_REDACTED]", text)
    return text
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Dataset Health Report
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
def dataset_health_report(filepath):
    """Generate a comprehensive health report for a training dataset.

    Args:
        filepath: Path to a JSONL file; each non-blank line is a JSON object
            with a "messages" list of {"role", "content"} dicts.

    Returns:
        Dict with:
          - "total_examples": number of examples read
          - "avg_turns_per_example": mean number of messages per example
          - "role_distribution": message count per role
          - "response_word_count": min / max / mean whitespace-separated
            word count of each example's last message
          - "short_responses": count and percentage of last messages with
            fewer than 10 words, formatted as "N (P%)"
        An empty file yields zeros rather than raising.
    """
    import json

    examples = []
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            # JSONL files often end with (or contain) blank lines; skip them
            # instead of failing in json.loads.
            if line.strip():
                examples.append(json.loads(line))

    total = len(examples)

    # Basic stats
    report = {
        "total_examples": total,
        "avg_turns_per_example": (
            sum(len(ex["messages"]) for ex in examples) / total if total else 0.0
        ),
    }

    # Role distribution
    roles = {}
    for ex in examples:
        for msg in ex["messages"]:
            roles[msg["role"]] = roles.get(msg["role"], 0) + 1
    report["role_distribution"] = roles

    # Length stats: the last message of each example is treated as the response.
    response_lengths = [
        len(ex["messages"][-1]["content"].split())
        for ex in examples
    ]
    report["response_word_count"] = {
        "min": min(response_lengths, default=0),
        "max": max(response_lengths, default=0),
        "mean": sum(response_lengths) / total if total else 0.0,
    }

    # Empty/short responses
    short = sum(1 for words in response_lengths if words < 10)
    short_pct = short / total * 100 if total else 0.0
    report["short_responses"] = f"{short} ({short_pct:.1f}%)"

    return report
|
|
136
|
+
```
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# Frontier Distillation Reference
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Frontier distillation uses a large teacher model (GPT-4o, Claude, Gemini) to generate
|
|
6
|
+
high-quality labels for your production inputs. The student model learns to replicate
|
|
7
|
+
the teacher's behavior on your specific task at a fraction of the inference cost.
|
|
8
|
+
|
|
9
|
+
## When to Use
|
|
10
|
+
|
|
11
|
+
- You have production inputs but no gold-standard labels
|
|
12
|
+
- You want to match frontier quality on a specific task
|
|
13
|
+
- Volume justifies the one-time labeling cost (labels are reusable)
|
|
14
|
+
- Your task is narrow enough that a smaller model can learn it
|
|
15
|
+
|
|
16
|
+
## Batch API Comparison
|
|
17
|
+
|
|
18
|
+
| Provider | API | Discount | Turnaround | Max Batch |
|
|
19
|
+
|----------|-----|----------|------------|-----------|
|
|
20
|
+
| OpenAI | Batch API | 50% off | Up to 24h | 50,000 requests |
|
|
21
|
+
| Anthropic | Message Batches | 50% off | Up to 24h | 100,000 requests |
|
|
22
|
+
| Google | Batch Predict | Varies | Hours | Large |
|
|
23
|
+
|
|
24
|
+
## Distillation Prompt Design
|
|
25
|
+
|
|
26
|
+
The quality of distilled data depends on the prompt. Be explicit about format, style, and constraints.
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
DISTILLATION_SYSTEM_PROMPT = """You are generating training data for a specialized model.
|
|
30
|
+
|
|
31
|
+
Task: {task_description}
|
|
32
|
+
|
|
33
|
+
Requirements:
|
|
34
|
+
- Output format: {format_spec}
|
|
35
|
+
- Tone: {tone}
|
|
36
|
+
- Length: {length_constraint}
|
|
37
|
+
- Must include: {required_elements}
|
|
38
|
+
- Must NOT include: {forbidden_elements}
|
|
39
|
+
|
|
40
|
+
Produce the highest quality response possible. This will be used as a training
|
|
41
|
+
target for a smaller model."""
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Key Principles
|
|
45
|
+
|
|
46
|
+
1. **Be explicit** — The teacher model should know exactly what format you need
|
|
47
|
+
2. **Include constraints** — Length, format, required sections, forbidden content
|
|
48
|
+
3. **Match production conditions** — Use the same system prompt you use in production
|
|
49
|
+
4. **Verify quality** — Sample and manually review 50-100 examples before using all
|
|
50
|
+
|
|
51
|
+
## Quality Filtering
|
|
52
|
+
|
|
53
|
+
Not all teacher outputs are good training data. Filter before training:
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
def filter_distilled_data(examples, min_length=50, max_length=4000):
    """Filter distilled examples by quality heuristics.

    An example is dropped when its last message is shorter than
    ``min_length`` or longer than ``max_length`` (measured in characters, not
    tokens), or when it contains a known refusal phrase (case-insensitive).
    Prints a one-line "Kept N/M" summary.

    Args:
        examples: Iterable of dicts with a "messages" list; the last
            message's "content" is treated as the teacher response.
        min_length: Minimum response length in characters (inclusive).
        max_length: Maximum response length in characters (inclusive).

    Returns:
        List of the examples that passed every check, in input order.
    """
    # Built and lower-cased once rather than on every iteration.
    refusal_phrases = tuple(
        phrase.lower()
        for phrase in (
            "I cannot", "I'm unable to", "I don't have access",
            "As an AI", "I'm not able to",
        )
    )

    filtered = []
    total = 0
    for ex in examples:
        total += 1
        response = ex["messages"][-1]["content"]

        # Length check
        if not min_length <= len(response) <= max_length:
            continue

        # Refusal detection. Model output often uses the typographic
        # apostrophe (U+2019); fold it to "'" so "I’m unable to" is caught.
        lowered = response.lower().replace("\u2019", "'")
        if any(phrase in lowered for phrase in refusal_phrases):
            continue

        # Format compliance (customize per task), e.g. when JSON output is
        # expected, skip responses that do not start with "{".

        filtered.append(ex)

    # Guard the percentage so an empty input reports 0.0% instead of raising.
    kept_pct = len(filtered) / total * 100 if total else 0.0
    print(f"Kept {len(filtered)}/{total} ({kept_pct:.1f}%)")
    return filtered
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Cost Estimation
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
def estimate_distillation_cost(num_examples, avg_input_tokens, avg_output_tokens, model="gpt-4o", prices=None):
    """Estimate batch distillation cost.

    Args:
        num_examples: Number of examples to label.
        avg_input_tokens: Average prompt tokens per example.
        avg_output_tokens: Average completion tokens per example.
        model: Model name or dated model id. Resolved to a price entry by
            exact match, then by the longest entry name that is a prefix of
            ``model`` (so "claude-sonnet-4-5-20250929" uses "claude-sonnet").
            Unmatched names fall back to "gpt-4o" pricing.
        prices: Optional mapping of name -> {"input": usd, "output": usd},
            both per 1M tokens, overriding the built-in table (which is a
            point-in-time snapshot and will go stale).

    Returns:
        Dict with "model", "examples", "input_cost", "output_cost" and
        "total_cost" (costs formatted as "$X.XX"), plus "priced_as": the
        price entry actually used, so a fallback is visible to the caller.

    Raises:
        ValueError: If ``model`` matches nothing and the table has no
            "gpt-4o" fallback entry.
    """
    # Batch API prices (50% of real-time)
    default_prices = {
        "gpt-4o": {"input": 1.25, "output": 5.00},  # per 1M tokens, batch
        "gpt-4o-mini": {"input": 0.075, "output": 0.30},  # per 1M tokens, batch
        "claude-sonnet": {"input": 1.50, "output": 7.50},  # per 1M tokens, batch
    }
    table = default_prices if prices is None else prices

    if model in table:
        priced_as = model
    else:
        # Longest prefix wins so "gpt-4o-mini-..." is not priced as "gpt-4o".
        candidates = [name for name in table if model.startswith(name)]
        if candidates:
            priced_as = max(candidates, key=len)
        elif "gpt-4o" in table:
            priced_as = "gpt-4o"
        else:
            raise ValueError(f"no price entry for model {model!r}")
    p = table[priced_as]

    input_cost = (num_examples * avg_input_tokens / 1_000_000) * p["input"]
    output_cost = (num_examples * avg_output_tokens / 1_000_000) * p["output"]
    total = input_cost + output_cost

    return {
        "model": model,
        "examples": num_examples,
        "input_cost": f"${input_cost:.2f}",
        "output_cost": f"${output_cost:.2f}",
        "total_cost": f"${total:.2f}",
        "priced_as": priced_as,
    }
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Multi-Model Distillation
|
|
111
|
+
|
|
112
|
+
Using multiple teacher models reduces single-model bias:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
def multi_teacher_distillation(inputs, system_prompt, models=None):
    """Generate labels from multiple teachers and take majority or best.

    Labels are generated with every teacher in ``models`` (defaulting to
    GPT-4o and Claude Sonnet), but only the first teacher's labels are
    returned; agreement-based filtering is left as an extension point.
    """
    teachers = models if models else ["gpt-4o", "claude-sonnet-4-5-20250929"]

    # Collect one set of labels per teacher.
    labels_by_teacher = {}
    for teacher in teachers:
        labels_by_teacher[teacher] = generate_labels(inputs, system_prompt, teacher)

    # Strategy 1: treat the first (best) teacher as primary and use the
    # remaining teachers only for validation.
    # Strategy 2: use cross-teacher agreement as a quality signal, keeping
    # only examples where all teachers agree (highest confidence).
    best = labels_by_teacher[teachers[0]]
    return best  # Or implement agreement filtering
|
|
129
|
+
```
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# Production Data Formatting Reference
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Production data is the most valuable training signal for model specialization. This reference covers patterns for extracting, cleaning, and formatting production data from common sources.
|
|
6
|
+
|
|
7
|
+
## Data Source Patterns
|
|
8
|
+
|
|
9
|
+
### REST API Logs
|
|
10
|
+
|
|
11
|
+
Most production systems log API requests and responses. Common formats:
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
# Typical API log structure
|
|
15
|
+
log_entry = {
    "timestamp": "2026-01-15T10:30:00Z",
    "request_id": "req_abc123",
    "user_id": "user_456",
    "endpoint": "/v1/chat/completions",
    # Request payload; "messages" is what gets extracted as the training input.
    "input": {
        "model": "gpt-4o",
        "messages": [...],  # placeholder for the request's chat messages
        "temperature": 0.7
    },
    # Response payload; the training target is choices[0].message.content.
    "output": {
        "choices": [{"message": {"content": "..."}}],
        "usage": {"prompt_tokens": 150, "completion_tokens": 200}
    },
    "latency_ms": 1200,
    "status": 200
}
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
**Extraction pattern**: Pull `input.messages` and `output.choices[0].message.content`, format into standard JSONL.
|
|
35
|
+
|
|
36
|
+
### Database Records
|
|
37
|
+
|
|
38
|
+
If your product stores LLM interactions in a database:
|
|
39
|
+
|
|
40
|
+
```sql
|
|
41
|
+
-- Export the last 90 days of interactions as training candidates.
-- target_response prefers the human-corrected text over the raw model output.
SELECT
    system_prompt,
    user_input,
    COALESCE(corrected_response, model_response) AS target_response,
    user_feedback
FROM llm_interactions
-- A bare `user_feedback != 'rejected'` evaluates to NULL (not true) for rows
-- with no feedback and would silently drop them; keep them explicitly so
-- they can be filtered by heuristics downstream.
WHERE (user_feedback IS NULL OR user_feedback != 'rejected')
  AND created_at > NOW() - INTERVAL '90 days'
ORDER BY created_at DESC;
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**Key**: Always prefer `corrected_response` over raw `model_response` when available.
|
|
53
|
+
|
|
54
|
+
### Structured Feedback
|
|
55
|
+
|
|
56
|
+
If users rate or edit model outputs:
|
|
57
|
+
|
|
58
|
+
| Signal | Quality | Use |
|
|
59
|
+
|--------|---------|-----|
|
|
60
|
+
| User edited output | Highest | Use edited version as training target |
|
|
61
|
+
| Thumbs up / accepted | High | Use original output as training target |
|
|
62
|
+
| Thumbs down / rejected | Medium | Exclude from SFT, use for DPO (rejected example) |
|
|
63
|
+
| No feedback | Low | Use with caution, filter by heuristics |
|
|
64
|
+
|
|
65
|
+
## Cleaning Pipeline
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
def clean_production_data(examples, redact=None):
    """Standard cleaning pipeline for production data.

    For each example: skip it when the first assistant message is missing or
    shorter than 10 characters (after stripping); collapse whitespace and
    redact PII in every message; then skip it when the first user message is
    shorter than 5 characters after cleaning (likely a test request).

    The input examples are not modified; cleaned copies of the message dicts
    are returned.

    Args:
        examples: Iterable of dicts with a "messages" list of
            {"role", "content"} dicts.
        redact: Optional callable ``str -> str`` used to scrub PII. Defaults
            to ``redact_pii``, which must be in scope at call time.

    Returns:
        List of {"messages": [...]} dicts for the examples that were kept.
    """
    if redact is None:
        redact = redact_pii

    cleaned = []
    for ex in examples:
        messages = ex["messages"]

        # Skip empty or trivial examples
        assistant_msg = next((m for m in messages if m["role"] == "assistant"), None)
        if not assistant_msg or len(assistant_msg["content"].strip()) < 10:
            continue

        # Normalize whitespace, then remove PII patterns (customize for your
        # domain). Copies are built so the caller's dicts are left untouched.
        # NOTE: " ".join(split()) also collapses newlines, which flattens
        # code blocks and lists inside the content.
        scrubbed = [
            {**msg, "content": redact(" ".join(msg["content"].split()))}
            for msg in messages
        ]

        # Skip if user input is too short (likely a test)
        user_msg = next((m for m in scrubbed if m["role"] == "user"), None)
        if user_msg and len(user_msg["content"].strip()) < 5:
            continue

        cleaned.append({"messages": scrubbed})

    return cleaned
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Multi-Turn Conversations
|
|
98
|
+
|
|
99
|
+
For products with multi-turn interactions, preserve the full conversation:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
def conversation_to_training(conversation):
    """Expand one multi-turn conversation into per-assistant-turn examples.

    Every assistant turn yields one training example whose "messages" list
    holds the conversation history up to and including that turn. Only the
    "role" and "content" of each turn are carried over.
    """
    history = [
        {"role": turn["role"], "content": turn["content"]}
        for turn in conversation["turns"]
    ]

    # One example per assistant turn, each a prefix of the full history.
    return [
        {"messages": history[: position + 1]}
        for position, message in enumerate(history)
        if message["role"] == "assistant"
    ]
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Volume Guidelines
|
|
119
|
+
|
|
120
|
+
| Dataset Size | Expected Quality | Recommended Approach |
|
|
121
|
+
|-------------|-----------------|---------------------|
|
|
122
|
+
| < 100 | Insufficient for SFT | Use synthetic bootstrapping first |
|
|
123
|
+
| 100-1,000 | Minimum viable | LoRA fine-tune, careful eval |
|
|
124
|
+
| 1,000-10,000 | Good | Standard LoRA or QLoRA |
|
|
125
|
+
| 10,000-100,000 | Strong | Full fine-tune viable |
|
|
126
|
+
| > 100,000 | Excellent | Multi-epoch training, curriculum learning |
|
package/bin/synsc
CHANGED
|
Binary file
|