@synsci/cli-darwin-arm64 1.1.72 → 1.1.74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,136 @@
1
+ # Data Quality Reference
2
+
3
+ ## Overview
4
+
5
+ Data quality directly determines model quality. This reference covers validation,
6
+ filtering, and quality metrics for training data.
7
+
8
+ ## Quality Dimensions
9
+
10
+ | Dimension | What It Measures | Target |
11
+ |-----------|-----------------|--------|
12
+ | Correctness | Are responses factually accurate? | Manual review sample |
13
+ | Consistency | Do similar inputs produce similar outputs? | Low variance on paraphrases |
14
+ | Completeness | Are responses thorough? | Task-dependent length targets |
15
+ | Format compliance | Do responses match required format? | 100% schema validation pass |
16
+ | Diversity | Does the dataset cover the input space? | distinct-2 (unique bigram ratio) > 0.5 |
17
+ | Deduplication | Are near-duplicates removed? | < 5% duplicate rate |
18
+
19
+ ## Automated Quality Checks
20
+
21
+ ### Token Length Distribution
22
+
23
+ ```python
24
+ from transformers import AutoTokenizer
25
+
26
+ def token_length_analysis(examples, model_name="meta-llama/Llama-3.1-8B"):
27
+ """Analyze token lengths to catch outliers and set training params."""
28
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
29
+
30
+ lengths = []
31
+ for ex in examples:
32
+ text = tokenizer.apply_chat_template(ex["messages"], tokenize=False)
33
+ tokens = tokenizer.encode(text)
34
+ lengths.append(len(tokens))
35
+
36
+ import numpy as np
37
+ lengths = np.array(lengths)
38
+ return {
39
+ "count": len(lengths),
40
+ "mean": float(np.mean(lengths)),
41
+ "median": float(np.median(lengths)),
42
+ "p95": float(np.percentile(lengths, 95)),
43
+ "p99": float(np.percentile(lengths, 99)),
44
+ "max": int(np.max(lengths)),
45
+ "recommended_max_seq_length": int(np.percentile(lengths, 99) * 1.1),
46
+ }
47
+ ```
48
+
49
+ ### Response Quality Scoring
50
+
51
+ Use an LLM judge to score training examples:
52
+
53
+ ```python
54
+ def score_example_quality(example, criteria, model="gpt-4o-mini"):
55
+ """Score a training example on 1-5 scale using LLM judge."""
56
+ user_msg = next(m["content"] for m in example["messages"] if m["role"] == "user")
57
+ assistant_msg = next(m["content"] for m in example["messages"] if m["role"] == "assistant")
58
+
59
+ prompt = f"""Rate this response on a 1-5 scale for each criterion.
60
+
61
+ Input: {user_msg}
62
+ Response: {assistant_msg}
63
+
64
+ Criteria:
65
+ {criteria}
66
+
67
+ Return JSON: {{"scores": {{"criterion_name": score, ...}}, "overall": score, "reasoning": "..."}}"""
68
+
69
+ # Call LLM and parse response
70
+ # Filter examples below threshold (e.g., overall < 3)
71
+ ```
72
+
73
+ ## PII Detection and Redaction
74
+
75
+ ```python
76
+ import re
77
+
78
+ PII_PATTERNS = {
79
+ "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
80
+ "phone": r'\b(?:\+?1[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b',
81
+ "ssn": r'\b\d{3}-\d{2}-\d{4}\b',
82
+ "credit_card": r'\b(?:\d{4}[-\s]?){3}\d{4}\b',
83
+ "ip_address": r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',
84
+ }
85
+
86
+ def redact_pii(text, patterns=PII_PATTERNS):
87
+ """Replace PII patterns with placeholder tokens."""
88
+ for name, pattern in patterns.items():
89
+ text = re.sub(pattern, f"[{name.upper()}_REDACTED]", text)
90
+ return text
91
+ ```
92
+
93
+ ## Dataset Health Report
94
+
95
+ ```python
96
+ def dataset_health_report(filepath):
97
+ """Generate a comprehensive health report for a training dataset."""
98
+ import json
99
+
100
+ examples = []
101
+ with open(filepath) as f:
102
+ for line in f:
103
+ examples.append(json.loads(line))
104
+
105
+ # Basic stats
106
+ report = {
107
+ "total_examples": len(examples),
108
+ "avg_turns_per_example": sum(
109
+ len(ex["messages"]) for ex in examples
110
+ ) / len(examples),
111
+ }
112
+
113
+ # Role distribution
114
+ roles = {}
115
+ for ex in examples:
116
+ for msg in ex["messages"]:
117
+ roles[msg["role"]] = roles.get(msg["role"], 0) + 1
118
+ report["role_distribution"] = roles
119
+
120
+ # Length stats
121
+ response_lengths = [
122
+ len(ex["messages"][-1]["content"].split())
123
+ for ex in examples
124
+ ]
125
+ report["response_word_count"] = {
126
+ "min": min(response_lengths),
127
+ "max": max(response_lengths),
128
+ "mean": sum(response_lengths) / len(response_lengths),
129
+ }
130
+
131
+ # Empty/short responses
132
+ short = sum(1 for l in response_lengths if l < 10)
133
+ report["short_responses"] = f"{short} ({short/len(examples)*100:.1f}%)"
134
+
135
+ return report
136
+ ```
@@ -0,0 +1,129 @@
1
+ # Frontier Distillation Reference
2
+
3
+ ## Overview
4
+
5
+ Frontier distillation uses a large teacher model (GPT-4o, Claude, Gemini) to generate
6
+ high-quality labels for your production inputs. The student model learns to replicate
7
+ the teacher's behavior on your specific task at a fraction of the inference cost.
8
+
9
+ ## When to Use
10
+
11
+ - You have production inputs but no gold-standard labels
12
+ - You want to match frontier quality on a specific task
13
+ - Volume justifies the one-time labeling cost (labels are reusable)
14
+ - Your task is narrow enough that a smaller model can learn it
15
+
16
+ ## Batch API Comparison
17
+
18
+ | Provider | API | Discount | Turnaround | Max Batch |
19
+ |----------|-----|----------|------------|-----------|
20
+ | OpenAI | Batch API | 50% off | Up to 24h | 50,000 requests |
21
+ | Anthropic | Message Batches | 50% off | Up to 24h | 100,000 requests |
22
+ | Google | Batch Predict | Varies | Hours | Large |
23
+
24
+ ## Distillation Prompt Design
25
+
26
+ The quality of distilled data depends on the prompt. Be explicit about format, style, and constraints.
27
+
28
+ ```python
29
+ DISTILLATION_SYSTEM_PROMPT = """You are generating training data for a specialized model.
30
+
31
+ Task: {task_description}
32
+
33
+ Requirements:
34
+ - Output format: {format_spec}
35
+ - Tone: {tone}
36
+ - Length: {length_constraint}
37
+ - Must include: {required_elements}
38
+ - Must NOT include: {forbidden_elements}
39
+
40
+ Produce the highest quality response possible. This will be used as a training
41
+ target for a smaller model."""
42
+ ```
43
+
44
+ ### Key Principles
45
+
46
+ 1. **Be explicit** — The teacher model should know exactly what format you need
47
+ 2. **Include constraints** — Length, format, required sections, forbidden content
48
+ 3. **Match production conditions** — Use the same system prompt you use in production
49
+ 4. **Verify quality** — Sample and manually review 50-100 examples before training on the full set
50
+
51
+ ## Quality Filtering
52
+
53
+ Not all teacher outputs are good training data. Filter before training:
54
+
55
+ ```python
56
+ def filter_distilled_data(examples, min_length=50, max_length=4000):
57
+ """Filter distilled examples by quality heuristics."""
58
+ filtered = []
59
+ for ex in examples:
60
+ response = ex["messages"][-1]["content"]
61
+
62
+ # Length check
63
+ if len(response) < min_length or len(response) > max_length:
64
+ continue
65
+
66
+ # Refusal detection
67
+ refusal_phrases = [
68
+ "I cannot", "I'm unable to", "I don't have access",
69
+ "As an AI", "I'm not able to"
70
+ ]
71
+ if any(phrase.lower() in response.lower() for phrase in refusal_phrases):
72
+ continue
73
+
74
+ # Format compliance (customize per task)
75
+ # if not response.startswith("{"): # e.g., JSON output expected
76
+ # continue
77
+
78
+ filtered.append(ex)
79
+
80
+ print(f"Kept {len(filtered)}/{len(examples)} ({len(filtered)/len(examples)*100:.1f}%)")
81
+ return filtered
82
+ ```
83
+
84
+ ## Cost Estimation
85
+
86
+ ```python
87
+ def estimate_distillation_cost(num_examples, avg_input_tokens, avg_output_tokens, model="gpt-4o"):
88
+ """Estimate batch distillation cost."""
89
+ # Batch API prices (50% of real-time)
90
+ prices = {
91
+ "gpt-4o": {"input": 1.25, "output": 5.00}, # per 1M tokens, batch
92
+ "gpt-4o-mini": {"input": 0.075, "output": 0.30}, # per 1M tokens, batch
93
+ "claude-sonnet": {"input": 1.50, "output": 7.50}, # per 1M tokens, batch
94
+ }
95
+ p = prices.get(model, prices["gpt-4o"])
96
+
97
+ input_cost = (num_examples * avg_input_tokens / 1_000_000) * p["input"]
98
+ output_cost = (num_examples * avg_output_tokens / 1_000_000) * p["output"]
99
+ total = input_cost + output_cost
100
+
101
+ return {
102
+ "model": model,
103
+ "examples": num_examples,
104
+ "input_cost": f"${input_cost:.2f}",
105
+ "output_cost": f"${output_cost:.2f}",
106
+ "total_cost": f"${total:.2f}",
107
+ }
108
+ ```
109
+
110
+ ## Multi-Model Distillation
111
+
112
+ Using multiple teacher models reduces single-model bias:
113
+
114
+ ```python
115
+ def multi_teacher_distillation(inputs, system_prompt, models=None):
116
+ """Generate labels from multiple teachers and take majority or best."""
117
+ models = models or ["gpt-4o", "claude-sonnet-4-5-20250929"]
118
+
119
+ # Generate labels from each teacher
120
+ all_labels = {model: generate_labels(inputs, system_prompt, model) for model in models}
121
+
122
+ # Strategy 1: Use best model as primary, others for validation
123
+ primary = all_labels[models[0]]
124
+
125
+ # Strategy 2: Use agreement as quality signal
126
+ # Keep examples where all teachers agree (highest confidence)
127
+
128
+ return primary # Or implement agreement filtering
129
+ ```
@@ -0,0 +1,126 @@
1
+ # Production Data Formatting Reference
2
+
3
+ ## Overview
4
+
5
+ Production data is the most valuable training signal for model specialization. This reference covers patterns for extracting, cleaning, and formatting production data from common sources.
6
+
7
+ ## Data Source Patterns
8
+
9
+ ### REST API Logs
10
+
11
+ Most production systems log API requests and responses. Common formats:
12
+
13
+ ```python
14
+ # Typical API log structure
15
+ log_entry = {
16
+ "timestamp": "2026-01-15T10:30:00Z",
17
+ "request_id": "req_abc123",
18
+ "user_id": "user_456",
19
+ "endpoint": "/v1/chat/completions",
20
+ "input": {
21
+ "model": "gpt-4o",
22
+ "messages": [...],
23
+ "temperature": 0.7
24
+ },
25
+ "output": {
26
+ "choices": [{"message": {"content": "..."}}],
27
+ "usage": {"prompt_tokens": 150, "completion_tokens": 200}
28
+ },
29
+ "latency_ms": 1200,
30
+ "status": 200
31
+ }
32
+ ```
33
+
34
+ **Extraction pattern**: Pull `input.messages` and `output.choices[0].message.content`, then format them into standard chat-format JSONL.
35
+
36
+ ### Database Records
37
+
38
+ If your product stores LLM interactions in a database:
39
+
40
+ ```sql
41
+ SELECT
42
+ system_prompt,
43
+ user_input,
44
+ COALESCE(corrected_response, model_response) as target_response,
45
+ user_feedback
46
+ FROM llm_interactions
47
+ WHERE user_feedback != 'rejected'
48
+ AND created_at > NOW() - INTERVAL '90 days'
49
+ ORDER BY created_at DESC;
50
+ ```
51
+
52
+ **Key**: Always prefer `corrected_response` over raw `model_response` when available.
53
+
54
+ ### Structured Feedback
55
+
56
+ If users rate or edit model outputs:
57
+
58
+ | Signal | Quality | Use |
59
+ |--------|---------|-----|
60
+ | User edited output | Highest | Use edited version as training target |
61
+ | Thumbs up / accepted | High | Use original output as training target |
62
+ | Thumbs down / rejected | Medium | Exclude from SFT, use for DPO (rejected example) |
63
+ | No feedback | Low | Use with caution, filter by heuristics |
64
+
65
+ ## Cleaning Pipeline
66
+
67
+ ```python
68
+ def clean_production_data(examples):
69
+ """Standard cleaning pipeline for production data."""
70
+ cleaned = []
71
+ for ex in examples:
72
+ messages = ex["messages"]
73
+
74
+ # Skip empty or trivial examples
75
+ assistant_msg = next((m for m in messages if m["role"] == "assistant"), None)
76
+ if not assistant_msg or len(assistant_msg["content"].strip()) < 10:
77
+ continue
78
+
79
+ # Normalize whitespace
80
+ for msg in messages:
81
+ msg["content"] = " ".join(msg["content"].split())
82
+
83
+ # Remove PII patterns (customize for your domain)
84
+ for msg in messages:
85
+ msg["content"] = redact_pii(msg["content"])
86
+
87
+ # Skip if user input is too short (likely a test)
88
+ user_msg = next((m for m in messages if m["role"] == "user"), None)
89
+ if user_msg and len(user_msg["content"].strip()) < 5:
90
+ continue
91
+
92
+ cleaned.append({"messages": messages})
93
+
94
+ return cleaned
95
+ ```
96
+
97
+ ## Multi-Turn Conversations
98
+
99
+ For products with multi-turn interactions, preserve the full conversation:
100
+
101
+ ```python
102
+ def conversation_to_training(conversation):
103
+ """Convert a multi-turn conversation to training format.
104
+ Each assistant turn becomes a training example with full history."""
105
+ examples = []
106
+ messages = []
107
+
108
+ for turn in conversation["turns"]:
109
+ messages.append({"role": turn["role"], "content": turn["content"]})
110
+
111
+ # Create an example at each assistant turn
112
+ if turn["role"] == "assistant":
113
+ examples.append({"messages": list(messages)})
114
+
115
+ return examples
116
+ ```
117
+
118
+ ## Volume Guidelines
119
+
120
+ | Dataset Size | Expected Quality | Recommended Approach |
121
+ |-------------|-----------------|---------------------|
122
+ | < 100 | Insufficient for SFT | Use synthetic bootstrapping first |
123
+ | 100-1,000 | Minimum viable | LoRA fine-tune, careful eval |
124
+ | 1,000-10,000 | Good | Standard LoRA or QLoRA |
125
+ | 10,000-100,000 | Strong | Full fine-tune viable |
126
+ | > 100,000 | Excellent | Multi-epoch training, curriculum learning |
package/bin/synsc CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@synsci/cli-darwin-arm64",
3
- "version": "1.1.72",
3
+ "version": "1.1.74",
4
4
  "os": [
5
5
  "darwin"
6
6
  ],