@patricio0312rev/skillset 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +29 -0
- package/LICENSE +21 -0
- package/README.md +176 -0
- package/bin/cli.js +37 -0
- package/package.json +55 -0
- package/src/commands/init.js +301 -0
- package/src/index.js +168 -0
- package/src/lib/config.js +200 -0
- package/src/lib/generator.js +166 -0
- package/src/utils/display.js +95 -0
- package/src/utils/readme.js +196 -0
- package/src/utils/tool-specific.js +233 -0
- package/templates/ai-engineering/agent-orchestration-planner/ SKILL.md +266 -0
- package/templates/ai-engineering/cost-latency-optimizer/ SKILL.md +270 -0
- package/templates/ai-engineering/doc-to-vector-dataset-generator/ SKILL.md +239 -0
- package/templates/ai-engineering/evaluation-harness/ SKILL.md +219 -0
- package/templates/ai-engineering/guardrails-safety-filter-builder/ SKILL.md +226 -0
- package/templates/ai-engineering/llm-debugger/ SKILL.md +283 -0
- package/templates/ai-engineering/prompt-regression-tester/ SKILL.md +216 -0
- package/templates/ai-engineering/prompt-template-builder/ SKILL.md +393 -0
- package/templates/ai-engineering/rag-pipeline-builder/ SKILL.md +244 -0
- package/templates/ai-engineering/tool-function-schema-designer/ SKILL.md +219 -0
- package/templates/architecture/adr-writer/ SKILL.md +250 -0
- package/templates/architecture/api-versioning-deprecation-planner/ SKILL.md +331 -0
- package/templates/architecture/domain-model-boundaries-mapper/ SKILL.md +300 -0
- package/templates/architecture/migration-planner/ SKILL.md +376 -0
- package/templates/architecture/performance-budget-setter/ SKILL.md +318 -0
- package/templates/architecture/reliability-strategy-builder/ SKILL.md +286 -0
- package/templates/architecture/rfc-generator/ SKILL.md +362 -0
- package/templates/architecture/scalability-playbook/ SKILL.md +279 -0
- package/templates/architecture/system-design-generator/ SKILL.md +339 -0
- package/templates/architecture/tech-debt-prioritizer/ SKILL.md +329 -0
- package/templates/backend/api-contract-normalizer/ SKILL.md +487 -0
- package/templates/backend/api-endpoint-generator/ SKILL.md +415 -0
- package/templates/backend/auth-module-builder/ SKILL.md +99 -0
- package/templates/backend/background-jobs-designer/ SKILL.md +166 -0
- package/templates/backend/caching-strategist/ SKILL.md +190 -0
- package/templates/backend/error-handling-standardizer/ SKILL.md +174 -0
- package/templates/backend/rate-limiting-abuse-protection/ SKILL.md +147 -0
- package/templates/backend/rbac-permissions-builder/ SKILL.md +158 -0
- package/templates/backend/service-layer-extractor/ SKILL.md +269 -0
- package/templates/backend/webhook-receiver-hardener/ SKILL.md +211 -0
- package/templates/ci-cd/artifact-sbom-publisher/ SKILL.md +236 -0
- package/templates/ci-cd/caching-strategy-optimizer/ SKILL.md +195 -0
- package/templates/ci-cd/deployment-checklist-generator/ SKILL.md +381 -0
- package/templates/ci-cd/github-actions-pipeline-creator/ SKILL.md +348 -0
- package/templates/ci-cd/monorepo-ci-optimizer/ SKILL.md +298 -0
- package/templates/ci-cd/preview-environments-builder/ SKILL.md +187 -0
- package/templates/ci-cd/quality-gates-enforcer/ SKILL.md +342 -0
- package/templates/ci-cd/release-automation-builder/ SKILL.md +281 -0
- package/templates/ci-cd/rollback-workflow-builder/ SKILL.md +372 -0
- package/templates/ci-cd/secrets-env-manager/ SKILL.md +242 -0
- package/templates/db-management/backup-restore-runbook-generator/ SKILL.md +505 -0
- package/templates/db-management/data-integrity-auditor/ SKILL.md +505 -0
- package/templates/db-management/data-retention-archiving-planner/ SKILL.md +430 -0
- package/templates/db-management/data-seeding-fixtures-builder/ SKILL.md +375 -0
- package/templates/db-management/db-performance-watchlist/ SKILL.md +425 -0
- package/templates/db-management/etl-sync-job-builder/ SKILL.md +457 -0
- package/templates/db-management/multi-tenant-safety-checker/ SKILL.md +398 -0
- package/templates/db-management/prisma-migration-assistant/ SKILL.md +379 -0
- package/templates/db-management/schema-consistency-checker/ SKILL.md +440 -0
- package/templates/db-management/sql-query-optimizer/ SKILL.md +324 -0
- package/templates/foundation/changelog-writer/ SKILL.md +431 -0
- package/templates/foundation/code-formatter-installer/ SKILL.md +320 -0
- package/templates/foundation/codebase-summarizer/ SKILL.md +360 -0
- package/templates/foundation/dependency-doctor/ SKILL.md +163 -0
- package/templates/foundation/dev-environment-bootstrapper/ SKILL.md +259 -0
- package/templates/foundation/dev-onboarding-builder/ SKILL.md +556 -0
- package/templates/foundation/docs-starter-kit/ SKILL.md +574 -0
- package/templates/foundation/explaining-code/SKILL.md +13 -0
- package/templates/foundation/git-hygiene-enforcer/ SKILL.md +455 -0
- package/templates/foundation/project-scaffolder/ SKILL.md +65 -0
- package/templates/foundation/project-scaffolder/references/templates.md +126 -0
- package/templates/foundation/repo-structure-linter/ SKILL.md +0 -0
- package/templates/foundation/repo-structure-linter/references/conventions.md +98 -0
- package/templates/frontend/animation-micro-interaction-pack/ SKILL.md +41 -0
- package/templates/frontend/component-scaffold-generator/ SKILL.md +562 -0
- package/templates/frontend/design-to-component-translator/ SKILL.md +547 -0
- package/templates/frontend/form-wizard-builder/ SKILL.md +553 -0
- package/templates/frontend/frontend-refactor-planner/ SKILL.md +37 -0
- package/templates/frontend/i18n-frontend-implementer/ SKILL.md +44 -0
- package/templates/frontend/modal-drawer-system/ SKILL.md +377 -0
- package/templates/frontend/page-layout-builder/ SKILL.md +630 -0
- package/templates/frontend/state-ux-flow-builder/ SKILL.md +23 -0
- package/templates/frontend/table-builder/ SKILL.md +350 -0
- package/templates/performance/alerting-dashboard-builder/ SKILL.md +162 -0
- package/templates/performance/backend-latency-profiler-helper/ SKILL.md +108 -0
- package/templates/performance/caching-cdn-strategy-planner/ SKILL.md +150 -0
- package/templates/performance/capacity-planning-helper/ SKILL.md +242 -0
- package/templates/performance/core-web-vitals-tuner/ SKILL.md +126 -0
- package/templates/performance/incident-runbook-generator/ SKILL.md +162 -0
- package/templates/performance/load-test-scenario-builder/ SKILL.md +256 -0
- package/templates/performance/observability-setup/ SKILL.md +232 -0
- package/templates/performance/postmortem-writer/ SKILL.md +203 -0
- package/templates/performance/structured-logging-standardizer/ SKILL.md +122 -0
- package/templates/security/auth-security-reviewer/ SKILL.md +428 -0
- package/templates/security/dependency-vulnerability-triage/ SKILL.md +495 -0
- package/templates/security/input-validation-sanitization-auditor/ SKILL.md +76 -0
- package/templates/security/pii-redaction-logging-policy-builder/ SKILL.md +65 -0
- package/templates/security/rbac-policy-tester/ SKILL.md +80 -0
- package/templates/security/secrets-scanner/ SKILL.md +462 -0
- package/templates/security/secure-headers-csp-builder/ SKILL.md +404 -0
- package/templates/security/security-incident-playbook-generator/ SKILL.md +76 -0
- package/templates/security/security-pr-checklist-skill/ SKILL.md +62 -0
- package/templates/security/threat-model-generator/ SKILL.md +394 -0
- package/templates/testing/contract-testing-builder/ SKILL.md +492 -0
- package/templates/testing/coverage-strategist/ SKILL.md +436 -0
- package/templates/testing/e2e-test-builder/ SKILL.md +382 -0
- package/templates/testing/flaky-test-detective/ SKILL.md +416 -0
- package/templates/testing/integration-test-builder/ SKILL.md +525 -0
- package/templates/testing/mocking-assistant/ SKILL.md +383 -0
- package/templates/testing/snapshot-test-refactorer/ SKILL.md +375 -0
- package/templates/testing/test-data-factory-builder/ SKILL.md +449 -0
- package/templates/testing/test-reporting-triage-skill/ SKILL.md +469 -0
- package/templates/testing/unit-test-generator/ SKILL.md +548 -0
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: cost-latency-optimizer
|
|
3
|
+
description: Reduces LLM costs and improves response times through caching, model selection, batching, and prompt optimization. Provides cost breakdowns, latency hotspots, and configuration recommendations. Use for "cost reduction", "performance optimization", "latency improvement", or "efficiency".
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Cost & Latency Optimizer
|
|
7
|
+
|
|
8
|
+
Optimize LLM applications for cost and performance.
|
|
9
|
+
|
|
10
|
+
## Cost Breakdown Analysis
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
class CostAnalyzer:
    """Accumulate spend and call counts for an LLM application.

    Prices are per 1,000 tokens. The defaults are the GPT-4 rates the
    original implementation hard-coded, so ``CostAnalyzer()`` behaves as
    before; pass other rates to track a different model.
    """

    def __init__(self, input_price_per_1k: float = 0.03,
                 output_price_per_1k: float = 0.06):
        self.input_price_per_1k = input_price_per_1k
        self.output_price_per_1k = output_price_per_1k
        # Running spend per category.
        self.costs = {
            "llm_calls": 0,
            "embeddings": 0,
            "tool_calls": 0,
        }
        # Number of tracked events per category.
        self.counts = {
            "llm_calls": 0,
            "embeddings": 0,
        }

    def track_llm_call(self, tokens_in: int, tokens_out: int):
        """Record one LLM call and add its token cost to the running total."""
        cost = ((tokens_in / 1000) * self.input_price_per_1k
                + (tokens_out / 1000) * self.output_price_per_1k)
        self.costs["llm_calls"] += cost
        self.counts["llm_calls"] += 1

    def report(self):
        """Return total cost, a per-category breakdown and the average LLM call cost.

        The average is 0.0 when no LLM call has been tracked yet, instead of
        raising ZeroDivisionError. The breakdown is a copy, so callers cannot
        mutate the analyzer's internal totals through it.
        """
        calls = self.counts["llm_calls"]
        return {
            "total_cost": sum(self.costs.values()),
            "breakdown": dict(self.costs),
            "avg_cost_per_call": self.costs["llm_calls"] / calls if calls else 0.0,
        }
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Caching Strategy
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import hashlib
|
|
43
|
+
from functools import lru_cache
|
|
44
|
+
|
|
45
|
+
class LLMCache:
    """Response cache on top of a client that exposes ``get`` and ``setex``."""

    def __init__(self, redis_client):
        self.cache = redis_client
        self.ttl = 3600  # 1 hour

    def get_cache_key(self, prompt: str, model: str) -> str:
        """Return the ``llm_cache:``-prefixed SHA-256 key for a (model, prompt) pair."""
        digest = hashlib.sha256(f"{model}:{prompt}".encode()).hexdigest()
        return f"llm_cache:{digest}"

    def get(self, prompt: str, model: str):
        """Return whatever the client holds under the key for this pair."""
        return self.cache.get(self.get_cache_key(prompt, model))

    def set(self, prompt: str, model: str, response: str):
        """Store ``response`` under the pair's key, expiring after ``self.ttl``."""
        self.cache.setex(self.get_cache_key(prompt, model), self.ttl, response)
|
|
61
|
+
|
|
62
|
+
# Usage
|
|
63
|
+
cache = LLMCache(redis_client)


def cached_llm_call(prompt: str, model: str = "gpt-4"):
    """Return the LLM response for ``prompt``, consulting the cache first.

    On a miss the LLM is called and the response is stored under the
    (prompt, model) pair before it is returned.
    """
    cached = cache.get(prompt, model)
    # Compare against None instead of relying on truthiness: a cached empty
    # response is still a hit and must not trigger another paid LLM call.
    # NOTE(review): assumes the client returns None for a missing key - confirm.
    if cached is not None:
        return cached

    # Cache miss: call the model, then remember the answer.
    response = llm(prompt, model=model)
    cache.set(prompt, model, response)

    return response
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Model Selection
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
# Input/output token price per model, keyed by model name.
MODEL_PRICING = {
    model: {"input": input_price, "output": output_price}
    for model, input_price, output_price in (
        ("gpt-4", 0.03, 0.06),
        ("gpt-3.5-turbo", 0.0005, 0.0015),
        ("claude-3-opus", 0.015, 0.075),
        ("claude-3-sonnet", 0.003, 0.015),
    )
}
|
|
89
|
+
|
|
90
|
+
def select_model_by_complexity(query: str) -> str:
    """Map the query's complexity label to a model, preferring cheaper ones."""
    label = classify_complexity(query)
    model_for_label = {
        "simple": "gpt-3.5-turbo",  # 60x cheaper
        "medium": "claude-3-sonnet",
    }
    # Any other label falls through to the most capable model.
    return model_for_label.get(label, "gpt-4")
|
|
101
|
+
|
|
102
|
+
def classify_complexity(query: str) -> str:
    """Label a query as "simple", "medium" or "complex" using cheap heuristics.

    Explicit complexity keywords are checked first, so a short question such
    as "Can you analyze this?" is not routed to the cheapest tier merely
    because it is short and contains a question mark.
    """
    lowered = query.lower()
    if any(word in lowered for word in ("analyze", "complex", "detailed")):
        return "complex"
    # Short questions with no complexity keyword.
    if len(query) < 100 and "?" in query:
        return "simple"
    return "medium"
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Prompt Optimization
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
def optimize_prompt(prompt: str) -> str:
    """Reduce token count while preserving meaning.

    Applies, in order: whitespace collapsing, removal of everything from the
    first "Examples:" marker onward, and the "For example" -> "E.g."
    abbreviation. The result is stripped of leading/trailing whitespace.
    """
    # Local import: this snippet uses `re` without importing it.
    import re

    # Collapse every whitespace run to a single space.
    prompt = re.sub(r'\s+', ' ', prompt)

    # Drop the examples section; a prompt without the marker is left unchanged.
    prompt = prompt.split("Examples:", 1)[0]

    # Use abbreviations.
    prompt = prompt.replace("For example", "E.g.")

    return prompt.strip()
|
|
131
|
+
|
|
132
|
+
# Example: 500 tokens → 350 tokens = 30% cost reduction
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Batching
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
async def batch_llm_calls(prompts: List[str], batch_size: int = 5):
    """Run the prompts through llm_async, one batch of ``batch_size`` at a time.

    Prompts inside a batch are awaited together; batches run one after
    another. Results are returned in prompt order.
    """
    collected = []

    for offset in range(0, len(prompts), batch_size):
        # Start every call of this batch, then wait for all of them.
        pending = [llm_async(item) for item in prompts[offset:offset + batch_size]]
        collected += await asyncio.gather(*pending)

    return collected
|
|
153
|
+
|
|
154
|
+
# 10 sequential calls: ~30 seconds
|
|
155
|
+
# 10 batched calls (5 parallel): ~6 seconds
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Latency Hotspot Analysis
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
import time
|
|
162
|
+
|
|
163
|
+
class LatencyTracker:
    """Collect per-operation call durations and summarize them."""

    def __init__(self):
        # operation name -> list of durations in seconds
        self.timings = {}

    def track(self, operation: str):
        """Return a decorator that times each call and files it under ``operation``.

        Only calls that return normally are recorded; if the wrapped function
        raises, the exception propagates and no duration is stored.
        """
        import functools  # local import: the snippet only imports `time`

        def decorator(func):
            # Keep the wrapped function's name and docstring on the wrapper.
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                # perf_counter is monotonic, so system clock adjustments
                # cannot produce negative or skewed durations.
                start = time.perf_counter()
                result = func(*args, **kwargs)
                duration = time.perf_counter() - start

                self.timings.setdefault(operation, []).append(duration)

                return result
            return wrapper
        return decorator

    def report(self):
        """Return count, total, average and p95 duration for every operation."""
        return {
            op: {
                "count": len(times),
                "total": sum(times),
                "avg": sum(times) / len(times),
                # The index is always < len(times) because 0.95 * n < n.
                "p95": sorted(times)[int(len(times) * 0.95)]
            }
            for op, times in self.timings.items()
        }
|
|
192
|
+
|
|
193
|
+
# Usage
|
|
194
|
+
tracker = LatencyTracker()

# Every call to call_llm is timed and stored under the "llm_call" operation.
@tracker.track("llm_call")
def call_llm(prompt):
    return llm(prompt)

# After 100 calls
print(tracker.report())
|
|
202
|
+
# {"llm_call": {"avg": 2.3, "p95": 4.1, ...}}
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Optimization Recommendations
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
def generate_recommendations(cost_analysis, latency_analysis):
    """Turn cost and latency reports into a list of recommendation dicts.

    Args:
        cost_analysis: Report with per-category spend under "breakdown" (the
            key CostAnalyzer.report() produces; the older "costs" key is
            still accepted) and "avg_cost_per_call".
        latency_analysis: Mapping of operation name to stats containing "avg".

    Returns:
        A list of dicts with "issue", "recommendation" and "impact" keys.
        Missing sections are skipped instead of raising KeyError.
    """
    recs = []

    # High LLM costs
    breakdown = cost_analysis.get("breakdown") or cost_analysis.get("costs") or {}
    if breakdown.get("llm_calls", 0) > 10:
        recs.append({
            "issue": "High LLM costs",
            "recommendation": "Implement caching for repeated queries",
            "impact": "50-80% cost reduction",
        })

    if cost_analysis.get("avg_cost_per_call", 0) > 0.01:
        recs.append({
            "issue": "Using expensive model for all queries",
            "recommendation": "Use gpt-3.5-turbo for simple queries",
            "impact": "60% cost reduction",
        })

    # High latency (only when "llm_call" timings were actually collected)
    llm_latency = latency_analysis.get("llm_call")
    if llm_latency and llm_latency["avg"] > 3:
        recs.append({
            "issue": "High LLM latency",
            "recommendation": "Batch parallel calls, use streaming",
            "impact": "50% latency reduction",
        })

    return recs
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## Streaming for Faster TTFB
|
|
238
|
+
|
|
239
|
+
```python
|
|
240
|
+
async def streaming_llm(prompt: str):
    """Relay each chunk from llm_stream to the caller as soon as it arrives."""
    async for piece in llm_stream(prompt):
        yield piece
|
|
244
|
+
# User sees partial response immediately
|
|
245
|
+
|
|
246
|
+
# Time to First Byte: ~200ms (streaming) vs ~2s (waiting for full response)
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
## Best Practices
|
|
250
|
+
|
|
251
|
+
1. **Cache aggressively**: Identical queries cached
|
|
252
|
+
2. **Model selection**: Use cheaper models when possible
|
|
253
|
+
3. **Prompt optimization**: Reduce unnecessary tokens
|
|
254
|
+
4. **Batching**: Parallel execution for throughput
|
|
255
|
+
5. **Streaming**: Faster perceived latency
|
|
256
|
+
6. **Monitor costs**: Track per-user, per-feature
|
|
257
|
+
7. **Set budgets**: Alert on anomalies
|
|
258
|
+
|
|
259
|
+
## Output Checklist
|
|
260
|
+
|
|
261
|
+
- [ ] Cost tracking implementation
|
|
262
|
+
- [ ] Caching layer
|
|
263
|
+
- [ ] Model selection logic
|
|
264
|
+
- [ ] Prompt optimization
|
|
265
|
+
- [ ] Batching for parallel calls
|
|
266
|
+
- [ ] Latency tracking
|
|
267
|
+
- [ ] Hotspot analysis
|
|
268
|
+
- [ ] Optimization recommendations
|
|
269
|
+
- [ ] Budget alerts
|
|
270
|
+
- [ ] Performance dashboard
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: doc-to-vector-dataset-generator
|
|
3
|
+
description: Converts documents into clean, chunked datasets suitable for embeddings and vector search. Produces chunked JSONL files with metadata, deduplication logic, and quality checks. Use when preparing "training data", "vector datasets", "document processing", or "embedding data".
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Doc-to-Vector Dataset Generator
|
|
7
|
+
|
|
8
|
+
Transform documents into high-quality vector search datasets.
|
|
9
|
+
|
|
10
|
+
## Pipeline Steps
|
|
11
|
+
|
|
12
|
+
1. **Extract text** from various formats (PDF, DOCX, HTML)
|
|
13
|
+
2. **Clean text** (remove noise, normalize)
|
|
14
|
+
3. **Chunk strategically** (semantic boundaries)
|
|
15
|
+
4. **Add metadata** (source, timestamps, classification)
|
|
16
|
+
5. **Deduplicate** (near-duplicate detection)
|
|
17
|
+
6. **Quality check** (length, content validation)
|
|
18
|
+
7. **Export JSONL** (one chunk per line)
|
|
19
|
+
|
|
20
|
+
## Text Extraction
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
# PDF extraction
|
|
24
|
+
import pymupdf
|
|
25
|
+
|
|
26
|
+
def extract_pdf(filepath: str) -> str:
    """Return the concatenated text of every page of the PDF at ``filepath``."""
    # The context manager closes the document once the text is extracted;
    # the original never closed it.
    with pymupdf.open(filepath) as doc:
        # join() avoids rebuilding the string once per page.
        return "".join(page.get_text() for page in doc)
|
|
32
|
+
|
|
33
|
+
# Markdown extraction
|
|
34
|
+
def extract_markdown(filepath: str) -> str:
    """Return the full contents of the Markdown file at ``filepath``."""
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(filepath, encoding="utf-8") as f:
        return f.read()
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Text Cleaning
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import re
|
|
43
|
+
|
|
44
|
+
def clean_text(text: str) -> str:
    """Remove noise from extracted text while keeping paragraph breaks.

    Page markers ("Page N") and URLs are removed, whitespace inside each
    paragraph is collapsed to single spaces, and paragraphs stay separated
    by one blank line. The original collapsed every whitespace run,
    including blank lines, which left paragraph-based chunking nothing to
    split on.
    """
    import unicodedata  # local import: the snippet only imports `re`

    # Normalize unicode to composed form and drop code points that cannot be
    # encoded as UTF-8 (e.g. lone surrogates).
    text = unicodedata.normalize('NFC', text)
    text = text.encode('utf-8', 'ignore').decode('utf-8')

    # Remove page numbers
    text = re.sub(r'Page \d+', '', text)

    # Remove URLs (optional)
    text = re.sub(r'http\S+', '', text)

    # Collapse whitespace per paragraph; removing first and collapsing last
    # avoids the doubled spaces a removal would otherwise leave behind.
    paragraphs = (
        re.sub(r'\s+', ' ', para).strip()
        for para in re.split(r'\n\s*\n', text)
    )
    return "\n\n".join(para for para in paragraphs if para)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Semantic Chunking
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from typing import List


def semantic_chunk(text: str, max_chunk_size: int = 1000) -> List[str]:
    """Chunk at semantic boundaries (paragraphs, sentences).

    Paragraphs (separated by a blank line) are packed into chunks of at most
    ``max_chunk_size`` characters. A paragraph that is too long on its own is
    split at sentence ends, and a sentence that is still too long is cut by
    length, so no returned chunk exceeds the limit. Empty or
    whitespace-only text yields an empty list.

    Raises:
        ValueError: if ``max_chunk_size`` is not positive.
    """
    import re  # local import: this snippet does not import `re`

    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be positive")

    def split_oversized(paragraph):
        """Break an over-long paragraph at sentence ends, then by length."""
        pieces = []
        for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
            if len(sentence) <= max_chunk_size:
                pieces.append(sentence)
            else:
                pieces.extend(
                    sentence[start:start + max_chunk_size]
                    for start in range(0, len(sentence), max_chunk_size)
                )
        return pieces

    chunks = []
    current_chunk = ""

    for para in text.split('\n\n'):
        para = para.strip()
        if not para:
            continue

        units = [para] if len(para) <= max_chunk_size else split_oversized(para)
        for position, unit in enumerate(units):
            # A new paragraph is joined with a blank line; pieces of the same
            # split paragraph are joined with a single space.
            separator = "\n\n" if position == 0 else " "
            fits = len(current_chunk) + len(separator) + len(unit) <= max_chunk_size
            if current_chunk and fits:
                current_chunk += separator + unit
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = unit

    if current_chunk:
        chunks.append(current_chunk)

    return chunks
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Metadata Extraction
|
|
86
|
+
|
|
87
|
+
````python
|
|
88
|
+
def extract_metadata(filepath: str, chunk: str, chunk_idx: int) -> dict:
    """Build the metadata record for one chunk of a source document.

    ``chunk_id`` is derived from a SHA-256 digest of ``filepath``. The
    built-in ``hash()`` is salted per interpreter process for strings, so
    the original IDs changed from run to run and could be negative.
    """
    # Local imports: this snippet uses these names without importing them.
    import hashlib
    import re
    from datetime import datetime, timezone

    source_digest = hashlib.sha256(filepath.encode("utf-8")).hexdigest()[:12]

    return {
        "source": filepath,
        "chunk_id": f"{source_digest}_{chunk_idx}",
        "chunk_index": chunk_idx,
        "char_count": len(chunk),
        "word_count": len(chunk.split()),
        # Timezone-aware UTC timestamp, so records from different machines
        # compare correctly.
        "created_at": datetime.now(timezone.utc).isoformat(),

        # Content classification
        "has_code": bool(re.search(r'```|def |class |function', chunk)),
        "has_table": bool(re.search(r'\|.*\|', chunk)),
        "language": detect_language(chunk),
    }
|
|
102
|
+
````
|
|
103
|
+
|
|
104
|
+
## Deduplication
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
108
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
109
|
+
|
|
110
|
+
from typing import List


def deduplicate_chunks(chunks: List[dict], threshold: float = 0.95) -> List[dict]:
    """Remove near-duplicate chunks.

    Chunks are compared by the cosine similarity of their TF-IDF vectors.
    When two chunks score above ``threshold``, the later one is dropped and
    the first occurrence is kept. Fewer than two chunks are returned as-is:
    there is nothing to compare, and fitting the vectorizer on an empty
    list raises.
    """
    if len(chunks) < 2:
        return list(chunks)

    texts = [c["text"] for c in chunks]

    # Compute TF-IDF vectors
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(texts)

    # Compute pairwise similarity
    similarity_matrix = cosine_similarity(vectors)

    # Find duplicates; chunks already marked are not used as references.
    to_remove = set()
    for i in range(len(chunks)):
        if i in to_remove:
            continue
        for j in range(i + 1, len(chunks)):
            if similarity_matrix[i][j] > threshold:
                to_remove.add(j)

    # Return unique chunks
    return [c for i, c in enumerate(chunks) if i not in to_remove]
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Quality Checks
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
def quality_check(chunk: dict) -> bool:
    """Return True when the chunk passes the length, content and language gates."""
    body = chunk["text"]
    size = len(body)

    # Length gate: 50 to 5000 characters inclusive.
    if not 50 <= size <= 5000:
        return False

    # Content gate: at least half of the characters must be alphabetic.
    letters = sum(1 for ch in body if ch.isalpha())
    if letters / size < 0.5:
        return False

    # Language gate (English only); metadata is only read once the text passed.
    return chunk["metadata"]["language"] == "en"
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## JSONL Export
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
import json
|
|
165
|
+
|
|
166
|
+
def export_jsonl(chunks: List[dict], output_path: str):
    """Write every chunk to ``output_path`` as one JSON document per line."""
    with open(output_path, 'w') as sink:
        sink.writelines(f"{json.dumps(record)}\n" for record in chunks)
|
|
171
|
+
|
|
172
|
+
# Example output format (JSON, pretty-printed here for readability; in the file each chunk is a single line)
|
|
173
|
+
{
|
|
174
|
+
"text": "Chunk text content here...",
|
|
175
|
+
"metadata": {
|
|
176
|
+
"source": "docs/auth.md",
|
|
177
|
+
"chunk_id": "abc123_0",
|
|
178
|
+
"chunk_index": 0,
|
|
179
|
+
"char_count": 542,
|
|
180
|
+
"word_count": 89,
|
|
181
|
+
"has_code": true
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Complete Pipeline
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
def process_documents(input_dir: str, output_path: str):
    """Run the full pipeline over every Markdown file under ``input_dir``.

    Each file is extracted, cleaned and chunked; chunks that pass the
    quality check are deduplicated across all files and written to
    ``output_path`` as JSONL.
    """
    # Local imports: this snippet calls glob() without importing it.
    import os
    from glob import glob

    all_chunks = []

    # "**" only descends into subdirectories when recursive=True; without it
    # the pattern matched a single directory level. Sorting keeps the output
    # order stable between runs.
    pattern = os.path.join(input_dir, "**", "*.md")
    for filepath in sorted(glob(pattern, recursive=True)):
        # Extract and clean
        text = clean_text(extract_markdown(filepath))

        # Chunk, attach metadata, keep only chunks that pass the quality check
        for i, chunk in enumerate(semantic_chunk(text)):
            chunk_obj = {
                "text": chunk,
                "metadata": extract_metadata(filepath, chunk, i)
            }

            if quality_check(chunk_obj):
                all_chunks.append(chunk_obj)

    # Deduplicate
    unique_chunks = deduplicate_chunks(all_chunks)

    # Export
    export_jsonl(unique_chunks, output_path)

    print(f"Processed {len(unique_chunks)} chunks")
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## Best Practices
|
|
222
|
+
|
|
223
|
+
- Chunk at semantic boundaries
|
|
224
|
+
- Rich metadata for filtering
|
|
225
|
+
- Deduplicate aggressively
|
|
226
|
+
- Quality checks prevent garbage
|
|
227
|
+
- JSONL format for streaming
|
|
228
|
+
- Version your datasets
|
|
229
|
+
|
|
230
|
+
## Output Checklist
|
|
231
|
+
|
|
232
|
+
- [ ] Text extraction from all formats
|
|
233
|
+
- [ ] Cleaning pipeline implemented
|
|
234
|
+
- [ ] Semantic chunking strategy
|
|
235
|
+
- [ ] Metadata schema defined
|
|
236
|
+
- [ ] Deduplication logic
|
|
237
|
+
- [ ] Quality validation checks
|
|
238
|
+
- [ ] JSONL export format
|
|
239
|
+
- [ ] Dataset statistics logged
|