@patricio0312rev/skillset 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/CHANGELOG.md +29 -0
  2. package/LICENSE +21 -0
  3. package/README.md +176 -0
  4. package/bin/cli.js +37 -0
  5. package/package.json +55 -0
  6. package/src/commands/init.js +301 -0
  7. package/src/index.js +168 -0
  8. package/src/lib/config.js +200 -0
  9. package/src/lib/generator.js +166 -0
  10. package/src/utils/display.js +95 -0
  11. package/src/utils/readme.js +196 -0
  12. package/src/utils/tool-specific.js +233 -0
  13. package/templates/ai-engineering/agent-orchestration-planner/SKILL.md +266 -0
  14. package/templates/ai-engineering/cost-latency-optimizer/SKILL.md +270 -0
  15. package/templates/ai-engineering/doc-to-vector-dataset-generator/SKILL.md +239 -0
  16. package/templates/ai-engineering/evaluation-harness/SKILL.md +219 -0
  17. package/templates/ai-engineering/guardrails-safety-filter-builder/SKILL.md +226 -0
  18. package/templates/ai-engineering/llm-debugger/SKILL.md +283 -0
  19. package/templates/ai-engineering/prompt-regression-tester/SKILL.md +216 -0
  20. package/templates/ai-engineering/prompt-template-builder/SKILL.md +393 -0
  21. package/templates/ai-engineering/rag-pipeline-builder/SKILL.md +244 -0
  22. package/templates/ai-engineering/tool-function-schema-designer/SKILL.md +219 -0
  23. package/templates/architecture/adr-writer/SKILL.md +250 -0
  24. package/templates/architecture/api-versioning-deprecation-planner/SKILL.md +331 -0
  25. package/templates/architecture/domain-model-boundaries-mapper/SKILL.md +300 -0
  26. package/templates/architecture/migration-planner/SKILL.md +376 -0
  27. package/templates/architecture/performance-budget-setter/SKILL.md +318 -0
  28. package/templates/architecture/reliability-strategy-builder/SKILL.md +286 -0
  29. package/templates/architecture/rfc-generator/SKILL.md +362 -0
  30. package/templates/architecture/scalability-playbook/SKILL.md +279 -0
  31. package/templates/architecture/system-design-generator/SKILL.md +339 -0
  32. package/templates/architecture/tech-debt-prioritizer/SKILL.md +329 -0
  33. package/templates/backend/api-contract-normalizer/SKILL.md +487 -0
  34. package/templates/backend/api-endpoint-generator/SKILL.md +415 -0
  35. package/templates/backend/auth-module-builder/SKILL.md +99 -0
  36. package/templates/backend/background-jobs-designer/SKILL.md +166 -0
  37. package/templates/backend/caching-strategist/SKILL.md +190 -0
  38. package/templates/backend/error-handling-standardizer/SKILL.md +174 -0
  39. package/templates/backend/rate-limiting-abuse-protection/SKILL.md +147 -0
  40. package/templates/backend/rbac-permissions-builder/SKILL.md +158 -0
  41. package/templates/backend/service-layer-extractor/SKILL.md +269 -0
  42. package/templates/backend/webhook-receiver-hardener/SKILL.md +211 -0
  43. package/templates/ci-cd/artifact-sbom-publisher/SKILL.md +236 -0
  44. package/templates/ci-cd/caching-strategy-optimizer/SKILL.md +195 -0
  45. package/templates/ci-cd/deployment-checklist-generator/SKILL.md +381 -0
  46. package/templates/ci-cd/github-actions-pipeline-creator/SKILL.md +348 -0
  47. package/templates/ci-cd/monorepo-ci-optimizer/SKILL.md +298 -0
  48. package/templates/ci-cd/preview-environments-builder/SKILL.md +187 -0
  49. package/templates/ci-cd/quality-gates-enforcer/SKILL.md +342 -0
  50. package/templates/ci-cd/release-automation-builder/SKILL.md +281 -0
  51. package/templates/ci-cd/rollback-workflow-builder/SKILL.md +372 -0
  52. package/templates/ci-cd/secrets-env-manager/SKILL.md +242 -0
  53. package/templates/db-management/backup-restore-runbook-generator/SKILL.md +505 -0
  54. package/templates/db-management/data-integrity-auditor/SKILL.md +505 -0
  55. package/templates/db-management/data-retention-archiving-planner/SKILL.md +430 -0
  56. package/templates/db-management/data-seeding-fixtures-builder/SKILL.md +375 -0
  57. package/templates/db-management/db-performance-watchlist/SKILL.md +425 -0
  58. package/templates/db-management/etl-sync-job-builder/SKILL.md +457 -0
  59. package/templates/db-management/multi-tenant-safety-checker/SKILL.md +398 -0
  60. package/templates/db-management/prisma-migration-assistant/SKILL.md +379 -0
  61. package/templates/db-management/schema-consistency-checker/SKILL.md +440 -0
  62. package/templates/db-management/sql-query-optimizer/SKILL.md +324 -0
  63. package/templates/foundation/changelog-writer/SKILL.md +431 -0
  64. package/templates/foundation/code-formatter-installer/SKILL.md +320 -0
  65. package/templates/foundation/codebase-summarizer/SKILL.md +360 -0
  66. package/templates/foundation/dependency-doctor/SKILL.md +163 -0
  67. package/templates/foundation/dev-environment-bootstrapper/SKILL.md +259 -0
  68. package/templates/foundation/dev-onboarding-builder/SKILL.md +556 -0
  69. package/templates/foundation/docs-starter-kit/SKILL.md +574 -0
  70. package/templates/foundation/explaining-code/SKILL.md +13 -0
  71. package/templates/foundation/git-hygiene-enforcer/SKILL.md +455 -0
  72. package/templates/foundation/project-scaffolder/SKILL.md +65 -0
  73. package/templates/foundation/project-scaffolder/references/templates.md +126 -0
  74. package/templates/foundation/repo-structure-linter/SKILL.md +0 -0
  75. package/templates/foundation/repo-structure-linter/references/conventions.md +98 -0
  76. package/templates/frontend/animation-micro-interaction-pack/SKILL.md +41 -0
  77. package/templates/frontend/component-scaffold-generator/SKILL.md +562 -0
  78. package/templates/frontend/design-to-component-translator/SKILL.md +547 -0
  79. package/templates/frontend/form-wizard-builder/SKILL.md +553 -0
  80. package/templates/frontend/frontend-refactor-planner/SKILL.md +37 -0
  81. package/templates/frontend/i18n-frontend-implementer/SKILL.md +44 -0
  82. package/templates/frontend/modal-drawer-system/SKILL.md +377 -0
  83. package/templates/frontend/page-layout-builder/SKILL.md +630 -0
  84. package/templates/frontend/state-ux-flow-builder/SKILL.md +23 -0
  85. package/templates/frontend/table-builder/SKILL.md +350 -0
  86. package/templates/performance/alerting-dashboard-builder/SKILL.md +162 -0
  87. package/templates/performance/backend-latency-profiler-helper/SKILL.md +108 -0
  88. package/templates/performance/caching-cdn-strategy-planner/SKILL.md +150 -0
  89. package/templates/performance/capacity-planning-helper/SKILL.md +242 -0
  90. package/templates/performance/core-web-vitals-tuner/SKILL.md +126 -0
  91. package/templates/performance/incident-runbook-generator/SKILL.md +162 -0
  92. package/templates/performance/load-test-scenario-builder/SKILL.md +256 -0
  93. package/templates/performance/observability-setup/SKILL.md +232 -0
  94. package/templates/performance/postmortem-writer/SKILL.md +203 -0
  95. package/templates/performance/structured-logging-standardizer/SKILL.md +122 -0
  96. package/templates/security/auth-security-reviewer/SKILL.md +428 -0
  97. package/templates/security/dependency-vulnerability-triage/SKILL.md +495 -0
  98. package/templates/security/input-validation-sanitization-auditor/SKILL.md +76 -0
  99. package/templates/security/pii-redaction-logging-policy-builder/SKILL.md +65 -0
  100. package/templates/security/rbac-policy-tester/SKILL.md +80 -0
  101. package/templates/security/secrets-scanner/SKILL.md +462 -0
  102. package/templates/security/secure-headers-csp-builder/SKILL.md +404 -0
  103. package/templates/security/security-incident-playbook-generator/SKILL.md +76 -0
  104. package/templates/security/security-pr-checklist-skill/SKILL.md +62 -0
  105. package/templates/security/threat-model-generator/SKILL.md +394 -0
  106. package/templates/testing/contract-testing-builder/SKILL.md +492 -0
  107. package/templates/testing/coverage-strategist/SKILL.md +436 -0
  108. package/templates/testing/e2e-test-builder/SKILL.md +382 -0
  109. package/templates/testing/flaky-test-detective/SKILL.md +416 -0
  110. package/templates/testing/integration-test-builder/SKILL.md +525 -0
  111. package/templates/testing/mocking-assistant/SKILL.md +383 -0
  112. package/templates/testing/snapshot-test-refactorer/SKILL.md +375 -0
  113. package/templates/testing/test-data-factory-builder/SKILL.md +449 -0
  114. package/templates/testing/test-reporting-triage-skill/SKILL.md +469 -0
  115. package/templates/testing/unit-test-generator/SKILL.md +548 -0
package/templates/ai-engineering/cost-latency-optimizer/SKILL.md
@@ -0,0 +1,270 @@
---
name: cost-latency-optimizer
description: Reduces LLM costs and improves response times through caching, model selection, batching, and prompt optimization. Provides cost breakdowns, latency hotspots, and configuration recommendations. Use for "cost reduction", "performance optimization", "latency improvement", or "efficiency".
---

# Cost & Latency Optimizer

Optimize LLM applications for cost and performance.

## Cost Breakdown Analysis

```python
class CostAnalyzer:
    def __init__(self):
        self.costs = {
            "llm_calls": 0,
            "embeddings": 0,
            "tool_calls": 0,
        }
        self.counts = {
            "llm_calls": 0,
            "embeddings": 0,
        }

    def track_llm_call(self, tokens_in: int, tokens_out: int):
        # GPT-4 pricing (USD per 1K tokens)
        cost = (tokens_in / 1000) * 0.03 + (tokens_out / 1000) * 0.06
        self.costs["llm_calls"] += cost
        self.counts["llm_calls"] += 1

    def report(self):
        calls = self.counts["llm_calls"]
        return {
            "total_cost": sum(self.costs.values()),
            "breakdown": self.costs,
            "avg_cost_per_call": self.costs["llm_calls"] / calls if calls else 0.0,
        }
```
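
A quick usage sketch for the analyzer above (the token counts are illustrative):

```python
analyzer = CostAnalyzer()
analyzer.track_llm_call(tokens_in=1200, tokens_out=300)  # one tracked request
print(analyzer.report())
# {'total_cost': 0.054, 'breakdown': {...}, 'avg_cost_per_call': 0.054}
```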

## Caching Strategy

```python
import hashlib

class LLMCache:
    def __init__(self, redis_client):
        self.cache = redis_client  # any client with get/setex, e.g. redis-py
        self.ttl = 3600  # 1 hour

    def get_cache_key(self, prompt: str, model: str) -> str:
        content = f"{model}:{prompt}"
        return f"llm_cache:{hashlib.sha256(content.encode()).hexdigest()}"

    def get(self, prompt: str, model: str):
        key = self.get_cache_key(prompt, model)
        return self.cache.get(key)

    def set(self, prompt: str, model: str, response: str):
        key = self.get_cache_key(prompt, model)
        self.cache.setex(key, self.ttl, response)

# Usage (redis_client and llm are assumed to be provided elsewhere)
cache = LLMCache(redis_client)

def cached_llm_call(prompt: str, model: str = "gpt-4"):
    # Check cache
    cached = cache.get(prompt, model)
    if cached:
        return cached

    # Call LLM
    response = llm(prompt, model=model)

    # Cache result
    cache.set(prompt, model, response)

    return response
```

## Model Selection

```python
# USD per 1K tokens
MODEL_PRICING = {
    "gpt-4": {"input": 0.03, "output": 0.06},
    "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
    "claude-3-opus": {"input": 0.015, "output": 0.075},
    "claude-3-sonnet": {"input": 0.003, "output": 0.015},
}

def select_model_by_complexity(query: str) -> str:
    """Use cheaper models for simple queries."""
    complexity = classify_complexity(query)

    if complexity == "simple":
        return "gpt-3.5-turbo"  # 60x cheaper input tokens than gpt-4
    elif complexity == "medium":
        return "claude-3-sonnet"
    else:
        return "gpt-4"

def classify_complexity(query: str) -> str:
    # Simple heuristics; swap in a classifier model if needed
    if len(query) < 100 and "?" in query:
        return "simple"
    elif any(word in query.lower() for word in ["analyze", "complex", "detailed"]):
        return "complex"
    return "medium"
```

## Prompt Optimization

```python
import re

def optimize_prompt(prompt: str) -> str:
    """Reduce token count while preserving meaning."""
    optimizations = [
        # Collapse runs of whitespace
        lambda p: re.sub(r'\s+', ' ', p),

        # Drop trailing examples if they are not critical
        lambda p: p.split("Examples:")[0] if "Examples:" in p else p,

        # Use abbreviations
        lambda p: p.replace("For example", "E.g."),
    ]

    for optimize in optimizations:
        prompt = optimize(prompt)

    return prompt.strip()

# Example: 500 tokens → 350 tokens = 30% cost reduction on input
```

## Batching

```python
import asyncio
from typing import List

async def batch_llm_calls(prompts: List[str], batch_size: int = 5):
    """Process multiple prompts in parallel batches (llm_async is assumed)."""
    results = []

    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i + batch_size]

        # Parallel execution within the batch
        batch_results = await asyncio.gather(*[
            llm_async(prompt) for prompt in batch
        ])

        results.extend(batch_results)

    return results

# 10 sequential calls: ~30 seconds
# 10 batched calls (5 parallel): ~6 seconds
```

## Latency Hotspot Analysis

```python
import time
from functools import wraps

class LatencyTracker:
    def __init__(self):
        self.timings = {}

    def track(self, operation: str):
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                start = time.time()
                result = func(*args, **kwargs)
                duration = time.time() - start

                self.timings.setdefault(operation, []).append(duration)

                return result
            return wrapper
        return decorator

    def report(self):
        return {
            op: {
                "count": len(times),
                "total": sum(times),
                "avg": sum(times) / len(times),
                "p95": sorted(times)[int(len(times) * 0.95)],
            }
            for op, times in self.timings.items()
        }

# Usage (llm is assumed)
tracker = LatencyTracker()

@tracker.track("llm_call")
def call_llm(prompt):
    return llm(prompt)

# After 100 calls
print(tracker.report())
# {"llm_call": {"avg": 2.3, "p95": 4.1, ...}}
```

## Optimization Recommendations

```python
def generate_recommendations(cost_analysis, latency_analysis):
    """cost_analysis is CostAnalyzer.report() output;
    latency_analysis is LatencyTracker.report() output."""
    recs = []

    # High LLM costs (more than $10 tracked)
    if cost_analysis["breakdown"]["llm_calls"] > 10:
        recs.append({
            "issue": "High LLM costs",
            "recommendation": "Implement caching for repeated queries",
            "impact": "50-80% cost reduction",
        })

    if cost_analysis["avg_cost_per_call"] > 0.01:
        recs.append({
            "issue": "Using expensive model for all queries",
            "recommendation": "Use gpt-3.5-turbo for simple queries",
            "impact": "60% cost reduction",
        })

    # High latency (average above 3 seconds)
    if latency_analysis["llm_call"]["avg"] > 3:
        recs.append({
            "issue": "High LLM latency",
            "recommendation": "Batch parallel calls, use streaming",
            "impact": "50% latency reduction",
        })

    return recs
```

## Streaming for Faster TTFB

```python
async def streaming_llm(prompt: str):
    """Stream tokens as they are generated (llm_stream is assumed)."""
    async for chunk in llm_stream(prompt):
        yield chunk
        # User sees partial response immediately

# Time to First Byte: ~200ms (streaming) vs ~2s (waiting for full response)
```
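
A minimal consumer sketch for the generator above; the prompt text is illustrative:

```python
import asyncio

async def main():
    async for token in streaming_llm("Summarize the release notes"):
        print(token, end="", flush=True)  # render tokens as they arrive

asyncio.run(main())
```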

## Best Practices

1. **Cache aggressively**: Serve repeated identical queries from cache
2. **Model selection**: Route simple queries to cheaper models
3. **Prompt optimization**: Trim unnecessary tokens from prompts
4. **Batching**: Run independent calls in parallel for throughput
5. **Streaming**: Improve perceived latency with incremental output
6. **Monitor costs**: Track spend per-user and per-feature
7. **Set budgets**: Alert on anomalies (see the sketch below)

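A minimal budget-alert sketch building on the `CostAnalyzer` above; `send_alert` and the budget figure are illustrative assumptions, not a prescribed API:

```python
class BudgetGuard:
    """Fires send_alert once when tracked spend crosses the budget."""

    def __init__(self, analyzer: CostAnalyzer, budget_usd: float, send_alert):
        self.analyzer = analyzer
        self.budget_usd = budget_usd
        self.send_alert = send_alert  # hypothetical hook, e.g. Slack or PagerDuty
        self.alerted = False

    def check(self):
        total = sum(self.analyzer.costs.values())
        if total > self.budget_usd and not self.alerted:
            self.send_alert(f"LLM spend ${total:.2f} exceeded budget ${self.budget_usd:.2f}")
            self.alerted = True

# Call check() after each tracked request, or on a timer
guard = BudgetGuard(CostAnalyzer(), budget_usd=10.0, send_alert=print)
guard.check()
```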

## Output Checklist

- [ ] Cost tracking implementation
- [ ] Caching layer
- [ ] Model selection logic
- [ ] Prompt optimization
- [ ] Batching for parallel calls
- [ ] Latency tracking
- [ ] Hotspot analysis
- [ ] Optimization recommendations
- [ ] Budget alerts
- [ ] Performance dashboard
package/templates/ai-engineering/doc-to-vector-dataset-generator/SKILL.md
@@ -0,0 +1,239 @@
---
name: doc-to-vector-dataset-generator
description: Converts documents into clean, chunked datasets suitable for embeddings and vector search. Produces chunked JSONL files with metadata, deduplication logic, and quality checks. Use when preparing "training data", "vector datasets", "document processing", or "embedding data".
---

# Doc-to-Vector Dataset Generator

Transform documents into high-quality vector search datasets.

## Pipeline Steps

1. **Extract text** from various formats (PDF, DOCX, HTML)
2. **Clean text** (remove noise, normalize)
3. **Chunk strategically** (semantic boundaries)
4. **Add metadata** (source, timestamps, classification)
5. **Deduplicate** (near-duplicate detection)
6. **Quality check** (length, content validation)
7. **Export JSONL** (one chunk per line)

## Text Extraction

```python
# PDF extraction (PyMuPDF; recent versions expose `import pymupdf`, older ones use `import fitz`)
import pymupdf

def extract_pdf(filepath: str) -> str:
    doc = pymupdf.open(filepath)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Markdown extraction
def extract_markdown(filepath: str) -> str:
    with open(filepath, encoding="utf-8") as f:
        return f.read()
```

## Text Cleaning

```python
import re
import unicodedata

def clean_text(text: str) -> str:
    # Collapse runs of whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove page numbers
    text = re.sub(r'Page \d+', '', text)

    # Remove URLs (optional)
    text = re.sub(r'http\S+', '', text)

    # Normalize unicode (compatibility form)
    text = unicodedata.normalize('NFKC', text)

    return text.strip()
```

## Semantic Chunking

```python
from typing import List

def semantic_chunk(text: str, max_chunk_size: int = 1000) -> List[str]:
    """Chunk at semantic boundaries (paragraphs, sentences)."""
    # Split by paragraphs first; a single paragraph longer than
    # max_chunk_size still becomes its own (oversized) chunk
    paragraphs = text.split('\n\n')

    chunks = []
    current_chunk = ""

    for para in paragraphs:
        if len(current_chunk) + len(para) <= max_chunk_size:
            current_chunk += para + "\n\n"
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = para + "\n\n"

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
```

## Metadata Extraction

````python
import hashlib
import re
from datetime import datetime

def extract_metadata(filepath: str, chunk: str, chunk_idx: int) -> dict:
    # Stable source id (built-in hash() is randomized per process)
    source_id = hashlib.sha256(filepath.encode()).hexdigest()[:12]
    return {
        "source": filepath,
        "chunk_id": f"{source_id}_{chunk_idx}",
        "chunk_index": chunk_idx,
        "char_count": len(chunk),
        "word_count": len(chunk.split()),
        "created_at": datetime.now().isoformat(),

        # Content classification (detect_language is assumed, e.g. langdetect)
        "has_code": bool(re.search(r'```|def |class |function', chunk)),
        "has_table": bool(re.search(r'\|.*\|', chunk)),
        "language": detect_language(chunk),
    }
````

## Deduplication

```python
from typing import List

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def deduplicate_chunks(chunks: List[dict], threshold: float = 0.95) -> List[dict]:
    """Remove near-duplicate chunks."""
    texts = [c["text"] for c in chunks]

    # Compute TF-IDF vectors
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(texts)

    # Compute pairwise similarity (O(n^2); fine for modest corpora)
    similarity_matrix = cosine_similarity(vectors)

    # Find duplicates, keeping the first occurrence
    to_remove = set()
    for i in range(len(chunks)):
        if i in to_remove:
            continue
        for j in range(i + 1, len(chunks)):
            if similarity_matrix[i][j] > threshold:
                to_remove.add(j)

    # Return unique chunks
    return [c for i, c in enumerate(chunks) if i not in to_remove]
```

## Quality Checks

```python
def quality_check(chunk: dict) -> bool:
    """Validate chunk quality."""
    text = chunk["text"]

    # Min length check
    if len(text) < 50:
        return False

    # Max length check
    if len(text) > 5000:
        return False

    # Content check (not just numbers/symbols)
    alpha_ratio = sum(c.isalpha() for c in text) / len(text)
    if alpha_ratio < 0.5:
        return False

    # Language check (English only)
    if chunk["metadata"]["language"] != "en":
        return False

    return True
```

## JSONL Export

```python
import json
from typing import List

def export_jsonl(chunks: List[dict], output_path: str):
    """Export chunks as JSONL (one JSON object per line)."""
    with open(output_path, 'w', encoding='utf-8') as f:
        for chunk in chunks:
            f.write(json.dumps(chunk, ensure_ascii=False) + '\n')
```

Example output format (one object per line; pretty-printed here for readability):

```json
{
  "text": "Chunk text content here...",
  "metadata": {
    "source": "docs/auth.md",
    "chunk_id": "abc123_0",
    "chunk_index": 0,
    "char_count": 542,
    "word_count": 89,
    "has_code": true
  }
}
```

## Complete Pipeline

```python
from glob import glob

def process_documents(input_dir: str, output_path: str):
    all_chunks = []

    # Process each document
    for filepath in glob(f"{input_dir}/**/*.md", recursive=True):
        # Extract and clean
        text = extract_markdown(filepath)
        text = clean_text(text)

        # Chunk
        chunks = semantic_chunk(text)

        # Add metadata
        for i, chunk in enumerate(chunks):
            chunk_obj = {
                "text": chunk,
                "metadata": extract_metadata(filepath, chunk, i)
            }

            # Quality check
            if quality_check(chunk_obj):
                all_chunks.append(chunk_obj)

    # Deduplicate
    unique_chunks = deduplicate_chunks(all_chunks)

    # Export
    export_jsonl(unique_chunks, output_path)

    print(f"Processed {len(unique_chunks)} chunks")
```

## Best Practices

- Chunk at semantic boundaries
- Attach rich metadata for filtering
- Deduplicate aggressively
- Use quality checks to keep garbage out of the index
- Use JSONL so datasets can be streamed
- Version your datasets (see the manifest sketch below)

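One way to version a dataset is to write a small manifest next to the JSONL; this sketch is illustrative (the manifest filename and fields are assumptions), recording a content hash so downstream jobs can detect when the dataset changed:

```python
import hashlib
import json
from datetime import datetime, timezone

def write_manifest(jsonl_path: str, manifest_path: str = "dataset.manifest.json"):
    # Hash the exported file so any re-run that changes content is detectable
    sha = hashlib.sha256()
    with open(jsonl_path, "rb") as f:
        for block in iter(lambda: f.read(8192), b""):
            sha.update(block)
    manifest = {
        "dataset": jsonl_path,
        "sha256": sha.hexdigest(),
        "created_at": datetime.now(timezone.utc).isoformat(),
    }
    with open(manifest_path, "w") as f:
        json.dump(manifest, f, indent=2)
```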

## Output Checklist

- [ ] Text extraction from all formats
- [ ] Cleaning pipeline implemented
- [ ] Semantic chunking strategy
- [ ] Metadata schema defined
- [ ] Deduplication logic
- [ ] Quality validation checks
- [ ] JSONL export format
- [ ] Dataset statistics logged