@musashishao/agent-kit 1.2.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/mcp-gateway/README.md +121 -0
- package/.agent/mcp-gateway/dist/index.d.ts +11 -0
- package/.agent/mcp-gateway/dist/index.js +504 -0
- package/.agent/mcp-gateway/dist/sync/debouncer.d.ts +56 -0
- package/.agent/mcp-gateway/dist/sync/debouncer.js +112 -0
- package/.agent/mcp-gateway/dist/sync/incremental_syncer.d.ts +58 -0
- package/.agent/mcp-gateway/dist/sync/incremental_syncer.js +172 -0
- package/.agent/mcp-gateway/dist/sync/index.d.ts +6 -0
- package/.agent/mcp-gateway/dist/sync/index.js +6 -0
- package/.agent/mcp-gateway/dist/sync/timestamp_checker.d.ts +69 -0
- package/.agent/mcp-gateway/dist/sync/timestamp_checker.js +169 -0
- package/.agent/mcp-gateway/package.json +28 -0
- package/.agent/mcp-gateway/src/index.ts +608 -0
- package/.agent/mcp-gateway/src/sync/debouncer.ts +129 -0
- package/.agent/mcp-gateway/src/sync/incremental_syncer.ts +237 -0
- package/.agent/mcp-gateway/src/sync/index.ts +7 -0
- package/.agent/mcp-gateway/src/sync/timestamp_checker.ts +194 -0
- package/.agent/scripts/ak_cli.py +533 -0
- package/.agent/scripts/setup_host.py +557 -0
- package/.agent/scripts/verify_install.py +174 -0
- package/.agent/skills/app-builder/SKILL.md +51 -1
- package/.agent/skills/app-builder/scripts/generate_ai_infra.py +510 -0
- package/.agent/skills/documentation-templates/SKILL.md +9 -1
- package/.agent/skills/documentation-templates/agents-template.md +202 -0
- package/.agent/skills/graph-mapper/SKILL.md +211 -0
- package/.agent/skills/graph-mapper/scripts/generate_graph.py +503 -0
- package/.agent/skills/rag-engineering/SKILL.md +342 -0
- package/.agent/skills/rag-engineering/chunking-strategies.md +229 -0
- package/.agent/skills/rag-engineering/contextual-retrieval.md +261 -0
- package/.agent/skills/rag-engineering/hybrid-search.md +356 -0
- package/.agent/skills/rag-engineering/scripts/chunk_code.py +606 -0
- package/.agent/templates/mcp_configs/claude_desktop.json +14 -0
- package/.agent/templates/mcp_configs/cursor.json +13 -0
- package/.agent/templates/mcp_configs/vscode.json +13 -0
- package/.agent/workflows/create.md +70 -2
- package/bin/cli.js +91 -0
- package/docs/AI_DATA_INFRASTRUCTURE.md +288 -0
- package/docs/CHANGELOG_AI_INFRA.md +111 -0
- package/package.json +7 -2
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: rag-engineering
|
|
3
|
+
description: Advanced RAG pipeline engineering for codebases. Contextual chunking, semantic embeddings, hybrid search, and reranking strategies. Build production-grade retrieval systems.
|
|
4
|
+
allowed-tools: Read, Write, Edit, Bash, Glob, Grep
|
|
5
|
+
skills:
|
|
6
|
+
- database-design
|
|
7
|
+
- architecture
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# RAG Engineering
|
|
11
|
+
|
|
12
|
+
> Build **Production-Grade Retrieval-Augmented Generation** pipelines for code.
|
|
13
|
+
|
|
14
|
+
## 🎯 Purpose
|
|
15
|
+
|
|
16
|
+
This skill teaches you to build RAG systems that actually work for large codebases (200k+ files). Standard RAG fails for code because:
|
|
17
|
+
- Random chunking breaks functions mid-logic
|
|
18
|
+
- Semantic search misses exact identifiers (`Error 503`)
|
|
19
|
+
- No context = AI retrieves wrong code
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## 🔧 Quick Reference
|
|
24
|
+
|
|
25
|
+
| File | Purpose |
|
|
26
|
+
|------|---------|
|
|
27
|
+
| [chunking-strategies.md](chunking-strategies.md) | Smart chunking by logical boundaries |
|
|
28
|
+
| [contextual-retrieval.md](contextual-retrieval.md) | Add context before embedding (Anthropic method) |
|
|
29
|
+
| [hybrid-search.md](hybrid-search.md) | Vector + BM25 combination |
|
|
30
|
+
| [scripts/chunk_code.py](scripts/chunk_code.py) | Code chunking script |
|
|
31
|
+
| [scripts/embed_chunks.py](scripts/embed_chunks.py) | Embedding generation script (not yet included in the package) |
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## 📊 The RAG Pipeline
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
|
39
|
+
│ Ingest │───►│ Chunk │───►│ Embed │───►│ Store │
|
|
40
|
+
│ (Files) │ │ (Smart) │ │ (Semantic) │ │ (Vector DB) │
|
|
41
|
+
└─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘
|
|
42
|
+
│
|
|
43
|
+
┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
|
44
|
+
│ Output │◄───│ Rerank │◄───│ Retrieve │◄──────────┘
|
|
45
|
+
│ (Context) │ │ (Top K) │ │ (Hybrid) │
|
|
46
|
+
└─────────────┘ └─────────────┘ └─────────────┘
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## 1. Smart Chunking (The Foundation)
|
|
52
|
+
|
|
53
|
+
### ❌ Bad: Random Chunking
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
Chunk 1: "function calculateTax(amount) { const rate = 0.1; retu"
|
|
57
|
+
Chunk 2: "rn amount * rate; } function calculateTotal(items) {"
|
|
58
|
+
```
|
|
59
|
+
→ AI gets broken, meaningless code
|
|
60
|
+
|
|
61
|
+
### ✅ Good: Logical Boundary Chunking
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
Chunk 1: "function calculateTax(amount) { const rate = 0.1; return amount * rate; }"
|
|
65
|
+
Chunk 2: "function calculateTotal(items) { ... }"
|
|
66
|
+
```
|
|
67
|
+
→ AI gets complete, understandable units
|
|
68
|
+
|
|
69
|
+
### Chunking Rules
|
|
70
|
+
|
|
71
|
+
| Language | Boundaries | Tool |
|
|
72
|
+
|----------|------------|------|
|
|
73
|
+
| TypeScript/JS | Function, Class, Module | tree-sitter, regex |
|
|
74
|
+
| Python | Function, Class, Module | ast module |
|
|
75
|
+
| Go | Function, Type, Package | go/parser |
|
|
76
|
+
|
|
77
|
+
### Chunk Metadata
|
|
78
|
+
|
|
79
|
+
Always attach metadata to chunks:
|
|
80
|
+
|
|
81
|
+
```json
|
|
82
|
+
{
|
|
83
|
+
"content": "function calculateTax(amount) { ... }",
|
|
84
|
+
"metadata": {
|
|
85
|
+
"file_path": "src/utils/tax.ts",
|
|
86
|
+
"type": "function",
|
|
87
|
+
"name": "calculateTax",
|
|
88
|
+
"start_line": 15,
|
|
89
|
+
"end_line": 20,
|
|
90
|
+
"exports": ["calculateTax"],
|
|
91
|
+
"imports": ["TAX_RATE from ./constants"]
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## 2. Contextual Retrieval (The Secret Sauce)
|
|
99
|
+
|
|
100
|
+
> **Anthropic's technique that reduces retrieval failures by up to 49% when contextual embeddings are combined with contextual BM25.**
|
|
101
|
+
|
|
102
|
+
### Problem
|
|
103
|
+
A chunk like `return amount * rate;` is meaningless without context.
|
|
104
|
+
|
|
105
|
+
### Solution
|
|
106
|
+
Before embedding, prepend a context summary:
|
|
107
|
+
|
|
108
|
+
```
|
|
109
|
+
[CONTEXT]
|
|
110
|
+
This chunk is from file `src/utils/tax.ts`.
|
|
111
|
+
It contains the function `calculateTax` which calculates tax for billing.
|
|
112
|
+
This function is used by the ShippingService and InvoiceGenerator.
|
|
113
|
+
[/CONTEXT]
|
|
114
|
+
|
|
115
|
+
function calculateTax(amount: number): number {
|
|
116
|
+
const rate = TAX_RATE;
|
|
117
|
+
return amount * rate;
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### How to Generate Context
|
|
122
|
+
|
|
123
|
+
Use an LLM to generate summaries:
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
def generate_context(chunk: str, file_path: str, project_summary: str) -> str:
    """Ask the LLM for a short summary that situates a code chunk.

    Args:
        chunk: Source text of the chunk to describe.
        file_path: Path of the file the chunk was taken from.
        project_summary: High-level description of the project, passed to
            the model as background.

    Returns:
        Whatever ``llm.generate`` returns for the prompt (the prompt asks
        for a 2-3 sentence summary to prepend to the chunk before
        embedding).
    """
    # Build the fence at runtime: a literal triple backtick in this snippet
    # would close the surrounding Markdown code block early.
    fence = "`" * 3
    prompt = f"""
Given this code chunk from {file_path}:
{fence}
{chunk}
{fence}

Project context: {project_summary}

Write a 2-3 sentence summary explaining:
1. What this code does
2. Where it fits in the project
3. What depends on it
"""
    return llm.generate(prompt)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## 3. Hybrid Search (Best of Both Worlds)
|
|
146
|
+
|
|
147
|
+
### Why Hybrid?
|
|
148
|
+
|
|
149
|
+
| Search Type | Strength | Weakness |
|
|
150
|
+
|-------------|----------|----------|
|
|
151
|
+
| **Vector (Semantic)** | Finds similar meaning | Misses exact terms |
|
|
152
|
+
| **BM25 (Keyword)** | Finds exact matches | Misses synonyms |
|
|
153
|
+
| **Hybrid** | Both! | Slightly more complex |
|
|
154
|
+
|
|
155
|
+
### Implementation Strategy
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
def hybrid_search(query: str, top_k: int = 20) -> List[Chunk]:
    """Retrieve chunks with both vector and BM25 search, fused via RRF.

    Args:
        query: Natural-language or keyword query.
        top_k: Number of candidates requested from each retriever and the
            maximum number of chunks returned.

    Returns:
        Up to ``top_k`` chunks ordered by descending fused score.
    """
    # 1. Vector search (semantic)
    vector_results = vector_db.search(embed(query), top_k=top_k)

    # 2. BM25 search (keyword)
    bm25_results = bm25_index.search(query, top_k=top_k)

    # 3. Reciprocal Rank Fusion (RRF)
    # RRF takes ONE argument holding all ranked lists; passing the lists
    # positionally would bind the BM25 results to its `k` parameter.
    fused = reciprocal_rank_fusion([vector_results, bm25_results])

    # RRF yields (chunk_id, score) pairs, so map the ids back to chunks.
    chunks_by_id = {
        chunk.id: chunk for chunk in (*vector_results, *bm25_results)
    }
    return [chunks_by_id[chunk_id] for chunk_id, _score in fused[:top_k]]
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### Reciprocal Rank Fusion (RRF)
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
def reciprocal_rank_fusion(results_list: List[List], k: int = 60) -> List:
    """Combine multiple ranked result lists using Reciprocal Rank Fusion.

    Each item contributes ``1 / (k + rank)`` for every list it appears in,
    where ``rank`` is its 1-based position in that list. Items are
    identified by their ``id`` attribute, so a chunk returned by several
    retrievers accumulates a higher score.

    Args:
        results_list: One ranked list per retriever, best match first.
            Every item must expose an ``id`` attribute.
        k: Smoothing constant; larger values flatten the score gap between
            top-ranked and lower-ranked items.

    Returns:
        ``(item_id, score)`` tuples sorted by descending fused score.
    """
    scores = {}

    for results in results_list:
        # Ranks start at 1, as in the RRF formula; a 0-based rank would
        # also divide by zero for the first item when k == 0.
        for rank, item in enumerate(results, start=1):
            scores[item.id] = scores.get(item.id, 0) + 1 / (k + rank)

    # Sort by combined score
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
## 4. Reranking (The Final Filter)
|
|
191
|
+
|
|
192
|
+
### Why Rerank?
|
|
193
|
+
|
|
194
|
+
Initial retrieval casts a wide net (for example, the top 100 candidates). Reranking then picks the best 10-20 to put into the context.
|
|
195
|
+
|
|
196
|
+
### Reranking Strategies
|
|
197
|
+
|
|
198
|
+
| Strategy | Accuracy | Speed | Cost |
|
|
199
|
+
|----------|----------|-------|------|
|
|
200
|
+
| **Cross-encoder** | Highest | Slow | High |
|
|
201
|
+
| **ColBERT** | High | Medium | Medium |
|
|
202
|
+
| **LLM-based** | High | Slow | High |
|
|
203
|
+
| **Cohere Rerank** | High | Fast | API cost |
|
|
204
|
+
|
|
205
|
+
### Simple Reranker Implementation
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
def rerank(query: str, chunks: List[Chunk], top_k: int = 10) -> List[Chunk]:
    """Keep the ``top_k`` chunks the LLM judges most relevant to ``query``.

    Every chunk is scored with ``llm.score_relevance(query, chunk.content)``
    (one call per chunk), then the highest-scoring chunks are returned in
    descending score order.
    """
    # Pair each chunk with its relevance score.
    scored = [
        (candidate, llm.score_relevance(query, candidate.content))
        for candidate in chunks
    ]

    # Highest score first; sorted() is stable, so ties keep input order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return [candidate for candidate, _score in ranked[:top_k]]
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## 5. Vector Database Selection
|
|
225
|
+
|
|
226
|
+
### For Agent Kit (Local/Lightweight)
|
|
227
|
+
|
|
228
|
+
| Database | Best For | Setup |
|
|
229
|
+
|----------|----------|-------|
|
|
230
|
+
| **ChromaDB** | Local development | `pip install chromadb` |
|
|
231
|
+
| **SQLite + sqlite-vec** | Embedded apps | Single file |
|
|
232
|
+
| **LanceDB** | Large local datasets | `pip install lancedb` |
|
|
233
|
+
|
|
234
|
+
### For Production
|
|
235
|
+
|
|
236
|
+
| Database | Best For | Notes |
|
|
237
|
+
|----------|----------|-------|
|
|
238
|
+
| **Pinecone** | Managed, scalable | Free tier available |
|
|
239
|
+
| **Weaviate** | Self-hosted + managed | GraphQL API |
|
|
240
|
+
| **Qdrant** | High performance | Rust-based |
|
|
241
|
+
| **pgvector** | PostgreSQL users | Familiar stack |
|
|
242
|
+
|
|
243
|
+
---
|
|
244
|
+
|
|
245
|
+
## 6. Embedding Models
|
|
246
|
+
|
|
247
|
+
### For Code (Recommended)
|
|
248
|
+
|
|
249
|
+
| Model | Dimensions | Best For |
|
|
250
|
+
|-------|------------|----------|
|
|
251
|
+
| **Voyage Code 2** | 1536 | Code-specific |
|
|
252
|
+
| **text-embedding-3-large** | 3072 | General + Code |
|
|
253
|
+
| **Gemini Text Embedding** | 768 | Google ecosystem |
|
|
254
|
+
| **CodeBERT** | 768 | Open source |
|
|
255
|
+
|
|
256
|
+
### Embedding Best Practices
|
|
257
|
+
|
|
258
|
+
1. **Normalize embeddings** for cosine similarity
|
|
259
|
+
2. **Batch processing** for efficiency
|
|
260
|
+
3. **Cache embeddings** - don't re-embed unchanged files
|
|
261
|
+
4. **Dimension reduction** (e.g. PCA) if storage is a concern
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
## 7. Integration with Agent Kit
|
|
266
|
+
|
|
267
|
+
### Workflow
|
|
268
|
+
|
|
269
|
+
1. **On Project Init (`/create`):**
|
|
270
|
+
- Run chunking script
|
|
271
|
+
- Generate embeddings
|
|
272
|
+
- Store in local vector DB
|
|
273
|
+
|
|
274
|
+
2. **On Code Change (Pre-commit hook):**
|
|
275
|
+
- Detect changed files
|
|
276
|
+
- Re-chunk and re-embed only changes
|
|
277
|
+
- Update vector DB
|
|
278
|
+
|
|
279
|
+
3. **On AI Query:**
|
|
280
|
+
- Hybrid search for relevant chunks
|
|
281
|
+
- Rerank results
|
|
282
|
+
- Inject top chunks into context
|
|
283
|
+
|
|
284
|
+
### File Structure
|
|
285
|
+
|
|
286
|
+
```
|
|
287
|
+
project-root/
|
|
288
|
+
├── .agent/
|
|
289
|
+
│ ├── AGENTS.md # Layer 1: Map
|
|
290
|
+
│ ├── graph.json # Layer 2: Knowledge Graph
|
|
291
|
+
│ └── rag/ # Layer 3: RAG Data
|
|
292
|
+
│ ├── chunks.json # Chunked code
|
|
293
|
+
│ ├── embeddings.npy # Vector embeddings
|
|
294
|
+
│ └── index.db # Vector index (ChromaDB/SQLite)
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
---
|
|
298
|
+
|
|
299
|
+
## 📋 Implementation Checklist
|
|
300
|
+
|
|
301
|
+
### Phase 1: Basic RAG
|
|
302
|
+
- [ ] Implement logical chunking (by function/class)
|
|
303
|
+
- [ ] Choose embedding model
|
|
304
|
+
- [ ] Set up local vector DB (ChromaDB)
|
|
305
|
+
- [ ] Basic semantic search
|
|
306
|
+
|
|
307
|
+
### Phase 2: Enhanced RAG
|
|
308
|
+
- [ ] Add contextual summaries to chunks
|
|
309
|
+
- [ ] Implement BM25 for hybrid search
|
|
310
|
+
- [ ] Add RRF fusion
|
|
311
|
+
- [ ] Implement basic reranking
|
|
312
|
+
|
|
313
|
+
### Phase 3: Production RAG
|
|
314
|
+
- [ ] Incremental indexing (only changed files)
|
|
315
|
+
- [ ] Caching layer
|
|
316
|
+
- [ ] Evaluation metrics (hit rate, MRR)
|
|
317
|
+
- [ ] Monitoring and logging
|
|
318
|
+
|
|
319
|
+
---
|
|
320
|
+
|
|
321
|
+
## ⚠️ Common Pitfalls
|
|
322
|
+
|
|
323
|
+
| Pitfall | Solution |
|
|
324
|
+
|---------|----------|
|
|
325
|
+
| Chunking breaks code | Use AST-based chunking |
|
|
326
|
+
| Retrieving wrong file | Add file path to chunk text |
|
|
327
|
+
| Missing exact matches | Add BM25/keyword search |
|
|
328
|
+
| Too many irrelevant results | Implement reranking |
|
|
329
|
+
| Slow embedding | Batch + cache |
|
|
330
|
+
| Stale index | Automate re-indexing |
|
|
331
|
+
|
|
332
|
+
---
|
|
333
|
+
|
|
334
|
+
## 🔗 Related Skills
|
|
335
|
+
|
|
336
|
+
- `graph-mapper` - Knowledge Graph for dependencies
|
|
337
|
+
- `database-design` - Vector DB selection
|
|
338
|
+
- `mcp-builder` - Build MCP server for RAG queries
|
|
339
|
+
|
|
340
|
+
---
|
|
341
|
+
|
|
342
|
+
> **Remember:** RAG is not magic. It's engineering. Measure, iterate, improve.
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
# Chunking Strategies for Code
|
|
2
|
+
|
|
3
|
+
> Break code into meaningful, complete units - not random fragments.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Golden Rule
|
|
8
|
+
|
|
9
|
+
> **A chunk should be a complete, understandable unit of code.**
|
|
10
|
+
|
|
11
|
+
If you read a chunk in isolation, you should understand what it does.
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## Strategy 1: Function-Level Chunking
|
|
16
|
+
|
|
17
|
+
Best for: Most codebases
|
|
18
|
+
|
|
19
|
+
```typescript
|
|
20
|
+
// ✅ One chunk = One function
|
|
21
|
+
function calculateTax(amount: number): number {
|
|
22
|
+
const rate = 0.1;
|
|
23
|
+
return amount * rate;
|
|
24
|
+
}
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### Implementation
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import re
|
|
31
|
+
|
|
32
|
+
def chunk_by_functions_ts(code: str) -> list[dict]:
    """Chunk TypeScript/JavaScript source by top-level functions.

    This is a simplified, regex-based example. It finds ``function``
    declarations and block-bodied arrow functions assigned with
    ``const``/``let``/``var``, then scans forward to the matching closing
    brace. Braces inside strings, comments, or parameter type literals are
    not understood, so prefer an AST parser (e.g. tree-sitter) for robust
    extraction.

    Args:
        code: Full text of a TypeScript/JavaScript source file.

    Returns:
        One dict per function, in source order, with the keys ``name``,
        ``start_line`` and ``end_line`` (1-based, inclusive) and ``code``.
        Functions nested inside an already-captured function are not
        reported separately.
    """
    # Pattern for function declarations (verbose mode: whitespace and
    # comments inside the pattern are ignored).
    pattern = re.compile(r'''
        \b
        (?:export\s+)?                  # Optional export
        (?:default\s+)?                 # Optional default
        (?:async\s+)?                   # Optional async
        (?:
            function\b\s*\*?\s*(\w+)    # function name / function* name
            \s*\([^{;]*                 # parameters and return type
          |
            (?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?
            (?:\([^)]*\)(?:\s*:[^=]+)?|\w+)   # (params)[: type] or bare param
            \s*=>\s*                    # arrow
        )
        \{                              # Opening brace
    ''', re.VERBOSE)

    chunks = []
    pos = 0
    while (match := pattern.search(code, pos)) is not None:
        # Walk forward from the opening brace to its matching close.
        depth = 0
        end = None
        for idx in range(match.end() - 1, len(code)):
            char = code[idx]
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0:
                    end = idx + 1
                    break

        if end is None:
            # Unbalanced braces: skip this match rather than emit a
            # broken chunk.
            pos = match.end()
            continue

        chunks.append({
            'name': match.group(1) or match.group(2),
            'start_line': code.count('\n', 0, match.start()) + 1,
            'end_line': code.count('\n', 0, end) + 1,
            'code': code[match.start():end],
        })
        # Resume after this function so nested functions stay inside it.
        pos = end

    return chunks
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Strategy 2: Class-Level Chunking
|
|
54
|
+
|
|
55
|
+
Best for: OOP-heavy codebases
|
|
56
|
+
|
|
57
|
+
```typescript
|
|
58
|
+
// ✅ One chunk = One class (with all methods)
|
|
59
|
+
class UserService {
|
|
60
|
+
constructor(private db: Database) {}
|
|
61
|
+
|
|
62
|
+
async getUser(id: string): Promise<User> {
|
|
63
|
+
return this.db.findById(id);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
async createUser(data: UserData): Promise<User> {
|
|
67
|
+
return this.db.create(data);
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### When to Split Classes
|
|
73
|
+
|
|
74
|
+
If class > 500 lines:
|
|
75
|
+
- Chunk class definition + constructor
|
|
76
|
+
- Chunk each method separately
|
|
77
|
+
- Link via metadata
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Strategy 3: Module-Level Chunking
|
|
82
|
+
|
|
83
|
+
Best for: Utility files, configs
|
|
84
|
+
|
|
85
|
+
```typescript
|
|
86
|
+
// ✅ One chunk = Entire file (if small)
|
|
87
|
+
// constants.ts
|
|
88
|
+
export const API_URL = 'https://api.example.com';
|
|
89
|
+
export const MAX_RETRIES = 3;
|
|
90
|
+
export const TIMEOUT_MS = 5000;
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Size Limits
|
|
94
|
+
|
|
95
|
+
| File Size | Strategy |
|
|
96
|
+
|-----------|----------|
|
|
97
|
+
| < 100 lines | Entire file as one chunk |
|
|
98
|
+
| 100-500 lines | Split by function/class |
|
|
99
|
+
| > 500 lines | Split by function + subdivide large functions |
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Strategy 4: Semantic Chunking
|
|
104
|
+
|
|
105
|
+
Best for: Mixed content (code + docs)
|
|
106
|
+
|
|
107
|
+
Split by semantic meaning rather than syntax:
|
|
108
|
+
|
|
109
|
+
1. **Imports section** → One chunk
|
|
110
|
+
2. **Type definitions** → One chunk
|
|
111
|
+
3. **Main logic** → Multiple chunks
|
|
112
|
+
4. **Exports** → One chunk
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Chunk Overlap
|
|
117
|
+
|
|
118
|
+
### Why Overlap?
|
|
119
|
+
|
|
120
|
+
Context at boundaries gets lost without overlap.
|
|
121
|
+
|
|
122
|
+
### Recommended Overlap
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
CHUNK_SIZE = 1500 # tokens
|
|
126
|
+
OVERLAP = 200 # tokens
|
|
127
|
+
|
|
128
|
+
# Result: Each chunk shares 200 tokens with neighbors
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Implementation
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
def chunk_with_overlap(text: str, chunk_size: int, overlap: int) -> list[str]:
    """Split text into token windows that share ``overlap`` tokens.

    Args:
        text: Text to split; tokenized with ``tokenize``.
        chunk_size: Maximum number of tokens per chunk. Must be positive.
        overlap: Number of tokens each chunk shares with the previous one.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        Chunks in source order, each rebuilt with ``detokenize``. Empty
        input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would never advance the window.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = tokenize(text)
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(tokens), step):
        end = min(start + chunk_size, len(tokens))
        chunks.append(detokenize(tokens[start:end]))
        if end == len(tokens):
            # The window reached the end; another step would only emit a
            # chunk already fully contained in this one.
            break

    return chunks
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Metadata: The Critical Addition
|
|
150
|
+
|
|
151
|
+
Every chunk MUST have metadata:
|
|
152
|
+
|
|
153
|
+
```json
|
|
154
|
+
{
|
|
155
|
+
"id": "chunk_abc123",
|
|
156
|
+
"content": "function calculateTax(amount) { ... }",
|
|
157
|
+
"metadata": {
|
|
158
|
+
"file_path": "src/utils/tax.ts",
|
|
159
|
+
"file_type": "typescript",
|
|
160
|
+
"chunk_type": "function",
|
|
161
|
+
"name": "calculateTax",
|
|
162
|
+
"start_line": 15,
|
|
163
|
+
"end_line": 20,
|
|
164
|
+
"tokens": 45,
|
|
165
|
+
"hash": "sha256:...",
|
|
166
|
+
"dependencies": ["TAX_RATE"],
|
|
167
|
+
"dependents": ["ShippingService", "InvoiceGenerator"],
|
|
168
|
+
"last_modified": "2025-01-24T10:00:00Z"
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Language-Specific Parsers
|
|
176
|
+
|
|
177
|
+
### TypeScript/JavaScript
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
# Recommended: tree-sitter
|
|
181
|
+
npm install tree-sitter tree-sitter-typescript
|
|
182
|
+
|
|
183
|
+
# Alternative: @babel/parser
|
|
184
|
+
npm install @babel/parser
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Python
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
import ast
|
|
191
|
+
|
|
192
|
+
def extract_functions(code: str) -> list[dict]:
    """Extract every function definition from Python source.

    Both ``def`` and ``async def`` functions are returned, including
    methods and nested functions, in ``ast.walk`` order.

    Args:
        code: Python source text.

    Returns:
        One dict per function with the keys ``name``, ``start_line``,
        ``end_line`` and ``code`` (the function's source segment).

    Raises:
        SyntaxError: If ``code`` is not valid Python.
    """
    tree = ast.parse(code)
    functions = []

    for node in ast.walk(tree):
        # `async def` is a separate node type from `def`.
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            functions.append({
                'name': node.name,
                'start_line': node.lineno,
                'end_line': node.end_lineno,
                'code': ast.get_source_segment(code, node)
            })

    return functions
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### Go
|
|
209
|
+
|
|
210
|
+
```go
|
|
211
|
+
// Use go/parser package
|
|
212
|
+
import (
|
|
213
|
+
"go/parser"
|
|
214
|
+
"go/token"
|
|
215
|
+
)
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## Quality Checklist
|
|
221
|
+
|
|
222
|
+
Before indexing your chunks, verify:
|
|
223
|
+
|
|
224
|
+
- [ ] Chunk size is appropriate (500-1500 tokens)
|
|
225
|
+
- [ ] Overlap is configured (10-20%)
|
|
226
|
+
- [ ] Metadata is captured
|
|
227
|
+
- [ ] Complete units (no broken functions)
|
|
228
|
+
- [ ] Imports are preserved or referenced
|
|
229
|
+
- [ ] Comments are included with code
|