@musashishao/agent-kit 1.2.2 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/mcp-gateway/README.md +121 -0
- package/.agent/mcp-gateway/dist/index.d.ts +11 -0
- package/.agent/mcp-gateway/dist/index.js +504 -0
- package/.agent/mcp-gateway/dist/sync/debouncer.d.ts +56 -0
- package/.agent/mcp-gateway/dist/sync/debouncer.js +112 -0
- package/.agent/mcp-gateway/dist/sync/incremental_syncer.d.ts +58 -0
- package/.agent/mcp-gateway/dist/sync/incremental_syncer.js +172 -0
- package/.agent/mcp-gateway/dist/sync/index.d.ts +6 -0
- package/.agent/mcp-gateway/dist/sync/index.js +6 -0
- package/.agent/mcp-gateway/dist/sync/timestamp_checker.d.ts +69 -0
- package/.agent/mcp-gateway/dist/sync/timestamp_checker.js +169 -0
- package/.agent/mcp-gateway/package.json +28 -0
- package/.agent/mcp-gateway/src/index.ts +608 -0
- package/.agent/mcp-gateway/src/sync/debouncer.ts +129 -0
- package/.agent/mcp-gateway/src/sync/incremental_syncer.ts +237 -0
- package/.agent/mcp-gateway/src/sync/index.ts +7 -0
- package/.agent/mcp-gateway/src/sync/timestamp_checker.ts +194 -0
- package/.agent/scripts/ak_cli.py +549 -0
- package/.agent/scripts/setup_host.py +557 -0
- package/.agent/scripts/verify_install.py +174 -0
- package/.agent/skills/app-builder/SKILL.md +51 -1
- package/.agent/skills/app-builder/scripts/generate_ai_infra.py +510 -0
- package/.agent/skills/documentation-templates/SKILL.md +9 -1
- package/.agent/skills/documentation-templates/agents-template.md +202 -0
- package/.agent/skills/graph-mapper/SKILL.md +211 -0
- package/.agent/skills/graph-mapper/scripts/generate_graph.py +705 -0
- package/.agent/skills/rag-engineering/SKILL.md +342 -0
- package/.agent/skills/rag-engineering/chunking-strategies.md +229 -0
- package/.agent/skills/rag-engineering/contextual-retrieval.md +261 -0
- package/.agent/skills/rag-engineering/hybrid-search.md +356 -0
- package/.agent/skills/rag-engineering/scripts/chunk_code.py +916 -0
- package/.agent/templates/mcp_configs/claude_desktop.json +14 -0
- package/.agent/templates/mcp_configs/cursor.json +13 -0
- package/.agent/templates/mcp_configs/vscode.json +13 -0
- package/.agent/workflows/create.md +70 -2
- package/bin/cli.js +91 -0
- package/docs/AI_DATA_INFRASTRUCTURE.md +288 -0
- package/docs/CHANGELOG_AI_INFRA.md +111 -0
- package/docs/PLAN-universal-intelligence.md +48 -0
- package/package.json +7 -2
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# Contextual Retrieval
|
|
2
|
+
|
|
3
|
+
> Anthropic's technique that reduces retrieval failures by 35-49%.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## The Problem
|
|
8
|
+
|
|
9
|
+
Standard chunks lack context:
|
|
10
|
+
|
|
11
|
+
```typescript
|
|
12
|
+
// This chunk is meaningless in isolation
|
|
13
|
+
return amount * rate;
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
Questions the AI can't answer:
|
|
17
|
+
- What is `amount`?
|
|
18
|
+
- Where does `rate` come from?
|
|
19
|
+
- What calls this code?
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## The Solution
|
|
24
|
+
|
|
25
|
+
Add a **context summary** before embedding:
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
[CONTEXT]
|
|
29
|
+
File: src/utils/tax.ts
|
|
30
|
+
Function: calculateTax
|
|
31
|
+
Purpose: Calculates tax amount for customer billing
|
|
32
|
+
Called by: ShippingService.calculateShipping(), InvoiceGenerator.generate()
|
|
33
|
+
Dependencies: TAX_RATE constant from ./constants.ts
|
|
34
|
+
[/CONTEXT]
|
|
35
|
+
|
|
36
|
+
function calculateTax(amount: number): number {
|
|
37
|
+
const rate = TAX_RATE;
|
|
38
|
+
return amount * rate;
|
|
39
|
+
}
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Now the embedding captures:
|
|
43
|
+
- File location
|
|
44
|
+
- Function purpose
|
|
45
|
+
- Callers and dependencies
|
|
46
|
+
- Business context
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Implementation
|
|
51
|
+
|
|
52
|
+
### Step 1: Gather Context Sources
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
def gather_context(chunk: dict, project_info: dict, graph: dict) -> dict:
    """Assemble every available context source for one chunk.

    Pulls location info from the chunk metadata, the file purpose from
    project info, and dependency neighbours from the knowledge graph,
    returning them as one flat dict ready for summary generation.
    """
    meta = chunk['metadata']
    path = meta['file_path']

    return {
        'file_path': path,
        'chunk_name': meta.get('name', 'unknown'),
        'file_purpose': get_file_purpose(path, project_info),
        'dependencies': graph.get_dependencies(path),
        'dependents': graph.get_dependents(path),
        'project_context': project_info.get('summary', ''),
    }
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Step 2: Generate Context Summary
|
|
72
|
+
|
|
73
|
+
Option A: **Template-based** (Fast, deterministic)
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
def generate_context_template(ctx: dict) -> str:
    """Render a deterministic [CONTEXT] header from template fields.

    Fast, LLM-free path: formats file location, chunk name, purpose and
    up to five graph neighbours in each direction. Empty neighbour lists
    render as the literal string 'None'.
    """
    importers = ', '.join(ctx['dependents'][:5]) or 'None'
    imported = ', '.join(ctx['dependencies'][:5]) or 'None'
    parts = [
        "[CONTEXT]",
        f"File: {ctx['file_path']}",
        f"Name: {ctx['chunk_name']}",
        f"Purpose: {ctx['file_purpose']}",
        f"Imported by: {importers}",
        f"Imports: {imported}",
        "[/CONTEXT]",
    ]
    return "\n".join(parts)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Option B: **LLM-generated** (Rich, semantic)
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
def generate_context_llm(chunk: str, ctx: dict) -> str:
    """Ask an LLM for a 2-3 sentence context summary of one chunk.

    Richer than the template path but costs one LLM call per chunk;
    output is capped at 100 tokens to keep the context header concise.
    """
    # The prompt embeds the chunk verbatim inside a fenced block so the
    # model sees exact code, plus the graph/project fields as plain text.
    request = f"""Analyze this code chunk and write a 2-3 sentence summary.

File: {ctx['file_path']}
Project: {ctx['project_context']}
Dependencies: {ctx['dependencies']}
Used by: {ctx['dependents']}

Code:
```
{chunk}
```

Summary (2-3 sentences explaining what this code does and its role):"""

    return llm.generate(request, max_tokens=100)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Step 3: Combine Context + Content
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
def create_contextual_chunk(chunk: dict, context: str) -> dict:
    """Prepend the context summary to the chunk body for embedding.

    The raw text is kept under 'original_content' so display and
    reranking can still use the unmodified code.
    """
    enriched = {
        'id': chunk['id'],
        'content': f"{context}\n\n{chunk['content']}",
        'original_content': chunk['content'],
        'context': context,
        'metadata': chunk['metadata'],
    }
    return enriched
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Context Sources (Priority Order)
|
|
131
|
+
|
|
132
|
+
| Source | Information | Priority |
|
|
133
|
+
|--------|-------------|----------|
|
|
134
|
+
| **File Path** | Location in project | P0 (Always) |
|
|
135
|
+
| **AGENTS.md** | Project overview | P0 (Always) |
|
|
136
|
+
| **Knowledge Graph** | Dependencies | P1 (If available) |
|
|
137
|
+
| **Docstrings/Comments** | Developer intent | P1 (If available) |
|
|
138
|
+
| **Git History** | Change context | P2 (Optional) |
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
## When to Use LLM vs Template
|
|
143
|
+
|
|
144
|
+
| Scenario | Approach | Reason |
|
|
145
|
+
|----------|----------|--------|
|
|
146
|
+
| Initial indexing | Template | Speed |
|
|
147
|
+
| High-value files | LLM | Quality |
|
|
148
|
+
| Simple utilities | Template | Overkill for LLM |
|
|
149
|
+
| Complex business logic | LLM | Need semantic understanding |
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## Cost Optimization
|
|
154
|
+
|
|
155
|
+
LLM context generation is expensive. Optimize:
|
|
156
|
+
|
|
157
|
+
1. **Cache aggressively** - Context rarely changes
|
|
158
|
+
2. **Batch processing** - Send multiple chunks per request
|
|
159
|
+
3. **Use smaller models** - Claude Haiku, GPT-3.5-turbo
|
|
160
|
+
4. **Template for simple cases** - 80% template, 20% LLM
|
|
161
|
+
|
|
162
|
+
### Cost Estimation
|
|
163
|
+
|
|
164
|
+
| Codebase Size | Chunks | LLM Calls | Est. Cost |
|
|
165
|
+
|---------------|--------|-----------|-----------|
|
|
166
|
+
| 10k lines | ~200 | 200 | ~$0.50 |
|
|
167
|
+
| 100k lines | ~2,000 | 2,000 | ~$5.00 |
|
|
168
|
+
| 1M lines | ~20,000 | 20,000 | ~$50.00 |
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Quality Measurement
|
|
173
|
+
|
|
174
|
+
### Good Context Indicators
|
|
175
|
+
|
|
176
|
+
- [ ] Mentions file purpose
|
|
177
|
+
- [ ] Lists key dependencies
|
|
178
|
+
- [ ] Describes what code does (not just syntax)
|
|
179
|
+
- [ ] Under 100 tokens (concise)
|
|
180
|
+
|
|
181
|
+
### Bad Context Examples
|
|
182
|
+
|
|
183
|
+
❌ Too vague:
|
|
184
|
+
```
|
|
185
|
+
[CONTEXT]
|
|
186
|
+
This is code from the project.
|
|
187
|
+
[/CONTEXT]
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
❌ Too long:
|
|
191
|
+
```
|
|
192
|
+
[CONTEXT]
|
|
193
|
+
This function is located in src/utils/tax.ts which is part of the utilities
|
|
194
|
+
folder that contains various helper functions used throughout the application
|
|
195
|
+
for different purposes including but not limited to calculations, formatting,
|
|
196
|
+
validation, and data transformation. The specific function calculateTax...
|
|
197
|
+
(200 more tokens)
|
|
198
|
+
[/CONTEXT]
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
✅ Just right:
|
|
202
|
+
```
|
|
203
|
+
[CONTEXT]
|
|
204
|
+
File: src/utils/tax.ts
|
|
205
|
+
Function calculateTax computes tax for billing. Used by ShippingService
|
|
206
|
+
and InvoiceGenerator. Depends on TAX_RATE from constants.
|
|
207
|
+
[/CONTEXT]
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
---
|
|
211
|
+
|
|
212
|
+
## Integration with graph-mapper
|
|
213
|
+
|
|
214
|
+
Use Knowledge Graph data for richer context:
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
from graph_mapper import load_graph
|
|
218
|
+
|
|
219
|
+
def enrich_context_with_graph(chunk: dict, graph_path: str) -> dict:
    """Pull dependency context for a chunk out of the knowledge graph.

    Loads the graph from disk on every call — callers indexing many
    chunks may want to load once and reuse it.
    """
    kg = load_graph(graph_path)
    node = chunk['metadata']['file_path']

    # Outgoing edges: what this file imports.
    outgoing = kg.get_imports(node)
    # Incoming edges: who imports this file.
    incoming = kg.get_importers(node)
    # Blast radius if this file changes (top 5 affected files kept).
    impact = kg.calculate_impact(node)

    return {
        'imports': outgoing,
        'imported_by': incoming,
        'impact_score': impact['score'],
        'impact_files': impact['files'][:5],
    }
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## Retrieval Impact
|
|
245
|
+
|
|
246
|
+
### Without Contextual Retrieval
|
|
247
|
+
|
|
248
|
+
Query: "How is tax calculated for shipping?"
|
|
249
|
+
|
|
250
|
+
Results: Random code mentioning "tax" or "shipping"
|
|
251
|
+
- ❌ `const TAX = 0.1;` (irrelevant constant)
|
|
252
|
+
- ❌ `// TODO: add tax` (comment)
|
|
253
|
+
- ⚠️ `calculateTax(...)` (correct but no context)
|
|
254
|
+
|
|
255
|
+
### With Contextual Retrieval
|
|
256
|
+
|
|
257
|
+
Results: Properly contextualized chunks
|
|
258
|
+
- ✅ `[calculateTax used by ShippingService] function calculateTax...`
|
|
259
|
+
- ✅ `[ShippingService imports calculateTax] class ShippingService...`
|
|
260
|
+
|
|
261
|
+
**Improvement: 35-49% reduction in failed retrievals** (Anthropic research)
|
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
# Hybrid Search
|
|
2
|
+
|
|
3
|
+
> Combine semantic (vector) and keyword (BM25) search for best results.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Why Hybrid?
|
|
8
|
+
|
|
9
|
+
| Query Type | Vector Search | BM25 Search | Winner |
|
|
10
|
+
|------------|---------------|-------------|--------|
|
|
11
|
+
| "How to handle user authentication" | ✅ Great | ⚠️ Okay | Vector |
|
|
12
|
+
| "Error 503" | ❌ Poor | ✅ Great | BM25 |
|
|
13
|
+
| "calculateTax function" | ⚠️ Okay | ✅ Great | BM25 |
|
|
14
|
+
| "similar to login flow" | ✅ Great | ❌ Poor | Vector |
|
|
15
|
+
|
|
16
|
+
**Hybrid = Best of both worlds**
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Architecture
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
┌─────────────────┐
|
|
24
|
+
│ Query │
|
|
25
|
+
└────────┬────────┘
|
|
26
|
+
│
|
|
27
|
+
┌──────────────┴──────────────┐
|
|
28
|
+
│ │
|
|
29
|
+
▼ ▼
|
|
30
|
+
┌─────────────────┐ ┌─────────────────┐
|
|
31
|
+
│ Vector Search │ │ BM25 Search │
|
|
32
|
+
│ (Semantic) │ │ (Keyword) │
|
|
33
|
+
└────────┬────────┘ └────────┬────────┘
|
|
34
|
+
│ │
|
|
35
|
+
│ Top 50 results each │
|
|
36
|
+
│ │
|
|
37
|
+
└──────────────┬──────────────┘
|
|
38
|
+
│
|
|
39
|
+
▼
|
|
40
|
+
┌─────────────────┐
|
|
41
|
+
│ Fusion (RRF) │
|
|
42
|
+
│ Combine ranks │
|
|
43
|
+
└────────┬────────┘
|
|
44
|
+
│
|
|
45
|
+
▼
|
|
46
|
+
┌─────────────────┐
|
|
47
|
+
│ Reranker │
|
|
48
|
+
│ Top 10 │
|
|
49
|
+
└────────┬────────┘
|
|
50
|
+
│
|
|
51
|
+
▼
|
|
52
|
+
┌─────────────────┐
|
|
53
|
+
│ Final Results │
|
|
54
|
+
└─────────────────┘
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Implementation
|
|
60
|
+
|
|
61
|
+
### Step 1: Set Up Both Indexes
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import chromadb
|
|
65
|
+
from rank_bm25 import BM25Okapi
|
|
66
|
+
|
|
67
|
+
class HybridSearchEngine:
    """Dual-index search over code chunks.

    Maintains a ChromaDB collection for semantic (embedding) search and
    a BM25 index over whitespace tokens for keyword search, both built
    from the same chunk list.
    """

    def __init__(self, chunks: list[dict]):
        """Build both indexes once from *chunks*."""
        # Semantic index: in-memory ChromaDB collection.
        self.chroma_client = chromadb.Client()
        self.collection = self.chroma_client.create_collection("code_chunks")
        self.collection.add(
            documents=[chunk['content'] for chunk in chunks],
            ids=[chunk['id'] for chunk in chunks],
            metadatas=[chunk['metadata'] for chunk in chunks],
        )

        # Keyword index: BM25 over naive whitespace tokens.
        self.bm25 = BM25Okapi([chunk['content'].split() for chunk in chunks])
        self.chunk_ids = [chunk['id'] for chunk in chunks]
        self.chunks = {chunk['id']: chunk for chunk in chunks}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Step 2: Implement Search Methods
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
def vector_search(self, query: str, top_k: int = 50) -> list[tuple]:
    """Embedding-similarity search; returns [(chunk_id, distance), ...].

    NOTE(review): the scores are Chroma *distances* (lower is better) —
    downstream fusion must rank accordingly.
    """
    hits = self.collection.query(
        query_texts=[query],
        n_results=top_k,
    )
    ids = hits['ids'][0]
    distances = hits['distances'][0]
    return list(zip(ids, distances))
|
|
102
|
+
|
|
103
|
+
def bm25_search(self, query: str, top_k: int = 50) -> list[tuple]:
    """Keyword (BM25) search; returns [(chunk_id, score), ...], best first."""
    # Naive whitespace tokenisation, matching how the index was built.
    scores = self.bm25.get_scores(query.split())

    # Rank all chunk indices by score, then keep the top k.
    ranked = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)
    return [(self.chunk_ids[idx], scores[idx]) for idx in ranked[:top_k]]
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Step 3: Reciprocal Rank Fusion
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
def reciprocal_rank_fusion(
    self,
    results_list: list[list[tuple]],
    k: int = 60
) -> list[tuple]:
    """Merge several ranked result lists with Reciprocal Rank Fusion.

    Each document scores sum(1 / (k + rank)) over every list it appears
    in, with rank counted from 1. Only the *ordering* of each input list
    matters; the input scores themselves are ignored.

    Args:
        results_list: One [(doc_id, score), ...] list per search method.
        k: Damping constant (default 60) that keeps the very top ranks
            from dominating the fused score.

    Returns:
        [(doc_id, fused_score), ...] sorted best-first.
    """
    fused = {}
    for ranked in results_list:
        for position, (doc_id, _) in enumerate(ranked, start=1):
            # Same formula as 1/(k + rank + 1) with a 0-based rank.
            fused[doc_id] = fused.get(doc_id, 0) + 1 / (k + position)

    return sorted(fused.items(), key=lambda item: item[1], reverse=True)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Step 4: Hybrid Search Method
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
def hybrid_search(
    self,
    query: str,
    top_k: int = 20,
    vector_weight: float = 0.5
) -> list[dict]:
    """
    Combine vector and BM25 results with weighted Reciprocal Rank Fusion.

    Fix: the original accepted ``vector_weight`` but never used it — it
    fetched both result lists and fused them with plain RRF. Each
    method's RRF contribution is now scaled by its weight. The default
    of 0.5 scales both sides equally, so the default ranking (and thus
    the returned chunks) is unchanged from the original.

    Args:
        query: Search query.
        top_k: Number of chunks to return.
        vector_weight: Share of the fused score given to vector search
            (0-1); BM25 receives ``1 - vector_weight``.

    Returns:
        The top_k chunk dicts, best first.
    """
    # Over-fetch from both methods so fusion has candidates to work with.
    vector_results = self.vector_search(query, top_k=50)
    bm25_results = self.bm25_search(query, top_k=50)

    # Weighted RRF: score(d) = sum over methods of weight_m / (60 + rank + 1).
    scores = {}
    for weight, results in ((vector_weight, vector_results),
                            (1.0 - vector_weight, bm25_results)):
        for rank, (doc_id, _) in enumerate(results):
            scores[doc_id] = scores.get(doc_id, 0.0) + weight / (60 + rank + 1)

    fused = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    top_ids = [doc_id for doc_id, _ in fused[:top_k]]
    return [self.chunks[doc_id] for doc_id in top_ids]
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Weighting Strategies
|
|
186
|
+
|
|
187
|
+
### Fixed Weights
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
# Equal weight
|
|
191
|
+
vector_weight = 0.5
|
|
192
|
+
bm25_weight = 0.5
|
|
193
|
+
|
|
194
|
+
# Semantic-heavy (for conceptual queries)
|
|
195
|
+
vector_weight = 0.7
|
|
196
|
+
bm25_weight = 0.3
|
|
197
|
+
|
|
198
|
+
# Keyword-heavy (for exact matches)
|
|
199
|
+
vector_weight = 0.3
|
|
200
|
+
bm25_weight = 0.7
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### Dynamic Weights
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
def determine_weights(query: str) -> tuple[float, float]:
    """Pick (vector_weight, bm25_weight) from the query's shape.

    Queries containing exact identifiers (CamelCase names, "Error NNN"
    codes, underscore-wrapped tokens) favour BM25; conceptual phrasing
    ("how", "why", "similar", ...) favours vector search; anything else
    splits evenly. Note the conceptual check is a substring match, so
    e.g. "show" also triggers "how".
    """
    looks_like_identifier = bool(
        re.search(r'[A-Z][a-z]+[A-Z]|Error\s*\d+|_\w+_', query)
    )
    lowered = query.lower()
    sounds_conceptual = any(
        word in lowered
        for word in {'how', 'what', 'why', 'similar', 'like', 'related'}
    )

    if looks_like_identifier:
        return (0.3, 0.7)  # Favor BM25
    if sounds_conceptual:
        return (0.7, 0.3)  # Favor Vector
    return (0.5, 0.5)  # Equal
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Alternative: Weighted Score Fusion
|
|
227
|
+
|
|
228
|
+
Instead of RRF, use weighted scores directly:
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
def weighted_score_fusion(
    vector_results: list[tuple],
    bm25_results: list[tuple],
    vector_weight: float = 0.5
) -> list[tuple]:
    """Fuse two result lists by min-max-normalised weighted scores.

    Each list's scores are rescaled to [0, 1], then combined as
    ``w * vector + (1 - w) * bm25``. Documents missing from one list
    contribute 0 for that method. A list whose scores are all equal
    (including a single-element list) normalises to all zeros.
    """
    def _rescaled(results):
        # Min-max normalisation; `or 1` guards the zero-range case.
        if not results:
            return {}
        raw = [score for _, score in results]
        low, high = min(raw), max(raw)
        span = high - low or 1
        return {doc_id: (score - low) / span for doc_id, score in results}

    vec = _rescaled(vector_results)
    kw = _rescaled(bm25_results)

    combined = {
        doc_id: vec.get(doc_id, 0) * vector_weight
                + kw.get(doc_id, 0) * (1 - vector_weight)
        for doc_id in set(vec) | set(kw)
    }
    return sorted(combined.items(), key=lambda item: item[1], reverse=True)
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## Performance Optimization
|
|
268
|
+
|
|
269
|
+
### Caching
|
|
270
|
+
|
|
271
|
+
```python
|
|
272
|
+
from functools import lru_cache
|
|
273
|
+
|
|
274
|
+
@lru_cache(maxsize=1000)
def cached_search(query: str, top_k: int = 20) -> tuple:
    """Memoised front-end for hybrid_search.

    Returns a tuple of result ids — tuples are hashable, so the result
    can live in the LRU cache; repeated queries skip both indexes.
    """
    hits = hybrid_search(query, top_k)
    return tuple(hit['id'] for hit in hits)
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
### Batch Queries
|
|
282
|
+
|
|
283
|
+
```python
|
|
284
|
+
def batch_hybrid_search(queries: list[str], top_k: int = 20) -> list[list[dict]]:
    """Run hybrid search for many queries in one pass.

    Fix: the original indexed the raw ChromaDB response with
    ``vector_results[i]`` — but ``collection.query`` returns a dict of
    parallel lists (``ids``, ``distances``, ...), not a per-query list.
    The per-query (id, distance) pairs must be rebuilt with zip first.

    Args:
        queries: Search queries to process together.
        top_k: Results to keep per query.

    Returns:
        One fused [(doc_id, score), ...] list per query.
    """
    # One batched embedding query covers all inputs.
    raw = collection.query(
        query_texts=queries,
        n_results=50
    )
    vector_results = [
        list(zip(raw['ids'][i], raw['distances'][i]))
        for i in range(len(queries))
    ]

    # BM25 has no batch API, so score each query separately.
    bm25_results = [bm25_search(q, 50) for q in queries]

    # Fuse each query's two lists independently.
    return [
        reciprocal_rank_fusion([vector_results[i], bm25_results[i]])[:top_k]
        for i in range(len(queries))
    ]
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
## Evaluation Metrics
|
|
306
|
+
|
|
307
|
+
### Hit Rate @ K
|
|
308
|
+
|
|
309
|
+
```python
|
|
310
|
+
def hit_rate(queries: list, ground_truth: list, k: int = 10) -> float:
    """Fraction of queries whose expected chunk id appears in the top K.

    Fix: the original raised ZeroDivisionError on an empty query set;
    it now returns 0.0.

    Args:
        queries: Evaluation queries.
        ground_truth: Expected chunk id for each query (parallel list).
        k: Cutoff rank.
    """
    if not queries:
        return 0.0
    hits = 0
    for query, expected in zip(queries, ground_truth):
        results = hybrid_search(query, top_k=k)
        if expected in (r['id'] for r in results):
            hits += 1
    return hits / len(queries)
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
### Mean Reciprocal Rank (MRR)
|
|
322
|
+
|
|
323
|
+
```python
|
|
324
|
+
def mrr(queries: list, ground_truth: list) -> float:
    """Mean Reciprocal Rank: average of 1/rank of the first correct hit.

    A query whose expected id never appears in the top 100 contributes 0.
    Fix: the original raised ZeroDivisionError on an empty query set;
    it now returns 0.0.

    Args:
        queries: Evaluation queries.
        ground_truth: Expected chunk id for each query (parallel list).
    """
    if not queries:
        return 0.0
    reciprocal_ranks = []
    for query, expected in zip(queries, ground_truth):
        results = hybrid_search(query, top_k=100)
        # for/else: the else branch fires only if no result matched.
        for rank, result in enumerate(results, 1):
            if result['id'] == expected:
                reciprocal_ranks.append(1 / rank)
                break
        else:
            reciprocal_ranks.append(0)
    return sum(reciprocal_ranks) / len(reciprocal_ranks)
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
---
|
|
339
|
+
|
|
340
|
+
## Quick Setup (Copy-Paste Ready)
|
|
341
|
+
|
|
342
|
+
```python
|
|
343
|
+
# Install dependencies
|
|
344
|
+
# pip install chromadb rank-bm25
|
|
345
|
+
|
|
346
|
+
import chromadb
|
|
347
|
+
from rank_bm25 import BM25Okapi
|
|
348
|
+
|
|
349
|
+
def create_hybrid_engine(chunks):
    """Convenience constructor: build a HybridSearchEngine from *chunks*."""
    engine = HybridSearchEngine(chunks)
    return engine
|
|
352
|
+
|
|
353
|
+
# Usage
|
|
354
|
+
engine = create_hybrid_engine(my_chunks)
|
|
355
|
+
results = engine.hybrid_search("how to calculate tax", top_k=10)
|
|
356
|
+
```
|