@comfanion/usethis_search 0.1.4 → 0.2.0-dev.0
- package/README.md +424 -8
- package/file-indexer.ts +21 -1
- package/package.json +12 -2
- package/tools/codeindex.ts +135 -16
- package/tools/search.ts +46 -11
- package/vectorizer/bm25-index.ts +155 -0
- package/vectorizer/chunkers/chunker-factory.ts +98 -0
- package/vectorizer/chunkers/code-chunker.ts +325 -0
- package/vectorizer/chunkers/markdown-chunker.ts +177 -0
- package/vectorizer/content-cleaner.ts +136 -0
- package/vectorizer/hybrid-search.ts +97 -0
- package/vectorizer/index.js +395 -16
- package/vectorizer/metadata-extractor.ts +125 -0
- package/vectorizer/query-cache.ts +126 -0
- package/vectorizer/search-metrics.ts +155 -0
- package/vectorizer.yaml +81 -0
package/tools/codeindex.ts
CHANGED
```diff
@@ -1,7 +1,8 @@
 /**
- * Code Index Status & Management Tool
+ * Code Index Status & Management Tool (v2)
  *
  * Uses bundled vectorizer. Index data is stored in `.opencode/vectors/<index>/`.
+ * v2: added "test" action for gold dataset testing, richer stats.
  */
 
 import { tool } from "@opencode-ai/plugin"
@@ -59,6 +60,7 @@ Actions:
 - "status" → Show index statistics
 - "list" → List all available indexes with stats
 - "reindex" → Re-index files using local vectorizer
+- "test" → Run gold dataset quality tests (if configured)
 
 Available indexes:
 - "code" - Source code files
@@ -66,7 +68,7 @@ Available indexes:
 - "config" - Configuration files`,
 
   args: {
-    action: tool.schema.enum(["status", "list", "reindex"]).describe("Action to perform"),
+    action: tool.schema.enum(["status", "list", "reindex", "test"]).describe("Action to perform"),
     index: tool.schema.string().optional().default("code").describe("Index name: code, docs, config"),
     dir: tool.schema.string().optional().describe("Directory to index (default: project root)"),
   },
@@ -87,7 +89,7 @@ Available indexes:
     } catch {}
 
     if (indexes.length === 0) {
-      output +=
+      output += `No indexes created yet\n\nCreate indexes:\n\n\`\`\`\n`
       output += `codeindex({ action: "reindex", index: "code" })\n`
       output += `codeindex({ action: "reindex", index: "docs", dir: "docs/" })\n`
       output += `\`\`\`\n`
@@ -95,31 +97,62 @@ Available indexes:
       output += `### Active Indexes\n\n`
       for (const idx of indexes) {
         try {
-          const
-          const
-
+          const indexer = await new CodebaseIndexer(projectRoot, idx).init()
+          const stats = await indexer.getStats()
+          await indexer.unloadModel()
           const desc = INDEX_DESCRIPTIONS[idx] || "Custom index"
-
+          const features = stats.features
+            ? ` | chunking: ${stats.features.chunking}, hybrid: ${stats.features.hybrid ? "on" : "off"}`
+            : ""
+          output += `- **${idx}** - ${desc} (files: ${stats.fileCount}, chunks: ${stats.chunkCount}${features})\n`
         } catch {
           output += `- ${idx}\n`
        }
      }
    }
 
-    output += `\n### Usage\n\n\`\`\`\nsearch({ query: "your query", index: "code" })\n\`\`\``
+    output += `\n### Usage\n\n\`\`\`\nsearch({ query: "your query", index: "code" })\nsearch({ query: "your query", hybrid: true }) // v2: hybrid search\nsearch({ query: "your query", fileType: "code", language: "typescript" }) // v2: filters\n\`\`\``
     return output
   }
 
   if (args.action === "status") {
     const hashesFile = path.join(vectorsDir, indexName, "hashes.json")
     try {
-      const
-      const
-
-
+      const indexer = await new CodebaseIndexer(projectRoot, indexName).init()
+      const stats = await indexer.getStats()
+      await indexer.unloadModel()
+
+      const sampleFiles = Object.keys(JSON.parse(await fs.readFile(hashesFile, "utf8"))).slice(0, 5)
       const desc = INDEX_DESCRIPTIONS[indexName] || "Custom index"
 
-
+      let output = `## Index Status: "${indexName}"\n\n`
+      output += `**Description:** ${desc}\n`
+      output += `**Files indexed:** ${stats.fileCount}\n`
+      output += `**Total chunks:** ${stats.chunkCount}\n`
+      output += `**Model:** ${stats.model}\n`
+
+      if (stats.features) {
+        output += `\n**Features:**\n`
+        output += `- Chunking strategy: ${stats.features.chunking}\n`
+        output += `- Hybrid search: ${stats.features.hybrid ? "enabled" : "disabled"}\n`
+        output += `- Metrics: ${stats.features.metrics ? "enabled" : "disabled"}\n`
+        output += `- Query cache: ${stats.features.cache ? "enabled" : "disabled"}\n`
+      }
+
+      // Show metrics summary if available
+      try {
+        const metrics = await indexer.getMetrics()
+        if (metrics.total_queries > 0) {
+          output += `\n**Search Metrics:**\n`
+          output += `- Total queries: ${metrics.total_queries}\n`
+          output += `- Avg results/query: ${metrics.avg_results_per_query.toFixed(1)}\n`
+          output += `- Zero results rate: ${(metrics.zero_results_rate * 100).toFixed(1)}%\n`
+          output += `- Avg relevance: ${metrics.avg_relevance.toFixed(3)}\n`
+        }
+      } catch {}
+
+      output += `\n**Sample indexed files:**\n${sampleFiles.map((f) => `- ${f}`).join("\n")}${stats.fileCount > 5 ? `\n- ... and ${stats.fileCount - 5} more` : ""}`
+      return output
     } catch {
       return `## Index Status: "${indexName}"\n\nIndex "${indexName}" not created yet. Create it with: codeindex({ action: "reindex", index: "${indexName}" })`
     }
@@ -148,12 +181,98 @@ Available indexes:
       await indexer.unloadModel()
       const stats = await indexer.getStats()
 
-
+      let output = `## Re-indexing Complete\n\n`
+      output += `**Index:** ${indexName}\n`
+      output += `**Directory:** ${args.dir || "(project root)"}\n`
+      output += `**Files found:** ${files.length}\n`
+      output += `**Files indexed:** ${indexed}\n`
+      output += `**Files unchanged:** ${skipped}\n`
+      output += `**Total chunks:** ${stats.chunkCount}\n`
+      if (stats.features) {
+        output += `**Chunking:** ${stats.features.chunking}\n`
+      }
+      return output
+    } catch (error: any) {
+      return `Re-indexing failed: ${error.message || String(error)}`
+    }
+  }
+
+  if (args.action === "test") {
+    try {
+      const goldPath = path.join(projectRoot, ".opencode", "vectors", "gold-dataset.yaml")
+      let goldContent: string
+      try {
+        goldContent = await fs.readFile(goldPath, "utf8")
+      } catch {
+        return `## Gold Dataset Test\n\nNo gold dataset found at: ${goldPath}\n\nCreate one with test queries and expected results.\nSee docs/search-plugin-upgrade-plan.md for format.`
+      }
+
+      // Simple YAML parsing for test queries
+      const tests: { query: string; expected_files: string[]; min_relevance: number; description?: string }[] = []
+      const queryBlocks = goldContent.split(/\n\s+-\s+query:\s*/)
+      for (const block of queryBlocks.slice(1)) {
+        const queryMatch = block.match(/^["']?([^"'\n]+)["']?/)
+        const filesMatch = block.match(/expected_files:\s*\n((?:\s+-\s+.+\n?)+)/)
+        const relMatch = block.match(/min_relevance:\s*([\d.]+)/)
+        const descMatch = block.match(/description:\s*["']?([^"'\n]+)/)
+
+        if (queryMatch) {
+          const expectedFiles = filesMatch
+            ? filesMatch[1].split("\n").map(l => l.replace(/^\s+-\s+["']?/, "").replace(/["']$/, "").trim()).filter(Boolean)
+            : []
+          tests.push({
+            query: queryMatch[1].trim(),
+            expected_files: expectedFiles,
+            min_relevance: relMatch ? parseFloat(relMatch[1]) : 0.7,
+            description: descMatch ? descMatch[1].trim() : undefined,
+          })
+        }
+      }
+
+      if (tests.length === 0) {
+        return `## Gold Dataset Test\n\nNo test queries found in gold dataset.`
+      }
+
+      const indexer = await new CodebaseIndexer(projectRoot, indexName).init()
+      let passed = 0
+      let failed = 0
+      let output = `## Gold Dataset Test Results\n\n`
+
+      for (const t of tests) {
+        const results = await indexer.search(t.query, 10, false)
+        const foundFiles = results.map((r: any) => r.file)
+        const foundExpected = t.expected_files.filter(f => foundFiles.includes(f))
+        const topScore = results.length > 0 && results[0]._distance != null
+          ? 1 - results[0]._distance
+          : 0
+
+        const pass = foundExpected.length >= Math.ceil(t.expected_files.length * 0.5) && topScore >= t.min_relevance
+
+        if (pass) {
+          passed++
+          output += `**PASS** Query: "${t.query}"\n`
+        } else {
+          failed++
+          output += `**FAIL** Query: "${t.query}"\n`
+        }
+
+        output += ` Found: ${foundFiles.slice(0, 3).map((f: string) => `${f} (${(1 - (results.find((r: any) => r.file === f)?._distance ?? 1)).toFixed(2)})`).join(", ")}\n`
+        if (foundExpected.length < t.expected_files.length) {
+          const missing = t.expected_files.filter(f => !foundFiles.includes(f))
+          output += ` Missing: ${missing.join(", ")}\n`
+        }
+        output += `\n`
+      }
+
+      await indexer.unloadModel()
+
+      output += `---\n**Summary:** ${passed}/${tests.length} tests passed (${Math.round(passed / tests.length * 100)}%)\n`
+      return output
     } catch (error: any) {
-      return
+      return `Gold dataset test failed: ${error.message || String(error)}`
    }
  }
 
-  return `Unknown action: ${args.action}. Use: status, list, or
+  return `Unknown action: ${args.action}. Use: status, list, reindex, or test`
  },
 })
```
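The new "test" action parses the gold dataset with regexes rather than a YAML library, so the file only needs a very small shape. As a sketch of what one parsed entry looks like (field names and the 0.7 default are from the code above; the query and file values are invented for illustration):

```ts
// Hypothetical gold-dataset entry, matching the `tests` element type in the
// "test" action above. Values are illustrative, not from the package.
interface GoldTest {
  query: string
  expected_files: string[]
  min_relevance: number // 0.7 when the YAML omits it
  description?: string
}

const example: GoldTest = {
  query: "where is BM25 scoring implemented",
  expected_files: ["vectorizer/bm25-index.ts"],
  min_relevance: 0.7,
  description: "keyword-search internals",
}

// Pass rule from the source: at least half of expected_files must appear in
// the top-10 results, and the top hit's (1 - _distance) must reach min_relevance.
console.log(example.query)
```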
package/tools/search.ts
CHANGED
```diff
@@ -1,7 +1,8 @@
 /**
- * Semantic Code Search Tool
+ * Semantic Code Search Tool (v2)
  *
  * Uses local embeddings + LanceDB vector store via bundled vectorizer.
+ * v2: hybrid search, metadata filtering, rich result metadata.
  * Index data is stored in `.opencode/vectors/<index>/`.
  */
 
@@ -33,6 +34,13 @@ Examples:
     searchAll: tool.schema.boolean().optional().default(false).describe("Search all indexes instead of just one"),
     freshen: tool.schema.boolean().optional().default(true).describe("Auto-update stale files before searching (default: true)"),
     includeArchived: tool.schema.boolean().optional().default(false).describe("Include archived files in results (default: false). Files are archived if in /archive/ folder or have 'archived: true' in frontmatter."),
+    // v2 params
+    hybrid: tool.schema.boolean().optional().describe("Enable hybrid search (vector + BM25 keyword matching). Improves exact keyword recall."),
+    fileType: tool.schema.string().optional().describe("Filter by file type: 'code', 'docs', or 'config'"),
+    language: tool.schema.string().optional().describe("Filter by language: 'typescript', 'python', 'markdown', etc."),
+    modifiedAfter: tool.schema.string().optional().describe("Filter: only files modified after this ISO date (e.g. '2024-01-01')"),
+    modifiedBefore: tool.schema.string().optional().describe("Filter: only files modified before this ISO date"),
+    tags: tool.schema.string().optional().describe("Filter by frontmatter tags (comma-separated, e.g. 'auth,security')"),
   },
 
   async execute(args) {
@@ -43,6 +51,15 @@ Examples:
     const limit = args.limit || 10
     const indexName = args.index || "code"
 
+    // Build search options from v2 params
+    const searchOptions: Record<string, any> = {}
+    if (args.hybrid != null) searchOptions.hybrid = args.hybrid
+    if (args.fileType) searchOptions.fileType = args.fileType
+    if (args.language) searchOptions.language = args.language
+    if (args.modifiedAfter) searchOptions.modifiedAfter = args.modifiedAfter
+    if (args.modifiedBefore) searchOptions.modifiedBefore = args.modifiedBefore
+    if (args.tags) searchOptions.tags = args.tags.split(",").map((t: string) => t.trim()).filter(Boolean)
+
     // Auto-freshen stale files before searching
     if (args.freshen !== false) {
       const tempIndexer = await new CodebaseIndexer(projectRoot, indexName).init()
@@ -56,7 +73,7 @@ Examples:
       await tempIndexer.unloadModel()
 
       if (indexes.length === 0) {
-        return
+        return `No indexes found. Create one with: codeindex({ action: "reindex", index: "code" })`
       }
 
       for (const idx of indexes) {
@@ -64,42 +81,60 @@ Examples:
        if (args.freshen !== false) {
          await indexer.freshen()
        }
-        const results = await indexer.search(args.query, limit, args.includeArchived)
+        const results = await indexer.search(args.query, limit, args.includeArchived, searchOptions)
        allResults.push(...results.map((r: any) => ({ ...r, _index: idx })))
        await indexer.unloadModel()
      }
 
-      allResults.sort((a, b) =>
+      allResults.sort((a, b) => {
+        // Prefer combinedScore (hybrid), fall back to distance
+        const scoreA = a._combinedScore ?? (a._distance != null ? 1 - a._distance : 0)
+        const scoreB = b._combinedScore ?? (b._distance != null ? 1 - b._distance : 0)
+        return scoreB - scoreA
+      })
      allResults = allResults.slice(0, limit)
    } else {
      const hashesFile = path.join(projectRoot, ".opencode", "vectors", indexName, "hashes.json")
      try {
        await fs.access(hashesFile)
      } catch {
-        return
+        return `Index "${indexName}" not found. Create it with: codeindex({ action: "reindex", index: "${indexName}" })`
      }
 
      const indexer = await new CodebaseIndexer(projectRoot, indexName).init()
-      const results = await indexer.search(args.query, limit, args.includeArchived)
+      const results = await indexer.search(args.query, limit, args.includeArchived, searchOptions)
      allResults = results.map((r: any) => ({ ...r, _index: indexName }))
      await indexer.unloadModel()
    }
 
    if (allResults.length === 0) {
      const scope = args.searchAll ? "any index" : `index "${indexName}"`
-      return `No results found in ${scope} for: "${args.query}"\n\nTry:\n- Different keywords\n- Re-index with: codeindex({ action: "reindex", index: "${indexName}" })`
+      return `No results found in ${scope} for: "${args.query}"\n\nTry:\n- Different keywords\n- Enable hybrid search: search({ query: "...", hybrid: true })\n- Re-index with: codeindex({ action: "reindex", index: "${indexName}" })`
    }
 
    const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
-
+    const hybridLabel = args.hybrid ? " [hybrid]" : ""
+    let output = `## Search Results for: "${args.query}" (${scope}${hybridLabel})\n\n`
 
    for (let i = 0; i < allResults.length; i++) {
      const r = allResults[i]
-      const score = r.
+      const score = r._combinedScore != null
+        ? r._combinedScore.toFixed(3)
+        : r._distance != null
+          ? (1 - r._distance).toFixed(3)
+          : "N/A"
      const indexLabel = args.searchAll ? ` [${r._index}]` : ""
 
+      // v2: show rich metadata when available
+      const metaParts: string[] = []
+      if (r.language && r.language !== "unknown") metaParts.push(r.language)
+      if (r.heading_context) metaParts.push(`"${r.heading_context}"`)
+      if (r.function_name) metaParts.push(`fn: ${r.function_name}`)
+      if (r.class_name) metaParts.push(`class: ${r.class_name}`)
+      const metaLine = metaParts.length > 0 ? ` (${metaParts.join(", ")})` : ""
+
      output += `### ${i + 1}. ${r.file}${indexLabel}\n`
-      output += `**Relevance:** ${score}\n\n`
+      output += `**Relevance:** ${score}${metaLine}\n\n`
      output += "```\n"
      const content = r.content.length > 500 ? r.content.substring(0, 500) + "\n... (truncated)" : r.content
      output += content
@@ -109,7 +144,7 @@ Examples:
    output += `---\n*Found ${allResults.length} results. Use Read tool to see full files.*`
    return output
  } catch (error: any) {
-    return
+    return `Search failed: ${error.message || String(error)}`
  }
  },
 })
```
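One detail worth noting in the merge path above: hybrid hits carry `_combinedScore` while vector-only hits carry a LanceDB-style `_distance` (lower is closer), so the sort maps both onto a single higher-is-better scale. A self-contained sketch of that rule, with the `Ranked` interface assumed for illustration:

```ts
// Field names (_combinedScore, _distance) are from the diff; the interface
// and sample values are assumptions for illustration.
interface Ranked { file: string; _combinedScore?: number; _distance?: number }

const rankScore = (r: Ranked): number =>
  r._combinedScore ?? (r._distance != null ? 1 - r._distance : 0)

const merged: Ranked[] = [
  { file: "a.ts", _distance: 0.35 },     // vector-only: 1 - 0.35 = 0.65
  { file: "b.md", _combinedScore: 0.8 }, // hybrid: 0.80
  { file: "c.ts", _distance: 0.1 },      // vector-only: 0.90
]

merged.sort((a, b) => rankScore(b) - rankScore(a))
console.log(merged.map((r) => r.file)) // ["c.ts", "b.md", "a.ts"]
```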
package/vectorizer/bm25-index.ts
ADDED
```diff
@@ -0,0 +1,155 @@
+/**
+ * BM25 Index — keyword-based search using Okapi BM25 scoring.
+ *
+ * Builds an inverted index from chunk content and scores queries
+ * against it. Designed to complement vector similarity search.
+ */
+
+// ── BM25 parameters ────────────────────────────────────────────────────────
+
+const K1 = 1.2 // term frequency saturation
+const B = 0.75 // length normalization
+
+// ── Types ───────────────────────────────────────────────────────────────────
+
+interface DocEntry {
+  id: number
+  termFreqs: Map<string, number>
+  length: number // total tokens
+}
+
+export interface BM25Result {
+  id: number
+  score: number
+}
+
+// ── Tokenizer ───────────────────────────────────────────────────────────────
+
+const STOP_WORDS = new Set([
+  "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
+  "have", "has", "had", "do", "does", "did", "will", "would", "could",
+  "should", "may", "might", "shall", "can", "need", "must",
+  "and", "or", "but", "not", "no", "nor",
+  "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
+  "into", "about", "between", "through", "during", "before", "after",
+  "this", "that", "these", "those", "it", "its",
+  "i", "you", "he", "she", "we", "they", "me", "him", "her", "us", "them",
+  "my", "your", "his", "our", "their",
+  "what", "which", "who", "whom", "where", "when", "how", "why",
+  "if", "then", "else", "so", "than", "too", "very",
+])
+
+/**
+ * Tokenize text into lowercase terms, filtering stop words and short tokens.
+ */
+export function tokenize(text: string): string[] {
+  return text
+    .toLowerCase()
+    .replace(/[^a-z0-9_\-]/g, " ")
+    .split(/\s+/)
+    .filter((t) => t.length > 1 && !STOP_WORDS.has(t))
+}
+
+// ── BM25 Index class ────────────────────────────────────────────────────────
+
+export class BM25Index {
+  private docs: DocEntry[] = []
+  private invertedIndex: Map<string, Set<number>> = new Map()
+  private avgDocLength: number = 0
+  private docCount: number = 0
+
+  /**
+   * Build index from a list of text documents.
+   * Each document is identified by its array index.
+   */
+  build(documents: string[]): void {
+    this.docs = []
+    this.invertedIndex = new Map()
+
+    let totalLength = 0
+
+    for (let i = 0; i < documents.length; i++) {
+      const tokens = tokenize(documents[i])
+      const termFreqs = new Map<string, number>()
+
+      for (const token of tokens) {
+        termFreqs.set(token, (termFreqs.get(token) || 0) + 1)
+
+        if (!this.invertedIndex.has(token)) {
+          this.invertedIndex.set(token, new Set())
+        }
+        this.invertedIndex.get(token)!.add(i)
+      }
+
+      this.docs.push({ id: i, termFreqs, length: tokens.length })
+      totalLength += tokens.length
+    }
+
+    this.docCount = documents.length
+    this.avgDocLength = this.docCount > 0 ? totalLength / this.docCount : 0
+  }
+
+  /**
+   * Score a query against indexed documents.
+   * Returns array sorted by descending score.
+   */
+  search(query: string, limit: number = 50): BM25Result[] {
+    const queryTerms = tokenize(query)
+    if (queryTerms.length === 0) return []
+
+    // Collect candidate docs (any doc containing at least one query term)
+    const candidateIds = new Set<number>()
+    for (const term of queryTerms) {
+      const postings = this.invertedIndex.get(term)
+      if (postings) {
+        for (const id of postings) candidateIds.add(id)
+      }
+    }
+
+    if (candidateIds.size === 0) return []
+
+    // Score each candidate
+    const results: BM25Result[] = []
+
+    for (const docId of candidateIds) {
+      const doc = this.docs[docId]
+      let score = 0
+
+      for (const term of queryTerms) {
+        const tf = doc.termFreqs.get(term) || 0
+        if (tf === 0) continue
+
+        const df = this.invertedIndex.get(term)?.size || 0
+        const idf = Math.log((this.docCount - df + 0.5) / (df + 0.5) + 1)
+        const tfNorm = (tf * (K1 + 1)) / (tf + K1 * (1 - B + B * (doc.length / this.avgDocLength)))
+
+        score += idf * tfNorm
+      }
+
+      if (score > 0) {
+        results.push({ id: docId, score })
+      }
+    }
+
+    results.sort((a, b) => b.score - a.score)
+    return results.slice(0, limit)
+  }
+
+  /** Number of indexed documents. */
+  get size(): number {
+    return this.docCount
+  }
+
+  /** Number of unique terms. */
+  get vocabularySize(): number {
+    return this.invertedIndex.size
+  }
+
+  /** Release all memory held by the index. */
+  clear(): void {
+    this.docs = []
+    this.invertedIndex = new Map()
+    this.avgDocLength = 0
+    this.docCount = 0
+  }
+}
```
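Since the file is entirely new, here is a short usage sketch. The API (`build`, `search`, `size`, `vocabularySize`, `clear`) is taken from the class above; the documents and query are invented. Each matched term contributes idf * tfNorm, so rare terms and repeated terms (saturated by K1, length-normalized by B) dominate the score:

```ts
import { BM25Index } from "./bm25-index"

const index = new BM25Index()
index.build([
  "hybrid search combines vector similarity with keyword scoring", // doc 0
  "okapi bm25 ranks documents by term frequency and rarity",       // doc 1
  "markdown chunker splits files on heading boundaries",           // doc 2
])

// Hits reference documents by their position in the build() array and come
// back sorted by descending BM25 score.
const hits = index.search("bm25 term scoring")
console.log(hits[0].id)           // 1: the doc matching the rare terms "bm25" and "term"
console.log(index.size)           // 3
console.log(index.vocabularySize) // unique non-stop-word terms across all docs

index.clear() // drop the inverted index and document stats
```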
package/vectorizer/chunkers/chunker-factory.ts
ADDED
```diff
@@ -0,0 +1,98 @@
+/**
+ * Chunker Factory — selects the appropriate chunker based on file type.
+ *
+ * Routes: markdown → markdown-chunker, code → code-chunker, else → fixed.
+ */
+
+import { chunkMarkdown, type MarkdownChunkConfig, DEFAULT_MD_CONFIG } from "./markdown-chunker"
+import { chunkCode, type CodeChunkConfig, DEFAULT_CODE_CONFIG } from "./code-chunker"
+import type { FileType } from "../metadata-extractor"
+
+// ── Types ───────────────────────────────────────────────────────────────────
+
+export type ChunkingStrategy = "fixed" | "semantic" | "hybrid"
+
+export interface ChunkingConfig {
+  strategy: ChunkingStrategy
+  markdown: MarkdownChunkConfig
+  code: CodeChunkConfig
+  fixed: { max_chars: number }
+}
+
+export const DEFAULT_CHUNKING_CONFIG: ChunkingConfig = {
+  strategy: "semantic",
+  markdown: DEFAULT_MD_CONFIG,
+  code: DEFAULT_CODE_CONFIG,
+  fixed: { max_chars: 1500 },
+}
+
+/** Unified chunk output from any chunker. */
+export interface UnifiedChunk {
+  content: string
+  heading_context?: string
+  function_name?: string
+  class_name?: string
+}
+
+// ── Fixed chunker (legacy) ──────────────────────────────────────────────────
+
+function chunkFixed(content: string, maxChars: number): UnifiedChunk[] {
+  const chunks: UnifiedChunk[] = []
+  const lines = content.split("\n")
+  let current: string[] = []
+  let currentLen = 0
+
+  for (const line of lines) {
+    if (currentLen + line.length + 1 > maxChars && current.length > 0) {
+      chunks.push({ content: current.join("\n") })
+      current = []
+      currentLen = 0
+    }
+    current.push(line)
+    currentLen += line.length + 1
+  }
+
+  if (current.length > 0) {
+    chunks.push({ content: current.join("\n") })
+  }
+
+  return chunks
+}
+
+// ── Public API ──────────────────────────────────────────────────────────────
+
+/**
+ * Chunk content using the appropriate strategy for the given file type.
+ */
+export function chunkContent(
+  content: string,
+  fileType: FileType,
+  language: string,
+  config: ChunkingConfig = DEFAULT_CHUNKING_CONFIG,
+): UnifiedChunk[] {
+  // If strategy is "fixed", always use fixed chunker
+  if (config.strategy === "fixed") {
+    return chunkFixed(content, config.fixed.max_chars)
+  }
+
+  // Semantic or hybrid: pick by file type
+  if (fileType === "docs" || language === "markdown") {
+    const mdChunks = chunkMarkdown(content, config.markdown)
+    return mdChunks.map((c) => ({
+      content: c.content,
+      heading_context: c.heading_context,
+    }))
+  }
+
+  if (fileType === "code") {
+    const codeChunks = chunkCode(content, config.code)
+    return codeChunks.map((c) => ({
+      content: c.content,
+      function_name: c.function_name,
+      class_name: c.class_name,
+    }))
+  }
+
+  // Config files or unknown — fixed
+  return chunkFixed(content, config.fixed.max_chars)
+}
```
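And a sketch of calling the factory; function, type, and config names are from the file above, while the inputs are invented:

```ts
import { chunkContent, DEFAULT_CHUNKING_CONFIG } from "./chunker-factory"

// Default "semantic" strategy: markdown routes through the heading-aware
// chunker, so each chunk carries its heading_context.
const docChunks = chunkContent(
  "# Setup\n\nInstall the deps.\n\n## Usage\n\nRun the tool.",
  "docs",
  "markdown",
)
for (const c of docChunks) console.log(c.heading_context, c.content.length)

// Forcing the legacy fixed-size splitter regardless of file type:
const fixedChunks = chunkContent("const x = 1\nconst y = 2", "code", "typescript", {
  ...DEFAULT_CHUNKING_CONFIG,
  strategy: "fixed",
})
console.log(fixedChunks.length) // 1, since the content fits within max_chars (1500)
```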