@comfanion/usethis_search 3.0.0-dev.21 → 3.0.0-dev.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/tools/search.ts +114 -18
- package/vectorizer/index.ts +9 -9
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "3.0.0-dev.21",
+  "version": "3.0.0-dev.23",
   "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
   "type": "module",
   "main": "./index.ts",
package/tools/search.ts
CHANGED

@@ -25,7 +25,8 @@ Examples:
 - "authentication logic" → finds auth-related code
 - "database connection handling" → finds DB setup code
 - "how to deploy" with index: "docs" → finds deployment docs
-- "API keys" with index: "config" → finds config with API settings`,
+- "API keys" with index: "config" → finds config with API settings
+- search({ query: "tenant", path: "internal/domain/" }) → searches only in internal/domain/`,

   args: {
     query: tool.schema.string().describe("Semantic search query describing what you're looking for"),

@@ -42,6 +43,7 @@ Examples:
     modifiedBefore: tool.schema.string().optional().describe("Filter: only files modified before this ISO date"),
     tags: tool.schema.string().optional().describe("Filter by frontmatter tags (comma-separated, e.g. 'auth,security')"),
     minScore: tool.schema.number().optional().default(0.35).describe("Minimum relevance score (0-1). Results below this threshold are dropped. Default: 0.35"),
+    path: tool.schema.string().optional().describe("Filter by file path prefix (e.g. 'internal/domain/', 'src/components'). Only returns files under this path."),
   },

   async execute(args) {

@@ -119,9 +121,9 @@ Examples:
       }

       allResults.sort((a, b) => {
-        // Prefer combinedScore (hybrid), fall back to …
-        const scoreA = a._combinedScore ?? (a._distance != null ? 1 - a._distance : 0)
-        const scoreB = b._combinedScore ?? (b._distance != null ? 1 - b._distance : 0)
+        // Prefer combinedScore (hybrid), fall back to L2→similarity conversion
+        const scoreA = a._combinedScore ?? (a._distance != null ? Math.max(0, 1 - a._distance / 2) : 0)
+        const scoreB = b._combinedScore ?? (b._distance != null ? Math.max(0, 1 - b._distance / 2) : 0)
        return scoreB - scoreA
       })
       allResults = allResults.slice(0, limit)
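The new fallback maps raw L2 distance into the same 0–1 range that minScore is compared against. For L2-normalized embeddings, the distance between two vectors lies in [0, 2], so 1 − d/2 stays within [0, 1], while the previous 1 − d went negative for any distance above 1. A minimal sketch of the conversion (the normalization assumption is inferred from the new comment, not stated in the diff):

// Minimal sketch of the new L2→similarity fallback. Assumes embeddings are
// L2-normalized, so the pairwise L2 distance falls in [0, 2].
const l2ToSimilarity = (distance: number): number =>
  Math.max(0, 1 - distance / 2)

l2ToSimilarity(0.0) // 1.0 — identical vectors
l2ToSimilarity(1.2) // 0.4 — the old `1 - d` would return -0.2 here
l2ToSimilarity(2.0) // 0.0 — opposite vectors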
@@ -130,7 +132,16 @@ Examples:
       try {
         await fs.access(hashesFile)
       } catch {
-        …
+        // Index doesn't exist — check what indexes ARE available
+        const tempIndexer = await new CodebaseIndexer(projectRoot, "code").init()
+        const available = await tempIndexer.listIndexes()
+        await tempIndexer.unloadModel()
+
+        if (available.length > 0) {
+          const list = available.map(i => `"${i}"`).join(", ")
+          return `Index "${indexName}" not found. Available indexes: ${list}.\n\nTry: search({ query: "${args.query}", index: "${available[0]}" })\nOr search all: search({ query: "${args.query}", searchAll: true })`
+        }
+        return `No indexes found. The codebase needs to be indexed first.\n\nRun the CLI: bunx usethis_search reindex`
       }

       const indexer = await new CodebaseIndexer(projectRoot, indexName).init()
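With this change, a miss on the index name becomes recoverable rather than a dead end: the tool enumerates the indexes that do exist and suggests a corrected call. An illustrative return value, assuming a project that only has a "code" index and a call with index: "docs" (the concrete names here are hypothetical):

Index "docs" not found. Available indexes: "code".

Try: search({ query: "tenant", index: "code" })
Or search all: search({ query: "tenant", searchAll: true })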
@@ -146,23 +157,83 @@ Examples:
         return score >= minScore
       })

-      if (allResults.length === 0) {
+      // ── Path filter — restrict to subtree ──────────────────────────────────
+      if (args.path) {
+        const prefix = args.path.replace(/\/+$/, "") // normalize trailing slash
+        allResults = allResults.filter(r => r.file && r.file.startsWith(prefix))
+      }
+
+      // ── Reranking — boost results where query keywords appear in text ──────
+      // Also store score components for breakdown display
+      const queryKeywords = args.query.toLowerCase().split(/\s+/).filter((w: string) => w.length > 2)
+      for (const r of allResults) {
+        // Vector score (L2 → similarity)
+        const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
+        r._vectorScore = vectorScore
+
+        // BM25 component (present only in hybrid mode — embedded in _combinedScore)
+        // If _combinedScore exists and differs from vectorScore, the difference is BM25 contribution
+        r._bm25Component = r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0
+
+        // Base score before keyword boost
+        const baseScore = r._combinedScore ?? vectorScore
+
+        // Keyword matching
+        const text = (r.content || "").toLowerCase()
+        const matchedKeywords: string[] = []
+        if (queryKeywords.length > 0) {
+          for (const kw of queryKeywords) {
+            if (text.includes(kw)) matchedKeywords.push(kw)
+          }
+        }
+        r._matchedKeywords = matchedKeywords
+        const keywordBonus = queryKeywords.length > 0 ? (matchedKeywords.length / queryKeywords.length) * 0.15 : 0
+        r._keywordBonus = keywordBonus
+        r._finalScore = baseScore + keywordBonus
+      }
+      allResults.sort((a: any, b: any) => (b._finalScore ?? 0) - (a._finalScore ?? 0))
+
+      // ── Group by file — best chunk per file, with chunk count ─────────────
+      const fileGroups = new Map<string, { best: any, chunks: any[] }>()
+      for (const r of allResults) {
+        const key = r.file
+        if (!fileGroups.has(key)) {
+          fileGroups.set(key, { best: r, chunks: [r] })
+        } else {
+          const group = fileGroups.get(key)!
+          group.chunks.push(r)
+          if ((r._finalScore ?? 0) > (group.best._finalScore ?? 0)) {
+            group.best = r
+          }
+        }
+      }
+
+      // Sort groups by best chunk score, take top N unique files
+      const sortedGroups = [...fileGroups.values()]
+        .sort((a, b) => (b.best._finalScore ?? 0) - (a.best._finalScore ?? 0))
+        .slice(0, limit)
+
+      if (sortedGroups.length === 0) {
         const scope = args.searchAll ? "any index" : `index "${indexName}"`
         return `No results found in ${scope} for: "${args.query}" (min score: ${minScore})\n\nTry:\n- Different keywords\n- Lower minScore threshold: search({ query: "...", minScore: 0.2 })\n- Enable hybrid search: search({ query: "...", hybrid: true })`
       }

+      // ── Confidence signal ──────────────────────────────────────────────────
+      const topScore = sortedGroups[0].best._finalScore ?? 0
       const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
       const hybridLabel = args.hybrid ? " [hybrid]" : ""
-      … (9 removed lines truncated in the source view)
+      const pathLabel = args.path ? ` path:"${args.path}"` : ""
+      let output = `## Search Results for: "${args.query}" (${scope}${hybridLabel}${pathLabel})\n\n`
+
+      if (topScore < 0.45) {
+        output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords, different phrasing, or hybrid: true.\n\n`
+      }
+
+      for (let i = 0; i < sortedGroups.length; i++) {
+        const { best: r, chunks } = sortedGroups[i]
+        const score = (r._finalScore ?? 0).toFixed(3)
         const indexLabel = args.searchAll ? ` [${r._index}]` : ""
+        const chunkNote = chunks.length > 1 ? ` (${chunks.length} matching sections)` : ""

         // v2: show rich metadata when available
         const metaParts: string[] = []
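The rerank step is a bounded keyword boost on top of the vector (or hybrid) score: the fraction of query keywords longer than two characters that literally appear in the chunk text, scaled by 0.15, so exact-term matches can add at most +0.15. Results are then deduplicated to one entry per file, keeping the best chunk. A standalone sketch of the same arithmetic, isolated from the tool (the Chunk shape and sample data are hypothetical):

// Sketch of the keyword-bonus reranking, with hypothetical result objects.
interface Chunk { file: string; content: string; baseScore: number }

function rerank(query: string, chunks: Chunk[]): Array<Chunk & { finalScore: number }> {
  const keywords = query.toLowerCase().split(/\s+/).filter(w => w.length > 2)
  return chunks
    .map(c => {
      const text = c.content.toLowerCase()
      const matched = keywords.filter(kw => text.includes(kw)).length
      // At most +0.15: the full bonus only when every keyword appears verbatim.
      const bonus = keywords.length > 0 ? (matched / keywords.length) * 0.15 : 0
      return { ...c, finalScore: c.baseScore + bonus }
    })
    .sort((a, b) => b.finalScore - a.finalScore)
}

// "tenant" and "model" both match → +0.15; only "tenant" → +0.075.
rerank("tenant model", [
  { file: "a.ts", content: "tenant model lookup", baseScore: 0.50 }, // → 0.65
  { file: "b.ts", content: "tenant cache",        baseScore: 0.55 }, // → 0.625
])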
@@ -172,13 +243,36 @@ Examples:
         if (r.class_name) metaParts.push(`class: ${r.class_name}`)
         const metaLine = metaParts.length > 0 ? ` (${metaParts.join(", ")})` : ""

-        … (2 removed lines truncated in the source view)
+        // Score breakdown: vector + bm25 + keywords
+        const breakdownParts: string[] = [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
+        if (r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
+        if (r._keywordBonus > 0.005) breakdownParts.push(`kw: +${r._keywordBonus.toFixed(2)}`)
+        const breakdown = breakdownParts.join(", ")
+
+        // Matched keywords
+        const kwDisplay = r._matchedKeywords && r._matchedKeywords.length > 0
+          ? ` | matched: "${r._matchedKeywords.join('", "')}"`
+          : ""
+
+        output += `### ${i + 1}. ${r.file}${indexLabel}${chunkNote}\n`
+        output += `**Score:** ${score} (${breakdown}${kwDisplay})${metaLine}\n\n`
         output += "```\n"
         const content = r.content.length > 500 ? r.content.substring(0, 500) + "\n... (truncated)" : r.content
         output += content
         output += "\n```\n"

+        // Show second-best chunk from same file if available (brief)
+        if (chunks.length > 1) {
+          const second = chunks.find((c: any) => c !== r)
+          if (second) {
+            const secMeta: string[] = []
+            if (second.function_name) secMeta.push(`fn: ${second.function_name}`)
+            if (second.heading_context) secMeta.push(`"${second.heading_context}"`)
+            const secLabel = secMeta.length > 0 ? ` ${secMeta.join(", ")}` : ""
+            output += `\n*Also:${secLabel}*\n`
+          }
+        }
+
         if (r.relatedContext && r.relatedContext.length > 0) {
           output += "\n**Related Context:**\n"
           for (const rel of r.relatedContext) {
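Taken together, each rendered entry now leads with a score breakdown and the keywords that matched. An illustrative rendering assembled from the template strings above, with invented file names and numbers (vec 0.46 + bm25 0.10 + kw 0.15 = 0.710):

### 1. internal/domain/tenant.go (3 matching sections)
**Score:** 0.710 (vec: 0.46, bm25: +0.10, kw: +0.15 | matched: "tenant", "model") (fn: LoadTenant)

```
func LoadTenant(ctx context.Context, id string) (*Tenant, error) { ... }
```

*Also: fn: SaveTenant*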
@@ -193,7 +287,9 @@ Examples:
           output += "\n"
         }

-      … (1 removed line truncated in the source view)
+      const totalChunks = allResults.length
+      const uniqueFiles = sortedGroups.length
+      output += `---\n*${uniqueFiles} files (${totalChunks} chunks). Use Read tool to see full files.*`
       return output
     } catch (error: any) {
       return `Search failed: ${error.message || String(error)}`
package/vectorizer/index.ts
CHANGED

@@ -239,7 +239,7 @@ async function loadConfig(projectRoot) {
   }

   // Parse vectorizer section from YAML
-  const vectorizerMatch = content.match(/^vectorizer:([\s\S]*?)(?=^[a-zA-Z_\-]…
+  const vectorizerMatch = content.match(/^vectorizer:([\s\S]*?)(?=^[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (!vectorizerMatch) {
     await ensureDefaultConfig(projectRoot);
     return;

@@ -255,7 +255,7 @@ async function loadConfig(projectRoot) {
   }

   // ── Parse cleaning config ───────────────────────────────────────────────
-  const cleaningMatch = section.match(/^\s{2}cleaning:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]…
+  const cleaningMatch = section.match(/^\s{2}cleaning:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (cleaningMatch) {
     const cs = cleaningMatch[1];
     CLEANING_CONFIG = {

@@ -267,7 +267,7 @@ async function loadConfig(projectRoot) {
   }

   // ── Parse chunking config ───────────────────────────────────────────────
-  const chunkingMatch = section.match(/^\s{2}chunking:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]…
+  const chunkingMatch = section.match(/^\s{2}chunking:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (chunkingMatch) {
     const cs = chunkingMatch[1];
     const strategy = parseString(cs, "strategy", "semantic");

@@ -292,7 +292,7 @@ async function loadConfig(projectRoot) {
   }

   // ── Parse search config ─────────────────────────────────────────────────
-  const searchMatch = section.match(/^\s{2}search:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]…
+  const searchMatch = section.match(/^\s{2}search:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (searchMatch) {
     const ss = searchMatch[1];
     HYBRID_CONFIG = {

@@ -302,7 +302,7 @@ async function loadConfig(projectRoot) {
   }

   // ── Parse quality config ────────────────────────────────────────────────
-  const qualityMatch = section.match(/^\s{2}quality:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]…
+  const qualityMatch = section.match(/^\s{2}quality:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (qualityMatch) {
     const qs = qualityMatch[1];
     METRICS_ENABLED = parseBool(qs, "enable_metrics", false);

@@ -310,7 +310,7 @@ async function loadConfig(projectRoot) {
   }

   // ── Parse graph config (v3) ──────────────────────────────────────────────
-  const graphMatch = section.match(/^\s{2}graph:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]…
+  const graphMatch = section.match(/^\s{2}graph:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (graphMatch) {
     const gs = graphMatch[1];
     GRAPH_CONFIG.enabled = parseBool(gs, "enabled", DEFAULT_GRAPH_CONFIG.enabled);

@@ -321,7 +321,7 @@ async function loadConfig(projectRoot) {
   GRAPH_CONFIG.read_intercept = parseBool(gs, "read_intercept", DEFAULT_GRAPH_CONFIG.read_intercept);

   // Nested lsp: section
-  const lspMatch = gs.match(/^\s+lsp:\s*\n([\s\S]*?)(?=^\s{4}[a-zA-Z_\-]…
+  const lspMatch = gs.match(/^\s+lsp:\s*\n([\s\S]*?)(?=^\s{4}[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (lspMatch) {
     const ls = lspMatch[1];
     GRAPH_CONFIG.lsp.enabled = parseBool(ls, "enabled", DEFAULT_GRAPH_CONFIG.lsp.enabled);

@@ -342,7 +342,7 @@ async function loadConfig(projectRoot) {
   }

   // Parse indexes section
-  const indexesMatch = section.match(/^\s{2}indexes:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\s{2}exclude…
+  const indexesMatch = section.match(/^\s{2}indexes:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\s{2}exclude:|(?![\s\S]))/m);
   if (!indexesMatch) return;

   const indexesSection = indexesMatch[1];

@@ -350,7 +350,7 @@ async function loadConfig(projectRoot) {
   // Parse each index (code, docs, config)
   for (const indexName of ["code", "docs", "config"]) {
     const indexRegex = new RegExp(
-      `^\\s{4}${indexName}:\\s*\\n([\\s\\S]*?)(?=^\\s{4}[a-zA-Z_\\-]…
+      `^\\s{4}${indexName}:\\s*\\n([\\s\\S]*?)(?=^\\s{4}[a-zA-Z_\\-]+:|(?![\\s\\S]))`,
       "m",
     );
     const indexMatch = indexesSection.match(indexRegex);
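All eight regex changes in this file swap the section terminator for (?![\s\S]). With the m flag, $ matches before every newline, so a terminator built on $ can cut a section short at the first line break; (?![\s\S]) succeeds only where no character of any kind follows, i.e. the true end of input. (The removed patterns are truncated in this view, so the exact old terminator isn't visible.) A small check of the new behavior, with a hypothetical YAML sample:

// The new lookahead stops at the next top-level `key:` line or at the true
// end of input — never at a mere line break.
const yaml = [
  "vectorizer:",
  "  model: all-MiniLM-L6-v2",
  "  cleaning:",
  "    enabled: true",
].join("\n")

const m = yaml.match(/^vectorizer:([\s\S]*?)(?=^[a-zA-Z_\-]+:|(?![\s\S]))/m)
console.log(m?.[1])
// "\n  model: all-MiniLM-L6-v2\n  cleaning:\n    enabled: true"
// Indented keys never satisfy `^[a-zA-Z_\-]+:`, so the capture runs to EOF;
// appending another top-level key (e.g. "indexes:") stops it at that line.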