@comfanion/usethis_search 3.0.0-dev.8 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/tools/search.ts CHANGED
@@ -1,8 +1,8 @@
1
1
  /**
2
- * Semantic Code Search Tool (v2)
2
+ * Semantic Code Search Tool (v3)
3
3
  *
4
4
  * Uses local embeddings + LanceDB vector store via bundled vectorizer.
5
- * v2: hybrid search, metadata filtering, rich result metadata.
5
+ * v3: simplified agent API — 5 params, config-driven defaults, smart filter.
6
6
  * Index data is stored in `.opencode/vectors/<index>/`.
7
7
  */
8
8
 
@@ -10,7 +10,82 @@ import { tool } from "@opencode-ai/plugin"
10
10
  import path from "path"
11
11
  import fs from "fs/promises"
12
12
 
13
- import { CodebaseIndexer } from "../vectorizer/index.js"
13
+ import { CodebaseIndexer, getSearchConfig, getIndexer, releaseIndexer } from "../vectorizer/index.ts"
14
+
15
+ // ── Extension → language mapping (for filter parsing) ─────────────────────
16
+ const EXT_TO_LANG: Record<string, string> = {
17
+ go: "go", py: "python", ts: "typescript", tsx: "typescript",
18
+ js: "javascript", jsx: "javascript", mjs: "javascript", cjs: "javascript",
19
+ rs: "rust", java: "java", kt: "kotlin", swift: "swift",
20
+ c: "c", cpp: "cpp", h: "c", hpp: "cpp", cs: "csharp",
21
+ rb: "ruby", php: "php", scala: "scala", clj: "clojure",
22
+ md: "markdown", mdx: "markdown", txt: "text",
23
+ yaml: "yaml", yml: "yaml", json: "json", toml: "toml",
24
+ }
25
+ const LANG_NAMES = new Set(Object.values(EXT_TO_LANG))
26
+
27
+ /**
28
+ * Parse the `filter` param into path prefix and/or language filter.
29
+ *
30
+ * Supported formats:
31
+ * "internal/domain/" → pathPrefix = "internal/domain"
32
+ * "*.go" → language = "go"
33
+ * ".go" → language = "go"
34
+ * "go" → language = "go"
35
+ * "internal/*.go" → pathPrefix = "internal", language = "go"
36
+ * "internal/**\/*.go" → pathPrefix = "internal", language = "go"
37
+ * "service" → pathContains = "service"
38
+ */
39
+ function parseFilter(filter: string): {
40
+ pathPrefix?: string
41
+ language?: string
42
+ pathContains?: string
43
+ } {
44
+ if (!filter) return {}
45
+
46
+ const f = filter.trim()
47
+
48
+ // "internal/**/*.go" or "internal/*.go" → path + extension
49
+ const globMatch = f.match(/^([^*]+?)(?:\/\*\*)?\/?\*\.(\w+)$/)
50
+ if (globMatch) {
51
+ const prefix = globMatch[1].replace(/\/+$/, "")
52
+ const ext = globMatch[2]
53
+ return {
54
+ pathPrefix: prefix,
55
+ language: EXT_TO_LANG[ext] || undefined,
56
+ }
57
+ }
58
+
59
+ // "*.go" or ".go" → extension only
60
+ const extMatch = f.match(/^\*?\.(\w+)$/)
61
+ if (extMatch) {
62
+ const ext = extMatch[1]
63
+ return { language: EXT_TO_LANG[ext] || undefined }
64
+ }
65
+
66
+ // "go", "python", "typescript" → language name
67
+ const lower = f.toLowerCase()
68
+ if (LANG_NAMES.has(lower)) {
69
+ return { language: lower }
70
+ }
71
+ // "go" could also be ext
72
+ if (EXT_TO_LANG[lower]) {
73
+ return { language: EXT_TO_LANG[lower] }
74
+ }
75
+
76
+ // Ends with "/" → path prefix
77
+ if (f.endsWith("/")) {
78
+ return { pathPrefix: f.replace(/\/+$/, "") }
79
+ }
80
+
81
+ // Contains "/" → path prefix (e.g. "internal/domain")
82
+ if (f.includes("/")) {
83
+ return { pathPrefix: f.replace(/\/+$/, "") }
84
+ }
85
+
86
+ // Anything else → substring match on file path
87
+ return { pathContains: f }
88
+ }
14
89
 
15
90
  export default tool({
16
91
  description: `Search the codebase semantically. Use this to find relevant code snippets, functions, or files based on meaning, not just text matching.
@@ -22,74 +97,74 @@ Available indexes:
22
97
  - searchAll: true - Search across all indexes
23
98
 
24
99
  Examples:
25
- - "authentication logic" → finds auth-related code
26
- - "database connection handling" → finds DB setup code
27
- - "how to deploy" with index: "docs" → finds deployment docs
28
- - "API keys" with index: "config" → finds config with API settings`,
100
+ - search({ query: "authentication logic" })
101
+ - search({ query: "how to deploy", index: "docs" })
102
+ - search({ query: "tenant management", filter: "internal/domain/" })
103
+ - search({ query: "event handling", filter: "*.go" })
104
+ - search({ query: "API routes", filter: "internal/**/*.go" })
105
+ - search({ query: "metrics", searchAll: true })`,
29
106
 
30
107
  args: {
31
108
  query: tool.schema.string().describe("Semantic search query describing what you're looking for"),
32
- index: tool.schema.string().optional().default("code").describe("Index to search: code, docs, config, or custom name"),
33
- limit: tool.schema.number().optional().default(10).describe("Number of results to return (default: 10)"),
109
+ index: tool.schema.string().optional().default("code").describe("Index to search: code, docs, config"),
110
+ limit: tool.schema.number().optional().describe("Number of results (default from config, typically 10)"),
34
111
  searchAll: tool.schema.boolean().optional().default(false).describe("Search all indexes instead of just one"),
35
- freshen: tool.schema.boolean().optional().default(true).describe("Auto-update stale files before searching (default: true)"),
36
- includeArchived: tool.schema.boolean().optional().default(false).describe("Include archived files in results (default: false). Files are archived if in /archive/ folder or have 'archived: true' in frontmatter."),
37
- // v2 params
38
- hybrid: tool.schema.boolean().optional().describe("Enable hybrid search (vector + BM25 keyword matching). Improves exact keyword recall."),
39
- fileType: tool.schema.string().optional().describe("Filter by file type: 'code', 'docs', or 'config'"),
40
- language: tool.schema.string().optional().describe("Filter by language: 'typescript', 'python', 'markdown', etc."),
41
- modifiedAfter: tool.schema.string().optional().describe("Filter: only files modified after this ISO date (e.g. '2024-01-01')"),
42
- modifiedBefore: tool.schema.string().optional().describe("Filter: only files modified before this ISO date"),
43
- tags: tool.schema.string().optional().describe("Filter by frontmatter tags (comma-separated, e.g. 'auth,security')"),
112
+ filter: tool.schema.string().optional().describe("Filter results by path or language. Examples: 'internal/domain/', '*.go', 'internal/**/*.go', 'service'"),
44
113
  },
45
114
 
46
115
  async execute(args) {
47
116
  const projectRoot = process.cwd()
48
117
 
49
118
  try {
50
- let allResults: any[] = []
51
- const limit = args.limit || 10
119
+ // Load config defaults (parsed from vectorizer.yaml)
120
+ const cfg = getSearchConfig()
121
+ const limit = args.limit || cfg.default_limit || 10
52
122
  const indexName = args.index || "code"
123
+ const minScore = cfg.min_score ?? 0.35
124
+ const includeArchived = cfg.include_archived ?? false
125
+
126
+ // Parse filter into path/language constraints
127
+ const filterParsed = args.filter ? parseFilter(args.filter) : {}
53
128
 
54
- // Build search options from v2 params
129
+ // Build search options — hybrid is always from per-index config
55
130
  const searchOptions: Record<string, any> = {}
56
- if (args.hybrid != null) searchOptions.hybrid = args.hybrid
57
- if (args.fileType) searchOptions.fileType = args.fileType
58
- if (args.language) searchOptions.language = args.language
59
- if (args.modifiedAfter) searchOptions.modifiedAfter = args.modifiedAfter
60
- if (args.modifiedBefore) searchOptions.modifiedBefore = args.modifiedBefore
61
- if (args.tags) searchOptions.tags = args.tags.split(",").map((t: string) => t.trim()).filter(Boolean)
62
-
63
- // Auto-freshen stale files before searching
64
- if (args.freshen !== false) {
65
- const tempIndexer = await new CodebaseIndexer(projectRoot, indexName).init()
66
- await tempIndexer.freshen()
67
- await tempIndexer.unloadModel()
131
+ if (filterParsed.language) searchOptions.language = filterParsed.language
132
+
133
+ // Freshen from config (default: false — auto_index handles it)
134
+ if (cfg.freshen) {
135
+ try {
136
+ const indexer = await getIndexer(projectRoot, indexName)
137
+ await indexer.freshen()
138
+ releaseIndexer(projectRoot, indexName)
139
+ } catch {
140
+ // non-fatal — search can proceed without freshen
141
+ }
68
142
  }
69
143
 
144
+ let allResults: any[] = []
145
+
70
146
  if (args.searchAll) {
71
- const tempIndexer = await new CodebaseIndexer(projectRoot, "code").init()
147
+ const tempIndexer = await getIndexer(projectRoot, "code")
72
148
  const indexes = await tempIndexer.listIndexes()
73
- await tempIndexer.unloadModel()
149
+ releaseIndexer(projectRoot, "code")
74
150
 
75
151
  if (indexes.length === 0) {
76
- return `No indexes found. Create one with: codeindex({ action: "reindex", index: "code" })`
152
+ return `No indexes found. The codebase needs to be indexed first.\n\nRun the CLI: bunx usethis_search reindex`
77
153
  }
78
154
 
79
155
  for (const idx of indexes) {
80
- const indexer = await new CodebaseIndexer(projectRoot, idx).init()
81
- if (args.freshen !== false) {
82
- await indexer.freshen()
156
+ const indexer = await getIndexer(projectRoot, idx)
157
+ try {
158
+ const results = await indexer.search(args.query, limit, includeArchived, searchOptions)
159
+ allResults.push(...results.map((r: any) => ({ ...r, _index: idx })))
160
+ } finally {
161
+ releaseIndexer(projectRoot, idx)
83
162
  }
84
- const results = await indexer.search(args.query, limit, args.includeArchived, searchOptions)
85
- allResults.push(...results.map((r: any) => ({ ...r, _index: idx })))
86
- await indexer.unloadModel()
87
163
  }
88
164
 
89
165
  allResults.sort((a, b) => {
90
- // Prefer combinedScore (hybrid), fall back to distance
91
- const scoreA = a._combinedScore ?? (a._distance != null ? 1 - a._distance : 0)
92
- const scoreB = b._combinedScore ?? (b._distance != null ? 1 - b._distance : 0)
166
+ const scoreA = a._combinedScore ?? (a._distance != null ? Math.max(0, 1 - a._distance / 2) : 0)
167
+ const scoreB = b._combinedScore ?? (b._distance != null ? Math.max(0, 1 - b._distance / 2) : 0)
93
168
  return scoreB - scoreA
94
169
  })
95
170
  allResults = allResults.slice(0, limit)
@@ -98,34 +173,120 @@ Examples:
98
173
  try {
99
174
  await fs.access(hashesFile)
100
175
  } catch {
101
- return `Index "${indexName}" not found. Create it with: codeindex({ action: "reindex", index: "${indexName}" })`
176
+ // Index doesn't exist — check what indexes ARE available
177
+ const tempIndexer = await getIndexer(projectRoot, "code")
178
+ const available = await tempIndexer.listIndexes()
179
+ releaseIndexer(projectRoot, "code")
180
+
181
+ if (available.length > 0) {
182
+ const list = available.map(i => `"${i}"`).join(", ")
183
+ return `Index "${indexName}" not found. Available indexes: ${list}.\n\nTry: search({ query: "${args.query}", index: "${available[0]}" })\nOr search all: search({ query: "${args.query}", searchAll: true })`
184
+ }
185
+ return `No indexes found. The codebase needs to be indexed first.\n\nRun the CLI: bunx usethis_search reindex`
186
+ }
187
+
188
+ const indexer = await getIndexer(projectRoot, indexName)
189
+ try {
190
+ const results = await indexer.search(args.query, limit, includeArchived, searchOptions)
191
+ allResults = results.map((r: any) => ({ ...r, _index: indexName }))
192
+ } finally {
193
+ releaseIndexer(projectRoot, indexName)
102
194
  }
195
+ }
103
196
 
104
- const indexer = await new CodebaseIndexer(projectRoot, indexName).init()
105
- const results = await indexer.search(args.query, limit, args.includeArchived, searchOptions)
106
- allResults = results.map((r: any) => ({ ...r, _index: indexName }))
107
- await indexer.unloadModel()
197
+ // ── Score cutoff — drop low-relevance results ──────────────────────────
198
+ allResults = allResults.filter(r => {
199
+ const score = r._combinedScore ?? (r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0)
200
+ return score >= minScore
201
+ })
202
+
203
+ // ── Filter — apply path/language constraints from `filter` param ───────
204
+ if (filterParsed.pathPrefix) {
205
+ const prefix = filterParsed.pathPrefix
206
+ allResults = allResults.filter(r => r.file && r.file.startsWith(prefix))
207
+ }
208
+ if (filterParsed.pathContains) {
209
+ const needle = filterParsed.pathContains.toLowerCase()
210
+ allResults = allResults.filter(r => r.file && r.file.toLowerCase().includes(needle))
108
211
  }
212
+ // Language filter is already passed to searchOptions above, but double-check
213
+ // in case vectorizer didn't filter (e.g. docs index has no language field)
214
+ if (filterParsed.language) {
215
+ allResults = allResults.filter(r => !r.language || r.language === filterParsed.language || r.language === "unknown")
216
+ }
217
+
218
+ // ── Reranking — boost results where query keywords appear in text ──────
219
+ const queryKeywords = args.query.toLowerCase().split(/\s+/).filter((w: string) => w.length > 2)
220
+ for (const r of allResults) {
221
+ const isBM25Only = !!r._bm25Only
222
+ const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
223
+ r._vectorScore = vectorScore
224
+ r._bm25Component = isBM25Only
225
+ ? (r._combinedScore ?? 0)
226
+ : (r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0)
227
+ const baseScore = r._combinedScore ?? vectorScore
109
228
 
110
- if (allResults.length === 0) {
229
+ const text = (r.content || "").toLowerCase()
230
+ const matchedKeywords: string[] = []
231
+ if (queryKeywords.length > 0) {
232
+ for (const kw of queryKeywords) {
233
+ if (text.includes(kw)) matchedKeywords.push(kw)
234
+ }
235
+ }
236
+ r._matchedKeywords = matchedKeywords
237
+ const keywordBonus = queryKeywords.length > 0 ? (matchedKeywords.length / queryKeywords.length) * 0.15 : 0
238
+ r._keywordBonus = keywordBonus
239
+ r._finalScore = baseScore + keywordBonus
240
+ }
241
+ allResults.sort((a: any, b: any) => (b._finalScore ?? 0) - (a._finalScore ?? 0))
242
+
243
+ // ── Group by file — best chunk per file, with chunk count ─────────────
244
+ const fileGroups = new Map<string, { best: any, chunks: any[] }>()
245
+ for (const r of allResults) {
246
+ const key = r.file
247
+ if (!fileGroups.has(key)) {
248
+ fileGroups.set(key, { best: r, chunks: [r] })
249
+ } else {
250
+ const group = fileGroups.get(key)!
251
+ group.chunks.push(r)
252
+ if ((r._finalScore ?? 0) > (group.best._finalScore ?? 0)) {
253
+ group.best = r
254
+ }
255
+ }
256
+ }
257
+
258
+ const sortedGroups = [...fileGroups.values()]
259
+ .sort((a, b) => (b.best._finalScore ?? 0) - (a.best._finalScore ?? 0))
260
+ .slice(0, limit)
261
+
262
+ if (sortedGroups.length === 0) {
111
263
  const scope = args.searchAll ? "any index" : `index "${indexName}"`
112
- return `No results found in ${scope} for: "${args.query}"\n\nTry:\n- Different keywords\n- Enable hybrid search: search({ query: "...", hybrid: true })\n- Re-index with: codeindex({ action: "reindex", index: "${indexName}" })`
264
+ const filterNote = args.filter ? ` with filter "${args.filter}"` : ""
265
+ return `No results found in ${scope}${filterNote} for: "${args.query}" (min score: ${minScore})\n\nTry:\n- Different keywords or phrasing\n- Remove or broaden the filter\n- search({ query: "...", searchAll: true })`
113
266
  }
114
267
 
268
+ // ── Confidence signal ──────────────────────────────────────────────────
269
+ const topScore = sortedGroups[0].best._finalScore ?? 0
270
+ const hasBM25Only = allResults.some((r: any) => r._bm25Only)
115
271
  const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
116
- const hybridLabel = args.hybrid ? " [hybrid]" : ""
117
- let output = `## Search Results for: "${args.query}" (${scope}${hybridLabel})\n\n`
118
-
119
- for (let i = 0; i < allResults.length; i++) {
120
- const r = allResults[i]
121
- const score = r._combinedScore != null
122
- ? r._combinedScore.toFixed(3)
123
- : r._distance != null
124
- ? (1 - r._distance).toFixed(3)
125
- : "N/A"
272
+ const filterLabel = args.filter ? ` filter:"${args.filter}"` : ""
273
+ let output = `## Search Results for: "${args.query}" (${scope}${filterLabel})\n\n`
274
+
275
+ if (hasBM25Only) {
276
+ output += `> **BM25-only mode** — vector embeddings not yet available. Results are keyword-based. Quality will improve after embedding completes.\n\n`
277
+ }
278
+
279
+ if (topScore < 0.45) {
280
+ output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords or different phrasing.\n\n`
281
+ }
282
+
283
+ for (let i = 0; i < sortedGroups.length; i++) {
284
+ const { best: r, chunks } = sortedGroups[i]
285
+ const score = (r._finalScore ?? 0).toFixed(3)
126
286
  const indexLabel = args.searchAll ? ` [${r._index}]` : ""
287
+ const chunkNote = chunks.length > 1 ? ` (${chunks.length} matching sections)` : ""
127
288
 
128
- // v2: show rich metadata when available
289
+ // Rich metadata
129
290
  const metaParts: string[] = []
130
291
  if (r.language && r.language !== "unknown") metaParts.push(r.language)
131
292
  if (r.heading_context) metaParts.push(`"${r.heading_context}"`)
@@ -133,13 +294,38 @@ Examples:
133
294
  if (r.class_name) metaParts.push(`class: ${r.class_name}`)
134
295
  const metaLine = metaParts.length > 0 ? ` (${metaParts.join(", ")})` : ""
135
296
 
136
- output += `### ${i + 1}. ${r.file}${indexLabel}\n`
137
- output += `**Relevance:** ${score}${metaLine}\n\n`
297
+ // Score breakdown
298
+ const breakdownParts: string[] = r._bm25Only
299
+ ? [`bm25: ${(r._bm25Component ?? 0).toFixed(2)}`]
300
+ : [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
301
+ if (!r._bm25Only && r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
302
+ if (r._keywordBonus > 0.005) breakdownParts.push(`kw: +${r._keywordBonus.toFixed(2)}`)
303
+ const breakdown = breakdownParts.join(", ")
304
+
305
+ // Matched keywords
306
+ const kwDisplay = r._matchedKeywords && r._matchedKeywords.length > 0
307
+ ? ` | matched: "${r._matchedKeywords.join('", "')}"`
308
+ : ""
309
+
310
+ output += `### ${i + 1}. ${r.file}${indexLabel}${chunkNote}\n`
311
+ output += `**Score:** ${score} (${breakdown}${kwDisplay})${metaLine}\n\n`
138
312
  output += "```\n"
139
313
  const content = r.content.length > 500 ? r.content.substring(0, 500) + "\n... (truncated)" : r.content
140
314
  output += content
141
315
  output += "\n```\n"
142
316
 
317
+ // Second-best chunk hint
318
+ if (chunks.length > 1) {
319
+ const second = chunks.find((c: any) => c !== r)
320
+ if (second) {
321
+ const secMeta: string[] = []
322
+ if (second.function_name) secMeta.push(`fn: ${second.function_name}`)
323
+ if (second.heading_context) secMeta.push(`"${second.heading_context}"`)
324
+ const secLabel = secMeta.length > 0 ? ` ${secMeta.join(", ")}` : ""
325
+ output += `\n*Also:${secLabel}*\n`
326
+ }
327
+ }
328
+
143
329
  if (r.relatedContext && r.relatedContext.length > 0) {
144
330
  output += "\n**Related Context:**\n"
145
331
  for (const rel of r.relatedContext) {
@@ -154,7 +340,9 @@ Examples:
154
340
  output += "\n"
155
341
  }
156
342
 
157
- output += `---\n*Found ${allResults.length} results. Use Read tool to see full files.*`
343
+ const totalChunks = allResults.length
344
+ const uniqueFiles = sortedGroups.length
345
+ output += `---\n*${uniqueFiles} files (${totalChunks} chunks). Use Read tool to see full files.*`
158
346
  return output
159
347
  } catch (error: any) {
160
348
  return `Search failed: ${error.message || String(error)}`
@@ -12,7 +12,7 @@
12
12
 
13
13
  import path from "path"
14
14
  import fs from "fs/promises"
15
- import { ChunkWithId } from "../graph-builder"
15
+ import { ChunkWithId, buildDefaultChunkId } from "../graph-builder"
16
16
  import { LSPClient, LSPSymbolInformation, SymbolKind } from "./lsp-client"
17
17
 
18
18
  export interface Relation {
@@ -252,7 +252,9 @@ export class LSPAnalyzer {
252
252
  return result
253
253
  }
254
254
 
255
- /** Convert LSP location URI + line → chunk_id. */
255
+ /** Convert LSP location URI + line → chunk_id.
256
+ * For same-file refs, resolves to exact chunk by line.
257
+ * For cross-file refs, returns the default (first) chunk of the target file. */
256
258
  private locationToChunkId(currentFile: string, uri: string, line: number, root: string): string | null {
257
259
  // uri = file:///absolute/path/to/file.ts
258
260
  const filePath = uri.startsWith("file://") ? uri.slice(7) : uri
@@ -261,11 +263,9 @@ export class LSPAnalyzer {
261
263
  // Skip external files (node_modules, etc.)
262
264
  if (relPath.startsWith("..") || relPath.includes("node_modules")) return null
263
265
 
264
- const withoutExt = relPath.replace(/\.[^/.]+$/, "")
265
- const normalized = withoutExt.replace(/[^a-zA-Z0-9]/g, "_")
266
- // For cross-file references, point to chunk 0 (first chunk of target file)
267
- // For same-file, we could be more precise but chunk 0 is sufficient for graph
268
- return `chunk_${normalized}_0`
266
+ // Same file → use findChunkForPosition (called separately with chunks)
267
+ // Cross-file → default chunk
268
+ return buildDefaultChunkId(relPath)
269
269
  }
270
270
 
271
271
  private findChunkForPosition(chunks: ChunkWithId[], line: number): string | null {