@comfanion/usethis_search 3.0.0-dev.23 → 3.0.0-dev.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@comfanion/usethis_search",
3
- "version": "3.0.0-dev.23",
3
+ "version": "3.0.0-dev.24",
4
4
  "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
5
5
  "type": "module",
6
6
  "main": "./index.ts",
package/tools/search.ts CHANGED
@@ -1,8 +1,8 @@
1
1
  /**
2
- * Semantic Code Search Tool (v2)
2
+ * Semantic Code Search Tool (v3)
3
3
  *
4
4
  * Uses local embeddings + LanceDB vector store via bundled vectorizer.
5
- * v2: hybrid search, metadata filtering, rich result metadata.
5
+ * v3: simplified agent API — 5 params, config-driven defaults, smart filter.
6
6
  * Index data is stored in `.opencode/vectors/<index>/`.
7
7
  */
8
8
 
@@ -10,7 +10,82 @@ import { tool } from "@opencode-ai/plugin"
10
10
  import path from "path"
11
11
  import fs from "fs/promises"
12
12
 
13
- import { CodebaseIndexer } from "../vectorizer/index.ts"
13
+ import { CodebaseIndexer, getSearchConfig } from "../vectorizer/index.ts"
14
+
15
// ── Extension → language mapping (for filter parsing) ─────────────────────
const EXT_TO_LANG: Record<string, string> = {
  go: "go", py: "python", ts: "typescript", tsx: "typescript",
  js: "javascript", jsx: "javascript", mjs: "javascript", cjs: "javascript",
  rs: "rust", java: "java", kt: "kotlin", swift: "swift",
  c: "c", cpp: "cpp", h: "c", hpp: "cpp", cs: "csharp",
  rb: "ruby", php: "php", scala: "scala", clj: "clojure",
  md: "markdown", mdx: "markdown", txt: "text",
  yaml: "yaml", yml: "yaml", json: "json", toml: "toml",
}
const LANG_NAMES = new Set(Object.values(EXT_TO_LANG))

/**
 * Resolve a file extension (without the dot) to a language name.
 *
 * The lookup is case-insensitive and restricted to the table's own keys, so
 * inherited members such as "constructor" or "__proto__" are never returned.
 *
 * @param ext Extension without the leading dot, e.g. "go" or "TS".
 * @returns The mapped language name, or undefined when the extension is unknown.
 */
function languageForExtension(ext: string): string | undefined {
  const key = ext.toLowerCase()
  return Object.prototype.hasOwnProperty.call(EXT_TO_LANG, key) ? EXT_TO_LANG[key] : undefined
}

/**
 * Turn an extension into a filter constraint.
 *
 * Known extensions become a language constraint. Unknown ones fall back to a
 * ".ext" substring match on the file path so the user's filter is not dropped.
 */
function extensionConstraint(ext: string): { language?: string; pathContains?: string } {
  const language = languageForExtension(ext)
  return language ? { language } : { pathContains: `.${ext}` }
}

/**
 * Parse the `filter` param into path prefix, language and/or substring filter.
 *
 * Supported formats:
 *   "internal/domain/"   → pathPrefix = "internal/domain"
 *   "internal/**"        → pathPrefix = "internal"
 *   "*.go"               → language = "go"
 *   ".go"                → language = "go"
 *   "**\/*.go"           → language = "go"
 *   "go"                 → language = "go"
 *   "internal/*.go"      → pathPrefix = "internal", language = "go"
 *   "internal/**\/*.go"  → pathPrefix = "internal", language = "go"
 *   "*.sql"              → pathContains = ".sql" (extension not in the table)
 *   "service"            → pathContains = "service"
 *
 * @param filter Raw filter string supplied by the caller.
 * @returns The parsed constraints. Only keys that carry a non-empty value are
 *   present; an empty or whitespace-only filter yields an empty object.
 */
function parseFilter(filter: string): {
  pathPrefix?: string
  language?: string
  pathContains?: string
} {
  if (!filter) return {}

  const f = filter.trim()
  if (!f) return {}

  // "internal/**/*.go" or "internal/*.go" → path + extension
  const globMatch = f.match(/^([^*]+?)(?:\/\*\*)?\/?\*\.(\w+)$/)
  if (globMatch) {
    const prefix = (globMatch[1] ?? "").replace(/\/+$/, "")
    const constraint = extensionConstraint(globMatch[2] ?? "")
    return prefix ? { pathPrefix: prefix, ...constraint } : constraint
  }

  // "*.go", ".go" or "**/*.go" → extension only
  const extMatch = f.match(/^(?:\*\*\/)?\*?\.(\w+)$/)
  if (extMatch) {
    return extensionConstraint(extMatch[1] ?? "")
  }

  // "go", "python", "typescript" → language name
  const lower = f.toLowerCase()
  if (LANG_NAMES.has(lower)) {
    return { language: lower }
  }
  // Bare extension such as "ts" or "py"
  const languageFromExt = languageForExtension(lower)
  if (languageFromExt) {
    return { language: languageFromExt }
  }

  // Contains "/" → path prefix. Trailing slashes and a trailing "/*" or "/**"
  // are stripped because the caller matches with a plain startsWith().
  if (f.includes("/")) {
    const prefix = f.replace(/(?:\/+\*{1,2})?\/*$/, "")
    return prefix ? { pathPrefix: prefix } : {}
  }

  // Anything else → substring match on file path
  return { pathContains: f }
}
14
89
 
15
90
  export default tool({
16
91
  description: `Search the codebase semantically. Use this to find relevant code snippets, functions, or files based on meaning, not just text matching.
@@ -22,84 +97,51 @@ Available indexes:
22
97
  - searchAll: true - Search across all indexes
23
98
 
24
99
  Examples:
25
- - "authentication logic" → finds auth-related code
26
- - "database connection handling" → finds DB setup code
27
- - "how to deploy" with index: "docs" → finds deployment docs
28
- - "API keys" with index: "config" → finds config with API settings
29
- - search({ query: "tenant", path: "internal/domain/" }) → searches only in internal/domain/`,
100
+ - search({ query: "authentication logic" })
101
+ - search({ query: "how to deploy", index: "docs" })
102
+ - search({ query: "tenant management", filter: "internal/domain/" })
103
+ - search({ query: "event handling", filter: "*.go" })
104
+ - search({ query: "API routes", filter: "internal/**/*.go" })
105
+ - search({ query: "metrics", searchAll: true })`,
30
106
 
31
107
  args: {
32
108
  query: tool.schema.string().describe("Semantic search query describing what you're looking for"),
33
- index: tool.schema.string().optional().default("code").describe("Index to search: code, docs, config, or custom name"),
34
- limit: tool.schema.number().optional().default(10).describe("Number of results to return (default: 10)"),
109
+ index: tool.schema.string().optional().default("code").describe("Index to search: code, docs, config"),
110
+ limit: tool.schema.number().optional().describe("Number of results (default from config, typically 10)"),
35
111
  searchAll: tool.schema.boolean().optional().default(false).describe("Search all indexes instead of just one"),
36
- freshen: tool.schema.boolean().optional().default(true).describe("Auto-update stale files before searching (default: true)"),
37
- includeArchived: tool.schema.boolean().optional().default(false).describe("Include archived files in results (default: false). Files are archived if in /archive/ folder or have 'archived: true' in frontmatter."),
38
- // v2 params
39
- hybrid: tool.schema.boolean().optional().describe("Enable hybrid search (vector + BM25 keyword matching). Improves exact keyword recall."),
40
- fileType: tool.schema.string().optional().describe("Filter by file type: 'code', 'docs', or 'config'"),
41
- language: tool.schema.string().optional().describe("Filter by language: 'typescript', 'python', 'markdown', etc."),
42
- modifiedAfter: tool.schema.string().optional().describe("Filter: only files modified after this ISO date (e.g. '2024-01-01')"),
43
- modifiedBefore: tool.schema.string().optional().describe("Filter: only files modified before this ISO date"),
44
- tags: tool.schema.string().optional().describe("Filter by frontmatter tags (comma-separated, e.g. 'auth,security')"),
45
- minScore: tool.schema.number().optional().default(0.35).describe("Minimum relevance score (0-1). Results below this threshold are dropped. Default: 0.35"),
46
- path: tool.schema.string().optional().describe("Filter by file path prefix (e.g. 'internal/domain/', 'src/components'). Only returns files under this path."),
112
+ filter: tool.schema.string().optional().describe("Filter results by path or language. Examples: 'internal/domain/', '*.go', 'internal/**/*.go', 'service'"),
47
113
  },
48
114
 
49
115
  async execute(args) {
50
116
  const projectRoot = process.cwd()
51
117
 
52
118
  try {
53
- let allResults: any[] = []
54
- const limit = args.limit || 10
119
+ // Load config defaults (parsed from vectorizer.yaml)
120
+ const cfg = getSearchConfig()
121
+ const limit = args.limit || cfg.default_limit || 10
55
122
  const indexName = args.index || "code"
123
+ const minScore = cfg.min_score ?? 0.35
124
+ const includeArchived = cfg.include_archived ?? false
125
+
126
+ // Parse filter into path/language constraints
127
+ const filterParsed = args.filter ? parseFilter(args.filter) : {}
56
128
 
57
- // Build search options from v2 params
129
+ // Build search options — hybrid is always from per-index config
58
130
  const searchOptions: Record<string, any> = {}
59
- if (args.hybrid != null) searchOptions.hybrid = args.hybrid
60
-
61
- // Normalize fileType: support extensions (*.go, .go) and language names (go, python)
62
- // fileType field stores "code" | "docs" | "config", so map user-friendly values
63
- if (args.fileType) {
64
- const ft = args.fileType.replace(/^\*?\.?/, "").toLowerCase()
65
- const extToLanguage: Record<string, string> = {
66
- go: "go", py: "python", ts: "typescript", tsx: "typescript",
67
- js: "javascript", jsx: "javascript", mjs: "javascript", cjs: "javascript",
68
- rs: "rust", java: "java", kt: "kotlin", swift: "swift",
69
- c: "c", cpp: "cpp", h: "c", hpp: "cpp", cs: "csharp",
70
- rb: "ruby", php: "php", scala: "scala", clj: "clojure",
71
- md: "markdown", mdx: "markdown", txt: "text",
72
- yaml: "yaml", yml: "yaml", json: "json", toml: "toml",
73
- }
74
- // Also accept full language names
75
- const langNames = new Set([
76
- "go", "python", "typescript", "javascript", "rust", "java", "kotlin",
77
- "swift", "c", "cpp", "csharp", "ruby", "php", "scala", "clojure",
78
- "markdown", "text", "yaml", "json", "toml",
79
- ])
80
-
81
- if (ft === "code" || ft === "docs" || ft === "config") {
82
- searchOptions.fileType = ft
83
- } else if (extToLanguage[ft]) {
84
- searchOptions.language = extToLanguage[ft]
85
- } else if (langNames.has(ft)) {
86
- searchOptions.language = ft
87
- } else {
88
- searchOptions.fileType = ft // pass through as-is
131
+ if (filterParsed.language) searchOptions.language = filterParsed.language
132
+
133
+ // Freshen from config (default: false — auto_index handles it)
134
+ if (cfg.freshen) {
135
+ try {
136
+ const tempIndexer = await new CodebaseIndexer(projectRoot, indexName).init()
137
+ await tempIndexer.freshen()
138
+ await tempIndexer.unloadModel()
139
+ } catch {
140
+ // non-fatal — search can proceed without freshen
89
141
  }
90
142
  }
91
143
 
92
- if (args.language) searchOptions.language = args.language
93
- if (args.modifiedAfter) searchOptions.modifiedAfter = args.modifiedAfter
94
- if (args.modifiedBefore) searchOptions.modifiedBefore = args.modifiedBefore
95
- if (args.tags) searchOptions.tags = args.tags.split(",").map((t: string) => t.trim()).filter(Boolean)
96
-
97
- // Auto-freshen stale files before searching
98
- if (args.freshen !== false) {
99
- const tempIndexer = await new CodebaseIndexer(projectRoot, indexName).init()
100
- await tempIndexer.freshen()
101
- await tempIndexer.unloadModel()
102
- }
144
+ let allResults: any[] = []
103
145
 
104
146
  if (args.searchAll) {
105
147
  const tempIndexer = await new CodebaseIndexer(projectRoot, "code").init()
@@ -107,21 +149,17 @@ Examples:
107
149
  await tempIndexer.unloadModel()
108
150
 
109
151
  if (indexes.length === 0) {
110
- return `No indexes found. Create one with: codeindex({ action: "reindex", index: "code" })`
152
+ return `No indexes found. The codebase needs to be indexed first.\n\nRun the CLI: bunx usethis_search reindex`
111
153
  }
112
154
 
113
155
  for (const idx of indexes) {
114
156
  const indexer = await new CodebaseIndexer(projectRoot, idx).init()
115
- if (args.freshen !== false) {
116
- await indexer.freshen()
117
- }
118
- const results = await indexer.search(args.query, limit, args.includeArchived, searchOptions)
157
+ const results = await indexer.search(args.query, limit, includeArchived, searchOptions)
119
158
  allResults.push(...results.map((r: any) => ({ ...r, _index: idx })))
120
159
  await indexer.unloadModel()
121
160
  }
122
161
 
123
162
  allResults.sort((a, b) => {
124
- // Prefer combinedScore (hybrid), fall back to L2→similarity conversion
125
163
  const scoreA = a._combinedScore ?? (a._distance != null ? Math.max(0, 1 - a._distance / 2) : 0)
126
164
  const scoreB = b._combinedScore ?? (b._distance != null ? Math.max(0, 1 - b._distance / 2) : 0)
127
165
  return scoreB - scoreA
@@ -145,40 +183,40 @@ Examples:
145
183
  }
146
184
 
147
185
  const indexer = await new CodebaseIndexer(projectRoot, indexName).init()
148
- const results = await indexer.search(args.query, limit, args.includeArchived, searchOptions)
186
+ const results = await indexer.search(args.query, limit, includeArchived, searchOptions)
149
187
  allResults = results.map((r: any) => ({ ...r, _index: indexName }))
150
188
  await indexer.unloadModel()
151
189
  }
152
190
 
153
191
  // ── Score cutoff — drop low-relevance results ──────────────────────────
154
- const minScore = args.minScore ?? 0.35
155
192
  allResults = allResults.filter(r => {
156
193
  const score = r._combinedScore ?? (r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0)
157
194
  return score >= minScore
158
195
  })
159
196
 
160
- // ── Path filter — restrict to subtree ──────────────────────────────────
161
- if (args.path) {
162
- const prefix = args.path.replace(/\/+$/, "") // normalize trailing slash
197
+ // ── Filter — apply path/language constraints from `filter` param ───────
198
+ if (filterParsed.pathPrefix) {
199
+ const prefix = filterParsed.pathPrefix
163
200
  allResults = allResults.filter(r => r.file && r.file.startsWith(prefix))
164
201
  }
202
+ if (filterParsed.pathContains) {
203
+ const needle = filterParsed.pathContains.toLowerCase()
204
+ allResults = allResults.filter(r => r.file && r.file.toLowerCase().includes(needle))
205
+ }
206
+ // Language filter is already passed to searchOptions above, but double-check
207
+ // in case vectorizer didn't filter (e.g. docs index has no language field)
208
+ if (filterParsed.language) {
209
+ allResults = allResults.filter(r => !r.language || r.language === filterParsed.language || r.language === "unknown")
210
+ }
165
211
 
166
212
  // ── Reranking — boost results where query keywords appear in text ──────
167
- // Also store score components for breakdown display
168
213
  const queryKeywords = args.query.toLowerCase().split(/\s+/).filter((w: string) => w.length > 2)
169
214
  for (const r of allResults) {
170
- // Vector score (L2 → similarity)
171
215
  const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
172
216
  r._vectorScore = vectorScore
173
-
174
- // BM25 component (present only in hybrid mode — embedded in _combinedScore)
175
- // If _combinedScore exists and differs from vectorScore, the difference is BM25 contribution
176
217
  r._bm25Component = r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0
177
-
178
- // Base score before keyword boost
179
218
  const baseScore = r._combinedScore ?? vectorScore
180
219
 
181
- // Keyword matching
182
220
  const text = (r.content || "").toLowerCase()
183
221
  const matchedKeywords: string[] = []
184
222
  if (queryKeywords.length > 0) {
@@ -208,25 +246,24 @@ Examples:
208
246
  }
209
247
  }
210
248
 
211
- // Sort groups by best chunk score, take top N unique files
212
249
  const sortedGroups = [...fileGroups.values()]
213
250
  .sort((a, b) => (b.best._finalScore ?? 0) - (a.best._finalScore ?? 0))
214
251
  .slice(0, limit)
215
252
 
216
253
  if (sortedGroups.length === 0) {
217
254
  const scope = args.searchAll ? "any index" : `index "${indexName}"`
218
- return `No results found in ${scope} for: "${args.query}" (min score: ${minScore})\n\nTry:\n- Different keywords\n- Lower minScore threshold: search({ query: "...", minScore: 0.2 })\n- Enable hybrid search: search({ query: "...", hybrid: true })`
255
+ const filterNote = args.filter ? ` with filter "${args.filter}"` : ""
256
+ return `No results found in ${scope}${filterNote} for: "${args.query}" (min score: ${minScore})\n\nTry:\n- Different keywords or phrasing\n- Remove or broaden the filter\n- search({ query: "...", searchAll: true })`
219
257
  }
220
258
 
221
259
  // ── Confidence signal ──────────────────────────────────────────────────
222
260
  const topScore = sortedGroups[0].best._finalScore ?? 0
223
261
  const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
224
- const hybridLabel = args.hybrid ? " [hybrid]" : ""
225
- const pathLabel = args.path ? ` path:"${args.path}"` : ""
226
- let output = `## Search Results for: "${args.query}" (${scope}${hybridLabel}${pathLabel})\n\n`
262
+ const filterLabel = args.filter ? ` filter:"${args.filter}"` : ""
263
+ let output = `## Search Results for: "${args.query}" (${scope}${filterLabel})\n\n`
227
264
 
228
265
  if (topScore < 0.45) {
229
- output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords, different phrasing, or hybrid: true.\n\n`
266
+ output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords or different phrasing.\n\n`
230
267
  }
231
268
 
232
269
  for (let i = 0; i < sortedGroups.length; i++) {
@@ -235,7 +272,7 @@ Examples:
235
272
  const indexLabel = args.searchAll ? ` [${r._index}]` : ""
236
273
  const chunkNote = chunks.length > 1 ? ` (${chunks.length} matching sections)` : ""
237
274
 
238
- // v2: show rich metadata when available
275
+ // Rich metadata
239
276
  const metaParts: string[] = []
240
277
  if (r.language && r.language !== "unknown") metaParts.push(r.language)
241
278
  if (r.heading_context) metaParts.push(`"${r.heading_context}"`)
@@ -243,7 +280,7 @@ Examples:
243
280
  if (r.class_name) metaParts.push(`class: ${r.class_name}`)
244
281
  const metaLine = metaParts.length > 0 ? ` (${metaParts.join(", ")})` : ""
245
282
 
246
- // Score breakdown: vector + bm25 + keywords
283
+ // Score breakdown
247
284
  const breakdownParts: string[] = [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
248
285
  if (r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
249
286
  if (r._keywordBonus > 0.005) breakdownParts.push(`kw: +${r._keywordBonus.toFixed(2)}`)
@@ -261,7 +298,7 @@ Examples:
261
298
  output += content
262
299
  output += "\n```\n"
263
300
 
264
- // Show second-best chunk from same file if available (brief)
301
+ // Second-best chunk hint
265
302
  if (chunks.length > 1) {
266
303
  const second = chunks.find((c: any) => c !== r)
267
304
  if (second) {
@@ -85,6 +85,15 @@ let HYBRID_CONFIG = { ...DEFAULT_HYBRID_CONFIG };
85
85
  let METRICS_ENABLED = false;
86
86
  let CACHE_ENABLED = true;
87
87
 
88
+ // ── Search defaults (exposed to tool layer) ──────────────────────────────────
89
+ const DEFAULT_SEARCH_CONFIG = {
90
+ freshen: false, // Don't freshen on every search — auto_index handles it
91
+ min_score: 0.35, // Minimum relevance score cutoff
92
+ include_archived: false, // Exclude archived files by default
93
+ default_limit: 10, // Default result count
94
+ };
95
+ let SEARCH_CONFIG = { ...DEFAULT_SEARCH_CONFIG };
96
+
88
97
  // ── Graph config (v3) ───────────────────────────────────────────────────────
89
98
  const DEFAULT_GRAPH_CONFIG = {
90
99
  enabled: true,
@@ -135,6 +144,10 @@ function defaultVectorizerYaml() {
135
144
  ` search:\n` +
136
145
  ` hybrid: true\n` +
137
146
  ` bm25_weight: 0.3\n` +
147
+ ` freshen: false # Don't re-index on every search (auto_index handles it)\n` +
148
+ ` min_score: 0.35 # Minimum relevance score cutoff\n` +
149
+ ` include_archived: false # Exclude archived files\n` +
150
+ ` default_limit: 10 # Default number of results\n` +
138
151
  `\n` +
139
152
  ` # Graph-based context (v3)\n` +
140
153
  ` graph:\n` +
@@ -299,6 +312,11 @@ async function loadConfig(projectRoot) {
299
312
  enabled: parseBool(ss, "hybrid", false),
300
313
  bm25_weight: parseNumber(ss, "bm25_weight", 0.3),
301
314
  };
315
+ // Extended search defaults
316
+ SEARCH_CONFIG.freshen = parseBool(ss, "freshen", DEFAULT_SEARCH_CONFIG.freshen);
317
+ SEARCH_CONFIG.min_score = parseNumber(ss, "min_score", DEFAULT_SEARCH_CONFIG.min_score);
318
+ SEARCH_CONFIG.include_archived = parseBool(ss, "include_archived", DEFAULT_SEARCH_CONFIG.include_archived);
319
+ SEARCH_CONFIG.default_limit = parseNumber(ss, "default_limit", DEFAULT_SEARCH_CONFIG.default_limit);
302
320
  }
303
321
 
304
322
  // ── Parse quality config ────────────────────────────────────────────────
@@ -1305,4 +1323,8 @@ function getEmbeddingModel() {
1305
1323
  return EMBEDDING_MODEL;
1306
1324
  }
1307
1325
 
1308
- export { CodebaseIndexer, INDEX_PRESETS, getEmbeddingModel };
1326
+ function getSearchConfig() {
1327
+ return SEARCH_CONFIG;
1328
+ }
1329
+
1330
+ export { CodebaseIndexer, INDEX_PRESETS, getEmbeddingModel, getSearchConfig };