npm - @comfanion/usethis_search - Versions diffs - 3.0.0-dev.8 → 3.0.0 - Mend

@comfanion/usethis_search 3.0.0-dev.8 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/cli.ts +263 -0
package/file-indexer.ts +1 -1
package/index.ts +0 -8
package/package.json +12 -5
package/tools/codeindex.ts +2 -2
package/tools/search.ts +254 -66
package/vectorizer/analyzers/lsp-analyzer.ts +7 -7
package/vectorizer/analyzers/regex-analyzer.ts +358 -61
package/vectorizer/chunk-store.ts +207 -0
package/vectorizer/chunkers/code-chunker.ts +74 -24
package/vectorizer/chunkers/markdown-chunker.ts +69 -7
package/vectorizer/graph-builder.ts +207 -15
package/vectorizer/graph-db.ts +161 -164
package/vectorizer/hybrid-search.ts +1 -1
package/vectorizer/{index.js → index.ts} +796 -160
package/vectorizer.yaml +20 -2

package/tools/search.ts CHANGED Viewed

@@ -1,8 +1,8 @@
 /**
- * Semantic Code Search Tool (v2)
+ * Semantic Code Search Tool (v3)
  *
  * Uses local embeddings + LanceDB vector store via bundled vectorizer.
- * v2: hybrid search, metadata filtering, rich result metadata.
+ * v3: simplified agent API — 5 params, config-driven defaults, smart filter.
  * Index data is stored in `.opencode/vectors/<index>/`.
  */
@@ -10,7 +10,82 @@ import { tool } from "@opencode-ai/plugin"
 import path from "path"
 import fs from "fs/promises"
-import { CodebaseIndexer } from "../vectorizer/index.js"
+import { CodebaseIndexer, getSearchConfig, getIndexer, releaseIndexer } from "../vectorizer/index.ts"
+// ── File extension → language name (consulted when parsing `filter`) ──────
+const EXT_TO_LANG: Record<string, string> = {
+  // General-purpose languages
+  go: "go",
+  py: "python",
+  ts: "typescript",
+  tsx: "typescript",
+  js: "javascript",
+  jsx: "javascript",
+  mjs: "javascript",
+  cjs: "javascript",
+  rs: "rust",
+  java: "java",
+  kt: "kotlin",
+  swift: "swift",
+  c: "c",
+  cpp: "cpp",
+  h: "c",
+  hpp: "cpp",
+  cs: "csharp",
+  rb: "ruby",
+  php: "php",
+  scala: "scala",
+  clj: "clojure",
+  // Documentation and plain text
+  md: "markdown",
+  mdx: "markdown",
+  txt: "text",
+  // Configuration / data formats
+  yaml: "yaml",
+  yml: "yaml",
+  json: "json",
+  toml: "toml",
+}
+// Distinct language names, i.e. every value that occurs in EXT_TO_LANG.
+const LANG_NAMES = new Set(Object.values(EXT_TO_LANG))
+/**
+ * Parse the `filter` param into path prefix and/or language filter.
+ *
+ * Supported formats:
+ *   "internal/domain/"    → pathPrefix = "internal/domain"
+ *   "*.go"                → language = "go"
+ *   ".go"                 → language = "go"
+ *   "go"                  → language = "go"
+ *   "internal/*.go"       → pathPrefix = "internal", language = "go"
+ *   "internal/**\/*.go"   → pathPrefix = "internal", language = "go"
+ *   "service"             → pathContains = "service"
+ *
+ * Extensions and language names are matched case-insensitively ("*.GO" → "go").
+ * An extension that is not a key of EXT_TO_LANG yields `language: undefined`
+ * (the glob form still returns its pathPrefix).
+ * Only EXT_TO_LANG's own keys are consulted, so inputs such as "constructor"
+ * or "__proto__" fall through to the substring match instead of resolving to
+ * members inherited from Object.prototype.
+ *
+ * @param filter Raw filter string supplied by the caller.
+ * @returns The parsed constraints; an empty object for an empty or
+ *          whitespace-only filter.
+ */
+function parseFilter(filter: string): {
+  pathPrefix?: string
+  language?: string
+  pathContains?: string
+} {
+  if (!filter) return {}
+  const f = filter.trim()
+  // Whitespace-only input carries no constraint at all
+  if (!f) return {}
+  // Own-key, case-insensitive lookup. A plain object literal also exposes
+  // Object.prototype members, which must never be mistaken for a language.
+  const langForExt = (ext: string): string | undefined => {
+    const key = ext.toLowerCase()
+    return Object.prototype.hasOwnProperty.call(EXT_TO_LANG, key) ? EXT_TO_LANG[key] : undefined
+  }
+  // "internal/**/*.go" or "internal/*.go" → path + extension
+  const globMatch = f.match(/^([^*]+?)(?:\/\*\*)?\/?\*\.(\w+)$/)
+  if (globMatch) {
+    return {
+      pathPrefix: globMatch[1].replace(/\/+$/, ""),
+      language: langForExt(globMatch[2]),
+    }
+  }
+  // "*.go" or ".go" → extension only
+  const extMatch = f.match(/^\*?\.(\w+)$/)
+  if (extMatch) {
+    return { language: langForExt(extMatch[1]) }
+  }
+  // "go", "python", "typescript" → language name
+  const lower = f.toLowerCase()
+  if (LANG_NAMES.has(lower)) {
+    return { language: lower }
+  }
+  // Bare extension without a dot, e.g. "ts" or "yml"
+  const fromExt = langForExt(lower)
+  if (fromExt) {
+    return { language: fromExt }
+  }
+  // Trailing or embedded "/" → path prefix (e.g. "internal/domain/", "internal/domain")
+  if (f.includes("/")) {
+    return { pathPrefix: f.replace(/\/+$/, "") }
+  }
+  // Anything else → substring match on file path
+  return { pathContains: f }
+}
 export default tool({
   description: `Search the codebase semantically. Use this to find relevant code snippets, functions, or files based on meaning, not just text matching.
@@ -22,74 +97,74 @@ Available indexes:
 - searchAll: true - Search across all indexes
 Examples:
-- "authentication logic" → finds auth-related code
-- "database connection handling" → finds DB setup code
-- "how to deploy" with index: "docs" → finds deployment docs
-- "API keys" with index: "config" → finds config with API settings`,
+- search({ query: "authentication logic" })
+- search({ query: "how to deploy", index: "docs" })
+- search({ query: "tenant management", filter: "internal/domain/" })
+- search({ query: "event handling", filter: "*.go" })
+- search({ query: "API routes", filter: "internal/**/*.go" })
+- search({ query: "metrics", searchAll: true })`,
   args: {
     query: tool.schema.string().describe("Semantic search query describing what you're looking for"),
-    index: tool.schema.string().optional().default("code").describe("Index to search: code, docs, config, or custom name"),
-    limit: tool.schema.number().optional().default(10).describe("Number of results to return (default: 10)"),
+    index: tool.schema.string().optional().default("code").describe("Index to search: code, docs, config"),
+    limit: tool.schema.number().optional().describe("Number of results (default from config, typically 10)"),
     searchAll: tool.schema.boolean().optional().default(false).describe("Search all indexes instead of just one"),
-    freshen: tool.schema.boolean().optional().default(true).describe("Auto-update stale files before searching (default: true)"),
-    includeArchived: tool.schema.boolean().optional().default(false).describe("Include archived files in results (default: false). Files are archived if in /archive/ folder or have 'archived: true' in frontmatter."),
-    // v2 params
-    hybrid: tool.schema.boolean().optional().describe("Enable hybrid search (vector + BM25 keyword matching). Improves exact keyword recall."),
-    fileType: tool.schema.string().optional().describe("Filter by file type: 'code', 'docs', or 'config'"),
-    language: tool.schema.string().optional().describe("Filter by language: 'typescript', 'python', 'markdown', etc."),
-    modifiedAfter: tool.schema.string().optional().describe("Filter: only files modified after this ISO date (e.g. '2024-01-01')"),
-    modifiedBefore: tool.schema.string().optional().describe("Filter: only files modified before this ISO date"),
-    tags: tool.schema.string().optional().describe("Filter by frontmatter tags (comma-separated, e.g. 'auth,security')"),
+    filter: tool.schema.string().optional().describe("Filter results by path or language. Examples: 'internal/domain/', '*.go', 'internal/**/*.go', 'service'"),
   },
   async execute(args) {
     const projectRoot = process.cwd()
     try {
-      let allResults: any[] = []
-      const limit = args.limit || 10
+      // Load config defaults (parsed from vectorizer.yaml)
+      const cfg = getSearchConfig()
+      const limit = args.limit || cfg.default_limit || 10
       const indexName = args.index || "code"
+      const minScore = cfg.min_score ?? 0.35
+      const includeArchived = cfg.include_archived ?? false
+      // Parse filter into path/language constraints
+      const filterParsed = args.filter ? parseFilter(args.filter) : {}
-      // Build search options from v2 params
+      // Build search options — hybrid is always from per-index config
       const searchOptions: Record<string, any> = {}
-      if (args.hybrid != null) searchOptions.hybrid = args.hybrid
-      if (args.fileType) searchOptions.fileType = args.fileType
-      if (args.language) searchOptions.language = args.language
-      if (args.modifiedAfter) searchOptions.modifiedAfter = args.modifiedAfter
-      if (args.modifiedBefore) searchOptions.modifiedBefore = args.modifiedBefore
-      if (args.tags) searchOptions.tags = args.tags.split(",").map((t: string) => t.trim()).filter(Boolean)
-      // Auto-freshen stale files before searching
-      if (args.freshen !== false) {
-        const tempIndexer = await new CodebaseIndexer(projectRoot, indexName).init()
-        await tempIndexer.freshen()
-        await tempIndexer.unloadModel()
+      if (filterParsed.language) searchOptions.language = filterParsed.language
+      // Freshen from config (default: false — auto_index handles it)
+      if (cfg.freshen) {
+        try {
+          const indexer = await getIndexer(projectRoot, indexName)
+          await indexer.freshen()
+          releaseIndexer(projectRoot, indexName)
+        } catch {
+          // non-fatal — search can proceed without freshen
+        }
       }
+      let allResults: any[] = []
       if (args.searchAll) {
-        const tempIndexer = await new CodebaseIndexer(projectRoot, "code").init()
+        const tempIndexer = await getIndexer(projectRoot, "code")
         const indexes = await tempIndexer.listIndexes()
-        await tempIndexer.unloadModel()
+        releaseIndexer(projectRoot, "code")
         if (indexes.length === 0) {
-          return `No indexes found. Create one with: codeindex({ action: "reindex", index: "code" })`
+          return `No indexes found. The codebase needs to be indexed first.\n\nRun the CLI: bunx usethis_search reindex`
         }
         for (const idx of indexes) {
-          const indexer = await new CodebaseIndexer(projectRoot, idx).init()
-          if (args.freshen !== false) {
-            await indexer.freshen()
+          const indexer = await getIndexer(projectRoot, idx)
+          try {
+            const results = await indexer.search(args.query, limit, includeArchived, searchOptions)
+            allResults.push(...results.map((r: any) => ({ ...r, _index: idx })))
+          } finally {
+            releaseIndexer(projectRoot, idx)
           }
-          const results = await indexer.search(args.query, limit, args.includeArchived, searchOptions)
-          allResults.push(...results.map((r: any) => ({ ...r, _index: idx })))
-          await indexer.unloadModel()
         }
         allResults.sort((a, b) => {
-          // Prefer combinedScore (hybrid), fall back to distance
-          const scoreA = a._combinedScore ?? (a._distance != null ? 1 - a._distance : 0)
-          const scoreB = b._combinedScore ?? (b._distance != null ? 1 - b._distance : 0)
+          const scoreA = a._combinedScore ?? (a._distance != null ? Math.max(0, 1 - a._distance / 2) : 0)
+          const scoreB = b._combinedScore ?? (b._distance != null ? Math.max(0, 1 - b._distance / 2) : 0)
           return scoreB - scoreA
         })
         allResults = allResults.slice(0, limit)
@@ -98,34 +173,120 @@ Examples:
         try {
           await fs.access(hashesFile)
         } catch {
-          return `Index "${indexName}" not found. Create it with: codeindex({ action: "reindex", index: "${indexName}" })`
+          // Index doesn't exist — check what indexes ARE available
+          const tempIndexer = await getIndexer(projectRoot, "code")
+          const available = await tempIndexer.listIndexes()
+          releaseIndexer(projectRoot, "code")
+          if (available.length > 0) {
+            const list = available.map(i => `"${i}"`).join(", ")
+            return `Index "${indexName}" not found. Available indexes: ${list}.\n\nTry: search({ query: "${args.query}", index: "${available[0]}" })\nOr search all: search({ query: "${args.query}", searchAll: true })`
+          }
+          return `No indexes found. The codebase needs to be indexed first.\n\nRun the CLI: bunx usethis_search reindex`
+        }
+        const indexer = await getIndexer(projectRoot, indexName)
+        try {
+          const results = await indexer.search(args.query, limit, includeArchived, searchOptions)
+          allResults = results.map((r: any) => ({ ...r, _index: indexName }))
+        } finally {
+          releaseIndexer(projectRoot, indexName)
         }
+      }
-        const indexer = await new CodebaseIndexer(projectRoot, indexName).init()
-        const results = await indexer.search(args.query, limit, args.includeArchived, searchOptions)
-        allResults = results.map((r: any) => ({ ...r, _index: indexName }))
-        await indexer.unloadModel()
+      // ── Score cutoff — drop low-relevance results ──────────────────────────
+      allResults = allResults.filter(r => {
+        const score = r._combinedScore ?? (r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0)
+        return score >= minScore
+      })
+      // ── Filter — apply path/language constraints from `filter` param ───────
+      if (filterParsed.pathPrefix) {
+        const prefix = filterParsed.pathPrefix
+        allResults = allResults.filter(r => r.file && r.file.startsWith(prefix))
+      }
+      if (filterParsed.pathContains) {
+        const needle = filterParsed.pathContains.toLowerCase()
+        allResults = allResults.filter(r => r.file && r.file.toLowerCase().includes(needle))
       }
+      // Language filter is already passed to searchOptions above, but double-check
+      // in case vectorizer didn't filter (e.g. docs index has no language field)
+      if (filterParsed.language) {
+        allResults = allResults.filter(r => !r.language || r.language === filterParsed.language || r.language === "unknown")
+      }
+      // ── Reranking — boost results where query keywords appear in text ──────
+      const queryKeywords = args.query.toLowerCase().split(/\s+/).filter((w: string) => w.length > 2)
+      for (const r of allResults) {
+        const isBM25Only = !!r._bm25Only
+        const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
+        r._vectorScore = vectorScore
+        r._bm25Component = isBM25Only
+          ? (r._combinedScore ?? 0)
+          : (r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0)
+        const baseScore = r._combinedScore ?? vectorScore
-      if (allResults.length === 0) {
+        const text = (r.content || "").toLowerCase()
+        const matchedKeywords: string[] = []
+        if (queryKeywords.length > 0) {
+          for (const kw of queryKeywords) {
+            if (text.includes(kw)) matchedKeywords.push(kw)
+          }
+        }
+        r._matchedKeywords = matchedKeywords
+        const keywordBonus = queryKeywords.length > 0 ? (matchedKeywords.length / queryKeywords.length) * 0.15 : 0
+        r._keywordBonus = keywordBonus
+        r._finalScore = baseScore + keywordBonus
+      }
+      allResults.sort((a: any, b: any) => (b._finalScore ?? 0) - (a._finalScore ?? 0))
+      // ── Group by file — best chunk per file, with chunk count ─────────────
+      const fileGroups = new Map<string, { best: any, chunks: any[] }>()
+      for (const r of allResults) {
+        const key = r.file
+        if (!fileGroups.has(key)) {
+          fileGroups.set(key, { best: r, chunks: [r] })
+        } else {
+          const group = fileGroups.get(key)!
+          group.chunks.push(r)
+          if ((r._finalScore ?? 0) > (group.best._finalScore ?? 0)) {
+            group.best = r
+          }
+        }
+      }
+      const sortedGroups = [...fileGroups.values()]
+        .sort((a, b) => (b.best._finalScore ?? 0) - (a.best._finalScore ?? 0))
+        .slice(0, limit)
+      if (sortedGroups.length === 0) {
         const scope = args.searchAll ? "any index" : `index "${indexName}"`
-        return `No results found in ${scope} for: "${args.query}"\n\nTry:\n- Different keywords\n- Enable hybrid search: search({ query: "...", hybrid: true })\n- Re-index with: codeindex({ action: "reindex", index: "${indexName}" })`
+        const filterNote = args.filter ? ` with filter "${args.filter}"` : ""
+        return `No results found in ${scope}${filterNote} for: "${args.query}" (min score: ${minScore})\n\nTry:\n- Different keywords or phrasing\n- Remove or broaden the filter\n- search({ query: "...", searchAll: true })`
       }
+      // ── Confidence signal ──────────────────────────────────────────────────
+      const topScore = sortedGroups[0].best._finalScore ?? 0
+      const hasBM25Only = allResults.some((r: any) => r._bm25Only)
       const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
-      const hybridLabel = args.hybrid ? " [hybrid]" : ""
-      let output = `## Search Results for: "${args.query}" (${scope}${hybridLabel})\n\n`
-      for (let i = 0; i < allResults.length; i++) {
-        const r = allResults[i]
-        const score = r._combinedScore != null
-          ? r._combinedScore.toFixed(3)
-          : r._distance != null
-            ? (1 - r._distance).toFixed(3)
-            : "N/A"
+      const filterLabel = args.filter ? ` filter:"${args.filter}"` : ""
+      let output = `## Search Results for: "${args.query}" (${scope}${filterLabel})\n\n`
+      if (hasBM25Only) {
+        output += `> **BM25-only mode** — vector embeddings not yet available. Results are keyword-based. Quality will improve after embedding completes.\n\n`
+      }
+      if (topScore < 0.45) {
+        output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords or different phrasing.\n\n`
+      }
+      for (let i = 0; i < sortedGroups.length; i++) {
+        const { best: r, chunks } = sortedGroups[i]
+        const score = (r._finalScore ?? 0).toFixed(3)
         const indexLabel = args.searchAll ? ` [${r._index}]` : ""
+        const chunkNote = chunks.length > 1 ? ` (${chunks.length} matching sections)` : ""
-        // v2: show rich metadata when available
+        // Rich metadata
         const metaParts: string[] = []
         if (r.language && r.language !== "unknown") metaParts.push(r.language)
         if (r.heading_context) metaParts.push(`"${r.heading_context}"`)
@@ -133,13 +294,38 @@ Examples:
         if (r.class_name) metaParts.push(`class: ${r.class_name}`)
         const metaLine = metaParts.length > 0 ? ` (${metaParts.join(", ")})` : ""
-        output += `### ${i + 1}. ${r.file}${indexLabel}\n`
-        output += `**Relevance:** ${score}${metaLine}\n\n`
+        // Score breakdown
+        const breakdownParts: string[] = r._bm25Only
+          ? [`bm25: ${(r._bm25Component ?? 0).toFixed(2)}`]
+          : [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
+        if (!r._bm25Only && r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
+        if (r._keywordBonus > 0.005) breakdownParts.push(`kw: +${r._keywordBonus.toFixed(2)}`)
+        const breakdown = breakdownParts.join(", ")
+        // Matched keywords
+        const kwDisplay = r._matchedKeywords && r._matchedKeywords.length > 0
+          ? ` | matched: "${r._matchedKeywords.join('", "')}"`
+          : ""
+        output += `### ${i + 1}. ${r.file}${indexLabel}${chunkNote}\n`
+        output += `**Score:** ${score} (${breakdown}${kwDisplay})${metaLine}\n\n`
         output += "```\n"
         const content = r.content.length > 500 ? r.content.substring(0, 500) + "\n... (truncated)" : r.content
         output += content
         output += "\n```\n"
+        // Second-best chunk hint
+        if (chunks.length > 1) {
+          const second = chunks.find((c: any) => c !== r)
+          if (second) {
+            const secMeta: string[] = []
+            if (second.function_name) secMeta.push(`fn: ${second.function_name}`)
+            if (second.heading_context) secMeta.push(`"${second.heading_context}"`)
+            const secLabel = secMeta.length > 0 ? ` ${secMeta.join(", ")}` : ""
+            output += `\n*Also:${secLabel}*\n`
+          }
+        }
         if (r.relatedContext && r.relatedContext.length > 0) {
           output += "\n**Related Context:**\n"
           for (const rel of r.relatedContext) {
@@ -154,7 +340,9 @@ Examples:
         output += "\n"
       }
-      output += `---\n*Found ${allResults.length} results. Use Read tool to see full files.*`
+      const totalChunks = allResults.length
+      const uniqueFiles = sortedGroups.length
+      output += `---\n*${uniqueFiles} files (${totalChunks} chunks). Use Read tool to see full files.*`
       return output
     } catch (error: any) {
       return `Search failed: ${error.message || String(error)}`

package/vectorizer/analyzers/lsp-analyzer.ts CHANGED Viewed

@@ -12,7 +12,7 @@
 import path from "path"
 import fs from "fs/promises"
-import { ChunkWithId } from "../graph-builder"
+import { ChunkWithId, buildDefaultChunkId } from "../graph-builder"
 import { LSPClient, LSPSymbolInformation, SymbolKind } from "./lsp-client"
 export interface Relation {
@@ -252,7 +252,9 @@ export class LSPAnalyzer {
     return result
   }
-  /** Convert LSP location URI + line → chunk_id. */
+  /** Convert LSP location URI + line → chunk_id.
+   *  For same-file refs, resolves to exact chunk by line.
+   *  For cross-file refs, returns the default (first) chunk of the target file. */
   private locationToChunkId(currentFile: string, uri: string, line: number, root: string): string | null {
     // uri = file:///absolute/path/to/file.ts
     const filePath = uri.startsWith("file://") ? uri.slice(7) : uri
@@ -261,11 +263,9 @@ export class LSPAnalyzer {
     // Skip external files (node_modules, etc.)
     if (relPath.startsWith("..") || relPath.includes("node_modules")) return null
-    const withoutExt = relPath.replace(/\.[^/.]+$/, "")
-    const normalized = withoutExt.replace(/[^a-zA-Z0-9]/g, "_")
-    // For cross-file references, point to chunk 0 (first chunk of target file)
-    // For same-file, we could be more precise but chunk 0 is sufficient for graph
-    return `chunk_${normalized}_0`
+    // Same file → use findChunkForPosition (called separately with chunks)
+    // Cross-file → default chunk
+    return buildDefaultChunkId(relPath)
   }
   private findChunkForPosition(chunks: ChunkWithId[], line: number): string | null {