@comfanion/usethis_search 3.0.0-dev.22 → 3.0.0-dev.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/tools/search.ts +156 -98
- package/vectorizer/index.ts +23 -1
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "3.0.0-dev.22",
+  "version": "3.0.0-dev.24",
   "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
   "type": "module",
   "main": "./index.ts",
package/tools/search.ts
CHANGED
@@ -1,8 +1,8 @@
 /**
- * Semantic Code Search Tool (
+ * Semantic Code Search Tool (v3)
  *
  * Uses local embeddings + LanceDB vector store via bundled vectorizer.
- *
+ * v3: simplified agent API — 5 params, config-driven defaults, smart filter.
  * Index data is stored in `.opencode/vectors/<index>/`.
  */

@@ -10,7 +10,82 @@ import { tool } from "@opencode-ai/plugin"
 import path from "path"
 import fs from "fs/promises"

-import { CodebaseIndexer } from "../vectorizer/index.ts"
+import { CodebaseIndexer, getSearchConfig } from "../vectorizer/index.ts"
+
+// ── Extension → language mapping (for filter parsing) ─────────────────────
+const EXT_TO_LANG: Record<string, string> = {
+  go: "go", py: "python", ts: "typescript", tsx: "typescript",
+  js: "javascript", jsx: "javascript", mjs: "javascript", cjs: "javascript",
+  rs: "rust", java: "java", kt: "kotlin", swift: "swift",
+  c: "c", cpp: "cpp", h: "c", hpp: "cpp", cs: "csharp",
+  rb: "ruby", php: "php", scala: "scala", clj: "clojure",
+  md: "markdown", mdx: "markdown", txt: "text",
+  yaml: "yaml", yml: "yaml", json: "json", toml: "toml",
+}
+const LANG_NAMES = new Set(Object.values(EXT_TO_LANG))
+
+/**
+ * Parse the `filter` param into path prefix and/or language filter.
+ *
+ * Supported formats:
+ *   "internal/domain/"  → pathPrefix = "internal/domain"
+ *   "*.go"              → language = "go"
+ *   ".go"               → language = "go"
+ *   "go"                → language = "go"
+ *   "internal/*.go"     → pathPrefix = "internal", language = "go"
+ *   "internal/**\/*.go" → pathPrefix = "internal", language = "go"
+ *   "service"           → pathContains = "service"
+ */
+function parseFilter(filter: string): {
+  pathPrefix?: string
+  language?: string
+  pathContains?: string
+} {
+  if (!filter) return {}
+
+  const f = filter.trim()
+
+  // "internal/**/*.go" or "internal/*.go" → path + extension
+  const globMatch = f.match(/^([^*]+?)(?:\/\*\*)?\/?\*\.(\w+)$/)
+  if (globMatch) {
+    const prefix = globMatch[1].replace(/\/+$/, "")
+    const ext = globMatch[2]
+    return {
+      pathPrefix: prefix,
+      language: EXT_TO_LANG[ext] || undefined,
+    }
+  }
+
+  // "*.go" or ".go" → extension only
+  const extMatch = f.match(/^\*?\.(\w+)$/)
+  if (extMatch) {
+    const ext = extMatch[1]
+    return { language: EXT_TO_LANG[ext] || undefined }
+  }
+
+  // "go", "python", "typescript" → language name
+  const lower = f.toLowerCase()
+  if (LANG_NAMES.has(lower)) {
+    return { language: lower }
+  }
+  // "go" could also be ext
+  if (EXT_TO_LANG[lower]) {
+    return { language: EXT_TO_LANG[lower] }
+  }
+
+  // Ends with "/" → path prefix
+  if (f.endsWith("/")) {
+    return { pathPrefix: f.replace(/\/+$/, "") }
+  }
+
+  // Contains "/" → path prefix (e.g. "internal/domain")
+  if (f.includes("/")) {
+    return { pathPrefix: f.replace(/\/+$/, "") }
+  }
+
+  // Anything else → substring match on file path
+  return { pathContains: f }
+}

 export default tool({
   description: `Search the codebase semantically. Use this to find relevant code snippets, functions, or files based on meaning, not just text matching.
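
Note (illustrative, not part of the published diff): the new parseFilter helper is intended to map filter strings as sketched below; the calls are hypothetical and the expected returns are taken from the doc comment in the hunk above.

    parseFilter("internal/domain/")  // → { pathPrefix: "internal/domain" }
    parseFilter("*.go")              // → { language: "go" }
    parseFilter("internal/**/*.go")  // → { pathPrefix: "internal", language: "go" }
    parseFilter("go")                // → { language: "go" }
    parseFilter("service")           // → { pathContains: "service" }
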
@@ -22,84 +97,51 @@ Available indexes:
 - searchAll: true - Search across all indexes

 Examples:
-- "authentication logic"
--
--
-- "
-- search({ query: "
+- search({ query: "authentication logic" })
+- search({ query: "how to deploy", index: "docs" })
+- search({ query: "tenant management", filter: "internal/domain/" })
+- search({ query: "event handling", filter: "*.go" })
+- search({ query: "API routes", filter: "internal/**/*.go" })
+- search({ query: "metrics", searchAll: true })`,

   args: {
     query: tool.schema.string().describe("Semantic search query describing what you're looking for"),
-    index: tool.schema.string().optional().default("code").describe("Index to search: code, docs, config
-    limit: tool.schema.number().optional().
+    index: tool.schema.string().optional().default("code").describe("Index to search: code, docs, config"),
+    limit: tool.schema.number().optional().describe("Number of results (default from config, typically 10)"),
     searchAll: tool.schema.boolean().optional().default(false).describe("Search all indexes instead of just one"),
-
-    includeArchived: tool.schema.boolean().optional().default(false).describe("Include archived files in results (default: false). Files are archived if in /archive/ folder or have 'archived: true' in frontmatter."),
-    // v2 params
-    hybrid: tool.schema.boolean().optional().describe("Enable hybrid search (vector + BM25 keyword matching). Improves exact keyword recall."),
-    fileType: tool.schema.string().optional().describe("Filter by file type: 'code', 'docs', or 'config'"),
-    language: tool.schema.string().optional().describe("Filter by language: 'typescript', 'python', 'markdown', etc."),
-    modifiedAfter: tool.schema.string().optional().describe("Filter: only files modified after this ISO date (e.g. '2024-01-01')"),
-    modifiedBefore: tool.schema.string().optional().describe("Filter: only files modified before this ISO date"),
-    tags: tool.schema.string().optional().describe("Filter by frontmatter tags (comma-separated, e.g. 'auth,security')"),
-    minScore: tool.schema.number().optional().default(0.35).describe("Minimum relevance score (0-1). Results below this threshold are dropped. Default: 0.35"),
-    path: tool.schema.string().optional().describe("Filter by file path prefix (e.g. 'internal/domain/', 'src/components'). Only returns files under this path."),
+    filter: tool.schema.string().optional().describe("Filter results by path or language. Examples: 'internal/domain/', '*.go', 'internal/**/*.go', 'service'"),
   },

   async execute(args) {
     const projectRoot = process.cwd()

     try {
-
-      const
+      // Load config defaults (parsed from vectorizer.yaml)
+      const cfg = getSearchConfig()
+      const limit = args.limit || cfg.default_limit || 10
       const indexName = args.index || "code"
+      const minScore = cfg.min_score ?? 0.35
+      const includeArchived = cfg.include_archived ?? false

-      //
+      // Parse filter into path/language constraints
+      const filterParsed = args.filter ? parseFilter(args.filter) : {}
+
+      // Build search options — hybrid is always from per-index config
       const searchOptions: Record<string, any> = {}
-      if (
-
-      //
-
-
-
-
-
-
-        c: "c", cpp: "cpp", h: "c", hpp: "cpp", cs: "csharp",
-        rb: "ruby", php: "php", scala: "scala", clj: "clojure",
-        md: "markdown", mdx: "markdown", txt: "text",
-        yaml: "yaml", yml: "yaml", json: "json", toml: "toml",
-      }
-      // Also accept full language names
-      const langNames = new Set([
-        "go", "python", "typescript", "javascript", "rust", "java", "kotlin",
-        "swift", "c", "cpp", "csharp", "ruby", "php", "scala", "clojure",
-        "markdown", "text", "yaml", "json", "toml",
-      ])
-
-      if (ft === "code" || ft === "docs" || ft === "config") {
-        searchOptions.fileType = ft
-      } else if (extToLanguage[ft]) {
-        searchOptions.language = extToLanguage[ft]
-      } else if (langNames.has(ft)) {
-        searchOptions.language = ft
-      } else {
-        searchOptions.fileType = ft // pass through as-is
+      if (filterParsed.language) searchOptions.language = filterParsed.language
+
+      // Freshen from config (default: false — auto_index handles it)
+      if (cfg.freshen) {
+        try {
+          const tempIndexer = await new CodebaseIndexer(projectRoot, indexName).init()
+          await tempIndexer.freshen()
+          await tempIndexer.unloadModel()
+        } catch {
+          // non-fatal — search can proceed without freshen
        }
      }

-
-      if (args.modifiedAfter) searchOptions.modifiedAfter = args.modifiedAfter
-      if (args.modifiedBefore) searchOptions.modifiedBefore = args.modifiedBefore
-      if (args.tags) searchOptions.tags = args.tags.split(",").map((t: string) => t.trim()).filter(Boolean)
-
-      // Auto-freshen stale files before searching
-      if (args.freshen !== false) {
-        const tempIndexer = await new CodebaseIndexer(projectRoot, indexName).init()
-        await tempIndexer.freshen()
-        await tempIndexer.unloadModel()
-      }
+      let allResults: any[] = []

      if (args.searchAll) {
        const tempIndexer = await new CodebaseIndexer(projectRoot, "code").init()
@@ -107,21 +149,17 @@ Examples:
        await tempIndexer.unloadModel()

        if (indexes.length === 0) {
-          return `No indexes found.
+          return `No indexes found. The codebase needs to be indexed first.\n\nRun the CLI: bunx usethis_search reindex`
        }

        for (const idx of indexes) {
          const indexer = await new CodebaseIndexer(projectRoot, idx).init()
-
-          await indexer.freshen()
-          }
-          const results = await indexer.search(args.query, limit, args.includeArchived, searchOptions)
+          const results = await indexer.search(args.query, limit, includeArchived, searchOptions)
          allResults.push(...results.map((r: any) => ({ ...r, _index: idx })))
          await indexer.unloadModel()
        }

        allResults.sort((a, b) => {
-          // Prefer combinedScore (hybrid), fall back to L2→similarity conversion
          const scoreA = a._combinedScore ?? (a._distance != null ? Math.max(0, 1 - a._distance / 2) : 0)
          const scoreB = b._combinedScore ?? (b._distance != null ? Math.max(0, 1 - b._distance / 2) : 0)
          return scoreB - scoreA
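
Note (not part of the diff): the sort key falls back from _combinedScore to a distance-based similarity. A minimal sketch of that conversion, assuming L2 distances over normalized embeddings so distances fall in the 0 to 2 range:

    const similarity = (distance: number) => Math.max(0, 1 - distance / 2)
    similarity(0.0) // 1.0 (identical vectors)
    similarity(1.0) // 0.5
    similarity(2.0) // 0.0 (maximally distant)
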
@@ -145,43 +183,53 @@ Examples:
        }

        const indexer = await new CodebaseIndexer(projectRoot, indexName).init()
-        const results = await indexer.search(args.query, limit,
+        const results = await indexer.search(args.query, limit, includeArchived, searchOptions)
        allResults = results.map((r: any) => ({ ...r, _index: indexName }))
        await indexer.unloadModel()
      }

      // ── Score cutoff — drop low-relevance results ──────────────────────────
-      const minScore = args.minScore ?? 0.35
      allResults = allResults.filter(r => {
        const score = r._combinedScore ?? (r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0)
        return score >= minScore
      })

-      // ──
-      if (
-      const prefix =
+      // ── Filter — apply path/language constraints from `filter` param ───────
+      if (filterParsed.pathPrefix) {
+        const prefix = filterParsed.pathPrefix
        allResults = allResults.filter(r => r.file && r.file.startsWith(prefix))
      }
+      if (filterParsed.pathContains) {
+        const needle = filterParsed.pathContains.toLowerCase()
+        allResults = allResults.filter(r => r.file && r.file.toLowerCase().includes(needle))
+      }
+      // Language filter is already passed to searchOptions above, but double-check
+      // in case vectorizer didn't filter (e.g. docs index has no language field)
+      if (filterParsed.language) {
+        allResults = allResults.filter(r => !r.language || r.language === filterParsed.language || r.language === "unknown")
+      }

      // ── Reranking — boost results where query keywords appear in text ──────
      const queryKeywords = args.query.toLowerCase().split(/\s+/).filter((w: string) => w.length > 2)
-
-
-
-
+      for (const r of allResults) {
+        const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
+        r._vectorScore = vectorScore
+        r._bm25Component = r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0
+        const baseScore = r._combinedScore ?? vectorScore
+
+        const text = (r.content || "").toLowerCase()
+        const matchedKeywords: string[] = []
+        if (queryKeywords.length > 0) {
          for (const kw of queryKeywords) {
-            if (text.includes(kw))
+            if (text.includes(kw)) matchedKeywords.push(kw)
          }
-          const keywordBonus = queryKeywords.length > 0 ? (keywordHits / queryKeywords.length) * 0.15 : 0
-          const baseScore = r._combinedScore ?? (r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0)
-          r._finalScore = baseScore + keywordBonus
-        }
-        allResults.sort((a: any, b: any) => (b._finalScore ?? 0) - (a._finalScore ?? 0))
-      } else {
-        for (const r of allResults) {
-          r._finalScore = r._combinedScore ?? (r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0)
        }
+        r._matchedKeywords = matchedKeywords
+        const keywordBonus = queryKeywords.length > 0 ? (matchedKeywords.length / queryKeywords.length) * 0.15 : 0
+        r._keywordBonus = keywordBonus
+        r._finalScore = baseScore + keywordBonus
      }
+      allResults.sort((a: any, b: any) => (b._finalScore ?? 0) - (a._finalScore ?? 0))

      // ── Group by file — best chunk per file, with chunk count ─────────────
      const fileGroups = new Map<string, { best: any, chunks: any[] }>()
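
Note (illustrative, with invented numbers): the reranking hunk above composes the final score from the hybrid score plus a keyword bonus capped at 0.15. For a hypothetical result with vector score 0.62, combined hybrid score 0.66, and 2 of 3 query keywords present in the chunk text:

    const vectorScore = 0.62
    const combinedScore = 0.66
    const bm25Component = Math.max(0, combinedScore - vectorScore) // 0.04
    const keywordBonus = (2 / 3) * 0.15                            // 0.10
    const finalScore = combinedScore + keywordBonus                // 0.76
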
@@ -198,25 +246,24 @@ Examples:
        }
      }

-      // Sort groups by best chunk score, take top N unique files
      const sortedGroups = [...fileGroups.values()]
        .sort((a, b) => (b.best._finalScore ?? 0) - (a.best._finalScore ?? 0))
        .slice(0, limit)

      if (sortedGroups.length === 0) {
        const scope = args.searchAll ? "any index" : `index "${indexName}"`
-
+        const filterNote = args.filter ? ` with filter "${args.filter}"` : ""
+        return `No results found in ${scope}${filterNote} for: "${args.query}" (min score: ${minScore})\n\nTry:\n- Different keywords or phrasing\n- Remove or broaden the filter\n- search({ query: "...", searchAll: true })`
      }

      // ── Confidence signal ──────────────────────────────────────────────────
      const topScore = sortedGroups[0].best._finalScore ?? 0
      const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
-      const
-
-      let output = `## Search Results for: "${args.query}" (${scope}${hybridLabel}${pathLabel})\n\n`
+      const filterLabel = args.filter ? ` filter:"${args.filter}"` : ""
+      let output = `## Search Results for: "${args.query}" (${scope}${filterLabel})\n\n`

      if (topScore < 0.45) {
-        output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords
+        output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords or different phrasing.\n\n`
      }

      for (let i = 0; i < sortedGroups.length; i++) {
@@ -225,7 +272,7 @@ Examples:
        const indexLabel = args.searchAll ? ` [${r._index}]` : ""
        const chunkNote = chunks.length > 1 ? ` (${chunks.length} matching sections)` : ""

-        //
+        // Rich metadata
        const metaParts: string[] = []
        if (r.language && r.language !== "unknown") metaParts.push(r.language)
        if (r.heading_context) metaParts.push(`"${r.heading_context}"`)
@@ -233,14 +280,25 @@ Examples:
        if (r.class_name) metaParts.push(`class: ${r.class_name}`)
        const metaLine = metaParts.length > 0 ? ` (${metaParts.join(", ")})` : ""

+        // Score breakdown
+        const breakdownParts: string[] = [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
+        if (r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
+        if (r._keywordBonus > 0.005) breakdownParts.push(`kw: +${r._keywordBonus.toFixed(2)}`)
+        const breakdown = breakdownParts.join(", ")
+
+        // Matched keywords
+        const kwDisplay = r._matchedKeywords && r._matchedKeywords.length > 0
+          ? ` | matched: "${r._matchedKeywords.join('", "')}"`
+          : ""
+
        output += `### ${i + 1}. ${r.file}${indexLabel}${chunkNote}\n`
-        output += `**
+        output += `**Score:** ${score} (${breakdown}${kwDisplay})${metaLine}\n\n`
        output += "```\n"
        const content = r.content.length > 500 ? r.content.substring(0, 500) + "\n... (truncated)" : r.content
        output += content
        output += "\n```\n"

-        //
+        // Second-best chunk hint
        if (chunks.length > 1) {
          const second = chunks.find((c: any) => c !== r)
          if (second) {
package/vectorizer/index.ts
CHANGED
@@ -85,6 +85,15 @@ let HYBRID_CONFIG = { ...DEFAULT_HYBRID_CONFIG };
 let METRICS_ENABLED = false;
 let CACHE_ENABLED = true;

+// ── Search defaults (exposed to tool layer) ──────────────────────────────────
+const DEFAULT_SEARCH_CONFIG = {
+  freshen: false,          // Don't freshen on every search — auto_index handles it
+  min_score: 0.35,         // Minimum relevance score cutoff
+  include_archived: false, // Exclude archived files by default
+  default_limit: 10,       // Default result count
+};
+let SEARCH_CONFIG = { ...DEFAULT_SEARCH_CONFIG };
+
 // ── Graph config (v3) ───────────────────────────────────────────────────────
 const DEFAULT_GRAPH_CONFIG = {
   enabled: true,
@@ -135,6 +144,10 @@ function defaultVectorizerYaml() {
     `  search:\n` +
     `    hybrid: true\n` +
     `    bm25_weight: 0.3\n` +
+    `    freshen: false          # Don't re-index on every search (auto_index handles it)\n` +
+    `    min_score: 0.35         # Minimum relevance score cutoff\n` +
+    `    include_archived: false # Exclude archived files\n` +
+    `    default_limit: 10       # Default number of results\n` +
     `\n` +
     `  # Graph-based context (v3)\n` +
     `  graph:\n` +
@@ -299,6 +312,11 @@ async function loadConfig(projectRoot) {
      enabled: parseBool(ss, "hybrid", false),
      bm25_weight: parseNumber(ss, "bm25_weight", 0.3),
    };
+    // Extended search defaults
+    SEARCH_CONFIG.freshen = parseBool(ss, "freshen", DEFAULT_SEARCH_CONFIG.freshen);
+    SEARCH_CONFIG.min_score = parseNumber(ss, "min_score", DEFAULT_SEARCH_CONFIG.min_score);
+    SEARCH_CONFIG.include_archived = parseBool(ss, "include_archived", DEFAULT_SEARCH_CONFIG.include_archived);
+    SEARCH_CONFIG.default_limit = parseNumber(ss, "default_limit", DEFAULT_SEARCH_CONFIG.default_limit);
   }

   // ── Parse quality config ────────────────────────────────────────────────
@@ -1305,4 +1323,8 @@ function getEmbeddingModel() {
   return EMBEDDING_MODEL;
 }

-
+function getSearchConfig() {
+  return SEARCH_CONFIG;
+}
+
+export { CodebaseIndexer, INDEX_PRESETS, getEmbeddingModel, getSearchConfig };
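
Note: a condensed sketch (not part of the diff) of how the tool layer consumes these exports; it mirrors the search.ts hunks above, with args standing in for the tool's parsed arguments.

    import { getSearchConfig } from "../vectorizer/index.ts"

    const cfg = getSearchConfig() // { freshen, min_score, include_archived, default_limit }
    const limit = args.limit || cfg.default_limit || 10
    const minScore = cfg.min_score ?? 0.35
    const includeArchived = cfg.include_archived ?? false
    if (cfg.freshen) {
      // opt-in: re-index stale files before searching; auto_index covers the common case
    }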