@comfanion/usethis_search 3.0.0-dev.21 → 3.0.0-dev.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/tools/search.ts +114 -18
- package/vectorizer/index.ts +9 -9
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "3.0.0-dev.21",
+  "version": "3.0.0-dev.23",
   "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
   "type": "module",
   "main": "./index.ts",
package/tools/search.ts
CHANGED

@@ -25,7 +25,8 @@ Examples:
 - "authentication logic" → finds auth-related code
 - "database connection handling" → finds DB setup code
 - "how to deploy" with index: "docs" → finds deployment docs
-- "API keys" with index: "config" → finds config with API settings`,
+- "API keys" with index: "config" → finds config with API settings
+- search({ query: "tenant", path: "internal/domain/" }) → searches only in internal/domain/`,

   args: {
     query: tool.schema.string().describe("Semantic search query describing what you're looking for"),

@@ -42,6 +43,7 @@ Examples:
     modifiedBefore: tool.schema.string().optional().describe("Filter: only files modified before this ISO date"),
     tags: tool.schema.string().optional().describe("Filter by frontmatter tags (comma-separated, e.g. 'auth,security')"),
     minScore: tool.schema.number().optional().default(0.35).describe("Minimum relevance score (0-1). Results below this threshold are dropped. Default: 0.35"),
+    path: tool.schema.string().optional().describe("Filter by file path prefix (e.g. 'internal/domain/', 'src/components'). Only returns files under this path."),
   },

   async execute(args) {

@@ -119,9 +121,9 @@ Examples:
       }

       allResults.sort((a, b) => {
-        // Prefer combinedScore (hybrid), fall back to …
-        const scoreA = a._combinedScore ?? (a._distance != null ? 1 - a._distance : 0)
-        const scoreB = b._combinedScore ?? (b._distance != null ? 1 - b._distance : 0)
+        // Prefer combinedScore (hybrid), fall back to L2→similarity conversion
+        const scoreA = a._combinedScore ?? (a._distance != null ? Math.max(0, 1 - a._distance / 2) : 0)
+        const scoreB = b._combinedScore ?? (b._distance != null ? Math.max(0, 1 - b._distance / 2) : 0)
        return scoreB - scoreA
       })
       allResults = allResults.slice(0, limit)
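The new fallback maps raw L2 distance into the same 0–1 range that minScore is compared against. For L2-normalized embeddings, the distance between two vectors lies in [0, 2], so 1 − d/2 stays within [0, 1], while the previous 1 − d went negative for any distance above 1. A minimal sketch of the conversion (the normalization assumption is inferred from the new comment, not stated in the diff):

// Minimal sketch of the new L2→similarity fallback. Assumes embeddings are
// L2-normalized, so the pairwise L2 distance falls in [0, 2].
const l2ToSimilarity = (distance: number): number =>
  Math.max(0, 1 - distance / 2)

l2ToSimilarity(0.0) // 1.0 — identical vectors
l2ToSimilarity(1.2) // 0.4 — the old `1 - d` would return -0.2 here
l2ToSimilarity(2.0) // 0.0 — opposite vectors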
@@ -130,7 +132,16 @@ Examples:
       try {
         await fs.access(hashesFile)
       } catch {
-        …
+        // Index doesn't exist — check what indexes ARE available
+        const tempIndexer = await new CodebaseIndexer(projectRoot, "code").init()
+        const available = await tempIndexer.listIndexes()
+        await tempIndexer.unloadModel()
+
+        if (available.length > 0) {
+          const list = available.map(i => `"${i}"`).join(", ")
+          return `Index "${indexName}" not found. Available indexes: ${list}.\n\nTry: search({ query: "${args.query}", index: "${available[0]}" })\nOr search all: search({ query: "${args.query}", searchAll: true })`
+        }
+        return `No indexes found. The codebase needs to be indexed first.\n\nRun the CLI: bunx usethis_search reindex`
       }

       const indexer = await new CodebaseIndexer(projectRoot, indexName).init()
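With this change, a miss on the index name becomes recoverable rather than a dead end: the tool enumerates the indexes that do exist and suggests a corrected call. An illustrative return value, assuming a project that only has a "code" index and a call with index: "docs" (the concrete names here are hypothetical):

Index "docs" not found. Available indexes: "code".

Try: search({ query: "tenant", index: "code" })
Or search all: search({ query: "tenant", searchAll: true })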
@@ -146,23 +157,83 @@ Examples:
         return score >= minScore
       })

-      if (allResults.length === 0) {
+      // ── Path filter — restrict to subtree ──────────────────────────────────
+      if (args.path) {
+        const prefix = args.path.replace(/\/+$/, "") // normalize trailing slash
+        allResults = allResults.filter(r => r.file && r.file.startsWith(prefix))
+      }
+
+      // ── Reranking — boost results where query keywords appear in text ──────
+      // Also store score components for breakdown display
+      const queryKeywords = args.query.toLowerCase().split(/\s+/).filter((w: string) => w.length > 2)
+      for (const r of allResults) {
+        // Vector score (L2 → similarity)
+        const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
+        r._vectorScore = vectorScore
+
+        // BM25 component (present only in hybrid mode — embedded in _combinedScore)
+        // If _combinedScore exists and differs from vectorScore, the difference is BM25 contribution
+        r._bm25Component = r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0
+
+        // Base score before keyword boost
+        const baseScore = r._combinedScore ?? vectorScore
+
+        // Keyword matching
+        const text = (r.content || "").toLowerCase()
+        const matchedKeywords: string[] = []
+        if (queryKeywords.length > 0) {
+          for (const kw of queryKeywords) {
+            if (text.includes(kw)) matchedKeywords.push(kw)
+          }
+        }
+        r._matchedKeywords = matchedKeywords
+        const keywordBonus = queryKeywords.length > 0 ? (matchedKeywords.length / queryKeywords.length) * 0.15 : 0
+        r._keywordBonus = keywordBonus
+        r._finalScore = baseScore + keywordBonus
+      }
+      allResults.sort((a: any, b: any) => (b._finalScore ?? 0) - (a._finalScore ?? 0))
+
+      // ── Group by file — best chunk per file, with chunk count ─────────────
+      const fileGroups = new Map<string, { best: any, chunks: any[] }>()
+      for (const r of allResults) {
+        const key = r.file
+        if (!fileGroups.has(key)) {
+          fileGroups.set(key, { best: r, chunks: [r] })
+        } else {
+          const group = fileGroups.get(key)!
+          group.chunks.push(r)
+          if ((r._finalScore ?? 0) > (group.best._finalScore ?? 0)) {
+            group.best = r
+          }
+        }
+      }
+
+      // Sort groups by best chunk score, take top N unique files
+      const sortedGroups = [...fileGroups.values()]
+        .sort((a, b) => (b.best._finalScore ?? 0) - (a.best._finalScore ?? 0))
+        .slice(0, limit)
+
+      if (sortedGroups.length === 0) {
         const scope = args.searchAll ? "any index" : `index "${indexName}"`
         return `No results found in ${scope} for: "${args.query}" (min score: ${minScore})\n\nTry:\n- Different keywords\n- Lower minScore threshold: search({ query: "...", minScore: 0.2 })\n- Enable hybrid search: search({ query: "...", hybrid: true })`
       }

+      // ── Confidence signal ──────────────────────────────────────────────────
+      const topScore = sortedGroups[0].best._finalScore ?? 0
       const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
       const hybridLabel = args.hybrid ? " [hybrid]" : ""
-      … (9 removed lines truncated in the source view)
+      const pathLabel = args.path ? ` path:"${args.path}"` : ""
+      let output = `## Search Results for: "${args.query}" (${scope}${hybridLabel}${pathLabel})\n\n`
+
+      if (topScore < 0.45) {
+        output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords, different phrasing, or hybrid: true.\n\n`
+      }
+
+      for (let i = 0; i < sortedGroups.length; i++) {
+        const { best: r, chunks } = sortedGroups[i]
+        const score = (r._finalScore ?? 0).toFixed(3)
         const indexLabel = args.searchAll ? ` [${r._index}]` : ""
+        const chunkNote = chunks.length > 1 ? ` (${chunks.length} matching sections)` : ""

         // v2: show rich metadata when available
         const metaParts: string[] = []
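The rerank step is a bounded keyword boost on top of the vector (or hybrid) score: the fraction of query keywords longer than two characters that literally appear in the chunk text, scaled by 0.15, so exact-term matches can add at most +0.15. Results are then deduplicated to one entry per file, keeping the best chunk. A standalone sketch of the same arithmetic, isolated from the tool (the Chunk shape and sample data are hypothetical):

// Sketch of the keyword-bonus reranking, with hypothetical result objects.
interface Chunk { file: string; content: string; baseScore: number }

function rerank(query: string, chunks: Chunk[]): Array<Chunk & { finalScore: number }> {
  const keywords = query.toLowerCase().split(/\s+/).filter(w => w.length > 2)
  return chunks
    .map(c => {
      const text = c.content.toLowerCase()
      const matched = keywords.filter(kw => text.includes(kw)).length
      // At most +0.15: the full bonus only when every keyword appears verbatim.
      const bonus = keywords.length > 0 ? (matched / keywords.length) * 0.15 : 0
      return { ...c, finalScore: c.baseScore + bonus }
    })
    .sort((a, b) => b.finalScore - a.finalScore)
}

// "tenant" and "model" both match → +0.15; only "tenant" → +0.075.
rerank("tenant model", [
  { file: "a.ts", content: "tenant model lookup", baseScore: 0.50 }, // → 0.65
  { file: "b.ts", content: "tenant cache",        baseScore: 0.55 }, // → 0.625
])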
@@ -172,13 +243,36 @@ Examples:
         if (r.class_name) metaParts.push(`class: ${r.class_name}`)
         const metaLine = metaParts.length > 0 ? ` (${metaParts.join(", ")})` : ""

-        … (2 removed lines truncated in the source view)
+        // Score breakdown: vector + bm25 + keywords
+        const breakdownParts: string[] = [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
+        if (r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
+        if (r._keywordBonus > 0.005) breakdownParts.push(`kw: +${r._keywordBonus.toFixed(2)}`)
+        const breakdown = breakdownParts.join(", ")
+
+        // Matched keywords
+        const kwDisplay = r._matchedKeywords && r._matchedKeywords.length > 0
+          ? ` | matched: "${r._matchedKeywords.join('", "')}"`
+          : ""
+
+        output += `### ${i + 1}. ${r.file}${indexLabel}${chunkNote}\n`
+        output += `**Score:** ${score} (${breakdown}${kwDisplay})${metaLine}\n\n`
         output += "```\n"
         const content = r.content.length > 500 ? r.content.substring(0, 500) + "\n... (truncated)" : r.content
         output += content
         output += "\n```\n"

+        // Show second-best chunk from same file if available (brief)
+        if (chunks.length > 1) {
+          const second = chunks.find((c: any) => c !== r)
+          if (second) {
+            const secMeta: string[] = []
+            if (second.function_name) secMeta.push(`fn: ${second.function_name}`)
+            if (second.heading_context) secMeta.push(`"${second.heading_context}"`)
+            const secLabel = secMeta.length > 0 ? ` ${secMeta.join(", ")}` : ""
+            output += `\n*Also:${secLabel}*\n`
+          }
+        }
+
         if (r.relatedContext && r.relatedContext.length > 0) {
           output += "\n**Related Context:**\n"
           for (const rel of r.relatedContext) {
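Taken together, each rendered entry now leads with a score breakdown and the keywords that matched. An illustrative rendering assembled from the template strings above, with invented file names and numbers (vec 0.46 + bm25 0.10 + kw 0.15 = 0.710):

### 1. internal/domain/tenant.go (3 matching sections)
**Score:** 0.710 (vec: 0.46, bm25: +0.10, kw: +0.15 | matched: "tenant", "model") (fn: LoadTenant)

```
func LoadTenant(ctx context.Context, id string) (*Tenant, error) { ... }
```

*Also: fn: SaveTenant*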
@@ -193,7 +287,9 @@ Examples:
           output += "\n"
         }

-      … (1 removed line truncated in the source view)
+      const totalChunks = allResults.length
+      const uniqueFiles = sortedGroups.length
+      output += `---\n*${uniqueFiles} files (${totalChunks} chunks). Use Read tool to see full files.*`
       return output
     } catch (error: any) {
       return `Search failed: ${error.message || String(error)}`
package/vectorizer/index.ts
CHANGED

@@ -239,7 +239,7 @@ async function loadConfig(projectRoot) {
   }

   // Parse vectorizer section from YAML
-  const vectorizerMatch = content.match(/^vectorizer:([\s\S]*?)(?=^[a-zA-Z_\-]…
+  const vectorizerMatch = content.match(/^vectorizer:([\s\S]*?)(?=^[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (!vectorizerMatch) {
     await ensureDefaultConfig(projectRoot);
     return;

@@ -255,7 +255,7 @@ async function loadConfig(projectRoot) {
   }

   // ── Parse cleaning config ───────────────────────────────────────────────
-  const cleaningMatch = section.match(/^\s{2}cleaning:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]…
+  const cleaningMatch = section.match(/^\s{2}cleaning:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (cleaningMatch) {
     const cs = cleaningMatch[1];
     CLEANING_CONFIG = {

@@ -267,7 +267,7 @@ async function loadConfig(projectRoot) {
   }

   // ── Parse chunking config ───────────────────────────────────────────────
-  const chunkingMatch = section.match(/^\s{2}chunking:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]…
+  const chunkingMatch = section.match(/^\s{2}chunking:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (chunkingMatch) {
     const cs = chunkingMatch[1];
     const strategy = parseString(cs, "strategy", "semantic");

@@ -292,7 +292,7 @@ async function loadConfig(projectRoot) {
   }

   // ── Parse search config ─────────────────────────────────────────────────
-  const searchMatch = section.match(/^\s{2}search:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]…
+  const searchMatch = section.match(/^\s{2}search:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (searchMatch) {
     const ss = searchMatch[1];
     HYBRID_CONFIG = {

@@ -302,7 +302,7 @@ async function loadConfig(projectRoot) {
   }

   // ── Parse quality config ────────────────────────────────────────────────
-  const qualityMatch = section.match(/^\s{2}quality:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]…
+  const qualityMatch = section.match(/^\s{2}quality:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (qualityMatch) {
     const qs = qualityMatch[1];
     METRICS_ENABLED = parseBool(qs, "enable_metrics", false);

@@ -310,7 +310,7 @@ async function loadConfig(projectRoot) {
   }

   // ── Parse graph config (v3) ──────────────────────────────────────────────
-  const graphMatch = section.match(/^\s{2}graph:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]…
+  const graphMatch = section.match(/^\s{2}graph:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (graphMatch) {
     const gs = graphMatch[1];
     GRAPH_CONFIG.enabled = parseBool(gs, "enabled", DEFAULT_GRAPH_CONFIG.enabled);

@@ -321,7 +321,7 @@ async function loadConfig(projectRoot) {
   GRAPH_CONFIG.read_intercept = parseBool(gs, "read_intercept", DEFAULT_GRAPH_CONFIG.read_intercept);

   // Nested lsp: section
-  const lspMatch = gs.match(/^\s+lsp:\s*\n([\s\S]*?)(?=^\s{4}[a-zA-Z_\-]…
+  const lspMatch = gs.match(/^\s+lsp:\s*\n([\s\S]*?)(?=^\s{4}[a-zA-Z_\-]+:|(?![\s\S]))/m);
   if (lspMatch) {
     const ls = lspMatch[1];
     GRAPH_CONFIG.lsp.enabled = parseBool(ls, "enabled", DEFAULT_GRAPH_CONFIG.lsp.enabled);

@@ -342,7 +342,7 @@ async function loadConfig(projectRoot) {
   }

   // Parse indexes section
-  const indexesMatch = section.match(/^\s{2}indexes:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\s{2}exclude…
+  const indexesMatch = section.match(/^\s{2}indexes:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\s{2}exclude:|(?![\s\S]))/m);
   if (!indexesMatch) return;

   const indexesSection = indexesMatch[1];

@@ -350,7 +350,7 @@ async function loadConfig(projectRoot) {
   // Parse each index (code, docs, config)
   for (const indexName of ["code", "docs", "config"]) {
     const indexRegex = new RegExp(
-      `^\\s{4}${indexName}:\\s*\\n([\\s\\S]*?)(?=^\\s{4}[a-zA-Z_\\-]…
+      `^\\s{4}${indexName}:\\s*\\n([\\s\\S]*?)(?=^\\s{4}[a-zA-Z_\\-]+:|(?![\\s\\S]))`,
       "m",
     );
     const indexMatch = indexesSection.match(indexRegex);
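All eight regex changes in this file swap the section terminator for (?![\s\S]). With the m flag, $ matches before every newline, so a terminator built on $ can cut a section short at the first line break; (?![\s\S]) succeeds only where no character of any kind follows, i.e. the true end of input. (The removed patterns are truncated in this view, so the exact old terminator isn't visible.) A small check of the new behavior, with a hypothetical YAML sample:

// The new lookahead stops at the next top-level `key:` line or at the true
// end of input — never at a mere line break.
const yaml = [
  "vectorizer:",
  "  model: all-MiniLM-L6-v2",
  "  cleaning:",
  "    enabled: true",
].join("\n")

const m = yaml.match(/^vectorizer:([\s\S]*?)(?=^[a-zA-Z_\-]+:|(?![\s\S]))/m)
console.log(m?.[1])
// "\n  model: all-MiniLM-L6-v2\n  cleaning:\n    enabled: true"
// Indented keys never satisfy `^[a-zA-Z_\-]+:`, so the capture runs to EOF;
// appending another top-level key (e.g. "indexes:") stops it at that line.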