npm - @comfanion/usethis_search - Versions diffs - 0.1.5 → 3.0.0-dev.0 - Mend

@comfanion/usethis_search 0.1.5 → 3.0.0-dev.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (22)

package/README.md +98 -7
package/file-indexer.ts +21 -1
package/index.ts +2 -0
package/package.json +20 -3
package/tools/codeindex.ts +135 -16
package/tools/read-interceptor.ts +54 -0
package/tools/search.ts +60 -12
package/vectorizer/analyzers/lsp-analyzer.ts +162 -0
package/vectorizer/analyzers/regex-analyzer.ts +255 -0
package/vectorizer/bm25-index.ts +155 -0
package/vectorizer/chunkers/chunker-factory.ts +98 -0
package/vectorizer/chunkers/code-chunker.ts +325 -0
package/vectorizer/chunkers/markdown-chunker.ts +177 -0
package/vectorizer/content-cleaner.ts +136 -0
package/vectorizer/graph-builder.ts +95 -0
package/vectorizer/graph-db.ts +97 -0
package/vectorizer/hybrid-search.ts +97 -0
package/vectorizer/index.js +470 -17
package/vectorizer/metadata-extractor.ts +125 -0
package/vectorizer/query-cache.ts +126 -0
package/vectorizer/search-metrics.ts +155 -0
package/vectorizer.yaml +95 -0

package/README.md CHANGED Viewed

@@ -11,9 +11,13 @@ Forget about `grep` and `find` — search code by meaning, not by text!
 An OpenCode plugin that adds **smart search** to your project:
 - 🧠 **Semantic search** — finds code by meaning, even when words don't match
+- 🔀 **Hybrid search (v2)** — combines vector similarity + BM25 keyword matching
+- 🧩 **Semantic chunking (v2)** — structure-aware splitting for Markdown (headings) and code (functions/classes)
+- 🏷️ **Rich metadata (v2)** — filter by file type, language, date, tags
 - ⚡ **Automatic indexing** — files are indexed on change (zero effort)
 - 📦 **Local vectorization** — works offline, no API keys needed
 - 🎯 **Three indexes** — separate for code, docs, and configs
+- 📊 **Quality metrics (v2)** — track search relevance and usage
 - 🌍 **Multilingual** — supports Ukrainian, Russian, and English
 ---
@@ -79,6 +83,31 @@ search({
   query: "database connection",
   searchAll: true
 })
+// v2: Hybrid search (vector + keyword matching)
+search({
+  query: "getUserById",
+  hybrid: true
+})
+// v2: Filter by file type and language
+search({
+  query: "authentication logic",
+  fileType: "code",
+  language: "typescript"
+})
+// v2: Filter by date
+search({
+  query: "recent changes",
+  modifiedAfter: "2024-06-01"
+})
+// v2: Filter by frontmatter tags
+search({
+  query: "security",
+  tags: "auth,security"
+})
 ```
 ### Index Management
@@ -99,6 +128,9 @@ codeindex({
   index: "docs",
   dir: "docs/"
 })
+// v2: Run quality tests against gold dataset
+codeindex({ action: "test", index: "code" })
 ```
 ---
@@ -108,9 +140,12 @@ codeindex({
 ### Semantic Search
 Instead of searching for exact text matches, the plugin:
-1. Converts code into **vectors** (numerical representations of meaning)
-2. Compares vectors of your query with vectors of code
-3. Finds the most **semantically similar** fragments
+1. **Cleans** content (removes TOC, noise, auto-generated markers)
+2. **Chunks** intelligently (Markdown by headings, code by functions/classes)
+3. Converts chunks into **vectors** (numerical representations of meaning)
+4. Compares vectors of your query with vectors of code
+5. Optionally combines with **BM25 keyword search** (hybrid mode)
+6. Returns the most **semantically similar** fragments with rich metadata
 **Example:**
 ```javascript
@@ -138,16 +173,44 @@ vectorizer:
   auto_index: true       # Automatic indexing
   debounce_ms: 1000      # Delay before indexing (ms)
+  # v2: Content cleaning
+  cleaning:
+    remove_toc: true
+    remove_frontmatter_metadata: false
+    remove_imports: false
+    remove_comments: false
+  # v2: Semantic chunking
+  chunking:
+    strategy: "semantic"  # fixed | semantic
+    markdown:
+      split_by_headings: true
+      min_chunk_size: 200
+      max_chunk_size: 2000
+      preserve_heading_hierarchy: true
+    code:
+      split_by_functions: true
+      include_function_signature: true
+      min_chunk_size: 300
+      max_chunk_size: 1500
+  # v2: Hybrid search
+  search:
+    hybrid: false         # vector + BM25
+    bm25_weight: 0.3
+  # v2: Quality monitoring
+  quality:
+    enable_metrics: false
+    enable_cache: true
   indexes:
     code:
       enabled: true
-      extensions: [.js, .ts, .jsx, .tsx, .py, .go, ...]
     docs:
       enabled: true
-      extensions: [.md, .mdx, .txt, .rst, .adoc]
     config:
-      enabled: false     # Disabled by default
-      extensions: [.yaml, .yml, .json, .toml, ...]
+      enabled: false
   exclude:
     - node_modules
@@ -324,6 +387,34 @@ codeindex({ action: "list" })
 - **Model size:** ~23 MB (downloaded once)
 - **Speed:** ~0.5 sec/file (after model loading)
+### v2 Architecture
+```
+File → Content Cleaner → Chunker Factory → Embedder → LanceDB
+                           ├── Markdown Chunker (heading-aware)
+                           ├── Code Chunker (function/class-aware)
+                           └── Fixed Chunker (fallback)
+Query → Query Cache → Embedder → Vector Search ─┐
+                    └──────────→ BM25 Search ────┤→ Hybrid Merge → Filter → Results
+                                                 │
+                                        Metadata Filter (type, lang, date, tags)
+```
+### New Modules (v2)
+| Module | Purpose |
+|--------|---------|
+| `content-cleaner.ts` | Remove noise (TOC, breadcrumbs, markers) |
+| `metadata-extractor.ts` | Extract file_type, language, tags, dates |
+| `markdown-chunker.ts` | Heading-aware splitting with hierarchy |
+| `code-chunker.ts` | Function/class-aware splitting |
+| `chunker-factory.ts` | Route to correct chunker by file type |
+| `bm25-index.ts` | Inverted index for keyword search |
+| `hybrid-search.ts` | Merge vector + BM25 scores |
+| `query-cache.ts` | LRU cache for query embeddings |
+| `search-metrics.ts` | Track search quality metrics |
 ---
 ## 🤝 Contributing

package/file-indexer.ts CHANGED Viewed

@@ -326,6 +326,8 @@ async function ensureIndexOnSessionStart(
   return { totalFiles, elapsedSeconds, action }
 }
+const STALE_THRESHOLD_MS = 5 * 60 * 1000 // 5 minutes — evict stuck entries
 async function processPendingFiles(projectRoot: string, config: VectorizerConfig): Promise<void> {
   if (pendingFiles.size === 0) return
   if (SKIP_AUTO_INDEX) {
@@ -335,6 +337,7 @@ async function processPendingFiles(projectRoot: string, config: VectorizerConfig
   const now = Date.now()
   const filesToProcess: Map<string, string[]> = new Map()
+  const staleKeys: string[] = []
   for (const [filePath, info] of pendingFiles.entries()) {
     if (now - info.timestamp >= config.debounce_ms) {
@@ -342,9 +345,17 @@ async function processPendingFiles(projectRoot: string, config: VectorizerConfig
       files.push(filePath)
       filesToProcess.set(info.indexName, files)
       pendingFiles.delete(filePath)
+    } else if (now - info.timestamp > STALE_THRESHOLD_MS) {
+      staleKeys.push(filePath)
     }
   }
+  // Evict entries stuck for >5 minutes (prevents unbounded growth)
+  for (const key of staleKeys) {
+    debug(`Evicting stale pending file: ${key}`)
+    pendingFiles.delete(key)
+  }
   if (filesToProcess.size === 0) return
   debug(`Processing ${filesToProcess.size} index(es)...`)
@@ -425,6 +436,9 @@ export const FileIndexerPlugin: Plugin = async ({ directory, client }) => {
     }, 1000)
   }
+  let lastProcessTime = Date.now()
+  const MAX_DEBOUNCE_WAIT_MS = 5000 // Force processing after 5s of rapid edits
   function queueFileForIndexing(filePath: string): void {
     const relativePath = path.relative(directory, filePath)
     if (relativePath.startsWith("..") || path.isAbsolute(relativePath)) return
@@ -439,9 +453,15 @@ export const FileIndexerPlugin: Plugin = async ({ directory, client }) => {
     if (processingTimeout) {
       clearTimeout(processingTimeout)
     }
+    // If rapid edits keep resetting the timer, force processing after MAX_DEBOUNCE_WAIT_MS
+    const timeSinceLast = Date.now() - lastProcessTime
+    const waitTime = timeSinceLast > MAX_DEBOUNCE_WAIT_MS ? 0 : config.debounce_ms + 100
     processingTimeout = setTimeout(async () => {
+      lastProcessTime = Date.now()
       await processPendingFiles(directory, config)
-    }, config.debounce_ms + 100)
+    }, waitTime)
   }
   return {

package/index.ts CHANGED Viewed

@@ -2,6 +2,7 @@ import type { Plugin } from "@opencode-ai/plugin"
 import search from "./tools/search"
 import codeindex from "./tools/codeindex"
+import readInterceptor from "./tools/read-interceptor"
 import FileIndexerPlugin from "./file-indexer"
 const UsethisSearchPlugin: Plugin = async (ctx) => {
@@ -12,6 +13,7 @@ const UsethisSearchPlugin: Plugin = async (ctx) => {
     tool: {
       search,
       codeindex,
+      read: readInterceptor,
     },
   }
 }

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "0.1.5",
-  "description": "OpenCode plugin: semantic search + code index management",
+  "version": "3.0.0-dev.0",
+  "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
   "type": "module",
   "main": "./index.ts",
   "exports": {
@@ -15,14 +15,31 @@
     "file-indexer.ts",
     "tools/search.ts",
     "tools/codeindex.ts",
+    "tools/read-interceptor.ts",
     "vectorizer/index.js",
+    "vectorizer/content-cleaner.ts",
+    "vectorizer/metadata-extractor.ts",
+    "vectorizer/bm25-index.ts",
+    "vectorizer/hybrid-search.ts",
+    "vectorizer/query-cache.ts",
+    "vectorizer/search-metrics.ts",
+    "vectorizer/graph-db.ts",
+    "vectorizer/graph-builder.ts",
+    "vectorizer/analyzers/regex-analyzer.ts",
+    "vectorizer/analyzers/lsp-analyzer.ts",
+    "vectorizer/chunkers/markdown-chunker.ts",
+    "vectorizer/chunkers/code-chunker.ts",
+    "vectorizer/chunkers/chunker-factory.ts",
+    "vectorizer.yaml",
     "README.md",
     "LICENSE"
   ],
   "dependencies": {
-    "@opencode-ai/plugin": "1.1.39",
+    "@opencode-ai/plugin": ">=1.1.0",
     "@xenova/transformers": "^2.17.0",
     "glob": "^10.3.10",
+    "level": "^8.0.1",
+    "levelgraph": "^4.0.0",
     "vectordb": "^0.4.0"
   },
   "peerDependencies": {

package/tools/codeindex.ts CHANGED Viewed

@@ -1,7 +1,8 @@
 /**
- * Code Index Status & Management Tool
+ * Code Index Status & Management Tool (v2)
  *
  * Uses bundled vectorizer. Index data is stored in `.opencode/vectors/<index>/`.
+ * v2: added "test" action for gold dataset testing, richer stats.
  */
 import { tool } from "@opencode-ai/plugin"
@@ -59,6 +60,7 @@ Actions:
 - "status" → Show index statistics
 - "list" → List all available indexes with stats
 - "reindex" → Re-index files using local vectorizer
+- "test" → Run gold dataset quality tests (if configured)
 Available indexes:
 - "code" - Source code files
@@ -66,7 +68,7 @@ Available indexes:
 - "config" - Configuration files`,
   args: {
-    action: tool.schema.enum(["status", "list", "reindex"]).describe("Action to perform"),
+    action: tool.schema.enum(["status", "list", "reindex", "test"]).describe("Action to perform"),
     index: tool.schema.string().optional().default("code").describe("Index name: code, docs, config"),
     dir: tool.schema.string().optional().describe("Directory to index (default: project root)"),
   },
@@ -87,7 +89,7 @@ Available indexes:
       } catch {}
       if (indexes.length === 0) {
-        output += `⚠️ No indexes created yet\n\nCreate indexes:\n\n\`\`\`\n`
+        output += `No indexes created yet\n\nCreate indexes:\n\n\`\`\`\n`
         output += `codeindex({ action: "reindex", index: "code" })\n`
         output += `codeindex({ action: "reindex", index: "docs", dir: "docs/" })\n`
         output += `\`\`\`\n`
@@ -95,31 +97,62 @@ Available indexes:
         output += `### Active Indexes\n\n`
         for (const idx of indexes) {
           try {
-            const hashesPath = path.join(vectorsDir, idx, "hashes.json")
-            const hashes = JSON.parse(await fs.readFile(hashesPath, "utf8"))
-            const fileCount = Object.keys(hashes).length
+            const indexer = await new CodebaseIndexer(projectRoot, idx).init()
+            const stats = await indexer.getStats()
+            await indexer.unloadModel()
             const desc = INDEX_DESCRIPTIONS[idx] || "Custom index"
-            output += `- ${idx} - ${desc} (files: ${fileCount})\n`
+            const features = stats.features
+              ? ` | chunking: ${stats.features.chunking}, hybrid: ${stats.features.hybrid ? "on" : "off"}`
+              : ""
+            output += `- **${idx}** - ${desc} (files: ${stats.fileCount}, chunks: ${stats.chunkCount}${features})\n`
           } catch {
             output += `- ${idx}\n`
           }
         }
       }
-      output += `\n### Usage\n\n\`\`\`\nsearch({ query: "your query", index: "code" })\n\`\`\``
+      output += `\n### Usage\n\n\`\`\`\nsearch({ query: "your query", index: "code" })\nsearch({ query: "your query", hybrid: true })  // v2: hybrid search\nsearch({ query: "your query", fileType: "code", language: "typescript" })  // v2: filters\n\`\`\``
       return output
     }
     if (args.action === "status") {
       const hashesFile = path.join(vectorsDir, indexName, "hashes.json")
       try {
-        const hashesContent = await fs.readFile(hashesFile, "utf8")
-        const hashes = JSON.parse(hashesContent)
-        const fileCount = Object.keys(hashes).length
-        const sampleFiles = Object.keys(hashes).slice(0, 5)
+        const indexer = await new CodebaseIndexer(projectRoot, indexName).init()
+        const stats = await indexer.getStats()
+        await indexer.unloadModel()
+        const sampleFiles = Object.keys(JSON.parse(await fs.readFile(hashesFile, "utf8"))).slice(0, 5)
         const desc = INDEX_DESCRIPTIONS[indexName] || "Custom index"
-        return `## Index Status: "${indexName}"\n\n**Description:** ${desc}\n**Files indexed:** ${fileCount}\n\n**Sample indexed files:**\n${sampleFiles.map((f) => `- ${f}`).join("\n")}${fileCount > 5 ? `\n- ... and ${fileCount - 5} more` : ""}`
+        let output = `## Index Status: "${indexName}"\n\n`
+        output += `**Description:** ${desc}\n`
+        output += `**Files indexed:** ${stats.fileCount}\n`
+        output += `**Total chunks:** ${stats.chunkCount}\n`
+        output += `**Model:** ${stats.model}\n`
+        if (stats.features) {
+          output += `\n**Features:**\n`
+          output += `- Chunking strategy: ${stats.features.chunking}\n`
+          output += `- Hybrid search: ${stats.features.hybrid ? "enabled" : "disabled"}\n`
+          output += `- Metrics: ${stats.features.metrics ? "enabled" : "disabled"}\n`
+          output += `- Query cache: ${stats.features.cache ? "enabled" : "disabled"}\n`
+        }
+        // Show metrics summary if available
+        try {
+          const metrics = await indexer.getMetrics()
+          if (metrics.total_queries > 0) {
+            output += `\n**Search Metrics:**\n`
+            output += `- Total queries: ${metrics.total_queries}\n`
+            output += `- Avg results/query: ${metrics.avg_results_per_query.toFixed(1)}\n`
+            output += `- Zero results rate: ${(metrics.zero_results_rate * 100).toFixed(1)}%\n`
+            output += `- Avg relevance: ${metrics.avg_relevance.toFixed(3)}\n`
+          }
+        } catch {}
+        output += `\n**Sample indexed files:**\n${sampleFiles.map((f) => `- ${f}`).join("\n")}${stats.fileCount > 5 ? `\n- ... and ${stats.fileCount - 5} more` : ""}`
+        return output
       } catch {
         return `## Index Status: "${indexName}"\n\nIndex "${indexName}" not created yet. Create it with: codeindex({ action: "reindex", index: "${indexName}" })`
       }
@@ -148,12 +181,98 @@ Available indexes:
         await indexer.unloadModel()
         const stats = await indexer.getStats()
-        return `## Re-indexing Complete ✅\n\n**Index:** ${indexName}\n**Directory:** ${args.dir || "(project root)"}\n**Files found:** ${files.length}\n**Files indexed:** ${indexed}\n**Files unchanged:** ${skipped}\n**Total chunks:** ${stats.chunkCount}`
+        let output = `## Re-indexing Complete\n\n`
+        output += `**Index:** ${indexName}\n`
+        output += `**Directory:** ${args.dir || "(project root)"}\n`
+        output += `**Files found:** ${files.length}\n`
+        output += `**Files indexed:** ${indexed}\n`
+        output += `**Files unchanged:** ${skipped}\n`
+        output += `**Total chunks:** ${stats.chunkCount}\n`
+        if (stats.features) {
+          output += `**Chunking:** ${stats.features.chunking}\n`
+        }
+        return output
+      } catch (error: any) {
+        return `Re-indexing failed: ${error.message || String(error)}`
+      }
+    }
+    if (args.action === "test") {
+      try {
+        const goldPath = path.join(projectRoot, ".opencode", "vectors", "gold-dataset.yaml")
+        let goldContent: string
+        try {
+          goldContent = await fs.readFile(goldPath, "utf8")
+        } catch {
+          return `## Gold Dataset Test\n\nNo gold dataset found at: ${goldPath}\n\nCreate one with test queries and expected results.\nSee docs/search-plugin-upgrade-plan.md for format.`
+        }
+        // Simple YAML parsing for test queries
+        const tests: { query: string; expected_files: string[]; min_relevance: number; description?: string }[] = []
+        const queryBlocks = goldContent.split(/\n\s+-\s+query:\s*/)
+        for (const block of queryBlocks.slice(1)) {
+          const queryMatch = block.match(/^["']?([^"'\n]+)["']?/)
+          const filesMatch = block.match(/expected_files:\s*\n((?:\s+-\s+.+\n?)+)/)
+          const relMatch = block.match(/min_relevance:\s*([\d.]+)/)
+          const descMatch = block.match(/description:\s*["']?([^"'\n]+)/)
+          if (queryMatch) {
+            const expectedFiles = filesMatch
+              ? filesMatch[1].split("\n").map(l => l.replace(/^\s+-\s+["']?/, "").replace(/["']$/, "").trim()).filter(Boolean)
+              : []
+            tests.push({
+              query: queryMatch[1].trim(),
+              expected_files: expectedFiles,
+              min_relevance: relMatch ? parseFloat(relMatch[1]) : 0.7,
+              description: descMatch ? descMatch[1].trim() : undefined,
+            })
+          }
+        }
+        if (tests.length === 0) {
+          return `## Gold Dataset Test\n\nNo test queries found in gold dataset.`
+        }
+        const indexer = await new CodebaseIndexer(projectRoot, indexName).init()
+        let passed = 0
+        let failed = 0
+        let output = `## Gold Dataset Test Results\n\n`
+        for (const t of tests) {
+          const results = await indexer.search(t.query, 10, false)
+          const foundFiles = results.map((r: any) => r.file)
+          const foundExpected = t.expected_files.filter(f => foundFiles.includes(f))
+          const topScore = results.length > 0 && results[0]._distance != null
+            ? 1 - results[0]._distance
+            : 0
+          const pass = foundExpected.length >= Math.ceil(t.expected_files.length * 0.5) && topScore >= t.min_relevance
+          if (pass) {
+            passed++
+            output += `**PASS** Query: "${t.query}"\n`
+          } else {
+            failed++
+            output += `**FAIL** Query: "${t.query}"\n`
+          }
+          output += `  Found: ${foundFiles.slice(0, 3).map((f: string) => `${f} (${(1 - (results.find((r: any) => r.file === f)?._distance ?? 1)).toFixed(2)})`).join(", ")}\n`
+          if (foundExpected.length < t.expected_files.length) {
+            const missing = t.expected_files.filter(f => !foundFiles.includes(f))
+            output += `  Missing: ${missing.join(", ")}\n`
+          }
+          output += `\n`
+        }
+        await indexer.unloadModel()
+        output += `---\n**Summary:** ${passed}/${tests.length} tests passed (${Math.round(passed / tests.length * 100)}%)\n`
+        return output
       } catch (error: any) {
-        return `❌ Re-indexing failed: ${error.message || String(error)}`
+        return `Gold dataset test failed: ${error.message || String(error)}`
       }
     }
-    return `Unknown action: ${args.action}. Use: status, list, or reindex`
+    return `Unknown action: ${args.action}. Use: status, list, reindex, or test`
   },
 })

package/tools/read-interceptor.ts ADDED Viewed

@@ -0,0 +1,54 @@
+import { tool } from "@opencode-ai/plugin"
+import path from "path"
+import { CodebaseIndexer } from "../vectorizer/index.js"
+export default tool({
+  description: `Read file with graph-aware context attachment. When available, this tool searches the file in the index and returns content + related context from the graph (imports, links, etc.).
+Use this instead of the standard Read tool for better context awareness.`,
+  args: {
+    filePath: tool.schema.string().describe("Path to the file to read"),
+  },
+  async execute(args) {
+    const projectRoot = process.cwd()
+    const filePath = path.isAbsolute(args.filePath) ? args.filePath : path.join(projectRoot, args.filePath)
+    const relPath = path.relative(projectRoot, filePath)
+    const indexer = await new CodebaseIndexer(projectRoot, "code").init()
+    const results = await indexer.search(relPath, 20, false, {})
+    const fileChunks = results.filter(r => r.file === relPath)
+    await indexer.unloadModel()
+    if (fileChunks.length === 0) {
+      return `File "${relPath}" not indexed. Use original Read tool or run codeindex({ action: "reindex", index: "code" })`
+    }
+    let output = `## ${relPath}\n\n`
+    output += `### Content\n\n`
+    for (const chunk of fileChunks) {
+      output += chunk.content + "\n\n"
+    }
+    const allRelated = fileChunks
+      .flatMap(c => c.relatedContext || [])
+      .filter((r, i, arr) => arr.findIndex(x => x.chunk_id === r.chunk_id) === i)
+    if (allRelated.length > 0) {
+      output += `### Related Context\n\n`
+      for (const rel of allRelated) {
+        const snippet = rel.content.length > 300
+          ? rel.content.substring(0, 300) + "..."
+          : rel.content
+        output += `**${rel.file}** (${rel.relation})\n`
+        output += `\`\`\`\n${snippet}\n\`\`\`\n\n`
+      }
+    }
+    return output
+  },
+})