npm - @comfanion/usethis_search - Versions diffs - 4.4.0 → 4.5.0 - Mend

@comfanion/usethis_search 4.4.0 → 4.5.0

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their public registry.

Files changed (22)

package/api.ts +34 -17
package/cache/manager.ts +30 -19
package/cli.ts +8 -5
package/file-indexer.ts +28 -11
package/hooks/message-before.ts +5 -5
package/hooks/tool-substitution.ts +4 -120
package/index.ts +17 -6
package/package.json +3 -2
package/tools/codeindex.ts +192 -184
package/tools/graph.ts +265 -0
package/tools/read-interceptor.ts +7 -3
package/tools/search.ts +268 -190
package/tools/workspace-state.ts +1 -2
package/tools/workspace.ts +76 -108
package/vectorizer/analyzers/lsp-client.ts +52 -6
package/vectorizer/chunkers/chunker-factory.ts +6 -0
package/vectorizer/chunkers/code-chunker.ts +73 -16
package/vectorizer/chunkers/lsp-chunker.ts +313 -191
package/vectorizer/graph-db.ts +6 -4
package/vectorizer/index.ts +328 -132
package/vectorizer/usage-tracker.ts +36 -0
package/vectorizer.yaml +2 -2

package/vectorizer/chunkers/lsp-chunker.ts CHANGED Viewed

@@ -2,15 +2,15 @@
  * LSP-Based Code Chunker
  *
  * Uses Language Server Protocol to get AST-accurate function/class boundaries.
- * Captures godoc/JSDoc comments that belong to each symbol.
  *
- * Advantages over regex-chunker:
- * - ✅ Accurate AST parsing (no brace counting bugs)
- * - ✅ Captures leading documentation comments (godoc, JSDoc, docstrings)
- * - ✅ Handles nested structures (class methods, nested functions)
- * - ✅ Language-agnostic (works for Go, TS, Python, Rust, Java, etc.)
- *
- * Fallback: If LSP unavailable → use regex-chunker
+ * Algorithm:
+ * 1. Get flat list of symbols from LSP (functions, methods, classes)
+ * 2. Sort by start_line
+ * 3. For each symbol: everything between previous symbol's end and this symbol's start
+ *    (comments, blank lines, decorators) → prepend to this symbol's chunk
+ * 4. Content before first symbol → separate "header" chunk (imports, package decl)
+ * 5. Large classes → split into header + individual methods
+ * 6. No duplicate/overlapping chunks — each line belongs to exactly one chunk
  */
 import { LSPClient, LSPSymbolInformation, SymbolKind } from "../analyzers/lsp-client.ts"
@@ -25,7 +25,6 @@ const CHUNKABLE_SYMBOLS = new Set([
   SymbolKind.Class,
   SymbolKind.Interface,
   SymbolKind.Enum,
-  // Note: Struct is not in SymbolKind — Go structs appear as Class
 ])
 /** Map file extension to LSP language ID */
@@ -43,9 +42,17 @@ const EXT_TO_LANGUAGE: Record<string, string> = {
   cs: "csharp",
 }
+interface FlatSymbol {
+  name: string
+  startLine: number
+  endLine: number
+  functionName?: string
+  className?: string
+}
 /**
  * Chunk code using LSP documentSymbol API.
- * Falls back to regex-chunker if LSP unavailable.
+ * Returns null if LSP unavailable → caller falls back to regex chunker.
  */
 export async function chunkCodeWithLSP(
   filePath: string,
@@ -53,43 +60,34 @@ export async function chunkCodeWithLSP(
   config: CodeChunkConfig,
   projectRoot?: string,
 ): Promise<CodeChunk[] | null> {
-  // Check if LSP available for this language
   const ext = filePath.split(".").pop() || ""
   const language = EXT_TO_LANGUAGE[ext]
   if (!language) {
     if (DEBUG) console.log(`[lsp-chunker] No language mapping for .${ext}`)
-    return null // Fallback to regex
+    return null
   }
   const available = await LSPClient.isAvailable(language)
   if (!available) {
     if (DEBUG) console.log(`[lsp-chunker] LSP not available for ${language}`)
-    return null // Fallback to regex
+    return null
   }
-  // Start LSP client
-  const client = new LSPClient(language, projectRoot)
+  const client = new LSPClient(projectRoot || process.cwd())
   try {
-    await client.start()
+    await client.start(language)
     await client.openDocument(filePath, content)
-    // Get document symbols
     const symbols = await client.documentSymbol(filePath)
     if (!symbols || symbols.length === 0) {
       if (DEBUG) console.log(`[lsp-chunker] No symbols found in ${filePath}`)
-      return null // Fallback to regex
+      return null
     }
     const lines = content.split("\n")
-    const chunks: CodeChunk[] = []
-    // Extract chunks from symbols (recursive for nested symbols)
-    extractChunksFromSymbols(symbols, lines, chunks, config)
-    // Add gaps (code between symbols: imports, package declarations, etc.)
-    addGapChunks(chunks, lines, config)
+    const chunks = buildChunks(symbols, lines, config)
-    if (DEBUG) console.log(`[lsp-chunker] Generated ${chunks.length} chunks from ${symbols.length} symbols`)
+    if (DEBUG) console.log(`[lsp-chunker] ${filePath}: ${chunks.length} chunks from ${symbols.length} symbols`)
     await client.closeDocument(filePath)
     await client.stop()
@@ -97,220 +95,344 @@ export async function chunkCodeWithLSP(
     return chunks.length > 0 ? chunks : null
   } catch (error: any) {
     if (DEBUG) console.log(`[lsp-chunker] Error: ${error.message}`)
-    try {
-      await client.stop()
-    } catch {}
-    return null // Fallback to regex
+    try { await client.stop() } catch {}
+    return null
   }
 }
 /**
- * Recursively extract chunks from LSP symbols.
- * Handles nested structures (class methods, nested functions).
+ * Build non-overlapping chunks from LSP symbols.
+ *
+ * Each line in the file belongs to exactly one chunk.
+ * Comments/gaps between symbols are prepended to the next symbol.
+ * Content before the first symbol becomes a "header" chunk.
  */
-function extractChunksFromSymbols(
+function buildChunks(
   symbols: LSPSymbolInformation[],
   lines: string[],
-  chunks: CodeChunk[],
   config: CodeChunkConfig,
-  parentClass?: string,
-): void {
-  for (const symbol of symbols) {
-    // Skip non-chunkable symbols (variables, properties, etc.)
-    if (!CHUNKABLE_SYMBOLS.has(symbol.kind)) continue
+): CodeChunk[] {
+  // Step 1: Flatten symbols into a sorted list of non-overlapping ranges
+  const flat = flattenSymbols(symbols, config)
-    const startLine = symbol.range.start.line
-    const endLine = symbol.range.end.line
-    // Expand startLine backward to capture leading comments (godoc, JSDoc, docstrings)
-    const commentStartLine = captureLeadingComments(lines, startLine)
+  if (flat.length === 0) {
+    // No chunkable symbols — return whole file as one chunk
+    return [{ content: lines.join("\n"), start_line: 0, end_line: lines.length - 1 }]
+  }
-    // Extract chunk content
-    const chunkLines = lines.slice(commentStartLine, endLine + 1)
-    const chunkContent = chunkLines.join("\n")
+  const chunks: CodeChunk[] = []
+  let lastEnd = -1
-    // Check size constraints
-    if (chunkContent.length < config.min_chunk_size && chunkLines.length < 5) {
-      // Too small — skip (will be captured in gaps)
-      continue
+  for (let i = 0; i < flat.length; i++) {
+    const sym = flat[i]
+    // Gap between previous symbol end and this symbol start
+    // → prepend to this symbol (comments, decorators, blank lines)
+    const chunkStart = lastEnd + 1
+    const chunkEnd = sym.endLine
+    // But if there's a large gap with real code before first symbol → separate header chunk
+    if (i === 0 && chunkStart < sym.startLine) {
+      const headerContent = lines.slice(chunkStart, sym.startLine).join("\n").trimEnd()
+      if (headerContent.length > 0 && hasRealCode(headerContent)) {
+        chunks.push({
+          content: headerContent,
+          start_line: chunkStart,
+          end_line: sym.startLine - 1,
+        })
+        // Symbol chunk starts at its own startLine (no gap prepended)
+        const symContent = lines.slice(sym.startLine, chunkEnd + 1).join("\n")
+        chunks.push({
+          content: symContent,
+          function_name: sym.functionName,
+          class_name: sym.className,
+          start_line: sym.startLine,
+          end_line: chunkEnd,
+        })
+        lastEnd = chunkEnd
+        continue
+      }
     }
-    // Determine chunk metadata
-    const isClass = symbol.kind === SymbolKind.Class || symbol.kind === SymbolKind.Interface
-    const isFunction = symbol.kind === SymbolKind.Function || symbol.kind === SymbolKind.Method
+    // Normal case: gap + symbol → one chunk
+    const chunkContent = lines.slice(chunkStart, chunkEnd + 1).join("\n")
-    const chunk: CodeChunk = {
+    chunks.push({
       content: chunkContent,
-      start_line: commentStartLine,
-      end_line: endLine,
-    }
+      function_name: sym.functionName,
+      class_name: sym.className,
+      start_line: chunkStart,
+      end_line: chunkEnd,
+    })
+    lastEnd = chunkEnd
+  }
-    if (isClass) {
-      chunk.class_name = symbol.name
+  // Trailing content after last symbol
+  if (lastEnd < lines.length - 1) {
+    const trailing = lines.slice(lastEnd + 1).join("\n").trimEnd()
+    if (trailing.length > 0) {
+      // Append to last chunk if small, otherwise separate
+      const lastChunk = chunks[chunks.length - 1]
+      if (trailing.length < config.min_chunk_size && lastChunk) {
+        lastChunk.content += "\n" + trailing
+        lastChunk.end_line = lines.length - 1
+      } else {
+        chunks.push({
+          content: trailing,
+          start_line: lastEnd + 1,
+          end_line: lines.length - 1,
+        })
+      }
     }
-    if (isFunction) {
-      chunk.function_name = symbol.name
-      if (parentClass) chunk.class_name = parentClass
+  }
+  // Split any chunk that's still too large
+  const result: CodeChunk[] = []
+  for (const chunk of chunks) {
+    if (chunk.content.length > config.max_chunk_size) {
+      result.push(...splitLargeChunk(chunk, lines, config))
+    } else {
+      result.push(chunk)
     }
+  }
+  // Filter out empty/trivial chunks
+  return result.filter(c => {
+    const trimmed = c.content.trim()
+    if (trimmed.length === 0) return false
+    if (c.function_name || c.class_name) return true
+    if (trimmed.length < 50) return false
+    return true
+  })
+}
+/**
+ * Flatten LSP symbol tree into a sorted, non-overlapping list.
+ *
+ * For small classes (< max_chunk_size): one chunk for the whole class.
+ * For large classes: class header + individual methods.
+ * Nested callbacks/arrow functions inside a function → NOT separate chunks.
+ */
+function flattenSymbols(
+  symbols: LSPSymbolInformation[],
+  config: CodeChunkConfig,
+  parentClass?: string,
+): FlatSymbol[] {
+  const result: FlatSymbol[] = []
+  for (const sym of symbols) {
+    if (!CHUNKABLE_SYMBOLS.has(sym.kind)) continue
-    // If chunk too large → split by children (for classes with many methods)
-    if (chunkContent.length > config.max_chunk_size && symbol.children && symbol.children.length > 0) {
-      if (DEBUG) console.log(`[lsp-chunker] Splitting large ${symbol.kind === SymbolKind.Class ? 'class' : 'symbol'} ${symbol.name}`)
+    const startLine = sym.range.start.line
+    const endLine = sym.range.end.line
+    const isClass = sym.kind === SymbolKind.Class || sym.kind === SymbolKind.Interface || sym.kind === SymbolKind.Enum
+    const isFunction = sym.kind === SymbolKind.Function || sym.kind === SymbolKind.Method
+    if (isClass && sym.children && sym.children.length > 0) {
+      // Check if class content is too large → split into methods
+      // Estimate size: (endLine - startLine) * ~40 chars per line
+      const estimatedSize = (endLine - startLine + 1) * 40
-      // For classes: chunk class header + each method separately
-      if (isClass) {
-        // Find first child's start line
-        const firstChildStart = Math.min(...symbol.children.map(c => c.range.start.line))
+      if (estimatedSize > config.max_chunk_size) {
+        // Large class → flatten children (methods) as separate symbols
+        const methods = flattenSymbols(sym.children, config, sym.name)
-        // Class header chunk (from comment to first method)
-        const headerLines = lines.slice(commentStartLine, firstChildStart)
-        if (headerLines.join("\n").trim().length > 0) {
-          chunks.push({
-            content: headerLines.join("\n"),
-            class_name: symbol.name,
-            start_line: commentStartLine,
-            end_line: firstChildStart - 1,
+        if (methods.length > 0) {
+          // Class header: from class start to first method
+          const firstMethodStart = Math.min(...methods.map(m => m.startLine))
+          if (firstMethodStart > startLine) {
+            result.push({
+              name: sym.name,
+              startLine,
+              endLine: firstMethodStart - 1,
+              className: sym.name,
+            })
+          }
+          result.push(...methods)
+          // Class tail: from last method end to class end
+          const lastMethodEnd = Math.max(...methods.map(m => m.endLine))
+          if (lastMethodEnd < endLine) {
+            result.push({
+              name: `${sym.name}::tail`,
+              startLine: lastMethodEnd + 1,
+              endLine,
+              className: sym.name,
+            })
+          }
+        } else {
+          // No chunkable children → whole class as one chunk
+          result.push({
+            name: sym.name,
+            startLine,
+            endLine,
+            className: sym.name,
           })
         }
-        // Chunk each method separately (with its comments)
-        extractChunksFromSymbols(symbol.children, lines, chunks, config, symbol.name)
       } else {
-        // Non-class: chunk children recursively
-        extractChunksFromSymbols(symbol.children, lines, chunks, config, parentClass)
+        // Small class → one chunk, NO children
+        result.push({
+          name: sym.name,
+          startLine,
+          endLine,
+          className: sym.name,
+        })
       }
-    } else {
-      // Chunk fits size limit → add it
-      chunks.push(chunk)
-      // Still process children if they exist (nested functions in Go, for example)
-      if (symbol.children && symbol.children.length > 0) {
-        extractChunksFromSymbols(symbol.children, lines, chunks, config, isClass ? symbol.name : parentClass)
+    } else if (isFunction) {
+      // Check if function is too large AND has chunkable children
+      // (e.g. describe() with it() blocks, or large function with nested functions)
+      const estimatedSize = (endLine - startLine + 1) * 40
+      const chunkableChildren = sym.children?.filter(c => CHUNKABLE_SYMBOLS.has(c.kind)) || []
+      if (estimatedSize > config.max_chunk_size && chunkableChildren.length > 0) {
+        // Large function with children → split like a class
+        const children = flattenSymbols(sym.children!, config, parentClass)
+        if (children.length > 0) {
+          // Function header: from start to first child
+          const firstChildStart = Math.min(...children.map(m => m.startLine))
+          if (firstChildStart > startLine) {
+            result.push({
+              name: sym.name,
+              startLine,
+              endLine: firstChildStart - 1,
+              functionName: sym.name,
+              className: parentClass,
+            })
+          }
+          result.push(...children)
+          // Function tail: from last child end to function end
+          const lastChildEnd = Math.max(...children.map(m => m.endLine))
+          if (lastChildEnd < endLine) {
+            result.push({
+              name: `${sym.name}::tail`,
+              startLine: lastChildEnd + 1,
+              endLine,
+              functionName: sym.name,
+              className: parentClass,
+            })
+          }
+        } else {
+          // No chunkable children found → keep as one chunk
+          result.push({
+            name: sym.name,
+            startLine,
+            endLine,
+            functionName: sym.name,
+            className: parentClass,
+          })
+        }
+      } else {
+        // Small function or no children → one chunk, NO nested callbacks
+        result.push({
+          name: sym.name,
+          startLine,
+          endLine,
+          functionName: sym.name,
+          className: parentClass,
+        })
       }
+    } else {
+      // Interface, Enum without children
+      result.push({
+        name: sym.name,
+        startLine,
+        endLine,
+        className: sym.name,
+      })
     }
   }
-  // Sort chunks by start_line
-  chunks.sort((a, b) => (a.start_line ?? 0) - (b.start_line ?? 0))
+  // Sort by start line and remove overlaps
+  result.sort((a, b) => a.startLine - b.startLine)
+  return deduplicateRanges(result)
 }
 /**
- * Capture leading comments above a symbol.
- * Walks backward from startLine to find godoc, JSDoc, docstrings, etc.
- *
- * Handles:
- * - Go: // comments (consecutive)
- * - Python: """docstring"""
- * - JS/TS: /** JSDoc *\/ or // comments
- * - Rust: /// doc comments
- * - Java/C#: /** JavaDoc *\/
+ * Remove overlapping ranges — keep the more specific (smaller) one.
+ * After sorting by startLine, if B starts inside A, keep whichever is smaller.
  */
-function captureLeadingComments(lines: string[], startLine: number): number {
-  if (startLine <= 0) return startLine
-  let commentStart = startLine - 1
-  let foundComment = false
-  // Walk backward to find comment block
-  while (commentStart >= 0) {
-    const line = lines[commentStart]
-    const trimmed = line.trim()
-    // Empty line
-    if (trimmed === "") {
-      // Allow max 1 blank line between comment and declaration
-      if (foundComment && commentStart > 0) {
-        const prevLine = lines[commentStart - 1].trim()
-        if (isCommentLine(prevLine)) {
-          commentStart--
-          continue
-        }
+function deduplicateRanges(symbols: FlatSymbol[]): FlatSymbol[] {
+  if (symbols.length <= 1) return symbols
+  const result: FlatSymbol[] = [symbols[0]]
+  for (let i = 1; i < symbols.length; i++) {
+    const prev = result[result.length - 1]
+    const curr = symbols[i]
+    if (curr.startLine <= prev.endLine) {
+      // Overlap — keep the one that's NOT a container
+      // If prev contains curr entirely → prev is a class stub, skip it and keep curr
+      // If curr is inside prev → skip curr (it's a nested callback)
+      if (curr.startLine >= prev.startLine && curr.endLine <= prev.endLine) {
+        // curr is inside prev — skip curr (nested callback/arrow fn)
+        continue
       }
-      break
-    }
-    // Check if line is a comment
-    if (isCommentLine(trimmed)) {
-      foundComment = true
-      commentStart--
-      continue
+      // Partial overlap — adjust prev.endLine
+      prev.endLine = Math.min(prev.endLine, curr.startLine - 1)
     }
-    // Non-comment, non-empty line → stop
-    break
+    result.push(curr)
   }
-  return foundComment ? commentStart + 1 : startLine
+  return result
 }
 /**
- * Check if a line is a documentation comment.
+ * Check if content has real code (not just comments/whitespace/braces).
  */
-function isCommentLine(line: string): boolean {
-  return (
-    line.startsWith("//") ||      // Go, JS, TS, Rust, C++
-    line.startsWith("///") ||     // Rust doc comments
-    line.startsWith("#") ||       // Python
-    line.startsWith("*") ||       // Inside /** ... */
-    line.startsWith("/**") ||     // JSDoc/JavaDoc start
-    line.endsWith("*/") ||        // JSDoc/JavaDoc end
-    line.match(/^("""|''')/) ||   // Python docstring
-    line.startsWith("<!--")       // HTML/Markdown
-  )
+function hasRealCode(content: string): boolean {
+  return content.split("\n").some(l => {
+    const t = l.trim()
+    if (t.length === 0) return false
+    if (t.startsWith("//") || t.startsWith("/*") || t.startsWith("*") || t.startsWith("*/")) return false
+    if (t.startsWith("#") || t.startsWith("<!--")) return false
+    if (/^[{}()\[\];,]+$/.test(t)) return false
+    return true
+  })
 }
 /**
- * Add gap chunks (code between symbols: imports, package decl, constants, etc.)
+ * Split a large chunk by line count, preserving metadata.
  */
-function addGapChunks(chunks: CodeChunk[], lines: string[], config: CodeChunkConfig): void {
-  if (chunks.length === 0) {
-    // No symbols found → chunk entire file
-    chunks.push({
-      content: lines.join("\n"),
-      start_line: 0,
-      end_line: lines.length - 1,
-    })
-    return
-  }
-  const gaps: CodeChunk[] = []
-  let lastEnd = -1
-  for (const chunk of chunks) {
-    const start = chunk.start_line ?? 0
-    // Gap before this chunk
-    if (start > lastEnd + 1) {
-      const gapLines = lines.slice(lastEnd + 1, start)
-      const gapContent = gapLines.join("\n").trim()
-      if (gapContent.length >= config.min_chunk_size) {
-        gaps.push({
-          content: gapContent,
-          start_line: lastEnd + 1,
-          end_line: start - 1,
-        })
-      }
+function splitLargeChunk(chunk: CodeChunk, allLines: string[], config: CodeChunkConfig): CodeChunk[] {
+  const chunkLines = chunk.content.split("\n")
+  const baseLine = chunk.start_line || 0
+  const parts: CodeChunk[] = []
+  let current: string[] = []
+  let currentLen = 0
+  let startLine = baseLine
+  for (let i = 0; i < chunkLines.length; i++) {
+    const line = chunkLines[i]
+    if (currentLen + line.length + 1 > config.max_chunk_size && current.length > 0) {
+      parts.push({
+        content: current.join("\n"),
+        function_name: chunk.function_name,
+        class_name: chunk.class_name,
+        start_line: startLine,
+        end_line: baseLine + i - 1,
+      })
+      current = []
+      currentLen = 0
+      startLine = baseLine + i
     }
-    lastEnd = chunk.end_line ?? start
+    current.push(line)
+    currentLen += line.length + 1
   }
-  // Trailing gap
-  if (lastEnd < lines.length - 1) {
-    const gapLines = lines.slice(lastEnd + 1)
-    const gapContent = gapLines.join("\n").trim()
-    if (gapContent.length >= config.min_chunk_size) {
-      gaps.push({
-        content: gapContent,
-        start_line: lastEnd + 1,
-        end_line: lines.length - 1,
-      })
-    }
+  if (current.length > 0) {
+    parts.push({
+      content: current.join("\n"),
+      function_name: chunk.function_name,
+      class_name: chunk.class_name,
+      start_line: startLine,
+      end_line: baseLine + chunkLines.length - 1,
+    })
   }
-  // Merge gaps into chunks
-  chunks.push(...gaps)
-  chunks.sort((a, b) => (a.start_line ?? 0) - (b.start_line ?? 0))
+  return parts
 }

package/vectorizer/graph-db.ts CHANGED Viewed

@@ -24,6 +24,7 @@ export class GraphDB {
   private _stmtBySubjectPredicate: any = null
   private _stmtByPredicate: any = null
   private _stmtAll: any = null
+  private _stmtDeleteMeta: any = null
   constructor(private dbPath: string) {}
@@ -69,6 +70,7 @@ export class GraphDB {
     this._stmtBySubjectPredicate = this.db.prepare("SELECT * FROM triples WHERE subject = ? AND predicate = ?")
     this._stmtByPredicate = this.db.prepare("SELECT * FROM triples WHERE predicate = ?")
     this._stmtAll = this.db.prepare("SELECT * FROM triples")
+    this._stmtDeleteMeta = this.db.prepare("DELETE FROM triples WHERE subject = ? AND predicate = ?")
     this.initialized = true
     return this
@@ -135,6 +137,7 @@ export class GraphDB {
       this._stmtBySubjectPredicate = null
       this._stmtByPredicate = null
       this._stmtAll = null
+      this._stmtDeleteMeta = null
       this.initialized = false
     }
   }
@@ -173,8 +176,7 @@ export class GraphDB {
     if (!this.initialized || !this.db) throw new Error("GraphDB not initialized. Call init() first.")
     try {
-      this.db!.prepare("DELETE FROM triples WHERE subject = ? AND predicate = ?")
-        .run(`meta:${filePath}`, "graph_built")
+      this._stmtDeleteMeta.run(`meta:${filePath}`, "graph_built")
     } catch {
       // Silently ignore errors
     }
@@ -266,8 +268,8 @@ export class GraphDB {
             })
           }
         }
-      } catch (error) {
-        console.error(`Error traversing graph for ${currentId}:`, error)
+      } catch {
+        // Non-fatal — skip node on error (corrupted edge, closed DB, etc.)
       }
     }