npm - @comfanion/usethis_search - Versions diffs - 4.2.0-dev.3 → 4.2.0-dev.4 - Mend

@comfanion/usethis_search 4.2.0-dev.3 → 4.2.0-dev.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/package.json +3 -2
package/vectorizer/chunkers/lsp-chunker.ts +316 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "4.2.0-dev.3",
+  "version": "4.2.0-dev.4",
   "description": "OpenCode plugin: semantic search with chunk-based workspace injection (v4.2-dev: chunk-level context, granular detach, improved token efficiency)",
   "type": "module",
   "main": "./index.ts",
@@ -15,7 +15,7 @@
     "index:clear": "bun run cli.ts clear"
   },
   "bin": {
-    "usethis-search": "./cli.ts"
+    "usethis-search": "cli.ts"
   },
   "files": [
     "index.ts",
@@ -45,6 +45,7 @@
     "vectorizer/analyzers/lsp-client.ts",
     "vectorizer/chunkers/markdown-chunker.ts",
     "vectorizer/chunkers/code-chunker.ts",
+    "vectorizer/chunkers/lsp-chunker.ts",
     "vectorizer/chunkers/chunker-factory.ts",
     "vectorizer.yaml",
     "README.md",

package/vectorizer/chunkers/lsp-chunker.ts ADDED Viewed

@@ -0,0 +1,316 @@
+/**
+ * LSP-Based Code Chunker
+ *
+ * Uses Language Server Protocol to get AST-accurate function/class boundaries.
+ * Captures godoc/JSDoc comments that belong to each symbol.
+ *
+ * Advantages over regex-chunker:
+ * - ✅ Accurate AST parsing (no brace counting bugs)
+ * - ✅ Captures leading documentation comments (godoc, JSDoc, docstrings)
+ * - ✅ Handles nested structures (class methods, nested functions)
+ * - ✅ Language-agnostic (works for Go, TS, Python, Rust, Java, etc.)
+ *
+ * Fallback: If LSP unavailable → use regex-chunker
+ */
+import { LSPClient, LSPSymbolInformation, SymbolKind } from "../analyzers/lsp-client.ts"
+import type { CodeChunk, CodeChunkConfig } from "./code-chunker.ts"
// Verbose logging switch: enabled only when DEBUG_LSP_CHUNKER is exactly "true".
const DEBUG = process.env.DEBUG_LSP_CHUNKER === "true"

/**
 * LSP symbol kinds that are turned into standalone chunks.
 *
 * SymbolKind has no Struct member — Go structs appear as Class.
 */
const CHUNKABLE_SYMBOLS = new Set([
  SymbolKind.Function,
  SymbolKind.Method,
  SymbolKind.Class,
  SymbolKind.Interface,
  SymbolKind.Enum,
])
/**
 * Map file extension (no leading dot) to LSP language ID.
 *
 * The table is built on a null-prototype object: it is indexed with keys
 * derived from file names, and on a plain object literal a key such as
 * "constructor" or "toString" would resolve to a member inherited from
 * Object.prototype instead of `undefined`.
 */
const EXT_TO_LANGUAGE: Record<string, string> = Object.assign(
  Object.create(null) as Record<string, string>,
  {
    ts: "typescript",
    js: "javascript",
    tsx: "typescriptreact",
    jsx: "javascriptreact",
    py: "python",
    go: "go",
    rs: "rust",
    java: "java",
    cpp: "cpp",
    c: "c",
    cs: "csharp",
  },
)
/**
 * Chunk code using the LSP documentSymbol API.
 *
 * This function never throws. It resolves to `null` whenever LSP chunking
 * cannot be used, so the caller can fall back to the regex chunker:
 * - the file extension has no entry in EXT_TO_LANGUAGE,
 * - `LSPClient.isAvailable` reports no server for the language,
 * - the server returns no symbols,
 * - no chunk was produced, or
 * - any LSP call fails.
 *
 * @param filePath - Path of the file; its extension selects the language server.
 * @param content - Full text of the file, sent to the server and split on "\n".
 * @param config - Size limits forwarded to the chunk extraction helpers.
 * @param projectRoot - Optional root passed through to the LSP client.
 * @returns The chunks (symbols plus the gaps between them), or `null` to request fallback.
 */
export async function chunkCodeWithLSP(
  filePath: string,
  content: string,
  config: CodeChunkConfig,
  projectRoot?: string,
): Promise<CodeChunk[] | null> {
  // Read the extension from the last path segment only, so that a dot in a
  // directory name or a file without any dot is not mistaken for an extension.
  const lastSeparator = Math.max(filePath.lastIndexOf("/"), filePath.lastIndexOf("\\"))
  const baseName = filePath.slice(lastSeparator + 1)
  const dotIndex = baseName.lastIndexOf(".")
  const ext = dotIndex >= 0 ? baseName.slice(dotIndex + 1).toLowerCase() : ""

  // Own-property lookup: never pick up members inherited from Object.prototype.
  const language = Object.prototype.hasOwnProperty.call(EXT_TO_LANGUAGE, ext)
    ? EXT_TO_LANGUAGE[ext]
    : undefined
  if (!language) {
    if (DEBUG) console.log(`[lsp-chunker] No language mapping for .${ext}`)
    return null // Fallback to regex
  }

  const available = await LSPClient.isAvailable(language)
  if (!available) {
    if (DEBUG) console.log(`[lsp-chunker] LSP not available for ${language}`)
    return null // Fallback to regex
  }

  const client = new LSPClient(language, projectRoot)
  let documentOpened = false
  try {
    await client.start()
    await client.openDocument(filePath, content)
    documentOpened = true

    const symbols = await client.documentSymbol(filePath)
    if (!symbols || symbols.length === 0) {
      if (DEBUG) console.log(`[lsp-chunker] No symbols found in ${filePath}`)
      return null // Fallback to regex
    }

    const lines = content.split("\n")
    const chunks: CodeChunk[] = []
    // Extract chunks from symbols (recursive for nested symbols)
    extractChunksFromSymbols(symbols, lines, chunks, config)
    // Add gaps (code between symbols: imports, package declarations, etc.)
    addGapChunks(chunks, lines, config)
    if (DEBUG) console.log(`[lsp-chunker] Generated ${chunks.length} chunks from ${symbols.length} symbols`)
    return chunks.length > 0 ? chunks : null
  } catch (error: unknown) {
    if (DEBUG) {
      const message = error instanceof Error ? error.message : String(error)
      console.log(`[lsp-chunker] Error: ${message}`)
    }
    return null // Fallback to regex
  } finally {
    // Cleanup runs on every exit path (success, early return, error) so the
    // language server is never left running. It is best-effort: a failure
    // here must not discard chunks that were already computed.
    if (documentOpened) {
      try {
        await client.closeDocument(filePath)
      } catch (error: unknown) {
        if (DEBUG) console.log(`[lsp-chunker] closeDocument failed: ${String(error)}`)
      }
    }
    try {
      await client.stop()
    } catch (error: unknown) {
      if (DEBUG) console.log(`[lsp-chunker] stop failed: ${String(error)}`)
    }
  }
}
/**
 * Recursively extract chunks from LSP symbols.
 * Handles nested structures (class methods, nested functions).
 *
 * Appends to `chunks` in place and leaves it sorted by `start_line`.
 *
 * @param symbols - Symbols at the current nesting level.
 * @param lines - File content split into lines; symbol ranges index into it.
 * @param chunks - Output array, mutated in place.
 * @param config - Supplies `min_chunk_size` and `max_chunk_size` (compared against character counts).
 * @param parentClass - Name of the enclosing class/interface, recorded on function chunks.
 */
function extractChunksFromSymbols(
  symbols: LSPSymbolInformation[],
  lines: string[],
  chunks: CodeChunk[],
  config: CodeChunkConfig,
  parentClass?: string,
): void {
  collectSymbolChunks(symbols, lines, chunks, config, parentClass)
  // Sort once, after the whole symbol tree has been walked.
  chunks.sort((a, b) => (a.start_line ?? 0) - (b.start_line ?? 0))
}

/** Depth-first walk that appends one chunk per chunkable symbol; performs no sorting. */
function collectSymbolChunks(
  symbols: LSPSymbolInformation[],
  lines: string[],
  chunks: CodeChunk[],
  config: CodeChunkConfig,
  parentClass?: string,
): void {
  for (const symbol of symbols) {
    // Skip non-chunkable symbols (variables, properties, etc.)
    if (!CHUNKABLE_SYMBOLS.has(symbol.kind)) continue

    const endLine = symbol.range.end.line
    // Expand the start backward to include leading comments (godoc, JSDoc, docstrings)
    const commentStartLine = captureLeadingComments(lines, symbol.range.start.line)
    const chunkLines = lines.slice(commentStartLine, endLine + 1)
    const chunkContent = chunkLines.join("\n")

    // Too small on both measures — skip; addGapChunks picks these lines up later.
    if (chunkContent.length < config.min_chunk_size && chunkLines.length < 5) continue

    const isClass = symbol.kind === SymbolKind.Class || symbol.kind === SymbolKind.Interface
    const isFunction = symbol.kind === SymbolKind.Function || symbol.kind === SymbolKind.Method
    const children = symbol.children ?? []
    const hasChildren = children.length > 0

    // Oversized symbol with children → emit the children instead of the whole symbol.
    if (chunkContent.length > config.max_chunk_size && hasChildren) {
      if (DEBUG) console.log(`[lsp-chunker] Splitting large ${symbol.kind === SymbolKind.Class ? 'class' : 'symbol'} ${symbol.name}`)

      if (!isClass) {
        collectSymbolChunks(children, lines, chunks, config, parentClass)
        continue
      }

      // Class header chunk: from the class's own comment up to the first child.
      // It ends before the first child's leading comment, because that comment
      // is captured by the child's own chunk and would otherwise be duplicated.
      const firstChildStart = Math.min(...children.map((child) => child.range.start.line))
      const headerEnd = captureLeadingComments(lines, firstChildStart)
      const headerContent = lines.slice(commentStartLine, headerEnd).join("\n")
      if (headerContent.trim().length > 0) {
        chunks.push({
          content: headerContent,
          class_name: symbol.name,
          start_line: commentStartLine,
          end_line: headerEnd - 1,
        })
      }
      // Chunk each member separately (with its comments)
      collectSymbolChunks(children, lines, chunks, config, symbol.name)
      continue
    }

    // Symbol fits the size limit (or has no children to split by) → emit it whole.
    const chunk: CodeChunk = {
      content: chunkContent,
      start_line: commentStartLine,
      end_line: endLine,
    }
    if (isClass) {
      chunk.class_name = symbol.name
    }
    if (isFunction) {
      chunk.function_name = symbol.name
      if (parentClass) chunk.class_name = parentClass
    }
    chunks.push(chunk)

    // Children are still chunked individually (nested functions in Go, for example)
    if (hasChildren) {
      collectSymbolChunks(children, lines, chunks, config, isClass ? symbol.name : parentClass)
    }
  }
}
/**
 * Capture leading comments above a symbol.
 * Walks backward from the line just above `startLine` and returns the index
 * of the first line of the comment block attached to the symbol.
 *
 * Rules, as implemented:
 * - Lines are classified by `isCommentLine` after trimming.
 * - The comment block must sit directly above the symbol: a blank line
 *   immediately above `startLine` ends the search at once.
 * - Once a comment line has been seen, a single blank line is stepped over
 *   only if the line above it is again a comment line.
 * - Any other non-comment line ends the search.
 *
 * @param lines - File content split into lines.
 * @param startLine - Zero-based index of the symbol's first line.
 * @returns Index of the first captured comment line, or `startLine` unchanged
 *          when no comment was found or `startLine` lies outside the file.
 */
function captureLeadingComments(lines: string[], startLine: number): number {
  // Nothing sits above line 0, and a start past the end of the file has no
  // preceding line to inspect (indexing there would yield undefined).
  if (startLine <= 0 || startLine > lines.length) return startLine

  // Stays equal to startLine until at least one comment line has been seen.
  let firstCommentLine = startLine

  for (let index = startLine - 1; index >= 0; index--) {
    const trimmed = lines[index].trim()

    if (trimmed === "") {
      const sawComment = firstCommentLine < startLine
      // Step over the blank line only when the comment block continues above it.
      if (sawComment && index > 0 && isCommentLine(lines[index - 1].trim())) continue
      break
    }

    // Non-comment, non-empty line → stop
    if (!isCommentLine(trimmed)) break

    firstCommentLine = index
  }

  return firstCommentLine
}
/**
 * Prefixes that mark a (trimmed) line as part of a comment block.
 * "//" also covers Rust "///" doc comments; "/*" covers both plain block
 * comments and JSDoc/JavaDoc openers.
 */
const COMMENT_LINE_PREFIXES = [
  "//",   // Go, JS, TS, Rust, C++
  "#",    // Python
  "*",    // Continuation line inside a block comment
  "/*",   // Block comment / JSDoc / JavaDoc start
  '"""',  // Python docstring
  "'''",  // Python docstring
  "<!--", // HTML/Markdown
]

/**
 * Check if a line is a documentation comment.
 *
 * The check is purely prefix/suffix based, so callers are expected to pass an
 * already-trimmed line (captureLeadingComments does).
 *
 * @param line - Trimmed source line.
 * @returns `true` when the line starts with a known comment prefix or ends a block comment.
 */
function isCommentLine(line: string): boolean {
  if (line.endsWith("*/")) return true // Block comment end
  return COMMENT_LINE_PREFIXES.some((prefix) => line.startsWith(prefix))
}
/**
 * Add gap chunks (code between symbols: imports, package decl, constants, etc.)
 *
 * Mutates `chunks` in place and leaves it sorted by `start_line`.
 * If `chunks` is empty, the whole file becomes a single chunk.
 *
 * Chunks may overlap or nest (a class chunk and the chunks of its methods),
 * so coverage is tracked as the furthest line reached so far. A nested chunk
 * therefore never moves the covered boundary backwards, and lines already
 * inside a parent chunk are not emitted again as a gap.
 *
 * @param chunks - Symbol chunks; gap chunks are appended to this array.
 * @param lines - File content split into lines.
 * @param config - `min_chunk_size` is the minimum trimmed length for a gap to be kept.
 */
function addGapChunks(chunks: CodeChunk[], lines: string[], config: CodeChunkConfig): void {
  if (chunks.length === 0) {
    // No symbols found → chunk entire file
    chunks.push({
      content: lines.join("\n"),
      start_line: 0,
      end_line: lines.length - 1,
    })
    return
  }

  const byStartLine = (a: CodeChunk, b: CodeChunk): number => (a.start_line ?? 0) - (b.start_line ?? 0)
  // The sweep below relies on ascending start lines.
  chunks.sort(byStartLine)

  const gaps: CodeChunk[] = []
  // Records lines [from, to] as a gap if the range is non-empty and large enough.
  const pushGap = (from: number, to: number): void => {
    if (from > to) return
    const content = lines.slice(from, to + 1).join("\n").trim()
    if (content.length >= config.min_chunk_size) {
      gaps.push({ content, start_line: from, end_line: to })
    }
  }

  let coveredUpTo = -1
  for (const chunk of chunks) {
    const start = chunk.start_line ?? 0
    // Gap before this chunk
    pushGap(coveredUpTo + 1, start - 1)
    coveredUpTo = Math.max(coveredUpTo, chunk.end_line ?? start)
  }
  // Trailing gap
  pushGap(coveredUpTo + 1, lines.length - 1)

  // Merge gaps into chunks
  chunks.push(...gaps)
  chunks.sort(byStartLine)
}