@comfanion/usethis_search 4.2.0-dev.2 → 4.2.0-dev.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cache/manager.ts +20 -1
- package/hooks/message-before.ts +7 -0
- package/package.json +3 -2
- package/tools/search.ts +258 -10
- package/tools/workspace.ts +35 -11
- package/vectorizer/chunkers/chunker-factory.ts +32 -3
- package/vectorizer/chunkers/code-chunker.ts +2 -2
- package/vectorizer/chunkers/lsp-chunker.ts +316 -0
- package/vectorizer/chunkers/markdown-chunker.ts +2 -2
- package/vectorizer/index.ts +25 -2
- package/vectorizer.yaml +4 -4
package/cache/manager.ts
CHANGED
@@ -47,7 +47,7 @@ export interface WorkspaceEntry {
   /** MD5 hash of chunk content — used by freshen() to detect changes */
   contentHash: string
   /** How this chunk got into workspace */
-  role: "search-main" | "search-graph" | "manual"
+  role: "search-main" | "search-graph" | "search-context" | "manual"
   /** Timestamp when attached */
   attachedAt: number
   /** Search query or "manual" */

@@ -547,6 +547,25 @@ class WorkspaceCache {
     return removed
   }

+  /**
+   * Remove all chunks from a specific file path.
+   * Returns number of chunks removed.
+   */
+  detachByPath(filePath: string): number {
+    let removed = 0
+
+    for (const [chunkId, entry] of this.entries) {
+      if (entry.path === filePath) {
+        this.entries.delete(chunkId)
+        this._totalTokens -= entry.tokens
+        removed++
+      }
+    }
+
+    if (removed > 0) this.scheduleSave()
+    return removed
+  }
+
   /**
   * Get all chunks sorted by: search-main first (by score desc), then search-graph, then manual.
   */
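
The new detachByPath() complements the single-chunk detach(). A minimal usage sketch (the import path is illustrative; other files in the package import the workspaceCache singleton from cache/manager.ts):

// Sketch — assumes the workspaceCache singleton from cache/manager.ts
import { workspaceCache } from "./cache/manager.ts"

// Remove every chunk attached from one file, whatever its role or chunkId:
const removed = workspaceCache.detachByPath("src/auth.ts")
console.log(`detached ${removed} chunk(s)`) // 0 when the file wasn't in the workspace
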
package/hooks/message-before.ts
CHANGED
@@ -103,6 +103,7 @@ export function createWorkspaceInjectionHandler(state: SessionState) {

   // Group by role for clear structure
   const mainFiles = entries.filter(e => e.role === "search-main")
+  const contextFiles = entries.filter(e => e.role === "search-context")
   const graphFiles = entries.filter(e => e.role === "search-graph")
   const manualFiles = entries.filter(e => e.role === "manual")

@@ -111,6 +112,12 @@ export function createWorkspaceInjectionHandler(state: SessionState) {
     workspace += formatChunksByFile(mainFiles, byFile)
   }

+  // Expanded context (class methods, class headers)
+  if (contextFiles.length > 0) {
+    workspace += `\n<!-- Expanded context (class methods/headers for completeness) -->\n`
+    workspace += formatChunksByFile(contextFiles, byFile)
+  }
+
   // Graph relations (imports, extends, used_by)
   if (graphFiles.length > 0) {
     workspace += `\n<!-- Search graph relations -->\n`
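
With the new role, the injected workspace text now carries four groups. A rough sketch of the layout (section order taken from the code above; exact markup may differ):

// Illustrative layout of the injected workspace text after this change:
//   <search-main chunks, grouped by file>
//   <!-- Expanded context (class methods/headers for completeness) -->
//   <search-context chunks, grouped by file>
//   <!-- Search graph relations -->
//   <search-graph chunks, grouped by file>
//   <manually attached chunks>
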
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "4.2.0-dev.2",
+  "version": "4.2.0-dev.4",
   "description": "OpenCode plugin: semantic search with chunk-based workspace injection (v4.2-dev: chunk-level context, granular detach, improved token efficiency)",
   "type": "module",
   "main": "./index.ts",

@@ -15,7 +15,7 @@
     "index:clear": "bun run cli.ts clear"
   },
   "bin": {
-    "usethis-search": "
+    "usethis-search": "cli.ts"
   },
   "files": [
     "index.ts",

@@ -45,6 +45,7 @@
     "vectorizer/analyzers/lsp-client.ts",
     "vectorizer/chunkers/markdown-chunker.ts",
     "vectorizer/chunkers/code-chunker.ts",
+    "vectorizer/chunkers/lsp-chunker.ts",
     "vectorizer/chunkers/chunker-factory.ts",
     "vectorizer.yaml",
     "README.md",
package/tools/search.ts
CHANGED
@@ -16,6 +16,92 @@ import fs from "fs/promises"
 import { CodebaseIndexer, getSearchConfig, getIndexer, releaseIndexer } from "../vectorizer/index.ts"
 import { workspaceCache } from "../cache/manager.ts"

+// ── Context Expansion Helpers ─────────────────────────────────────────────
+
+/**
+ * Expand chunk context using GRAPH + file-level structural expansion.
+ *
+ * Strategy:
+ * 1. CODE: Class → all methods, Method → class header (via findChunksByPath)
+ * 2. DOCS: Section → other sections from same file (via file node in graph)
+ * 3. ALL: Use relatedContext from graph (imports, extends, contains, etc.)
+ *
+ * Note: relatedContext already populated by vectorizer._expandGraphContext()
+ * This function adds STRUCTURAL context (same-file chunks).
+ */
+async function expandChunkContext(
+  mainChunk: any,
+  indexer: CodebaseIndexer,
+  alreadyAttached: Set<string>,
+): Promise<Array<{ chunk: any; reason: string }>> {
+  const expanded: Array<{ chunk: any; reason: string }> = []
+
+  // ══════════════════════════════════════════════════════════════════════
+  // STRUCTURAL EXPANSION: Same-file chunks for completeness
+  // ══════════════════════════════════════════════════════════════════════
+
+  // CODE: Class → add ALL its methods
+  if (mainChunk.class_name && !mainChunk.function_name) {
+    const allChunks = await indexer.findChunksByPath(mainChunk.file)
+
+    for (const chunk of allChunks) {
+      const chunkId = chunk.chunk_id || `${chunk.file}:chunk-${chunk.chunk_index ?? 0}`
+      if (alreadyAttached.has(chunkId)) continue
+
+      // Add all methods of this class
+      if (chunk.class_name === mainChunk.class_name && chunk.function_name) {
+        expanded.push({
+          chunk,
+          reason: `method of class ${mainChunk.class_name}`,
+        })
+      }
+    }
+  }
+
+  // CODE: Method → add class header
+  else if (mainChunk.class_name && mainChunk.function_name) {
+    const allChunks = await indexer.findChunksByPath(mainChunk.file)
+
+    for (const chunk of allChunks) {
+      const chunkId = chunk.chunk_id || `${chunk.file}:chunk-${chunk.chunk_index ?? 0}`
+      if (alreadyAttached.has(chunkId)) continue
+
+      // Find class header (class_name matches but no function_name)
+      if (chunk.class_name === mainChunk.class_name && !chunk.function_name) {
+        expanded.push({
+          chunk,
+          reason: `class header for ${mainChunk.function_name}`,
+        })
+        break
+      }
+    }
+  }
+
+  // DOCS: Section → add other sections from same file (for context)
+  // Only for markdown chunks with heading_context
+  else if (mainChunk.heading_context && mainChunk.language === "markdown") {
+    const allChunks = await indexer.findChunksByPath(mainChunk.file)
+
+    // Add ALL sections from this file (they're already reasonably sized)
+    // This gives full document context when searching in docs
+    for (const chunk of allChunks) {
+      const chunkId = chunk.chunk_id || `${chunk.file}:chunk-${chunk.chunk_index ?? 0}`
+      if (alreadyAttached.has(chunkId)) continue
+
+      // Skip the main chunk itself
+      if (chunkId === mainChunk.chunkId) continue
+
+      // Add other sections from same document
+      expanded.push({
+        chunk,
+        reason: `section from ${mainChunk.file}`,
+      })
+    }
+  }
+
+  return expanded
+}
+
 // ── Extension → language mapping (for filter parsing) ─────────────────────
 const EXT_TO_LANG: Record<string, string> = {
   go: "go", py: "python", ts: "typescript", tsx: "typescript",
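
To make the three branches concrete, a hedged example of what the helper returns for a method hit (the chunk shape and values are illustrative, assuming an async calling context):

// Illustrative: mainChunk is a method chunk such as
// { file: "src/auth.ts", class_name: "AuthService", function_name: "login" }
const extra = await expandChunkContext(mainChunk, indexer, new Set())
// → [{ chunk: <AuthService class-header chunk>, reason: "class header for login" }]
// For a class hit it returns every method; for a markdown section, the sibling sections.
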
@@ -92,25 +178,31 @@ function parseFilter(filter: string): {
 }

 export default tool({
-  description: `Search the codebase semantically
+  description: `Search the codebase semantically OR attach specific chunks/files to workspace.
+
+Three modes:
+1. Semantic search (query) - Find relevant code by meaning
+2. Direct chunk attach (chunkId) - Attach specific chunk by ID
+3. File attach (path) - Attach all chunks from a file

 Available indexes:
 - "code" (default) - Source code files (*.js, *.ts, *.py, *.go, etc.)
 - "docs" - Documentation files (*.md, *.txt, etc.)
-- "config" - Configuration files (*.yaml, *.json, etc.)
 - searchAll: true - Search across all indexes

 Examples:
 - search({ query: "authentication logic" })
 - search({ query: "how to deploy", index: "docs" })
 - search({ query: "tenant management", filter: "internal/domain/" })
-- search({
-- search({
-- search({
+- search({ chunkId: "src/auth.ts:chunk-5" })
+- search({ path: "docs/architecture.md" })
+- search({ path: "src/auth.ts", index: "code" })`,

   args: {
-    query: tool.schema.string().describe("Semantic search query describing what you're looking for"),
-
+    query: tool.schema.string().optional().describe("Semantic search query describing what you're looking for"),
+    chunkId: tool.schema.string().optional().describe("Specific chunk ID to attach (e.g. 'src/auth.ts:chunk-5')"),
+    path: tool.schema.string().optional().describe("File path to attach all chunks from (e.g. 'docs/architecture.md')"),
+    index: tool.schema.string().optional().default("code").describe("Index to search: code, docs"),
     limit: tool.schema.number().optional().describe("Number of results (default from config, typically 10)"),
     searchAll: tool.schema.boolean().optional().default(false).describe("Search all indexes instead of just one"),
     filter: tool.schema.string().optional().describe("Filter results by path or language. Examples: 'internal/domain/', '*.go', 'internal/**/*.go', 'service'"),
@@ -120,6 +212,15 @@ Examples:
     const projectRoot = process.cwd()

    try {
+      // Validate: exactly one of query, chunkId, or path must be specified
+      const modes = [args.query, args.chunkId, args.path].filter(x => x !== undefined)
+      if (modes.length === 0) {
+        return `Error: Must specify one of: query (semantic search), chunkId (direct attach), or path (file attach)\n\nExamples:\n- search({ query: "authentication" })\n- search({ chunkId: "src/auth.ts:chunk-5" })\n- search({ path: "docs/architecture.md" })`
+      }
+      if (modes.length > 1) {
+        return `Error: Specify only ONE of: query, chunkId, or path (got ${modes.length})`
+      }
+
      // Load config defaults (parsed from vectorizer.yaml)
      const cfg = getSearchConfig()
      const limit = args.limit || cfg.default_limit || 10
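
Hedged examples of how the validation behaves (hypothetical tool calls, return strings taken from the code above):

// search({})                                     → "Error: Must specify one of: query ..., chunkId ..., or path ..."
// search({ query: "auth", path: "src/auth.ts" }) → "Error: Specify only ONE of: query, chunkId, or path (got 2)"
// search({ chunkId: "src/auth.ts:chunk-5" })     → attaches that chunk (MODE 1 below)
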
@@ -130,6 +231,95 @@ Examples:
      // Workspace config
      const wsConfig = workspaceCache.getConfig()

+      // ══════════════════════════════════════════════════════════════════════
+      // MODE 1: Direct chunk attach by chunkId
+      // ══════════════════════════════════════════════════════════════════════
+      if (args.chunkId) {
+        const indexer = await getIndexer(projectRoot, indexName)
+        try {
+          const chunk = await indexer.findChunkById(args.chunkId)
+          if (!chunk) {
+            return `Chunk "${args.chunkId}" not found in index "${indexName}".\n\nMake sure:\n1. The file is indexed\n2. The chunk ID is correct (format: "path:chunk-N")\n3. You're searching the right index`
+          }
+
+          // Attach to workspace
+          workspaceCache.attach({
+            chunkId: args.chunkId,
+            path: chunk.file,
+            content: chunk.content,
+            chunkIndex: chunk.chunk_index ?? 0,
+            role: "manual",
+            attachedAt: Date.now(),
+            attachedBy: `direct:${args.chunkId}`,
+            metadata: {
+              language: chunk.language,
+              function_name: chunk.function_name,
+              class_name: chunk.class_name,
+              heading_context: chunk.heading_context,
+              startLine: chunk.start_line,
+              endLine: chunk.end_line,
+            },
+          })
+
+          workspaceCache.save().catch(() => {})
+
+          const entry = workspaceCache.get(args.chunkId)!
+          return `✓ Attached chunk to workspace\n\nChunk: ${args.chunkId}\nFile: ${chunk.file}\nTokens: ${entry.tokens.toLocaleString()}\nLanguage: ${chunk.language}\nLines: ${chunk.start_line}-${chunk.end_line}\n\nWorkspace: ${workspaceCache.size} chunks, ${workspaceCache.totalTokens.toLocaleString()} tokens`
+        } finally {
+          releaseIndexer(projectRoot, indexName)
+        }
+      }
+
+      // ══════════════════════════════════════════════════════════════════════
+      // MODE 2: File attach by path (all chunks)
+      // ══════════════════════════════════════════════════════════════════════
+      if (args.path) {
+        const indexer = await getIndexer(projectRoot, indexName)
+        try {
+          const chunks = await indexer.findChunksByPath(args.path)
+          if (chunks.length === 0) {
+            return `No chunks found for file "${args.path}" in index "${indexName}".\n\nMake sure:\n1. The file exists and is indexed\n2. The path is correct (relative to project root)\n3. You're searching the right index\n\nRun: bunx usethis_search reindex`
+          }
+
+          // Attach all chunks to workspace
+          let totalTokens = 0
+          for (const chunk of chunks) {
+            const chunkId = chunk.chunk_id || `${args.path}:chunk-${chunk.chunk_index ?? 0}`
+
+            workspaceCache.attach({
+              chunkId,
+              path: args.path,
+              content: chunk.content,
+              chunkIndex: chunk.chunk_index ?? 0,
+              role: "manual",
+              attachedAt: Date.now(),
+              attachedBy: `file:${args.path}`,
+              metadata: {
+                language: chunk.language,
+                function_name: chunk.function_name,
+                class_name: chunk.class_name,
+                heading_context: chunk.heading_context,
+                startLine: chunk.start_line,
+                endLine: chunk.end_line,
+              },
+            })
+
+            const entry = workspaceCache.get(chunkId)!
+            totalTokens += entry.tokens
+          }
+
+          workspaceCache.save().catch(() => {})
+
+          return `✓ Attached file to workspace\n\nFile: ${args.path}\nChunks: ${chunks.length}\nTokens: ${totalTokens.toLocaleString()}\nLanguage: ${chunks[0].language}\n\nWorkspace: ${workspaceCache.size} chunks, ${workspaceCache.totalTokens.toLocaleString()} tokens`
+        } finally {
+          releaseIndexer(projectRoot, indexName)
+        }
+      }
+
+      // ══════════════════════════════════════════════════════════════════════
+      // MODE 3: Semantic search by query (original behavior)
+      // ══════════════════════════════════════════════════════════════════════
+
      // Parse filter into path/language constraints
      const filterParsed = args.filter ? parseFilter(args.filter) : {}

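Both attach modes wrap the shared indexer in the same acquire/release pattern; a minimal sketch of the shape:

// Sketch of the acquire/release discipline used by MODE 1 and MODE 2:
const indexer = await getIndexer(projectRoot, indexName)
try {
  const chunks = await indexer.findChunksByPath("docs/architecture.md")
  // ...attach chunks; early returns are fine...
} finally {
  releaseIndexer(projectRoot, indexName) // always runs, even on early return
}
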
@@ -169,6 +359,19 @@ Examples:
        }
      }

+      // Deduplicate chunks (searchAll may return same chunk from multiple indexes)
+      const seen = new Set<string>()
+      const deduplicated: any[] = []
+
+      for (const result of allResults) {
+        const chunkId = result.chunkId || `${result.file}:chunk-${result.index ?? 0}`
+        if (!seen.has(chunkId)) {
+          seen.add(chunkId)
+          deduplicated.push(result)
+        }
+      }
+
+      allResults = deduplicated
      allResults.sort((a, b) => {
        const scoreA = a._combinedScore ?? (a._distance != null ? Math.max(0, 1 - a._distance / 2) : 0)
        const scoreB = b._combinedScore ?? (b._distance != null ? Math.max(0, 1 - b._distance / 2) : 0)
@@ -255,7 +458,7 @@ Examples:
      }

      // ══════════════════════════════════════════════════════════════════════
-      // WORKSPACE ATTACH: Top N chunks +
+      // WORKSPACE ATTACH: Top N chunks + expanded context + graph relations
      // ══════════════════════════════════════════════════════════════════════

      const mainChunks = topChunks.slice(0, wsConfig.attachTopN)

@@ -263,13 +466,17 @@ Examples:

      const attachedMain: Array<{ chunkId: string; path: string }> = []
      const attachedGraph: Array<{ chunkId: string; path: string }> = []
+      const attachedContext: Array<{ chunkId: string; path: string; reason: string }> = []
      const alreadyAttached = new Set<string>()

+      // Get indexer for context expansion (reuse same indexer)
+      const indexerForExpansion = await getIndexer(projectRoot, indexName)
+
      for (const chunk of mainChunks) {
        // Skip if score too low
        if ((chunk._finalScore ?? 0) < wsConfig.minScoreMain) continue

-        // Attach
+        // Attach main chunk
        const chunkId = chunk.chunkId || `${chunk.file}:chunk-${chunk.index ?? 0}`

        workspaceCache.attach({

@@ -294,6 +501,44 @@ Examples:
        attachedMain.push({ chunkId, path: chunk.file })
        alreadyAttached.add(chunkId)

+        // ── Expand context (class methods, class header) ──────────────────
+        try {
+          const expandedChunks = await expandChunkContext(chunk, indexerForExpansion, alreadyAttached)
+
+          for (const { chunk: expChunk, reason } of expandedChunks) {
+            const expChunkId = expChunk.chunk_id || `${expChunk.file}:chunk-${expChunk.chunk_index ?? 0}`
+
+            // Check budget before adding
+            if (workspaceCache.size >= wsConfig.maxChunks) break
+
+            workspaceCache.attach({
+              chunkId: expChunkId,
+              path: expChunk.file,
+              content: expChunk.content,
+              chunkIndex: expChunk.chunk_index ?? 0,
+              role: "search-context",
+              attachedAt: Date.now(),
+              attachedBy: `${args.query} (${reason})`,
+              score: chunk._finalScore * 0.9, // Slightly lower score than main
+              metadata: {
+                language: expChunk.language,
+                function_name: expChunk.function_name,
+                class_name: expChunk.class_name,
+                startLine: expChunk.start_line,
+                endLine: expChunk.end_line,
+              },
+            })
+
+            attachedContext.push({ chunkId: expChunkId, path: expChunk.file, reason })
+            alreadyAttached.add(expChunkId)
+          }
+        } catch (error: any) {
+          // Context expansion failed — not critical, continue
+          if (process.env.DEBUG) {
+            console.log(`[search] Context expansion failed for ${chunkId}: ${error.message}`)
+          }
+        }
+
        // Attach graph relations (imports, extends, used_by)
        if (chunk.relatedContext && chunk.relatedContext.length > 0) {
          const topRelated = chunk.relatedContext

@@ -329,8 +574,11 @@ Examples:
        }
      }

+      // Release indexer used for expansion
+      releaseIndexer(projectRoot, indexName)
+
      // ── Flush workspace to disk immediately (don't rely on debounce) ─────
-      if (attachedMain.length > 0 || attachedGraph.length > 0) {
+      if (attachedMain.length > 0 || attachedGraph.length > 0 || attachedContext.length > 0) {
        workspaceCache.save().catch(() => {})
      }

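Putting the attach phases together, a single query can now leave three roles in the workspace. An illustrative result (paths and chunk IDs hypothetical):

// After search({ query: "login flow" }) the workspace might hold:
//   search-main    src/auth.ts:chunk-3     (the scored hit)
//   search-context src/auth.ts:chunk-1     (class header, score × 0.9)
//   search-graph   src/session.ts:chunk-0  (import/extends/used_by relation)
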
package/tools/workspace.ts
CHANGED
@@ -56,11 +56,14 @@ export const workspace_list = tool({
    const mainFiles = Array.from(fileGroups.entries()).filter(([_, chunks]) =>
      chunks.some(c => c.role === "search-main")
    )
+    const contextFiles = Array.from(fileGroups.entries()).filter(([_, chunks]) =>
+      chunks.some(c => c.role === "search-context") && !chunks.some(c => c.role === "search-main")
+    )
    const graphFiles = Array.from(fileGroups.entries()).filter(([_, chunks]) =>
-      chunks.some(c => c.role === "search-graph") && !chunks.some(c => c.role === "search-main")
+      chunks.some(c => c.role === "search-graph") && !chunks.some(c => c.role === "search-main" || c.role === "search-context")
    )
    const manualFiles = Array.from(fileGroups.entries()).filter(([_, chunks]) =>
-      chunks.some(c => c.role === "manual") && !chunks.some(c => c.role === "search-main" || c.role === "search-graph")
+      chunks.some(c => c.role === "manual") && !chunks.some(c => c.role === "search-main" || c.role === "search-graph" || c.role === "search-context")
    )

    if (mainFiles.length > 0) {

@@ -86,6 +89,25 @@ export const workspace_list = tool({
      output += `\n`
    }

+    if (contextFiles.length > 0) {
+      output += `### Expanded context (${contextFiles.length} files)\n`
+      for (const [filePath, chunks] of contextFiles) {
+        const totalTokens = chunks.reduce((sum, c) => sum + c.tokens, 0)
+        const reason = chunks[0]?.attachedBy?.match(/\((.+)\)/)?.[1] || "context"
+        const age = Math.floor((Date.now() - chunks[0].attachedAt) / 1000 / 60)
+
+        output += `- **${filePath}** (${chunks.length} chunk${chunks.length > 1 ? "s" : ""}, ${totalTokens.toLocaleString()} tokens) — ${reason} — ${age}m ago\n`
+
+        if (chunks.length > 1) {
+          for (const chunk of chunks) {
+            const meta = chunk.metadata?.function_name || chunk.metadata?.class_name || ""
+            output += `  • ${chunk.chunkId} — ${meta} (chunk ${chunk.chunkIndex}, ${chunk.tokens.toLocaleString()} tok)\n`
+          }
+        }
+      }
+      output += `\n`
+    }
+
    if (graphFiles.length > 0) {
      output += `### Graph relations (${graphFiles.length} files)\n`
      for (const [filePath, chunks] of graphFiles) {

@@ -155,8 +177,12 @@ export const workspace_attach = tool({

    // Check if already attached
    if (workspaceCache.has(args.filePath)) {
-      const existing = workspaceCache.
-
+      const existing = workspaceCache.getChunksByPath(args.filePath)
+      if (existing.length > 0) {
+        const first = existing[0]
+        const totalTokens = existing.reduce((sum, c) => sum + c.tokens, 0)
+        return `File "${args.filePath}" is already in workspace (${existing.length} chunk${existing.length > 1 ? "s" : ""}).\nTokens: ${totalTokens.toLocaleString()} | Role: ${first.role} | Score: ${first.score?.toFixed(3) ?? "n/a"}`
+      }
    }

    workspaceCache.attach({

@@ -169,7 +195,7 @@ export const workspace_attach = tool({
      attachedBy: "manual",
    })

-    const entry = workspaceCache.get(
+    const entry = workspaceCache.get(chunkId)!
    return `Attached "${args.filePath}" to workspace as single chunk.\nChunkId: ${chunkId}\nTokens: ${entry.tokens.toLocaleString()}\nWorkspace total: ${workspaceCache.totalTokens.toLocaleString()} tokens (${workspaceCache.size} chunks)`
  } catch (error: any) {
    return `Failed to attach "${args.filePath}": ${error.message || String(error)}`

@@ -194,27 +220,25 @@ export const workspace_detach = tool({

    if (args.chunkId) {
      // Detach specific chunk by chunkId
-      const
-      const entry = entries.find(e => e.chunkId === args.chunkId)
+      const entry = workspaceCache.get(args.chunkId)

      if (!entry) {
        return `Chunk "${args.chunkId}" not found in workspace.`
      }

-      removed = workspaceCache.detach(
+      removed = workspaceCache.detach(args.chunkId) ? 1 : 0
      if (removed === 0) {
        return `Failed to remove chunk "${args.chunkId}".`
      }
    } else if (args.filePath) {
      // Detach all chunks of a file
-      const
-      const fileChunks = entries.filter(e => e.path === args.filePath)
+      const fileChunks = workspaceCache.getChunksByPath(args.filePath)

      if (fileChunks.length === 0) {
        return `File "${args.filePath}" not found in workspace.`
      }

-      removed = workspaceCache.
+      removed = workspaceCache.detachByPath(args.filePath)
      if (removed === 0) {
        return `Failed to remove chunks from "${args.filePath}".`
      }
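
Hedged examples of the two detach paths as tool calls (arguments illustrative):

// workspace_detach({ chunkId: "src/auth.ts:chunk-5" }) → detach() removes one chunk
// workspace_detach({ filePath: "src/auth.ts" })         → detachByPath() removes every chunk of the file
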
package/vectorizer/chunkers/chunker-factory.ts
CHANGED

@@ -1,11 +1,15 @@
 /**
  * Chunker Factory — selects the appropriate chunker based on file type.
  *
- * Routes:
+ * Routes:
+ * - markdown → markdown-chunker
+ * - code → lsp-chunker (fallback: code-chunker regex)
+ * - else → fixed
  */

 import { chunkMarkdown, type MarkdownChunkConfig, DEFAULT_MD_CONFIG } from "./markdown-chunker"
 import { chunkCode, type CodeChunkConfig, DEFAULT_CODE_CONFIG } from "./code-chunker"
+import { chunkCodeWithLSP } from "./lsp-chunker"
 import type { FileType } from "../metadata-extractor"

 // ── Types ───────────────────────────────────────────────────────────────────

@@ -63,13 +67,18 @@ function chunkFixed(content: string, maxChars: number): UnifiedChunk[] {

 /**
  * Chunk content using the appropriate strategy for the given file type.
+ *
+ * For code files: tries LSP-based chunking first (AST-accurate + godoc capture),
+ * falls back to regex-based chunker if LSP unavailable.
  */
-export function chunkContent(
+export async function chunkContent(
   content: string,
   fileType: FileType,
   language: string,
   config: ChunkingConfig = DEFAULT_CHUNKING_CONFIG,
-
+  filePath?: string,
+  projectRoot?: string,
+): Promise<UnifiedChunk[]> {
   // If strategy is "fixed", always use fixed chunker
   if (config.strategy === "fixed") {
     return chunkFixed(content, config.fixed.max_chars)

@@ -85,6 +94,26 @@ export function chunkContent(
   }

   if (fileType === "code") {
+    // Try LSP-based chunker first (captures godoc/JSDoc comments!)
+    if (filePath) {
+      try {
+        const lspChunks = await chunkCodeWithLSP(filePath, content, config.code, projectRoot)
+        if (lspChunks && lspChunks.length > 0) {
+          return lspChunks.map((c) => ({
+            content: c.content,
+            function_name: c.function_name,
+            class_name: c.class_name,
+          }))
+        }
+      } catch (error) {
+        // LSP failed — fall through to regex chunker
+        if (process.env.DEBUG_LSP_CHUNKER) {
+          console.log(`[chunker-factory] LSP chunker failed for ${filePath}: ${error}`)
+        }
+      }
+    }
+
+    // Fallback: regex-based code chunker
     const codeChunks = chunkCode(content, config.code)
     return codeChunks.map((c) => ({
       content: c.content,
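
Note the breaking signature change: chunkContent() is now async with two optional trailing parameters. A minimal caller sketch:

// Callers must now await, and may pass filePath/projectRoot to enable the LSP path:
const chunks = await chunkContent(source, "code", "typescript", DEFAULT_CHUNKING_CONFIG, "src/auth.ts", process.cwd())
// Without filePath the LSP branch is skipped and the regex chunker runs as before.
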
package/vectorizer/chunkers/code-chunker.ts
CHANGED

@@ -13,8 +13,8 @@ export interface CodeChunkConfig {
 }

 export const DEFAULT_CODE_CONFIG: CodeChunkConfig = {
-  min_chunk_size:
-  max_chunk_size:
+  min_chunk_size: 600, // Function with godoc/JSDoc (avoid tiny chunks)
+  max_chunk_size: 3000, // Allow larger chunks (class with multiple methods)
   split_by_functions: true,
   include_function_signature: true,
 }
package/vectorizer/chunkers/lsp-chunker.ts
ADDED

@@ -0,0 +1,316 @@
+/**
+ * LSP-Based Code Chunker
+ *
+ * Uses Language Server Protocol to get AST-accurate function/class boundaries.
+ * Captures godoc/JSDoc comments that belong to each symbol.
+ *
+ * Advantages over regex-chunker:
+ * - ✅ Accurate AST parsing (no brace counting bugs)
+ * - ✅ Captures leading documentation comments (godoc, JSDoc, docstrings)
+ * - ✅ Handles nested structures (class methods, nested functions)
+ * - ✅ Language-agnostic (works for Go, TS, Python, Rust, Java, etc.)
+ *
+ * Fallback: If LSP unavailable → use regex-chunker
+ */
+
+import { LSPClient, LSPSymbolInformation, SymbolKind } from "../analyzers/lsp-client.ts"
+import type { CodeChunk, CodeChunkConfig } from "./code-chunker.ts"
+
+const DEBUG = process.env.DEBUG_LSP_CHUNKER === "true"
+
+/** Symbol kinds we want to chunk separately */
+const CHUNKABLE_SYMBOLS = new Set([
+  SymbolKind.Function,
+  SymbolKind.Method,
+  SymbolKind.Class,
+  SymbolKind.Interface,
+  SymbolKind.Enum,
+  // Note: Struct is not in SymbolKind — Go structs appear as Class
+])
+
+/** Map file extension to LSP language ID */
+const EXT_TO_LANGUAGE: Record<string, string> = {
+  ts: "typescript",
+  js: "javascript",
+  tsx: "typescriptreact",
+  jsx: "javascriptreact",
+  py: "python",
+  go: "go",
+  rs: "rust",
+  java: "java",
+  cpp: "cpp",
+  c: "c",
+  cs: "csharp",
+}
+
+/**
+ * Chunk code using LSP documentSymbol API.
+ * Falls back to regex-chunker if LSP unavailable.
+ */
+export async function chunkCodeWithLSP(
+  filePath: string,
+  content: string,
+  config: CodeChunkConfig,
+  projectRoot?: string,
+): Promise<CodeChunk[] | null> {
+  // Check if LSP available for this language
+  const ext = filePath.split(".").pop() || ""
+  const language = EXT_TO_LANGUAGE[ext]
+  if (!language) {
+    if (DEBUG) console.log(`[lsp-chunker] No language mapping for .${ext}`)
+    return null // Fallback to regex
+  }
+
+  const available = await LSPClient.isAvailable(language)
+  if (!available) {
+    if (DEBUG) console.log(`[lsp-chunker] LSP not available for ${language}`)
+    return null // Fallback to regex
+  }
+
+  // Start LSP client
+  const client = new LSPClient(language, projectRoot)
+  try {
+    await client.start()
+    await client.openDocument(filePath, content)
+
+    // Get document symbols
+    const symbols = await client.documentSymbol(filePath)
+    if (!symbols || symbols.length === 0) {
+      if (DEBUG) console.log(`[lsp-chunker] No symbols found in ${filePath}`)
+      return null // Fallback to regex
+    }
+
+    const lines = content.split("\n")
+    const chunks: CodeChunk[] = []
+
+    // Extract chunks from symbols (recursive for nested symbols)
+    extractChunksFromSymbols(symbols, lines, chunks, config)
+
+    // Add gaps (code between symbols: imports, package declarations, etc.)
+    addGapChunks(chunks, lines, config)
+
+    if (DEBUG) console.log(`[lsp-chunker] Generated ${chunks.length} chunks from ${symbols.length} symbols`)
+
+    await client.closeDocument(filePath)
+    await client.stop()
+
+    return chunks.length > 0 ? chunks : null
+  } catch (error: any) {
+    if (DEBUG) console.log(`[lsp-chunker] Error: ${error.message}`)
+    try {
+      await client.stop()
+    } catch {}
+    return null // Fallback to regex
+  }
+}
+
+/**
+ * Recursively extract chunks from LSP symbols.
+ * Handles nested structures (class methods, nested functions).
+ */
+function extractChunksFromSymbols(
+  symbols: LSPSymbolInformation[],
+  lines: string[],
+  chunks: CodeChunk[],
+  config: CodeChunkConfig,
+  parentClass?: string,
+): void {
+  for (const symbol of symbols) {
+    // Skip non-chunkable symbols (variables, properties, etc.)
+    if (!CHUNKABLE_SYMBOLS.has(symbol.kind)) continue
+
+    const startLine = symbol.range.start.line
+    const endLine = symbol.range.end.line
+
+    // Expand startLine backward to capture leading comments (godoc, JSDoc, docstrings)
+    const commentStartLine = captureLeadingComments(lines, startLine)
+
+    // Extract chunk content
+    const chunkLines = lines.slice(commentStartLine, endLine + 1)
+    const chunkContent = chunkLines.join("\n")
+
+    // Check size constraints
+    if (chunkContent.length < config.min_chunk_size && chunkLines.length < 5) {
+      // Too small — skip (will be captured in gaps)
+      continue
+    }
+
+    // Determine chunk metadata
+    const isClass = symbol.kind === SymbolKind.Class || symbol.kind === SymbolKind.Interface
+    const isFunction = symbol.kind === SymbolKind.Function || symbol.kind === SymbolKind.Method
+
+    const chunk: CodeChunk = {
+      content: chunkContent,
+      start_line: commentStartLine,
+      end_line: endLine,
+    }
+
+    if (isClass) {
+      chunk.class_name = symbol.name
+    }
+    if (isFunction) {
+      chunk.function_name = symbol.name
+      if (parentClass) chunk.class_name = parentClass
+    }
+
+    // If chunk too large → split by children (for classes with many methods)
+    if (chunkContent.length > config.max_chunk_size && symbol.children && symbol.children.length > 0) {
+      if (DEBUG) console.log(`[lsp-chunker] Splitting large ${symbol.kind === SymbolKind.Class ? 'class' : 'symbol'} ${symbol.name}`)
+
+      // For classes: chunk class header + each method separately
+      if (isClass) {
+        // Find first child's start line
+        const firstChildStart = Math.min(...symbol.children.map(c => c.range.start.line))
+
+        // Class header chunk (from comment to first method)
+        const headerLines = lines.slice(commentStartLine, firstChildStart)
+        if (headerLines.join("\n").trim().length > 0) {
+          chunks.push({
+            content: headerLines.join("\n"),
+            class_name: symbol.name,
+            start_line: commentStartLine,
+            end_line: firstChildStart - 1,
+          })
+        }
+
+        // Chunk each method separately (with its comments)
+        extractChunksFromSymbols(symbol.children, lines, chunks, config, symbol.name)
+      } else {
+        // Non-class: chunk children recursively
+        extractChunksFromSymbols(symbol.children, lines, chunks, config, parentClass)
+      }
+    } else {
+      // Chunk fits size limit → add it
+      chunks.push(chunk)
+
+      // Still process children if they exist (nested functions in Go, for example)
+      if (symbol.children && symbol.children.length > 0) {
+        extractChunksFromSymbols(symbol.children, lines, chunks, config, isClass ? symbol.name : parentClass)
+      }
+    }
+  }
+
+  // Sort chunks by start_line
+  chunks.sort((a, b) => (a.start_line ?? 0) - (b.start_line ?? 0))
+}
+
+/**
+ * Capture leading comments above a symbol.
+ * Walks backward from startLine to find godoc, JSDoc, docstrings, etc.
+ *
+ * Handles:
+ * - Go: // comments (consecutive)
+ * - Python: """docstring"""
+ * - JS/TS: /** JSDoc *\/ or // comments
+ * - Rust: /// doc comments
+ * - Java/C#: /** JavaDoc *\/
+ */
+function captureLeadingComments(lines: string[], startLine: number): number {
+  if (startLine <= 0) return startLine
+
+  let commentStart = startLine - 1
+  let foundComment = false
+
+  // Walk backward to find comment block
+  while (commentStart >= 0) {
+    const line = lines[commentStart]
+    const trimmed = line.trim()
+
+    // Empty line
+    if (trimmed === "") {
+      // Allow max 1 blank line between comment and declaration
+      if (foundComment && commentStart > 0) {
+        const prevLine = lines[commentStart - 1].trim()
+        if (isCommentLine(prevLine)) {
+          commentStart--
+          continue
+        }
+      }
+      break
+    }
+
+    // Check if line is a comment
+    if (isCommentLine(trimmed)) {
+      foundComment = true
+      commentStart--
+      continue
+    }
+
+    // Non-comment, non-empty line → stop
+    break
+  }
+
+  return foundComment ? commentStart + 1 : startLine
+}
+
+/**
+ * Check if a line is a documentation comment.
+ */
+function isCommentLine(line: string): boolean {
+  return (
+    line.startsWith("//") || // Go, JS, TS, Rust, C++
+    line.startsWith("///") || // Rust doc comments
+    line.startsWith("#") || // Python
+    line.startsWith("*") || // Inside /** ... */
+    line.startsWith("/**") || // JSDoc/JavaDoc start
+    line.endsWith("*/") || // JSDoc/JavaDoc end
+    line.match(/^("""|''')/) || // Python docstring
+    line.startsWith("<!--") // HTML/Markdown
+  )
+}
+
+/**
+ * Add gap chunks (code between symbols: imports, package decl, constants, etc.)
+ */
+function addGapChunks(chunks: CodeChunk[], lines: string[], config: CodeChunkConfig): void {
+  if (chunks.length === 0) {
+    // No symbols found → chunk entire file
+    chunks.push({
+      content: lines.join("\n"),
+      start_line: 0,
+      end_line: lines.length - 1,
+    })
+    return
+  }
+
+  const gaps: CodeChunk[] = []
+  let lastEnd = -1
+
+  for (const chunk of chunks) {
+    const start = chunk.start_line ?? 0
+
+    // Gap before this chunk
+    if (start > lastEnd + 1) {
+      const gapLines = lines.slice(lastEnd + 1, start)
+      const gapContent = gapLines.join("\n").trim()
+
+      if (gapContent.length >= config.min_chunk_size) {
+        gaps.push({
+          content: gapContent,
+          start_line: lastEnd + 1,
+          end_line: start - 1,
+        })
+      }
+    }
+
+    lastEnd = chunk.end_line ?? start
+  }
+
+  // Trailing gap
+  if (lastEnd < lines.length - 1) {
+    const gapLines = lines.slice(lastEnd + 1)
+    const gapContent = gapLines.join("\n").trim()
+
+    if (gapContent.length >= config.min_chunk_size) {
+      gaps.push({
+        content: gapContent,
+        start_line: lastEnd + 1,
+        end_line: lines.length - 1,
+      })
+    }
+  }
+
+  // Merge gaps into chunks
+  chunks.push(...gaps)
+  chunks.sort((a, b) => (a.start_line ?? 0) - (b.start_line ?? 0))
+}
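
The null return is the fallback signal; a sketch mirroring how chunker-factory.ts consumes it (file path and variables illustrative):

// null → no language mapping, no LSP server, or no symbols: fall back to regex
const lspChunks = await chunkCodeWithLSP("pkg/auth/auth.go", source, DEFAULT_CODE_CONFIG, projectRoot)
const chunks = lspChunks ?? chunkCode(source, DEFAULT_CODE_CONFIG)
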
package/vectorizer/chunkers/markdown-chunker.ts
CHANGED

@@ -13,8 +13,8 @@ export interface MarkdownChunkConfig {
 }

 export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = {
-  min_chunk_size:
-  max_chunk_size:
+  min_chunk_size: 1000, // Merge small sections (headers without content)
+  max_chunk_size: 8000, // Large chunks for docs (SQL schemas, API specs, etc.)
   split_by_headings: true,
   preserve_heading_hierarchy: true,
 }
package/vectorizer/index.ts
CHANGED
@@ -696,8 +696,8 @@ class CodebaseIndexer {
    // Clean content before chunking
    const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);

-    // Semantic chunking
-    const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
+    // Semantic chunking (async for LSP-based chunking)
+    const chunks = await chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG, filePath, this.root);

    // Assign chunk IDs
    const chunksWithIds = this.graphBuilder

@@ -1348,6 +1348,29 @@ class CodebaseIndexer {
    return this._chunkCache.get(chunkId) || null;
  }

+  /**
+   * Find all chunks belonging to a specific file path.
+   * @param {string} filePath - Relative file path (e.g. "src/auth.ts")
+   * @returns {Promise<Array>} Array of chunks from this file
+   */
+  async findChunksByPath(filePath) {
+    // Ensure chunk cache is loaded
+    await this.findChunkById("__force_cache_load__");
+
+    if (!this._chunkCache) return [];
+
+    const chunks = [];
+    for (const chunk of this._chunkCache.values()) {
+      if (chunk.file === filePath) {
+        chunks.push(chunk);
+      }
+    }
+
+    // Sort by chunk_index
+    chunks.sort((a, b) => (a.chunk_index || 0) - (b.chunk_index || 0));
+    return chunks;
+  }
+
  cosineSimilarity(vecA, vecB) {
    let dotProduct = 0;
    let normA = 0;
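
findChunksByPath() is the primitive behind both the path attach mode and expandChunkContext(); a usage sketch:

// Returns the file's chunks sorted by chunk_index, or [] if the file isn't indexed.
const chunks = await indexer.findChunksByPath("src/auth.ts")
// The "__force_cache_load__" lookup above appears to exist only to warm the chunk cache.
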
package/vectorizer.yaml
CHANGED
@@ -23,14 +23,14 @@ vectorizer:
    strategy: "semantic" # fixed | semantic
    markdown:
      split_by_headings: true
-      min_chunk_size:
-      max_chunk_size:
+      min_chunk_size: 1000 # Merge small sections (avoid header-only chunks)
+      max_chunk_size: 8000 # Large chunks for docs (SQL schemas, API specs, etc.)
      preserve_heading_hierarchy: true
    code:
      split_by_functions: true
      include_function_signature: true
-      min_chunk_size:
-      max_chunk_size:
+      min_chunk_size: 600 # Function + godoc/JSDoc (avoid tiny chunks)
+      max_chunk_size: 3000 # Allow larger chunks (class with multiple methods)
    fixed:
      max_chars: 1500
