@comfanion/usethis_search 3.0.0-dev.9 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.ts +263 -0
- package/file-indexer.ts +1 -1
- package/index.ts +0 -2
- package/package.json +12 -5
- package/tools/codeindex.ts +2 -2
- package/tools/search.ts +254 -66
- package/vectorizer/analyzers/lsp-analyzer.ts +7 -7
- package/vectorizer/analyzers/regex-analyzer.ts +358 -61
- package/vectorizer/chunk-store.ts +207 -0
- package/vectorizer/chunkers/code-chunker.ts +74 -24
- package/vectorizer/chunkers/markdown-chunker.ts +69 -7
- package/vectorizer/graph-builder.ts +207 -15
- package/vectorizer/graph-db.ts +161 -164
- package/vectorizer/hybrid-search.ts +1 -1
- package/vectorizer/{index.js → index.ts} +796 -160
- package/vectorizer.yaml +20 -2
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ChunkStore — SQLite-based persistent chunk storage.
|
|
3
|
+
* Populated by Phase 1 (no vectors needed). Provides BM25 search
|
|
4
|
+
* and metadata queries immediately, before embedding is complete.
|
|
5
|
+
*
|
|
6
|
+
* Uses bun:sqlite with WAL mode for concurrent read access.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { Database } from "bun:sqlite"
|
|
10
|
+
|
|
11
|
+
export interface StoredChunk {
  /** Stable unique identifier; primary key of the `chunks` table. */
  chunk_id: string
  /** Path of the source file this chunk was extracted from. */
  file: string
  /** Position of the chunk within its file (0-based). */
  chunk_index: number
  /** Raw chunk text. */
  content: string
  /** Coarse file category (stored as-is; '' when unknown). */
  file_type: string
  /** Programming language of the source file ('' when unknown). */
  language: string
  // NOTE(review): stored as TEXT; format (ISO date vs. epoch string) not
  // visible here — confirm against the indexer that writes it.
  last_modified: string
  /** Size of the source file in bytes. */
  file_size: number
  /** For markdown chunks: heading trail, e.g. "H1 > H2 > H3" ('' otherwise). */
  heading_context: string
  /** Enclosing function name for code chunks ('' when not applicable). */
  function_name: string
  /** Enclosing class name for code chunks ('' when not applicable). */
  class_name: string
  // NOTE(review): stored as a single TEXT column; presumably a serialized
  // list — verify the delimiter with the writer before parsing.
  tags: string
  /** First source line of the chunk (-1 when unknown). */
  start_line: number
  /** Last source line of the chunk (-1 when unknown). */
  end_line: number
  /** True when the source file is archived (stored as INTEGER 0/1). */
  archived: boolean
  /** True once Phase 2 has embedded this chunk (stored as INTEGER 0/1). */
  vectorized: boolean
}
|
|
29
|
+
|
|
30
|
+
export class ChunkStore {
|
|
31
|
+
private db: Database | null = null
|
|
32
|
+
|
|
33
|
+
// Prepared statements
|
|
34
|
+
private _stmtInsert: any = null
|
|
35
|
+
private _stmtByFile: any = null
|
|
36
|
+
private _stmtDeleteByFile: any = null
|
|
37
|
+
private _stmtAll: any = null
|
|
38
|
+
private _stmtByChunkId: any = null
|
|
39
|
+
private _stmtMarkVectorized: any = null
|
|
40
|
+
private _stmtHasVectors: any = null
|
|
41
|
+
private _stmtCount: any = null
|
|
42
|
+
private _stmtSearch: any = null
|
|
43
|
+
|
|
44
|
+
constructor(private dbPath: string) {}
|
|
45
|
+
|
|
46
|
+
async init(): Promise<this> {
|
|
47
|
+
const fullPath = this.dbPath.endsWith(".db") ? this.dbPath : this.dbPath + ".db"
|
|
48
|
+
this.db = new Database(fullPath)
|
|
49
|
+
|
|
50
|
+
this.db.exec("PRAGMA journal_mode = WAL")
|
|
51
|
+
this.db.exec("PRAGMA synchronous = NORMAL")
|
|
52
|
+
this.db.exec("PRAGMA cache_size = -4000") // 4MB cache
|
|
53
|
+
|
|
54
|
+
this.db.exec(`
|
|
55
|
+
CREATE TABLE IF NOT EXISTS chunks (
|
|
56
|
+
chunk_id TEXT PRIMARY KEY,
|
|
57
|
+
file TEXT NOT NULL,
|
|
58
|
+
chunk_index INTEGER NOT NULL DEFAULT 0,
|
|
59
|
+
content TEXT NOT NULL,
|
|
60
|
+
file_type TEXT NOT NULL DEFAULT '',
|
|
61
|
+
language TEXT NOT NULL DEFAULT '',
|
|
62
|
+
last_modified TEXT NOT NULL DEFAULT '',
|
|
63
|
+
file_size INTEGER NOT NULL DEFAULT 0,
|
|
64
|
+
heading_context TEXT NOT NULL DEFAULT '',
|
|
65
|
+
function_name TEXT NOT NULL DEFAULT '',
|
|
66
|
+
class_name TEXT NOT NULL DEFAULT '',
|
|
67
|
+
tags TEXT NOT NULL DEFAULT '',
|
|
68
|
+
start_line INTEGER NOT NULL DEFAULT -1,
|
|
69
|
+
end_line INTEGER NOT NULL DEFAULT -1,
|
|
70
|
+
archived INTEGER NOT NULL DEFAULT 0,
|
|
71
|
+
vectorized INTEGER NOT NULL DEFAULT 0
|
|
72
|
+
)
|
|
73
|
+
`)
|
|
74
|
+
|
|
75
|
+
this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)")
|
|
76
|
+
this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_vectorized ON chunks(vectorized)")
|
|
77
|
+
this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_language ON chunks(language)")
|
|
78
|
+
|
|
79
|
+
// Prepare statements
|
|
80
|
+
this._stmtInsert = this.db.prepare(`
|
|
81
|
+
INSERT OR REPLACE INTO chunks
|
|
82
|
+
(chunk_id, file, chunk_index, content, file_type, language, last_modified, file_size,
|
|
83
|
+
heading_context, function_name, class_name, tags, start_line, end_line, archived, vectorized)
|
|
84
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0)
|
|
85
|
+
`)
|
|
86
|
+
this._stmtByFile = this.db.prepare("SELECT * FROM chunks WHERE file = ?")
|
|
87
|
+
this._stmtDeleteByFile = this.db.prepare("DELETE FROM chunks WHERE file = ?")
|
|
88
|
+
this._stmtAll = this.db.prepare("SELECT * FROM chunks")
|
|
89
|
+
this._stmtByChunkId = this.db.prepare("SELECT * FROM chunks WHERE chunk_id = ?")
|
|
90
|
+
this._stmtMarkVectorized = this.db.prepare("UPDATE chunks SET vectorized = 1 WHERE file = ?")
|
|
91
|
+
this._stmtHasVectors = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks WHERE vectorized = 0")
|
|
92
|
+
this._stmtCount = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks")
|
|
93
|
+
|
|
94
|
+
return this
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* Store chunks from Phase 1 (batch, in transaction).
|
|
99
|
+
*/
|
|
100
|
+
storeChunks(rows: Array<{
|
|
101
|
+
chunk_id: string, file: string, chunk_index: number, content: string,
|
|
102
|
+
file_type: string, language: string, last_modified: string, file_size: number,
|
|
103
|
+
heading_context: string, function_name: string, class_name: string, tags: string,
|
|
104
|
+
start_line: number, end_line: number, archived: boolean
|
|
105
|
+
}>): void {
|
|
106
|
+
if (!this.db) throw new Error("ChunkStore not initialized")
|
|
107
|
+
|
|
108
|
+
const insertMany = this.db.transaction((items: typeof rows) => {
|
|
109
|
+
for (const r of items) {
|
|
110
|
+
this._stmtInsert.run(
|
|
111
|
+
r.chunk_id, r.file, r.chunk_index, r.content,
|
|
112
|
+
r.file_type, r.language, r.last_modified, r.file_size,
|
|
113
|
+
r.heading_context, r.function_name, r.class_name, r.tags,
|
|
114
|
+
r.start_line, r.end_line, r.archived ? 1 : 0
|
|
115
|
+
)
|
|
116
|
+
}
|
|
117
|
+
})
|
|
118
|
+
insertMany(rows)
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
/**
|
|
122
|
+
* Delete all chunks for a file (before re-indexing).
|
|
123
|
+
*/
|
|
124
|
+
deleteByFile(filePath: string): void {
|
|
125
|
+
if (!this.db) throw new Error("ChunkStore not initialized")
|
|
126
|
+
this._stmtDeleteByFile.run(filePath)
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* Mark all chunks for a file as vectorized (Phase 2 complete).
|
|
131
|
+
*/
|
|
132
|
+
markVectorized(filePath: string): void {
|
|
133
|
+
if (!this.db) throw new Error("ChunkStore not initialized")
|
|
134
|
+
this._stmtMarkVectorized.run(filePath)
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
/**
|
|
138
|
+
* Check if all chunks have vectors.
|
|
139
|
+
*/
|
|
140
|
+
hasUnvectorizedChunks(): boolean {
|
|
141
|
+
if (!this.db) return false
|
|
142
|
+
const row = this._stmtHasVectors.get() as { cnt: number }
|
|
143
|
+
return row.cnt > 0
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
/**
|
|
147
|
+
* Get all chunks (for BM25 index building).
|
|
148
|
+
*/
|
|
149
|
+
getAllChunks(): StoredChunk[] {
|
|
150
|
+
if (!this.db) return []
|
|
151
|
+
return this._stmtAll.all().map((r: any) => this.toChunk(r))
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
/**
|
|
155
|
+
* Get chunks for a specific file.
|
|
156
|
+
*/
|
|
157
|
+
getChunksByFile(filePath: string): StoredChunk[] {
|
|
158
|
+
if (!this.db) return []
|
|
159
|
+
return this._stmtByFile.all(filePath).map((r: any) => this.toChunk(r))
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
/**
|
|
163
|
+
* Get a single chunk by ID.
|
|
164
|
+
*/
|
|
165
|
+
getChunkById(chunkId: string): StoredChunk | null {
|
|
166
|
+
if (!this.db) return null
|
|
167
|
+
const row = this._stmtByChunkId.get(chunkId)
|
|
168
|
+
return row ? this.toChunk(row) : null
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/**
|
|
172
|
+
* Total chunk count.
|
|
173
|
+
*/
|
|
174
|
+
count(): number {
|
|
175
|
+
if (!this.db) return 0
|
|
176
|
+
const row = this._stmtCount.get() as { cnt: number }
|
|
177
|
+
return row.cnt
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
close(): void {
|
|
181
|
+
if (this.db) {
|
|
182
|
+
this.db.close()
|
|
183
|
+
this.db = null
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
private toChunk(row: any): StoredChunk {
|
|
188
|
+
return {
|
|
189
|
+
chunk_id: row.chunk_id,
|
|
190
|
+
file: row.file,
|
|
191
|
+
chunk_index: row.chunk_index,
|
|
192
|
+
content: row.content,
|
|
193
|
+
file_type: row.file_type,
|
|
194
|
+
language: row.language,
|
|
195
|
+
last_modified: row.last_modified,
|
|
196
|
+
file_size: row.file_size,
|
|
197
|
+
heading_context: row.heading_context,
|
|
198
|
+
function_name: row.function_name,
|
|
199
|
+
class_name: row.class_name,
|
|
200
|
+
tags: row.tags,
|
|
201
|
+
start_line: row.start_line,
|
|
202
|
+
end_line: row.end_line,
|
|
203
|
+
archived: !!row.archived,
|
|
204
|
+
vectorized: !!row.vectorized,
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
}
|
|
@@ -23,6 +23,8 @@ export interface CodeChunk {
|
|
|
23
23
|
content: string
|
|
24
24
|
function_name?: string
|
|
25
25
|
class_name?: string
|
|
26
|
+
start_line?: number
|
|
27
|
+
end_line?: number
|
|
26
28
|
}
|
|
27
29
|
|
|
28
30
|
// ── Block detection ─────────────────────────────────────────────────────────
|
|
@@ -172,31 +174,74 @@ function findPythonBlockEnd(lines: string[], startLine: number): number {
|
|
|
172
174
|
return lines.length - 1
|
|
173
175
|
}
|
|
174
176
|
|
|
175
|
-
// ── Fallback: line-based splitting ──────────────────────────────────────────
|
|
177
|
+
// ── Fallback: line-based splitting ──────────────────────────────────────────
|
|
178
|
+
|
|
179
|
+
function splitByLines(lines: string[], maxChars: number): CodeChunk[] {
|
|
180
|
+
const chunks: CodeChunk[] = []
|
|
181
|
+
let current: string[] = []
|
|
182
|
+
let currentLen = 0
|
|
183
|
+
let startLine = 0
|
|
184
|
+
|
|
185
|
+
for (let i = 0; i < lines.length; i++) {
|
|
186
|
+
const line = lines[i]
|
|
187
|
+
if (currentLen + line.length + 1 > maxChars && current.length > 0) {
|
|
188
|
+
chunks.push({ content: current.join("\n"), start_line: startLine, end_line: i - 1 })
|
|
189
|
+
current = []
|
|
190
|
+
currentLen = 0
|
|
191
|
+
startLine = i
|
|
192
|
+
}
|
|
193
|
+
current.push(line)
|
|
194
|
+
currentLen += line.length + 1
|
|
195
|
+
}
|
|
176
196
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
let current: string[] = []
|
|
180
|
-
let currentLen = 0
|
|
181
|
-
|
|
182
|
-
for (const line of lines) {
|
|
183
|
-
if (currentLen + line.length + 1 > maxChars && current.length > 0) {
|
|
184
|
-
chunks.push({ content: current.join("\n") })
|
|
185
|
-
current = []
|
|
186
|
-
currentLen = 0
|
|
197
|
+
if (current.length > 0) {
|
|
198
|
+
chunks.push({ content: current.join("\n"), start_line: startLine, end_line: lines.length - 1 })
|
|
187
199
|
}
|
|
188
|
-
current.push(line)
|
|
189
|
-
currentLen += line.length + 1
|
|
190
|
-
}
|
|
191
200
|
|
|
192
|
-
|
|
193
|
-
chunks.push({ content: current.join("\n") })
|
|
201
|
+
return chunks
|
|
194
202
|
}
|
|
195
203
|
|
|
196
|
-
|
|
197
|
-
|
|
204
|
+
// ── Split large chunks preserving line numbers ────────────────────────────
|
|
205
|
+
|
|
206
|
+
function splitChunkByLines(chunk: CodeChunk, maxChars: number): CodeChunk[] {
|
|
207
|
+
const lines = chunk.content.split("\n")
|
|
208
|
+
const baseLine = chunk.start_line || 0
|
|
209
|
+
|
|
210
|
+
const parts: CodeChunk[] = []
|
|
211
|
+
let current: string[] = []
|
|
212
|
+
let currentLen = 0
|
|
213
|
+
let startLine = baseLine
|
|
214
|
+
|
|
215
|
+
for (let i = 0; i < lines.length; i++) {
|
|
216
|
+
const line = lines[i]
|
|
217
|
+
if (currentLen + line.length + 1 > maxChars && current.length > 0) {
|
|
218
|
+
parts.push({
|
|
219
|
+
...chunk,
|
|
220
|
+
content: current.join("\n"),
|
|
221
|
+
start_line: startLine,
|
|
222
|
+
end_line: baseLine + i - 1,
|
|
223
|
+
})
|
|
224
|
+
current = []
|
|
225
|
+
currentLen = 0
|
|
226
|
+
startLine = baseLine + i
|
|
227
|
+
}
|
|
228
|
+
current.push(line)
|
|
229
|
+
currentLen += line.length + 1
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
if (current.length > 0) {
|
|
233
|
+
parts.push({
|
|
234
|
+
...chunk,
|
|
235
|
+
content: current.join("\n"),
|
|
236
|
+
start_line: startLine,
|
|
237
|
+
end_line: baseLine + lines.length - 1,
|
|
238
|
+
})
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
return parts
|
|
242
|
+
}
|
|
198
243
|
|
|
199
|
-
// ── Public API ──────────────────────────────────────────────────────────────
|
|
244
|
+
// ── Public API ──────────────────────────────────────────────────────────────
|
|
200
245
|
|
|
201
246
|
/**
|
|
202
247
|
* Chunk source code by functions/classes.
|
|
@@ -231,12 +276,13 @@ export function chunkCode(
|
|
|
231
276
|
if (block.startLine > lastEnd + 1) {
|
|
232
277
|
const gapContent = lines.slice(lastEnd + 1, block.startLine).join("\n").trim()
|
|
233
278
|
if (gapContent.length >= config.min_chunk_size) {
|
|
234
|
-
chunks.push({ content: gapContent })
|
|
279
|
+
chunks.push({ content: gapContent, start_line: lastEnd + 1, end_line: block.startLine - 1 })
|
|
235
280
|
} else if (gapContent.length > 0 && chunks.length > 0) {
|
|
236
281
|
// Merge small gap with previous chunk
|
|
237
282
|
chunks[chunks.length - 1].content += "\n\n" + gapContent
|
|
283
|
+
chunks[chunks.length - 1].end_line = block.startLine - 1
|
|
238
284
|
} else if (gapContent.length > 0) {
|
|
239
|
-
chunks.push({ content: gapContent })
|
|
285
|
+
chunks.push({ content: gapContent, start_line: lastEnd + 1, end_line: block.startLine - 1 })
|
|
240
286
|
}
|
|
241
287
|
}
|
|
242
288
|
|
|
@@ -259,6 +305,8 @@ export function chunkCode(
|
|
|
259
305
|
chunks.push({
|
|
260
306
|
content: gap,
|
|
261
307
|
class_name: block.name,
|
|
308
|
+
start_line: classLastEnd + 1,
|
|
309
|
+
end_line: method.startLine - 1,
|
|
262
310
|
})
|
|
263
311
|
}
|
|
264
312
|
}
|
|
@@ -267,6 +315,8 @@ export function chunkCode(
|
|
|
267
315
|
content: lines.slice(method.startLine, method.endLine + 1).join("\n"),
|
|
268
316
|
function_name: method.name,
|
|
269
317
|
class_name: block.name,
|
|
318
|
+
start_line: method.startLine,
|
|
319
|
+
end_line: method.endLine,
|
|
270
320
|
})
|
|
271
321
|
classLastEnd = method.endLine
|
|
272
322
|
}
|
|
@@ -275,7 +325,7 @@ export function chunkCode(
|
|
|
275
325
|
if (classLastEnd < block.endLine) {
|
|
276
326
|
const tail = lines.slice(classLastEnd + 1, block.endLine + 1).join("\n").trim()
|
|
277
327
|
if (tail) {
|
|
278
|
-
chunks.push({ content: tail, class_name: block.name })
|
|
328
|
+
chunks.push({ content: tail, class_name: block.name, start_line: classLastEnd + 1, end_line: block.endLine })
|
|
279
329
|
}
|
|
280
330
|
}
|
|
281
331
|
} else {
|
|
@@ -312,9 +362,9 @@ export function chunkCode(
|
|
|
312
362
|
const result: CodeChunk[] = []
|
|
313
363
|
for (const chunk of chunks) {
|
|
314
364
|
if (chunk.content.length > config.max_chunk_size) {
|
|
315
|
-
const parts =
|
|
365
|
+
const parts = splitChunkByLines(chunk, config.max_chunk_size)
|
|
316
366
|
for (const p of parts) {
|
|
317
|
-
result.push(
|
|
367
|
+
result.push(p)
|
|
318
368
|
}
|
|
319
369
|
} else {
|
|
320
370
|
result.push(chunk)
|
|
@@ -22,14 +22,18 @@ export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = {
|
|
|
22
22
|
export interface MarkdownChunk {
|
|
23
23
|
content: string
|
|
24
24
|
heading_context: string // "H1 > H2 > H3"
|
|
25
|
+
start_line?: number
|
|
26
|
+
end_line?: number
|
|
25
27
|
}
|
|
26
28
|
|
|
27
29
|
// ── Internal types ──────────────────────────────────────────────────────────
|
|
28
30
|
|
|
29
31
|
interface Section {
|
|
30
|
-
level: number //
|
|
32
|
+
level: number //1-6 for headings, 0 for preamble
|
|
31
33
|
heading: string
|
|
32
34
|
body: string
|
|
35
|
+
start_line: number
|
|
36
|
+
end_line: number
|
|
33
37
|
}
|
|
34
38
|
|
|
35
39
|
// ── Parsing ─────────────────────────────────────────────────────────────────
|
|
@@ -38,19 +42,23 @@ interface Section {
|
|
|
38
42
|
function parseSections(content: string): Section[] {
|
|
39
43
|
const lines = content.split("\n")
|
|
40
44
|
const sections: Section[] = []
|
|
41
|
-
let currentSection: Section = { level: 0, heading: "", body: "" }
|
|
45
|
+
let currentSection: Section = { level: 0, heading: "", body: "", start_line: 0, end_line: 0 }
|
|
42
46
|
|
|
43
|
-
for (
|
|
47
|
+
for (let i = 0; i < lines.length; i++) {
|
|
48
|
+
const line = lines[i]
|
|
44
49
|
const headingMatch = line.match(/^(#{1,6})\s+(.+)$/)
|
|
45
50
|
if (headingMatch) {
|
|
46
51
|
// Push previous section
|
|
47
52
|
if (currentSection.body.trim() || currentSection.heading) {
|
|
53
|
+
currentSection.end_line = i - 1
|
|
48
54
|
sections.push(currentSection)
|
|
49
55
|
}
|
|
50
56
|
currentSection = {
|
|
51
57
|
level: headingMatch[1].length,
|
|
52
58
|
heading: headingMatch[2].trim(),
|
|
53
59
|
body: "",
|
|
60
|
+
start_line: i,
|
|
61
|
+
end_line: 0,
|
|
54
62
|
}
|
|
55
63
|
} else {
|
|
56
64
|
currentSection.body += line + "\n"
|
|
@@ -59,6 +67,7 @@ function parseSections(content: string): Section[] {
|
|
|
59
67
|
|
|
60
68
|
// Push last section
|
|
61
69
|
if (currentSection.body.trim() || currentSection.heading) {
|
|
70
|
+
currentSection.end_line = lines.length - 1
|
|
62
71
|
sections.push(currentSection)
|
|
63
72
|
}
|
|
64
73
|
|
|
@@ -97,6 +106,45 @@ function splitLargeText(text: string, maxSize: number): string[] {
|
|
|
97
106
|
return chunks
|
|
98
107
|
}
|
|
99
108
|
|
|
109
|
+
function splitLargeTextWithLines(text: string, maxSize: number, startLine: number): Array<{ content: string; start_line: number; end_line: number }> {
|
|
110
|
+
if (text.length <= maxSize) {
|
|
111
|
+
const lines = text.split("\n")
|
|
112
|
+
return [{ content: text, start_line: startLine, end_line: startLine + lines.length - 1 }]
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
const chunks: Array<{ content: string; start_line: number; end_line: number }> = []
|
|
116
|
+
const lines = text.split("\n")
|
|
117
|
+
let current: string[] = []
|
|
118
|
+
let currentLen = 0
|
|
119
|
+
let chunkStartLine = startLine
|
|
120
|
+
|
|
121
|
+
for (let i = 0; i < lines.length; i++) {
|
|
122
|
+
const line = lines[i]
|
|
123
|
+
if (currentLen + line.length + 1 > maxSize && current.length > 0) {
|
|
124
|
+
chunks.push({
|
|
125
|
+
content: current.join("\n"),
|
|
126
|
+
start_line: chunkStartLine,
|
|
127
|
+
end_line: startLine + i - 1,
|
|
128
|
+
})
|
|
129
|
+
current = []
|
|
130
|
+
currentLen = 0
|
|
131
|
+
chunkStartLine = startLine + i
|
|
132
|
+
}
|
|
133
|
+
current.push(line)
|
|
134
|
+
currentLen += line.length + 1
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
if (current.length > 0) {
|
|
138
|
+
chunks.push({
|
|
139
|
+
content: current.join("\n"),
|
|
140
|
+
start_line: chunkStartLine,
|
|
141
|
+
end_line: startLine + lines.length - 1,
|
|
142
|
+
})
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
return chunks
|
|
146
|
+
}
|
|
147
|
+
|
|
100
148
|
// ── Public API ──────────────────────────────────────────────────────────────
|
|
101
149
|
|
|
102
150
|
/**
|
|
@@ -138,7 +186,12 @@ export function chunkMarkdown(
|
|
|
138
186
|
? `${"#".repeat(section.level)} ${section.heading}\n${section.body}`
|
|
139
187
|
: section.body
|
|
140
188
|
|
|
141
|
-
rawChunks.push({
|
|
189
|
+
rawChunks.push({
|
|
190
|
+
content: sectionText.trim(),
|
|
191
|
+
heading_context: headingContext,
|
|
192
|
+
start_line: section.start_line,
|
|
193
|
+
end_line: section.end_line,
|
|
194
|
+
})
|
|
142
195
|
}
|
|
143
196
|
|
|
144
197
|
// Merge small sections with previous
|
|
@@ -150,7 +203,11 @@ export function chunkMarkdown(
|
|
|
150
203
|
) {
|
|
151
204
|
const prev = merged[merged.length - 1]
|
|
152
205
|
prev.content += "\n\n" + chunk.content
|
|
153
|
-
//
|
|
206
|
+
// Merge end_line
|
|
207
|
+
if (chunk.end_line !== undefined) {
|
|
208
|
+
prev.end_line = chunk.end_line
|
|
209
|
+
}
|
|
210
|
+
// Keep deepest heading context
|
|
154
211
|
if (chunk.heading_context) {
|
|
155
212
|
prev.heading_context = chunk.heading_context
|
|
156
213
|
}
|
|
@@ -163,9 +220,14 @@ export function chunkMarkdown(
|
|
|
163
220
|
const result: MarkdownChunk[] = []
|
|
164
221
|
for (const chunk of merged) {
|
|
165
222
|
if (chunk.content.length > config.max_chunk_size) {
|
|
166
|
-
const parts =
|
|
223
|
+
const parts = splitLargeTextWithLines(chunk.content, config.max_chunk_size, chunk.start_line || 0)
|
|
167
224
|
for (const part of parts) {
|
|
168
|
-
result.push({
|
|
225
|
+
result.push({
|
|
226
|
+
content: part.content,
|
|
227
|
+
heading_context: chunk.heading_context,
|
|
228
|
+
start_line: part.start_line,
|
|
229
|
+
end_line: part.end_line,
|
|
230
|
+
})
|
|
169
231
|
}
|
|
170
232
|
} else {
|
|
171
233
|
result.push(chunk)
|