@comfanion/usethis_search 3.0.0-dev.26 → 3.0.0-dev.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/tools/search.ts +13 -3
- package/vectorizer/chunk-store.ts +207 -0
- package/vectorizer/index.ts +478 -129
package/package.json  CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "3.0.0-dev.26",
+  "version": "3.0.0-dev.28",
   "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
   "type": "module",
   "main": "./index.ts",
@@ -32,6 +32,7 @@
     "vectorizer/query-cache.ts",
     "vectorizer/search-metrics.ts",
     "vectorizer/graph-db.ts",
+    "vectorizer/chunk-store.ts",
     "vectorizer/usage-tracker.ts",
     "vectorizer/graph-builder.ts",
     "vectorizer/analyzers/regex-analyzer.ts",
package/tools/search.ts  CHANGED

@@ -218,9 +218,12 @@ Examples:
   // ── Reranking — boost results where query keywords appear in text ──────
   const queryKeywords = args.query.toLowerCase().split(/\s+/).filter((w: string) => w.length > 2)
   for (const r of allResults) {
+    const isBM25Only = !!r._bm25Only
     const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
     r._vectorScore = vectorScore
-    r._bm25Component =
+    r._bm25Component = isBM25Only
+      ? (r._combinedScore ?? 0)
+      : (r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0)
     const baseScore = r._combinedScore ?? vectorScore

     const text = (r.content || "").toLowerCase()
@@ -264,10 +267,15 @@ Examples:

   // ── Confidence signal ──────────────────────────────────────────────────
   const topScore = sortedGroups[0].best._finalScore ?? 0
+  const hasBM25Only = allResults.some((r: any) => r._bm25Only)
   const scope = args.searchAll ? "all indexes" : `index "${indexName}"`
   const filterLabel = args.filter ? ` filter:"${args.filter}"` : ""
   let output = `## Search Results for: "${args.query}" (${scope}${filterLabel})\n\n`

+  if (hasBM25Only) {
+    output += `> **BM25-only mode** — vector embeddings not yet available. Results are keyword-based. Quality will improve after embedding completes.\n\n`
+  }
+
   if (topScore < 0.45) {
     output += `> **Low confidence results.** Best score: ${topScore.toFixed(3)}. These results may not be relevant to your query.\n> Try more specific keywords or different phrasing.\n\n`
   }
@@ -287,8 +295,10 @@ Examples:
   const metaLine = metaParts.length > 0 ? ` (${metaParts.join(", ")})` : ""

   // Score breakdown
-  const breakdownParts: string[] =
-
+  const breakdownParts: string[] = r._bm25Only
+    ? [`bm25: ${(r._bm25Component ?? 0).toFixed(2)}`]
+    : [`vec: ${(r._vectorScore ?? 0).toFixed(2)}`]
+  if (!r._bm25Only && r._bm25Component > 0.005) breakdownParts.push(`bm25: +${r._bm25Component.toFixed(2)}`)
   if (r._keywordBonus > 0.005) breakdownParts.push(`kw: +${r._keywordBonus.toFixed(2)}`)
   const breakdown = breakdownParts.join(", ")
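Aside on the reranking change above: each result's score is now split into a vector component and a BM25 component so the breakdown line can label them separately, while BM25-only rows keep their entire score under the `bm25:` label. A minimal sketch of that split, using a hypothetical `Result` type and `scoreBreakdown` helper (the shipped code computes these fields inline on its result rows):

```typescript
// Sketch only — `Result` and `scoreBreakdown` are illustrative names.
interface Result {
  _distance: number | null   // LanceDB distance; null in BM25-only mode
  _combinedScore?: number    // hybrid score, or normalized BM25 score when _bm25Only
  _bm25Only?: boolean        // set by the indexer when no vectors exist yet
}

function scoreBreakdown(r: Result): { vectorScore: number; bm25Component: number } {
  // Distance 0 → score 1; distance 2 (opposite unit vectors) → score 0.
  const vectorScore = r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0
  // BM25-only: the whole combined score is keyword-based.
  // Hybrid: the BM25 share is whatever the combined score adds beyond the vector score.
  const bm25Component = r._bm25Only
    ? (r._combinedScore ?? 0)
    : (r._combinedScore != null ? Math.max(0, r._combinedScore - vectorScore) : 0)
  return { vectorScore, bm25Component }
}

// A hybrid hit at distance 0.6 with combined score 0.82:
// vectorScore = 0.70, bm25Component = 0.12 → rendered as "vec: 0.70, bm25: +0.12"
console.log(scoreBreakdown({ _distance: 0.6, _combinedScore: 0.82 }))
```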
package/vectorizer/chunk-store.ts  ADDED

@@ -0,0 +1,207 @@
+/**
+ * ChunkStore — SQLite-based persistent chunk storage.
+ * Populated by Phase 1 (no vectors needed). Provides BM25 search
+ * and metadata queries immediately, before embedding is complete.
+ *
+ * Uses bun:sqlite with WAL mode for concurrent read access.
+ */
+
+import { Database } from "bun:sqlite"
+
+export interface StoredChunk {
+  chunk_id: string
+  file: string
+  chunk_index: number
+  content: string
+  file_type: string
+  language: string
+  last_modified: string
+  file_size: number
+  heading_context: string
+  function_name: string
+  class_name: string
+  tags: string
+  start_line: number
+  end_line: number
+  archived: boolean
+  vectorized: boolean
+}
+
+export class ChunkStore {
+  private db: Database | null = null
+
+  // Prepared statements
+  private _stmtInsert: any = null
+  private _stmtByFile: any = null
+  private _stmtDeleteByFile: any = null
+  private _stmtAll: any = null
+  private _stmtByChunkId: any = null
+  private _stmtMarkVectorized: any = null
+  private _stmtHasVectors: any = null
+  private _stmtCount: any = null
+  private _stmtSearch: any = null
+
+  constructor(private dbPath: string) {}
+
+  async init(): Promise<this> {
+    const fullPath = this.dbPath.endsWith(".db") ? this.dbPath : this.dbPath + ".db"
+    this.db = new Database(fullPath)
+
+    this.db.exec("PRAGMA journal_mode = WAL")
+    this.db.exec("PRAGMA synchronous = NORMAL")
+    this.db.exec("PRAGMA cache_size = -4000") // 4MB cache
+
+    this.db.exec(`
+      CREATE TABLE IF NOT EXISTS chunks (
+        chunk_id TEXT PRIMARY KEY,
+        file TEXT NOT NULL,
+        chunk_index INTEGER NOT NULL DEFAULT 0,
+        content TEXT NOT NULL,
+        file_type TEXT NOT NULL DEFAULT '',
+        language TEXT NOT NULL DEFAULT '',
+        last_modified TEXT NOT NULL DEFAULT '',
+        file_size INTEGER NOT NULL DEFAULT 0,
+        heading_context TEXT NOT NULL DEFAULT '',
+        function_name TEXT NOT NULL DEFAULT '',
+        class_name TEXT NOT NULL DEFAULT '',
+        tags TEXT NOT NULL DEFAULT '',
+        start_line INTEGER NOT NULL DEFAULT -1,
+        end_line INTEGER NOT NULL DEFAULT -1,
+        archived INTEGER NOT NULL DEFAULT 0,
+        vectorized INTEGER NOT NULL DEFAULT 0
+      )
+    `)
+
+    this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)")
+    this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_vectorized ON chunks(vectorized)")
+    this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_language ON chunks(language)")
+
+    // Prepare statements
+    this._stmtInsert = this.db.prepare(`
+      INSERT OR REPLACE INTO chunks
+        (chunk_id, file, chunk_index, content, file_type, language, last_modified, file_size,
+         heading_context, function_name, class_name, tags, start_line, end_line, archived, vectorized)
+      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0)
+    `)
+    this._stmtByFile = this.db.prepare("SELECT * FROM chunks WHERE file = ?")
+    this._stmtDeleteByFile = this.db.prepare("DELETE FROM chunks WHERE file = ?")
+    this._stmtAll = this.db.prepare("SELECT * FROM chunks")
+    this._stmtByChunkId = this.db.prepare("SELECT * FROM chunks WHERE chunk_id = ?")
+    this._stmtMarkVectorized = this.db.prepare("UPDATE chunks SET vectorized = 1 WHERE file = ?")
+    this._stmtHasVectors = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks WHERE vectorized = 0")
+    this._stmtCount = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks")
+
+    return this
+  }
+
+  /**
+   * Store chunks from Phase 1 (batch, in transaction).
+   */
+  storeChunks(rows: Array<{
+    chunk_id: string, file: string, chunk_index: number, content: string,
+    file_type: string, language: string, last_modified: string, file_size: number,
+    heading_context: string, function_name: string, class_name: string, tags: string,
+    start_line: number, end_line: number, archived: boolean
+  }>): void {
+    if (!this.db) throw new Error("ChunkStore not initialized")
+
+    const insertMany = this.db.transaction((items: typeof rows) => {
+      for (const r of items) {
+        this._stmtInsert.run(
+          r.chunk_id, r.file, r.chunk_index, r.content,
+          r.file_type, r.language, r.last_modified, r.file_size,
+          r.heading_context, r.function_name, r.class_name, r.tags,
+          r.start_line, r.end_line, r.archived ? 1 : 0
+        )
+      }
+    })
+    insertMany(rows)
+  }
+
+  /**
+   * Delete all chunks for a file (before re-indexing).
+   */
+  deleteByFile(filePath: string): void {
+    if (!this.db) throw new Error("ChunkStore not initialized")
+    this._stmtDeleteByFile.run(filePath)
+  }
+
+  /**
+   * Mark all chunks for a file as vectorized (Phase 2 complete).
+   */
+  markVectorized(filePath: string): void {
+    if (!this.db) throw new Error("ChunkStore not initialized")
+    this._stmtMarkVectorized.run(filePath)
+  }
+
+  /**
+   * Check if all chunks have vectors.
+   */
+  hasUnvectorizedChunks(): boolean {
+    if (!this.db) return false
+    const row = this._stmtHasVectors.get() as { cnt: number }
+    return row.cnt > 0
+  }
+
+  /**
+   * Get all chunks (for BM25 index building).
+   */
+  getAllChunks(): StoredChunk[] {
+    if (!this.db) return []
+    return this._stmtAll.all().map((r: any) => this.toChunk(r))
+  }
+
+  /**
+   * Get chunks for a specific file.
+   */
+  getChunksByFile(filePath: string): StoredChunk[] {
+    if (!this.db) return []
+    return this._stmtByFile.all(filePath).map((r: any) => this.toChunk(r))
+  }
+
+  /**
+   * Get a single chunk by ID.
+   */
+  getChunkById(chunkId: string): StoredChunk | null {
+    if (!this.db) return null
+    const row = this._stmtByChunkId.get(chunkId)
+    return row ? this.toChunk(row) : null
+  }
+
+  /**
+   * Total chunk count.
+   */
+  count(): number {
+    if (!this.db) return 0
+    const row = this._stmtCount.get() as { cnt: number }
+    return row.cnt
+  }
+
+  close(): void {
+    if (this.db) {
+      this.db.close()
+      this.db = null
+    }
+  }
+
+  private toChunk(row: any): StoredChunk {
+    return {
+      chunk_id: row.chunk_id,
+      file: row.file,
+      chunk_index: row.chunk_index,
+      content: row.content,
+      file_type: row.file_type,
+      language: row.language,
+      last_modified: row.last_modified,
+      file_size: row.file_size,
+      heading_context: row.heading_context,
+      function_name: row.function_name,
+      class_name: row.class_name,
+      tags: row.tags,
+      start_line: row.start_line,
+      end_line: row.end_line,
+      archived: !!row.archived,
+      vectorized: !!row.vectorized,
+    }
+  }
+}
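Taken together, the new module gives the indexer a chunk database that is queryable before any embedding has run. A hedged usage sketch — the path and the sample row are invented for illustration; the wiring in index.ts below is the real integration:

```typescript
import { ChunkStore } from "./chunk-store.ts"

// Phase 1: persist chunks as soon as they are produced — no model required.
const store = await new ChunkStore("/tmp/example-index/chunks").init() // ".db" appended
store.deleteByFile("src/app.ts") // makes re-indexing idempotent
store.storeChunks([{
  chunk_id: "chunk:src/app.ts::_chunk_0", file: "src/app.ts", chunk_index: 0,
  content: "export function main() { /* ... */ }",
  file_type: "code", language: "typescript",
  last_modified: new Date().toISOString(), file_size: 42,
  heading_context: "", function_name: "main", class_name: "",
  tags: "", start_line: 1, end_line: 1, archived: false,
}])

console.log(store.count())                 // 1 — BM25 can be built right away
console.log(store.hasUnvectorizedChunks()) // true — Phase 2 hasn't run yet

// Phase 2: once embeddings for the file land in LanceDB, flip the flag.
store.markVectorized("src/app.ts")
console.log(store.hasUnvectorizedChunks()) // false
store.close()
```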
package/vectorizer/index.ts  CHANGED

@@ -18,6 +18,7 @@ import { SearchMetrics } from "./search-metrics.ts";
 import { GraphDB } from "./graph-db.ts";
 import { GraphBuilder, isStructuralPredicate } from "./graph-builder.ts";
 import { UsageTracker } from "./usage-tracker.ts";
+import { ChunkStore } from "./chunk-store.ts";

 // Suppress transformers.js logs unless DEBUG is set
 const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
@@ -448,6 +449,7 @@ class CodebaseIndexer {
     this.graphBuilder = null; // Graph builder orchestrator
     this._chunkCache = null; // Lazy Map<chunk_id, row> for findChunkById
     this.usageTracker = null; // Usage tracking & provenance (v3)
+    this.chunkStore = null; // SQLite chunk store (BM25 without vectors)
   }

   async init() {
@@ -459,6 +461,16 @@ class CodebaseIndexer {
     this.db = await lancedb.connect(path.join(this.cacheDir, "lancedb"));
     await this.loadHashes();

+    // ChunkStore — SQLite store for BM25 search without vectors
+    try {
+      const chunkStorePath = path.join(this.cacheDir, "chunks.db");
+      this.chunkStore = await new ChunkStore(chunkStorePath).init();
+      if (DEBUG) console.log(`[vectorizer] ChunkStore initialized: ${chunkStorePath}`);
+    } catch (e) {
+      if (DEBUG) console.log(`[vectorizer] ChunkStore init failed: ${e.message || e}`);
+      this.chunkStore = null;
+    }
+
     // Graph DB — only if graph is enabled in config
     // Non-fatal: if LevelDB lock fails (parallel access), search works without graph
     if (GRAPH_CONFIG.enabled) {
@@ -512,6 +524,11 @@ class CodebaseIndexer {
     }
     this._bm25Rows = null;
     this.metrics = null;
+    // Close ChunkStore
+    if (this.chunkStore) {
+      try { this.chunkStore.close(); } catch { /* best effort */ }
+      this.chunkStore = null;
+    }
     // Close graph DB to release LevelDB lock
     if (this.graphDB) {
       try { await this.graphDB.close(); } catch { /* best effort */ }
@@ -612,7 +629,161 @@ class CodebaseIndexer {
     return this.hashes[relPath] !== currentHash;
   }

-  // ──
+  // ── Phase 1: Prepare file (chunk + graph, NO embedding) ─────────────────
+  // Returns prepared chunk data ready for embedding, or null if skipped.
+
+  async prepareFile(filePath) {
+    const relPath = path.relative(this.root, filePath);
+
+    let content;
+    try {
+      content = await fs.readFile(filePath, "utf8");
+    } catch {
+      return null;
+    }
+
+    const hash = this.fileHash(content);
+    if (this.hashes[relPath] === hash) {
+      return null; // unchanged
+    }
+
+    // Extract metadata
+    const fileMeta = await extractFileMetadata(filePath, content);
+    const archived = this.isArchived(relPath, content);
+
+    // Clean content before chunking
+    const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
+
+    // Semantic chunking
+    const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
+
+    // Assign chunk IDs
+    const chunksWithIds = this.graphBuilder
+      ? this.graphBuilder.assignChunkIds(relPath, chunks)
+      : chunks.map((c, i) => ({ ...c, chunk_id: `chunk:${relPath}::_chunk_${i}` }));
+
+    // Build graph edges (Phase 1 — no embedding needed)
+    if (this.graphBuilder && this.graphDB) {
+      await this.graphDB.deleteByFile(relPath);
+      const edgesBuilt = await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
+
+      if (edgesBuilt > 0 || DEBUG) {
+        const timestamp = new Date().toISOString().slice(11, 19);
+        const logMsg = `${timestamp} Graph built: ${relPath} (${chunksWithIds.length} chunks)`;
+        if (DEBUG) console.log(`[vectorizer] ${logMsg}`);
+        try {
+          const logPath = path.join(this.root, ".opencode", "indexer.log");
+          const fsSync = await import("fs");
+          fsSync.appendFileSync(logPath, `${logMsg}\n`);
+        } catch { /* non-fatal */ }
+      }
+
+      try {
+        await this.graphDB.setFileMeta(relPath, hash, Date.now());
+      } catch { /* non-fatal */ }
+    }
+
+    // Return prepared rows (without vector — Phase 2 fills it)
+    const rows = chunksWithIds.map((chunk, i) => ({
+      chunk_id: chunk.chunk_id,
+      file: relPath,
+      chunk_index: i,
+      content: chunk.content,
+      archived,
+      file_type: fileMeta.file_type,
+      language: fileMeta.language,
+      last_modified: fileMeta.last_modified,
+      file_size: fileMeta.file_size,
+      heading_context: chunk.heading_context || "",
+      function_name: chunk.function_name || "",
+      class_name: chunk.class_name || "",
+      tags: (fileMeta.tags || []).join(","),
+      start_line: chunk.start_line ?? -1,
+      end_line: chunk.end_line ?? -1,
+    }));
+
+    // Store chunks in ChunkStore (Phase 1 — BM25 available immediately)
+    if (this.chunkStore) {
+      try {
+        this.chunkStore.deleteByFile(relPath);
+        this.chunkStore.storeChunks(rows);
+      } catch (e) {
+        if (DEBUG) console.log(`[vectorizer] ChunkStore write failed for ${relPath}: ${e.message || e}`);
+      }
+    }
+
+    return { relPath, hash, rows };
+  }
+
+  // ── Phase 2: Batch embed + store ──────────────────────────────────────────
+  // Takes prepared rows from prepareFile(), embeds in batches, stores in LanceDB.
+
+  async embedAndStore(preparedFiles, batchSize = 32, onProgress = null) {
+    if (preparedFiles.length === 0) return 0;
+
+    // Collect all rows with their content for batch embedding
+    const allRows = [];
+    for (const pf of preparedFiles) {
+      for (const row of pf.rows) {
+        allRows.push(row);
+      }
+    }
+
+    if (allRows.length === 0) return 0;
+
+    // Load model once
+    const model = await this.loadModel();
+
+    // Batch embed
+    const allData = [];
+    for (let i = 0; i < allRows.length; i += batchSize) {
+      const batch = allRows.slice(i, i + batchSize);
+      const texts = batch.map(r => r.content);
+
+      // Embed batch — @xenova/transformers processes array inputs efficiently
+      const embeddings = [];
+      for (const text of texts) {
+        const result = await model(text, { pooling: "mean", normalize: true });
+        embeddings.push(Array.from(result.data));
+      }
+
+      for (let j = 0; j < batch.length; j++) {
+        allData.push({ ...batch[j], vector: embeddings[j] });
+      }

+      if (onProgress) {
+        onProgress(Math.min(i + batchSize, allRows.length), allRows.length, "embedding");
+      }
+    }
+
+    // Bulk store in LanceDB
+    const tableName = "chunks";
+    const tables = await this.db.tableNames();
+    if (tables.includes(tableName)) {
+      const table = await this.db.openTable(tableName);
+      await table.add(allData);
+    } else {
+      await this.db.createTable(tableName, allData);
+    }
+
+    // Update hashes + mark vectorized in ChunkStore
+    for (const pf of preparedFiles) {
+      this.hashes[pf.relPath] = pf.hash;
+      if (this.chunkStore) {
+        try { this.chunkStore.markVectorized(pf.relPath); } catch { /* non-fatal */ }
+      }
+    }
+    await this.saveHashes();
+
+    // Invalidate caches
+    if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
+    this._bm25Rows = null;
+    this._chunkCache = null;
+
+    return allData.length;
+  }
+
+  // ── Index a single file (legacy — used by freshen/on-change) ───────────

   async indexFile(filePath) {
     const relPath = path.relative(this.root, filePath);
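The two methods above split indexing into a cheap preparation pass and a model-bound embedding pass; BM25 search works as soon as the first pass finishes. A sketch of how they compose — the `reindex` driver is hypothetical, and the updated `indexAll()` later in this diff does the equivalent internally with a concurrency of 5:

```typescript
// Hypothetical driver; assumes an initialized CodebaseIndexer-like `indexer`
// and absolute file paths (prepareFile derives relPath from indexer.root).
async function reindex(indexer: any, filePaths: string[]): Promise<void> {
  // Phase 1 — chunk + graph edges. Unchanged or unreadable files return null.
  // After this loop, ChunkStore is populated and BM25-only search works.
  const prepared = []
  for (const filePath of filePaths) {
    const p = await indexer.prepareFile(filePath)
    if (p) prepared.push(p) // { relPath, hash, rows }
  }

  // Phase 2 — embed prepared chunks in batches of 32, write vectors to
  // LanceDB, then mark each file vectorized in ChunkStore.
  const embedded = await indexer.embedAndStore(prepared, 32,
    (done: number, total: number) => console.log(`embedded ${done}/${total} chunks`))
  console.log(`${embedded} chunks embedded across ${prepared.length} files`)
}
```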
@@ -701,6 +872,24 @@ class CodebaseIndexer {

     if (data.length === 0) return false;

+    // Store in ChunkStore (Phase 1 data) + mark vectorized (has embedding)
+    if (this.chunkStore) {
+      try {
+        this.chunkStore.deleteByFile(relPath);
+        this.chunkStore.storeChunks(data.map(d => ({
+          chunk_id: d.chunk_id, file: d.file, chunk_index: d.chunk_index,
+          content: d.content, file_type: d.file_type, language: d.language,
+          last_modified: d.last_modified, file_size: d.file_size,
+          heading_context: d.heading_context, function_name: d.function_name,
+          class_name: d.class_name, tags: d.tags,
+          start_line: d.start_line, end_line: d.end_line, archived: d.archived,
+        })));
+        this.chunkStore.markVectorized(relPath);
+      } catch (e) {
+        if (DEBUG) console.log(`[vectorizer] ChunkStore write failed for ${relPath}: ${e.message || e}`);
+      }
+    }
+
     const tableName = "chunks";
     const tables = await this.db.tableNames();
     if (tables.includes(tableName)) {
@@ -728,6 +917,35 @@ class CodebaseIndexer {
   async ensureBM25() {
     if (this.bm25) return this.bm25;

+    // Primary source: ChunkStore (SQLite) — available after Phase 1, no vectors needed
+    if (this.chunkStore) {
+      try {
+        const allChunks = this.chunkStore.getAllChunks();
+        if (allChunks.length > 0) {
+          // Sort for stable ID mapping between builds
+          allChunks.sort((a, b) => {
+            const ka = `${a.file}:${a.chunk_index}`;
+            const kb = `${b.file}:${b.chunk_index}`;
+            return ka.localeCompare(kb);
+          });
+
+          // Release previous data before rebuilding
+          if (this.bm25) this.bm25.clear();
+          this._bm25Rows = null;
+
+          this.bm25 = new BM25Index();
+          this.bm25.build(allChunks.map((r) => r.content));
+          this._bm25Rows = allChunks;
+
+          if (DEBUG) console.log(`[vectorizer] BM25 built from ChunkStore (${allChunks.length} chunks)`);
+          return this.bm25;
+        }
+      } catch (e) {
+        if (DEBUG) console.log("[vectorizer] BM25 from ChunkStore failed, trying LanceDB:", e.message);
+      }
+    }
+
+    // Fallback: LanceDB (legacy — for indexes without ChunkStore)
     const tableName = "chunks";
     const tables = await this.db.tableNames();
     if (!tables.includes(tableName)) return null;
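One subtlety in the `ensureBM25()` change: BM25 hits come back as positional document ids, so `_bm25Rows[i]` must be exactly the chunk whose content was passed to `build()` at position `i` — hence the `file:chunk_index` sort before building. A toy illustration of that contract (`MiniBM25` is a stand-in with the same `build()`/`search()` shape as the package's `BM25Index`, not its real scoring):

```typescript
// Stand-in index: scores a doc by how many query terms it contains.
class MiniBM25 {
  private docs: string[] = []
  build(texts: string[]): void { this.docs = texts }
  search(query: string, limit: number): Array<{ id: number; score: number }> {
    const terms = query.toLowerCase().split(/\s+/)
    return this.docs
      .map((d, id) => ({ id, score: terms.filter(t => d.toLowerCase().includes(t)).length }))
      .filter(r => r.score > 0)
      .sort((a, b) => b.score - a.score)
      .slice(0, limit)
  }
}

const rows = [
  { file: "a.ts", chunk_index: 0, content: "parse config file" },
  { file: "b.ts", chunk_index: 0, content: "open sqlite database" },
]
const bm25 = new MiniBM25()
bm25.build(rows.map(r => r.content)) // doc id i ↔ rows[i]: order must be stable
for (const hit of bm25.search("sqlite database", 5)) {
  console.log(rows[hit.id].file, hit.score) // "b.ts" 2
}
```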
@@ -761,15 +979,170 @@ class CodebaseIndexer {
     return this.bm25;
   }

-  // ──
+  // ── Shared helpers for search paths ─────────────────────────────────────────
+
+  _applyMetadataFilters(results, includeArchived, options) {
+    if (!includeArchived) {
+      results = results.filter((r) => !r.archived);
+    }
+    if (options.fileType) {
+      results = results.filter((r) => r.file_type === options.fileType);
+    }
+    if (options.language) {
+      results = results.filter((r) => r.language === options.language);
+    }
+    if (options.modifiedAfter) {
+      const after = new Date(options.modifiedAfter).getTime();
+      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
+    }
+    if (options.modifiedBefore) {
+      const before = new Date(options.modifiedBefore).getTime();
+      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
+    }
+    if (options.tags && options.tags.length > 0) {
+      results = results.filter((r) => {
+        const rowTags = (r.tags || "").split(",").filter(Boolean);
+        return options.tags.some((t) => rowTags.includes(t));
+      });
+    }
+    return results;
+  }
+
+  async _expandGraphContext(finalResults, queryEmbedding, query) {
+    if (!this.graphDB) return;
+
+    for (const result of finalResults) {
+      if (!result.chunk_id) continue;
+
+      const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
+      const incoming = await this.graphDB.getIncoming(result.chunk_id);
+      const allEdges = [...outgoing, ...incoming].filter(
+        e => e.predicate !== "belongs_to" && e.predicate !== "graph_built" && !isStructuralPredicate(e.predicate)
+      );
+
+      const neighbors = [];
+      for (const edge of allEdges) {
+        const neighborId = edge.subject === result.chunk_id ? edge.object : edge.subject;
+        const neighborChunk = await this.findChunkById(neighborId);
+        if (!neighborChunk) continue;
+
+        let score;
+        if (queryEmbedding && neighborChunk.vector) {
+          const similarity = this.cosineSimilarity(neighborChunk.vector, queryEmbedding);
+          score = edge.weight * similarity;
+        } else {
+          // No vectors — use edge weight only (BM25-only fallback)
+          score = edge.weight * 0.7; // dampen without cosine confirmation
+        }
+
+        neighbors.push({
+          chunk_id: neighborId,
+          file: neighborChunk.file,
+          content: neighborChunk.content,
+          relation: edge.predicate,
+          score,
+          via: edge.source
+        });
+      }
+
+      neighbors.sort((a, b) => b.score - a.score);
+      const minRelevance = GRAPH_CONFIG.min_relevance ?? 0.5;
+      const maxRelated = GRAPH_CONFIG.max_related ?? 4;
+      result.relatedContext = neighbors
+        .filter(n => n.score >= minRelevance)
+        .slice(0, maxRelated);
+
+      if (this.usageTracker) {
+        for (const n of result.relatedContext) {
+          this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
+        }
+      }
+    }
+
+    // Record usage counts for all returned chunks
+    if (this.usageTracker) {
+      const allChunkIds = [];
+      for (const r of finalResults) {
+        if (r.chunk_id) allChunkIds.push(r.chunk_id);
+        if (r.relatedContext) {
+          for (const rc of r.relatedContext) {
+            if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
+          }
+        }
+      }
+      this.usageTracker.recordSearchResults(allChunkIds);
+      this.usageTracker.save().catch(() => {});
+    }
+  }
+
+  // ── Search (v3: hybrid + BM25-only fallback + metadata filters + metrics) ──

   async search(query, limit = 5, includeArchived = false, options = {}) {
     const tableName = "chunks";
     const tables = await this.db.tableNames();
-
-
+
+    const indexConfig = INDEX_PRESETS[this.indexName];
+    const indexHybridEnabled = indexConfig?.hybrid ?? false;
+    const indexBM25Weight = indexConfig?.bm25_weight ?? HYBRID_CONFIG.bm25_weight;
+    const isHybrid = indexHybridEnabled || options.hybrid;
+
+    // ── Detect if vectors are available ──────────────────────────────────────
+    const hasVectorTable = tables.includes(tableName);
+    let hasVectors = false;
+    if (hasVectorTable) {
+      try {
+        const table = await this.db.openTable(tableName);
+        const rowCount = await table.countRows();
+        hasVectors = rowCount > 0;
+      } catch {
+        hasVectors = false;
+      }
     }

+    // ── BM25-only fallback (Phase 1 complete, Phase 2 not yet) ──────────────
+    if (!hasVectors) {
+      const bm25 = await this.ensureBM25();
+      if (!bm25 || !this._bm25Rows) {
+        // No vectors AND no chunks — nothing indexed yet
+        return [];
+      }
+
+      if (DEBUG) console.log("[vectorizer] BM25-only search (no vectors yet)");
+
+      const fetchLimit = Math.max(limit * 3, 50);
+      const bm25Results = bm25.search(query, fetchLimit);
+
+      // Normalize BM25 scores to [0, 1]
+      let maxBM25 = 0;
+      for (const r of bm25Results) {
+        if (r.score > maxBM25) maxBM25 = r.score;
+      }
+
+      let results = [];
+      for (const br of bm25Results) {
+        if (br.id < this._bm25Rows.length) {
+          const row = this._bm25Rows[br.id];
+          const normScore = maxBM25 > 0 ? br.score / maxBM25 : 0;
+          results.push({
+            ...row,
+            _combinedScore: normScore,
+            _distance: null, // no vector distance available
+            _bm25Only: true,
+          });
+        }
+      }
+
+      // Apply metadata filters then return (graph context added below)
+      results = this._applyMetadataFilters(results, includeArchived, options);
+      const finalResults = results.slice(0, limit);
+
+      // Graph context expansion (same as vector path)
+      await this._expandGraphContext(finalResults, null, query);
+
+      return finalResults;
+    }
+
+    // ── Vector search (Phase 2 complete) ─────────────────────────────────────
     const queryEmbedding = await this.embedQuery(query);
     const table = await this.db.openTable(tableName);

@@ -777,10 +1150,6 @@ class CodebaseIndexer {
     const hasFilters = !includeArchived || options.fileType || options.language ||
                        options.modifiedAfter || options.modifiedBefore ||
                        (options.tags && options.tags.length > 0);
-    const indexConfig = INDEX_PRESETS[this.indexName];
-    const indexHybridEnabled = indexConfig?.hybrid ?? false;
-    const indexBM25Weight = indexConfig?.bm25_weight ?? HYBRID_CONFIG.bm25_weight;
-    const isHybrid = indexHybridEnabled || options.hybrid;
     const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit *3, 50) : limit;
     let results;
     try {
@@ -864,35 +1233,7 @@ class CodebaseIndexer {
     }

     // ── Metadata filters ──────────────────────────────────────────────────
-    if (!includeArchived) {
-      results = results.filter((r) => !r.archived);
-    }
-
-    if (options.fileType) {
-      results = results.filter((r) => r.file_type === options.fileType);
-    }
-
-    if (options.language) {
-      results = results.filter((r) => r.language === options.language);
-    }
-
-    if (options.modifiedAfter) {
-      const after = new Date(options.modifiedAfter).getTime();
-      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
-    }
-
-    if (options.modifiedBefore) {
-      const before = new Date(options.modifiedBefore).getTime();
-      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
-    }
-
-    if (options.tags && options.tags.length > 0) {
-      results = results.filter((r) => {
-        const rowTags = (r.tags || "").split(",").filter(Boolean);
-        return options.tags.some((t) => rowTags.includes(t));
-      });
-    }
-
+    results = this._applyMetadataFilters(results, includeArchived, options);
     const finalResults = results.slice(0, limit);

     // ── Metrics tracking ────────────────────────────────────────────────────
@@ -917,68 +1258,7 @@ class CodebaseIndexer {
     }

     // ── Graph context expansion (v3) ───────────────────────────────────────
-    if (this.graphDB) {
-      for (const result of finalResults) {
-        if (!result.chunk_id) continue;
-
-        const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
-        const incoming = await this.graphDB.getIncoming(result.chunk_id);
-        // Filter out structural and meta edges — only relation edges are useful for context
-        const allEdges = [...outgoing, ...incoming].filter(
-          e => e.predicate !== "belongs_to" && e.predicate !== "graph_built" && !isStructuralPredicate(e.predicate)
-        );
-
-        const neighbors = [];
-        for (const edge of allEdges) {
-          const neighborId = edge.subject === result.chunk_id ? edge.object : edge.subject;
-          const neighborChunk = await this.findChunkById(neighborId);
-          if (!neighborChunk) continue;
-
-          const similarity = this.cosineSimilarity(neighborChunk.vector, queryEmbedding);
-          const score = edge.weight * similarity;
-
-          neighbors.push({
-            chunk_id: neighborId,
-            file: neighborChunk.file,
-            content: neighborChunk.content,
-            relation: edge.predicate,
-            score,
-            via: edge.source
-          });
-        }
-
-        // Apply min_relevance filter, then cap at max_related
-        neighbors.sort((a, b) => b.score - a.score);
-        const minRelevance = GRAPH_CONFIG.min_relevance ?? 0.5;
-        const maxRelated = GRAPH_CONFIG.max_related ?? 4;
-        result.relatedContext = neighbors
-          .filter(n => n.score >= minRelevance)
-          .slice(0, maxRelated);
-
-        // FR-060: Record provenance for each attached chunk
-        if (this.usageTracker) {
-          for (const n of result.relatedContext) {
-            this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
-          }
-        }
-      }
-    }
-
-    // FR-061: Record usage counts for all returned chunks (main + attached)
-    if (this.usageTracker) {
-      const allChunkIds = [];
-      for (const r of finalResults) {
-        if (r.chunk_id) allChunkIds.push(r.chunk_id);
-        if (r.relatedContext) {
-          for (const rc of r.relatedContext) {
-            if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
-          }
-        }
-      }
-      this.usageTracker.recordSearchResults(allChunkIds);
-      // Save asynchronously (non-blocking)
-      this.usageTracker.save().catch(() => {});
-    }
+    await this._expandGraphContext(finalResults, queryEmbedding, query);

     return finalResults;
   }
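In the BM25-only fallback above, raw BM25 scores are unbounded and corpus-dependent, so they are normalized against the top hit before being exposed as `_combinedScore`; the `_bm25Only: true` flag is what later drives the banner and the `bm25:`-only breakdown in tools/search.ts. A small worked example with invented numbers:

```typescript
// Hypothetical raw BM25 hits (scores are unbounded):
const hits = [
  { id: 0, score: 7.4 },
  { id: 1, score: 3.7 },
]

// Normalize to [0, 1] by the maximum score, as the fallback path does:
const maxScore = Math.max(...hits.map(h => h.score), 0)
const results = hits.map(h => ({
  id: h.id,
  _combinedScore: maxScore > 0 ? h.score / maxScore : 0, // 1.0 and 0.5 here
  _distance: null,   // no vector distance in BM25-only mode
  _bm25Only: true,   // downstream: banner + "bm25:" score breakdown
}))
console.log(results)
```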
@@ -987,22 +1267,39 @@ class CodebaseIndexer {
     // Lazy-build an in-memory Map keyed by chunk_id on first call.
     // The cache lives until unloadModel() clears it.
     if (!this._chunkCache) {
+      this._chunkCache = new Map();
+
+      // Primary: LanceDB (has vectors for cosine similarity in graph expansion)
       const tableName = "chunks";
       const tables = await this.db.tableNames();
-      if (
-
-
-
-
-
-
-
-
+      if (tables.includes(tableName)) {
+        try {
+          const table = await this.db.openTable(tableName);
+          const rows = await table.filter("true").limit(100000).execute();
+          for (const row of rows) {
+            if (row.chunk_id) {
+              this._chunkCache.set(row.chunk_id, row);
+            }
+          }
+        } catch (e) {
+          if (DEBUG) console.log("[vectorizer] Chunk cache from LanceDB failed:", e.message);
+        }
       }
-
-
-
-
+
+      // Fallback: ChunkStore (no vectors, but has content for BM25-only mode)
+      if (this._chunkCache.size === 0 && this.chunkStore) {
+        try {
+          const allChunks = this.chunkStore.getAllChunks();
+          for (const chunk of allChunks) {
+            if (chunk.chunk_id) {
+              this._chunkCache.set(chunk.chunk_id, chunk);
+            }
+          }
+          if (DEBUG && allChunks.length > 0) {
+            console.log(`[vectorizer] Chunk cache from ChunkStore (${allChunks.length} chunks, no vectors)`);
+          }
+        } catch (e) {
+          if (DEBUG) console.log("[vectorizer] Chunk cache from ChunkStore failed:", e.message);
         }
       }
     }
@@ -1094,6 +1391,9 @@ class CodebaseIndexer {
         // best effort
       }
     }
+    if (this.chunkStore) {
+      try { this.chunkStore.deleteByFile(relPath); } catch { /* best effort */ }
+    }
     delete this.hashes[relPath];
     deleted++;
   }
@@ -1156,6 +1456,11 @@ class CodebaseIndexer {
       }
     }

+    // Delete chunks from ChunkStore
+    if (this.chunkStore) {
+      try { this.chunkStore.deleteByFile(relPath); } catch { /* best effort */ }
+    }
+
     delete this.hashes[relPath];
     removed++;
   }
@@ -1170,31 +1475,64 @@ class CodebaseIndexer {
     }
   }

-    let indexed = 0;
-    let skipped = 0;
     const total = files.length;
+    const CONCURRENCY = 5;

-
-
-
-
-
-
-
-
-
+    // ══════════════════════════════════════════════════════════════════════════
+    // Phase 1: Prepare files in parallel (chunk + graph, no embedding)
+    // ══════════════════════════════════════════════════════════════════════════
+    const preparedFiles = [];
+    let prepared = 0;
+    let skipped = 0;
+
+    // Process in batches of CONCURRENCY
+    for (let i = 0; i < files.length; i += CONCURRENCY) {
+      const batch = files.slice(i, i + CONCURRENCY);
+      const promises = batch.map(async (relPath) => {
+        const filePath = path.join(this.root, relPath);
+        try {
+          const result = await this.prepareFile(filePath);
+          return result;
+        } catch {
+          return null;
+        }
+      });
+
+      const results = await Promise.all(promises);
+      for (let j = 0; j < results.length; j++) {
+        if (results[j]) {
+          preparedFiles.push(results[j]);
+          prepared++;
+          if (onProgress) onProgress(prepared, total, results[j].relPath, i + j + 1, "prepare");
         } else {
           skipped++;
         }
-      } catch {
-        skipped++;
       }
     }

+    if (DEBUG) console.log(`[vectorizer] Phase 1 done: ${prepared} files prepared, ${skipped} skipped`);
+
+    // ══════════════════════════════════════════════════════════════════════════
+    // Phase 2: Batch embed + store (sequential, batch forward pass)
+    // ══════════════════════════════════════════════════════════════════════════
+    let chunksEmbedded = 0;
+    if (preparedFiles.length > 0) {
+      const totalChunks = preparedFiles.reduce((sum, pf) => sum + pf.rows.length, 0);
+      if (DEBUG) console.log(`[vectorizer] Phase 2: embedding ${totalChunks} chunks from ${preparedFiles.length} files`);
+
+      chunksEmbedded = await this.embedAndStore(preparedFiles, 32, (done, embedTotal, phase) => {
+        if (onProgress) onProgress(done, embedTotal, `embedding`, done, "embed");
+      });
+
+      if (DEBUG) console.log(`[vectorizer] Phase 2 done: ${chunksEmbedded} chunks embedded and stored`);
+    }
+
+    const indexed = prepared; // file count for backward compat
+
     // FR-005: Build semantic similarity edges as post-pass
     // Disabled by default (O(n²) — slow on large repos). Enable via graph.semantic_edges: true
     let semanticEdges = 0;
-    if (
+    if (chunksEmbedded > 0 && this.graphBuilder && this.graphDB && GRAPH_CONFIG.semantic_edges) {
       try {
         const tableName = "chunks";
         const tables = await this.db.tableNames();
@@ -1237,23 +1575,34 @@ class CodebaseIndexer {

   async getStats() {
     const fileCount = Object.keys(this.hashes).length;
-    let chunkCount = 0;
+    let vectorChunkCount = 0;
+    let totalChunkCount = 0;
+    let hasUnvectorized = false;

     try {
       const tables = await this.db.tableNames();
       if (tables.includes("chunks")) {
         const table = await this.db.openTable("chunks");
-        chunkCount = await table.countRows();
+        vectorChunkCount = await table.countRows();
       }
     } catch {}

+    if (this.chunkStore) {
+      try {
+        totalChunkCount = this.chunkStore.count();
+        hasUnvectorized = this.chunkStore.hasUnvectorizedChunks();
+      } catch {}
+    }
+
    const preset = INDEX_PRESETS[this.indexName];
    return {
      indexName: this.indexName,
      description: preset?.description || "Custom index",
      model: EMBEDDING_MODEL,
      fileCount,
-      chunkCount,
+      chunkCount: totalChunkCount || vectorChunkCount,
+      vectorizedChunks: vectorChunkCount,
+      pendingEmbedding: hasUnvectorized,
      features: {
        chunking: CHUNKING_CONFIG.strategy,
        hybrid: preset?.hybrid ?? false,
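With the new `getStats()` fields, a caller can tell a half-indexed repo (Phase 1 done, embeddings pending) apart from a fully embedded one. An illustrative snapshot while Phase 2 is still running — all values invented, and `model` comes from the package's `EMBEDDING_MODEL` constant:

```typescript
// Hypothetical getStats() result mid-way through embedding:
const stats = {
  indexName: "code",
  description: "Custom index",
  model: "<EMBEDDING_MODEL>",  // whatever the package's constant resolves to
  fileCount: 120,
  chunkCount: 980,        // ChunkStore total, falling back to the LanceDB count
  vectorizedChunks: 410,  // rows that already have embeddings in LanceDB
  pendingEmbedding: true, // ChunkStore still holds rows with vectorized = 0
  // features: { chunking, hybrid, ... }
}

if (stats.pendingEmbedding) {
  console.log("BM25-only mode — search works now, quality improves after embedding")
}
```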