@comfanion/usethis_search 3.0.0-dev.26 → 3.0.0-dev.27

Files changed (2)
  1. package/package.json +1 -1
  2. package/vectorizer/index.ts +189 -15
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "3.0.0-dev.26",
+  "version": "3.0.0-dev.27",
   "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
   "type": "module",
   "main": "./index.ts",
package/vectorizer/index.ts CHANGED
@@ -612,7 +612,148 @@ class CodebaseIndexer {
     return this.hashes[relPath] !== currentHash;
   }
 
-  // ── Index a single file (v2: cleaning + semantic chunking + metadata) ─────
+  // ── Phase 1: Prepare file (chunk + graph, NO embedding) ─────────────────
+  // Returns prepared chunk data ready for embedding, or null if skipped.
+
+  async prepareFile(filePath) {
+    const relPath = path.relative(this.root, filePath);
+
+    let content;
+    try {
+      content = await fs.readFile(filePath, "utf8");
+    } catch {
+      return null;
+    }
+
+    const hash = this.fileHash(content);
+    if (this.hashes[relPath] === hash) {
+      return null; // unchanged
+    }
+
+    // Extract metadata
+    const fileMeta = await extractFileMetadata(filePath, content);
+    const archived = this.isArchived(relPath, content);
+
+    // Clean content before chunking
+    const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
+
+    // Semantic chunking
+    const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
+
+    // Assign chunk IDs
+    const chunksWithIds = this.graphBuilder
+      ? this.graphBuilder.assignChunkIds(relPath, chunks)
+      : chunks.map((c, i) => ({ ...c, chunk_id: `chunk:${relPath}::_chunk_${i}` }));
+
+    // Build graph edges (Phase 1 — no embedding needed)
+    if (this.graphBuilder && this.graphDB) {
+      await this.graphDB.deleteByFile(relPath);
+      const edgesBuilt = await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
+
+      if (edgesBuilt > 0 || DEBUG) {
+        const timestamp = new Date().toISOString().slice(11, 19);
+        const logMsg = `${timestamp} Graph built: ${relPath} (${chunksWithIds.length} chunks)`;
+        if (DEBUG) console.log(`[vectorizer] ${logMsg}`);
+        try {
+          const logPath = path.join(this.root, ".opencode", "indexer.log");
+          const fsSync = await import("fs");
+          fsSync.appendFileSync(logPath, `${logMsg}\n`);
+        } catch { /* non-fatal */ }
+      }
+
+      try {
+        await this.graphDB.setFileMeta(relPath, hash, Date.now());
+      } catch { /* non-fatal */ }
+    }
+
+    // Return prepared rows (without vector — Phase 2 fills it)
+    const rows = chunksWithIds.map((chunk, i) => ({
+      chunk_id: chunk.chunk_id,
+      file: relPath,
+      chunk_index: i,
+      content: chunk.content,
+      archived,
+      file_type: fileMeta.file_type,
+      language: fileMeta.language,
+      last_modified: fileMeta.last_modified,
+      file_size: fileMeta.file_size,
+      heading_context: chunk.heading_context || "",
+      function_name: chunk.function_name || "",
+      class_name: chunk.class_name || "",
+      tags: (fileMeta.tags || []).join(","),
+      start_line: chunk.start_line ?? -1,
+      end_line: chunk.end_line ?? -1,
+    }));
+
+    return { relPath, hash, rows };
+  }
+
+  // ── Phase 2: Batch embed + store ──────────────────────────────────────────
+  // Takes prepared rows from prepareFile(), embeds in batches, stores in LanceDB.
+
+  async embedAndStore(preparedFiles, batchSize = 32, onProgress = null) {
+    if (preparedFiles.length === 0) return 0;
+
+    // Collect all rows with their content for batch embedding
+    const allRows = [];
+    for (const pf of preparedFiles) {
+      for (const row of pf.rows) {
+        allRows.push(row);
+      }
+    }
+
+    if (allRows.length === 0) return 0;
+
+    // Load model once
+    const model = await this.loadModel();
+
+    // Batch embed
+    const allData = [];
+    for (let i = 0; i < allRows.length; i += batchSize) {
+      const batch = allRows.slice(i, i + batchSize);
+      const texts = batch.map(r => r.content);
+
+      // Embed each text in the batch (sequential forward passes)
+      const embeddings = [];
+      for (const text of texts) {
+        const result = await model(text, { pooling: "mean", normalize: true });
+        embeddings.push(Array.from(result.data));
+      }
+
+      for (let j = 0; j < batch.length; j++) {
+        allData.push({ ...batch[j], vector: embeddings[j] });
+      }
+
+      if (onProgress) {
+        onProgress(Math.min(i + batchSize, allRows.length), allRows.length, "embedding");
+      }
+    }
+
+    // Bulk store in LanceDB
+    const tableName = "chunks";
+    const tables = await this.db.tableNames();
+    if (tables.includes(tableName)) {
+      const table = await this.db.openTable(tableName);
+      await table.add(allData);
+    } else {
+      await this.db.createTable(tableName, allData);
+    }
+
+    // Update hashes for all prepared files
+    for (const pf of preparedFiles) {
+      this.hashes[pf.relPath] = pf.hash;
+    }
+    await this.saveHashes();
+
+    // Invalidate caches
+    if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
+    this._bm25Rows = null;
+    this._chunkCache = null;
+
+    return allData.length;
+  }
+
+  // ── Index a single file (legacy — used by freshen/on-change) ───────────
 
   async indexFile(filePath) {
     const relPath = path.relative(this.root, filePath);
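
Taken together, the two new methods split indexing into a cheap, parallelizable prepare pass (read, hash, chunk, build graph edges) and a model-bound embed pass that loads the model once and bulk-writes to LanceDB. A minimal sketch of how they compose — `indexer` (a constructed `CodebaseIndexer`) and `files` (repo-relative paths) are assumed here, not part of the diff; the method names come from the diff above:

```js
// Sketch only — `indexer` and `files` are assumed inputs.
const prepared = [];
for (const relPath of files) {
  // prepareFile() returns null for unreadable or unchanged files
  const result = await indexer.prepareFile(path.join(indexer.root, relPath));
  if (result) prepared.push(result);
}

// Embeds every prepared chunk in batches of 32 and bulk-writes the
// vectors to the "chunks" LanceDB table; returns the chunk count.
const stored = await indexer.embedAndStore(prepared, 32, (done, total) => {
  console.log(`embedded ${done}/${total} chunks`);
});
```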
@@ -1170,31 +1311,64 @@ class CodebaseIndexer {
       }
     }
 
-    let indexed = 0;
-    let skipped = 0;
     const total = files.length;
+    const CONCURRENCY = 5;
 
-    for (let i = 0; i < files.length; i++) {
-      const relPath = files[i];
-      const filePath = path.join(this.root, relPath);
-      try {
-        const wasIndexed = await this.indexFile(filePath);
-        if (wasIndexed) {
-          indexed++;
-          // FR-053: progress indicator includes graph building phase
-          if (onProgress) onProgress(indexed, total, relPath, i + 1);
+    // ══════════════════════════════════════════════════════════════════════════
+    // Phase 1: Prepare files in parallel (chunk + graph, no embedding)
+    // ══════════════════════════════════════════════════════════════════════════
+    const preparedFiles = [];
+    let prepared = 0;
+    let skipped = 0;
+
+    // Process in batches of CONCURRENCY
+    for (let i = 0; i < files.length; i += CONCURRENCY) {
+      const batch = files.slice(i, i + CONCURRENCY);
+      const promises = batch.map(async (relPath) => {
+        const filePath = path.join(this.root, relPath);
+        try {
+          const result = await this.prepareFile(filePath);
+          return result;
+        } catch {
+          return null;
+        }
+      });
+
+      const results = await Promise.all(promises);
+      for (let j = 0; j < results.length; j++) {
+        if (results[j]) {
+          preparedFiles.push(results[j]);
+          prepared++;
+          if (onProgress) onProgress(prepared, total, results[j].relPath, i + j + 1, "prepare");
         } else {
           skipped++;
         }
-      } catch {
-        skipped++;
       }
     }
 
+    if (DEBUG) console.log(`[vectorizer] Phase 1 done: ${prepared} files prepared, ${skipped} skipped`);
+
+    // ══════════════════════════════════════════════════════════════════════════
+    // Phase 2: Batch embed + store (sequential, batch forward pass)
+    // ══════════════════════════════════════════════════════════════════════════
+    let chunksEmbedded = 0;
+    if (preparedFiles.length > 0) {
+      const totalChunks = preparedFiles.reduce((sum, pf) => sum + pf.rows.length, 0);
+      if (DEBUG) console.log(`[vectorizer] Phase 2: embedding ${totalChunks} chunks from ${preparedFiles.length} files`);
+
+      chunksEmbedded = await this.embedAndStore(preparedFiles, 32, (done, embedTotal, phase) => {
+        if (onProgress) onProgress(done, embedTotal, `embedding`, done, "embed");
+      });
+
+      if (DEBUG) console.log(`[vectorizer] Phase 2 done: ${chunksEmbedded} chunks embedded and stored`);
+    }
+
+    const indexed = prepared; // file count for backward compat
+
     // FR-005: Build semantic similarity edges as post-pass
     // Disabled by default (O(n²) — slow on large repos). Enable via graph.semantic_edges: true
     let semanticEdges = 0;
-    if (indexed > 0 && this.graphBuilder && this.graphDB && GRAPH_CONFIG.semantic_edges) {
+    if (chunksEmbedded > 0 && this.graphBuilder && this.graphDB && GRAPH_CONFIG.semantic_edges) {
       try {
         const tableName = "chunks";
         const tables = await this.db.tableNames();
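
One detail worth noting in this hunk: `onProgress` now receives a fifth argument naming the phase (`"prepare"` or `"embed"`), and during Phase 2 the count is in chunks rather than files. A sketch of a callback handling both shapes, with a fallback in case any call site omits the phase (an assumption, not confirmed by the diff):

```js
// Progress callback matching the calls shown in the diff:
// onProgress(count, total, label, position, phase)
function onProgress(count, total, label, position, phase) {
  if (phase === "prepare") {
    console.log(`[prepare] file ${count}/${total}: ${label}`);
  } else if (phase === "embed") {
    console.log(`[embed] chunk ${count}/${total}`);
  } else {
    console.log(`${count}/${total} ${label}`); // phase omitted
  }
}
```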