@comfanion/usethis_search 3.0.0-dev.26 → 3.0.0-dev.27

Files changed (2)
  1. package/package.json +1 -1
  2. package/vectorizer/index.ts +189 -15
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "3.0.0-dev.26",
+  "version": "3.0.0-dev.27",
   "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
   "type": "module",
   "main": "./index.ts",
package/vectorizer/index.ts CHANGED
@@ -612,7 +612,148 @@ class CodebaseIndexer {
     return this.hashes[relPath] !== currentHash;
   }
 
-  // ── Index a single file (v2: cleaning + semantic chunking + metadata) ─────
+  // ── Phase 1: Prepare file (chunk + graph, NO embedding) ─────────────────
+  // Returns prepared chunk data ready for embedding, or null if skipped.
+
+  async prepareFile(filePath) {
+    const relPath = path.relative(this.root, filePath);
+
+    let content;
+    try {
+      content = await fs.readFile(filePath, "utf8");
+    } catch {
+      return null;
+    }
+
+    const hash = this.fileHash(content);
+    if (this.hashes[relPath] === hash) {
+      return null; // unchanged
+    }
+
+    // Extract metadata
+    const fileMeta = await extractFileMetadata(filePath, content);
+    const archived = this.isArchived(relPath, content);
+
+    // Clean content before chunking
+    const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
+
+    // Semantic chunking
+    const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
+
+    // Assign chunk IDs
+    const chunksWithIds = this.graphBuilder
+      ? this.graphBuilder.assignChunkIds(relPath, chunks)
+      : chunks.map((c, i) => ({ ...c, chunk_id: `chunk:${relPath}::_chunk_${i}` }));
+
+    // Build graph edges (Phase 1 — no embedding needed)
+    if (this.graphBuilder && this.graphDB) {
+      await this.graphDB.deleteByFile(relPath);
+      const edgesBuilt = await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
+
+      if (edgesBuilt > 0 || DEBUG) {
+        const timestamp = new Date().toISOString().slice(11, 19);
+        const logMsg = `${timestamp} Graph built: ${relPath} (${chunksWithIds.length} chunks)`;
+        if (DEBUG) console.log(`[vectorizer] ${logMsg}`);
+        try {
+          const logPath = path.join(this.root, ".opencode", "indexer.log");
+          const fsSync = await import("fs");
+          fsSync.appendFileSync(logPath, `${logMsg}\n`);
+        } catch { /* non-fatal */ }
+      }
+
+      try {
+        await this.graphDB.setFileMeta(relPath, hash, Date.now());
+      } catch { /* non-fatal */ }
+    }
+
+    // Return prepared rows (without vector — Phase 2 fills it)
+    const rows = chunksWithIds.map((chunk, i) => ({
+      chunk_id: chunk.chunk_id,
+      file: relPath,
+      chunk_index: i,
+      content: chunk.content,
+      archived,
+      file_type: fileMeta.file_type,
+      language: fileMeta.language,
+      last_modified: fileMeta.last_modified,
+      file_size: fileMeta.file_size,
+      heading_context: chunk.heading_context || "",
+      function_name: chunk.function_name || "",
+      class_name: chunk.class_name || "",
+      tags: (fileMeta.tags || []).join(","),
+      start_line: chunk.start_line ?? -1,
+      end_line: chunk.end_line ?? -1,
+    }));
+
+    return { relPath, hash, rows };
+  }
+
+  // ── Phase 2: Batch embed + store ──────────────────────────────────────────
+  // Takes prepared rows from prepareFile(), embeds in batches, stores in LanceDB.
+
+  async embedAndStore(preparedFiles, batchSize = 32, onProgress = null) {
+    if (preparedFiles.length === 0) return 0;
+
+    // Collect all rows with their content for batch embedding
+    const allRows = [];
+    for (const pf of preparedFiles) {
+      for (const row of pf.rows) {
+        allRows.push(row);
+      }
+    }
+
+    if (allRows.length === 0) return 0;
+
+    // Load model once
+    const model = await this.loadModel();
+
+    // Batch embed
+    const allData = [];
+    for (let i = 0; i < allRows.length; i += batchSize) {
+      const batch = allRows.slice(i, i + batchSize);
+      const texts = batch.map(r => r.content);
+
+      // Embed each text in the batch (sequential forward passes)
+      const embeddings = [];
+      for (const text of texts) {
+        const result = await model(text, { pooling: "mean", normalize: true });
+        embeddings.push(Array.from(result.data));
+      }
+
+      for (let j = 0; j < batch.length; j++) {
+        allData.push({ ...batch[j], vector: embeddings[j] });
+      }
+
+      if (onProgress) {
+        onProgress(Math.min(i + batchSize, allRows.length), allRows.length, "embedding");
+      }
+    }
+
+    // Bulk store in LanceDB
+    const tableName = "chunks";
+    const tables = await this.db.tableNames();
+    if (tables.includes(tableName)) {
+      const table = await this.db.openTable(tableName);
+      await table.add(allData);
+    } else {
+      await this.db.createTable(tableName, allData);
+    }
+
+    // Update hashes for all prepared files
+    for (const pf of preparedFiles) {
+      this.hashes[pf.relPath] = pf.hash;
+    }
+    await this.saveHashes();
+
+    // Invalidate caches
+    if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
+    this._bm25Rows = null;
+    this._chunkCache = null;
+
+    return allData.length;
+  }
+
+  // ── Index a single file (legacy — used by freshen/on-change) ───────────
 
   async indexFile(filePath) {
     const relPath = path.relative(this.root, filePath);
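
Taken together, the two new methods split indexing into a cheap, parallelizable prepare pass (read, hash, chunk, build graph edges) and a model-bound embed pass that loads the model once and bulk-writes to LanceDB. A minimal sketch of how they compose — `indexer` (a constructed `CodebaseIndexer`) and `files` (repo-relative paths) are assumed here, not part of the diff; the method names come from the diff above:

```js
// Sketch only — `indexer` and `files` are assumed inputs.
const prepared = [];
for (const relPath of files) {
  // prepareFile() returns null for unreadable or unchanged files
  const result = await indexer.prepareFile(path.join(indexer.root, relPath));
  if (result) prepared.push(result);
}

// Embeds every prepared chunk in batches of 32 and bulk-writes the
// vectors to the "chunks" LanceDB table; returns the chunk count.
const stored = await indexer.embedAndStore(prepared, 32, (done, total) => {
  console.log(`embedded ${done}/${total} chunks`);
});
```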
@@ -1170,31 +1311,64 @@ class CodebaseIndexer {
       }
     }
 
-    let indexed = 0;
-    let skipped = 0;
     const total = files.length;
+    const CONCURRENCY = 5;
 
-    for (let i = 0; i < files.length; i++) {
-      const relPath = files[i];
-      const filePath = path.join(this.root, relPath);
-      try {
-        const wasIndexed = await this.indexFile(filePath);
-        if (wasIndexed) {
-          indexed++;
-          // FR-053: progress indicator includes graph building phase
-          if (onProgress) onProgress(indexed, total, relPath, i + 1);
+    // ══════════════════════════════════════════════════════════════════════════
+    // Phase 1: Prepare files in parallel (chunk + graph, no embedding)
+    // ══════════════════════════════════════════════════════════════════════════
+    const preparedFiles = [];
+    let prepared = 0;
+    let skipped = 0;
+
+    // Process in batches of CONCURRENCY
+    for (let i = 0; i < files.length; i += CONCURRENCY) {
+      const batch = files.slice(i, i + CONCURRENCY);
+      const promises = batch.map(async (relPath) => {
+        const filePath = path.join(this.root, relPath);
+        try {
+          const result = await this.prepareFile(filePath);
+          return result;
+        } catch {
+          return null;
+        }
+      });
+
+      const results = await Promise.all(promises);
+      for (let j = 0; j < results.length; j++) {
+        if (results[j]) {
+          preparedFiles.push(results[j]);
+          prepared++;
+          if (onProgress) onProgress(prepared, total, results[j].relPath, i + j + 1, "prepare");
         } else {
           skipped++;
         }
-      } catch {
-        skipped++;
       }
     }
 
+    if (DEBUG) console.log(`[vectorizer] Phase 1 done: ${prepared} files prepared, ${skipped} skipped`);
+
+    // ══════════════════════════════════════════════════════════════════════════
+    // Phase 2: Batch embed + store (sequential, batch forward pass)
+    // ══════════════════════════════════════════════════════════════════════════
+    let chunksEmbedded = 0;
+    if (preparedFiles.length > 0) {
+      const totalChunks = preparedFiles.reduce((sum, pf) => sum + pf.rows.length, 0);
+      if (DEBUG) console.log(`[vectorizer] Phase 2: embedding ${totalChunks} chunks from ${preparedFiles.length} files`);
+
+      chunksEmbedded = await this.embedAndStore(preparedFiles, 32, (done, embedTotal, phase) => {
+        if (onProgress) onProgress(done, embedTotal, `embedding`, done, "embed");
+      });
+
+      if (DEBUG) console.log(`[vectorizer] Phase 2 done: ${chunksEmbedded} chunks embedded and stored`);
+    }
+
+    const indexed = prepared; // file count for backward compat
+
     // FR-005: Build semantic similarity edges as post-pass
     // Disabled by default (O(n²) — slow on large repos). Enable via graph.semantic_edges: true
     let semanticEdges = 0;
-    if (indexed > 0 && this.graphBuilder && this.graphDB && GRAPH_CONFIG.semantic_edges) {
+    if (chunksEmbedded > 0 && this.graphBuilder && this.graphDB && GRAPH_CONFIG.semantic_edges) {
       try {
         const tableName = "chunks";
         const tables = await this.db.tableNames();
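
One detail worth noting in this hunk: `onProgress` now receives a fifth argument naming the phase (`"prepare"` or `"embed"`), and during Phase 2 the count is in chunks rather than files. A sketch of a callback handling both shapes, with a fallback in case any call site omits the phase (an assumption, not confirmed by the diff):

```js
// Progress callback matching the calls shown in the diff:
// onProgress(count, total, label, position, phase)
function onProgress(count, total, label, position, phase) {
  if (phase === "prepare") {
    console.log(`[prepare] file ${count}/${total}: ${label}`);
  } else if (phase === "embed") {
    console.log(`[embed] chunk ${count}/${total}`);
  } else {
    console.log(`${count}/${total} ${label}`); // phase omitted
  }
}
```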