@comfanion/usethis_search 3.0.0-dev.26 → 3.0.0-dev.27
- package/package.json +1 -1
- package/vectorizer/index.ts +189 -15
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "3.0.0-dev.26",
+  "version": "3.0.0-dev.27",
   "description": "OpenCode plugin: semantic search with graph-based context (v3: graph relations, 1-hop context, LSP + regex analyzers)",
   "type": "module",
   "main": "./index.ts",
package/vectorizer/index.ts
CHANGED

@@ -612,7 +612,148 @@ class CodebaseIndexer {
     return this.hashes[relPath] !== currentHash;
   }

-  // ──
+  // ── Phase 1: Prepare file (chunk + graph, NO embedding) ─────────────────
+  // Returns prepared chunk data ready for embedding, or null if skipped.
+
+  async prepareFile(filePath) {
+    const relPath = path.relative(this.root, filePath);
+
+    let content;
+    try {
+      content = await fs.readFile(filePath, "utf8");
+    } catch {
+      return null;
+    }
+
+    const hash = this.fileHash(content);
+    if (this.hashes[relPath] === hash) {
+      return null; // unchanged
+    }
+
+    // Extract metadata
+    const fileMeta = await extractFileMetadata(filePath, content);
+    const archived = this.isArchived(relPath, content);
+
+    // Clean content before chunking
+    const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
+
+    // Semantic chunking
+    const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
+
+    // Assign chunk IDs
+    const chunksWithIds = this.graphBuilder
+      ? this.graphBuilder.assignChunkIds(relPath, chunks)
+      : chunks.map((c, i) => ({ ...c, chunk_id: `chunk:${relPath}::_chunk_${i}` }));
+
+    // Build graph edges (Phase 1 — no embedding needed)
+    if (this.graphBuilder && this.graphDB) {
+      await this.graphDB.deleteByFile(relPath);
+      const edgesBuilt = await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
+
+      if (edgesBuilt > 0 || DEBUG) {
+        const timestamp = new Date().toISOString().slice(11, 19);
+        const logMsg = `${timestamp} Graph built: ${relPath} (${chunksWithIds.length} chunks)`;
+        if (DEBUG) console.log(`[vectorizer] ${logMsg}`);
+        try {
+          const logPath = path.join(this.root, ".opencode", "indexer.log");
+          const fsSync = await import("fs");
+          fsSync.appendFileSync(logPath, `${logMsg}\n`);
+        } catch { /* non-fatal */ }
+      }
+
+      try {
+        await this.graphDB.setFileMeta(relPath, hash, Date.now());
+      } catch { /* non-fatal */ }
+    }
+
+    // Return prepared rows (without vector — Phase 2 fills it)
+    const rows = chunksWithIds.map((chunk, i) => ({
+      chunk_id: chunk.chunk_id,
+      file: relPath,
+      chunk_index: i,
+      content: chunk.content,
+      archived,
+      file_type: fileMeta.file_type,
+      language: fileMeta.language,
+      last_modified: fileMeta.last_modified,
+      file_size: fileMeta.file_size,
+      heading_context: chunk.heading_context || "",
+      function_name: chunk.function_name || "",
+      class_name: chunk.class_name || "",
+      tags: (fileMeta.tags || []).join(","),
+      start_line: chunk.start_line ?? -1,
+      end_line: chunk.end_line ?? -1,
+    }));
+
+    return { relPath, hash, rows };
+  }
+
+  // ── Phase 2: Batch embed + store ──────────────────────────────────────────
+  // Takes prepared rows from prepareFile(), embeds in batches, stores in LanceDB.
+
+  async embedAndStore(preparedFiles, batchSize = 32, onProgress = null) {
+    if (preparedFiles.length === 0) return 0;
+
+    // Collect all rows with their content for batch embedding
+    const allRows = [];
+    for (const pf of preparedFiles) {
+      for (const row of pf.rows) {
+        allRows.push(row);
+      }
+    }
+
+    if (allRows.length === 0) return 0;
+
+    // Load model once
+    const model = await this.loadModel();
+
+    // Batch embed
+    const allData = [];
+    for (let i = 0; i < allRows.length; i += batchSize) {
+      const batch = allRows.slice(i, i + batchSize);
+      const texts = batch.map(r => r.content);
+
+      // Embed batch — @xenova/transformers processes array inputs efficiently
+      const embeddings = [];
+      for (const text of texts) {
+        const result = await model(text, { pooling: "mean", normalize: true });
+        embeddings.push(Array.from(result.data));
+      }
+
+      for (let j = 0; j < batch.length; j++) {
+        allData.push({ ...batch[j], vector: embeddings[j] });
+      }
+
+      if (onProgress) {
+        onProgress(Math.min(i + batchSize, allRows.length), allRows.length, "embedding");
+      }
+    }
+
+    // Bulk store in LanceDB
+    const tableName = "chunks";
+    const tables = await this.db.tableNames();
+    if (tables.includes(tableName)) {
+      const table = await this.db.openTable(tableName);
+      await table.add(allData);
+    } else {
+      await this.db.createTable(tableName, allData);
+    }
+
+    // Update hashes for all prepared files
+    for (const pf of preparedFiles) {
+      this.hashes[pf.relPath] = pf.hash;
+    }
+    await this.saveHashes();
+
+    // Invalidate caches
+    if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
+    this._bm25Rows = null;
+    this._chunkCache = null;

+    return allData.length;
+  }
+
+  // ── Index a single file (legacy — used by freshen/on-change) ───────────

   async indexFile(filePath) {
     const relPath = path.relative(this.root, filePath);
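Taken on its own, the Phase 2 pattern above is "embed each chunk, then create-or-append a LanceDB table". Below is a minimal standalone sketch of that flow. Only the call shapes (the `pooling`/`normalize` options and the `tableNames()`/`openTable()`/`createTable()`/`add()` sequence) come from the diff itself; the model name, the `vectordb` client import, and the database path are assumptions, since the diff never shows what `loadModel()` or `this.db` resolve to.

```ts
// Standalone sketch of the embed-and-store flow in embedAndStore() above.
// ASSUMPTIONS: the model name, the "vectordb" client, and the DB path are
// illustrative only; the diff does not show loadModel() or this.db.
import { pipeline } from "@xenova/transformers";
import { connect } from "vectordb";

interface ChunkRow {
  chunk_id: string;
  content: string;
  [key: string]: unknown;
}

async function embedAndStoreSketch(rows: ChunkRow[]): Promise<number> {
  // Load the embedding model once and reuse it for every chunk.
  const extractor = await pipeline("feature-extraction", "Xenova/all-MiniLM-L6-v2");

  const data: Array<ChunkRow & { vector: number[] }> = [];
  for (const row of rows) {
    // Mean-pooled, L2-normalized embedding, using the same options the diff passes.
    const result = await extractor(row.content, { pooling: "mean", normalize: true });
    data.push({ ...row, vector: Array.from(result.data as Float32Array) });
  }

  // Create-or-append, mirroring the tableNames()/openTable()/createTable() logic.
  const db = await connect("./.opencode/lancedb"); // path is an assumption
  const tables = await db.tableNames();
  if (tables.includes("chunks")) {
    const table = await db.openTable("chunks");
    await table.add(data);
  } else {
    await db.createTable("chunks", data);
  }
  return data.length;
}
```

One detail worth noting: although the comment in the diff says the library "processes array inputs efficiently", the loop still runs one forward pass per chunk; the `batchSize` grouping bounds memory and drives progress reporting rather than batching the model call itself.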
@@ -1170,31 +1311,64 @@ class CodebaseIndexer {
       }
     }

-    let indexed = 0;
-    let skipped = 0;
     const total = files.length;
+    const CONCURRENCY = 5;

-
-
-
-
-
-
-
-
-
+    // ══════════════════════════════════════════════════════════════════════
+    // Phase 1: Prepare files in parallel (chunk + graph, no embedding)
+    // ══════════════════════════════════════════════════════════════════════
+    const preparedFiles = [];
+    let prepared = 0;
+    let skipped = 0;
+
+    // Process in batches of CONCURRENCY
+    for (let i = 0; i < files.length; i += CONCURRENCY) {
+      const batch = files.slice(i, i + CONCURRENCY);
+      const promises = batch.map(async (relPath) => {
+        const filePath = path.join(this.root, relPath);
+        try {
+          const result = await this.prepareFile(filePath);
+          return result;
+        } catch {
+          return null;
+        }
+      });
+
+      const results = await Promise.all(promises);
+      for (let j = 0; j < results.length; j++) {
+        if (results[j]) {
+          preparedFiles.push(results[j]);
+          prepared++;
+          if (onProgress) onProgress(prepared, total, results[j].relPath, i + j + 1, "prepare");
         } else {
           skipped++;
         }
-      } catch {
-        skipped++;
       }
     }

+    if (DEBUG) console.log(`[vectorizer] Phase 1 done: ${prepared} files prepared, ${skipped} skipped`);
+
+    // ══════════════════════════════════════════════════════════════════════
+    // Phase 2: Batch embed + store (sequential, batch forward pass)
+    // ══════════════════════════════════════════════════════════════════════
+    let chunksEmbedded = 0;
+    if (preparedFiles.length > 0) {
+      const totalChunks = preparedFiles.reduce((sum, pf) => sum + pf.rows.length, 0);
+      if (DEBUG) console.log(`[vectorizer] Phase 2: embedding ${totalChunks} chunks from ${preparedFiles.length} files`);
+
+      chunksEmbedded = await this.embedAndStore(preparedFiles, 32, (done, embedTotal, phase) => {
+        if (onProgress) onProgress(done, embedTotal, `embedding`, done, "embed");
+      });
+
+      if (DEBUG) console.log(`[vectorizer] Phase 2 done: ${chunksEmbedded} chunks embedded and stored`);
+    }
+
+    const indexed = prepared; // file count for backward compat
+
     // FR-005: Build semantic similarity edges as post-pass
     // Disabled by default (O(n²) — slow on large repos). Enable via graph.semantic_edges: true
     let semanticEdges = 0;
-    if (
+    if (chunksEmbedded > 0 && this.graphBuilder && this.graphDB && GRAPH_CONFIG.semantic_edges) {
       try {
         const tableName = "chunks";
         const tables = await this.db.tableNames();
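The Phase 1 driver above caps parallelism by slicing the file list into groups of `CONCURRENCY = 5`, awaiting each group with `Promise.all`, and counting per-file failures as skips. A generic sketch of that pattern follows; the `mapInBatches` helper, its name, and its types are illustrative, not part of the plugin.

```ts
// Sketch of the bounded-parallelism pattern used in Phase 1 above.
// `mapInBatches` is a hypothetical helper, not part of the plugin.
async function mapInBatches<T, R>(
  items: T[],
  concurrency: number,
  fn: (item: T, index: number) => Promise<R>,
): Promise<(R | null)[]> {
  const out: (R | null)[] = [];
  for (let i = 0; i < items.length; i += concurrency) {
    // Each slice runs fully in parallel; slices run back to back.
    const batch = items.slice(i, i + concurrency);
    const results = await Promise.all(
      batch.map((item, j) => fn(item, i + j).catch(() => null)), // failure => skip
    );
    out.push(...results);
  }
  return out;
}

// Usage mirroring the diff: prepare files five at a time, drop failures.
// const results = await mapInBatches(files, 5, (f) => indexer.prepareFile(f));
// const preparedFiles = results.filter((r) => r !== null);
```

The trade-off of this slice-then-`Promise.all` shape is that each slice waits for its slowest file before the next slice starts, unlike a work-stealing pool that keeps every slot busy; for small concurrency over mostly small source files the difference is minor.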