@comfanion/usethis_search 3.0.0-dev.16 → 3.0.0-dev.18
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/vectorizer/analyzers/lsp-analyzer.ts +7 -7
- package/vectorizer/analyzers/regex-analyzer.ts +173 -67
- package/vectorizer/chunkers/code-chunker.ts +74 -24
- package/vectorizer/chunkers/markdown-chunker.ts +69 -7
- package/vectorizer/graph-builder.ts +207 -15
- package/vectorizer/graph-db.ts +70 -47
- package/vectorizer/index.ts +111 -23
- package/vectorizer.yaml +16 -0
package/vectorizer/index.ts
CHANGED
|
@@ -16,7 +16,7 @@ import { mergeResults, DEFAULT_HYBRID_CONFIG } from "./hybrid-search.ts";
|
|
|
16
16
|
import { QueryCache, DEFAULT_CACHE_CONFIG } from "./query-cache.ts";
|
|
17
17
|
import { SearchMetrics } from "./search-metrics.ts";
|
|
18
18
|
import { GraphDB } from "./graph-db.ts";
|
|
19
|
-
import { GraphBuilder } from "./graph-builder.ts";
|
|
19
|
+
import { GraphBuilder, isStructuralPredicate } from "./graph-builder.ts";
|
|
20
20
|
import { UsageTracker } from "./usage-tracker.ts";
|
|
21
21
|
|
|
22
22
|
// Suppress transformers.js logs unless DEBUG is set
|
|
@@ -85,6 +85,19 @@ let HYBRID_CONFIG = { ...DEFAULT_HYBRID_CONFIG };
|
|
|
85
85
|
let METRICS_ENABLED = false;
|
|
86
86
|
let CACHE_ENABLED = true;
|
|
87
87
|
|
|
88
|
+
// ── Graph config (v3) ───────────────────────────────────────────────────────
|
|
89
|
+
const DEFAULT_GRAPH_CONFIG = {
|
|
90
|
+
enabled: true,
|
|
91
|
+
max_related: 4,
|
|
92
|
+
min_relevance: 0.5,
|
|
93
|
+
lsp: {
|
|
94
|
+
enabled: true,
|
|
95
|
+
timeout_ms: 5000,
|
|
96
|
+
},
|
|
97
|
+
read_intercept: true,
|
|
98
|
+
};
|
|
99
|
+
let GRAPH_CONFIG = { ...DEFAULT_GRAPH_CONFIG, lsp: { ...DEFAULT_GRAPH_CONFIG.lsp } };
|
|
100
|
+
|
|
88
101
|
function defaultVectorizerYaml() {
|
|
89
102
|
return (
|
|
90
103
|
`vectorizer:\n` +
|
|
@@ -121,6 +134,16 @@ function defaultVectorizerYaml() {
|
|
|
121
134
|
` hybrid: true\n` +
|
|
122
135
|
` bm25_weight: 0.3\n` +
|
|
123
136
|
`\n` +
|
|
137
|
+
` # Graph-based context (v3)\n` +
|
|
138
|
+
` graph:\n` +
|
|
139
|
+
` enabled: true\n` +
|
|
140
|
+
` max_related: 4\n` +
|
|
141
|
+
` min_relevance: 0.5\n` +
|
|
142
|
+
` lsp:\n` +
|
|
143
|
+
` enabled: true\n` +
|
|
144
|
+
` timeout_ms: 5000\n` +
|
|
145
|
+
` read_intercept: true\n` +
|
|
146
|
+
`\n` +
|
|
124
147
|
` # Quality monitoring\n` +
|
|
125
148
|
` quality:\n` +
|
|
126
149
|
` enable_metrics: false\n` +
|
|
@@ -282,6 +305,26 @@ async function loadConfig(projectRoot) {
|
|
|
282
305
|
CACHE_ENABLED = parseBool(qs, "enable_cache", true);
|
|
283
306
|
}
|
|
284
307
|
|
|
308
|
+
// ── Parse graph config (v3) ──────────────────────────────────────────────
|
|
309
|
+
const graphMatch = section.match(/^\s{2}graph:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
|
|
310
|
+
if (graphMatch) {
|
|
311
|
+
const gs = graphMatch[1];
|
|
312
|
+
GRAPH_CONFIG.enabled = parseBool(gs, "enabled", DEFAULT_GRAPH_CONFIG.enabled);
|
|
313
|
+
GRAPH_CONFIG.max_related = parseNumber(gs, "max_related", DEFAULT_GRAPH_CONFIG.max_related);
|
|
314
|
+
GRAPH_CONFIG.min_relevance = parseNumber(gs, "min_relevance", DEFAULT_GRAPH_CONFIG.min_relevance);
|
|
315
|
+
GRAPH_CONFIG.read_intercept = parseBool(gs, "read_intercept", DEFAULT_GRAPH_CONFIG.read_intercept);
|
|
316
|
+
|
|
317
|
+
// Nested lsp: section
|
|
318
|
+
const lspMatch = gs.match(/^\s+lsp:\s*\n([\s\S]*?)(?=^\s{4}[a-zA-Z_\-]+:|\Z)/m);
|
|
319
|
+
if (lspMatch) {
|
|
320
|
+
const ls = lspMatch[1];
|
|
321
|
+
GRAPH_CONFIG.lsp.enabled = parseBool(ls, "enabled", DEFAULT_GRAPH_CONFIG.lsp.enabled);
|
|
322
|
+
GRAPH_CONFIG.lsp.timeout_ms = parseNumber(ls, "timeout_ms", DEFAULT_GRAPH_CONFIG.lsp.timeout_ms);
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
if (DEBUG) console.log("[vectorizer] Graph config:", GRAPH_CONFIG);
|
|
326
|
+
}
|
|
327
|
+
|
|
285
328
|
// Parse global exclude
|
|
286
329
|
const excludeMatch = section.match(/^\s{2}exclude:\s*\n((?:\s{4}-\s+.+\n?)*)/m);
|
|
287
330
|
if (excludeMatch) {
|
|
@@ -392,11 +435,19 @@ class CodebaseIndexer {
|
|
|
392
435
|
this.db = await lancedb.connect(path.join(this.cacheDir, "lancedb"));
|
|
393
436
|
await this.loadHashes();
|
|
394
437
|
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
438
|
+
// Graph DB — only if graph is enabled in config
|
|
439
|
+
if (GRAPH_CONFIG.enabled) {
|
|
440
|
+
const graphType = this.indexName === "docs" ? "doc_graph" : "code_graph";
|
|
441
|
+
const graphPath = path.join(this.root, ".opencode", "graph", graphType);
|
|
442
|
+
await fs.mkdir(path.dirname(graphPath), { recursive: true });
|
|
443
|
+
this.graphDB = await new GraphDB(graphPath).init();
|
|
444
|
+
this.graphBuilder = new GraphBuilder(
|
|
445
|
+
this.graphDB,
|
|
446
|
+
this.root,
|
|
447
|
+
GRAPH_CONFIG.lsp.enabled,
|
|
448
|
+
GRAPH_CONFIG.lsp.timeout_ms,
|
|
449
|
+
);
|
|
450
|
+
}
|
|
400
451
|
|
|
401
452
|
// Usage tracker — provenance & usage stats
|
|
402
453
|
this.usageTracker = new UsageTracker(this.cacheDir);
|
|
@@ -557,18 +608,39 @@ class CodebaseIndexer {
|
|
|
557
608
|
// Semantic chunking
|
|
558
609
|
const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
|
|
559
610
|
|
|
560
|
-
// v3: Assign chunk IDs for graph tracking
|
|
561
|
-
const chunksWithIds = this.graphBuilder
|
|
611
|
+
// v3: Assign chunk IDs for graph tracking (works without graph — just adds IDs)
|
|
612
|
+
const chunksWithIds = this.graphBuilder
|
|
613
|
+
? this.graphBuilder.assignChunkIds(relPath, chunks)
|
|
614
|
+
: chunks.map((c, i) => ({ ...c, chunk_id: `chunk:${relPath}::_chunk_${i}` }));
|
|
562
615
|
|
|
563
616
|
// v3: Delete old edges for this file and build new ones
|
|
564
|
-
|
|
565
|
-
|
|
617
|
+
let graphEdgesBuilt = 0;
|
|
618
|
+
if (this.graphBuilder && this.graphDB) {
|
|
619
|
+
await this.graphDB.deleteByFile(relPath);
|
|
620
|
+
graphEdgesBuilt = await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
|
|
621
|
+
|
|
622
|
+
// Log graph creation to indexer.log
|
|
623
|
+
if (graphEdgesBuilt > 0 || DEBUG) {
|
|
624
|
+
const timestamp = new Date().toISOString().slice(11, 19);
|
|
625
|
+
const logMsg = `${timestamp} Graph built: ${relPath} (${chunksWithIds.length} chunks)`;
|
|
626
|
+
if (DEBUG) console.log(`[vectorizer] ${logMsg}`);
|
|
627
|
+
|
|
628
|
+
// Write to indexer.log in .opencode directory
|
|
629
|
+
try {
|
|
630
|
+
const logPath = path.join(this.root, ".opencode", "indexer.log");
|
|
631
|
+
const fsSync = await import("fs");
|
|
632
|
+
fsSync.appendFileSync(logPath, `${logMsg}\n`);
|
|
633
|
+
} catch {
|
|
634
|
+
// non-fatal — logging is advisory
|
|
635
|
+
}
|
|
636
|
+
}
|
|
566
637
|
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
638
|
+
// FR-054: Store graph build timestamp + file hash as metadata triple
|
|
639
|
+
try {
|
|
640
|
+
await this.graphDB.setFileMeta(relPath, hash, Date.now());
|
|
641
|
+
} catch {
|
|
642
|
+
// non-fatal — metadata is advisory
|
|
643
|
+
}
|
|
572
644
|
}
|
|
573
645
|
|
|
574
646
|
const data = [];
|
|
@@ -590,6 +662,9 @@ class CodebaseIndexer {
|
|
|
590
662
|
function_name: chunksWithIds[i].function_name || "",
|
|
591
663
|
class_name: chunksWithIds[i].class_name || "",
|
|
592
664
|
tags: (fileMeta.tags || []).join(","),
|
|
665
|
+
// Line numbers for "from-to" extraction (default to -1 when unknown)
|
|
666
|
+
start_line: chunksWithIds[i].start_line ?? -1,
|
|
667
|
+
end_line: chunksWithIds[i].end_line ?? -1,
|
|
593
668
|
});
|
|
594
669
|
}
|
|
595
670
|
|
|
@@ -629,7 +704,7 @@ class CodebaseIndexer {
|
|
|
629
704
|
const table = await this.db.openTable(tableName);
|
|
630
705
|
let allRows;
|
|
631
706
|
try {
|
|
632
|
-
allRows = await table.filter("").limit(100000).execute();
|
|
707
|
+
allRows = await table.filter("true").limit(100000).execute();
|
|
633
708
|
} catch (e) {
|
|
634
709
|
if (DEBUG) console.log("[vectorizer] BM25 index build failed (corrupted table?):", e.message);
|
|
635
710
|
return null;
|
|
@@ -693,10 +768,15 @@ class CodebaseIndexer {
|
|
|
693
768
|
const bm25Results = bm25.search(query, fetchLimit);
|
|
694
769
|
|
|
695
770
|
// Build score maps
|
|
771
|
+
// LanceDB _distance is L2 (euclidean). For normalized vectors,
|
|
772
|
+
// L2 ∈ [0, 2]. Convert to similarity ∈ [0, 1]:
|
|
773
|
+
// similarity = 1 - (distance / 2)
|
|
774
|
+
const distanceToScore = (d: number | null | undefined) =>
|
|
775
|
+
d != null ? Math.max(0, 1 - d / 2) : 0.5;
|
|
776
|
+
|
|
696
777
|
const vectorScores = new Map();
|
|
697
778
|
for (let i = 0; i < results.length; i++) {
|
|
698
|
-
|
|
699
|
-
vectorScores.set(i, score);
|
|
779
|
+
vectorScores.set(i, distanceToScore(results[i]._distance));
|
|
700
780
|
}
|
|
701
781
|
|
|
702
782
|
const bm25Scores = new Map();
|
|
@@ -711,7 +791,7 @@ class CodebaseIndexer {
|
|
|
711
791
|
|
|
712
792
|
for (let i = 0; i < results.length; i++) {
|
|
713
793
|
const key = `${results[i].file}:${results[i].chunk_index}`;
|
|
714
|
-
const vs = results[i]._distance
|
|
794
|
+
const vs = distanceToScore(results[i]._distance);
|
|
715
795
|
resultMap.set(key, { row: results[i], vectorScore: vs, bm25Score: 0 });
|
|
716
796
|
}
|
|
717
797
|
|
|
@@ -812,7 +892,10 @@ class CodebaseIndexer {
|
|
|
812
892
|
|
|
813
893
|
const outgoing = await this.graphDB.getOutgoing(result.chunk_id);
|
|
814
894
|
const incoming = await this.graphDB.getIncoming(result.chunk_id);
|
|
815
|
-
|
|
895
|
+
// Filter out structural and meta edges — only relation edges are useful for context
|
|
896
|
+
const allEdges = [...outgoing, ...incoming].filter(
|
|
897
|
+
e => e.predicate !== "belongs_to" && e.predicate !== "graph_built" && !isStructuralPredicate(e.predicate)
|
|
898
|
+
);
|
|
816
899
|
|
|
817
900
|
const neighbors = [];
|
|
818
901
|
for (const edge of allEdges) {
|
|
@@ -833,8 +916,13 @@ class CodebaseIndexer {
|
|
|
833
916
|
});
|
|
834
917
|
}
|
|
835
918
|
|
|
919
|
+
// Apply min_relevance filter, then cap at max_related
|
|
836
920
|
neighbors.sort((a, b) => b.score - a.score);
|
|
837
|
-
|
|
921
|
+
const minRelevance = GRAPH_CONFIG.min_relevance ?? 0.5;
|
|
922
|
+
const maxRelated = GRAPH_CONFIG.max_related ?? 4;
|
|
923
|
+
result.relatedContext = neighbors
|
|
924
|
+
.filter(n => n.score >= minRelevance)
|
|
925
|
+
.slice(0, maxRelated);
|
|
838
926
|
|
|
839
927
|
// FR-060: Record provenance for each attached chunk
|
|
840
928
|
if (this.usageTracker) {
|
|
@@ -875,7 +963,7 @@ class CodebaseIndexer {
|
|
|
875
963
|
const table = await this.db.openTable(tableName);
|
|
876
964
|
let rows;
|
|
877
965
|
try {
|
|
878
|
-
rows = await table.filter("").limit(100000).execute();
|
|
966
|
+
rows = await table.filter("true").limit(100000).execute();
|
|
879
967
|
} catch (e) {
|
|
880
968
|
if (DEBUG) console.log("[vectorizer] Chunk cache build failed (corrupted table?):", e.message);
|
|
881
969
|
return null;
|
|
@@ -1013,7 +1101,7 @@ class CodebaseIndexer {
|
|
|
1013
1101
|
const tables = await this.db.tableNames();
|
|
1014
1102
|
if (tables.includes(tableName)) {
|
|
1015
1103
|
const table = await this.db.openTable(tableName);
|
|
1016
|
-
const allRows = await table.filter("").limit(100000).execute();
|
|
1104
|
+
const allRows = await table.filter("true").limit(100000).execute();
|
|
1017
1105
|
const chunkData = allRows
|
|
1018
1106
|
.filter(r => r.chunk_id && r.vector)
|
|
1019
1107
|
.map(r => ({ chunk_id: r.chunk_id, vector: Array.from(r.vector), file: r.file }));
|
package/vectorizer.yaml
CHANGED
|
@@ -61,6 +61,22 @@ vectorizer:
|
|
|
61
61
|
# Indexes to maintain - each has pattern (what to include) and ignore (what to skip)
|
|
62
62
|
indexes:
|
|
63
63
|
|
|
64
|
+
# Source code index - all common programming languages
|
|
65
|
+
code:
|
|
66
|
+
enabled: true
|
|
67
|
+
pattern: "**/*.{js,ts,jsx,tsx,mjs,cjs,py,go,rs,java,kt,swift,c,cpp,h,hpp,cs,rb,php,scala,clj}"
|
|
68
|
+
ignore:
|
|
69
|
+
- "**/node_modules/**"
|
|
70
|
+
- "**/.git/**"
|
|
71
|
+
- "**/dist/**"
|
|
72
|
+
- "**/build/**"
|
|
73
|
+
- "**/.opencode/**"
|
|
74
|
+
- "**/docs/**"
|
|
75
|
+
- "**/vendor/**"
|
|
76
|
+
- "**/__pycache__/**"
|
|
77
|
+
hybrid: true
|
|
78
|
+
bm25_weight: 0.3
|
|
79
|
+
|
|
64
80
|
# Documentation index - markdown, text files
|
|
65
81
|
docs:
|
|
66
82
|
enabled: true
|