@optave/codegraph 3.11.0 → 3.11.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -31
- package/dist/ast-analysis/engine.d.ts.map +1 -1
- package/dist/ast-analysis/engine.js +91 -60
- package/dist/ast-analysis/engine.js.map +1 -1
- package/dist/ast-analysis/visitor-utils.d.ts +3 -0
- package/dist/ast-analysis/visitor-utils.d.ts.map +1 -1
- package/dist/ast-analysis/visitor-utils.js +83 -49
- package/dist/ast-analysis/visitor-utils.js.map +1 -1
- package/dist/ast-analysis/visitors/ast-store-visitor.d.ts.map +1 -1
- package/dist/ast-analysis/visitors/ast-store-visitor.js +78 -62
- package/dist/ast-analysis/visitors/ast-store-visitor.js.map +1 -1
- package/dist/ast-analysis/visitors/dataflow-visitor.d.ts.map +1 -1
- package/dist/ast-analysis/visitors/dataflow-visitor.js +61 -42
- package/dist/ast-analysis/visitors/dataflow-visitor.js.map +1 -1
- package/dist/cli/commands/embed.d.ts.map +1 -1
- package/dist/cli/commands/embed.js +49 -4
- package/dist/cli/commands/embed.js.map +1 -1
- package/dist/domain/analysis/dependencies.d.ts.map +1 -1
- package/dist/domain/analysis/dependencies.js +106 -80
- package/dist/domain/analysis/dependencies.js.map +1 -1
- package/dist/domain/analysis/fn-impact.d.ts.map +1 -1
- package/dist/domain/analysis/fn-impact.js +77 -52
- package/dist/domain/analysis/fn-impact.js.map +1 -1
- package/dist/domain/analysis/module-map.d.ts.map +1 -1
- package/dist/domain/analysis/module-map.js +132 -121
- package/dist/domain/analysis/module-map.js.map +1 -1
- package/dist/domain/graph/builder/call-resolver.d.ts +71 -0
- package/dist/domain/graph/builder/call-resolver.d.ts.map +1 -0
- package/dist/domain/graph/builder/call-resolver.js +130 -0
- package/dist/domain/graph/builder/call-resolver.js.map +1 -0
- package/dist/domain/graph/builder/helpers.d.ts +4 -4
- package/dist/domain/graph/builder/helpers.d.ts.map +1 -1
- package/dist/domain/graph/builder/helpers.js +47 -33
- package/dist/domain/graph/builder/helpers.js.map +1 -1
- package/dist/domain/graph/builder/incremental.d.ts +6 -0
- package/dist/domain/graph/builder/incremental.d.ts.map +1 -1
- package/dist/domain/graph/builder/incremental.js +214 -127
- package/dist/domain/graph/builder/incremental.js.map +1 -1
- package/dist/domain/graph/builder/pipeline.d.ts +1 -44
- package/dist/domain/graph/builder/pipeline.d.ts.map +1 -1
- package/dist/domain/graph/builder/pipeline.js +10 -766
- package/dist/domain/graph/builder/pipeline.js.map +1 -1
- package/dist/domain/graph/builder/stages/build-edges.d.ts.map +1 -1
- package/dist/domain/graph/builder/stages/build-edges.js +151 -192
- package/dist/domain/graph/builder/stages/build-edges.js.map +1 -1
- package/dist/domain/graph/builder/stages/build-structure.d.ts.map +1 -1
- package/dist/domain/graph/builder/stages/build-structure.js +82 -65
- package/dist/domain/graph/builder/stages/build-structure.js.map +1 -1
- package/dist/domain/graph/builder/stages/detect-changes.d.ts.map +1 -1
- package/dist/domain/graph/builder/stages/detect-changes.js +84 -56
- package/dist/domain/graph/builder/stages/detect-changes.js.map +1 -1
- package/dist/domain/graph/builder/stages/finalize.d.ts.map +1 -1
- package/dist/domain/graph/builder/stages/finalize.js +60 -51
- package/dist/domain/graph/builder/stages/finalize.js.map +1 -1
- package/dist/domain/graph/builder/stages/insert-nodes.d.ts +8 -6
- package/dist/domain/graph/builder/stages/insert-nodes.d.ts.map +1 -1
- package/dist/domain/graph/builder/stages/insert-nodes.js +107 -122
- package/dist/domain/graph/builder/stages/insert-nodes.js.map +1 -1
- package/dist/domain/graph/builder/stages/native-db-lifecycle.d.ts +14 -0
- package/dist/domain/graph/builder/stages/native-db-lifecycle.d.ts.map +1 -0
- package/dist/domain/graph/builder/stages/native-db-lifecycle.js +77 -0
- package/dist/domain/graph/builder/stages/native-db-lifecycle.js.map +1 -0
- package/dist/domain/graph/builder/stages/native-orchestrator.d.ts +62 -0
- package/dist/domain/graph/builder/stages/native-orchestrator.d.ts.map +1 -0
- package/dist/domain/graph/builder/stages/native-orchestrator.js +747 -0
- package/dist/domain/graph/builder/stages/native-orchestrator.js.map +1 -0
- package/dist/domain/graph/cycles.d.ts +6 -4
- package/dist/domain/graph/cycles.d.ts.map +1 -1
- package/dist/domain/graph/cycles.js +50 -55
- package/dist/domain/graph/cycles.js.map +1 -1
- package/dist/domain/graph/journal.d.ts.map +1 -1
- package/dist/domain/graph/journal.js +89 -70
- package/dist/domain/graph/journal.js.map +1 -1
- package/dist/domain/graph/watcher.d.ts.map +1 -1
- package/dist/domain/graph/watcher.js +10 -4
- package/dist/domain/graph/watcher.js.map +1 -1
- package/dist/domain/parser.d.ts +12 -23
- package/dist/domain/parser.d.ts.map +1 -1
- package/dist/domain/parser.js +126 -79
- package/dist/domain/parser.js.map +1 -1
- package/dist/domain/search/generator.d.ts +3 -1
- package/dist/domain/search/generator.d.ts.map +1 -1
- package/dist/domain/search/generator.js +68 -45
- package/dist/domain/search/generator.js.map +1 -1
- package/dist/domain/search/models.d.ts +2 -0
- package/dist/domain/search/models.d.ts.map +1 -1
- package/dist/domain/search/models.js +37 -3
- package/dist/domain/search/models.js.map +1 -1
- package/dist/domain/search/search/hybrid.d.ts.map +1 -1
- package/dist/domain/search/search/hybrid.js +49 -40
- package/dist/domain/search/search/hybrid.js.map +1 -1
- package/dist/domain/search/search/semantic.d.ts.map +1 -1
- package/dist/domain/search/search/semantic.js +69 -49
- package/dist/domain/search/search/semantic.js.map +1 -1
- package/dist/domain/wasm-worker-entry.js +201 -136
- package/dist/domain/wasm-worker-entry.js.map +1 -1
- package/dist/extractors/elixir.js +95 -71
- package/dist/extractors/elixir.js.map +1 -1
- package/dist/extractors/gleam.d.ts.map +1 -1
- package/dist/extractors/gleam.js +23 -31
- package/dist/extractors/gleam.js.map +1 -1
- package/dist/extractors/helpers.d.ts +79 -1
- package/dist/extractors/helpers.d.ts.map +1 -1
- package/dist/extractors/helpers.js +137 -0
- package/dist/extractors/helpers.js.map +1 -1
- package/dist/extractors/java.d.ts.map +1 -1
- package/dist/extractors/java.js +37 -49
- package/dist/extractors/java.js.map +1 -1
- package/dist/extractors/javascript.d.ts.map +1 -1
- package/dist/extractors/javascript.js +44 -44
- package/dist/extractors/javascript.js.map +1 -1
- package/dist/extractors/julia.js +27 -34
- package/dist/extractors/julia.js.map +1 -1
- package/dist/extractors/r.d.ts.map +1 -1
- package/dist/extractors/r.js +33 -58
- package/dist/extractors/r.js.map +1 -1
- package/dist/extractors/solidity.d.ts.map +1 -1
- package/dist/extractors/solidity.js +38 -61
- package/dist/extractors/solidity.js.map +1 -1
- package/dist/features/boundaries.d.ts.map +1 -1
- package/dist/features/boundaries.js +49 -39
- package/dist/features/boundaries.js.map +1 -1
- package/dist/features/cfg.d.ts.map +1 -1
- package/dist/features/cfg.js +90 -63
- package/dist/features/cfg.js.map +1 -1
- package/dist/features/check.d.ts.map +1 -1
- package/dist/features/check.js +43 -34
- package/dist/features/check.js.map +1 -1
- package/dist/features/cochange.d.ts.map +1 -1
- package/dist/features/cochange.js +68 -56
- package/dist/features/cochange.js.map +1 -1
- package/dist/features/complexity.d.ts.map +1 -1
- package/dist/features/complexity.js +105 -75
- package/dist/features/complexity.js.map +1 -1
- package/dist/features/dataflow.d.ts.map +1 -1
- package/dist/features/dataflow.js +37 -29
- package/dist/features/dataflow.js.map +1 -1
- package/dist/features/flow.d.ts.map +1 -1
- package/dist/features/flow.js +31 -22
- package/dist/features/flow.js.map +1 -1
- package/dist/features/graph-enrichment.d.ts.map +1 -1
- package/dist/features/graph-enrichment.js +77 -70
- package/dist/features/graph-enrichment.js.map +1 -1
- package/dist/features/owners.d.ts +17 -26
- package/dist/features/owners.d.ts.map +1 -1
- package/dist/features/owners.js +120 -109
- package/dist/features/owners.js.map +1 -1
- package/dist/features/sequence.d.ts.map +1 -1
- package/dist/features/sequence.js +59 -54
- package/dist/features/sequence.js.map +1 -1
- package/dist/features/structure-query.d.ts.map +1 -1
- package/dist/features/structure-query.js +60 -60
- package/dist/features/structure-query.js.map +1 -1
- package/dist/features/structure.d.ts.map +1 -1
- package/dist/features/structure.js +149 -52
- package/dist/features/structure.js.map +1 -1
- package/dist/graph/algorithms/leiden/optimiser.d.ts.map +1 -1
- package/dist/graph/algorithms/leiden/optimiser.js +100 -69
- package/dist/graph/algorithms/leiden/optimiser.js.map +1 -1
- package/dist/graph/classifiers/roles.d.ts.map +1 -1
- package/dist/graph/classifiers/roles.js +63 -59
- package/dist/graph/classifiers/roles.js.map +1 -1
- package/dist/infrastructure/config.d.ts +1 -1
- package/dist/infrastructure/config.d.ts.map +1 -1
- package/dist/infrastructure/config.js +1 -1
- package/dist/infrastructure/config.js.map +1 -1
- package/dist/presentation/cfg.d.ts.map +1 -1
- package/dist/presentation/cfg.js +44 -29
- package/dist/presentation/cfg.js.map +1 -1
- package/dist/presentation/flow.d.ts.map +1 -1
- package/dist/presentation/flow.js +58 -38
- package/dist/presentation/flow.js.map +1 -1
- package/dist/types.d.ts +1 -1
- package/dist/types.d.ts.map +1 -1
- package/grammars/tree-sitter-erlang.wasm +0 -0
- package/package.json +9 -9
- package/src/ast-analysis/engine.ts +145 -61
- package/src/ast-analysis/visitor-utils.ts +86 -46
- package/src/ast-analysis/visitors/ast-store-visitor.ts +104 -69
- package/src/ast-analysis/visitors/dataflow-visitor.ts +86 -47
- package/src/cli/commands/embed.ts +54 -4
- package/src/domain/analysis/dependencies.ts +166 -85
- package/src/domain/analysis/fn-impact.ts +120 -50
- package/src/domain/analysis/module-map.ts +175 -140
- package/src/domain/graph/builder/call-resolver.ts +181 -0
- package/src/domain/graph/builder/helpers.ts +85 -76
- package/src/domain/graph/builder/incremental.ts +321 -152
- package/src/domain/graph/builder/pipeline.ts +19 -957
- package/src/domain/graph/builder/stages/build-edges.ts +229 -275
- package/src/domain/graph/builder/stages/build-structure.ts +115 -82
- package/src/domain/graph/builder/stages/detect-changes.ts +107 -64
- package/src/domain/graph/builder/stages/finalize.ts +72 -70
- package/src/domain/graph/builder/stages/insert-nodes.ts +154 -120
- package/src/domain/graph/builder/stages/native-db-lifecycle.ts +74 -0
- package/src/domain/graph/builder/stages/native-orchestrator.ts +942 -0
- package/src/domain/graph/cycles.ts +51 -49
- package/src/domain/graph/journal.ts +84 -69
- package/src/domain/graph/watcher.ts +12 -4
- package/src/domain/parser.ts +143 -66
- package/src/domain/search/generator.ts +132 -74
- package/src/domain/search/models.ts +39 -3
- package/src/domain/search/search/hybrid.ts +53 -42
- package/src/domain/search/search/semantic.ts +105 -65
- package/src/domain/wasm-worker-entry.ts +235 -152
- package/src/extractors/elixir.ts +91 -64
- package/src/extractors/gleam.ts +33 -37
- package/src/extractors/helpers.ts +205 -1
- package/src/extractors/java.ts +42 -45
- package/src/extractors/javascript.ts +44 -43
- package/src/extractors/julia.ts +28 -35
- package/src/extractors/r.ts +38 -56
- package/src/extractors/solidity.ts +43 -71
- package/src/features/boundaries.ts +64 -46
- package/src/features/cfg.ts +145 -74
- package/src/features/check.ts +60 -43
- package/src/features/cochange.ts +95 -72
- package/src/features/complexity.ts +134 -79
- package/src/features/dataflow.ts +57 -34
- package/src/features/flow.ts +48 -24
- package/src/features/graph-enrichment.ts +105 -70
- package/src/features/owners.ts +186 -146
- package/src/features/sequence.ts +99 -69
- package/src/features/structure-query.ts +94 -79
- package/src/features/structure.ts +199 -79
- package/src/graph/algorithms/leiden/optimiser.ts +142 -87
- package/src/graph/classifiers/roles.ts +64 -54
- package/src/infrastructure/config.ts +1 -1
- package/src/presentation/cfg.ts +48 -32
- package/src/presentation/flow.ts +100 -52
- package/src/types.ts +1 -1
|
@@ -8,6 +8,19 @@ import { embed, getModelConfig } from './models.js';
|
|
|
8
8
|
import { buildSourceText } from './strategies/source.js';
|
|
9
9
|
import { buildStructuredText } from './strategies/structured.js';
|
|
10
10
|
|
|
11
|
+
type EmbeddingNode = NodeRow & { id: number };
|
|
12
|
+
type EmbeddingStrategy = 'structured' | 'source';
|
|
13
|
+
|
|
14
|
+
interface PreparedEmbeddings {
|
|
15
|
+
texts: string[];
|
|
16
|
+
nodeIds: number[];
|
|
17
|
+
nodeNames: string[];
|
|
18
|
+
previews: string[];
|
|
19
|
+
overflowCount: number;
|
|
20
|
+
filesRead: number;
|
|
21
|
+
filesSkipped: number;
|
|
22
|
+
}
|
|
23
|
+
|
|
11
24
|
/**
|
|
12
25
|
* Rough token estimate (~4 chars per token for code/English).
|
|
13
26
|
* Conservative — avoids adding a tokenizer dependency.
|
|
@@ -47,47 +60,22 @@ function initEmbeddingsSchema(db: BetterSqlite3Database): void {
|
|
|
47
60
|
`);
|
|
48
61
|
}
|
|
49
62
|
|
|
50
|
-
export interface BuildEmbeddingsOptions {
|
|
51
|
-
strategy?: 'structured' | 'source';
|
|
52
|
-
}
|
|
53
|
-
|
|
54
63
|
/**
|
|
55
|
-
*
|
|
64
|
+
* Resolve the repo root for embedding. Prefer the root recorded at build time;
|
|
65
|
+
* fall back to `<dbParent>` only when the DB lives at the conventional
|
|
66
|
+
* `<root>/.codegraph/graph.db` layout — otherwise trust the caller's rootDir.
|
|
56
67
|
*/
|
|
57
|
-
|
|
58
|
-
rootDir: string,
|
|
59
|
-
modelKey: string,
|
|
60
|
-
customDbPath?: string,
|
|
61
|
-
options: BuildEmbeddingsOptions = {},
|
|
62
|
-
): Promise<void> {
|
|
63
|
-
const strategy = options.strategy || 'structured';
|
|
64
|
-
const dbPath = customDbPath || findDbPath(undefined);
|
|
65
|
-
|
|
66
|
-
if (!fs.existsSync(dbPath)) {
|
|
67
|
-
throw new DbError(
|
|
68
|
-
`No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`,
|
|
69
|
-
{ file: dbPath },
|
|
70
|
-
);
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
const db = openDb(dbPath) as BetterSqlite3Database;
|
|
74
|
-
initEmbeddingsSchema(db);
|
|
75
|
-
|
|
76
|
-
// Prefer the repo root recorded at build time — embed may be invoked from a
|
|
77
|
-
// different cwd (e.g. `codegraph embed --db /abs/path/graph.db`) and the
|
|
78
|
-
// positional rootDir will be wrong in that case. For legacy DBs without
|
|
79
|
-
// root_dir metadata, fall back to `<dbParent>` only when the DB lives at
|
|
80
|
-
// the conventional `<root>/.codegraph/graph.db` layout — otherwise trust
|
|
81
|
-
// the caller-provided rootDir (which may be an explicit positional arg).
|
|
82
|
-
// `path.dirname(...)` is always non-empty (`'.'` at minimum), so the
|
|
83
|
-
// conventional-layout check is required to keep the rootDir path reachable.
|
|
68
|
+
function resolveRoot(db: BetterSqlite3Database, dbPath: string, rootDir: string): string {
|
|
84
69
|
const metaRoot = getBuildMeta(db, 'root_dir');
|
|
85
70
|
const resolvedDbPath = path.resolve(dbPath);
|
|
86
71
|
const dbDirName = path.basename(path.dirname(resolvedDbPath));
|
|
87
72
|
const dbParent =
|
|
88
73
|
dbDirName === '.codegraph' ? path.dirname(path.dirname(resolvedDbPath)) : undefined;
|
|
89
|
-
|
|
74
|
+
return metaRoot || dbParent || rootDir;
|
|
75
|
+
}
|
|
90
76
|
|
|
77
|
+
/** Reset embedding tables and load eligible symbols grouped by file. */
|
|
78
|
+
function loadNodesByFile(db: BetterSqlite3Database): Map<string, EmbeddingNode[]> {
|
|
91
79
|
db.exec('DELETE FROM embeddings');
|
|
92
80
|
db.exec('DELETE FROM embedding_meta');
|
|
93
81
|
db.exec('DELETE FROM fts_index');
|
|
@@ -96,22 +84,52 @@ export async function buildEmbeddings(
|
|
|
96
84
|
.prepare(
|
|
97
85
|
`SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`,
|
|
98
86
|
)
|
|
99
|
-
.all() as
|
|
87
|
+
.all() as EmbeddingNode[];
|
|
100
88
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
const byFile = new Map<string, typeof nodes>();
|
|
89
|
+
const byFile = new Map<string, EmbeddingNode[]>();
|
|
104
90
|
for (const node of nodes) {
|
|
105
91
|
if (!byFile.has(node.file)) byFile.set(node.file, []);
|
|
106
92
|
byFile.get(node.file)?.push(node);
|
|
107
93
|
}
|
|
94
|
+
return byFile;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/** Build embedding text for a single node, truncating if it would overflow. */
|
|
98
|
+
function buildNodeText(
|
|
99
|
+
node: EmbeddingNode,
|
|
100
|
+
file: string,
|
|
101
|
+
lines: string[],
|
|
102
|
+
db: BetterSqlite3Database,
|
|
103
|
+
strategy: EmbeddingStrategy,
|
|
104
|
+
contextWindow: number,
|
|
105
|
+
): { text: string; overflowed: boolean } {
|
|
106
|
+
let text =
|
|
107
|
+
strategy === 'structured'
|
|
108
|
+
? buildStructuredText(node, file, lines, db)
|
|
109
|
+
: buildSourceText(node, file, lines);
|
|
110
|
+
const tokens = estimateTokens(text);
|
|
111
|
+
if (tokens > contextWindow) {
|
|
112
|
+
text = text.slice(0, contextWindow * 4);
|
|
113
|
+
return { text, overflowed: true };
|
|
114
|
+
}
|
|
115
|
+
return { text, overflowed: false };
|
|
116
|
+
}
|
|
108
117
|
|
|
118
|
+
/**
|
|
119
|
+
* Walk files in the graph, read source, and produce parallel arrays of
|
|
120
|
+
* texts / nodeIds / nodeNames / previews ready for embedding.
|
|
121
|
+
*/
|
|
122
|
+
function prepareEmbeddingTexts(
|
|
123
|
+
byFile: Map<string, EmbeddingNode[]>,
|
|
124
|
+
db: BetterSqlite3Database,
|
|
125
|
+
resolvedRoot: string,
|
|
126
|
+
strategy: EmbeddingStrategy,
|
|
127
|
+
contextWindow: number,
|
|
128
|
+
): PreparedEmbeddings {
|
|
109
129
|
const texts: string[] = [];
|
|
110
130
|
const nodeIds: number[] = [];
|
|
111
131
|
const nodeNames: string[] = [];
|
|
112
132
|
const previews: string[] = [];
|
|
113
|
-
const config = getModelConfig(modelKey);
|
|
114
|
-
const contextWindow = config.contextWindow;
|
|
115
133
|
let overflowCount = 0;
|
|
116
134
|
let filesRead = 0;
|
|
117
135
|
let filesSkipped = 0;
|
|
@@ -129,19 +147,8 @@ export async function buildEmbeddings(
|
|
|
129
147
|
}
|
|
130
148
|
|
|
131
149
|
for (const node of fileNodes) {
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
? buildStructuredText(node, file, lines, db)
|
|
135
|
-
: buildSourceText(node, file, lines);
|
|
136
|
-
|
|
137
|
-
// Detect and handle context window overflow
|
|
138
|
-
const tokens = estimateTokens(text);
|
|
139
|
-
if (tokens > contextWindow) {
|
|
140
|
-
overflowCount++;
|
|
141
|
-
const maxChars = contextWindow * 4;
|
|
142
|
-
text = text.slice(0, maxChars);
|
|
143
|
-
}
|
|
144
|
-
|
|
150
|
+
const { text, overflowed } = buildNodeText(node, file, lines, db, strategy, contextWindow);
|
|
151
|
+
if (overflowed) overflowCount++;
|
|
145
152
|
texts.push(text);
|
|
146
153
|
nodeIds.push(node.id);
|
|
147
154
|
nodeNames.push(node.name);
|
|
@@ -149,28 +156,19 @@ export async function buildEmbeddings(
|
|
|
149
156
|
}
|
|
150
157
|
}
|
|
151
158
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
`${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`,
|
|
155
|
-
);
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
// If there were symbols to embed but every file failed to read, the DB was
|
|
159
|
-
// almost certainly built from a different location than the current cwd.
|
|
160
|
-
// Surface this clearly instead of emitting a silent "Stored 0 embeddings".
|
|
161
|
-
if (byFile.size > 0 && filesRead === 0) {
|
|
162
|
-
closeDb(db);
|
|
163
|
-
throw new DbError(
|
|
164
|
-
`embed: could not read any of the ${filesSkipped} source files recorded in the graph — the DB may have been built from a different location than the current working directory.\n` +
|
|
165
|
-
`Tried resolving against: ${resolvedRoot}\n` +
|
|
166
|
-
'Pass a positional <dir> argument pointing at the original repo root, or re-run "codegraph build" from that directory.',
|
|
167
|
-
{ file: dbPath },
|
|
168
|
-
);
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
console.log(`Embedding ${texts.length} symbols...`);
|
|
172
|
-
const { vectors, dim } = await embed(texts, modelKey);
|
|
159
|
+
return { texts, nodeIds, nodeNames, previews, overflowCount, filesRead, filesSkipped };
|
|
160
|
+
}
|
|
173
161
|
|
|
162
|
+
/** Persist vectors, FTS rows, and embedding metadata in a single transaction. */
|
|
163
|
+
function persistEmbeddings(
|
|
164
|
+
db: BetterSqlite3Database,
|
|
165
|
+
prepared: PreparedEmbeddings,
|
|
166
|
+
vectors: Float32Array[],
|
|
167
|
+
dim: number,
|
|
168
|
+
modelName: string,
|
|
169
|
+
strategy: EmbeddingStrategy,
|
|
170
|
+
): void {
|
|
171
|
+
const { nodeIds, nodeNames, previews, texts, overflowCount } = prepared;
|
|
174
172
|
const insert = db.prepare(
|
|
175
173
|
'INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview, full_text) VALUES (?, ?, ?, ?)',
|
|
176
174
|
);
|
|
@@ -182,7 +180,7 @@ export async function buildEmbeddings(
|
|
|
182
180
|
insert.run(nodeIds[i], Buffer.from(vec.buffer), previews[i], texts[i]);
|
|
183
181
|
insertFts.run(nodeIds[i], nodeNames[i], texts[i]);
|
|
184
182
|
}
|
|
185
|
-
insertMeta.run('model',
|
|
183
|
+
insertMeta.run('model', modelName);
|
|
186
184
|
insertMeta.run('dim', String(dim));
|
|
187
185
|
insertMeta.run('count', String(vectors.length));
|
|
188
186
|
insertMeta.run('fts_count', String(vectors.length));
|
|
@@ -193,6 +191,66 @@ export async function buildEmbeddings(
|
|
|
193
191
|
}
|
|
194
192
|
});
|
|
195
193
|
insertAll();
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
export interface BuildEmbeddingsOptions {
|
|
197
|
+
strategy?: EmbeddingStrategy;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
/**
|
|
201
|
+
* Build embeddings for all functions/methods/classes in the graph.
|
|
202
|
+
*/
|
|
203
|
+
export async function buildEmbeddings(
|
|
204
|
+
rootDir: string,
|
|
205
|
+
modelKey: string,
|
|
206
|
+
customDbPath?: string,
|
|
207
|
+
options: BuildEmbeddingsOptions = {},
|
|
208
|
+
): Promise<void> {
|
|
209
|
+
const strategy = options.strategy || 'structured';
|
|
210
|
+
const dbPath = customDbPath || findDbPath(undefined);
|
|
211
|
+
|
|
212
|
+
if (!fs.existsSync(dbPath)) {
|
|
213
|
+
throw new DbError(
|
|
214
|
+
`No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`,
|
|
215
|
+
{ file: dbPath },
|
|
216
|
+
);
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
const db = openDb(dbPath) as BetterSqlite3Database;
|
|
220
|
+
initEmbeddingsSchema(db);
|
|
221
|
+
|
|
222
|
+
const resolvedRoot = resolveRoot(db, dbPath, rootDir);
|
|
223
|
+
const byFile = loadNodesByFile(db);
|
|
224
|
+
|
|
225
|
+
const nodeCount = [...byFile.values()].reduce((acc, list) => acc + list.length, 0);
|
|
226
|
+
console.log(`Building embeddings for ${nodeCount} symbols (strategy: ${strategy})...`);
|
|
227
|
+
|
|
228
|
+
const config = getModelConfig(modelKey);
|
|
229
|
+
const prepared = prepareEmbeddingTexts(byFile, db, resolvedRoot, strategy, config.contextWindow);
|
|
230
|
+
|
|
231
|
+
if (prepared.overflowCount > 0) {
|
|
232
|
+
warn(
|
|
233
|
+
`${prepared.overflowCount} symbol(s) exceeded model context window (${config.contextWindow} tokens) and were truncated`,
|
|
234
|
+
);
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
// If there were symbols to embed but every file failed to read, the DB was
|
|
238
|
+
// almost certainly built from a different location than the current cwd.
|
|
239
|
+
// Surface this clearly instead of emitting a silent "Stored 0 embeddings".
|
|
240
|
+
if (byFile.size > 0 && prepared.filesRead === 0) {
|
|
241
|
+
closeDb(db);
|
|
242
|
+
throw new DbError(
|
|
243
|
+
`embed: could not read any of the ${prepared.filesSkipped} source files recorded in the graph — the DB may have been built from a different location than the current working directory.\n` +
|
|
244
|
+
`Tried resolving against: ${resolvedRoot}\n` +
|
|
245
|
+
'Pass a positional <dir> argument pointing at the original repo root, or re-run "codegraph build" from that directory.',
|
|
246
|
+
{ file: dbPath },
|
|
247
|
+
);
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
console.log(`Embedding ${prepared.texts.length} symbols...`);
|
|
251
|
+
const { vectors, dim } = await embed(prepared.texts, modelKey);
|
|
252
|
+
|
|
253
|
+
persistEmbeddings(db, prepared, vectors as Float32Array[], dim, config.name, strategy);
|
|
196
254
|
|
|
197
255
|
console.log(
|
|
198
256
|
`\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`,
|
|
@@ -41,6 +41,8 @@ export interface ModelConfig {
|
|
|
41
41
|
contextWindow: number;
|
|
42
42
|
desc: string;
|
|
43
43
|
quantized: boolean;
|
|
44
|
+
/** Pooling strategy passed to the transformers pipeline. Defaults to 'mean'. */
|
|
45
|
+
pooling?: 'mean' | 'cls';
|
|
44
46
|
}
|
|
45
47
|
|
|
46
48
|
// Lazy-load transformers (heavy, optional module)
|
|
@@ -91,7 +93,7 @@ export const MODELS: Record<string, ModelConfig> = {
|
|
|
91
93
|
name: 'nomic-ai/nomic-embed-text-v1.5',
|
|
92
94
|
dim: 768,
|
|
93
95
|
contextWindow: 8192,
|
|
94
|
-
desc: '
|
|
96
|
+
desc: 'Matryoshka MRL trained (~137MB). 8192 context. Codegraph stores full 768d (no truncation); v1 scores higher on our benchmark.',
|
|
95
97
|
quantized: false,
|
|
96
98
|
},
|
|
97
99
|
'bge-large': {
|
|
@@ -101,11 +103,41 @@ export const MODELS: Record<string, ModelConfig> = {
|
|
|
101
103
|
desc: 'Best general retrieval (~335MB). Top MTEB scores.',
|
|
102
104
|
quantized: false,
|
|
103
105
|
},
|
|
106
|
+
'mxbai-xsmall': {
|
|
107
|
+
name: 'mixedbread-ai/mxbai-embed-xsmall-v1',
|
|
108
|
+
dim: 384,
|
|
109
|
+
contextWindow: 4096,
|
|
110
|
+
desc: 'Tiny model with long context (~50MB). 4096 ctx.',
|
|
111
|
+
quantized: false,
|
|
112
|
+
pooling: 'cls',
|
|
113
|
+
},
|
|
114
|
+
'mxbai-large': {
|
|
115
|
+
name: 'mixedbread-ai/mxbai-embed-large-v1',
|
|
116
|
+
dim: 1024,
|
|
117
|
+
contextWindow: 512,
|
|
118
|
+
desc: 'Top MTEB BERT-large, Matryoshka dimensions (~400MB). 512 ctx.',
|
|
119
|
+
quantized: false,
|
|
120
|
+
pooling: 'cls',
|
|
121
|
+
},
|
|
122
|
+
'bge-m3': {
|
|
123
|
+
name: 'Xenova/bge-m3',
|
|
124
|
+
dim: 1024,
|
|
125
|
+
contextWindow: 8192,
|
|
126
|
+
desc: 'Multilingual, multi-task (~600MB). 100+ languages, 8192 context.',
|
|
127
|
+
quantized: false,
|
|
128
|
+
},
|
|
129
|
+
modernbert: {
|
|
130
|
+
name: 'nomic-ai/modernbert-embed-base',
|
|
131
|
+
dim: 768,
|
|
132
|
+
contextWindow: 8192,
|
|
133
|
+
desc: 'ModernBERT base (~150MB). Newer architecture, 8192 ctx, English.',
|
|
134
|
+
quantized: false,
|
|
135
|
+
},
|
|
104
136
|
};
|
|
105
137
|
|
|
106
138
|
export const EMBEDDING_STRATEGIES: readonly string[] = ['structured', 'source'];
|
|
107
139
|
|
|
108
|
-
export const DEFAULT_MODEL: string = 'nomic
|
|
140
|
+
export const DEFAULT_MODEL: string = 'nomic';
|
|
109
141
|
const NPM_BIN = process.platform === 'win32' ? 'npm.cmd' : 'npm';
|
|
110
142
|
const BATCH_SIZE_MAP: Record<string, number> = {
|
|
111
143
|
minilm: 32,
|
|
@@ -115,6 +147,10 @@ const BATCH_SIZE_MAP: Record<string, number> = {
|
|
|
115
147
|
nomic: 8,
|
|
116
148
|
'nomic-v1.5': 8,
|
|
117
149
|
'bge-large': 4,
|
|
150
|
+
'mxbai-xsmall': 32,
|
|
151
|
+
'mxbai-large': 4,
|
|
152
|
+
'bge-m3': 4,
|
|
153
|
+
modernbert: 8,
|
|
118
154
|
};
|
|
119
155
|
const DEFAULT_BATCH_SIZE = 32;
|
|
120
156
|
|
|
@@ -274,7 +310,7 @@ export async function embed(
|
|
|
274
310
|
const batch = texts.slice(i, i + batchSize);
|
|
275
311
|
const output =
|
|
276
312
|
(await // biome-ignore lint/complexity/noBannedTypes: dynamically loaded extractor is untyped
|
|
277
|
-
(ext as Function)(batch, { pooling: 'mean', normalize: true })) as {
|
|
313
|
+
(ext as Function)(batch, { pooling: config.pooling ?? 'mean', normalize: true })) as {
|
|
278
314
|
data: number[];
|
|
279
315
|
};
|
|
280
316
|
|
|
@@ -105,61 +105,72 @@ async function collectRankedLists(
|
|
|
105
105
|
return rankedLists;
|
|
106
106
|
}
|
|
107
107
|
|
|
108
|
+
/** Initialise a fusion entry seeded from the first ranked item we see for a key. */
|
|
109
|
+
function createFusionEntry(item: RankedItem): FusionEntry {
|
|
110
|
+
return {
|
|
111
|
+
name: item.name,
|
|
112
|
+
kind: item.kind,
|
|
113
|
+
file: item.file,
|
|
114
|
+
line: item.line,
|
|
115
|
+
endLine: (item.endLine as number | null) ?? null,
|
|
116
|
+
role: (item.role as string | null) ?? null,
|
|
117
|
+
fileHash: (item.fileHash as string | null) ?? null,
|
|
118
|
+
rrfScore: 0,
|
|
119
|
+
bm25Score: null,
|
|
120
|
+
bm25Rank: null,
|
|
121
|
+
similarity: null,
|
|
122
|
+
semanticRank: null,
|
|
123
|
+
};
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
/** Merge a single ranked item into its fusion entry: update RRF and best per-source rank. */
|
|
127
|
+
function mergeRankedItem(entry: FusionEntry, item: RankedItem, k: number): void {
|
|
128
|
+
entry.rrfScore += 1 / (k + item.rank);
|
|
129
|
+
if (item.source === 'bm25') {
|
|
130
|
+
if (entry.bm25Rank === null || item.rank < entry.bm25Rank) {
|
|
131
|
+
entry.bm25Score = item.bm25Score ?? null;
|
|
132
|
+
entry.bm25Rank = item.rank;
|
|
133
|
+
}
|
|
134
|
+
} else if (entry.semanticRank === null || item.rank < entry.semanticRank) {
|
|
135
|
+
entry.similarity = item.similarity ?? null;
|
|
136
|
+
entry.semanticRank = item.rank;
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
/** Flatten a fusion entry into the public-facing hybrid result shape. */
|
|
141
|
+
function toHybridResult(e: FusionEntry): HybridResult {
|
|
142
|
+
return {
|
|
143
|
+
name: e.name,
|
|
144
|
+
kind: e.kind,
|
|
145
|
+
file: e.file,
|
|
146
|
+
line: e.line,
|
|
147
|
+
endLine: e.endLine,
|
|
148
|
+
role: e.role,
|
|
149
|
+
fileHash: e.fileHash,
|
|
150
|
+
rrf: e.rrfScore,
|
|
151
|
+
bm25Score: e.bm25Score,
|
|
152
|
+
bm25Rank: e.bm25Rank,
|
|
153
|
+
similarity: e.similarity,
|
|
154
|
+
semanticRank: e.semanticRank,
|
|
155
|
+
};
|
|
156
|
+
}
|
|
157
|
+
|
|
108
158
|
/** Reciprocal Rank Fusion: merge ranked lists into a single scored result set. */
|
|
109
159
|
function fuseResults(rankedLists: RankedItem[][], k: number, limit: number): HybridResult[] {
|
|
110
160
|
const fusionMap = new Map<string, FusionEntry>();
|
|
111
|
-
|
|
112
161
|
for (const list of rankedLists) {
|
|
113
162
|
for (const item of list) {
|
|
114
163
|
if (!fusionMap.has(item.key)) {
|
|
115
|
-
fusionMap.set(item.key,
|
|
116
|
-
name: item.name,
|
|
117
|
-
kind: item.kind,
|
|
118
|
-
file: item.file,
|
|
119
|
-
line: item.line,
|
|
120
|
-
endLine: (item.endLine as number | null) ?? null,
|
|
121
|
-
role: (item.role as string | null) ?? null,
|
|
122
|
-
fileHash: (item.fileHash as string | null) ?? null,
|
|
123
|
-
rrfScore: 0,
|
|
124
|
-
bm25Score: null,
|
|
125
|
-
bm25Rank: null,
|
|
126
|
-
similarity: null,
|
|
127
|
-
semanticRank: null,
|
|
128
|
-
});
|
|
129
|
-
}
|
|
130
|
-
const entry = fusionMap.get(item.key)!;
|
|
131
|
-
entry.rrfScore += 1 / (k + item.rank);
|
|
132
|
-
if (item.source === 'bm25') {
|
|
133
|
-
if (entry.bm25Rank === null || item.rank < entry.bm25Rank) {
|
|
134
|
-
entry.bm25Score = (item as RankedItem & { bm25Score?: number }).bm25Score ?? null;
|
|
135
|
-
entry.bm25Rank = item.rank;
|
|
136
|
-
}
|
|
137
|
-
} else {
|
|
138
|
-
if (entry.semanticRank === null || item.rank < entry.semanticRank) {
|
|
139
|
-
entry.similarity = (item as RankedItem & { similarity?: number }).similarity ?? null;
|
|
140
|
-
entry.semanticRank = item.rank;
|
|
141
|
-
}
|
|
164
|
+
fusionMap.set(item.key, createFusionEntry(item));
|
|
142
165
|
}
|
|
166
|
+
mergeRankedItem(fusionMap.get(item.key)!, item, k);
|
|
143
167
|
}
|
|
144
168
|
}
|
|
145
169
|
|
|
146
170
|
return [...fusionMap.values()]
|
|
147
171
|
.sort((a, b) => b.rrfScore - a.rrfScore)
|
|
148
172
|
.slice(0, limit)
|
|
149
|
-
.map(
|
|
150
|
-
name: e.name,
|
|
151
|
-
kind: e.kind,
|
|
152
|
-
file: e.file,
|
|
153
|
-
line: e.line,
|
|
154
|
-
endLine: e.endLine,
|
|
155
|
-
role: e.role,
|
|
156
|
-
fileHash: e.fileHash,
|
|
157
|
-
rrf: e.rrfScore,
|
|
158
|
-
bm25Score: e.bm25Score,
|
|
159
|
-
bm25Rank: e.bm25Rank,
|
|
160
|
-
similarity: e.similarity,
|
|
161
|
-
semanticRank: e.semanticRank,
|
|
162
|
-
}));
|
|
173
|
+
.map(toHybridResult);
|
|
163
174
|
}
|
|
164
175
|
|
|
165
176
|
export async function hybridSearchData(
|
|
@@ -4,7 +4,7 @@ import type { BetterSqlite3Database, CodegraphConfig } from '../../../types.js';
|
|
|
4
4
|
import { normalizeSymbol } from '../../queries.js';
|
|
5
5
|
import { embed } from '../models.js';
|
|
6
6
|
import { cosineSim } from '../stores/sqlite-blob.js';
|
|
7
|
-
import { prepareSearch } from './prepare.js';
|
|
7
|
+
import { type PreparedSearch, prepareSearch } from './prepare.js';
|
|
8
8
|
|
|
9
9
|
export interface SemanticSearchOpts {
|
|
10
10
|
config?: CodegraphConfig;
|
|
@@ -30,6 +30,25 @@ export interface SearchDataResult {
|
|
|
30
30
|
results: SemanticResult[];
|
|
31
31
|
}
|
|
32
32
|
|
|
33
|
+
type StoredRow = PreparedSearch['rows'][number];
|
|
34
|
+
|
|
35
|
+
/** Reconstitute a stored embedding row's vector blob into a Float32Array. */
|
|
36
|
+
function rowVector(row: StoredRow): Float32Array {
|
|
37
|
+
return new Float32Array(new Uint8Array(row.vector as unknown as ArrayBuffer).buffer);
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/** Warn when stored embeddings and the query model use different dimensions. */
|
|
41
|
+
function checkDimensionMismatch(storedDim: number | null, dim: number): boolean {
|
|
42
|
+
if (storedDim && dim !== storedDim) {
|
|
43
|
+
console.log(
|
|
44
|
+
`Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
|
|
45
|
+
);
|
|
46
|
+
console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`);
|
|
47
|
+
return true;
|
|
48
|
+
}
|
|
49
|
+
return false;
|
|
50
|
+
}
|
|
51
|
+
|
|
33
52
|
export async function searchData(
|
|
34
53
|
query: string,
|
|
35
54
|
customDbPath: string | undefined,
|
|
@@ -50,20 +69,12 @@ export async function searchData(
|
|
|
50
69
|
dim,
|
|
51
70
|
} = await embed([query], modelKey ?? undefined);
|
|
52
71
|
|
|
53
|
-
if (storedDim
|
|
54
|
-
console.log(
|
|
55
|
-
`Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
|
|
56
|
-
);
|
|
57
|
-
console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`);
|
|
58
|
-
return null;
|
|
59
|
-
}
|
|
72
|
+
if (checkDimensionMismatch(storedDim, dim)) return null;
|
|
60
73
|
|
|
61
74
|
const hc = new Map<string, string>();
|
|
62
75
|
const results: SemanticResult[] = [];
|
|
63
76
|
for (const row of rows) {
|
|
64
|
-
const
|
|
65
|
-
const sim = cosineSim(queryVec!, vec);
|
|
66
|
-
|
|
77
|
+
const sim = cosineSim(queryVec!, rowVector(row));
|
|
67
78
|
if (sim >= minScore) {
|
|
68
79
|
results.push({
|
|
69
80
|
...normalizeSymbol(row, db as BetterSqlite3Database, hc),
|
|
@@ -91,6 +102,82 @@ export interface MultiSearchResult {
|
|
|
91
102
|
}>;
|
|
92
103
|
}
|
|
93
104
|
|
|
105
|
+
interface RankedHit {
|
|
106
|
+
rowIndex: number;
|
|
107
|
+
similarity: number;
|
|
108
|
+
rank: number;
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
interface FusionEntry {
|
|
112
|
+
rrfScore: number;
|
|
113
|
+
queryScores: Array<{ query: string; similarity: number; rank: number }>;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/**
|
|
117
|
+
* Emit a warning for any query pair whose embeddings are nearly identical,
|
|
118
|
+
* since RRF would over-weight matches shared between them.
|
|
119
|
+
*/
|
|
120
|
+
function warnOnSimilarQueries(
|
|
121
|
+
queries: string[],
|
|
122
|
+
queryVecs: Float32Array[],
|
|
123
|
+
threshold: number,
|
|
124
|
+
): void {
|
|
125
|
+
for (let i = 0; i < queryVecs.length; i++) {
|
|
126
|
+
for (let j = i + 1; j < queryVecs.length; j++) {
|
|
127
|
+
const sim = cosineSim(queryVecs[i]!, queryVecs[j]!);
|
|
128
|
+
if (sim >= threshold) {
|
|
129
|
+
warn(
|
|
130
|
+
`Queries "${queries[i]}" and "${queries[j]}" are very similar ` +
|
|
131
|
+
`(${(sim * 100).toFixed(0)}% cosine similarity). ` +
|
|
132
|
+
`This may bias RRF results toward their shared matches. ` +
|
|
133
|
+
`Consider using more distinct queries.`,
|
|
134
|
+
);
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
/** Rank stored rows for a single query, keeping only those above minScore. */
|
|
141
|
+
function rankRowsForQuery(
|
|
142
|
+
queryVec: Float32Array,
|
|
143
|
+
rowVecs: Float32Array[],
|
|
144
|
+
minScore: number,
|
|
145
|
+
): RankedHit[] {
|
|
146
|
+
const scored: Array<{ rowIndex: number; similarity: number }> = [];
|
|
147
|
+
for (let ri = 0; ri < rowVecs.length; ri++) {
|
|
148
|
+
const sim = cosineSim(queryVec, rowVecs[ri]!);
|
|
149
|
+
if (sim >= minScore) {
|
|
150
|
+
scored.push({ rowIndex: ri, similarity: sim });
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
scored.sort((a, b) => b.similarity - a.similarity);
|
|
154
|
+
return scored.map((item, rank) => ({ ...item, rank: rank + 1 }));
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
/** Reciprocal Rank Fusion across each query's ranked hits. */
|
|
158
|
+
function fuseRankedHits(
|
|
159
|
+
queries: string[],
|
|
160
|
+
perQueryRanked: RankedHit[][],
|
|
161
|
+
k: number,
|
|
162
|
+
): Map<number, FusionEntry> {
|
|
163
|
+
const fusionMap = new Map<number, FusionEntry>();
|
|
164
|
+
for (let qi = 0; qi < queries.length; qi++) {
|
|
165
|
+
for (const item of perQueryRanked[qi]!) {
|
|
166
|
+
if (!fusionMap.has(item.rowIndex)) {
|
|
167
|
+
fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] });
|
|
168
|
+
}
|
|
169
|
+
const entry = fusionMap.get(item.rowIndex)!;
|
|
170
|
+
entry.rrfScore += 1 / (k + item.rank);
|
|
171
|
+
entry.queryScores.push({
|
|
172
|
+
query: queries[qi]!,
|
|
173
|
+
similarity: item.similarity,
|
|
174
|
+
rank: item.rank,
|
|
175
|
+
});
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
return fusionMap;
|
|
179
|
+
}
|
|
180
|
+
|
|
94
181
|
export async function multiSearchData(
|
|
95
182
|
queries: string[],
|
|
96
183
|
customDbPath: string | undefined,
|
|
@@ -101,6 +188,7 @@ export async function multiSearchData(
|
|
|
101
188
|
const limit = opts.limit ?? searchCfg.topK ?? 15;
|
|
102
189
|
const minScore = opts.minScore ?? searchCfg.defaultMinScore ?? 0.2;
|
|
103
190
|
const k = opts.rrfK ?? searchCfg.rrfK ?? 60;
|
|
191
|
+
const similarityWarnThreshold = searchCfg.similarityWarnThreshold ?? 0.85;
|
|
104
192
|
|
|
105
193
|
const prepared = prepareSearch(customDbPath, opts);
|
|
106
194
|
if (!prepared) return null;
|
|
@@ -109,63 +197,15 @@ export async function multiSearchData(
|
|
|
109
197
|
try {
|
|
110
198
|
const { vectors: queryVecs, dim } = await embed(queries, modelKey ?? undefined);
|
|
111
199
|
|
|
112
|
-
|
|
113
|
-
for (let i = 0; i < queryVecs.length; i++) {
|
|
114
|
-
for (let j = i + 1; j < queryVecs.length; j++) {
|
|
115
|
-
const sim = cosineSim(queryVecs[i]!, queryVecs[j]!);
|
|
116
|
-
if (sim >= SIMILARITY_WARN_THRESHOLD) {
|
|
117
|
-
warn(
|
|
118
|
-
`Queries "${queries[i]}" and "${queries[j]}" are very similar ` +
|
|
119
|
-
`(${(sim * 100).toFixed(0)}% cosine similarity). ` +
|
|
120
|
-
`This may bias RRF results toward their shared matches. ` +
|
|
121
|
-
`Consider using more distinct queries.`,
|
|
122
|
-
);
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
}
|
|
200
|
+
warnOnSimilarQueries(queries, queryVecs as Float32Array[], similarityWarnThreshold);
|
|
126
201
|
|
|
127
|
-
if (storedDim
|
|
128
|
-
console.log(
|
|
129
|
-
`Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
|
|
130
|
-
);
|
|
131
|
-
console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`);
|
|
132
|
-
return null;
|
|
133
|
-
}
|
|
202
|
+
if (checkDimensionMismatch(storedDim, dim)) return null;
|
|
134
203
|
|
|
135
|
-
const rowVecs = rows.map(
|
|
136
|
-
|
|
204
|
+
const rowVecs = rows.map(rowVector);
|
|
205
|
+
const perQueryRanked = queries.map((_q, qi) =>
|
|
206
|
+
rankRowsForQuery(queryVecs[qi]!, rowVecs, minScore),
|
|
137
207
|
);
|
|
138
|
-
|
|
139
|
-
const perQueryRanked = queries.map((_query, qi) => {
|
|
140
|
-
const scored: Array<{ rowIndex: number; similarity: number }> = [];
|
|
141
|
-
for (let ri = 0; ri < rows.length; ri++) {
|
|
142
|
-
const sim = cosineSim(queryVecs[qi]!, rowVecs[ri]!);
|
|
143
|
-
if (sim >= minScore) {
|
|
144
|
-
scored.push({ rowIndex: ri, similarity: sim });
|
|
145
|
-
}
|
|
146
|
-
}
|
|
147
|
-
scored.sort((a, b) => b.similarity - a.similarity);
|
|
148
|
-
return scored.map((item, rank) => ({ ...item, rank: rank + 1 }));
|
|
149
|
-
});
|
|
150
|
-
|
|
151
|
-
const fusionMap = new Map<
|
|
152
|
-
number,
|
|
153
|
-
{ rrfScore: number; queryScores: Array<{ query: string; similarity: number; rank: number }> }
|
|
154
|
-
>();
|
|
155
|
-
for (let qi = 0; qi < queries.length; qi++) {
|
|
156
|
-
for (const item of perQueryRanked[qi]!) {
|
|
157
|
-
if (!fusionMap.has(item.rowIndex)) {
|
|
158
|
-
fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] });
|
|
159
|
-
}
|
|
160
|
-
const entry = fusionMap.get(item.rowIndex)!;
|
|
161
|
-
entry.rrfScore += 1 / (k + item.rank);
|
|
162
|
-
entry.queryScores.push({
|
|
163
|
-
query: queries[qi]!,
|
|
164
|
-
similarity: item.similarity,
|
|
165
|
-
rank: item.rank,
|
|
166
|
-
});
|
|
167
|
-
}
|
|
168
|
-
}
|
|
208
|
+
const fusionMap = fuseRankedHits(queries, perQueryRanked, k);
|
|
169
209
|
|
|
170
210
|
const hc = new Map<string, string>();
|
|
171
211
|
const results: MultiSearchResult['results'] = [];
|