@optave/codegraph 3.11.0 → 3.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (230) hide show
  1. package/README.md +38 -31
  2. package/dist/ast-analysis/engine.d.ts.map +1 -1
  3. package/dist/ast-analysis/engine.js +91 -60
  4. package/dist/ast-analysis/engine.js.map +1 -1
  5. package/dist/ast-analysis/visitor-utils.d.ts +3 -0
  6. package/dist/ast-analysis/visitor-utils.d.ts.map +1 -1
  7. package/dist/ast-analysis/visitor-utils.js +83 -49
  8. package/dist/ast-analysis/visitor-utils.js.map +1 -1
  9. package/dist/ast-analysis/visitors/ast-store-visitor.d.ts.map +1 -1
  10. package/dist/ast-analysis/visitors/ast-store-visitor.js +78 -62
  11. package/dist/ast-analysis/visitors/ast-store-visitor.js.map +1 -1
  12. package/dist/ast-analysis/visitors/dataflow-visitor.d.ts.map +1 -1
  13. package/dist/ast-analysis/visitors/dataflow-visitor.js +61 -42
  14. package/dist/ast-analysis/visitors/dataflow-visitor.js.map +1 -1
  15. package/dist/cli/commands/embed.d.ts.map +1 -1
  16. package/dist/cli/commands/embed.js +49 -4
  17. package/dist/cli/commands/embed.js.map +1 -1
  18. package/dist/domain/analysis/dependencies.d.ts.map +1 -1
  19. package/dist/domain/analysis/dependencies.js +106 -80
  20. package/dist/domain/analysis/dependencies.js.map +1 -1
  21. package/dist/domain/analysis/fn-impact.d.ts.map +1 -1
  22. package/dist/domain/analysis/fn-impact.js +77 -52
  23. package/dist/domain/analysis/fn-impact.js.map +1 -1
  24. package/dist/domain/analysis/module-map.d.ts.map +1 -1
  25. package/dist/domain/analysis/module-map.js +132 -121
  26. package/dist/domain/analysis/module-map.js.map +1 -1
  27. package/dist/domain/graph/builder/call-resolver.d.ts +71 -0
  28. package/dist/domain/graph/builder/call-resolver.d.ts.map +1 -0
  29. package/dist/domain/graph/builder/call-resolver.js +130 -0
  30. package/dist/domain/graph/builder/call-resolver.js.map +1 -0
  31. package/dist/domain/graph/builder/helpers.d.ts +4 -4
  32. package/dist/domain/graph/builder/helpers.d.ts.map +1 -1
  33. package/dist/domain/graph/builder/helpers.js +47 -33
  34. package/dist/domain/graph/builder/helpers.js.map +1 -1
  35. package/dist/domain/graph/builder/incremental.d.ts +6 -0
  36. package/dist/domain/graph/builder/incremental.d.ts.map +1 -1
  37. package/dist/domain/graph/builder/incremental.js +214 -127
  38. package/dist/domain/graph/builder/incremental.js.map +1 -1
  39. package/dist/domain/graph/builder/pipeline.d.ts +1 -44
  40. package/dist/domain/graph/builder/pipeline.d.ts.map +1 -1
  41. package/dist/domain/graph/builder/pipeline.js +10 -766
  42. package/dist/domain/graph/builder/pipeline.js.map +1 -1
  43. package/dist/domain/graph/builder/stages/build-edges.d.ts.map +1 -1
  44. package/dist/domain/graph/builder/stages/build-edges.js +151 -192
  45. package/dist/domain/graph/builder/stages/build-edges.js.map +1 -1
  46. package/dist/domain/graph/builder/stages/build-structure.d.ts.map +1 -1
  47. package/dist/domain/graph/builder/stages/build-structure.js +82 -65
  48. package/dist/domain/graph/builder/stages/build-structure.js.map +1 -1
  49. package/dist/domain/graph/builder/stages/detect-changes.d.ts.map +1 -1
  50. package/dist/domain/graph/builder/stages/detect-changes.js +84 -56
  51. package/dist/domain/graph/builder/stages/detect-changes.js.map +1 -1
  52. package/dist/domain/graph/builder/stages/finalize.d.ts.map +1 -1
  53. package/dist/domain/graph/builder/stages/finalize.js +60 -51
  54. package/dist/domain/graph/builder/stages/finalize.js.map +1 -1
  55. package/dist/domain/graph/builder/stages/insert-nodes.d.ts +8 -6
  56. package/dist/domain/graph/builder/stages/insert-nodes.d.ts.map +1 -1
  57. package/dist/domain/graph/builder/stages/insert-nodes.js +107 -122
  58. package/dist/domain/graph/builder/stages/insert-nodes.js.map +1 -1
  59. package/dist/domain/graph/builder/stages/native-db-lifecycle.d.ts +14 -0
  60. package/dist/domain/graph/builder/stages/native-db-lifecycle.d.ts.map +1 -0
  61. package/dist/domain/graph/builder/stages/native-db-lifecycle.js +77 -0
  62. package/dist/domain/graph/builder/stages/native-db-lifecycle.js.map +1 -0
  63. package/dist/domain/graph/builder/stages/native-orchestrator.d.ts +62 -0
  64. package/dist/domain/graph/builder/stages/native-orchestrator.d.ts.map +1 -0
  65. package/dist/domain/graph/builder/stages/native-orchestrator.js +747 -0
  66. package/dist/domain/graph/builder/stages/native-orchestrator.js.map +1 -0
  67. package/dist/domain/graph/cycles.d.ts +6 -4
  68. package/dist/domain/graph/cycles.d.ts.map +1 -1
  69. package/dist/domain/graph/cycles.js +50 -55
  70. package/dist/domain/graph/cycles.js.map +1 -1
  71. package/dist/domain/graph/journal.d.ts.map +1 -1
  72. package/dist/domain/graph/journal.js +89 -70
  73. package/dist/domain/graph/journal.js.map +1 -1
  74. package/dist/domain/graph/watcher.d.ts.map +1 -1
  75. package/dist/domain/graph/watcher.js +10 -4
  76. package/dist/domain/graph/watcher.js.map +1 -1
  77. package/dist/domain/parser.d.ts +12 -23
  78. package/dist/domain/parser.d.ts.map +1 -1
  79. package/dist/domain/parser.js +126 -79
  80. package/dist/domain/parser.js.map +1 -1
  81. package/dist/domain/search/generator.d.ts +3 -1
  82. package/dist/domain/search/generator.d.ts.map +1 -1
  83. package/dist/domain/search/generator.js +68 -45
  84. package/dist/domain/search/generator.js.map +1 -1
  85. package/dist/domain/search/models.d.ts +2 -0
  86. package/dist/domain/search/models.d.ts.map +1 -1
  87. package/dist/domain/search/models.js +37 -3
  88. package/dist/domain/search/models.js.map +1 -1
  89. package/dist/domain/search/search/hybrid.d.ts.map +1 -1
  90. package/dist/domain/search/search/hybrid.js +49 -40
  91. package/dist/domain/search/search/hybrid.js.map +1 -1
  92. package/dist/domain/search/search/semantic.d.ts.map +1 -1
  93. package/dist/domain/search/search/semantic.js +69 -49
  94. package/dist/domain/search/search/semantic.js.map +1 -1
  95. package/dist/domain/wasm-worker-entry.js +201 -136
  96. package/dist/domain/wasm-worker-entry.js.map +1 -1
  97. package/dist/extractors/elixir.js +95 -71
  98. package/dist/extractors/elixir.js.map +1 -1
  99. package/dist/extractors/gleam.d.ts.map +1 -1
  100. package/dist/extractors/gleam.js +23 -31
  101. package/dist/extractors/gleam.js.map +1 -1
  102. package/dist/extractors/helpers.d.ts +79 -1
  103. package/dist/extractors/helpers.d.ts.map +1 -1
  104. package/dist/extractors/helpers.js +137 -0
  105. package/dist/extractors/helpers.js.map +1 -1
  106. package/dist/extractors/java.d.ts.map +1 -1
  107. package/dist/extractors/java.js +37 -49
  108. package/dist/extractors/java.js.map +1 -1
  109. package/dist/extractors/javascript.d.ts.map +1 -1
  110. package/dist/extractors/javascript.js +44 -44
  111. package/dist/extractors/javascript.js.map +1 -1
  112. package/dist/extractors/julia.js +27 -34
  113. package/dist/extractors/julia.js.map +1 -1
  114. package/dist/extractors/r.d.ts.map +1 -1
  115. package/dist/extractors/r.js +33 -58
  116. package/dist/extractors/r.js.map +1 -1
  117. package/dist/extractors/solidity.d.ts.map +1 -1
  118. package/dist/extractors/solidity.js +38 -61
  119. package/dist/extractors/solidity.js.map +1 -1
  120. package/dist/features/boundaries.d.ts.map +1 -1
  121. package/dist/features/boundaries.js +49 -39
  122. package/dist/features/boundaries.js.map +1 -1
  123. package/dist/features/cfg.d.ts.map +1 -1
  124. package/dist/features/cfg.js +90 -63
  125. package/dist/features/cfg.js.map +1 -1
  126. package/dist/features/check.d.ts.map +1 -1
  127. package/dist/features/check.js +43 -34
  128. package/dist/features/check.js.map +1 -1
  129. package/dist/features/cochange.d.ts.map +1 -1
  130. package/dist/features/cochange.js +68 -56
  131. package/dist/features/cochange.js.map +1 -1
  132. package/dist/features/complexity.d.ts.map +1 -1
  133. package/dist/features/complexity.js +105 -75
  134. package/dist/features/complexity.js.map +1 -1
  135. package/dist/features/dataflow.d.ts.map +1 -1
  136. package/dist/features/dataflow.js +37 -29
  137. package/dist/features/dataflow.js.map +1 -1
  138. package/dist/features/flow.d.ts.map +1 -1
  139. package/dist/features/flow.js +31 -22
  140. package/dist/features/flow.js.map +1 -1
  141. package/dist/features/graph-enrichment.d.ts.map +1 -1
  142. package/dist/features/graph-enrichment.js +77 -70
  143. package/dist/features/graph-enrichment.js.map +1 -1
  144. package/dist/features/owners.d.ts +17 -26
  145. package/dist/features/owners.d.ts.map +1 -1
  146. package/dist/features/owners.js +120 -109
  147. package/dist/features/owners.js.map +1 -1
  148. package/dist/features/sequence.d.ts.map +1 -1
  149. package/dist/features/sequence.js +59 -54
  150. package/dist/features/sequence.js.map +1 -1
  151. package/dist/features/structure-query.d.ts.map +1 -1
  152. package/dist/features/structure-query.js +60 -60
  153. package/dist/features/structure-query.js.map +1 -1
  154. package/dist/features/structure.d.ts.map +1 -1
  155. package/dist/features/structure.js +149 -52
  156. package/dist/features/structure.js.map +1 -1
  157. package/dist/graph/algorithms/leiden/optimiser.d.ts.map +1 -1
  158. package/dist/graph/algorithms/leiden/optimiser.js +100 -69
  159. package/dist/graph/algorithms/leiden/optimiser.js.map +1 -1
  160. package/dist/graph/classifiers/roles.d.ts.map +1 -1
  161. package/dist/graph/classifiers/roles.js +63 -59
  162. package/dist/graph/classifiers/roles.js.map +1 -1
  163. package/dist/infrastructure/config.d.ts +1 -1
  164. package/dist/infrastructure/config.d.ts.map +1 -1
  165. package/dist/infrastructure/config.js +1 -1
  166. package/dist/infrastructure/config.js.map +1 -1
  167. package/dist/presentation/cfg.d.ts.map +1 -1
  168. package/dist/presentation/cfg.js +44 -29
  169. package/dist/presentation/cfg.js.map +1 -1
  170. package/dist/presentation/flow.d.ts.map +1 -1
  171. package/dist/presentation/flow.js +58 -38
  172. package/dist/presentation/flow.js.map +1 -1
  173. package/dist/types.d.ts +1 -1
  174. package/dist/types.d.ts.map +1 -1
  175. package/grammars/tree-sitter-erlang.wasm +0 -0
  176. package/package.json +9 -9
  177. package/src/ast-analysis/engine.ts +145 -61
  178. package/src/ast-analysis/visitor-utils.ts +86 -46
  179. package/src/ast-analysis/visitors/ast-store-visitor.ts +104 -69
  180. package/src/ast-analysis/visitors/dataflow-visitor.ts +86 -47
  181. package/src/cli/commands/embed.ts +54 -4
  182. package/src/domain/analysis/dependencies.ts +166 -85
  183. package/src/domain/analysis/fn-impact.ts +120 -50
  184. package/src/domain/analysis/module-map.ts +175 -140
  185. package/src/domain/graph/builder/call-resolver.ts +181 -0
  186. package/src/domain/graph/builder/helpers.ts +85 -76
  187. package/src/domain/graph/builder/incremental.ts +321 -152
  188. package/src/domain/graph/builder/pipeline.ts +19 -957
  189. package/src/domain/graph/builder/stages/build-edges.ts +229 -275
  190. package/src/domain/graph/builder/stages/build-structure.ts +115 -82
  191. package/src/domain/graph/builder/stages/detect-changes.ts +107 -64
  192. package/src/domain/graph/builder/stages/finalize.ts +72 -70
  193. package/src/domain/graph/builder/stages/insert-nodes.ts +154 -120
  194. package/src/domain/graph/builder/stages/native-db-lifecycle.ts +74 -0
  195. package/src/domain/graph/builder/stages/native-orchestrator.ts +942 -0
  196. package/src/domain/graph/cycles.ts +51 -49
  197. package/src/domain/graph/journal.ts +84 -69
  198. package/src/domain/graph/watcher.ts +12 -4
  199. package/src/domain/parser.ts +143 -66
  200. package/src/domain/search/generator.ts +132 -74
  201. package/src/domain/search/models.ts +39 -3
  202. package/src/domain/search/search/hybrid.ts +53 -42
  203. package/src/domain/search/search/semantic.ts +105 -65
  204. package/src/domain/wasm-worker-entry.ts +235 -152
  205. package/src/extractors/elixir.ts +91 -64
  206. package/src/extractors/gleam.ts +33 -37
  207. package/src/extractors/helpers.ts +205 -1
  208. package/src/extractors/java.ts +42 -45
  209. package/src/extractors/javascript.ts +44 -43
  210. package/src/extractors/julia.ts +28 -35
  211. package/src/extractors/r.ts +38 -56
  212. package/src/extractors/solidity.ts +43 -71
  213. package/src/features/boundaries.ts +64 -46
  214. package/src/features/cfg.ts +145 -74
  215. package/src/features/check.ts +60 -43
  216. package/src/features/cochange.ts +95 -72
  217. package/src/features/complexity.ts +134 -79
  218. package/src/features/dataflow.ts +57 -34
  219. package/src/features/flow.ts +48 -24
  220. package/src/features/graph-enrichment.ts +105 -70
  221. package/src/features/owners.ts +186 -146
  222. package/src/features/sequence.ts +99 -69
  223. package/src/features/structure-query.ts +94 -79
  224. package/src/features/structure.ts +199 -79
  225. package/src/graph/algorithms/leiden/optimiser.ts +142 -87
  226. package/src/graph/classifiers/roles.ts +64 -54
  227. package/src/infrastructure/config.ts +1 -1
  228. package/src/presentation/cfg.ts +48 -32
  229. package/src/presentation/flow.ts +100 -52
  230. package/src/types.ts +1 -1
@@ -8,6 +8,19 @@ import { embed, getModelConfig } from './models.js';
8
8
  import { buildSourceText } from './strategies/source.js';
9
9
  import { buildStructuredText } from './strategies/structured.js';
10
10
 
11
+ type EmbeddingNode = NodeRow & { id: number };
12
+ type EmbeddingStrategy = 'structured' | 'source';
13
+
14
+ interface PreparedEmbeddings {
15
+ texts: string[];
16
+ nodeIds: number[];
17
+ nodeNames: string[];
18
+ previews: string[];
19
+ overflowCount: number;
20
+ filesRead: number;
21
+ filesSkipped: number;
22
+ }
23
+
11
24
  /**
12
25
  * Rough token estimate (~4 chars per token for code/English).
13
26
  * Conservative — avoids adding a tokenizer dependency.
@@ -47,47 +60,22 @@ function initEmbeddingsSchema(db: BetterSqlite3Database): void {
47
60
  `);
48
61
  }
49
62
 
50
- export interface BuildEmbeddingsOptions {
51
- strategy?: 'structured' | 'source';
52
- }
53
-
54
63
  /**
55
- * Build embeddings for all functions/methods/classes in the graph.
64
+ * Resolve the repo root for embedding. Prefer the root recorded at build time;
65
+ * fall back to `<dbParent>` only when the DB lives at the conventional
66
+ * `<root>/.codegraph/graph.db` layout — otherwise trust the caller's rootDir.
56
67
  */
57
- export async function buildEmbeddings(
58
- rootDir: string,
59
- modelKey: string,
60
- customDbPath?: string,
61
- options: BuildEmbeddingsOptions = {},
62
- ): Promise<void> {
63
- const strategy = options.strategy || 'structured';
64
- const dbPath = customDbPath || findDbPath(undefined);
65
-
66
- if (!fs.existsSync(dbPath)) {
67
- throw new DbError(
68
- `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`,
69
- { file: dbPath },
70
- );
71
- }
72
-
73
- const db = openDb(dbPath) as BetterSqlite3Database;
74
- initEmbeddingsSchema(db);
75
-
76
- // Prefer the repo root recorded at build time — embed may be invoked from a
77
- // different cwd (e.g. `codegraph embed --db /abs/path/graph.db`) and the
78
- // positional rootDir will be wrong in that case. For legacy DBs without
79
- // root_dir metadata, fall back to `<dbParent>` only when the DB lives at
80
- // the conventional `<root>/.codegraph/graph.db` layout — otherwise trust
81
- // the caller-provided rootDir (which may be an explicit positional arg).
82
- // `path.dirname(...)` is always non-empty (`'.'` at minimum), so the
83
- // conventional-layout check is required to keep the rootDir path reachable.
68
+ function resolveRoot(db: BetterSqlite3Database, dbPath: string, rootDir: string): string {
84
69
  const metaRoot = getBuildMeta(db, 'root_dir');
85
70
  const resolvedDbPath = path.resolve(dbPath);
86
71
  const dbDirName = path.basename(path.dirname(resolvedDbPath));
87
72
  const dbParent =
88
73
  dbDirName === '.codegraph' ? path.dirname(path.dirname(resolvedDbPath)) : undefined;
89
- const resolvedRoot = metaRoot || dbParent || rootDir;
74
+ return metaRoot || dbParent || rootDir;
75
+ }
90
76
 
77
+ /** Reset embedding tables and load eligible symbols grouped by file. */
78
+ function loadNodesByFile(db: BetterSqlite3Database): Map<string, EmbeddingNode[]> {
91
79
  db.exec('DELETE FROM embeddings');
92
80
  db.exec('DELETE FROM embedding_meta');
93
81
  db.exec('DELETE FROM fts_index');
@@ -96,22 +84,52 @@ export async function buildEmbeddings(
96
84
  .prepare(
97
85
  `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`,
98
86
  )
99
- .all() as Array<NodeRow & { id: number }>;
87
+ .all() as EmbeddingNode[];
100
88
 
101
- console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`);
102
-
103
- const byFile = new Map<string, typeof nodes>();
89
+ const byFile = new Map<string, EmbeddingNode[]>();
104
90
  for (const node of nodes) {
105
91
  if (!byFile.has(node.file)) byFile.set(node.file, []);
106
92
  byFile.get(node.file)?.push(node);
107
93
  }
94
+ return byFile;
95
+ }
96
+
97
+ /** Build embedding text for a single node, truncating if it would overflow. */
98
+ function buildNodeText(
99
+ node: EmbeddingNode,
100
+ file: string,
101
+ lines: string[],
102
+ db: BetterSqlite3Database,
103
+ strategy: EmbeddingStrategy,
104
+ contextWindow: number,
105
+ ): { text: string; overflowed: boolean } {
106
+ let text =
107
+ strategy === 'structured'
108
+ ? buildStructuredText(node, file, lines, db)
109
+ : buildSourceText(node, file, lines);
110
+ const tokens = estimateTokens(text);
111
+ if (tokens > contextWindow) {
112
+ text = text.slice(0, contextWindow * 4);
113
+ return { text, overflowed: true };
114
+ }
115
+ return { text, overflowed: false };
116
+ }
108
117
 
118
+ /**
119
+ * Walk files in the graph, read source, and produce parallel arrays of
120
+ * texts / nodeIds / nodeNames / previews ready for embedding.
121
+ */
122
+ function prepareEmbeddingTexts(
123
+ byFile: Map<string, EmbeddingNode[]>,
124
+ db: BetterSqlite3Database,
125
+ resolvedRoot: string,
126
+ strategy: EmbeddingStrategy,
127
+ contextWindow: number,
128
+ ): PreparedEmbeddings {
109
129
  const texts: string[] = [];
110
130
  const nodeIds: number[] = [];
111
131
  const nodeNames: string[] = [];
112
132
  const previews: string[] = [];
113
- const config = getModelConfig(modelKey);
114
- const contextWindow = config.contextWindow;
115
133
  let overflowCount = 0;
116
134
  let filesRead = 0;
117
135
  let filesSkipped = 0;
@@ -129,19 +147,8 @@ export async function buildEmbeddings(
129
147
  }
130
148
 
131
149
  for (const node of fileNodes) {
132
- let text =
133
- strategy === 'structured'
134
- ? buildStructuredText(node, file, lines, db)
135
- : buildSourceText(node, file, lines);
136
-
137
- // Detect and handle context window overflow
138
- const tokens = estimateTokens(text);
139
- if (tokens > contextWindow) {
140
- overflowCount++;
141
- const maxChars = contextWindow * 4;
142
- text = text.slice(0, maxChars);
143
- }
144
-
150
+ const { text, overflowed } = buildNodeText(node, file, lines, db, strategy, contextWindow);
151
+ if (overflowed) overflowCount++;
145
152
  texts.push(text);
146
153
  nodeIds.push(node.id);
147
154
  nodeNames.push(node.name);
@@ -149,28 +156,19 @@ export async function buildEmbeddings(
149
156
  }
150
157
  }
151
158
 
152
- if (overflowCount > 0) {
153
- warn(
154
- `${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`,
155
- );
156
- }
157
-
158
- // If there were symbols to embed but every file failed to read, the DB was
159
- // almost certainly built from a different location than the current cwd.
160
- // Surface this clearly instead of emitting a silent "Stored 0 embeddings".
161
- if (byFile.size > 0 && filesRead === 0) {
162
- closeDb(db);
163
- throw new DbError(
164
- `embed: could not read any of the ${filesSkipped} source files recorded in the graph — the DB may have been built from a different location than the current working directory.\n` +
165
- `Tried resolving against: ${resolvedRoot}\n` +
166
- 'Pass a positional <dir> argument pointing at the original repo root, or re-run "codegraph build" from that directory.',
167
- { file: dbPath },
168
- );
169
- }
170
-
171
- console.log(`Embedding ${texts.length} symbols...`);
172
- const { vectors, dim } = await embed(texts, modelKey);
159
+ return { texts, nodeIds, nodeNames, previews, overflowCount, filesRead, filesSkipped };
160
+ }
173
161
 
162
+ /** Persist vectors, FTS rows, and embedding metadata in a single transaction. */
163
+ function persistEmbeddings(
164
+ db: BetterSqlite3Database,
165
+ prepared: PreparedEmbeddings,
166
+ vectors: Float32Array[],
167
+ dim: number,
168
+ modelName: string,
169
+ strategy: EmbeddingStrategy,
170
+ ): void {
171
+ const { nodeIds, nodeNames, previews, texts, overflowCount } = prepared;
174
172
  const insert = db.prepare(
175
173
  'INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview, full_text) VALUES (?, ?, ?, ?)',
176
174
  );
@@ -182,7 +180,7 @@ export async function buildEmbeddings(
182
180
  insert.run(nodeIds[i], Buffer.from(vec.buffer), previews[i], texts[i]);
183
181
  insertFts.run(nodeIds[i], nodeNames[i], texts[i]);
184
182
  }
185
- insertMeta.run('model', config.name);
183
+ insertMeta.run('model', modelName);
186
184
  insertMeta.run('dim', String(dim));
187
185
  insertMeta.run('count', String(vectors.length));
188
186
  insertMeta.run('fts_count', String(vectors.length));
@@ -193,6 +191,66 @@ export async function buildEmbeddings(
193
191
  }
194
192
  });
195
193
  insertAll();
194
+ }
195
+
196
+ export interface BuildEmbeddingsOptions {
197
+ strategy?: EmbeddingStrategy;
198
+ }
199
+
200
+ /**
201
+ * Build embeddings for all functions/methods/classes in the graph.
202
+ */
203
+ export async function buildEmbeddings(
204
+ rootDir: string,
205
+ modelKey: string,
206
+ customDbPath?: string,
207
+ options: BuildEmbeddingsOptions = {},
208
+ ): Promise<void> {
209
+ const strategy = options.strategy || 'structured';
210
+ const dbPath = customDbPath || findDbPath(undefined);
211
+
212
+ if (!fs.existsSync(dbPath)) {
213
+ throw new DbError(
214
+ `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`,
215
+ { file: dbPath },
216
+ );
217
+ }
218
+
219
+ const db = openDb(dbPath) as BetterSqlite3Database;
220
+ initEmbeddingsSchema(db);
221
+
222
+ const resolvedRoot = resolveRoot(db, dbPath, rootDir);
223
+ const byFile = loadNodesByFile(db);
224
+
225
+ const nodeCount = [...byFile.values()].reduce((acc, list) => acc + list.length, 0);
226
+ console.log(`Building embeddings for ${nodeCount} symbols (strategy: ${strategy})...`);
227
+
228
+ const config = getModelConfig(modelKey);
229
+ const prepared = prepareEmbeddingTexts(byFile, db, resolvedRoot, strategy, config.contextWindow);
230
+
231
+ if (prepared.overflowCount > 0) {
232
+ warn(
233
+ `${prepared.overflowCount} symbol(s) exceeded model context window (${config.contextWindow} tokens) and were truncated`,
234
+ );
235
+ }
236
+
237
+ // If there were symbols to embed but every file failed to read, the DB was
238
+ // almost certainly built from a different location than the current cwd.
239
+ // Surface this clearly instead of emitting a silent "Stored 0 embeddings".
240
+ if (byFile.size > 0 && prepared.filesRead === 0) {
241
+ closeDb(db);
242
+ throw new DbError(
243
+ `embed: could not read any of the ${prepared.filesSkipped} source files recorded in the graph — the DB may have been built from a different location than the current working directory.\n` +
244
+ `Tried resolving against: ${resolvedRoot}\n` +
245
+ 'Pass a positional <dir> argument pointing at the original repo root, or re-run "codegraph build" from that directory.',
246
+ { file: dbPath },
247
+ );
248
+ }
249
+
250
+ console.log(`Embedding ${prepared.texts.length} symbols...`);
251
+ const { vectors, dim } = await embed(prepared.texts, modelKey);
252
+
253
+ persistEmbeddings(db, prepared, vectors as Float32Array[], dim, config.name, strategy);
196
254
 
197
255
  console.log(
198
256
  `\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`,
@@ -41,6 +41,8 @@ export interface ModelConfig {
41
41
  contextWindow: number;
42
42
  desc: string;
43
43
  quantized: boolean;
44
+ /** Pooling strategy passed to the transformers pipeline. Defaults to 'mean'. */
45
+ pooling?: 'mean' | 'cls';
44
46
  }
45
47
 
46
48
  // Lazy-load transformers (heavy, optional module)
@@ -91,7 +93,7 @@ export const MODELS: Record<string, ModelConfig> = {
91
93
  name: 'nomic-ai/nomic-embed-text-v1.5',
92
94
  dim: 768,
93
95
  contextWindow: 8192,
94
- desc: 'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.',
96
+ desc: 'Matryoshka MRL trained (~137MB). 8192 context. Codegraph stores full 768d (no truncation); v1 scores higher on our benchmark.',
95
97
  quantized: false,
96
98
  },
97
99
  'bge-large': {
@@ -101,11 +103,41 @@ export const MODELS: Record<string, ModelConfig> = {
101
103
  desc: 'Best general retrieval (~335MB). Top MTEB scores.',
102
104
  quantized: false,
103
105
  },
106
+ 'mxbai-xsmall': {
107
+ name: 'mixedbread-ai/mxbai-embed-xsmall-v1',
108
+ dim: 384,
109
+ contextWindow: 4096,
110
+ desc: 'Tiny model with long context (~50MB). 4096 ctx.',
111
+ quantized: false,
112
+ pooling: 'cls',
113
+ },
114
+ 'mxbai-large': {
115
+ name: 'mixedbread-ai/mxbai-embed-large-v1',
116
+ dim: 1024,
117
+ contextWindow: 512,
118
+ desc: 'Top MTEB BERT-large, Matryoshka dimensions (~400MB). 512 ctx.',
119
+ quantized: false,
120
+ pooling: 'cls',
121
+ },
122
+ 'bge-m3': {
123
+ name: 'Xenova/bge-m3',
124
+ dim: 1024,
125
+ contextWindow: 8192,
126
+ desc: 'Multilingual, multi-task (~600MB). 100+ languages, 8192 context.',
127
+ quantized: false,
128
+ },
129
+ modernbert: {
130
+ name: 'nomic-ai/modernbert-embed-base',
131
+ dim: 768,
132
+ contextWindow: 8192,
133
+ desc: 'ModernBERT base (~150MB). Newer architecture, 8192 ctx, English.',
134
+ quantized: false,
135
+ },
104
136
  };
105
137
 
106
138
  export const EMBEDDING_STRATEGIES: readonly string[] = ['structured', 'source'];
107
139
 
108
- export const DEFAULT_MODEL: string = 'nomic-v1.5';
140
+ export const DEFAULT_MODEL: string = 'nomic';
109
141
  const NPM_BIN = process.platform === 'win32' ? 'npm.cmd' : 'npm';
110
142
  const BATCH_SIZE_MAP: Record<string, number> = {
111
143
  minilm: 32,
@@ -115,6 +147,10 @@ const BATCH_SIZE_MAP: Record<string, number> = {
115
147
  nomic: 8,
116
148
  'nomic-v1.5': 8,
117
149
  'bge-large': 4,
150
+ 'mxbai-xsmall': 32,
151
+ 'mxbai-large': 4,
152
+ 'bge-m3': 4,
153
+ modernbert: 8,
118
154
  };
119
155
  const DEFAULT_BATCH_SIZE = 32;
120
156
 
@@ -274,7 +310,7 @@ export async function embed(
274
310
  const batch = texts.slice(i, i + batchSize);
275
311
  const output =
276
312
  (await // biome-ignore lint/complexity/noBannedTypes: dynamically loaded extractor is untyped
277
- (ext as Function)(batch, { pooling: 'mean', normalize: true })) as {
313
+ (ext as Function)(batch, { pooling: config.pooling ?? 'mean', normalize: true })) as {
278
314
  data: number[];
279
315
  };
280
316
 
@@ -105,61 +105,72 @@ async function collectRankedLists(
105
105
  return rankedLists;
106
106
  }
107
107
 
108
+ /** Initialise a fusion entry seeded from the first ranked item we see for a key. */
109
+ function createFusionEntry(item: RankedItem): FusionEntry {
110
+ return {
111
+ name: item.name,
112
+ kind: item.kind,
113
+ file: item.file,
114
+ line: item.line,
115
+ endLine: (item.endLine as number | null) ?? null,
116
+ role: (item.role as string | null) ?? null,
117
+ fileHash: (item.fileHash as string | null) ?? null,
118
+ rrfScore: 0,
119
+ bm25Score: null,
120
+ bm25Rank: null,
121
+ similarity: null,
122
+ semanticRank: null,
123
+ };
124
+ }
125
+
126
+ /** Merge a single ranked item into its fusion entry: update RRF and best per-source rank. */
127
+ function mergeRankedItem(entry: FusionEntry, item: RankedItem, k: number): void {
128
+ entry.rrfScore += 1 / (k + item.rank);
129
+ if (item.source === 'bm25') {
130
+ if (entry.bm25Rank === null || item.rank < entry.bm25Rank) {
131
+ entry.bm25Score = item.bm25Score ?? null;
132
+ entry.bm25Rank = item.rank;
133
+ }
134
+ } else if (entry.semanticRank === null || item.rank < entry.semanticRank) {
135
+ entry.similarity = item.similarity ?? null;
136
+ entry.semanticRank = item.rank;
137
+ }
138
+ }
139
+
140
+ /** Flatten a fusion entry into the public-facing hybrid result shape. */
141
+ function toHybridResult(e: FusionEntry): HybridResult {
142
+ return {
143
+ name: e.name,
144
+ kind: e.kind,
145
+ file: e.file,
146
+ line: e.line,
147
+ endLine: e.endLine,
148
+ role: e.role,
149
+ fileHash: e.fileHash,
150
+ rrf: e.rrfScore,
151
+ bm25Score: e.bm25Score,
152
+ bm25Rank: e.bm25Rank,
153
+ similarity: e.similarity,
154
+ semanticRank: e.semanticRank,
155
+ };
156
+ }
157
+
108
158
  /** Reciprocal Rank Fusion: merge ranked lists into a single scored result set. */
109
159
  function fuseResults(rankedLists: RankedItem[][], k: number, limit: number): HybridResult[] {
110
160
  const fusionMap = new Map<string, FusionEntry>();
111
-
112
161
  for (const list of rankedLists) {
113
162
  for (const item of list) {
114
163
  if (!fusionMap.has(item.key)) {
115
- fusionMap.set(item.key, {
116
- name: item.name,
117
- kind: item.kind,
118
- file: item.file,
119
- line: item.line,
120
- endLine: (item.endLine as number | null) ?? null,
121
- role: (item.role as string | null) ?? null,
122
- fileHash: (item.fileHash as string | null) ?? null,
123
- rrfScore: 0,
124
- bm25Score: null,
125
- bm25Rank: null,
126
- similarity: null,
127
- semanticRank: null,
128
- });
129
- }
130
- const entry = fusionMap.get(item.key)!;
131
- entry.rrfScore += 1 / (k + item.rank);
132
- if (item.source === 'bm25') {
133
- if (entry.bm25Rank === null || item.rank < entry.bm25Rank) {
134
- entry.bm25Score = (item as RankedItem & { bm25Score?: number }).bm25Score ?? null;
135
- entry.bm25Rank = item.rank;
136
- }
137
- } else {
138
- if (entry.semanticRank === null || item.rank < entry.semanticRank) {
139
- entry.similarity = (item as RankedItem & { similarity?: number }).similarity ?? null;
140
- entry.semanticRank = item.rank;
141
- }
164
+ fusionMap.set(item.key, createFusionEntry(item));
142
165
  }
166
+ mergeRankedItem(fusionMap.get(item.key)!, item, k);
143
167
  }
144
168
  }
145
169
 
146
170
  return [...fusionMap.values()]
147
171
  .sort((a, b) => b.rrfScore - a.rrfScore)
148
172
  .slice(0, limit)
149
- .map((e) => ({
150
- name: e.name,
151
- kind: e.kind,
152
- file: e.file,
153
- line: e.line,
154
- endLine: e.endLine,
155
- role: e.role,
156
- fileHash: e.fileHash,
157
- rrf: e.rrfScore,
158
- bm25Score: e.bm25Score,
159
- bm25Rank: e.bm25Rank,
160
- similarity: e.similarity,
161
- semanticRank: e.semanticRank,
162
- }));
173
+ .map(toHybridResult);
163
174
  }
164
175
 
165
176
  export async function hybridSearchData(
@@ -4,7 +4,7 @@ import type { BetterSqlite3Database, CodegraphConfig } from '../../../types.js';
4
4
  import { normalizeSymbol } from '../../queries.js';
5
5
  import { embed } from '../models.js';
6
6
  import { cosineSim } from '../stores/sqlite-blob.js';
7
- import { prepareSearch } from './prepare.js';
7
+ import { type PreparedSearch, prepareSearch } from './prepare.js';
8
8
 
9
9
  export interface SemanticSearchOpts {
10
10
  config?: CodegraphConfig;
@@ -30,6 +30,25 @@ export interface SearchDataResult {
30
30
  results: SemanticResult[];
31
31
  }
32
32
 
33
+ type StoredRow = PreparedSearch['rows'][number];
34
+
35
+ /** Reconstitute a stored embedding row's vector blob into a Float32Array. */
36
+ function rowVector(row: StoredRow): Float32Array {
37
+ return new Float32Array(new Uint8Array(row.vector as unknown as ArrayBuffer).buffer);
38
+ }
39
+
40
+ /** Warn when stored embeddings and the query model use different dimensions. */
41
+ function checkDimensionMismatch(storedDim: number | null, dim: number): boolean {
42
+ if (storedDim && dim !== storedDim) {
43
+ console.log(
44
+ `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
45
+ );
46
+ console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`);
47
+ return true;
48
+ }
49
+ return false;
50
+ }
51
+
33
52
  export async function searchData(
34
53
  query: string,
35
54
  customDbPath: string | undefined,
@@ -50,20 +69,12 @@ export async function searchData(
50
69
  dim,
51
70
  } = await embed([query], modelKey ?? undefined);
52
71
 
53
- if (storedDim && dim !== storedDim) {
54
- console.log(
55
- `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
56
- );
57
- console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`);
58
- return null;
59
- }
72
+ if (checkDimensionMismatch(storedDim, dim)) return null;
60
73
 
61
74
  const hc = new Map<string, string>();
62
75
  const results: SemanticResult[] = [];
63
76
  for (const row of rows) {
64
- const vec = new Float32Array(new Uint8Array(row.vector as unknown as ArrayBuffer).buffer);
65
- const sim = cosineSim(queryVec!, vec);
66
-
77
+ const sim = cosineSim(queryVec!, rowVector(row));
67
78
  if (sim >= minScore) {
68
79
  results.push({
69
80
  ...normalizeSymbol(row, db as BetterSqlite3Database, hc),
@@ -91,6 +102,82 @@ export interface MultiSearchResult {
91
102
  }>;
92
103
  }
93
104
 
105
+ interface RankedHit {
106
+ rowIndex: number;
107
+ similarity: number;
108
+ rank: number;
109
+ }
110
+
111
+ interface FusionEntry {
112
+ rrfScore: number;
113
+ queryScores: Array<{ query: string; similarity: number; rank: number }>;
114
+ }
115
+
116
+ /**
117
+ * Emit a warning for any query pair whose embeddings are nearly identical,
118
+ * since RRF would over-weight matches shared between them.
119
+ */
120
+ function warnOnSimilarQueries(
121
+ queries: string[],
122
+ queryVecs: Float32Array[],
123
+ threshold: number,
124
+ ): void {
125
+ for (let i = 0; i < queryVecs.length; i++) {
126
+ for (let j = i + 1; j < queryVecs.length; j++) {
127
+ const sim = cosineSim(queryVecs[i]!, queryVecs[j]!);
128
+ if (sim >= threshold) {
129
+ warn(
130
+ `Queries "${queries[i]}" and "${queries[j]}" are very similar ` +
131
+ `(${(sim * 100).toFixed(0)}% cosine similarity). ` +
132
+ `This may bias RRF results toward their shared matches. ` +
133
+ `Consider using more distinct queries.`,
134
+ );
135
+ }
136
+ }
137
+ }
138
+ }
139
+
140
+ /** Rank stored rows for a single query, keeping only those above minScore. */
141
+ function rankRowsForQuery(
142
+ queryVec: Float32Array,
143
+ rowVecs: Float32Array[],
144
+ minScore: number,
145
+ ): RankedHit[] {
146
+ const scored: Array<{ rowIndex: number; similarity: number }> = [];
147
+ for (let ri = 0; ri < rowVecs.length; ri++) {
148
+ const sim = cosineSim(queryVec, rowVecs[ri]!);
149
+ if (sim >= minScore) {
150
+ scored.push({ rowIndex: ri, similarity: sim });
151
+ }
152
+ }
153
+ scored.sort((a, b) => b.similarity - a.similarity);
154
+ return scored.map((item, rank) => ({ ...item, rank: rank + 1 }));
155
+ }
156
+
157
+ /** Reciprocal Rank Fusion across each query's ranked hits. */
158
+ function fuseRankedHits(
159
+ queries: string[],
160
+ perQueryRanked: RankedHit[][],
161
+ k: number,
162
+ ): Map<number, FusionEntry> {
163
+ const fusionMap = new Map<number, FusionEntry>();
164
+ for (let qi = 0; qi < queries.length; qi++) {
165
+ for (const item of perQueryRanked[qi]!) {
166
+ if (!fusionMap.has(item.rowIndex)) {
167
+ fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] });
168
+ }
169
+ const entry = fusionMap.get(item.rowIndex)!;
170
+ entry.rrfScore += 1 / (k + item.rank);
171
+ entry.queryScores.push({
172
+ query: queries[qi]!,
173
+ similarity: item.similarity,
174
+ rank: item.rank,
175
+ });
176
+ }
177
+ }
178
+ return fusionMap;
179
+ }
180
+
94
181
  export async function multiSearchData(
95
182
  queries: string[],
96
183
  customDbPath: string | undefined,
@@ -101,6 +188,7 @@ export async function multiSearchData(
101
188
  const limit = opts.limit ?? searchCfg.topK ?? 15;
102
189
  const minScore = opts.minScore ?? searchCfg.defaultMinScore ?? 0.2;
103
190
  const k = opts.rrfK ?? searchCfg.rrfK ?? 60;
191
+ const similarityWarnThreshold = searchCfg.similarityWarnThreshold ?? 0.85;
104
192
 
105
193
  const prepared = prepareSearch(customDbPath, opts);
106
194
  if (!prepared) return null;
@@ -109,63 +197,15 @@ export async function multiSearchData(
109
197
  try {
110
198
  const { vectors: queryVecs, dim } = await embed(queries, modelKey ?? undefined);
111
199
 
112
- const SIMILARITY_WARN_THRESHOLD = searchCfg.similarityWarnThreshold ?? 0.85;
113
- for (let i = 0; i < queryVecs.length; i++) {
114
- for (let j = i + 1; j < queryVecs.length; j++) {
115
- const sim = cosineSim(queryVecs[i]!, queryVecs[j]!);
116
- if (sim >= SIMILARITY_WARN_THRESHOLD) {
117
- warn(
118
- `Queries "${queries[i]}" and "${queries[j]}" are very similar ` +
119
- `(${(sim * 100).toFixed(0)}% cosine similarity). ` +
120
- `This may bias RRF results toward their shared matches. ` +
121
- `Consider using more distinct queries.`,
122
- );
123
- }
124
- }
125
- }
200
+ warnOnSimilarQueries(queries, queryVecs as Float32Array[], similarityWarnThreshold);
126
201
 
127
- if (storedDim && dim !== storedDim) {
128
- console.log(
129
- `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`,
130
- );
131
- console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`);
132
- return null;
133
- }
202
+ if (checkDimensionMismatch(storedDim, dim)) return null;
134
203
 
135
- const rowVecs = rows.map(
136
- (row) => new Float32Array(new Uint8Array(row.vector as unknown as ArrayBuffer).buffer),
204
+ const rowVecs = rows.map(rowVector);
205
+ const perQueryRanked = queries.map((_q, qi) =>
206
+ rankRowsForQuery(queryVecs[qi]!, rowVecs, minScore),
137
207
  );
138
-
139
- const perQueryRanked = queries.map((_query, qi) => {
140
- const scored: Array<{ rowIndex: number; similarity: number }> = [];
141
- for (let ri = 0; ri < rows.length; ri++) {
142
- const sim = cosineSim(queryVecs[qi]!, rowVecs[ri]!);
143
- if (sim >= minScore) {
144
- scored.push({ rowIndex: ri, similarity: sim });
145
- }
146
- }
147
- scored.sort((a, b) => b.similarity - a.similarity);
148
- return scored.map((item, rank) => ({ ...item, rank: rank + 1 }));
149
- });
150
-
151
- const fusionMap = new Map<
152
- number,
153
- { rrfScore: number; queryScores: Array<{ query: string; similarity: number; rank: number }> }
154
- >();
155
- for (let qi = 0; qi < queries.length; qi++) {
156
- for (const item of perQueryRanked[qi]!) {
157
- if (!fusionMap.has(item.rowIndex)) {
158
- fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] });
159
- }
160
- const entry = fusionMap.get(item.rowIndex)!;
161
- entry.rrfScore += 1 / (k + item.rank);
162
- entry.queryScores.push({
163
- query: queries[qi]!,
164
- similarity: item.similarity,
165
- rank: item.rank,
166
- });
167
- }
168
- }
208
+ const fusionMap = fuseRankedHits(queries, perQueryRanked, k);
169
209
 
170
210
  const hc = new Map<string, string>();
171
211
  const results: MultiSearchResult['results'] = [];