@optave/codegraph 2.2.1 → 2.2.3-dev.44e8146
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +117 -126
- package/package.json +5 -5
- package/src/builder.js +96 -18
- package/src/cli.js +85 -25
- package/src/config.js +1 -0
- package/src/embedder.js +196 -15
- package/src/export.js +16 -7
- package/src/extractors/javascript.js +6 -8
- package/src/index.js +3 -0
- package/src/mcp.js +21 -7
- package/src/queries.js +222 -18
- package/src/structure.js +2 -1
- package/src/watcher.js +2 -1
package/src/cli.js
CHANGED
|
@@ -5,9 +5,10 @@ import path from 'node:path';
|
|
|
5
5
|
import Database from 'better-sqlite3';
|
|
6
6
|
import { Command } from 'commander';
|
|
7
7
|
import { buildGraph } from './builder.js';
|
|
8
|
+
import { loadConfig } from './config.js';
|
|
8
9
|
import { findCycles, formatCycles } from './cycles.js';
|
|
9
10
|
import { findDbPath } from './db.js';
|
|
10
|
-
import { buildEmbeddings, MODELS, search } from './embedder.js';
|
|
11
|
+
import { buildEmbeddings, EMBEDDING_STRATEGIES, MODELS, search } from './embedder.js';
|
|
11
12
|
import { exportDOT, exportJSON, exportMermaid } from './export.js';
|
|
12
13
|
import { setVerbose } from './logger.js';
|
|
13
14
|
import {
|
|
@@ -36,6 +37,8 @@ import { watchProject } from './watcher.js';
|
|
|
36
37
|
const __cliDir = path.dirname(new URL(import.meta.url).pathname.replace(/^\/([A-Z]:)/i, '$1'));
|
|
37
38
|
const pkg = JSON.parse(fs.readFileSync(path.join(__cliDir, '..', 'package.json'), 'utf-8'));
|
|
38
39
|
|
|
40
|
+
const config = loadConfig(process.cwd());
|
|
41
|
+
|
|
39
42
|
const program = new Command();
|
|
40
43
|
program
|
|
41
44
|
.name('codegraph')
|
|
@@ -48,6 +51,18 @@ program
|
|
|
48
51
|
if (opts.verbose) setVerbose(true);
|
|
49
52
|
});
|
|
50
53
|
|
|
54
|
+
/**
 * Decide whether test/spec files are excluded for the current command.
 *
 * Precedence, highest first:
 *   1. `--include-tests` was passed: tests are always included (returns false).
 *   2. `--no-tests` was passed (Commander records this as `opts.tests === false`):
 *      tests are excluded (returns true).
 *   3. Otherwise the loaded config's `query.excludeTests` value is used, or false
 *      when that value is missing or falsy.
 *
 * @param {object} opts - Parsed Commander options for the command.
 * @returns {boolean} The effective "exclude tests" setting.
 */
function resolveNoTests(opts) {
  if (opts.includeTests) {
    return false;
  }

  const excludedByFlag = opts.tests === false;
  if (excludedByFlag) {
    return true;
  }

  // No CLI flag decided it: defer to the project configuration.
  const excludedByConfig = config.query?.excludeTests;
  return excludedByConfig ? excludedByConfig : false;
}
|
|
65
|
+
|
|
51
66
|
program
|
|
52
67
|
.command('build [dir]')
|
|
53
68
|
.description('Parse repo and build graph in .codegraph/graph.db')
|
|
@@ -63,9 +78,10 @@ program
|
|
|
63
78
|
.description('Find a function/class, show callers and callees')
|
|
64
79
|
.option('-d, --db <path>', 'Path to graph.db')
|
|
65
80
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
81
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
66
82
|
.option('-j, --json', 'Output as JSON')
|
|
67
83
|
.action((name, opts) => {
|
|
68
|
-
queryName(name, opts.db, { noTests:
|
|
84
|
+
queryName(name, opts.db, { noTests: resolveNoTests(opts), json: opts.json });
|
|
69
85
|
});
|
|
70
86
|
|
|
71
87
|
program
|
|
@@ -73,9 +89,10 @@ program
|
|
|
73
89
|
.description('Show what depends on this file (transitive)')
|
|
74
90
|
.option('-d, --db <path>', 'Path to graph.db')
|
|
75
91
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
92
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
76
93
|
.option('-j, --json', 'Output as JSON')
|
|
77
94
|
.action((file, opts) => {
|
|
78
|
-
impactAnalysis(file, opts.db, { noTests:
|
|
95
|
+
impactAnalysis(file, opts.db, { noTests: resolveNoTests(opts), json: opts.json });
|
|
79
96
|
});
|
|
80
97
|
|
|
81
98
|
program
|
|
@@ -84,9 +101,13 @@ program
|
|
|
84
101
|
.option('-d, --db <path>', 'Path to graph.db')
|
|
85
102
|
.option('-n, --limit <number>', 'Number of top nodes', '20')
|
|
86
103
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
104
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
87
105
|
.option('-j, --json', 'Output as JSON')
|
|
88
106
|
.action((opts) => {
|
|
89
|
-
moduleMap(opts.db, parseInt(opts.limit, 10), {
|
|
107
|
+
moduleMap(opts.db, parseInt(opts.limit, 10), {
|
|
108
|
+
noTests: resolveNoTests(opts),
|
|
109
|
+
json: opts.json,
|
|
110
|
+
});
|
|
90
111
|
});
|
|
91
112
|
|
|
92
113
|
program
|
|
@@ -94,9 +115,10 @@ program
|
|
|
94
115
|
.description('Show graph health overview: nodes, edges, languages, cycles, hotspots, embeddings')
|
|
95
116
|
.option('-d, --db <path>', 'Path to graph.db')
|
|
96
117
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
118
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
97
119
|
.option('-j, --json', 'Output as JSON')
|
|
98
120
|
.action((opts) => {
|
|
99
|
-
stats(opts.db, { noTests:
|
|
121
|
+
stats(opts.db, { noTests: resolveNoTests(opts), json: opts.json });
|
|
100
122
|
});
|
|
101
123
|
|
|
102
124
|
program
|
|
@@ -104,9 +126,10 @@ program
|
|
|
104
126
|
.description('Show what this file imports and what imports it')
|
|
105
127
|
.option('-d, --db <path>', 'Path to graph.db')
|
|
106
128
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
129
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
107
130
|
.option('-j, --json', 'Output as JSON')
|
|
108
131
|
.action((file, opts) => {
|
|
109
|
-
fileDeps(file, opts.db, { noTests:
|
|
132
|
+
fileDeps(file, opts.db, { noTests: resolveNoTests(opts), json: opts.json });
|
|
110
133
|
});
|
|
111
134
|
|
|
112
135
|
program
|
|
@@ -117,6 +140,7 @@ program
|
|
|
117
140
|
.option('-f, --file <path>', 'Scope search to functions in this file (partial match)')
|
|
118
141
|
.option('-k, --kind <kind>', 'Filter to a specific symbol kind')
|
|
119
142
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
143
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
120
144
|
.option('-j, --json', 'Output as JSON')
|
|
121
145
|
.action((name, opts) => {
|
|
122
146
|
if (opts.kind && !ALL_SYMBOL_KINDS.includes(opts.kind)) {
|
|
@@ -127,7 +151,7 @@ program
|
|
|
127
151
|
depth: parseInt(opts.depth, 10),
|
|
128
152
|
file: opts.file,
|
|
129
153
|
kind: opts.kind,
|
|
130
|
-
noTests:
|
|
154
|
+
noTests: resolveNoTests(opts),
|
|
131
155
|
json: opts.json,
|
|
132
156
|
});
|
|
133
157
|
});
|
|
@@ -140,6 +164,7 @@ program
|
|
|
140
164
|
.option('-f, --file <path>', 'Scope search to functions in this file (partial match)')
|
|
141
165
|
.option('-k, --kind <kind>', 'Filter to a specific symbol kind')
|
|
142
166
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
167
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
143
168
|
.option('-j, --json', 'Output as JSON')
|
|
144
169
|
.action((name, opts) => {
|
|
145
170
|
if (opts.kind && !ALL_SYMBOL_KINDS.includes(opts.kind)) {
|
|
@@ -150,7 +175,7 @@ program
|
|
|
150
175
|
depth: parseInt(opts.depth, 10),
|
|
151
176
|
file: opts.file,
|
|
152
177
|
kind: opts.kind,
|
|
153
|
-
noTests:
|
|
178
|
+
noTests: resolveNoTests(opts),
|
|
154
179
|
json: opts.json,
|
|
155
180
|
});
|
|
156
181
|
});
|
|
@@ -163,8 +188,9 @@ program
|
|
|
163
188
|
.option('-f, --file <path>', 'Scope search to functions in this file (partial match)')
|
|
164
189
|
.option('-k, --kind <kind>', 'Filter to a specific symbol kind')
|
|
165
190
|
.option('--no-source', 'Metadata only (skip source extraction)')
|
|
166
|
-
.option('--
|
|
191
|
+
.option('--with-test-source', 'Include test source code')
|
|
167
192
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
193
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
168
194
|
.option('-j, --json', 'Output as JSON')
|
|
169
195
|
.action((name, opts) => {
|
|
170
196
|
if (opts.kind && !ALL_SYMBOL_KINDS.includes(opts.kind)) {
|
|
@@ -176,8 +202,8 @@ program
|
|
|
176
202
|
file: opts.file,
|
|
177
203
|
kind: opts.kind,
|
|
178
204
|
noSource: !opts.source,
|
|
179
|
-
noTests:
|
|
180
|
-
includeTests: opts.
|
|
205
|
+
noTests: resolveNoTests(opts),
|
|
206
|
+
includeTests: opts.withTestSource,
|
|
181
207
|
json: opts.json,
|
|
182
208
|
});
|
|
183
209
|
});
|
|
@@ -186,10 +212,16 @@ program
|
|
|
186
212
|
.command('explain <target>')
|
|
187
213
|
.description('Structural summary of a file or function (no LLM needed)')
|
|
188
214
|
.option('-d, --db <path>', 'Path to graph.db')
|
|
215
|
+
.option('--depth <n>', 'Recursively explain dependencies up to N levels deep', '0')
|
|
189
216
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
217
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
190
218
|
.option('-j, --json', 'Output as JSON')
|
|
191
219
|
.action((target, opts) => {
|
|
192
|
-
explain(target, opts.db, {
|
|
220
|
+
explain(target, opts.db, {
|
|
221
|
+
depth: parseInt(opts.depth, 10),
|
|
222
|
+
noTests: resolveNoTests(opts),
|
|
223
|
+
json: opts.json,
|
|
224
|
+
});
|
|
193
225
|
});
|
|
194
226
|
|
|
195
227
|
program
|
|
@@ -198,6 +230,7 @@ program
|
|
|
198
230
|
.option('-d, --db <path>', 'Path to graph.db')
|
|
199
231
|
.option('-f, --file <path>', 'File overview: list symbols, imports, exports')
|
|
200
232
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
233
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
201
234
|
.option('-j, --json', 'Output as JSON')
|
|
202
235
|
.action((name, opts) => {
|
|
203
236
|
if (!name && !opts.file) {
|
|
@@ -205,7 +238,7 @@ program
|
|
|
205
238
|
process.exit(1);
|
|
206
239
|
}
|
|
207
240
|
const target = opts.file || name;
|
|
208
|
-
where(target, opts.db, { file: !!opts.file, noTests:
|
|
241
|
+
where(target, opts.db, { file: !!opts.file, noTests: resolveNoTests(opts), json: opts.json });
|
|
209
242
|
});
|
|
210
243
|
|
|
211
244
|
program
|
|
@@ -215,14 +248,17 @@ program
|
|
|
215
248
|
.option('--staged', 'Analyze staged changes instead of unstaged')
|
|
216
249
|
.option('--depth <n>', 'Max transitive caller depth', '3')
|
|
217
250
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
251
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
218
252
|
.option('-j, --json', 'Output as JSON')
|
|
253
|
+
.option('-f, --format <format>', 'Output format: text, mermaid, json', 'text')
|
|
219
254
|
.action((ref, opts) => {
|
|
220
255
|
diffImpact(opts.db, {
|
|
221
256
|
ref,
|
|
222
257
|
staged: opts.staged,
|
|
223
258
|
depth: parseInt(opts.depth, 10),
|
|
224
|
-
noTests:
|
|
259
|
+
noTests: resolveNoTests(opts),
|
|
225
260
|
json: opts.json,
|
|
261
|
+
format: opts.format,
|
|
226
262
|
});
|
|
227
263
|
});
|
|
228
264
|
|
|
@@ -235,10 +271,16 @@ program
|
|
|
235
271
|
.option('-f, --format <format>', 'Output format: dot, mermaid, json', 'dot')
|
|
236
272
|
.option('--functions', 'Function-level graph instead of file-level')
|
|
237
273
|
.option('-T, --no-tests', 'Exclude test/spec files')
|
|
274
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
275
|
+
.option('--min-confidence <score>', 'Minimum edge confidence threshold (default: 0.5)', '0.5')
|
|
238
276
|
.option('-o, --output <file>', 'Write to file instead of stdout')
|
|
239
277
|
.action((opts) => {
|
|
240
278
|
const db = new Database(findDbPath(opts.db), { readonly: true });
|
|
241
|
-
const exportOpts = {
|
|
279
|
+
const exportOpts = {
|
|
280
|
+
fileLevel: !opts.functions,
|
|
281
|
+
noTests: resolveNoTests(opts),
|
|
282
|
+
minConfidence: parseFloat(opts.minConfidence),
|
|
283
|
+
};
|
|
242
284
|
|
|
243
285
|
let output;
|
|
244
286
|
switch (opts.format) {
|
|
@@ -269,10 +311,11 @@ program
|
|
|
269
311
|
.option('-d, --db <path>', 'Path to graph.db')
|
|
270
312
|
.option('--functions', 'Function-level cycle detection')
|
|
271
313
|
.option('-T, --no-tests', 'Exclude test/spec files')
|
|
314
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
272
315
|
.option('-j, --json', 'Output as JSON')
|
|
273
316
|
.action((opts) => {
|
|
274
317
|
const db = new Database(findDbPath(opts.db), { readonly: true });
|
|
275
|
-
const cycles = findCycles(db, { fileLevel: !opts.functions, noTests:
|
|
318
|
+
const cycles = findCycles(db, { fileLevel: !opts.functions, noTests: resolveNoTests(opts) });
|
|
276
319
|
db.close();
|
|
277
320
|
|
|
278
321
|
if (opts.json) {
|
|
@@ -374,10 +417,13 @@ program
|
|
|
374
417
|
.action(() => {
|
|
375
418
|
console.log('\nAvailable embedding models:\n');
|
|
376
419
|
for (const [key, config] of Object.entries(MODELS)) {
|
|
377
|
-
const def = key === '
|
|
378
|
-
|
|
420
|
+
const def = key === 'minilm' ? ' (default)' : '';
|
|
421
|
+
const ctx = config.contextWindow ? `${config.contextWindow} ctx` : '';
|
|
422
|
+
console.log(
|
|
423
|
+
` ${key.padEnd(12)} ${String(config.dim).padStart(4)}d ${ctx.padEnd(9)} ${config.desc}${def}`,
|
|
424
|
+
);
|
|
379
425
|
}
|
|
380
|
-
console.log('\nUsage: codegraph embed --model <name>');
|
|
426
|
+
console.log('\nUsage: codegraph embed --model <name> --strategy <structured|source>');
|
|
381
427
|
console.log(' codegraph search "query" --model <name>\n');
|
|
382
428
|
});
|
|
383
429
|
|
|
@@ -388,12 +434,23 @@ program
|
|
|
388
434
|
)
|
|
389
435
|
.option(
|
|
390
436
|
'-m, --model <name>',
|
|
391
|
-
'Embedding model: minilm, jina-small, jina-base, jina-code, nomic, nomic-v1.5
|
|
392
|
-
'
|
|
437
|
+
'Embedding model: minilm (default), jina-small, jina-base, jina-code, nomic, nomic-v1.5, bge-large. Run `codegraph models` for details',
|
|
438
|
+
'minilm',
|
|
439
|
+
)
|
|
440
|
+
.option(
|
|
441
|
+
'-s, --strategy <name>',
|
|
442
|
+
`Embedding strategy: ${EMBEDDING_STRATEGIES.join(', ')}. "structured" uses graph context (callers/callees), "source" embeds raw code`,
|
|
443
|
+
'structured',
|
|
393
444
|
)
|
|
394
445
|
.action(async (dir, opts) => {
|
|
446
|
+
if (!EMBEDDING_STRATEGIES.includes(opts.strategy)) {
|
|
447
|
+
console.error(
|
|
448
|
+
`Unknown strategy: ${opts.strategy}. Available: ${EMBEDDING_STRATEGIES.join(', ')}`,
|
|
449
|
+
);
|
|
450
|
+
process.exit(1);
|
|
451
|
+
}
|
|
395
452
|
const root = path.resolve(dir || '.');
|
|
396
|
-
await buildEmbeddings(root, opts.model);
|
|
453
|
+
await buildEmbeddings(root, opts.model, undefined, { strategy: opts.strategy });
|
|
397
454
|
});
|
|
398
455
|
|
|
399
456
|
program
|
|
@@ -403,6 +460,7 @@ program
|
|
|
403
460
|
.option('-m, --model <name>', 'Override embedding model (auto-detects from DB)')
|
|
404
461
|
.option('-n, --limit <number>', 'Max results', '15')
|
|
405
462
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
463
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
406
464
|
.option('--min-score <score>', 'Minimum similarity threshold', '0.2')
|
|
407
465
|
.option('-k, --kind <kind>', 'Filter by kind: function, method, class')
|
|
408
466
|
.option('--file <pattern>', 'Filter by file path pattern')
|
|
@@ -410,7 +468,7 @@ program
|
|
|
410
468
|
.action(async (query, opts) => {
|
|
411
469
|
await search(query, opts.db, {
|
|
412
470
|
limit: parseInt(opts.limit, 10),
|
|
413
|
-
noTests:
|
|
471
|
+
noTests: resolveNoTests(opts),
|
|
414
472
|
minScore: parseFloat(opts.minScore),
|
|
415
473
|
model: opts.model,
|
|
416
474
|
kind: opts.kind,
|
|
@@ -428,6 +486,7 @@ program
|
|
|
428
486
|
.option('--depth <n>', 'Max directory depth')
|
|
429
487
|
.option('--sort <metric>', 'Sort by: cohesion | fan-in | fan-out | density | files', 'files')
|
|
430
488
|
.option('-T, --no-tests', 'Exclude test/spec files')
|
|
489
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
431
490
|
.option('-j, --json', 'Output as JSON')
|
|
432
491
|
.action(async (dir, opts) => {
|
|
433
492
|
const { structureData, formatStructure } = await import('./structure.js');
|
|
@@ -435,7 +494,7 @@ program
|
|
|
435
494
|
directory: dir,
|
|
436
495
|
depth: opts.depth ? parseInt(opts.depth, 10) : undefined,
|
|
437
496
|
sort: opts.sort,
|
|
438
|
-
noTests:
|
|
497
|
+
noTests: resolveNoTests(opts),
|
|
439
498
|
});
|
|
440
499
|
if (opts.json) {
|
|
441
500
|
console.log(JSON.stringify(data, null, 2));
|
|
@@ -454,6 +513,7 @@ program
|
|
|
454
513
|
.option('--metric <metric>', 'fan-in | fan-out | density | coupling', 'fan-in')
|
|
455
514
|
.option('--level <level>', 'file | directory', 'file')
|
|
456
515
|
.option('-T, --no-tests', 'Exclude test/spec files from results')
|
|
516
|
+
.option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
|
|
457
517
|
.option('-j, --json', 'Output as JSON')
|
|
458
518
|
.action(async (opts) => {
|
|
459
519
|
const { hotspotsData, formatHotspots } = await import('./structure.js');
|
|
@@ -461,7 +521,7 @@ program
|
|
|
461
521
|
metric: opts.metric,
|
|
462
522
|
level: opts.level,
|
|
463
523
|
limit: parseInt(opts.limit, 10),
|
|
464
|
-
noTests:
|
|
524
|
+
noTests: resolveNoTests(opts),
|
|
465
525
|
});
|
|
466
526
|
if (opts.json) {
|
|
467
527
|
console.log(JSON.stringify(data, null, 2));
|
package/src/config.js
CHANGED
package/src/embedder.js
CHANGED
|
@@ -4,6 +4,18 @@ import Database from 'better-sqlite3';
|
|
|
4
4
|
import { findDbPath, openReadonlyOrFail } from './db.js';
|
|
5
5
|
import { warn } from './logger.js';
|
|
6
6
|
|
|
7
|
+
/**
 * Break a code identifier into space-separated words.
 *
 * A space is inserted at every lowercase→uppercase transition
 * ("camelCase" → "camel Case") and between an uppercase run and a following
 * capitalised word ("XMLParser" → "XML Parser"). Runs of "_" or "-" collapse
 * into a single space ("snake_case" → "snake case", "kebab-case" → "kebab case").
 * Letter case is left untouched and surrounding whitespace is trimmed.
 *
 * @param {string} name - Identifier to split.
 * @returns {string} The identifier as readable words.
 */
function splitIdentifier(name) {
  const lowerThenUpper = /([a-z])([A-Z])/g;
  const acronymThenWord = /([A-Z]+)([A-Z][a-z])/g;
  const separatorRun = /[_-]+/g;

  let words = name.replace(lowerThenUpper, '$1 $2');
  words = words.replace(acronymThenWord, '$1 $2');
  words = words.replace(separatorRun, ' ');
  return words.trim();
}
|
|
18
|
+
|
|
7
19
|
// Lazy-load transformers (heavy, optional module)
|
|
8
20
|
let pipeline = null;
|
|
9
21
|
let _cos_sim = null;
|
|
@@ -14,48 +26,57 @@ export const MODELS = {
|
|
|
14
26
|
minilm: {
|
|
15
27
|
name: 'Xenova/all-MiniLM-L6-v2',
|
|
16
28
|
dim: 384,
|
|
29
|
+
contextWindow: 256,
|
|
17
30
|
desc: 'Smallest, fastest (~23MB). General text.',
|
|
18
31
|
quantized: true,
|
|
19
32
|
},
|
|
20
33
|
'jina-small': {
|
|
21
34
|
name: 'Xenova/jina-embeddings-v2-small-en',
|
|
22
35
|
dim: 512,
|
|
36
|
+
contextWindow: 8192,
|
|
23
37
|
desc: 'Small, good quality (~33MB). General text.',
|
|
24
38
|
quantized: false,
|
|
25
39
|
},
|
|
26
40
|
'jina-base': {
|
|
27
41
|
name: 'Xenova/jina-embeddings-v2-base-en',
|
|
28
42
|
dim: 768,
|
|
43
|
+
contextWindow: 8192,
|
|
29
44
|
desc: 'Good quality (~137MB). General text, 8192 token context.',
|
|
30
45
|
quantized: false,
|
|
31
46
|
},
|
|
32
47
|
'jina-code': {
|
|
33
48
|
name: 'Xenova/jina-embeddings-v2-base-code',
|
|
34
49
|
dim: 768,
|
|
50
|
+
contextWindow: 8192,
|
|
35
51
|
desc: 'Code-aware (~137MB). Trained on code+text, best for code search.',
|
|
36
52
|
quantized: false,
|
|
37
53
|
},
|
|
38
54
|
nomic: {
|
|
39
55
|
name: 'Xenova/nomic-embed-text-v1',
|
|
40
56
|
dim: 768,
|
|
57
|
+
contextWindow: 8192,
|
|
41
58
|
desc: 'Good local quality (~137MB). 8192 context.',
|
|
42
59
|
quantized: false,
|
|
43
60
|
},
|
|
44
61
|
'nomic-v1.5': {
|
|
45
62
|
name: 'nomic-ai/nomic-embed-text-v1.5',
|
|
46
63
|
dim: 768,
|
|
64
|
+
contextWindow: 8192,
|
|
47
65
|
desc: 'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.',
|
|
48
66
|
quantized: false,
|
|
49
67
|
},
|
|
50
68
|
'bge-large': {
|
|
51
69
|
name: 'Xenova/bge-large-en-v1.5',
|
|
52
70
|
dim: 1024,
|
|
71
|
+
contextWindow: 512,
|
|
53
72
|
desc: 'Best general retrieval (~335MB). Top MTEB scores.',
|
|
54
73
|
quantized: false,
|
|
55
74
|
},
|
|
56
75
|
};
|
|
57
76
|
|
|
58
|
-
export const
|
|
77
|
+
export const EMBEDDING_STRATEGIES = ['structured', 'source'];
|
|
78
|
+
|
|
79
|
+
export const DEFAULT_MODEL = 'minilm';
|
|
59
80
|
const BATCH_SIZE_MAP = {
|
|
60
81
|
minilm: 32,
|
|
61
82
|
'jina-small': 16,
|
|
@@ -77,6 +98,108 @@ function getModelConfig(modelKey) {
|
|
|
77
98
|
return config;
|
|
78
99
|
}
|
|
79
100
|
|
|
101
|
+
/**
 * Approximate how many model tokens a piece of text will occupy.
 *
 * Uses a fixed heuristic of one token per four characters (rounded up) for
 * code/English text, so no tokenizer dependency is needed. The character count
 * is `text.length`.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count.
 */
export function estimateTokens(text) {
  const CHARS_PER_TOKEN = 4;
  const charCount = text.length;
  return Math.ceil(charCount / CHARS_PER_TOKEN);
}
|
|
108
|
+
|
|
109
|
+
/**
 * Collect the comment block that sits directly above a definition.
 *
 * Scans upward from the line before `fnLineIndex` (at most 15 lines), gathering
 * lines that start with a comment marker (`//`, `///`, `/*`, `*`, `*\/`, `#`).
 * Blank lines between the definition and the comment are skipped; once at least
 * one comment line has been collected, a blank line or any other non-comment
 * line ends the scan.
 *
 * @param {string[]} lines - File contents split into lines.
 * @param {number} fnLineIndex - Zero-based index into `lines` of the definition's first line.
 * @returns {string|null} Comment text with markers stripped and lines joined by a
 *   single space, or null when no comment text is found.
 */
function extractLeadingComment(lines, fnLineIndex) {
  // `///` needs no alternative of its own: it is already matched by `//`.
  const commentStart = /^(\/\/|\/\*|\*\/|\*|#)/;
  const lowestIndex = Math.max(0, fnLineIndex - 15);
  const collected = [];

  for (let i = fnLineIndex - 1; i >= lowestIndex; i--) {
    const line = lines[i];
    // The index points past the end of the file (e.g. the recorded line number no
    // longer matches the file contents): there is nothing meaningful to read.
    if (typeof line !== 'string') break;

    const trimmed = line.trim();
    if (commentStart.test(trimmed)) {
      collected.push(trimmed);
    } else if (trimmed !== '' || collected.length > 0) {
      // Code above the definition, or a blank line above an already-found comment.
      break;
    }
  }

  if (collected.length === 0) return null;

  const text = collected
    .reverse() // gathered bottom-up; restore top-down reading order
    .map((line) =>
      line
        .replace(/^\/\*\*?\s?|\*\/$/g, '') // opening /** or /* and closing */
        .replace(/^\*\s?/, '') // middle * lines
        .replace(/^\/\/\/?\s?/, '') // // or ///
        .replace(/^#\s?/, '') // # (Python/Ruby)
        .trim(),
    )
    .filter((line) => line.length > 0)
    .join(' ');

  // Only markers were captured (e.g. a lone closing `*/`): report "no comment".
  return text || null;
}
|
|
138
|
+
|
|
139
|
+
/**
|
|
140
|
+
* Build graph-enriched text for a symbol using dependency context.
|
|
141
|
+
* Produces compact, semantic text (~100 tokens) instead of full source code.
|
|
142
|
+
*/
|
|
143
|
+
function buildStructuredText(node, file, lines, calleesStmt, callersStmt) {
|
|
144
|
+
const readable = splitIdentifier(node.name);
|
|
145
|
+
const parts = [`${node.kind} ${node.name} (${readable}) in ${file}`];
|
|
146
|
+
const startLine = Math.max(0, node.line - 1);
|
|
147
|
+
|
|
148
|
+
// Extract parameters from signature (best-effort, single-line)
|
|
149
|
+
const sigLine = lines[startLine] || '';
|
|
150
|
+
const paramMatch = sigLine.match(/\(([^)]*)\)/);
|
|
151
|
+
if (paramMatch?.[1]?.trim()) {
|
|
152
|
+
parts.push(`Parameters: ${paramMatch[1].trim()}`);
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
// Graph context: callees (capped at 10)
|
|
156
|
+
const callees = calleesStmt.all(node.id);
|
|
157
|
+
if (callees.length > 0) {
|
|
158
|
+
parts.push(
|
|
159
|
+
`Calls: ${callees
|
|
160
|
+
.slice(0, 10)
|
|
161
|
+
.map((c) => c.name)
|
|
162
|
+
.join(', ')}`,
|
|
163
|
+
);
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
// Graph context: callers (capped at 10)
|
|
167
|
+
const callers = callersStmt.all(node.id);
|
|
168
|
+
if (callers.length > 0) {
|
|
169
|
+
parts.push(
|
|
170
|
+
`Called by: ${callers
|
|
171
|
+
.slice(0, 10)
|
|
172
|
+
.map((c) => c.name)
|
|
173
|
+
.join(', ')}`,
|
|
174
|
+
);
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
// Leading comment (high semantic value) or first few lines of code
|
|
178
|
+
const comment = extractLeadingComment(lines, startLine);
|
|
179
|
+
if (comment) {
|
|
180
|
+
parts.push(comment);
|
|
181
|
+
} else {
|
|
182
|
+
const endLine = Math.min(lines.length, startLine + 4);
|
|
183
|
+
const snippet = lines.slice(startLine, endLine).join('\n').trim();
|
|
184
|
+
if (snippet) parts.push(snippet);
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
return parts.join('\n');
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
/**
 * Compose the raw-source description of a symbol for embedding (the original strategy).
 *
 * The result is the header "<kind> <name> (<name split into words>) in <file>"
 * followed, on the next line, by the symbol's source lines: from `node.line`
 * (treated as 1-based) through `node.end_line`, or 15 lines when `end_line` is
 * not set. The range is clipped to the end of the file.
 *
 * @param {object} node - Symbol row; `kind`, `name`, `line` and `end_line` are read.
 * @param {string} file - File path shown in the header.
 * @param {string[]} lines - The file's contents split into lines.
 * @returns {string} Header plus source excerpt.
 */
function buildSourceText(node, file, lines) {
  const DEFAULT_SPAN = 15;
  const fromIdx = Math.max(0, node.line - 1);

  let toIdx = fromIdx + DEFAULT_SPAN;
  if (node.end_line) {
    toIdx = node.end_line;
  }
  toIdx = Math.min(lines.length, toIdx);

  const excerpt = lines.slice(fromIdx, toIdx).join('\n');
  const words = splitIdentifier(node.name);
  const header = `${node.kind} ${node.name} (${words}) in ${file}`;
  return [header, excerpt].join('\n');
}
|
|
202
|
+
|
|
80
203
|
/**
|
|
81
204
|
* Lazy-load @huggingface/transformers.
|
|
82
205
|
* This is an optional dependency — gives a clear error if not installed.
|
|
@@ -103,8 +226,27 @@ async function loadModel(modelKey) {
|
|
|
103
226
|
_cos_sim = transformers.cos_sim;
|
|
104
227
|
|
|
105
228
|
console.log(`Loading embedding model: ${config.name} (${config.dim}d)...`);
|
|
106
|
-
const
|
|
107
|
-
|
|
229
|
+
const pipelineOpts = config.quantized ? { quantized: true } : {};
|
|
230
|
+
try {
|
|
231
|
+
extractor = await pipeline('feature-extraction', config.name, pipelineOpts);
|
|
232
|
+
} catch (err) {
|
|
233
|
+
const msg = err.message || String(err);
|
|
234
|
+
if (msg.includes('Unauthorized') || msg.includes('401') || msg.includes('gated')) {
|
|
235
|
+
console.error(
|
|
236
|
+
`\nModel "${config.name}" requires authentication.\n` +
|
|
237
|
+
`This model is gated on HuggingFace and needs an access token.\n\n` +
|
|
238
|
+
`Options:\n` +
|
|
239
|
+
` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` +
|
|
240
|
+
` 2. Use a public model instead: codegraph embed --model minilm\n`,
|
|
241
|
+
);
|
|
242
|
+
} else {
|
|
243
|
+
console.error(
|
|
244
|
+
`\nFailed to load model "${config.name}": ${msg}\n` +
|
|
245
|
+
`Try a different model: codegraph embed --model minilm\n`,
|
|
246
|
+
);
|
|
247
|
+
}
|
|
248
|
+
process.exit(1);
|
|
249
|
+
}
|
|
108
250
|
activeModel = config.name;
|
|
109
251
|
console.log('Model loaded.');
|
|
110
252
|
return { extractor, config };
|
|
@@ -172,10 +314,14 @@ function initEmbeddingsSchema(db) {
|
|
|
172
314
|
|
|
173
315
|
/**
|
|
174
316
|
* Build embeddings for all functions/methods/classes in the graph.
|
|
317
|
+
* @param {string} rootDir - Project root directory
|
|
318
|
+
* @param {string} modelKey - Model identifier from MODELS registry
|
|
319
|
+
* @param {string} [customDbPath] - Override path to graph.db
|
|
320
|
+
* @param {object} [options] - Embedding options
|
|
321
|
+
* @param {string} [options.strategy='structured'] - 'structured' (graph-enriched) or 'source' (raw code)
|
|
175
322
|
*/
|
|
176
|
-
export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
|
|
177
|
-
|
|
178
|
-
// fs already imported at top
|
|
323
|
+
export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = {}) {
|
|
324
|
+
const strategy = options.strategy || 'structured';
|
|
179
325
|
const dbPath = customDbPath || findDbPath(null);
|
|
180
326
|
|
|
181
327
|
const db = new Database(dbPath);
|
|
@@ -190,7 +336,24 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
|
|
|
190
336
|
)
|
|
191
337
|
.all();
|
|
192
338
|
|
|
193
|
-
console.log(`Building embeddings for ${nodes.length} symbols...`);
|
|
339
|
+
console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`);
|
|
340
|
+
|
|
341
|
+
// Prepare graph-context queries for structured strategy
|
|
342
|
+
let calleesStmt, callersStmt;
|
|
343
|
+
if (strategy === 'structured') {
|
|
344
|
+
calleesStmt = db.prepare(`
|
|
345
|
+
SELECT DISTINCT n.name FROM edges e
|
|
346
|
+
JOIN nodes n ON e.target_id = n.id
|
|
347
|
+
WHERE e.source_id = ? AND e.kind = 'calls'
|
|
348
|
+
ORDER BY n.name
|
|
349
|
+
`);
|
|
350
|
+
callersStmt = db.prepare(`
|
|
351
|
+
SELECT DISTINCT n.name FROM edges e
|
|
352
|
+
JOIN nodes n ON e.source_id = n.id
|
|
353
|
+
WHERE e.target_id = ? AND e.kind = 'calls'
|
|
354
|
+
ORDER BY n.name
|
|
355
|
+
`);
|
|
356
|
+
}
|
|
194
357
|
|
|
195
358
|
const byFile = new Map();
|
|
196
359
|
for (const node of nodes) {
|
|
@@ -201,6 +364,9 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
|
|
|
201
364
|
const texts = [];
|
|
202
365
|
const nodeIds = [];
|
|
203
366
|
const previews = [];
|
|
367
|
+
const config = getModelConfig(modelKey);
|
|
368
|
+
const contextWindow = config.contextWindow;
|
|
369
|
+
let overflowCount = 0;
|
|
204
370
|
|
|
205
371
|
for (const [file, fileNodes] of byFile) {
|
|
206
372
|
const fullPath = path.join(rootDir, file);
|
|
@@ -213,19 +379,31 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
|
|
|
213
379
|
}
|
|
214
380
|
|
|
215
381
|
for (const node of fileNodes) {
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
382
|
+
let text =
|
|
383
|
+
strategy === 'structured'
|
|
384
|
+
? buildStructuredText(node, file, lines, calleesStmt, callersStmt)
|
|
385
|
+
: buildSourceText(node, file, lines);
|
|
386
|
+
|
|
387
|
+
// Detect and handle context window overflow
|
|
388
|
+
const tokens = estimateTokens(text);
|
|
389
|
+
if (tokens > contextWindow) {
|
|
390
|
+
overflowCount++;
|
|
391
|
+
const maxChars = contextWindow * 4;
|
|
392
|
+
text = text.slice(0, maxChars);
|
|
393
|
+
}
|
|
221
394
|
|
|
222
|
-
const text = `${node.kind} ${node.name} in ${file}\n${context}`;
|
|
223
395
|
texts.push(text);
|
|
224
396
|
nodeIds.push(node.id);
|
|
225
397
|
previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`);
|
|
226
398
|
}
|
|
227
399
|
}
|
|
228
400
|
|
|
401
|
+
if (overflowCount > 0) {
|
|
402
|
+
warn(
|
|
403
|
+
`${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`,
|
|
404
|
+
);
|
|
405
|
+
}
|
|
406
|
+
|
|
229
407
|
console.log(`Embedding ${texts.length} symbols...`);
|
|
230
408
|
const { vectors, dim } = await embed(texts, modelKey);
|
|
231
409
|
|
|
@@ -237,16 +415,19 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
|
|
|
237
415
|
for (let i = 0; i < vectors.length; i++) {
|
|
238
416
|
insert.run(nodeIds[i], Buffer.from(vectors[i].buffer), previews[i]);
|
|
239
417
|
}
|
|
240
|
-
const config = getModelConfig(modelKey);
|
|
241
418
|
insertMeta.run('model', config.name);
|
|
242
419
|
insertMeta.run('dim', String(dim));
|
|
243
420
|
insertMeta.run('count', String(vectors.length));
|
|
421
|
+
insertMeta.run('strategy', strategy);
|
|
244
422
|
insertMeta.run('built_at', new Date().toISOString());
|
|
423
|
+
if (overflowCount > 0) {
|
|
424
|
+
insertMeta.run('truncated_count', String(overflowCount));
|
|
425
|
+
}
|
|
245
426
|
});
|
|
246
427
|
insertAll();
|
|
247
428
|
|
|
248
429
|
console.log(
|
|
249
|
-
`\nStored ${vectors.length} embeddings (${dim}d, ${
|
|
430
|
+
`\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`,
|
|
250
431
|
);
|
|
251
432
|
db.close();
|
|
252
433
|
}
|