@optave/codegraph 2.2.1 → 2.2.3-dev.44e8146

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/cli.js CHANGED
@@ -5,9 +5,10 @@ import path from 'node:path';
5
5
  import Database from 'better-sqlite3';
6
6
  import { Command } from 'commander';
7
7
  import { buildGraph } from './builder.js';
8
+ import { loadConfig } from './config.js';
8
9
  import { findCycles, formatCycles } from './cycles.js';
9
10
  import { findDbPath } from './db.js';
10
- import { buildEmbeddings, MODELS, search } from './embedder.js';
11
+ import { buildEmbeddings, EMBEDDING_STRATEGIES, MODELS, search } from './embedder.js';
11
12
  import { exportDOT, exportJSON, exportMermaid } from './export.js';
12
13
  import { setVerbose } from './logger.js';
13
14
  import {
@@ -36,6 +37,8 @@ import { watchProject } from './watcher.js';
36
37
  const __cliDir = path.dirname(new URL(import.meta.url).pathname.replace(/^\/([A-Z]:)/i, '$1'));
37
38
  const pkg = JSON.parse(fs.readFileSync(path.join(__cliDir, '..', 'package.json'), 'utf-8'));
38
39
 
40
+ const config = loadConfig(process.cwd());
41
+
39
42
  const program = new Command();
40
43
  program
41
44
  .name('codegraph')
@@ -48,6 +51,18 @@ program
48
51
  if (opts.verbose) setVerbose(true);
49
52
  });
50
53
 
54
/**
 * Resolve the effective `noTests` value for a command invocation.
 *
 * Precedence: `--include-tests` > `-T/--no-tests` > `config.query.excludeTests` > false.
 * Commander sets `opts.tests` to `false` when `--no-tests` is passed and
 * `opts.includeTests` to `true` when `--include-tests` is passed.
 *
 * @param {object} opts - Parsed Commander options for the command.
 * @returns {boolean} `true` when test/spec files should be excluded.
 */
function resolveNoTests(opts) {
  // An explicit request to include tests beats everything else.
  if (opts.includeTests) return false;
  // `--no-tests` on the command line beats the config file.
  if (opts.tests === false) return true;
  // Coerce so callers always receive a real boolean, even when the config
  // stores a truthy non-boolean (e.g. 1) or has no `query` section at all.
  return Boolean(config?.query?.excludeTests);
}
65
+
51
66
  program
52
67
  .command('build [dir]')
53
68
  .description('Parse repo and build graph in .codegraph/graph.db')
@@ -63,9 +78,10 @@ program
63
78
  .description('Find a function/class, show callers and callees')
64
79
  .option('-d, --db <path>', 'Path to graph.db')
65
80
  .option('-T, --no-tests', 'Exclude test/spec files from results')
81
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
66
82
  .option('-j, --json', 'Output as JSON')
67
83
  .action((name, opts) => {
68
- queryName(name, opts.db, { noTests: !opts.tests, json: opts.json });
84
+ queryName(name, opts.db, { noTests: resolveNoTests(opts), json: opts.json });
69
85
  });
70
86
 
71
87
  program
@@ -73,9 +89,10 @@ program
73
89
  .description('Show what depends on this file (transitive)')
74
90
  .option('-d, --db <path>', 'Path to graph.db')
75
91
  .option('-T, --no-tests', 'Exclude test/spec files from results')
92
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
76
93
  .option('-j, --json', 'Output as JSON')
77
94
  .action((file, opts) => {
78
- impactAnalysis(file, opts.db, { noTests: !opts.tests, json: opts.json });
95
+ impactAnalysis(file, opts.db, { noTests: resolveNoTests(opts), json: opts.json });
79
96
  });
80
97
 
81
98
  program
@@ -84,9 +101,13 @@ program
84
101
  .option('-d, --db <path>', 'Path to graph.db')
85
102
  .option('-n, --limit <number>', 'Number of top nodes', '20')
86
103
  .option('-T, --no-tests', 'Exclude test/spec files from results')
104
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
87
105
  .option('-j, --json', 'Output as JSON')
88
106
  .action((opts) => {
89
- moduleMap(opts.db, parseInt(opts.limit, 10), { noTests: !opts.tests, json: opts.json });
107
+ moduleMap(opts.db, parseInt(opts.limit, 10), {
108
+ noTests: resolveNoTests(opts),
109
+ json: opts.json,
110
+ });
90
111
  });
91
112
 
92
113
  program
@@ -94,9 +115,10 @@ program
94
115
  .description('Show graph health overview: nodes, edges, languages, cycles, hotspots, embeddings')
95
116
  .option('-d, --db <path>', 'Path to graph.db')
96
117
  .option('-T, --no-tests', 'Exclude test/spec files from results')
118
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
97
119
  .option('-j, --json', 'Output as JSON')
98
120
  .action((opts) => {
99
- stats(opts.db, { noTests: !opts.tests, json: opts.json });
121
+ stats(opts.db, { noTests: resolveNoTests(opts), json: opts.json });
100
122
  });
101
123
 
102
124
  program
@@ -104,9 +126,10 @@ program
104
126
  .description('Show what this file imports and what imports it')
105
127
  .option('-d, --db <path>', 'Path to graph.db')
106
128
  .option('-T, --no-tests', 'Exclude test/spec files from results')
129
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
107
130
  .option('-j, --json', 'Output as JSON')
108
131
  .action((file, opts) => {
109
- fileDeps(file, opts.db, { noTests: !opts.tests, json: opts.json });
132
+ fileDeps(file, opts.db, { noTests: resolveNoTests(opts), json: opts.json });
110
133
  });
111
134
 
112
135
  program
@@ -117,6 +140,7 @@ program
117
140
  .option('-f, --file <path>', 'Scope search to functions in this file (partial match)')
118
141
  .option('-k, --kind <kind>', 'Filter to a specific symbol kind')
119
142
  .option('-T, --no-tests', 'Exclude test/spec files from results')
143
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
120
144
  .option('-j, --json', 'Output as JSON')
121
145
  .action((name, opts) => {
122
146
  if (opts.kind && !ALL_SYMBOL_KINDS.includes(opts.kind)) {
@@ -127,7 +151,7 @@ program
127
151
  depth: parseInt(opts.depth, 10),
128
152
  file: opts.file,
129
153
  kind: opts.kind,
130
- noTests: !opts.tests,
154
+ noTests: resolveNoTests(opts),
131
155
  json: opts.json,
132
156
  });
133
157
  });
@@ -140,6 +164,7 @@ program
140
164
  .option('-f, --file <path>', 'Scope search to functions in this file (partial match)')
141
165
  .option('-k, --kind <kind>', 'Filter to a specific symbol kind')
142
166
  .option('-T, --no-tests', 'Exclude test/spec files from results')
167
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
143
168
  .option('-j, --json', 'Output as JSON')
144
169
  .action((name, opts) => {
145
170
  if (opts.kind && !ALL_SYMBOL_KINDS.includes(opts.kind)) {
@@ -150,7 +175,7 @@ program
150
175
  depth: parseInt(opts.depth, 10),
151
176
  file: opts.file,
152
177
  kind: opts.kind,
153
- noTests: !opts.tests,
178
+ noTests: resolveNoTests(opts),
154
179
  json: opts.json,
155
180
  });
156
181
  });
@@ -163,8 +188,9 @@ program
163
188
  .option('-f, --file <path>', 'Scope search to functions in this file (partial match)')
164
189
  .option('-k, --kind <kind>', 'Filter to a specific symbol kind')
165
190
  .option('--no-source', 'Metadata only (skip source extraction)')
166
- .option('--include-tests', 'Include test source code')
191
+ .option('--with-test-source', 'Include test source code')
167
192
  .option('-T, --no-tests', 'Exclude test/spec files from results')
193
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
168
194
  .option('-j, --json', 'Output as JSON')
169
195
  .action((name, opts) => {
170
196
  if (opts.kind && !ALL_SYMBOL_KINDS.includes(opts.kind)) {
@@ -176,8 +202,8 @@ program
176
202
  file: opts.file,
177
203
  kind: opts.kind,
178
204
  noSource: !opts.source,
179
- noTests: !opts.tests,
180
- includeTests: opts.includeTests,
205
+ noTests: resolveNoTests(opts),
206
+ includeTests: opts.withTestSource,
181
207
  json: opts.json,
182
208
  });
183
209
  });
@@ -186,10 +212,16 @@ program
186
212
  .command('explain <target>')
187
213
  .description('Structural summary of a file or function (no LLM needed)')
188
214
  .option('-d, --db <path>', 'Path to graph.db')
215
+ .option('--depth <n>', 'Recursively explain dependencies up to N levels deep', '0')
189
216
  .option('-T, --no-tests', 'Exclude test/spec files from results')
217
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
190
218
  .option('-j, --json', 'Output as JSON')
191
219
  .action((target, opts) => {
192
- explain(target, opts.db, { noTests: !opts.tests, json: opts.json });
220
+ explain(target, opts.db, {
221
+ depth: parseInt(opts.depth, 10),
222
+ noTests: resolveNoTests(opts),
223
+ json: opts.json,
224
+ });
193
225
  });
194
226
 
195
227
  program
@@ -198,6 +230,7 @@ program
198
230
  .option('-d, --db <path>', 'Path to graph.db')
199
231
  .option('-f, --file <path>', 'File overview: list symbols, imports, exports')
200
232
  .option('-T, --no-tests', 'Exclude test/spec files from results')
233
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
201
234
  .option('-j, --json', 'Output as JSON')
202
235
  .action((name, opts) => {
203
236
  if (!name && !opts.file) {
@@ -205,7 +238,7 @@ program
205
238
  process.exit(1);
206
239
  }
207
240
  const target = opts.file || name;
208
- where(target, opts.db, { file: !!opts.file, noTests: !opts.tests, json: opts.json });
241
+ where(target, opts.db, { file: !!opts.file, noTests: resolveNoTests(opts), json: opts.json });
209
242
  });
210
243
 
211
244
  program
@@ -215,14 +248,17 @@ program
215
248
  .option('--staged', 'Analyze staged changes instead of unstaged')
216
249
  .option('--depth <n>', 'Max transitive caller depth', '3')
217
250
  .option('-T, --no-tests', 'Exclude test/spec files from results')
251
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
218
252
  .option('-j, --json', 'Output as JSON')
253
+ .option('-f, --format <format>', 'Output format: text, mermaid, json', 'text')
219
254
  .action((ref, opts) => {
220
255
  diffImpact(opts.db, {
221
256
  ref,
222
257
  staged: opts.staged,
223
258
  depth: parseInt(opts.depth, 10),
224
- noTests: !opts.tests,
259
+ noTests: resolveNoTests(opts),
225
260
  json: opts.json,
261
+ format: opts.format,
226
262
  });
227
263
  });
228
264
 
@@ -235,10 +271,16 @@ program
235
271
  .option('-f, --format <format>', 'Output format: dot, mermaid, json', 'dot')
236
272
  .option('--functions', 'Function-level graph instead of file-level')
237
273
  .option('-T, --no-tests', 'Exclude test/spec files')
274
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
275
+ .option('--min-confidence <score>', 'Minimum edge confidence threshold (default: 0.5)', '0.5')
238
276
  .option('-o, --output <file>', 'Write to file instead of stdout')
239
277
  .action((opts) => {
240
278
  const db = new Database(findDbPath(opts.db), { readonly: true });
241
- const exportOpts = { fileLevel: !opts.functions, noTests: !opts.tests };
279
+ const exportOpts = {
280
+ fileLevel: !opts.functions,
281
+ noTests: resolveNoTests(opts),
282
+ minConfidence: parseFloat(opts.minConfidence),
283
+ };
242
284
 
243
285
  let output;
244
286
  switch (opts.format) {
@@ -269,10 +311,11 @@ program
269
311
  .option('-d, --db <path>', 'Path to graph.db')
270
312
  .option('--functions', 'Function-level cycle detection')
271
313
  .option('-T, --no-tests', 'Exclude test/spec files')
314
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
272
315
  .option('-j, --json', 'Output as JSON')
273
316
  .action((opts) => {
274
317
  const db = new Database(findDbPath(opts.db), { readonly: true });
275
- const cycles = findCycles(db, { fileLevel: !opts.functions, noTests: !opts.tests });
318
+ const cycles = findCycles(db, { fileLevel: !opts.functions, noTests: resolveNoTests(opts) });
276
319
  db.close();
277
320
 
278
321
  if (opts.json) {
@@ -374,10 +417,13 @@ program
374
417
  .action(() => {
375
418
  console.log('\nAvailable embedding models:\n');
376
419
  for (const [key, config] of Object.entries(MODELS)) {
377
- const def = key === 'nomic-v1.5' ? ' (default)' : '';
378
- console.log(` ${key.padEnd(12)} ${String(config.dim).padStart(4)}d ${config.desc}${def}`);
420
+ const def = key === 'minilm' ? ' (default)' : '';
421
+ const ctx = config.contextWindow ? `${config.contextWindow} ctx` : '';
422
+ console.log(
423
+ ` ${key.padEnd(12)} ${String(config.dim).padStart(4)}d ${ctx.padEnd(9)} ${config.desc}${def}`,
424
+ );
379
425
  }
380
- console.log('\nUsage: codegraph embed --model <name>');
426
+ console.log('\nUsage: codegraph embed --model <name> --strategy <structured|source>');
381
427
  console.log(' codegraph search "query" --model <name>\n');
382
428
  });
383
429
 
@@ -388,12 +434,23 @@ program
388
434
  )
389
435
  .option(
390
436
  '-m, --model <name>',
391
- 'Embedding model: minilm, jina-small, jina-base, jina-code, nomic, nomic-v1.5 (default), bge-large. Run `codegraph models` for details',
392
- 'nomic-v1.5',
437
+ 'Embedding model: minilm (default), jina-small, jina-base, jina-code, nomic, nomic-v1.5, bge-large. Run `codegraph models` for details',
438
+ 'minilm',
439
+ )
440
+ .option(
441
+ '-s, --strategy <name>',
442
+ `Embedding strategy: ${EMBEDDING_STRATEGIES.join(', ')}. "structured" uses graph context (callers/callees), "source" embeds raw code`,
443
+ 'structured',
393
444
  )
394
445
  .action(async (dir, opts) => {
446
+ if (!EMBEDDING_STRATEGIES.includes(opts.strategy)) {
447
+ console.error(
448
+ `Unknown strategy: ${opts.strategy}. Available: ${EMBEDDING_STRATEGIES.join(', ')}`,
449
+ );
450
+ process.exit(1);
451
+ }
395
452
  const root = path.resolve(dir || '.');
396
- await buildEmbeddings(root, opts.model);
453
+ await buildEmbeddings(root, opts.model, undefined, { strategy: opts.strategy });
397
454
  });
398
455
 
399
456
  program
@@ -403,6 +460,7 @@ program
403
460
  .option('-m, --model <name>', 'Override embedding model (auto-detects from DB)')
404
461
  .option('-n, --limit <number>', 'Max results', '15')
405
462
  .option('-T, --no-tests', 'Exclude test/spec files from results')
463
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
406
464
  .option('--min-score <score>', 'Minimum similarity threshold', '0.2')
407
465
  .option('-k, --kind <kind>', 'Filter by kind: function, method, class')
408
466
  .option('--file <pattern>', 'Filter by file path pattern')
@@ -410,7 +468,7 @@ program
410
468
  .action(async (query, opts) => {
411
469
  await search(query, opts.db, {
412
470
  limit: parseInt(opts.limit, 10),
413
- noTests: !opts.tests,
471
+ noTests: resolveNoTests(opts),
414
472
  minScore: parseFloat(opts.minScore),
415
473
  model: opts.model,
416
474
  kind: opts.kind,
@@ -428,6 +486,7 @@ program
428
486
  .option('--depth <n>', 'Max directory depth')
429
487
  .option('--sort <metric>', 'Sort by: cohesion | fan-in | fan-out | density | files', 'files')
430
488
  .option('-T, --no-tests', 'Exclude test/spec files')
489
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
431
490
  .option('-j, --json', 'Output as JSON')
432
491
  .action(async (dir, opts) => {
433
492
  const { structureData, formatStructure } = await import('./structure.js');
@@ -435,7 +494,7 @@ program
435
494
  directory: dir,
436
495
  depth: opts.depth ? parseInt(opts.depth, 10) : undefined,
437
496
  sort: opts.sort,
438
- noTests: !opts.tests,
497
+ noTests: resolveNoTests(opts),
439
498
  });
440
499
  if (opts.json) {
441
500
  console.log(JSON.stringify(data, null, 2));
@@ -454,6 +513,7 @@ program
454
513
  .option('--metric <metric>', 'fan-in | fan-out | density | coupling', 'fan-in')
455
514
  .option('--level <level>', 'file | directory', 'file')
456
515
  .option('-T, --no-tests', 'Exclude test/spec files from results')
516
+ .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
457
517
  .option('-j, --json', 'Output as JSON')
458
518
  .action(async (opts) => {
459
519
  const { hotspotsData, formatHotspots } = await import('./structure.js');
@@ -461,7 +521,7 @@ program
461
521
  metric: opts.metric,
462
522
  level: opts.level,
463
523
  limit: parseInt(opts.limit, 10),
464
- noTests: !opts.tests,
524
+ noTests: resolveNoTests(opts),
465
525
  });
466
526
  if (opts.json) {
467
527
  console.log(JSON.stringify(data, null, 2));
package/src/config.js CHANGED
@@ -18,6 +18,7 @@ export const DEFAULTS = {
18
18
  query: {
19
19
  defaultDepth: 3,
20
20
  defaultLimit: 20,
21
+ excludeTests: false,
21
22
  },
22
23
  embeddings: { model: 'nomic-v1.5', llmProvider: null },
23
24
  llm: { provider: null, model: null, baseUrl: null, apiKey: null, apiKeyCommand: null },
package/src/embedder.js CHANGED
@@ -4,6 +4,18 @@ import Database from 'better-sqlite3';
4
4
  import { findDbPath, openReadonlyOrFail } from './db.js';
5
5
  import { warn } from './logger.js';
6
6
 
7
/**
 * Split an identifier into space-separated, human-readable words.
 *
 * - camelCase / PascalCase → "camel Case"
 * - acronym runs           → "parseHTTPResponse" → "parse HTTP Response"
 * - digit boundaries       → "sha256Hash" → "sha256 Hash"
 * - snake_case, kebab-case → "snake case", "kebab case"
 *
 * @param {string} name - Identifier to split; null/undefined yields ''.
 * @returns {string} The identifier with word boundaries replaced by spaces.
 */
function splitIdentifier(name) {
  return String(name ?? '')
    // lower-case letter or digit followed by an upper-case letter
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
    // end of an acronym run followed by a capitalised word
    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
    // underscores and hyphens act as separators
    .replace(/[_-]+/g, ' ')
    .trim();
}
18
+
7
19
  // Lazy-load transformers (heavy, optional module)
8
20
  let pipeline = null;
9
21
  let _cos_sim = null;
@@ -14,48 +26,57 @@ export const MODELS = {
14
26
  minilm: {
15
27
  name: 'Xenova/all-MiniLM-L6-v2',
16
28
  dim: 384,
29
+ contextWindow: 256,
17
30
  desc: 'Smallest, fastest (~23MB). General text.',
18
31
  quantized: true,
19
32
  },
20
33
  'jina-small': {
21
34
  name: 'Xenova/jina-embeddings-v2-small-en',
22
35
  dim: 512,
36
+ contextWindow: 8192,
23
37
  desc: 'Small, good quality (~33MB). General text.',
24
38
  quantized: false,
25
39
  },
26
40
  'jina-base': {
27
41
  name: 'Xenova/jina-embeddings-v2-base-en',
28
42
  dim: 768,
43
+ contextWindow: 8192,
29
44
  desc: 'Good quality (~137MB). General text, 8192 token context.',
30
45
  quantized: false,
31
46
  },
32
47
  'jina-code': {
33
48
  name: 'Xenova/jina-embeddings-v2-base-code',
34
49
  dim: 768,
50
+ contextWindow: 8192,
35
51
  desc: 'Code-aware (~137MB). Trained on code+text, best for code search.',
36
52
  quantized: false,
37
53
  },
38
54
  nomic: {
39
55
  name: 'Xenova/nomic-embed-text-v1',
40
56
  dim: 768,
57
+ contextWindow: 8192,
41
58
  desc: 'Good local quality (~137MB). 8192 context.',
42
59
  quantized: false,
43
60
  },
44
61
  'nomic-v1.5': {
45
62
  name: 'nomic-ai/nomic-embed-text-v1.5',
46
63
  dim: 768,
64
+ contextWindow: 8192,
47
65
  desc: 'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.',
48
66
  quantized: false,
49
67
  },
50
68
  'bge-large': {
51
69
  name: 'Xenova/bge-large-en-v1.5',
52
70
  dim: 1024,
71
+ contextWindow: 512,
53
72
  desc: 'Best general retrieval (~335MB). Top MTEB scores.',
54
73
  quantized: false,
55
74
  },
56
75
  };
57
76
 
58
- export const DEFAULT_MODEL = 'nomic-v1.5';
77
+ export const EMBEDDING_STRATEGIES = ['structured', 'source'];
78
+
79
+ export const DEFAULT_MODEL = 'minilm';
59
80
  const BATCH_SIZE_MAP = {
60
81
  minilm: 32,
61
82
  'jina-small': 16,
@@ -77,6 +98,108 @@ function getModelConfig(modelKey) {
77
98
  return config;
78
99
  }
79
100
 
101
/**
 * Approximate how many tokens `text` occupies using a flat heuristic of four
 * characters per token, rounded up. No tokenizer is involved, which keeps the
 * module free of an extra dependency.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count.
 */
export function estimateTokens(text) {
  const charsPerToken = 4;
  const charCount = text.length;
  return Math.ceil(charCount / charsPerToken);
}
108
+
109
/**
 * Collect the comment block that sits directly above a declaration line.
 *
 * Scans upward from `fnLineIndex - 1` across at most 15 lines. Lines that
 * start with `//`, `/*`, `*`, `*\/` or `#` are collected. Blank lines are
 * skipped until the first comment line is found; after that a blank line, or
 * any non-comment line at any point, ends the scan. Comment markers are
 * stripped and the remaining non-empty lines are joined with single spaces.
 *
 * @param {string[]} lines - File content split into lines.
 * @param {number} fnLineIndex - Zero-based index of the declaration line.
 * @returns {string|null} Cleaned comment text, or null when none is found or
 *   the index points past the end of `lines`.
 */
function extractLeadingComment(lines, fnLineIndex) {
  // A stale graph can reference a line beyond the end of a file that has since
  // shrunk. There is no declaration there, so there is no leading comment;
  // without this guard `lines[i].trim()` below would throw.
  if (!Array.isArray(lines) || !(fnLineIndex <= lines.length)) return null;

  const raw = [];
  const lowestIndex = Math.max(0, fnLineIndex - 15);
  for (let i = fnLineIndex - 1; i >= lowestIndex; i--) {
    const trimmed = lines[i].trim();
    // NOTE: the `#` alternative matches every line that begins with `#`.
    if (/^(\/\/|\/\*|\*\/|\*|#)/.test(trimmed)) {
      raw.unshift(trimmed);
    } else if (trimmed === '') {
      // A gap between declaration and comment is tolerated; a gap inside the
      // comment block ends it.
      if (raw.length > 0) break;
    } else {
      break;
    }
  }
  if (raw.length === 0) return null;

  return raw
    .map((line) =>
      line
        .replace(/^\/\*\*?\s?|\*\/$/g, '') // opening /** or /* and closing */
        .replace(/^\*\s?/, '') // middle * lines
        .replace(/^\/\/\/?\s?/, '') // // or ///
        .replace(/^#\s?/, '') // # (Python/Ruby)
        .trim(),
    )
    .filter((l) => l.length > 0)
    .join(' ');
}
138
+
139
/**
 * Compose a compact, graph-aware description of a symbol for embedding.
 *
 * The text is made of newline-separated sections, in this order:
 *   1. header: kind, name, the name split into words, and the file
 *   2. "Parameters: ..." taken from the first (...) group on the declaration line
 *   3. "Calls: ..." with up to ten callee names
 *   4. "Called by: ..." with up to ten caller names
 *   5. the leading comment, or the first four lines of code when there is none
 * Sections without content are left out.
 *
 * @param {object} node - Symbol row; reads `id`, `kind`, `name` and 1-based `line`.
 * @param {string} file - File path shown in the header.
 * @param {string[]} lines - File content split into lines.
 * @param {object} calleesStmt - Statement whose `.all(id)` yields rows with `name`.
 * @param {object} callersStmt - Statement whose `.all(id)` yields rows with `name`.
 * @returns {string} The assembled text.
 */
function buildStructuredText(node, file, lines, calleesStmt, callersStmt) {
  const words = splitIdentifier(node.name);
  const declIndex = Math.max(0, node.line - 1);
  const sections = [`${node.kind} ${node.name} (${words}) in ${file}`];

  // Best-effort parameter list: only the declaration line itself is inspected.
  const signature = lines[declIndex] || '';
  const params = signature.match(/\(([^)]*)\)/)?.[1]?.trim();
  if (params) {
    sections.push(`Parameters: ${params}`);
  }

  // Each relation is capped at ten names to keep the text short.
  const listNames = (rows) =>
    rows
      .slice(0, 10)
      .map((row) => row.name)
      .join(', ');

  const calleeRows = calleesStmt.all(node.id);
  if (calleeRows.length > 0) {
    sections.push(`Calls: ${listNames(calleeRows)}`);
  }

  const callerRows = callersStmt.all(node.id);
  if (callerRows.length > 0) {
    sections.push(`Called by: ${listNames(callerRows)}`);
  }

  // Prefer the leading comment; otherwise fall back to the first lines of code.
  const leadingComment = extractLeadingComment(lines, declIndex);
  if (leadingComment) {
    sections.push(leadingComment);
    return sections.join('\n');
  }

  const snippetEnd = Math.min(lines.length, declIndex + 4);
  const codeSnippet = lines.slice(declIndex, snippetEnd).join('\n').trim();
  if (codeSnippet) {
    sections.push(codeSnippet);
  }
  return sections.join('\n');
}
189
+
190
/**
 * Produce the raw-source embedding text for a symbol (the "source" strategy).
 *
 * The result is a header line (kind, name, the name split into words, file)
 * followed by the symbol's source lines. The range runs from the declaration
 * line to `node.end_line`, or spans 15 lines when no end line is recorded,
 * and never extends past the end of the file.
 *
 * @param {object} node - Symbol row; reads `kind`, `name`, 1-based `line`, `end_line`.
 * @param {string} file - File path shown in the header.
 * @param {string[]} lines - File content split into lines.
 * @returns {string} Header and source joined by a newline.
 */
function buildSourceText(node, file, lines) {
  const firstIndex = Math.max(0, node.line - 1);
  const requestedEnd = node.end_line ? node.end_line : firstIndex + 15;
  const lastIndex = Math.min(lines.length, requestedEnd);
  const body = lines.slice(firstIndex, lastIndex).join('\n');
  const header = `${node.kind} ${node.name} (${splitIdentifier(node.name)}) in ${file}`;
  return `${header}\n${body}`;
}
202
+
80
203
  /**
81
204
  * Lazy-load @huggingface/transformers.
82
205
  * This is an optional dependency — gives a clear error if not installed.
@@ -103,8 +226,27 @@ async function loadModel(modelKey) {
103
226
  _cos_sim = transformers.cos_sim;
104
227
 
105
228
  console.log(`Loading embedding model: ${config.name} (${config.dim}d)...`);
106
- const opts = config.quantized ? { quantized: true } : {};
107
- extractor = await pipeline('feature-extraction', config.name, opts);
229
+ const pipelineOpts = config.quantized ? { quantized: true } : {};
230
+ try {
231
+ extractor = await pipeline('feature-extraction', config.name, pipelineOpts);
232
+ } catch (err) {
233
+ const msg = err.message || String(err);
234
+ if (msg.includes('Unauthorized') || msg.includes('401') || msg.includes('gated')) {
235
+ console.error(
236
+ `\nModel "${config.name}" requires authentication.\n` +
237
+ `This model is gated on HuggingFace and needs an access token.\n\n` +
238
+ `Options:\n` +
239
+ ` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...\n` +
240
+ ` 2. Use a public model instead: codegraph embed --model minilm\n`,
241
+ );
242
+ } else {
243
+ console.error(
244
+ `\nFailed to load model "${config.name}": ${msg}\n` +
245
+ `Try a different model: codegraph embed --model minilm\n`,
246
+ );
247
+ }
248
+ process.exit(1);
249
+ }
108
250
  activeModel = config.name;
109
251
  console.log('Model loaded.');
110
252
  return { extractor, config };
@@ -172,10 +314,14 @@ function initEmbeddingsSchema(db) {
172
314
 
173
315
  /**
174
316
  * Build embeddings for all functions/methods/classes in the graph.
317
+ * @param {string} rootDir - Project root directory
318
+ * @param {string} modelKey - Model identifier from MODELS registry
319
+ * @param {string} [customDbPath] - Override path to graph.db
320
+ * @param {object} [options] - Embedding options
321
+ * @param {string} [options.strategy='structured'] - 'structured' (graph-enriched) or 'source' (raw code)
175
322
  */
176
- export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
177
- // path already imported at top
178
- // fs already imported at top
323
+ export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = {}) {
324
+ const strategy = options.strategy || 'structured';
179
325
  const dbPath = customDbPath || findDbPath(null);
180
326
 
181
327
  const db = new Database(dbPath);
@@ -190,7 +336,24 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
190
336
  )
191
337
  .all();
192
338
 
193
- console.log(`Building embeddings for ${nodes.length} symbols...`);
339
+ console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`);
340
+
341
+ // Prepare graph-context queries for structured strategy
342
+ let calleesStmt, callersStmt;
343
+ if (strategy === 'structured') {
344
+ calleesStmt = db.prepare(`
345
+ SELECT DISTINCT n.name FROM edges e
346
+ JOIN nodes n ON e.target_id = n.id
347
+ WHERE e.source_id = ? AND e.kind = 'calls'
348
+ ORDER BY n.name
349
+ `);
350
+ callersStmt = db.prepare(`
351
+ SELECT DISTINCT n.name FROM edges e
352
+ JOIN nodes n ON e.source_id = n.id
353
+ WHERE e.target_id = ? AND e.kind = 'calls'
354
+ ORDER BY n.name
355
+ `);
356
+ }
194
357
 
195
358
  const byFile = new Map();
196
359
  for (const node of nodes) {
@@ -201,6 +364,9 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
201
364
  const texts = [];
202
365
  const nodeIds = [];
203
366
  const previews = [];
367
+ const config = getModelConfig(modelKey);
368
+ const contextWindow = config.contextWindow;
369
+ let overflowCount = 0;
204
370
 
205
371
  for (const [file, fileNodes] of byFile) {
206
372
  const fullPath = path.join(rootDir, file);
@@ -213,19 +379,31 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
213
379
  }
214
380
 
215
381
  for (const node of fileNodes) {
216
- const startLine = Math.max(0, node.line - 1);
217
- const endLine = node.end_line
218
- ? Math.min(lines.length, node.end_line)
219
- : Math.min(lines.length, startLine + 15);
220
- const context = lines.slice(startLine, endLine).join('\n');
382
+ let text =
383
+ strategy === 'structured'
384
+ ? buildStructuredText(node, file, lines, calleesStmt, callersStmt)
385
+ : buildSourceText(node, file, lines);
386
+
387
+ // Detect and handle context window overflow
388
+ const tokens = estimateTokens(text);
389
+ if (tokens > contextWindow) {
390
+ overflowCount++;
391
+ const maxChars = contextWindow * 4;
392
+ text = text.slice(0, maxChars);
393
+ }
221
394
 
222
- const text = `${node.kind} ${node.name} in ${file}\n${context}`;
223
395
  texts.push(text);
224
396
  nodeIds.push(node.id);
225
397
  previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`);
226
398
  }
227
399
  }
228
400
 
401
+ if (overflowCount > 0) {
402
+ warn(
403
+ `${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`,
404
+ );
405
+ }
406
+
229
407
  console.log(`Embedding ${texts.length} symbols...`);
230
408
  const { vectors, dim } = await embed(texts, modelKey);
231
409
 
@@ -237,16 +415,19 @@ export async function buildEmbeddings(rootDir, modelKey, customDbPath) {
237
415
  for (let i = 0; i < vectors.length; i++) {
238
416
  insert.run(nodeIds[i], Buffer.from(vectors[i].buffer), previews[i]);
239
417
  }
240
- const config = getModelConfig(modelKey);
241
418
  insertMeta.run('model', config.name);
242
419
  insertMeta.run('dim', String(dim));
243
420
  insertMeta.run('count', String(vectors.length));
421
+ insertMeta.run('strategy', strategy);
244
422
  insertMeta.run('built_at', new Date().toISOString());
423
+ if (overflowCount > 0) {
424
+ insertMeta.run('truncated_count', String(overflowCount));
425
+ }
245
426
  });
246
427
  insertAll();
247
428
 
248
429
  console.log(
249
- `\nStored ${vectors.length} embeddings (${dim}d, ${getModelConfig(modelKey).name}) in graph.db`,
430
+ `\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`,
250
431
  );
251
432
  db.close();
252
433
  }