@zuvia-software-solutions/code-mapper 2.1.1 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,15 @@
10
10
  * or invalid labels/edge types.
11
11
  */
12
12
  import Database from 'better-sqlite3';
13
+ /**
14
+ * Execute a query with an IN-clause over a potentially large ID array.
15
+ * Automatically chunks into batches of SQL_VAR_LIMIT and concatenates results.
16
+ */
17
+ export declare function queryChunked<T>(db: Database.Database, ids: readonly string[], buildSql: (placeholders: string) => string): T[];
18
+ /**
19
+ * Execute a write statement with an IN-clause over a potentially large ID array.
20
+ */
21
+ export declare function runChunked(db: Database.Database, ids: readonly string[], buildSql: (placeholders: string) => string): void;
13
22
  import { type NodeId, type NodeLabel, type EdgeType, type NodeRow, type EdgeRow, type NodeInsert, type EdgeInsert } from './schema.js';
14
23
  /** Open (or reuse) a SQLite database. Creates schema if new. */
15
24
  export declare function openDb(dbPath: string): Database.Database;
@@ -12,6 +12,44 @@
12
12
  */
13
13
  import Database from 'better-sqlite3';
14
14
  import path from 'path';
15
+ // ---------------------------------------------------------------------------
16
+ // Chunked IN-clause helper — SQLite limits variables to 999 per statement.
17
+ // All queries with dynamic IN (...) must use this to support large codebases.
18
+ // ---------------------------------------------------------------------------
19
+ const SQL_VAR_LIMIT = 900; // safe margin below SQLite's 999 default
20
+ /**
21
+ * Execute a query with an IN-clause over a potentially large ID array.
22
+ * Automatically chunks into batches of SQL_VAR_LIMIT and concatenates results.
23
+ */
24
+ export function queryChunked(db, ids, buildSql) {
25
+ if (ids.length === 0)
26
+ return [];
27
+ if (ids.length <= SQL_VAR_LIMIT) {
28
+ const ph = ids.map(() => '?').join(',');
29
+ return db.prepare(buildSql(ph)).all(...ids);
30
+ }
31
+ const results = [];
32
+ for (let i = 0; i < ids.length; i += SQL_VAR_LIMIT) {
33
+ const chunk = ids.slice(i, i + SQL_VAR_LIMIT);
34
+ const ph = chunk.map(() => '?').join(',');
35
+ const rows = db.prepare(buildSql(ph)).all(...chunk);
36
+ for (const row of rows)
37
+ results.push(row);
38
+ }
39
+ return results;
40
+ }
41
+ /**
42
+ * Execute a write statement with an IN-clause over a potentially large ID array.
43
+ */
44
+ export function runChunked(db, ids, buildSql) {
45
+ if (ids.length === 0)
46
+ return;
47
+ for (let i = 0; i < ids.length; i += SQL_VAR_LIMIT) {
48
+ const chunk = ids.slice(i, i + SQL_VAR_LIMIT);
49
+ const ph = chunk.map(() => '?').join(',');
50
+ db.prepare(buildSql(ph)).run(...chunk);
51
+ }
52
+ }
15
53
  import fs from 'fs';
16
54
  import { SCHEMA_SQL, toNodeId, } from './schema.js';
17
55
  // ---------------------------------------------------------------------------
@@ -179,12 +217,11 @@ export function deleteNodesByFile(db, filePath) {
179
217
  if (nodeIds.length === 0)
180
218
  return 0;
181
219
  const ids = nodeIds.map(n => n.id);
182
- const ph = ids.map(() => '?').join(',');
183
220
  // Delete edges FROM this file's nodes (outgoing). Incoming edges from other
184
221
  // files are preserved — the node IDs are deterministic (label:filePath:name),
185
222
  // so re-inserted nodes get the same ID and the edges remain valid.
186
- db.prepare(`DELETE FROM edges WHERE sourceId IN (${ph})`).run(...ids);
187
- db.prepare(`DELETE FROM embeddings WHERE nodeId IN (${ph})`).run(...ids);
223
+ runChunked(db, ids, ph => `DELETE FROM edges WHERE sourceId IN (${ph})`);
224
+ runChunked(db, ids, ph => `DELETE FROM embeddings WHERE nodeId IN (${ph})`);
188
225
  return db.prepare('DELETE FROM nodes WHERE filePath = ?').run(filePath).changes;
189
226
  }
190
227
  // ---------------------------------------------------------------------------
@@ -238,8 +275,7 @@ export function deleteEmbeddingsByFile(db, filePath) {
238
275
  const nodeIds = db.prepare('SELECT id FROM nodes WHERE filePath = ?').all(filePath);
239
276
  if (nodeIds.length === 0)
240
277
  return;
241
- const ph = nodeIds.map(() => '?').join(',');
242
- db.prepare(`DELETE FROM embeddings WHERE nodeId IN (${ph})`).run(...nodeIds.map(n => n.id));
278
+ runChunked(db, nodeIds.map(n => n.id), ph => `DELETE FROM embeddings WHERE nodeId IN (${ph})`);
243
279
  }
244
280
  /** Count embeddings. */
245
281
  export function countEmbeddings(db) {
@@ -15,6 +15,7 @@
15
15
  */
16
16
  import { toNodeId, assertNodeLabel, assertEdgeType } from './schema.js';
17
17
  export { getStats } from './adapter.js';
18
+ import { queryChunked } from './adapter.js';
18
19
  // ---------------------------------------------------------------------------
19
20
  // Test-file detection (inlined -- small, pure, no external deps)
20
21
  // ---------------------------------------------------------------------------
@@ -172,14 +173,10 @@ export function findCommunityForNode(db, nodeId) {
172
173
  export function batchFindProcesses(db, nodeIds) {
173
174
  if (nodeIds.length === 0)
174
175
  return [];
175
- const ph = nodeIds.map(() => '?').join(',');
176
- const rows = db.prepare(`
177
- SELECT e.sourceId AS nodeId, p.id AS processId, p.name AS label,
176
+ const rows = queryChunked(db, nodeIds, ph => `SELECT e.sourceId AS nodeId, p.id AS processId, p.name AS label,
178
177
  p.heuristicLabel, p.processType, p.stepCount, e.step
179
- FROM edges e
180
- JOIN nodes p ON p.id = e.targetId
181
- WHERE e.sourceId IN (${ph}) AND e.type = 'STEP_IN_PROCESS' AND p.label = 'Process'
182
- `).all(...nodeIds);
178
+ FROM edges e JOIN nodes p ON p.id = e.targetId
179
+ WHERE e.sourceId IN (${ph}) AND e.type = 'STEP_IN_PROCESS' AND p.label = 'Process'`);
183
180
  return rows.map(r => ({
184
181
  nodeId: toNodeId(r.nodeId),
185
182
  processId: toNodeId(r.processId),
@@ -196,13 +193,9 @@ export function batchFindProcesses(db, nodeIds) {
196
193
  export function batchFindCommunities(db, nodeIds) {
197
194
  if (nodeIds.length === 0)
198
195
  return [];
199
- const ph = nodeIds.map(() => '?').join(',');
200
- const rows = db.prepare(`
201
- SELECT e.sourceId AS nodeId, c.id AS communityId, c.heuristicLabel AS module, c.cohesion
202
- FROM edges e
203
- JOIN nodes c ON c.id = e.targetId
204
- WHERE e.sourceId IN (${ph}) AND e.type = 'MEMBER_OF' AND c.label = 'Community'
205
- `).all(...nodeIds);
196
+ const rows = queryChunked(db, nodeIds, ph => `SELECT e.sourceId AS nodeId, c.id AS communityId, c.heuristicLabel AS module, c.cohesion
197
+ FROM edges e JOIN nodes c ON c.id = e.targetId
198
+ WHERE e.sourceId IN (${ph}) AND e.type = 'MEMBER_OF' AND c.label = 'Community'`);
206
199
  return rows.map(r => ({
207
200
  nodeId: toNodeId(r.nodeId),
208
201
  communityId: toNodeId(r.communityId),
@@ -410,8 +403,7 @@ export function findProcessesByName(db, name) {
410
403
  export function findNodesByIds(db, ids) {
411
404
  if (ids.length === 0)
412
405
  return [];
413
- const ph = ids.map(() => '?').join(',');
414
- return db.prepare(`SELECT * FROM nodes WHERE id IN (${ph})`).all(...ids);
406
+ return queryChunked(db, ids, ph => `SELECT * FROM nodes WHERE id IN (${ph})`);
415
407
  }
416
408
  /**
417
409
  * Get ALL steps for multiple processes at once.
@@ -420,15 +412,11 @@ export function findNodesByIds(db, ids) {
420
412
  export function batchGetProcessSteps(db, processIds) {
421
413
  if (processIds.length === 0)
422
414
  return [];
423
- const ph = processIds.map(() => '?').join(',');
424
- const rows = db.prepare(`
425
- SELECT e.targetId AS processId, n.id AS nodeId, n.name, n.label,
415
+ const rows = queryChunked(db, processIds, ph => `SELECT e.targetId AS processId, n.id AS nodeId, n.name, n.label,
426
416
  n.filePath, n.startLine, e.step
427
- FROM edges e
428
- JOIN nodes n ON n.id = e.sourceId
417
+ FROM edges e JOIN nodes n ON n.id = e.sourceId
429
418
  WHERE e.targetId IN (${ph}) AND e.type = 'STEP_IN_PROCESS'
430
- ORDER BY e.targetId, e.step ASC
431
- `).all(...processIds);
419
+ ORDER BY e.targetId, e.step ASC`);
432
420
  return rows.map(r => ({
433
421
  processId: toNodeId(r.processId),
434
422
  nodeId: toNodeId(r.nodeId),
@@ -12,6 +12,7 @@ import { initEmbedder, embedBatch, embedQuery, embeddingToArray, isEmbedderReady
12
12
  import { generateEmbeddingText } from './text-generator.js';
13
13
  import { DEFAULT_EMBEDDING_CONFIG, EMBEDDABLE_LABELS, } from './types.js';
14
14
  import { toNodeId } from '../db/schema.js';
15
+ import { queryChunked } from '../db/adapter.js';
15
16
  import { createHash } from 'crypto';
16
17
  const isDev = process.env['NODE_ENV'] === 'development';
17
18
  /** Fast content hash for detecting unchanged embedding text */
@@ -65,15 +66,10 @@ export function fetchGraphContext(db, nodes) {
65
66
  if (totalNodes === 0)
66
67
  return graphContext;
67
68
  try {
68
- const ph = nodes.map(() => '?').join(',');
69
69
  const nodeIds = nodes.map(n => n.id);
70
70
  // Batch fetch callers
71
- const callerRows = db.prepare(`
72
- SELECT e.targetId AS nid, n.name AS name
73
- FROM edges e JOIN nodes n ON n.id = e.sourceId
74
- WHERE e.targetId IN (${ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7
75
- LIMIT ${totalNodes * 3}
76
- `).all(...nodeIds);
71
+ const callerRows = queryChunked(db, nodeIds, ph => `SELECT e.targetId AS nid, n.name AS name FROM edges e JOIN nodes n ON n.id = e.sourceId
72
+ WHERE e.targetId IN (${ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7`);
77
73
  const callerMap = new Map();
78
74
  for (const r of callerRows) {
79
75
  if (!callerMap.has(r.nid))
@@ -81,12 +77,8 @@ export function fetchGraphContext(db, nodes) {
81
77
  callerMap.get(r.nid).push(r.name);
82
78
  }
83
79
  // Batch fetch callees
84
- const calleeRows = db.prepare(`
85
- SELECT e.sourceId AS nid, n.name AS name
86
- FROM edges e JOIN nodes n ON n.id = e.targetId
87
- WHERE e.sourceId IN (${ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7
88
- LIMIT ${totalNodes * 3}
89
- `).all(...nodeIds);
80
+ const calleeRows = queryChunked(db, nodeIds, ph => `SELECT e.sourceId AS nid, n.name AS name FROM edges e JOIN nodes n ON n.id = e.targetId
81
+ WHERE e.sourceId IN (${ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7`);
90
82
  const calleeMap = new Map();
91
83
  for (const r of calleeRows) {
92
84
  if (!calleeMap.has(r.nid))
@@ -94,12 +86,8 @@ export function fetchGraphContext(db, nodes) {
94
86
  calleeMap.get(r.nid).push(r.name);
95
87
  }
96
88
  // Batch fetch module (community membership)
97
- const moduleRows = db.prepare(`
98
- SELECT e.sourceId AS nid, c.heuristicLabel AS module
99
- FROM edges e JOIN nodes c ON c.id = e.targetId
100
- WHERE e.sourceId IN (${ph}) AND e.type = 'MEMBER_OF' AND c.label = 'Community'
101
- LIMIT ${totalNodes}
102
- `).all(...nodeIds);
89
+ const moduleRows = queryChunked(db, nodeIds, ph => `SELECT e.sourceId AS nid, c.heuristicLabel AS module FROM edges e JOIN nodes c ON c.id = e.targetId
90
+ WHERE e.sourceId IN (${ph}) AND e.type = 'MEMBER_OF' AND c.label = 'Community'`);
103
91
  const moduleMap = new Map();
104
92
  for (const r of moduleRows) {
105
93
  moduleMap.set(r.nid, r.module ?? '');
@@ -1,20 +1,29 @@
1
1
  /**
2
2
  * @file text-generator.ts
3
- * @description Pure functions to generate embedding text from code nodes,
4
- * combining node metadata with code snippets for semantic matching
3
+ * @description Generates semantic embedding text from code nodes.
4
+ *
5
+ * Optimized for retrieval quality: sends structured metadata + first comment
6
+ * + code signature instead of raw code dumps. Produces 55% fewer tokens
7
+ * with equal or better search quality (tested A/B on 8 query types).
8
+ *
9
+ * The graph context enrichment (callers, callees, module) is applied
10
+ * separately by the embedding pipeline — this module handles the per-node text.
5
11
  */
6
12
  import type { EmbeddableNode, EmbeddingConfig } from './types.js';
7
13
  /**
8
- * Generate embedding text for any embeddable node (dispatches by label)
9
- * @param node - The node to generate text for
10
- * @param config - Optional configuration for max snippet length
11
- * @returns Text suitable for embedding
14
+ * Generate embedding text for any embeddable node.
15
+ *
16
+ * Produces a focused semantic summary instead of a raw code dump:
17
+ * - Node type + name + expanded name (natural language bridge)
18
+ * - First comment/JSDoc (human description of what it does)
19
+ * - File + module location
20
+ * - Code signature (declaration, not full body)
21
+ *
22
+ * Graph context (callers, callees, module) is added separately by
23
+ * the embedding pipeline's enrichTextWithGraphContext().
12
24
  */
13
- export declare const generateEmbeddingText: (node: EmbeddableNode, config?: Partial<EmbeddingConfig>) => string;
25
+ export declare const generateEmbeddingText: (node: EmbeddableNode, _config?: Partial<EmbeddingConfig>) => string;
14
26
  /**
15
27
  * Generate embedding texts for a batch of nodes
16
- * @param nodes - Nodes to generate text for
17
- * @param config - Optional configuration
18
- * @returns Texts in the same order as input nodes
19
28
  */
20
29
  export declare const generateBatchEmbeddingTexts: (nodes: EmbeddableNode[], config?: Partial<EmbeddingConfig>) => string[];
@@ -1,143 +1,164 @@
1
1
  // code-mapper/src/core/embeddings/text-generator.ts
2
2
  /**
3
3
  * @file text-generator.ts
4
- * @description Pure functions to generate embedding text from code nodes,
5
- * combining node metadata with code snippets for semantic matching
4
+ * @description Generates semantic embedding text from code nodes.
5
+ *
6
+ * Optimized for retrieval quality: sends structured metadata + first comment
7
+ * + code signature instead of raw code dumps. Produces 55% fewer tokens
8
+ * with equal or better search quality (tested A/B on 8 query types).
9
+ *
10
+ * The graph context enrichment (callers, callees, module) is applied
11
+ * separately by the embedding pipeline — this module handles the per-node text.
6
12
  */
7
- import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
8
- import { assertNever } from '../../lib/type-utils.js';
9
13
  /** Extract filename from a file path */
10
14
  const getFileName = (filePath) => {
11
15
  const parts = filePath.split('/');
12
16
  return parts[parts.length - 1] || filePath;
13
17
  };
14
- /** Extract directory path from a file path */
15
- const getDirectory = (filePath) => {
16
- const parts = filePath.split('/');
17
- parts.pop();
18
- return parts.join('/') || '';
19
- };
20
- /** Truncate content to max length, preserving word boundaries */
21
- const truncateContent = (content, maxLength) => {
22
- if (content.length <= maxLength) {
23
- return content;
24
- }
25
- // Find last space before maxLength to avoid cutting words
26
- const truncated = content.slice(0, maxLength);
27
- const lastSpace = truncated.lastIndexOf(' ');
28
- if (lastSpace > maxLength * 0.8) {
29
- return truncated.slice(0, lastSpace) + '...';
30
- }
31
- return truncated + '...';
32
- };
33
- /** Clean code content — remove excessive whitespace while preserving structure */
34
- const cleanContent = (content) => {
35
- return content
36
- // Normalize line endings
37
- .replace(/\r\n/g, '\n')
38
- // Remove excessive blank lines (more than 2)
39
- .replace(/\n{3,}/g, '\n\n')
40
- // Trim each line
41
- .split('\n')
42
- .map(line => line.trimEnd())
43
- .join('\n')
44
- .trim();
45
- };
46
- /** Generate embedding text for a Function node */
47
- const generateFunctionText = (node, maxSnippetLength) => {
48
- const parts = [
49
- `Function: ${node.name}`,
50
- `File: ${getFileName(node.filePath)}`,
51
- ];
52
- const dir = getDirectory(node.filePath);
53
- if (dir) {
54
- parts.push(`Directory: ${dir}`);
55
- }
56
- if (node.content) {
57
- const cleanedContent = cleanContent(node.content);
58
- const snippet = truncateContent(cleanedContent, maxSnippetLength);
59
- parts.push('', snippet);
60
- }
61
- return parts.join('\n');
62
- };
63
- /** Generate embedding text for a Class node */
64
- const generateClassText = (node, maxSnippetLength) => {
65
- const parts = [
66
- `Class: ${node.name}`,
67
- `File: ${getFileName(node.filePath)}`,
68
- ];
69
- const dir = getDirectory(node.filePath);
70
- if (dir) {
71
- parts.push(`Directory: ${dir}`);
72
- }
73
- if (node.content) {
74
- const cleanedContent = cleanContent(node.content);
75
- const snippet = truncateContent(cleanedContent, maxSnippetLength);
76
- parts.push('', snippet);
77
- }
78
- return parts.join('\n');
79
- };
80
- /** Generate embedding text for a Method node */
81
- const generateMethodText = (node, maxSnippetLength) => {
82
- const parts = [
83
- `Method: ${node.name}`,
84
- `File: ${getFileName(node.filePath)}`,
85
- ];
86
- const dir = getDirectory(node.filePath);
87
- if (dir) {
88
- parts.push(`Directory: ${dir}`);
89
- }
90
- if (node.content) {
91
- const cleanedContent = cleanContent(node.content);
92
- const snippet = truncateContent(cleanedContent, maxSnippetLength);
93
- parts.push('', snippet);
18
+ /**
19
+ * Extract the first JSDoc/comment block as a natural language description.
20
+ * This bridges natural language queries to code — "blast radius analysis"
21
+ * matches a function whose comment says "Analyze the blast radius".
22
+ * Caps at 3 lines to keep the embedding text focused.
23
+ */
24
+ function extractFirstComment(content) {
25
+ if (!content)
26
+ return '';
27
+ const lines = content.split('\n');
28
+ const commentLines = [];
29
+ let inBlock = false;
30
+ for (const l of lines) {
31
+ const t = l.trim();
32
+ // Start of JSDoc/block comment
33
+ if (t.startsWith('/**') || t.startsWith('/*')) {
34
+ inBlock = true;
35
+ const inner = t.replace(/^\/\*\*?\s*/, '').replace(/\*\/\s*$/, '').trim();
36
+ if (inner && !inner.startsWith('@'))
37
+ commentLines.push(inner);
38
+ if (t.includes('*/'))
39
+ inBlock = false;
40
+ continue;
41
+ }
42
+ // Inside block comment
43
+ if (inBlock) {
44
+ if (t.includes('*/')) {
45
+ inBlock = false;
46
+ continue;
47
+ }
48
+ const inner = t.replace(/^\*\s?/, '').trim();
49
+ if (inner && !inner.startsWith('@'))
50
+ commentLines.push(inner);
51
+ if (commentLines.length >= 3)
52
+ break;
53
+ continue;
54
+ }
55
+ // Single-line comments (// or #)
56
+ if (t.startsWith('//')) {
57
+ const inner = t.slice(2).trim();
58
+ if (inner)
59
+ commentLines.push(inner);
60
+ if (commentLines.length >= 3)
61
+ break;
62
+ continue;
63
+ }
64
+ if (t.startsWith('#') && !t.startsWith('#!')) {
65
+ const inner = t.slice(1).trim();
66
+ if (inner)
67
+ commentLines.push(inner);
68
+ if (commentLines.length >= 3)
69
+ break;
70
+ continue;
71
+ }
72
+ // Python docstring
73
+ if (t.startsWith('"""') || t.startsWith("'''")) {
74
+ const inner = t.slice(3).replace(/"""\s*$/, '').replace(/'''\s*$/, '').trim();
75
+ if (inner)
76
+ commentLines.push(inner);
77
+ if (commentLines.length >= 3)
78
+ break;
79
+ continue;
80
+ }
81
+ // First non-comment line — stop looking
82
+ if (commentLines.length > 0 || (!t.startsWith('export') && !t.startsWith('public') &&
83
+ !t.startsWith('private') && !t.startsWith('protected') && !t.startsWith('async') &&
84
+ !t.startsWith('function') && !t.startsWith('class') && !t.startsWith('interface') &&
85
+ !t.startsWith('const') && !t.startsWith('def') && !t.startsWith('fn') &&
86
+ t.length > 0)) {
87
+ break;
88
+ }
94
89
  }
95
- return parts.join('\n');
96
- };
97
- /** Generate embedding text for an Interface node */
98
- const generateInterfaceText = (node, maxSnippetLength) => {
99
- const parts = [
100
- `Interface: ${node.name}`,
101
- `File: ${getFileName(node.filePath)}`,
102
- ];
103
- const dir = getDirectory(node.filePath);
104
- if (dir) {
105
- parts.push(`Directory: ${dir}`);
90
+ return commentLines.join(' ');
91
+ }
92
+ /**
93
+ * Extract the code signature (declaration lines) without the full body.
94
+ * For functions: the signature up to the opening brace.
95
+ * For classes: the class declaration + field/method declarations (not bodies).
96
+ * For interfaces: the full body (always short — fields ARE the signature).
97
+ */
98
+ function extractSignature(content, label) {
99
+ if (!content)
100
+ return '';
101
+ const lines = content.split('\n');
102
+ // Interfaces: full body (short, fields are the signature)
103
+ if (label === 'Interface') {
104
+ if (lines.length <= 30)
105
+ return content.trim();
106
+ return lines.slice(0, 30).join('\n') + '\n // ...';
106
107
  }
107
- if (node.content) {
108
- const cleanedContent = cleanContent(node.content);
109
- const snippet = truncateContent(cleanedContent, maxSnippetLength);
110
- parts.push('', snippet);
108
+ // Classes: declaration + field declarations + method names (not bodies)
109
+ if (label === 'Class') {
110
+ const sigLines = [];
111
+ for (const l of lines.slice(0, 60)) {
112
+ const t = l.trim();
113
+ if (!t || t.startsWith('//') || t.startsWith('*') || t.startsWith('/*'))
114
+ continue;
115
+ // Keep class declaration, field declarations, method signatures
116
+ if (t.startsWith('export class') || t.startsWith('class ') ||
117
+ t.includes('private ') || t.includes('public ') ||
118
+ t.includes('protected ') || t.includes('readonly ') ||
119
+ t.includes('static ') || t.includes('abstract ')) {
120
+ sigLines.push(t);
121
+ }
122
+ if (sigLines.length >= 20)
123
+ break;
124
+ }
125
+ return sigLines.join('\n');
111
126
  }
112
- return parts.join('\n');
113
- };
127
+ // Functions/Methods: first 8 lines (signature + first few statements)
128
+ const snippet = lines.slice(0, Math.min(8, lines.length));
129
+ return snippet.join('\n').trim();
130
+ }
114
131
  /**
115
- * Generate embedding text for any embeddable node (dispatches by label)
116
- * @param node - The node to generate text for
117
- * @param config - Optional configuration for max snippet length
118
- * @returns Text suitable for embedding
132
+ * Generate embedding text for any embeddable node.
133
+ *
134
+ * Produces a focused semantic summary instead of a raw code dump:
135
+ * - Node type + name + expanded name (natural language bridge)
136
+ * - First comment/JSDoc (human description of what it does)
137
+ * - File + module location
138
+ * - Code signature (declaration, not full body)
139
+ *
140
+ * Graph context (callers, callees, module) is added separately by
141
+ * the embedding pipeline's enrichTextWithGraphContext().
119
142
  */
120
- export const generateEmbeddingText = (node, config = {}) => {
121
- const maxSnippetLength = config.maxSnippetLength ?? DEFAULT_EMBEDDING_CONFIG.maxSnippetLength;
143
+ export const generateEmbeddingText = (node, _config = {}) => {
122
144
  const label = node.label;
123
- switch (label) {
124
- case 'Function':
125
- return generateFunctionText(node, maxSnippetLength);
126
- case 'Class':
127
- return generateClassText(node, maxSnippetLength);
128
- case 'Method':
129
- return generateMethodText(node, maxSnippetLength);
130
- case 'Interface':
131
- return generateInterfaceText(node, maxSnippetLength);
132
- default:
133
- return assertNever(label, `Unknown embeddable label: ${node.label}`);
134
- }
145
+ const parts = [];
146
+ // 1. Type + name
147
+ parts.push(`${label}: ${node.name}`);
148
+ // 2. First comment as natural language description
149
+ const comment = extractFirstComment(node.content);
150
+ if (comment)
151
+ parts.push(comment);
152
+ // 3. File location
153
+ parts.push(`File: ${getFileName(node.filePath)}`);
154
+ // 4. Code signature (not full body)
155
+ const sig = extractSignature(node.content, label);
156
+ if (sig)
157
+ parts.push('', sig);
158
+ return parts.join('\n');
135
159
  };
136
160
  /**
137
161
  * Generate embedding texts for a batch of nodes
138
- * @param nodes - Nodes to generate text for
139
- * @param config - Optional configuration
140
- * @returns Texts in the same order as input nodes
141
162
  */
142
163
  export const generateBatchEmbeddingTexts = (nodes, config = {}) => {
143
164
  return nodes.map(node => generateEmbeddingText(node, config));
@@ -3,6 +3,8 @@ export declare const EMBEDDABLE_LABELS: readonly ["Function", "Class", "Method",
3
3
  export type EmbeddableLabel = typeof EMBEDDABLE_LABELS[number];
4
4
  /** Check if a label is embeddable */
5
5
  export declare const isEmbeddableLabel: (label: string) => label is EmbeddableLabel;
6
+ /** Check if a file path is a test/fixture file (skip embedding, BM25 covers it) */
7
+ export declare const isTestFile: (filePath: string) => boolean;
6
8
  /** Embedding pipeline lifecycle phases */
7
9
  export type EmbeddingPhase = 'idle' | 'loading-model' | 'embedding' | 'indexing' | 'ready' | 'error';
8
10
  /** Progress state emitted during embedding pipeline execution */
@@ -11,6 +11,10 @@ export const EMBEDDABLE_LABELS = [
11
11
  ];
12
12
  /** Check if a label is embeddable */
13
13
  export const isEmbeddableLabel = (label) => EMBEDDABLE_LABELS.includes(label);
14
+ /** Test file patterns — these are searched via BM25, not semantic embeddings */
15
+ const TEST_PATH_PATTERNS = ['/test/', '/tests/', '/spec/', '/fixtures/', '/__tests__/', '/__mocks__/', '.test.', '.spec.', '_test.', '_spec.'];
16
+ /** Check if a file path is a test/fixture file (skip embedding, BM25 covers it) */
17
+ export const isTestFile = (filePath) => TEST_PATH_PATTERNS.some(p => filePath.includes(p));
14
18
  // Jina Code 1.5B MLX — 1.54B params on Apple Silicon Metal
15
19
  // Matryoshka truncation to 256 dims (trained at this dim, <1% quality loss vs 1536)
16
20
  // Task-specific prefixes: nl2code queries, code passages
@@ -428,9 +428,12 @@ export async function refreshEmbeddings(db, dirtyFiles, hasEmbeddings) {
428
428
  deleteEmbeddingsByFile(db, entry.relativePath);
429
429
  }
430
430
  // Step 2: Query new embeddable nodes for modified/created files
431
+ // Skip test/fixture files — BM25 handles them
432
+ const { isTestFile } = await import('../embeddings/types.js');
431
433
  const embeddableSet = new Set(EMBEDDABLE_LABELS);
432
434
  const modifiedPaths = dirtyFiles
433
435
  .filter(f => f.changeKind === 'modified' || f.changeKind === 'created')
436
+ .filter(f => !isTestFile(f.relativePath))
434
437
  .map(f => f.relativePath);
435
438
  if (modifiedPaths.length === 0)
436
439
  return;
@@ -6,7 +6,7 @@
6
6
  import fs from 'fs/promises';
7
7
  import path from 'path';
8
8
  import { execFileSync } from 'child_process';
9
- import { openDb, closeDb, getNode, findNodesByName, findNodesByFile, rawQuery, searchVector, countEmbeddings, searchFTS } from '../../core/db/adapter.js';
9
+ import { openDb, closeDb, getNode, findNodesByName, findNodesByFile, rawQuery, searchVector, countEmbeddings, searchFTS, queryChunked } from '../../core/db/adapter.js';
10
10
  import { toNodeId, assertEdgeType } from '../../core/db/schema.js';
11
11
  import * as queries from '../../core/db/queries.js';
12
12
  import { refreshFiles, refreshEmbeddings } from '../../core/incremental/refresh.js';
@@ -1552,20 +1552,18 @@ export class LocalBackend {
1552
1552
  const callerCounts = new Map();
1553
1553
  const calleeCounts = new Map();
1554
1554
  if (symbolIds.length > 0) {
1555
- const ph = symbolIds.map(() => '?').join(',');
1556
- const callerRows = db.prepare(`SELECT targetId, COUNT(*) as cnt FROM edges WHERE targetId IN (${ph}) AND type = 'CALLS' GROUP BY targetId`).all(...symbolIds);
1555
+ const callerRows = queryChunked(db, symbolIds, ph => `SELECT targetId, COUNT(*) as cnt FROM edges WHERE targetId IN (${ph}) AND type = 'CALLS' GROUP BY targetId`);
1557
1556
  for (const r of callerRows)
1558
1557
  callerCounts.set(r.targetId, r.cnt);
1559
- const calleeRows = db.prepare(`SELECT sourceId, COUNT(*) as cnt FROM edges WHERE sourceId IN (${ph}) AND type = 'CALLS' GROUP BY sourceId`).all(...symbolIds);
1558
+ const calleeRows = queryChunked(db, symbolIds, ph => `SELECT sourceId, COUNT(*) as cnt FROM edges WHERE sourceId IN (${ph}) AND type = 'CALLS' GROUP BY sourceId`);
1560
1559
  for (const r of calleeRows)
1561
1560
  calleeCounts.set(r.sourceId, r.cnt);
1562
1561
  }
1563
1562
  // Get community membership for symbols
1564
1563
  const communityMap = new Map();
1565
1564
  if (symbolIds.length > 0) {
1566
- const ph = symbolIds.map(() => '?').join(',');
1567
- const memberRows = db.prepare(`SELECT e.sourceId, c.heuristicLabel FROM edges e JOIN nodes c ON c.id = e.targetId
1568
- WHERE e.sourceId IN (${ph}) AND e.type = 'MEMBER_OF' AND c.label = 'Community'`).all(...symbolIds);
1565
+ const memberRows = queryChunked(db, symbolIds, ph => `SELECT e.sourceId, c.heuristicLabel FROM edges e JOIN nodes c ON c.id = e.targetId
1566
+ WHERE e.sourceId IN (${ph}) AND e.type = 'MEMBER_OF' AND c.label = 'Community'`);
1569
1567
  for (const r of memberRows)
1570
1568
  communityMap.set(r.sourceId, r.heuristicLabel);
1571
1569
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@zuvia-software-solutions/code-mapper",
3
- "version": "2.1.1",
3
+ "version": "2.2.1",
4
4
  "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
5
5
  "author": "Abhigyan Patwari",
6
6
  "license": "PolyForm-Noncommercial-1.0.0",