@zuvia-software-solutions/code-mapper 2.6.2 → 2.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,20 +9,28 @@
9
9
  import { pipeline } from '@huggingface/transformers';
10
10
  const MODEL_ID = 'Xenova/bge-small-en-v1.5';
11
11
  async function main() {
12
- // Load model
13
- const extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
12
+ const extractor = await pipeline('feature-extraction', MODEL_ID, { dtype: 'q8' });
14
13
  process.send({ type: 'ready' });
15
14
  // Process messages from parent
16
15
  process.on('message', async (msg) => {
17
16
  if (msg.type === 'embed') {
18
17
  const results = [];
19
- for (const item of msg.items) {
20
- try {
21
- const result = await extractor(item.text, { pooling: 'cls', normalize: true });
22
- results.push({ nodeId: item.nodeId, vec: Array.from(result.data) });
18
+ try {
19
+ const texts = msg.items.map((item) => item.text);
20
+ const batchResult = await extractor(texts, { pooling: 'cls', normalize: true });
21
+ const dims = batchResult.dims?.[1] ?? 384;
22
+ const flat = batchResult.data;
23
+ for (let i = 0; i < msg.items.length; i++) {
24
+ results.push({ nodeId: msg.items[i].nodeId, vec: Array.from(flat.subarray(i * dims, (i + 1) * dims)) });
23
25
  }
24
- catch {
25
- // Skip failed embeddings
26
+ }
27
+ catch {
28
+ for (const item of msg.items) {
29
+ try {
30
+ const result = await extractor(item.text, { pooling: 'cls', normalize: true });
31
+ results.push({ nodeId: item.nodeId, vec: Array.from(result.data) });
32
+ }
33
+ catch { /* skip */ }
26
34
  }
27
35
  }
28
36
  process.send({ type: 'results', results, batchId: msg.batchId });
@@ -30,7 +30,7 @@ interface NlDocument {
30
30
  source: string;
31
31
  text: string;
32
32
  }
33
- /** Build NL documents from a node */
33
+ /** Build NL documents from a node — keyword-dense, minimal tokens */
34
34
  export declare function extractNlTexts(node: NodeForNl): NlDocument[];
35
35
  /**
36
36
  * Build NL embeddings for all eligible nodes in the database.
@@ -21,11 +21,13 @@ export async function initNlEmbedder() {
21
21
  return loadPromise;
22
22
  loadPromise = (async () => {
23
23
  const { pipeline, env } = await import('@huggingface/transformers');
24
+ const os = await import('os');
25
+ const cpuCount = os.cpus().length;
24
26
  // Use all available CPU threads for ONNX inference
25
27
  if (env.backends?.onnx?.wasm) {
26
- env.backends.onnx.wasm.numThreads = Math.max(1, (await import('os')).cpus().length);
28
+ env.backends.onnx.wasm.numThreads = Math.max(1, cpuCount);
27
29
  }
28
- extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
30
+ extractor = await pipeline('feature-extraction', MODEL_ID, { dtype: 'q8' });
29
31
  })();
30
32
  return loadPromise;
31
33
  }
@@ -44,14 +46,15 @@ export async function nlEmbed(text) {
44
46
  export async function nlEmbedBatch(texts) {
45
47
  if (!extractor)
46
48
  await initNlEmbedder();
47
- const BATCH = 32; // sub-batch size — balances throughput vs memory
49
+ const BATCH = 64;
48
50
  const results = [];
49
51
  for (let i = 0; i < texts.length; i += BATCH) {
50
52
  const batch = texts.slice(i, i + BATCH);
51
- // Process sub-batch — transformers.js handles arrays
52
- const batchResults = await Promise.all(batch.map(text => extractor(text, { pooling: 'cls', normalize: true })));
53
- for (const result of batchResults) {
54
- results.push(Array.from(result.data));
53
+ const batchResult = await extractor(batch, { pooling: 'cls', normalize: true });
54
+ const dims = batchResult.dims?.[1] ?? 384;
55
+ const flat = batchResult.data;
56
+ for (let j = 0; j < batch.length; j++) {
57
+ results.push(Array.from(flat.subarray(j * dims, (j + 1) * dims)));
55
58
  }
56
59
  }
57
60
  return results;
@@ -144,11 +147,19 @@ function extractParamNames(content) {
144
147
  .map(p => expandIdentifier(p))
145
148
  .join(', ');
146
149
  }
147
- /** Build NL documents from a node */
150
+ /** Strip noise tokens that waste tokenizer budget without adding semantic value */
151
+ function condense(text) {
152
+ return text
153
+ .replace(/---[^-]*---/g, '') // section headers from comments
154
+ .replace(/[{}[\]()'",;:]/g, '') // punctuation
155
+ .replace(/\. /g, ' ') // sentence separators
156
+ .replace(/\s{2,}/g, ' ') // collapse whitespace
157
+ .trim();
158
+ }
159
+ /** Build NL documents from a node — keyword-dense, minimal tokens */
148
160
  export function extractNlTexts(node) {
149
161
  const docs = [];
150
- const name = node.name;
151
- const expandedName = expandIdentifier(name);
162
+ const expandedName = expandIdentifier(node.name);
152
163
  const dir = node.filePath.split('/').slice(-3, -1).join('/');
153
164
  // 1. Comment-based NL text (primary)
154
165
  const comment = extractFullComment(node.content);
@@ -156,22 +167,21 @@ export function extractNlTexts(node) {
156
167
  docs.push({
157
168
  nodeId: node.id,
158
169
  source: 'comment',
159
- text: `${expandedName}: ${comment}. File: ${dir}`,
170
+ text: condense(`${expandedName} ${comment} ${dir}`),
160
171
  });
161
172
  }
162
- // 2. Name + params + return type (always available)
173
+ // 2. Name + params (always available)
163
174
  const params = extractParamNames(node.content);
164
- const parts = [expandedName];
165
- if (params)
166
- parts.push(`Parameters: ${params}`);
167
- if (dir)
168
- parts.push(`in ${dir}`);
169
175
  if (!comment) {
170
- // Only add name-based doc if no comment (avoid duplication)
176
+ const parts = [expandedName];
177
+ if (params)
178
+ parts.push(params);
179
+ if (dir)
180
+ parts.push(dir);
171
181
  docs.push({
172
182
  nodeId: node.id,
173
183
  source: 'name',
174
- text: parts.join('. '),
184
+ text: condense(parts.join(' ')),
175
185
  });
176
186
  }
177
187
  // 3. Enum/const values
@@ -181,7 +191,7 @@ export function extractNlTexts(node) {
181
191
  docs.push({
182
192
  nodeId: node.id,
183
193
  source: 'enum',
184
- text: `${expandedName}: ${values}`,
194
+ text: condense(`${expandedName} ${values}`),
185
195
  });
186
196
  }
187
197
  }
@@ -271,8 +281,9 @@ export async function buildNlEmbeddings(db, onProgress) {
271
281
  // Find worker script path
272
282
  const thisDir = pathMod.dirname(fileURLToPath(import.meta.url));
273
283
  const workerScript = pathMod.join(thisDir, 'nl-embed-worker.js');
274
- // Split work across workers
275
- const ITEMS_PER_BATCH = 50;
284
+ // Split work across workers — larger batches reduce IPC round-trips
285
+ // and let the ONNX runtime amortize overhead across more items
286
+ const ITEMS_PER_BATCH = 256;
276
287
  let nextIdx = 0;
277
288
  let embedded = 0;
278
289
  const getNextBatch = () => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@zuvia-software-solutions/code-mapper",
3
- "version": "2.6.2",
3
+ "version": "2.6.4",
4
4
  "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
5
5
  "author": "Abhigyan Patwari",
6
6
  "license": "PolyForm-Noncommercial-1.0.0",