@zuvia-software-solutions/code-mapper 2.6.2 → 2.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,20 +9,28 @@
9
9
  import { pipeline } from '@huggingface/transformers';
10
10
  const MODEL_ID = 'Xenova/bge-small-en-v1.5';
11
11
  async function main() {
12
- // Load model
13
- const extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
12
+ const extractor = await pipeline('feature-extraction', MODEL_ID, { dtype: 'q8' });
14
13
  process.send({ type: 'ready' });
15
14
  // Process messages from parent
16
15
  process.on('message', async (msg) => {
17
16
  if (msg.type === 'embed') {
18
17
  const results = [];
19
- for (const item of msg.items) {
20
- try {
21
- const result = await extractor(item.text, { pooling: 'cls', normalize: true });
22
- results.push({ nodeId: item.nodeId, vec: Array.from(result.data) });
18
+ try {
19
+ const texts = msg.items.map((item) => item.text);
20
+ const batchResult = await extractor(texts, { pooling: 'cls', normalize: true });
21
+ const dims = batchResult.dims?.[1] ?? 384;
22
+ const flat = batchResult.data;
23
+ for (let i = 0; i < msg.items.length; i++) {
24
+ results.push({ nodeId: msg.items[i].nodeId, vec: Array.from(flat.subarray(i * dims, (i + 1) * dims)) });
23
25
  }
24
- catch {
25
- // Skip failed embeddings
26
+ }
27
+ catch {
28
+ for (const item of msg.items) {
29
+ try {
30
+ const result = await extractor(item.text, { pooling: 'cls', normalize: true });
31
+ results.push({ nodeId: item.nodeId, vec: Array.from(result.data) });
32
+ }
33
+ catch { /* skip */ }
26
34
  }
27
35
  }
28
36
  process.send({ type: 'results', results, batchId: msg.batchId });
@@ -30,7 +30,7 @@ interface NlDocument {
30
30
  source: string;
31
31
  text: string;
32
32
  }
33
- /** Build NL documents from a node */
33
+ /** Build NL documents from a node — keyword-dense, minimal tokens */
34
34
  export declare function extractNlTexts(node: NodeForNl): NlDocument[];
35
35
  /**
36
36
  * Build NL embeddings for all eligible nodes in the database.
@@ -21,11 +21,13 @@ export async function initNlEmbedder() {
21
21
  return loadPromise;
22
22
  loadPromise = (async () => {
23
23
  const { pipeline, env } = await import('@huggingface/transformers');
24
+ const os = await import('os');
25
+ const cpuCount = os.cpus().length;
24
26
  // Use all available CPU threads for ONNX inference
25
27
  if (env.backends?.onnx?.wasm) {
26
- env.backends.onnx.wasm.numThreads = Math.max(1, (await import('os')).cpus().length);
28
+ env.backends.onnx.wasm.numThreads = Math.max(1, cpuCount);
27
29
  }
28
- extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
30
+ extractor = await pipeline('feature-extraction', MODEL_ID, { dtype: 'q8' });
29
31
  })();
30
32
  return loadPromise;
31
33
  }
@@ -44,14 +46,15 @@ export async function nlEmbed(text) {
44
46
  export async function nlEmbedBatch(texts) {
45
47
  if (!extractor)
46
48
  await initNlEmbedder();
47
- const BATCH = 32; // sub-batch size — balances throughput vs memory
49
+ const BATCH = 64;
48
50
  const results = [];
49
51
  for (let i = 0; i < texts.length; i += BATCH) {
50
52
  const batch = texts.slice(i, i + BATCH);
51
- // Process sub-batch — transformers.js handles arrays
52
- const batchResults = await Promise.all(batch.map(text => extractor(text, { pooling: 'cls', normalize: true })));
53
- for (const result of batchResults) {
54
- results.push(Array.from(result.data));
53
+ const batchResult = await extractor(batch, { pooling: 'cls', normalize: true });
54
+ const dims = batchResult.dims?.[1] ?? 384;
55
+ const flat = batchResult.data;
56
+ for (let j = 0; j < batch.length; j++) {
57
+ results.push(Array.from(flat.subarray(j * dims, (j + 1) * dims)));
55
58
  }
56
59
  }
57
60
  return results;
@@ -144,11 +147,19 @@ function extractParamNames(content) {
144
147
  .map(p => expandIdentifier(p))
145
148
  .join(', ');
146
149
  }
147
- /** Build NL documents from a node */
150
+ /** Strip noise tokens that waste tokenizer budget without adding semantic value */
151
+ function condense(text) {
152
+ return text
153
+ .replace(/---[^-]*---/g, '') // section headers from comments
154
+ .replace(/[{}[\]()'",;:]/g, '') // punctuation
155
+ .replace(/\. /g, ' ') // sentence separators
156
+ .replace(/\s{2,}/g, ' ') // collapse whitespace
157
+ .trim();
158
+ }
159
+ /** Build NL documents from a node — keyword-dense, minimal tokens */
148
160
  export function extractNlTexts(node) {
149
161
  const docs = [];
150
- const name = node.name;
151
- const expandedName = expandIdentifier(name);
162
+ const expandedName = expandIdentifier(node.name);
152
163
  const dir = node.filePath.split('/').slice(-3, -1).join('/');
153
164
  // 1. Comment-based NL text (primary)
154
165
  const comment = extractFullComment(node.content);
@@ -156,22 +167,21 @@ export function extractNlTexts(node) {
156
167
  docs.push({
157
168
  nodeId: node.id,
158
169
  source: 'comment',
159
- text: `${expandedName}: ${comment}. File: ${dir}`,
170
+ text: condense(`${expandedName} ${comment} ${dir}`),
160
171
  });
161
172
  }
162
- // 2. Name + params + return type (always available)
173
+ // 2. Name + params (always available)
163
174
  const params = extractParamNames(node.content);
164
- const parts = [expandedName];
165
- if (params)
166
- parts.push(`Parameters: ${params}`);
167
- if (dir)
168
- parts.push(`in ${dir}`);
169
175
  if (!comment) {
170
- // Only add name-based doc if no comment (avoid duplication)
176
+ const parts = [expandedName];
177
+ if (params)
178
+ parts.push(params);
179
+ if (dir)
180
+ parts.push(dir);
171
181
  docs.push({
172
182
  nodeId: node.id,
173
183
  source: 'name',
174
- text: parts.join('. '),
184
+ text: condense(parts.join(' ')),
175
185
  });
176
186
  }
177
187
  // 3. Enum/const values
@@ -181,7 +191,7 @@ export function extractNlTexts(node) {
181
191
  docs.push({
182
192
  nodeId: node.id,
183
193
  source: 'enum',
184
- text: `${expandedName}: ${values}`,
194
+ text: condense(`${expandedName} ${values}`),
185
195
  });
186
196
  }
187
197
  }
@@ -271,8 +281,9 @@ export async function buildNlEmbeddings(db, onProgress) {
271
281
  // Find worker script path
272
282
  const thisDir = pathMod.dirname(fileURLToPath(import.meta.url));
273
283
  const workerScript = pathMod.join(thisDir, 'nl-embed-worker.js');
274
- // Split work across workers
275
- const ITEMS_PER_BATCH = 50;
284
+ // Split work across workers — larger batches reduce IPC round-trips
285
+ // and let the ONNX runtime amortize overhead across more items
286
+ const ITEMS_PER_BATCH = 256;
276
287
  let nextIdx = 0;
277
288
  let embedded = 0;
278
289
  const getNextBatch = () => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@zuvia-software-solutions/code-mapper",
3
- "version": "2.6.2",
3
+ "version": "2.6.4",
4
4
  "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
5
5
  "author": "Abhigyan Patwari",
6
6
  "license": "PolyForm-Noncommercial-1.0.0",