@zuvia-software-solutions/code-mapper 2.6.2 → 2.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -9,20 +9,28 @@
|
|
|
9
9
|
import { pipeline } from '@huggingface/transformers';
|
|
10
10
|
const MODEL_ID = 'Xenova/bge-small-en-v1.5';
|
|
11
11
|
async function main() {
|
|
12
|
-
|
|
13
|
-
const extractor = await pipeline('feature-extraction', MODEL_ID, { quantized: true });
|
|
12
|
+
const extractor = await pipeline('feature-extraction', MODEL_ID, { dtype: 'q8' });
|
|
14
13
|
process.send({ type: 'ready' });
|
|
15
14
|
// Process messages from parent
|
|
16
15
|
process.on('message', async (msg) => {
|
|
17
16
|
if (msg.type === 'embed') {
|
|
18
17
|
const results = [];
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
18
|
+
try {
|
|
19
|
+
const texts = msg.items.map((item) => item.text);
|
|
20
|
+
const batchResult = await extractor(texts, { pooling: 'cls', normalize: true });
|
|
21
|
+
const dims = batchResult.dims?.[1] ?? 384;
|
|
22
|
+
const flat = batchResult.data;
|
|
23
|
+
for (let i = 0; i < msg.items.length; i++) {
|
|
24
|
+
results.push({ nodeId: msg.items[i].nodeId, vec: Array.from(flat.subarray(i * dims, (i + 1) * dims)) });
|
|
23
25
|
}
|
|
24
|
-
|
|
25
|
-
|
|
26
|
+
}
|
|
27
|
+
catch {
|
|
28
|
+
for (const item of msg.items) {
|
|
29
|
+
try {
|
|
30
|
+
const result = await extractor(item.text, { pooling: 'cls', normalize: true });
|
|
31
|
+
results.push({ nodeId: item.nodeId, vec: Array.from(result.data) });
|
|
32
|
+
}
|
|
33
|
+
catch { /* skip */ }
|
|
26
34
|
}
|
|
27
35
|
}
|
|
28
36
|
process.send({ type: 'results', results, batchId: msg.batchId });
|
|
@@ -30,7 +30,7 @@ interface NlDocument {
|
|
|
30
30
|
source: string;
|
|
31
31
|
text: string;
|
|
32
32
|
}
|
|
33
|
-
/** Build NL documents from a node */
|
|
33
|
+
/** Build NL documents from a node — keyword-dense, minimal tokens */
|
|
34
34
|
export declare function extractNlTexts(node: NodeForNl): NlDocument[];
|
|
35
35
|
/**
|
|
36
36
|
* Build NL embeddings for all eligible nodes in the database.
|
|
@@ -21,11 +21,13 @@ export async function initNlEmbedder() {
|
|
|
21
21
|
return loadPromise;
|
|
22
22
|
loadPromise = (async () => {
|
|
23
23
|
const { pipeline, env } = await import('@huggingface/transformers');
|
|
24
|
+
const os = await import('os');
|
|
25
|
+
const cpuCount = os.cpus().length;
|
|
24
26
|
// Use all available CPU threads for ONNX inference
|
|
25
27
|
if (env.backends?.onnx?.wasm) {
|
|
26
|
-
env.backends.onnx.wasm.numThreads = Math.max(1,
|
|
28
|
+
env.backends.onnx.wasm.numThreads = Math.max(1, cpuCount);
|
|
27
29
|
}
|
|
28
|
-
extractor = await pipeline('feature-extraction', MODEL_ID, {
|
|
30
|
+
extractor = await pipeline('feature-extraction', MODEL_ID, { dtype: 'q8' });
|
|
29
31
|
})();
|
|
30
32
|
return loadPromise;
|
|
31
33
|
}
|
|
@@ -44,14 +46,15 @@ export async function nlEmbed(text) {
|
|
|
44
46
|
export async function nlEmbedBatch(texts) {
|
|
45
47
|
if (!extractor)
|
|
46
48
|
await initNlEmbedder();
|
|
47
|
-
const BATCH =
|
|
49
|
+
const BATCH = 64;
|
|
48
50
|
const results = [];
|
|
49
51
|
for (let i = 0; i < texts.length; i += BATCH) {
|
|
50
52
|
const batch = texts.slice(i, i + BATCH);
|
|
51
|
-
|
|
52
|
-
const
|
|
53
|
-
|
|
54
|
-
|
|
53
|
+
const batchResult = await extractor(batch, { pooling: 'cls', normalize: true });
|
|
54
|
+
const dims = batchResult.dims?.[1] ?? 384;
|
|
55
|
+
const flat = batchResult.data;
|
|
56
|
+
for (let j = 0; j < batch.length; j++) {
|
|
57
|
+
results.push(Array.from(flat.subarray(j * dims, (j + 1) * dims)));
|
|
55
58
|
}
|
|
56
59
|
}
|
|
57
60
|
return results;
|
|
@@ -144,11 +147,19 @@ function extractParamNames(content) {
|
|
|
144
147
|
.map(p => expandIdentifier(p))
|
|
145
148
|
.join(', ');
|
|
146
149
|
}
|
|
147
|
-
/**
|
|
150
|
+
/** Strip noise tokens that waste tokenizer budget without adding semantic value */
|
|
151
|
+
function condense(text) {
|
|
152
|
+
return text
|
|
153
|
+
.replace(/---[^-]*---/g, '') // section headers from comments
|
|
154
|
+
.replace(/[{}[\]()'",;:]/g, '') // punctuation
|
|
155
|
+
.replace(/\. /g, ' ') // sentence separators
|
|
156
|
+
.replace(/\s{2,}/g, ' ') // collapse whitespace
|
|
157
|
+
.trim();
|
|
158
|
+
}
|
|
159
|
+
/** Build NL documents from a node — keyword-dense, minimal tokens */
|
|
148
160
|
export function extractNlTexts(node) {
|
|
149
161
|
const docs = [];
|
|
150
|
-
const
|
|
151
|
-
const expandedName = expandIdentifier(name);
|
|
162
|
+
const expandedName = expandIdentifier(node.name);
|
|
152
163
|
const dir = node.filePath.split('/').slice(-3, -1).join('/');
|
|
153
164
|
// 1. Comment-based NL text (primary)
|
|
154
165
|
const comment = extractFullComment(node.content);
|
|
@@ -156,22 +167,21 @@ export function extractNlTexts(node) {
|
|
|
156
167
|
docs.push({
|
|
157
168
|
nodeId: node.id,
|
|
158
169
|
source: 'comment',
|
|
159
|
-
text: `${expandedName}
|
|
170
|
+
text: condense(`${expandedName} ${comment} ${dir}`),
|
|
160
171
|
});
|
|
161
172
|
}
|
|
162
|
-
// 2. Name + params
|
|
173
|
+
// 2. Name + params (always available)
|
|
163
174
|
const params = extractParamNames(node.content);
|
|
164
|
-
const parts = [expandedName];
|
|
165
|
-
if (params)
|
|
166
|
-
parts.push(`Parameters: ${params}`);
|
|
167
|
-
if (dir)
|
|
168
|
-
parts.push(`in ${dir}`);
|
|
169
175
|
if (!comment) {
|
|
170
|
-
|
|
176
|
+
const parts = [expandedName];
|
|
177
|
+
if (params)
|
|
178
|
+
parts.push(params);
|
|
179
|
+
if (dir)
|
|
180
|
+
parts.push(dir);
|
|
171
181
|
docs.push({
|
|
172
182
|
nodeId: node.id,
|
|
173
183
|
source: 'name',
|
|
174
|
-
text: parts.join('
|
|
184
|
+
text: condense(parts.join(' ')),
|
|
175
185
|
});
|
|
176
186
|
}
|
|
177
187
|
// 3. Enum/const values
|
|
@@ -181,7 +191,7 @@ export function extractNlTexts(node) {
|
|
|
181
191
|
docs.push({
|
|
182
192
|
nodeId: node.id,
|
|
183
193
|
source: 'enum',
|
|
184
|
-
text: `${expandedName}
|
|
194
|
+
text: condense(`${expandedName} ${values}`),
|
|
185
195
|
});
|
|
186
196
|
}
|
|
187
197
|
}
|
|
@@ -271,8 +281,9 @@ export async function buildNlEmbeddings(db, onProgress) {
|
|
|
271
281
|
// Find worker script path
|
|
272
282
|
const thisDir = pathMod.dirname(fileURLToPath(import.meta.url));
|
|
273
283
|
const workerScript = pathMod.join(thisDir, 'nl-embed-worker.js');
|
|
274
|
-
// Split work across workers
|
|
275
|
-
|
|
284
|
+
// Split work across workers — larger batches reduce IPC round-trips
|
|
285
|
+
// and let the ONNX runtime amortize overhead across more items
|
|
286
|
+
const ITEMS_PER_BATCH = 256;
|
|
276
287
|
let nextIdx = 0;
|
|
277
288
|
let embedded = 0;
|
|
278
289
|
const getNextBatch = () => {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@zuvia-software-solutions/code-mapper",
|
|
3
|
-
"version": "2.6.2",
|
|
3
|
+
"version": "2.6.4",
|
|
4
4
|
"description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
|
|
5
5
|
"author": "Abhigyan Patwari",
|
|
6
6
|
"license": "PolyForm-Noncommercial-1.0.0",
|