@zuvia-software-solutions/code-mapper 2.3.1 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/analyze.js +10 -3
- package/models/mlx-embedder.py +70 -15
- package/package.json +1 -1
package/dist/cli/analyze.js
CHANGED
|
@@ -266,17 +266,24 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
266
266
|
continue;
|
|
267
267
|
try {
|
|
268
268
|
const msg = JSON.parse(line);
|
|
269
|
-
if (msg.phase === 'loaded') {
|
|
269
|
+
if (msg.phase === 'downloading' || msg.phase === 'converting') {
|
|
270
|
+
updateBar(90, msg.message);
|
|
271
|
+
}
|
|
272
|
+
else if (msg.phase === 'loaded') {
|
|
270
273
|
updateBar(91, `Model loaded (${msg.load_ms}ms)`);
|
|
271
274
|
}
|
|
272
275
|
else if (msg.phase === 'queried') {
|
|
273
|
-
updateBar(92, `Found ${msg.nodes} embeddable nodes`);
|
|
276
|
+
updateBar(92, `Found ${msg.nodes} embeddable nodes${msg.skipped_tests ? ` (${msg.skipped_tests} test files skipped)` : ''}`);
|
|
274
277
|
}
|
|
275
278
|
else if (msg.phase === 'prepared') {
|
|
276
279
|
updateBar(93, `${msg.to_embed} to embed, ${msg.skipped} cached`);
|
|
277
280
|
}
|
|
281
|
+
else if (msg.phase === 'embedding') {
|
|
282
|
+
const scaled = 93 + Math.round((msg.progress / 100) * 4);
|
|
283
|
+
updateBar(scaled, `Embedding... ${msg.progress}% (${msg.embedded} written)`);
|
|
284
|
+
}
|
|
278
285
|
else if (msg.phase === 'embedded') {
|
|
279
|
-
updateBar(
|
|
286
|
+
updateBar(97, `Embedded ${msg.count} nodes (${(msg.ms / 1000).toFixed(1)}s)`);
|
|
280
287
|
}
|
|
281
288
|
else if (msg.phase === 'done') {
|
|
282
289
|
updateBar(98, `Embeddings complete (${msg.embedded} new, ${msg.skipped} cached)`);
|
package/models/mlx-embedder.py
CHANGED
|
@@ -415,26 +415,81 @@ def batch_mode(db_path, dims=256, max_tokens=2048):
|
|
|
415
415
|
unique_texts = [v["text"] for v in unique_by_hash.values()]
|
|
416
416
|
deduped = len(to_embed) - len(unique_texts)
|
|
417
417
|
|
|
418
|
-
# Embed
|
|
418
|
+
# Embed unique texts in streaming fashion — process each batch, write to DB
|
|
419
|
+
# immediately, free GPU memory. Keeps peak memory at ONE batch instead of ALL.
|
|
419
420
|
t0_embed = time.time()
|
|
420
|
-
|
|
421
|
-
embed_ms = int((time.time() - t0_embed) * 1000)
|
|
421
|
+
unique_entries = list(unique_by_hash.values())
|
|
422
422
|
|
|
423
|
-
|
|
423
|
+
# Tokenize + sort (same as embed_tiered but we handle the loop here)
|
|
424
|
+
is_code_model = "jina-code" in MODEL_DIR
|
|
425
|
+
if is_code_model:
|
|
426
|
+
prefix_map = {"retrieval.query": "Find the most relevant code snippet given the following query:\n", "retrieval.passage": "Candidate code snippet:\n"}
|
|
427
|
+
else:
|
|
428
|
+
prefix_map = {"retrieval.query": "Query: ", "retrieval.passage": "Document: "}
|
|
429
|
+
prefix = prefix_map.get("retrieval.passage", "")
|
|
430
|
+
prefixed = [prefix + e["text"] for e in unique_entries]
|
|
431
|
+
encodings = tokenizer.encode_batch(prefixed)
|
|
432
|
+
indexed = sorted(range(len(unique_entries)), key=lambda i: len(encodings[i].ids))
|
|
424
433
|
|
|
425
|
-
|
|
426
|
-
t0_write = time.time()
|
|
434
|
+
embedded_count = 0
|
|
427
435
|
db.execute("BEGIN")
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
+
|
|
437
|
+
i = 0
|
|
438
|
+
while i < len(indexed):
|
|
439
|
+
peek_idx = indexed[min(i + 1, len(indexed) - 1)]
|
|
440
|
+
tok_count = min(len(encodings[peek_idx].ids), max_tokens)
|
|
441
|
+
batch_size = get_batch_size_for_tokens(tok_count)
|
|
442
|
+
|
|
443
|
+
batch_indices = []
|
|
444
|
+
batch_encs = []
|
|
445
|
+
while len(batch_encs) < batch_size and i < len(indexed):
|
|
446
|
+
orig_idx = indexed[i]
|
|
447
|
+
batch_indices.append(orig_idx)
|
|
448
|
+
batch_encs.append(encodings[orig_idx])
|
|
449
|
+
i += 1
|
|
450
|
+
|
|
451
|
+
max_len = min(max_tokens, max(len(e.ids) for e in batch_encs))
|
|
452
|
+
input_ids = []
|
|
453
|
+
attention_mask = []
|
|
454
|
+
for enc in batch_encs:
|
|
455
|
+
ids = enc.ids[:max_len]
|
|
456
|
+
mask = enc.attention_mask[:max_len]
|
|
457
|
+
pad = max_len - len(ids)
|
|
458
|
+
if pad > 0:
|
|
459
|
+
ids = ids + [0] * pad
|
|
460
|
+
mask = mask + [0] * pad
|
|
461
|
+
input_ids.append(ids)
|
|
462
|
+
attention_mask.append(mask)
|
|
463
|
+
|
|
464
|
+
# Forward pass
|
|
465
|
+
embs = model(mx.array(input_ids), mx.array(attention_mask))
|
|
466
|
+
if dims and dims < embs.shape[1]:
|
|
467
|
+
embs = embs[:, :dims]
|
|
468
|
+
norms = mx.linalg.norm(embs, axis=1, keepdims=True)
|
|
469
|
+
embs = embs / norms
|
|
470
|
+
mx.eval(embs)
|
|
471
|
+
|
|
472
|
+
# Convert to Python + write to DB immediately
|
|
473
|
+
emb_list = embs.tolist()
|
|
474
|
+
del embs # free MLX GPU memory
|
|
475
|
+
|
|
476
|
+
for j, orig_idx in enumerate(batch_indices):
|
|
477
|
+
entry = unique_entries[orig_idx]
|
|
478
|
+
blob = float_list_to_blob(emb_list[j])
|
|
479
|
+
for nid, th in entry["node_ids"]:
|
|
480
|
+
db.execute("INSERT OR REPLACE INTO embeddings (nodeId, embedding, textHash) VALUES (?, ?, ?)",
|
|
481
|
+
(nid, blob, th))
|
|
482
|
+
embedded_count += len(entry["node_ids"])
|
|
483
|
+
|
|
484
|
+
# Progress
|
|
485
|
+
pct = i * 100 // len(indexed)
|
|
486
|
+
print(json.dumps({"phase": "embedding", "progress": pct, "embedded": embedded_count}), flush=True)
|
|
487
|
+
|
|
436
488
|
db.execute("COMMIT")
|
|
437
|
-
|
|
489
|
+
embed_ms = int((time.time() - t0_embed) * 1000)
|
|
490
|
+
write_ms = 0 # included in embed_ms now
|
|
491
|
+
|
|
492
|
+
print(json.dumps({"phase": "embedded", "count": len(unique_entries), "deduped": deduped, "ms": embed_ms}), flush=True)
|
|
438
493
|
|
|
439
494
|
total_ms = int((time.time() - t0_total) * 1000)
|
|
440
495
|
print(json.dumps({
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@zuvia-software-solutions/code-mapper",
|
|
3
|
-
"version": "2.3.1",
|
|
3
|
+
"version": "2.3.2",
|
|
4
4
|
"description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
|
|
5
5
|
"author": "Abhigyan Patwari",
|
|
6
6
|
"license": "PolyForm-Noncommercial-1.0.0",
|