@zuvia-software-solutions/code-mapper 2.2.2 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/analyze.js +5 -1
- package/dist/core/db/adapter.d.ts +6 -0
- package/dist/core/db/adapter.js +102 -2
- package/dist/core/db/schema.d.ts +3 -1
- package/dist/core/db/schema.js +7 -5
- package/models/jina-v5-small-mlx/config.json +19 -0
- package/models/jina-v5-small-mlx/model.py +260 -0
- package/models/mlx-embedder.py +495 -0
- package/package.json +5 -2
package/dist/cli/analyze.js
CHANGED
|
@@ -5,7 +5,7 @@ import { execFileSync } from 'child_process';
|
|
|
5
5
|
import v8 from 'v8';
|
|
6
6
|
import cliProgress from 'cli-progress';
|
|
7
7
|
import { runPipelineFromRepo } from '../core/ingestion/pipeline.js';
|
|
8
|
-
import { openDb, closeDb, resetDb, getStats, insertEmbeddingsBatch, countEmbeddings } from '../core/db/adapter.js';
|
|
8
|
+
import { openDb, closeDb, resetDb, getStats, insertEmbeddingsBatch, countEmbeddings, populateSearchText } from '../core/db/adapter.js';
|
|
9
9
|
import { loadGraphToDb } from '../core/db/graph-loader.js';
|
|
10
10
|
import { stitchRoutes } from '../core/ingestion/route-stitcher.js';
|
|
11
11
|
import { toNodeId } from '../core/db/schema.js';
|
|
@@ -200,6 +200,10 @@ export const analyzeCommand = async (inputPath, options) => {
|
|
|
200
200
|
const dbWarnings = dbResult.warnings;
|
|
201
201
|
// Phase 2.5: HTTP route stitching (post-DB-load, needs content field)
|
|
202
202
|
stitchRoutes(db);
|
|
203
|
+
// Phase 2.6: Populate searchText for BM25 concept matching
|
|
204
|
+
// Uses first comment + callers + module — must run after edges are loaded
|
|
205
|
+
updateBar(84, 'Building search index...');
|
|
206
|
+
populateSearchText(db);
|
|
203
207
|
// Phase 3: FTS (85-90%)
|
|
204
208
|
// FTS5 is auto-created by schema triggers — no manual index creation needed
|
|
205
209
|
updateBar(85, 'Search indexes ready');
|
|
@@ -90,6 +90,12 @@ export declare function getStats(db: Database.Database): {
|
|
|
90
90
|
export declare function insertNodesBatch(db: Database.Database, nodes: readonly NodeInsert[]): void;
|
|
91
91
|
/** Batch insert edges in a single transaction. */
|
|
92
92
|
export declare function insertEdgesBatch(db: Database.Database, edges: readonly EdgeInsert[]): void;
|
|
93
|
+
/**
|
|
94
|
+
* Populate the searchText column for all nodes with semantic summaries.
|
|
95
|
+
* Uses first comment + callers + module to enable BM25 concept matching.
|
|
96
|
+
* Call AFTER edges are loaded (needs CALLS and MEMBER_OF edges).
|
|
97
|
+
*/
|
|
98
|
+
export declare function populateSearchText(db: Database.Database): void;
|
|
93
99
|
/** Batch insert embeddings in a single transaction. */
|
|
94
100
|
export declare function insertEmbeddingsBatch(db: Database.Database, items: readonly {
|
|
95
101
|
nodeId: NodeId;
|
package/dist/core/db/adapter.js
CHANGED
|
@@ -145,12 +145,12 @@ const INSERT_NODE_SQL = `
|
|
|
145
145
|
id, label, name, filePath, startLine, endLine, isExported, content, description,
|
|
146
146
|
heuristicLabel, cohesion, symbolCount, keywords, enrichedBy,
|
|
147
147
|
processType, stepCount, communities, entryPointId, terminalId,
|
|
148
|
-
parameterCount, returnType, nameExpanded
|
|
148
|
+
parameterCount, returnType, nameExpanded, searchText
|
|
149
149
|
) VALUES (
|
|
150
150
|
@id, @label, @name, @filePath, @startLine, @endLine, @isExported, @content, @description,
|
|
151
151
|
@heuristicLabel, @cohesion, @symbolCount, @keywords, @enrichedBy,
|
|
152
152
|
@processType, @stepCount, @communities, @entryPointId, @terminalId,
|
|
153
|
-
@parameterCount, @returnType, @nameExpanded
|
|
153
|
+
@parameterCount, @returnType, @nameExpanded, @searchText
|
|
154
154
|
)
|
|
155
155
|
`;
|
|
156
156
|
/** Insert or replace a node. Automatically expands name for FTS natural language matching. */
|
|
@@ -178,6 +178,7 @@ export function insertNode(db, node) {
|
|
|
178
178
|
parameterCount: node.parameterCount ?? null,
|
|
179
179
|
returnType: node.returnType ?? null,
|
|
180
180
|
nameExpanded: node.nameExpanded ?? expandIdentifier(node.name ?? ''),
|
|
181
|
+
searchText: node.searchText ?? '',
|
|
181
182
|
});
|
|
182
183
|
}
|
|
183
184
|
/** Get a node by ID. Returns undefined if not found. */
|
|
@@ -373,6 +374,7 @@ export function insertNodesBatch(db, nodes) {
|
|
|
373
374
|
terminalId: node.terminalId ?? null, parameterCount: node.parameterCount ?? null,
|
|
374
375
|
returnType: node.returnType ?? null,
|
|
375
376
|
nameExpanded: node.nameExpanded ?? expandIdentifier(node.name ?? ''),
|
|
377
|
+
searchText: node.searchText ?? '',
|
|
376
378
|
});
|
|
377
379
|
}
|
|
378
380
|
});
|
|
@@ -392,6 +394,104 @@ export function insertEdgesBatch(db, edges) {
|
|
|
392
394
|
});
|
|
393
395
|
txn(edges);
|
|
394
396
|
}
|
|
397
|
+
/**
|
|
398
|
+
* Populate the searchText column for all nodes with semantic summaries.
|
|
399
|
+
* Uses first comment + callers + module to enable BM25 concept matching.
|
|
400
|
+
* Call AFTER edges are loaded (needs CALLS and MEMBER_OF edges).
|
|
401
|
+
*/
|
|
402
|
+
export function populateSearchText(db) {
|
|
403
|
+
// Extract first comment from content
|
|
404
|
+
function extractComment(content) {
|
|
405
|
+
if (!content)
|
|
406
|
+
return '';
|
|
407
|
+
const lines = content.split('\n');
|
|
408
|
+
const out = [];
|
|
409
|
+
let inBlock = false;
|
|
410
|
+
for (const l of lines) {
|
|
411
|
+
const t = l.trim();
|
|
412
|
+
if (t.startsWith('/**') || t.startsWith('/*')) {
|
|
413
|
+
inBlock = true;
|
|
414
|
+
const inner = t.replace(/^\/\*\*?\s*/, '').replace(/\*\/\s*$/, '').trim();
|
|
415
|
+
if (inner && !inner.startsWith('@'))
|
|
416
|
+
out.push(inner);
|
|
417
|
+
if (t.includes('*/'))
|
|
418
|
+
inBlock = false;
|
|
419
|
+
continue;
|
|
420
|
+
}
|
|
421
|
+
if (inBlock) {
|
|
422
|
+
if (t.includes('*/')) {
|
|
423
|
+
inBlock = false;
|
|
424
|
+
continue;
|
|
425
|
+
}
|
|
426
|
+
const inner = t.replace(/^\*\s?/, '').trim();
|
|
427
|
+
if (inner && !inner.startsWith('@'))
|
|
428
|
+
out.push(inner);
|
|
429
|
+
if (out.length >= 3)
|
|
430
|
+
break;
|
|
431
|
+
continue;
|
|
432
|
+
}
|
|
433
|
+
if (t.startsWith('//')) {
|
|
434
|
+
const inner = t.slice(2).trim();
|
|
435
|
+
if (inner)
|
|
436
|
+
out.push(inner);
|
|
437
|
+
if (out.length >= 3)
|
|
438
|
+
break;
|
|
439
|
+
continue;
|
|
440
|
+
}
|
|
441
|
+
if (out.length > 0)
|
|
442
|
+
break;
|
|
443
|
+
}
|
|
444
|
+
return out.join(' ');
|
|
445
|
+
}
|
|
446
|
+
const nodes = db.prepare("SELECT id, name, nameExpanded, content FROM nodes WHERE label IN ('Function','Class','Method','Interface','Const','TypeAlias','Enum')").all();
|
|
447
|
+
if (nodes.length === 0)
|
|
448
|
+
return;
|
|
449
|
+
// Batch fetch callers + module
|
|
450
|
+
const callerMap = new Map();
|
|
451
|
+
const moduleMap = new Map();
|
|
452
|
+
const ids = nodes.map(n => n.id);
|
|
453
|
+
for (let i = 0; i < ids.length; i += 900) {
|
|
454
|
+
const chunk = ids.slice(i, i + 900);
|
|
455
|
+
const ph = chunk.map(() => '?').join(',');
|
|
456
|
+
const callerRows = db.prepare(`SELECT e.targetId AS nid, n.name FROM edges e JOIN nodes n ON n.id = e.sourceId WHERE e.targetId IN (${ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7`).all(...chunk);
|
|
457
|
+
for (const r of callerRows) {
|
|
458
|
+
if (!callerMap.has(r.nid))
|
|
459
|
+
callerMap.set(r.nid, []);
|
|
460
|
+
callerMap.get(r.nid).push(r.name);
|
|
461
|
+
}
|
|
462
|
+
const modRows = db.prepare(`SELECT e.sourceId AS nid, c.heuristicLabel AS module FROM edges e JOIN nodes c ON c.id = e.targetId WHERE e.sourceId IN (${ph}) AND e.type = 'MEMBER_OF' AND c.label = 'Community'`).all(...chunk);
|
|
463
|
+
for (const r of modRows)
|
|
464
|
+
moduleMap.set(r.nid, r.module);
|
|
465
|
+
}
|
|
466
|
+
// Build searchText and update
|
|
467
|
+
// Drop FTS triggers temporarily to avoid column-count issues during bulk update,
|
|
468
|
+
// then rebuild the FTS index in one pass (faster than per-row trigger updates)
|
|
469
|
+
db.exec("DROP TRIGGER IF EXISTS nodes_fts_au");
|
|
470
|
+
const txn = db.transaction(() => {
|
|
471
|
+
for (const node of nodes) {
|
|
472
|
+
const parts = [];
|
|
473
|
+
if (node.nameExpanded)
|
|
474
|
+
parts.push(node.nameExpanded);
|
|
475
|
+
const comment = extractComment(node.content);
|
|
476
|
+
if (comment)
|
|
477
|
+
parts.push(comment);
|
|
478
|
+
const callers = callerMap.get(node.id)?.slice(0, 5);
|
|
479
|
+
if (callers && callers.length > 0)
|
|
480
|
+
parts.push(callers.map(c => expandIdentifier(c)).join(' '));
|
|
481
|
+
const mod = moduleMap.get(node.id);
|
|
482
|
+
if (mod)
|
|
483
|
+
parts.push(mod);
|
|
484
|
+
db.prepare('UPDATE nodes SET searchText = ? WHERE id = ?').run(parts.join(' | '), node.id);
|
|
485
|
+
}
|
|
486
|
+
});
|
|
487
|
+
txn();
|
|
488
|
+
// Rebuild FTS index from scratch and recreate the trigger
|
|
489
|
+
db.exec("INSERT INTO nodes_fts(nodes_fts) VALUES('rebuild')");
|
|
490
|
+
db.exec(`CREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN
|
|
491
|
+
INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);
|
|
492
|
+
INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);
|
|
493
|
+
END`);
|
|
494
|
+
}
|
|
395
495
|
/** Batch insert embeddings in a single transaction. */
|
|
396
496
|
export function insertEmbeddingsBatch(db, items) {
|
|
397
497
|
const stmt = db.prepare('INSERT OR REPLACE INTO embeddings (nodeId, embedding, textHash) VALUES (?, ?, ?)');
|
package/dist/core/db/schema.d.ts
CHANGED
|
@@ -49,6 +49,7 @@ export interface NodeRow {
|
|
|
49
49
|
readonly parameterCount: number | null;
|
|
50
50
|
readonly returnType: string | null;
|
|
51
51
|
readonly nameExpanded: string;
|
|
52
|
+
readonly searchText: string;
|
|
52
53
|
}
|
|
53
54
|
/** An edge row as stored in the `edges` table */
|
|
54
55
|
export interface EdgeRow {
|
|
@@ -91,6 +92,7 @@ export interface NodeInsert {
|
|
|
91
92
|
readonly parameterCount?: number | null;
|
|
92
93
|
readonly returnType?: string | null;
|
|
93
94
|
readonly nameExpanded?: string;
|
|
95
|
+
readonly searchText?: string;
|
|
94
96
|
}
|
|
95
97
|
/** Fields required to insert an edge */
|
|
96
98
|
export interface EdgeInsert {
|
|
@@ -105,4 +107,4 @@ export interface EdgeInsert {
|
|
|
105
107
|
}
|
|
106
108
|
/** Legacy edge table name constant (kept for compatibility) */
|
|
107
109
|
export declare const REL_TABLE_NAME = "CodeRelation";
|
|
108
|
-
export declare const SCHEMA_SQL = "\n-- Nodes: unified table for all code elements\nCREATE TABLE IF NOT EXISTS nodes (\n id TEXT PRIMARY KEY,\n label TEXT NOT NULL,\n name TEXT NOT NULL DEFAULT '',\n filePath TEXT NOT NULL DEFAULT '',\n startLine INTEGER,\n endLine INTEGER,\n isExported INTEGER,\n content TEXT NOT NULL DEFAULT '',\n description TEXT NOT NULL DEFAULT '',\n heuristicLabel TEXT,\n cohesion REAL,\n symbolCount INTEGER,\n keywords TEXT,\n enrichedBy TEXT,\n processType TEXT,\n stepCount INTEGER,\n communities TEXT,\n entryPointId TEXT,\n terminalId TEXT,\n parameterCount INTEGER,\n returnType TEXT,\n nameExpanded TEXT DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);\nCREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath ON nodes(filePath);\nCREATE INDEX IF NOT EXISTS idx_nodes_label_name ON nodes(label, name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath_lines ON nodes(filePath, startLine, endLine);\n\n-- Edges: single table for all relationships\nCREATE TABLE IF NOT EXISTS edges (\n id TEXT PRIMARY KEY,\n sourceId TEXT NOT NULL,\n targetId TEXT NOT NULL,\n type TEXT NOT NULL,\n confidence REAL NOT NULL DEFAULT 1.0,\n reason TEXT NOT NULL DEFAULT '',\n step INTEGER NOT NULL DEFAULT 0,\n callLine INTEGER\n);\n\nCREATE INDEX IF NOT EXISTS idx_edges_sourceId ON edges(sourceId);\nCREATE INDEX IF NOT EXISTS idx_edges_targetId ON edges(targetId);\nCREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);\nCREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(sourceId, type);\nCREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(targetId, type);\n\n-- Embeddings: vector storage\nCREATE TABLE IF NOT EXISTS embeddings (\n nodeId TEXT PRIMARY KEY,\n embedding BLOB NOT NULL,\n textHash TEXT\n);\n\n-- FTS5 virtual table (auto-updated via triggers)\nCREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(\n name,\n nameExpanded,\n filePath,\n content,\n content='nodes',\n 
content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN\n INSERT INTO nodes_fts(rowid, name, nameExpanded, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.filePath, new.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.filePath, old.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.filePath, old.content);\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.filePath, new.content);\nEND;\n";
|
|
110
|
+
export declare const SCHEMA_SQL = "\n-- Nodes: unified table for all code elements\nCREATE TABLE IF NOT EXISTS nodes (\n id TEXT PRIMARY KEY,\n label TEXT NOT NULL,\n name TEXT NOT NULL DEFAULT '',\n filePath TEXT NOT NULL DEFAULT '',\n startLine INTEGER,\n endLine INTEGER,\n isExported INTEGER,\n content TEXT NOT NULL DEFAULT '',\n description TEXT NOT NULL DEFAULT '',\n heuristicLabel TEXT,\n cohesion REAL,\n symbolCount INTEGER,\n keywords TEXT,\n enrichedBy TEXT,\n processType TEXT,\n stepCount INTEGER,\n communities TEXT,\n entryPointId TEXT,\n terminalId TEXT,\n parameterCount INTEGER,\n returnType TEXT,\n nameExpanded TEXT DEFAULT '',\n searchText TEXT DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);\nCREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath ON nodes(filePath);\nCREATE INDEX IF NOT EXISTS idx_nodes_label_name ON nodes(label, name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath_lines ON nodes(filePath, startLine, endLine);\n\n-- Edges: single table for all relationships\nCREATE TABLE IF NOT EXISTS edges (\n id TEXT PRIMARY KEY,\n sourceId TEXT NOT NULL,\n targetId TEXT NOT NULL,\n type TEXT NOT NULL,\n confidence REAL NOT NULL DEFAULT 1.0,\n reason TEXT NOT NULL DEFAULT '',\n step INTEGER NOT NULL DEFAULT 0,\n callLine INTEGER\n);\n\nCREATE INDEX IF NOT EXISTS idx_edges_sourceId ON edges(sourceId);\nCREATE INDEX IF NOT EXISTS idx_edges_targetId ON edges(targetId);\nCREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);\nCREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(sourceId, type);\nCREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(targetId, type);\n\n-- Embeddings: vector storage\nCREATE TABLE IF NOT EXISTS embeddings (\n nodeId TEXT PRIMARY KEY,\n embedding BLOB NOT NULL,\n textHash TEXT\n);\n\n-- FTS5 virtual table (auto-updated via triggers)\nCREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(\n name,\n nameExpanded,\n searchText,\n 
filePath,\n content,\n content='nodes',\n content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN\n INSERT INTO nodes_fts(rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);\nEND;\n";
|
package/dist/core/db/schema.js
CHANGED
|
@@ -79,7 +79,8 @@ CREATE TABLE IF NOT EXISTS nodes (
|
|
|
79
79
|
terminalId TEXT,
|
|
80
80
|
parameterCount INTEGER,
|
|
81
81
|
returnType TEXT,
|
|
82
|
-
nameExpanded TEXT DEFAULT ''
|
|
82
|
+
nameExpanded TEXT DEFAULT '',
|
|
83
|
+
searchText TEXT DEFAULT ''
|
|
83
84
|
);
|
|
84
85
|
|
|
85
86
|
CREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);
|
|
@@ -117,6 +118,7 @@ CREATE TABLE IF NOT EXISTS embeddings (
|
|
|
117
118
|
CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(
|
|
118
119
|
name,
|
|
119
120
|
nameExpanded,
|
|
121
|
+
searchText,
|
|
120
122
|
filePath,
|
|
121
123
|
content,
|
|
122
124
|
content='nodes',
|
|
@@ -124,13 +126,13 @@ CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(
|
|
|
124
126
|
);
|
|
125
127
|
|
|
126
128
|
CREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN
|
|
127
|
-
INSERT INTO nodes_fts(rowid, name, nameExpanded, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.filePath, new.content);
|
|
129
|
+
INSERT INTO nodes_fts(rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);
|
|
128
130
|
END;
|
|
129
131
|
CREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN
|
|
130
|
-
INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.filePath, old.content);
|
|
132
|
+
INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);
|
|
131
133
|
END;
|
|
132
134
|
CREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN
|
|
133
|
-
INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.filePath, old.content);
|
|
134
|
-
INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.filePath, new.content);
|
|
135
|
+
INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.searchText, old.filePath, old.content);
|
|
136
|
+
INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, searchText, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.searchText, new.filePath, new.content);
|
|
135
137
|
END;
|
|
136
138
|
`;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
{
|
|
2
|
+
"model_type": "qwen3",
|
|
3
|
+
"hidden_size": 1024,
|
|
4
|
+
"num_hidden_layers": 28,
|
|
5
|
+
"intermediate_size": 3072,
|
|
6
|
+
"num_attention_heads": 16,
|
|
7
|
+
"num_key_value_heads": 8,
|
|
8
|
+
"rms_norm_eps": 1e-06,
|
|
9
|
+
"vocab_size": 151936,
|
|
10
|
+
"max_position_embeddings": 32768,
|
|
11
|
+
"rope_theta": 3500000,
|
|
12
|
+
"rope_parameters": {
|
|
13
|
+
"rope_theta": 3500000,
|
|
14
|
+
"rope_type": "default"
|
|
15
|
+
},
|
|
16
|
+
"head_dim": 128,
|
|
17
|
+
"tie_word_embeddings": true,
|
|
18
|
+
"rope_scaling": null
|
|
19
|
+
}
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Jina Embeddings v5 Text Small - MLX Implementation
|
|
3
|
+
|
|
4
|
+
Pure MLX port of jina-embeddings-v5-text-small (Qwen3-0.6B backbone).
|
|
5
|
+
Zero dependency on PyTorch or transformers.
|
|
6
|
+
|
|
7
|
+
Features:
|
|
8
|
+
- Causal attention (decoder architecture)
|
|
9
|
+
- QKNorm (q_norm/k_norm per head)
|
|
10
|
+
- Last-token pooling
|
|
11
|
+
- L2 normalization
|
|
12
|
+
- Matryoshka embedding dimensions: [32, 64, 128, 256, 512, 768, 1024]
|
|
13
|
+
- Max sequence length: 32768 tokens
|
|
14
|
+
- Embedding dimension: 1024
|
|
15
|
+
|
|
16
|
+
Architecture:
|
|
17
|
+
- RoPE (rope_theta from config)
|
|
18
|
+
- SwiGLU MLP
|
|
19
|
+
- RMSNorm
|
|
20
|
+
- QKNorm (RMSNorm on Q/K per head)
|
|
21
|
+
- No attention bias
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from dataclasses import dataclass
|
|
25
|
+
from typing import Any, Dict, Optional, Union
|
|
26
|
+
|
|
27
|
+
import mlx.core as mx
|
|
28
|
+
import mlx.nn as nn
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
|
|
32
|
+
class ModelArgs:
|
|
33
|
+
model_type: str
|
|
34
|
+
hidden_size: int
|
|
35
|
+
num_hidden_layers: int
|
|
36
|
+
intermediate_size: int
|
|
37
|
+
num_attention_heads: int
|
|
38
|
+
rms_norm_eps: float
|
|
39
|
+
vocab_size: int
|
|
40
|
+
num_key_value_heads: int
|
|
41
|
+
max_position_embeddings: int
|
|
42
|
+
head_dim: int
|
|
43
|
+
tie_word_embeddings: bool
|
|
44
|
+
rope_parameters: Optional[Dict[str, Union[float, str]]] = None
|
|
45
|
+
rope_theta: Optional[float] = None
|
|
46
|
+
rope_scaling: Optional[Dict[str, Union[float, str]]] = None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class Attention(nn.Module):
|
|
50
|
+
def __init__(self, args: ModelArgs):
|
|
51
|
+
super().__init__()
|
|
52
|
+
|
|
53
|
+
dim = args.hidden_size
|
|
54
|
+
self.n_heads = n_heads = args.num_attention_heads
|
|
55
|
+
self.n_kv_heads = n_kv_heads = args.num_key_value_heads
|
|
56
|
+
|
|
57
|
+
head_dim = args.head_dim
|
|
58
|
+
self.scale = head_dim**-0.5
|
|
59
|
+
self.head_dim = head_dim
|
|
60
|
+
|
|
61
|
+
self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=False)
|
|
62
|
+
self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
|
|
63
|
+
self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
|
|
64
|
+
self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=False)
|
|
65
|
+
|
|
66
|
+
# Qwen3 has QKNorm
|
|
67
|
+
self.q_norm = nn.RMSNorm(head_dim, eps=args.rms_norm_eps)
|
|
68
|
+
self.k_norm = nn.RMSNorm(head_dim, eps=args.rms_norm_eps)
|
|
69
|
+
|
|
70
|
+
# Resolve rope_theta from config
|
|
71
|
+
if args.rope_parameters and 'rope_theta' in args.rope_parameters:
|
|
72
|
+
rope_theta = float(args.rope_parameters['rope_theta'])
|
|
73
|
+
elif args.rope_theta:
|
|
74
|
+
rope_theta = float(args.rope_theta)
|
|
75
|
+
else:
|
|
76
|
+
rope_theta = 10000.0
|
|
77
|
+
self.rope_theta = rope_theta
|
|
78
|
+
|
|
79
|
+
def __call__(
|
|
80
|
+
self,
|
|
81
|
+
x: mx.array,
|
|
82
|
+
mask: Optional[mx.array] = None,
|
|
83
|
+
) -> mx.array:
|
|
84
|
+
B, L, D = x.shape
|
|
85
|
+
|
|
86
|
+
queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)
|
|
87
|
+
|
|
88
|
+
# Reshape and apply QKNorm
|
|
89
|
+
queries = self.q_norm(queries.reshape(B, L, self.n_heads, -1)).transpose(0, 2, 1, 3)
|
|
90
|
+
keys = self.k_norm(keys.reshape(B, L, self.n_kv_heads, -1)).transpose(0, 2, 1, 3)
|
|
91
|
+
values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
|
|
92
|
+
|
|
93
|
+
# RoPE via mx.fast
|
|
94
|
+
queries = mx.fast.rope(queries, self.head_dim, traditional=False, base=self.rope_theta, scale=1.0, offset=0)
|
|
95
|
+
keys = mx.fast.rope(keys, self.head_dim, traditional=False, base=self.rope_theta, scale=1.0, offset=0)
|
|
96
|
+
|
|
97
|
+
# Scaled dot-product attention (handles GQA natively)
|
|
98
|
+
output = mx.fast.scaled_dot_product_attention(
|
|
99
|
+
queries, keys, values,
|
|
100
|
+
mask=mask.astype(queries.dtype) if mask is not None else None,
|
|
101
|
+
scale=self.scale,
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
|
|
105
|
+
return self.o_proj(output)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class MLP(nn.Module):
|
|
109
|
+
def __init__(self, dim, hidden_dim):
|
|
110
|
+
super().__init__()
|
|
111
|
+
self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
|
|
112
|
+
self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
|
|
113
|
+
self.up_proj = nn.Linear(dim, hidden_dim, bias=False)
|
|
114
|
+
|
|
115
|
+
def __call__(self, x) -> mx.array:
|
|
116
|
+
gate = nn.silu(self.gate_proj(x))
|
|
117
|
+
return self.down_proj(gate * self.up_proj(x))
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class TransformerBlock(nn.Module):
|
|
121
|
+
def __init__(self, args: ModelArgs):
|
|
122
|
+
super().__init__()
|
|
123
|
+
self.self_attn = Attention(args)
|
|
124
|
+
self.mlp = MLP(args.hidden_size, args.intermediate_size)
|
|
125
|
+
self.input_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
|
|
126
|
+
self.post_attention_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
|
|
127
|
+
|
|
128
|
+
def __call__(
|
|
129
|
+
self,
|
|
130
|
+
x: mx.array,
|
|
131
|
+
mask: Optional[mx.array] = None,
|
|
132
|
+
) -> mx.array:
|
|
133
|
+
r = self.self_attn(self.input_layernorm(x), mask)
|
|
134
|
+
h = x + r
|
|
135
|
+
r = self.mlp(self.post_attention_layernorm(h))
|
|
136
|
+
out = h + r
|
|
137
|
+
return out
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class Qwen3Model(nn.Module):
|
|
141
|
+
def __init__(self, args: ModelArgs):
|
|
142
|
+
super().__init__()
|
|
143
|
+
self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
|
|
144
|
+
self.layers = [TransformerBlock(args=args) for _ in range(args.num_hidden_layers)]
|
|
145
|
+
self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
|
|
146
|
+
|
|
147
|
+
def __call__(self, inputs: mx.array, mask: Optional[mx.array] = None):
|
|
148
|
+
h = self.embed_tokens(inputs)
|
|
149
|
+
for layer in self.layers:
|
|
150
|
+
h = layer(h, mask)
|
|
151
|
+
return self.norm(h)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class JinaEmbeddingModel(nn.Module):
|
|
155
|
+
"""Jina v5-text-small embedding model with last-token pooling."""
|
|
156
|
+
|
|
157
|
+
def __init__(self, config: dict):
|
|
158
|
+
super().__init__()
|
|
159
|
+
args = ModelArgs(**config)
|
|
160
|
+
self.model = Qwen3Model(args)
|
|
161
|
+
self.config = config
|
|
162
|
+
|
|
163
|
+
def __call__(
|
|
164
|
+
self,
|
|
165
|
+
input_ids: mx.array,
|
|
166
|
+
attention_mask: Optional[mx.array] = None,
|
|
167
|
+
):
|
|
168
|
+
batch_size, seq_len = input_ids.shape
|
|
169
|
+
|
|
170
|
+
# Causal mask (Qwen3 is a decoder model)
|
|
171
|
+
causal_mask = mx.tril(mx.ones((seq_len, seq_len)))
|
|
172
|
+
causal_mask = mx.where(causal_mask == 0, -1e4, 0.0)
|
|
173
|
+
causal_mask = causal_mask[None, None, :, :]
|
|
174
|
+
|
|
175
|
+
# Combine with padding mask
|
|
176
|
+
if attention_mask is not None:
|
|
177
|
+
padding_mask = mx.where(attention_mask == 0, -1e4, 0.0)
|
|
178
|
+
padding_mask = padding_mask[:, None, None, :]
|
|
179
|
+
mask = causal_mask + padding_mask
|
|
180
|
+
else:
|
|
181
|
+
mask = causal_mask
|
|
182
|
+
|
|
183
|
+
hidden_states = self.model(input_ids, mask)
|
|
184
|
+
|
|
185
|
+
# Last token pooling
|
|
186
|
+
if attention_mask is not None:
|
|
187
|
+
sequence_lengths = mx.sum(attention_mask, axis=1) - 1
|
|
188
|
+
batch_indices = mx.arange(hidden_states.shape[0])
|
|
189
|
+
embeddings = hidden_states[batch_indices, sequence_lengths]
|
|
190
|
+
else:
|
|
191
|
+
embeddings = hidden_states[:, -1, :]
|
|
192
|
+
|
|
193
|
+
# L2 normalization
|
|
194
|
+
norms = mx.linalg.norm(embeddings, axis=1, keepdims=True)
|
|
195
|
+
embeddings = embeddings / norms
|
|
196
|
+
|
|
197
|
+
return embeddings
|
|
198
|
+
|
|
199
|
+
def encode(
|
|
200
|
+
self,
|
|
201
|
+
texts: list[str],
|
|
202
|
+
tokenizer,
|
|
203
|
+
max_length: int = 8192,
|
|
204
|
+
truncate_dim: Optional[int] = None,
|
|
205
|
+
task_type: str = "retrieval.query",
|
|
206
|
+
):
|
|
207
|
+
"""
|
|
208
|
+
Encode texts to embeddings.
|
|
209
|
+
|
|
210
|
+
Args:
|
|
211
|
+
texts: List of input texts
|
|
212
|
+
tokenizer: Tokenizer instance (from tokenizers library)
|
|
213
|
+
max_length: Maximum sequence length
|
|
214
|
+
truncate_dim: Optional Matryoshka dimension [32, 64, 128, 256, 512, 768, 1024]
|
|
215
|
+
task_type: Task prefix ("retrieval.query", "retrieval.passage", etc.)
|
|
216
|
+
|
|
217
|
+
Returns:
|
|
218
|
+
Embeddings array [batch, dim]
|
|
219
|
+
"""
|
|
220
|
+
prefix_map = {
|
|
221
|
+
"retrieval.query": "Query: ",
|
|
222
|
+
"retrieval.passage": "Document: ",
|
|
223
|
+
"classification": "Document: ",
|
|
224
|
+
"text-matching": "Document: ",
|
|
225
|
+
"clustering": "Document: ",
|
|
226
|
+
}
|
|
227
|
+
prefix = prefix_map.get(task_type, "")
|
|
228
|
+
|
|
229
|
+
if prefix:
|
|
230
|
+
texts = [prefix + text for text in texts]
|
|
231
|
+
|
|
232
|
+
encodings = tokenizer.encode_batch(texts)
|
|
233
|
+
|
|
234
|
+
max_len = min(max_length, max(len(enc.ids) for enc in encodings))
|
|
235
|
+
input_ids = []
|
|
236
|
+
attention_mask = []
|
|
237
|
+
|
|
238
|
+
for encoding in encodings:
|
|
239
|
+
ids = encoding.ids[:max_len]
|
|
240
|
+
mask = encoding.attention_mask[:max_len]
|
|
241
|
+
|
|
242
|
+
pad_len = max_len - len(ids)
|
|
243
|
+
if pad_len > 0:
|
|
244
|
+
ids = ids + [0] * pad_len
|
|
245
|
+
mask = mask + [0] * pad_len
|
|
246
|
+
|
|
247
|
+
input_ids.append(ids)
|
|
248
|
+
attention_mask.append(mask)
|
|
249
|
+
|
|
250
|
+
input_ids = mx.array(input_ids)
|
|
251
|
+
attention_mask = mx.array(attention_mask)
|
|
252
|
+
|
|
253
|
+
embeddings = self(input_ids, attention_mask)
|
|
254
|
+
|
|
255
|
+
if truncate_dim is not None:
|
|
256
|
+
embeddings = embeddings[:, :truncate_dim]
|
|
257
|
+
norms = mx.linalg.norm(embeddings, axis=1, keepdims=True)
|
|
258
|
+
embeddings = embeddings / norms
|
|
259
|
+
|
|
260
|
+
return embeddings
|
|
@@ -0,0 +1,495 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
MLX-accelerated code embedder for Apple Silicon.
|
|
4
|
+
|
|
5
|
+
TWO MODES:
|
|
6
|
+
1. Batch mode (main use): reads nodes directly from SQLite, embeds, writes back.
|
|
7
|
+
No IPC overhead — everything happens in one process.
|
|
8
|
+
Usage: python3 mlx-embedder.py batch <db_path> [--dims 256] [--max-tokens 2048]
|
|
9
|
+
|
|
10
|
+
2. Interactive mode (for MCP query embedding): reads JSON from stdin.
|
|
11
|
+
Usage: python3 mlx-embedder.py [interactive]
|
|
12
|
+
|
|
13
|
+
Model: Jina Embeddings v5 Text Small Retrieval (677M params, Qwen3-0.6B backbone)
|
|
14
|
+
Optimized with int4 quantization (Linear) + int6 quantization (Embedding).
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import sys
|
|
18
|
+
import os
|
|
19
|
+
import json
|
|
20
|
+
import time
|
|
21
|
+
import struct
|
|
22
|
+
import hashlib
|
|
23
|
+
|
|
24
|
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
25
|
+
|
|
26
|
+
import mlx.core as mx
|
|
27
|
+
import mlx.nn as nn
|
|
28
|
+
from tokenizers import Tokenizer
|
|
29
|
+
|
|
30
|
+
MODEL_DIR = os.path.dirname(os.path.abspath(__file__)) + "/jina-code-0.5b-mlx"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def ensure_model_downloaded():
    """Download model weights from HuggingFace if not present.

    Emits JSON progress lines on stdout (consumed by the Node CLI driving
    this script). Raises RuntimeError with install instructions when
    huggingface_hub is unavailable; any network/HTTP error raised by
    hf_hub_download propagates to the caller.
    """
    weights_path = os.path.join(MODEL_DIR, "model.safetensors")
    if os.path.exists(weights_path):
        return

    print(json.dumps({"phase": "downloading", "message": "Downloading embedding model (~1.1GB, first time only)..."}), flush=True)
    try:
        from huggingface_hub import hf_hub_download
        import shutil
        # NOTE(review): MODEL_DIR points at "jina-code-0.5b-mlx" while this
        # repo id is the v5 model — confirm the intended model/dir pairing.
        repo = "jinaai/jina-embeddings-v5-text-small-retrieval-mlx"
        # On a fresh install the target directory may not exist yet and
        # shutil.copy will not create it — make sure it is there first.
        os.makedirs(MODEL_DIR, exist_ok=True)
        for fname in ["model.safetensors", "tokenizer.json", "vocab.json", "merges.txt", "tokenizer_config.json"]:
            dest = os.path.join(MODEL_DIR, fname)
            if not os.path.exists(dest):
                # hf_hub_download returns the path of the cached file; copy it
                # into MODEL_DIR so subsequent runs skip the download entirely.
                path = hf_hub_download(repo, fname)
                shutil.copy(path, dest)
        print(json.dumps({"phase": "downloaded", "message": "Model downloaded successfully"}), flush=True)
    except ImportError:
        raise RuntimeError(
            "Model weights not found. Install huggingface_hub to auto-download:\n"
            "  pip3 install huggingface_hub\n"
            "Or manually download from: https://huggingface.co/jinaai/jina-embeddings-v5-text-small-retrieval-mlx"
        )
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def load_model():
    """Load model, quantize for speed. Auto-downloads weights on first use.

    Returns:
        (model, tokenizer): the quantized MLX model and the HF Rust tokenizer.

    Side effects: prepends MODEL_DIR to sys.path so the bundled model.py can
    be imported as module "model"; quantizes the loaded weights in place.
    """
    ensure_model_downloaded()

    # MODEL_DIR ships a model.py defining the architecture; import it
    # dynamically. sys.path must be mutated *before* import_module runs.
    sys.path.insert(0, MODEL_DIR)
    import importlib
    model_module = importlib.import_module("model")
    # Support both model class names (v5 = JinaEmbeddingModel, code-0.5b = JinaCodeEmbeddingModel)
    JinaEmbeddingModel = getattr(model_module, "JinaEmbeddingModel", None) or getattr(model_module, "JinaCodeEmbeddingModel")

    with open(os.path.join(MODEL_DIR, "config.json")) as f:
        config = json.load(f)

    model = JinaEmbeddingModel(config)
    weights = mx.load(os.path.join(MODEL_DIR, "model.safetensors"))
    model.load_weights(list(weights.items()))

    # Quantize after loading full-precision weights: int4 for Linear layers,
    # int6 for the (quality-sensitive) Embedding table.
    nn.quantize(model.model, bits=4, group_size=64,
                class_predicate=lambda _, m: isinstance(m, nn.Linear))
    nn.quantize(model.model, bits=6, group_size=64,
                class_predicate=lambda _, m: isinstance(m, nn.Embedding))
    # Force lazy MLX graphs to materialize now so first inference isn't slow.
    mx.eval(model.parameters())

    tokenizer = Tokenizer.from_file(os.path.join(MODEL_DIR, "tokenizer.json"))
    return model, tokenizer
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def get_batch_size_for_tokens(token_count):
    """Pick a batch size for a given sequence length.

    Longer sequences get smaller batches so padded batch memory stays
    roughly constant across tiers.
    """
    # (inclusive token ceiling, batch size) pairs, ascending.
    tiers = ((64, 256), (128, 128), (256, 64), (512, 32), (1024, 16))
    for ceiling, size in tiers:
        if token_count <= ceiling:
            return size
    # Anything longer than 1024 tokens runs in small batches of 8.
    return 8
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def embed_tiered(model, tokenizer, texts, task_type="retrieval.passage", truncate_dim=256, max_tokens=2048):
    """Embed texts with token-aware batching. Tokenizes first, batches by token count.
    Returns embeddings in the ORIGINAL input order.

    Args:
        model: loaded MLX embedding model (callable with ids + attention mask).
        tokenizer: HF Rust tokenizer (supports encode_batch).
        texts: list of strings to embed.
        task_type: "retrieval.query" or "retrieval.passage" — selects prefix.
        truncate_dim: Matryoshka truncation; keep only the first N dims.
        max_tokens: hard cap per sequence; longer inputs are truncated.

    Returns:
        list of per-text embedding vectors (Python lists of floats), aligned
        with the input order; entries stay None only if a text was never
        assigned to a batch (should not happen).
    """
    if not texts:
        return []

    # Add task prefix — auto-detect based on model type
    # v5 (Qwen3): "Query: " / "Document: "
    # code-0.5b (Qwen2): "Find the most relevant code snippet...\n" / "Candidate code snippet:\n"
    is_code_model = "jina-code" in MODEL_DIR
    if is_code_model:
        prefix_map = {
            "retrieval.query": "Find the most relevant code snippet given the following query:\n",
            "retrieval.passage": "Candidate code snippet:\n",
        }
    else:
        prefix_map = {"retrieval.query": "Query: ", "retrieval.passage": "Document: "}
    prefix = prefix_map.get(task_type, "")
    prefixed = [prefix + t for t in texts] if prefix else texts

    # Tokenize everything in one call (fast — Rust HF tokenizer)
    encodings = tokenizer.encode_batch(prefixed)

    # Sort by token length for minimal padding
    indexed = sorted(range(len(texts)), key=lambda i: len(encodings[i].ids))

    # Results are written back by original index so output order matches input.
    all_embeddings = [None] * len(texts)
    i = 0

    while i < len(indexed):
        # NOTE(review): batch size is chosen from the token count of the
        # element at i+1 (the second member of the upcoming batch). With
        # ascending sort this approximates the batch well, but the longest
        # member (last in the batch) can fall in a higher tier — confirm
        # memory headroom is acceptable for that case.
        peek_idx = indexed[min(i + 1, len(indexed) - 1)]
        tok_count = min(len(encodings[peek_idx].ids), max_tokens)
        batch_size = get_batch_size_for_tokens(tok_count)

        batch_indices = []
        batch_encs = []
        while len(batch_encs) < batch_size and i < len(indexed):
            orig_idx = indexed[i]
            batch_indices.append(orig_idx)
            batch_encs.append(encodings[orig_idx])
            i += 1

        # Pad every sequence in the batch to the batch's longest (capped) length.
        max_len = min(max_tokens, max(len(e.ids) for e in batch_encs))
        input_ids = []
        attention_mask = []
        for enc in batch_encs:
            ids = enc.ids[:max_len]
            mask = enc.attention_mask[:max_len]
            pad = max_len - len(ids)
            if pad > 0:
                ids = ids + [0] * pad
                mask = mask + [0] * pad
            input_ids.append(ids)
            attention_mask.append(mask)

        embs = model(mx.array(input_ids), mx.array(attention_mask))
        # Matryoshka truncation, then re-normalize so cosine similarity works
        # on the truncated vectors.
        if truncate_dim and truncate_dim < embs.shape[1]:
            embs = embs[:, :truncate_dim]
        # NOTE(review): no epsilon guard — an all-zero embedding would yield
        # NaNs here; presumably unreachable for real model output, but verify.
        norms = mx.linalg.norm(embs, axis=1, keepdims=True)
        embs = embs / norms
        mx.eval(embs)

        emb_list = embs.tolist()
        for j, orig_idx in enumerate(batch_indices):
            all_embeddings[orig_idx] = emb_list[j]

    return all_embeddings
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def float_list_to_blob(floats):
    """Serialize a sequence of floats as packed float32 bytes (Float32Array compatible).

    Uses the platform's native byte order; writer and reader share the same
    SQLite file on one machine, so that is safe here.
    """
    fmt = "%df" % len(floats)
    return struct.pack(fmt, *floats)
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def md5(text):
    """Hex MD5 digest of a UTF-8 encoded string (content hash, not security)."""
    digest = hashlib.md5(text.encode())
    return digest.hexdigest()
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# =========================================================================
|
|
178
|
+
# BATCH MODE — read from SQLite, embed, write back. Zero IPC.
|
|
179
|
+
# =========================================================================
|
|
180
|
+
|
|
181
|
+
def batch_mode(db_path, dims=256, max_tokens=2048):
    """Embed all embeddable graph nodes from a SQLite DB and write vectors back.

    Args:
        db_path: path to the code-mapper SQLite database.
        dims: Matryoshka truncation dimension for stored vectors.
        max_tokens: per-text token cap forwarded to embed_tiered.

    Emits one JSON progress line per phase on stdout (loading, loaded,
    queried, context, prepared, embedded, done) — consumed by the Node CLI.
    """
    import sqlite3

    t0_total = time.time()

    # Load model
    print(json.dumps({"phase": "loading", "message": "Loading MLX model..."}), flush=True)
    model, tokenizer = load_model()
    load_ms = int((time.time() - t0_total) * 1000)
    print(json.dumps({"phase": "loaded", "load_ms": load_ms, "device": str(mx.default_device())}), flush=True)

    # Open database
    db = sqlite3.connect(db_path)
    db.execute("PRAGMA journal_mode=WAL")
    db.execute("PRAGMA synchronous=NORMAL")

    # Ensure textHash column exists (migration)
    try:
        # Probe with LIMIT 0 — fails with OperationalError if column missing.
        db.execute("SELECT textHash FROM embeddings LIMIT 0")
    except sqlite3.OperationalError:
        db.execute("ALTER TABLE embeddings ADD COLUMN textHash TEXT")

    # Query embeddable nodes — skip test/fixture files (BM25 covers them)
    labels = ('Function', 'Class', 'Method', 'Interface')
    placeholders = ','.join('?' * len(labels))
    all_rows = db.execute(
        f"SELECT id, name, label, filePath, content, startLine, endLine, nameExpanded FROM nodes WHERE label IN ({placeholders})",
        labels
    ).fetchall()

    # Filter out test files — they're searchable via BM25 keyword matching
    test_patterns = ('/test/', '/tests/', '/spec/', '/fixtures/', '/__tests__/', '/__mocks__/',
                     '.test.', '.spec.', '_test.', '_spec.')
    rows = [r for r in all_rows if not any(p in (r[3] or '') for p in test_patterns)]
    skipped_tests = len(all_rows) - len(rows)

    print(json.dumps({"phase": "queried", "nodes": len(rows), "skipped_tests": skipped_tests}), flush=True)

    if not rows:
        print(json.dumps({"phase": "done", "embedded": 0, "skipped": 0, "ms": 0}), flush=True)
        db.close()
        return

    # Fetch graph context (callers, callees, module) for richer embedding text
    node_ids = [r[0] for r in rows]
    # NOTE(review): id_set is built but never used below — candidate for removal.
    id_set = set(node_ids)

    # Batch fetch callers
    caller_map = {}
    callee_map = {}
    module_map = {}

    # Chunk the IN clause to avoid SQLite variable limits
    CHUNK = 500
    for ci in range(0, len(node_ids), CHUNK):
        chunk_ids = node_ids[ci:ci+CHUNK]
        ph = ','.join('?' * len(chunk_ids))

        # LIMIT is interpolated with len(chunk_ids) — an int, so no injection risk.
        for row in db.execute(f"SELECT e.targetId, n.name FROM edges e JOIN nodes n ON n.id = e.sourceId WHERE e.targetId IN ({ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7 LIMIT {len(chunk_ids)*3}", chunk_ids):
            caller_map.setdefault(row[0], []).append(row[1])

        for row in db.execute(f"SELECT e.sourceId, n.name FROM edges e JOIN nodes n ON n.id = e.targetId WHERE e.sourceId IN ({ph}) AND e.type = 'CALLS' AND e.confidence >= 0.7 LIMIT {len(chunk_ids)*3}", chunk_ids):
            callee_map.setdefault(row[0], []).append(row[1])

        for row in db.execute(f"SELECT e.sourceId, c.heuristicLabel FROM edges e JOIN nodes c ON c.id = e.targetId WHERE e.sourceId IN ({ph}) AND e.type = 'MEMBER_OF' AND c.label = 'Community' LIMIT {len(chunk_ids)}", chunk_ids):
            module_map[row[0]] = row[1]

    print(json.dumps({"phase": "context", "with_callers": len(caller_map), "with_module": len(module_map)}), flush=True)

    # Get existing text hashes for skip detection
    existing_hashes = {}
    for row in db.execute("SELECT nodeId, textHash FROM embeddings WHERE textHash IS NOT NULL"):
        existing_hashes[row[0]] = row[1]

    # Generate embedding texts + hashes
    # Optimized: semantic summary (name + comment + signature + context)
    # instead of raw code dump. 55% fewer tokens, equal search quality.
    to_embed = []  # (node_id, text, hash)
    skipped = 0

    def extract_first_comment(content):
        """Extract JSDoc/comment as natural language description (max 3 lines).

        Handles /* */ blocks, // lines, and # lines; skips @tag lines and
        stops at the first non-comment line once any comment was collected.
        """
        if not content:
            return ""
        lines = content.split("\n")
        comment_lines = []
        in_block = False
        for l in lines:
            t = l.strip()
            if t.startswith("/**") or t.startswith("/*"):
                in_block = True
                # Strip comment delimiters; rstrip("*/") removes any trailing
                # '*' or '/' characters (char-set semantics, not suffix).
                inner = t.lstrip("/").lstrip("*").strip().rstrip("*/").strip()
                if inner and not inner.startswith("@"):
                    comment_lines.append(inner)
                if "*/" in t:
                    in_block = False
                continue
            if in_block:
                if "*/" in t:
                    in_block = False
                    continue
                inner = t.lstrip("*").strip()
                if inner and not inner.startswith("@"):
                    comment_lines.append(inner)
                if len(comment_lines) >= 3:
                    break
                continue
            if t.startswith("//"):
                inner = t[2:].strip()
                if inner:
                    comment_lines.append(inner)
                if len(comment_lines) >= 3:
                    break
                continue
            # '#' comments (e.g. Python/shell), but not shebangs.
            if t.startswith("#") and not t.startswith("#!"):
                inner = t[1:].strip()
                if inner:
                    comment_lines.append(inner)
                if len(comment_lines) >= 3:
                    break
                continue
            if comment_lines:
                break
        return " ".join(comment_lines)

    def extract_signature(content, label):
        """Extract code signature without full body.

        Interface: first 30 lines verbatim. Class: up to 20 member/keyword
        lines from the first 60. Anything else: first 8 lines.
        """
        if not content:
            return ""
        lines = content.split("\n")
        if label == "Interface":
            return "\n".join(lines[:30]).strip() if len(lines) <= 30 else "\n".join(lines[:30]) + "\n // ..."
        if label == "Class":
            sigs = []
            for l in lines[:60]:
                t = l.strip()
                if not t or t.startswith("//") or t.startswith("*") or t.startswith("/*"):
                    continue
                if any(kw in t for kw in ("class ", "private ", "public ", "protected ", "readonly ", "static ", "abstract ")):
                    sigs.append(t)
                    if len(sigs) >= 20:
                        break
            return "\n".join(sigs)
        return "\n".join(lines[:min(8, len(lines))]).strip()

    for row in rows:
        nid, name, label, filePath, content, startLine, endLine, nameExpanded = row
        content = content or ""
        file_name = filePath.rsplit('/', 1)[-1] if filePath else ""

        # Build semantic embedding text
        parts = [f"{label}: {name}"]

        # nameExpanded: natural language bridge (e.g. "checkStaleness" → "check staleness")
        if nameExpanded and nameExpanded != name.lower():
            parts.append(nameExpanded)

        # First comment as natural language description
        comment = extract_first_comment(content)
        if comment:
            parts.append(comment)

        # File + module location
        loc = f"File: {file_name}"
        module = module_map.get(nid, "")
        if module:
            loc += f" | Module: {module}"
        parts.append(loc)

        # Graph context
        callers = caller_map.get(nid, [])[:5]
        callees = callee_map.get(nid, [])[:5]
        if callers:
            parts.append(f"Called by: {', '.join(callers)}")
        if callees:
            parts.append(f"Calls: {', '.join(callees)}")

        # Code signature (not full body)
        sig = extract_signature(content, label)
        if sig:
            parts.extend(["", sig])

        text = '\n'.join(parts)
        text_hash = md5(text)

        # Skip if hash unchanged
        if existing_hashes.get(nid) == text_hash:
            skipped += 1
            continue

        to_embed.append((nid, text, text_hash))

    print(json.dumps({"phase": "prepared", "to_embed": len(to_embed), "skipped": skipped}), flush=True)

    if not to_embed:
        print(json.dumps({"phase": "done", "embedded": 0, "skipped": skipped, "ms": int((time.time() - t0_total) * 1000)}), flush=True)
        db.close()
        return

    # Deduplicate — embed unique texts only, copy vectors to duplicates.
    # Identical embedding texts produce identical vectors; no quality loss.
    unique_by_hash = {}  # text_hash -> { text, node_ids: [(nid, text_hash)] }
    for nid, text, text_hash in to_embed:
        if text_hash in unique_by_hash:
            unique_by_hash[text_hash]["node_ids"].append((nid, text_hash))
        else:
            unique_by_hash[text_hash] = {"text": text, "node_ids": [(nid, text_hash)]}
    unique_texts = [v["text"] for v in unique_by_hash.values()]
    deduped = len(to_embed) - len(unique_texts)

    # Embed only unique texts
    t0_embed = time.time()
    embeddings = embed_tiered(model, tokenizer, unique_texts, "retrieval.passage", dims, max_tokens)
    embed_ms = int((time.time() - t0_embed) * 1000)

    print(json.dumps({"phase": "embedded", "count": len(unique_texts), "deduped": deduped, "ms": embed_ms}), flush=True)

    # Write to database — copy embedding to all nodes sharing the same hash
    t0_write = time.time()
    db.execute("BEGIN")
    # Correctness here relies on dict insertion order: embeddings[i] lines up
    # with unique_by_hash because unique_texts was built from its .values().
    for i, (text_hash, entry) in enumerate(unique_by_hash.items()):
        emb = embeddings[i]
        if emb is None:
            continue
        blob = float_list_to_blob(emb)
        for nid, th in entry["node_ids"]:
            db.execute("INSERT OR REPLACE INTO embeddings (nodeId, embedding, textHash) VALUES (?, ?, ?)",
                       (nid, blob, th))
    db.execute("COMMIT")
    write_ms = int((time.time() - t0_write) * 1000)

    total_ms = int((time.time() - t0_total) * 1000)
    print(json.dumps({
        "phase": "done",
        "embedded": len(to_embed),
        "skipped": skipped,
        "embed_ms": embed_ms,
        "write_ms": write_ms,
        "total_ms": total_ms,
    }), flush=True)

    db.close()
|
423
|
+
|
|
424
|
+
|
|
425
|
+
# =========================================================================
|
|
426
|
+
# INTERACTIVE MODE — stdin/stdout JSON for MCP query embedding
|
|
427
|
+
# =========================================================================
|
|
428
|
+
|
|
429
|
+
def interactive_mode():
    """Serve embedding requests over stdin/stdout as line-delimited JSON.

    Protocol: each input line is one JSON object. {"cmd": "ping"} echoes a
    readiness line; {"cmd": "quit"} exits. Any other object is treated as an
    embed request: {"texts": [...], "type": "query"|"passage", "dims": N}.
    """
    def emit(payload):
        # Single choke point for responses; flush so the parent process
        # reading our pipe sees each line immediately.
        print(json.dumps(payload), flush=True)

    start = time.time()
    model, tokenizer = load_model()

    emit({
        "status": "ready",
        "model": "jina-v5-text-small-retrieval",
        "device": str(mx.default_device()),
        "load_ms": int((time.time() - start) * 1000),
        "precision": "int4-g64",
    })

    for raw in sys.stdin:
        raw = raw.strip()
        if not raw:
            continue

        try:
            request = json.loads(raw)
        except json.JSONDecodeError:
            emit({"error": "Invalid JSON"})
            continue

        # Control commands take priority over embed requests; unknown
        # commands are silently ignored.
        if "cmd" in request:
            cmd = request["cmd"]
            if cmd == "quit":
                break
            if cmd == "ping":
                emit({"status": "ready"})
            continue

        texts = request.get("texts", [])
        dims = request.get("dims", 256)
        is_query = request.get("type", "passage") == "query"
        task_type = "retrieval.query" if is_query else "retrieval.passage"

        started = time.time()
        try:
            vectors = embed_tiered(model, tokenizer, texts, task_type, dims)
        except Exception as e:
            emit({"error": str(e)})
            continue
        emit({
            "embeddings": vectors,
            "count": len(vectors),
            "dims": dims,
            "ms": int((time.time() - started) * 1000),
        })
|
477
|
+
|
|
478
|
+
|
|
479
|
+
# =========================================================================
|
|
480
|
+
# MAIN
|
|
481
|
+
# =========================================================================
|
|
482
|
+
|
|
483
|
+
if __name__ == "__main__":
    # "batch <db_path> [--dims N] [--max-tokens N]" runs the one-shot SQLite
    # embedding pass; anything else falls back to the stdin JSON server.
    argv = sys.argv
    if len(argv) >= 3 and argv[1] == "batch":
        dims, max_tokens = 256, 2048
        idx = 3
        while idx < len(argv):
            flag = argv[idx]
            has_value = idx + 1 < len(argv)
            if flag == "--dims" and has_value:
                dims = int(argv[idx + 1])
            if flag == "--max-tokens" and has_value:
                max_tokens = int(argv[idx + 1])
            idx += 1
        batch_mode(argv[2], dims, max_tokens)
    else:
        interactive_mode()
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@zuvia-software-solutions/code-mapper",
|
|
3
|
-
"version": "2.
|
|
3
|
+
"version": "2.3.0",
|
|
4
4
|
"description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
|
|
5
5
|
"author": "Abhigyan Patwari",
|
|
6
6
|
"license": "PolyForm-Noncommercial-1.0.0",
|
|
@@ -34,7 +34,10 @@
|
|
|
34
34
|
"hooks",
|
|
35
35
|
"scripts",
|
|
36
36
|
"skills",
|
|
37
|
-
"vendor"
|
|
37
|
+
"vendor",
|
|
38
|
+
"models/mlx-embedder.py",
|
|
39
|
+
"models/jina-v5-small-mlx/model.py",
|
|
40
|
+
"models/jina-v5-small-mlx/config.json"
|
|
38
41
|
],
|
|
39
42
|
"scripts": {
|
|
40
43
|
"build": "tsc",
|