@zuvia-software-solutions/code-mapper 1.4.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. package/dist/cli/ai-context.js +1 -1
  2. package/dist/cli/analyze.d.ts +1 -0
  3. package/dist/cli/analyze.js +73 -82
  4. package/dist/cli/augment.js +0 -2
  5. package/dist/cli/eval-server.d.ts +2 -2
  6. package/dist/cli/eval-server.js +6 -6
  7. package/dist/cli/index.js +6 -10
  8. package/dist/cli/mcp.d.ts +1 -3
  9. package/dist/cli/mcp.js +3 -3
  10. package/dist/cli/refresh.d.ts +2 -2
  11. package/dist/cli/refresh.js +24 -29
  12. package/dist/cli/status.js +4 -13
  13. package/dist/cli/tool.d.ts +5 -4
  14. package/dist/cli/tool.js +8 -10
  15. package/dist/config/ignore-service.js +14 -34
  16. package/dist/core/augmentation/engine.js +53 -83
  17. package/dist/core/db/adapter.d.ts +99 -0
  18. package/dist/core/db/adapter.js +402 -0
  19. package/dist/core/db/graph-loader.d.ts +27 -0
  20. package/dist/core/db/graph-loader.js +148 -0
  21. package/dist/core/db/queries.d.ts +160 -0
  22. package/dist/core/db/queries.js +441 -0
  23. package/dist/core/db/schema.d.ts +108 -0
  24. package/dist/core/db/schema.js +136 -0
  25. package/dist/core/embeddings/embedder.d.ts +21 -12
  26. package/dist/core/embeddings/embedder.js +104 -50
  27. package/dist/core/embeddings/embedding-pipeline.d.ts +48 -22
  28. package/dist/core/embeddings/embedding-pipeline.js +220 -262
  29. package/dist/core/embeddings/text-generator.js +4 -19
  30. package/dist/core/embeddings/types.d.ts +1 -1
  31. package/dist/core/graph/graph.d.ts +1 -1
  32. package/dist/core/graph/graph.js +1 -0
  33. package/dist/core/graph/types.d.ts +11 -9
  34. package/dist/core/graph/types.js +4 -1
  35. package/dist/core/incremental/refresh.d.ts +46 -0
  36. package/dist/core/incremental/refresh.js +503 -0
  37. package/dist/core/incremental/types.d.ts +2 -1
  38. package/dist/core/incremental/types.js +42 -44
  39. package/dist/core/ingestion/ast-cache.js +1 -0
  40. package/dist/core/ingestion/call-processor.d.ts +15 -3
  41. package/dist/core/ingestion/call-processor.js +448 -60
  42. package/dist/core/ingestion/cluster-enricher.d.ts +1 -1
  43. package/dist/core/ingestion/cluster-enricher.js +2 -0
  44. package/dist/core/ingestion/community-processor.d.ts +1 -1
  45. package/dist/core/ingestion/community-processor.js +8 -3
  46. package/dist/core/ingestion/export-detection.d.ts +1 -1
  47. package/dist/core/ingestion/export-detection.js +1 -1
  48. package/dist/core/ingestion/filesystem-walker.js +1 -1
  49. package/dist/core/ingestion/heritage-processor.d.ts +2 -2
  50. package/dist/core/ingestion/heritage-processor.js +22 -11
  51. package/dist/core/ingestion/import-processor.d.ts +2 -2
  52. package/dist/core/ingestion/import-processor.js +24 -9
  53. package/dist/core/ingestion/language-config.js +7 -4
  54. package/dist/core/ingestion/mro-processor.d.ts +1 -1
  55. package/dist/core/ingestion/mro-processor.js +23 -11
  56. package/dist/core/ingestion/named-binding-extraction.js +5 -5
  57. package/dist/core/ingestion/parsing-processor.d.ts +4 -4
  58. package/dist/core/ingestion/parsing-processor.js +26 -18
  59. package/dist/core/ingestion/pipeline.d.ts +4 -2
  60. package/dist/core/ingestion/pipeline.js +50 -20
  61. package/dist/core/ingestion/process-processor.d.ts +2 -2
  62. package/dist/core/ingestion/process-processor.js +28 -14
  63. package/dist/core/ingestion/resolution-context.d.ts +1 -1
  64. package/dist/core/ingestion/resolution-context.js +14 -4
  65. package/dist/core/ingestion/resolvers/csharp.js +4 -3
  66. package/dist/core/ingestion/resolvers/go.js +3 -1
  67. package/dist/core/ingestion/resolvers/jvm.js +13 -4
  68. package/dist/core/ingestion/resolvers/standard.js +2 -2
  69. package/dist/core/ingestion/resolvers/utils.js +6 -2
  70. package/dist/core/ingestion/route-stitcher.d.ts +15 -0
  71. package/dist/core/ingestion/route-stitcher.js +92 -0
  72. package/dist/core/ingestion/structure-processor.d.ts +1 -1
  73. package/dist/core/ingestion/structure-processor.js +3 -2
  74. package/dist/core/ingestion/symbol-table.d.ts +2 -0
  75. package/dist/core/ingestion/symbol-table.js +5 -1
  76. package/dist/core/ingestion/tree-sitter-queries.d.ts +2 -2
  77. package/dist/core/ingestion/tree-sitter-queries.js +177 -0
  78. package/dist/core/ingestion/type-env.js +20 -0
  79. package/dist/core/ingestion/type-extractors/csharp.js +4 -3
  80. package/dist/core/ingestion/type-extractors/go.js +23 -12
  81. package/dist/core/ingestion/type-extractors/php.js +18 -10
  82. package/dist/core/ingestion/type-extractors/ruby.js +15 -3
  83. package/dist/core/ingestion/type-extractors/rust.js +3 -2
  84. package/dist/core/ingestion/type-extractors/shared.js +3 -2
  85. package/dist/core/ingestion/type-extractors/typescript.js +11 -5
  86. package/dist/core/ingestion/utils.d.ts +27 -4
  87. package/dist/core/ingestion/utils.js +145 -100
  88. package/dist/core/ingestion/workers/parse-worker.d.ts +1 -0
  89. package/dist/core/ingestion/workers/parse-worker.js +97 -29
  90. package/dist/core/ingestion/workers/worker-pool.js +3 -0
  91. package/dist/core/search/bm25-index.d.ts +15 -8
  92. package/dist/core/search/bm25-index.js +48 -98
  93. package/dist/core/search/hybrid-search.d.ts +9 -3
  94. package/dist/core/search/hybrid-search.js +30 -25
  95. package/dist/core/search/reranker.js +9 -7
  96. package/dist/core/search/types.d.ts +0 -4
  97. package/dist/core/semantic/tsgo-service.d.ts +7 -1
  98. package/dist/core/semantic/tsgo-service.js +165 -66
  99. package/dist/lib/tsgo-test.d.ts +2 -0
  100. package/dist/lib/tsgo-test.js +6 -0
  101. package/dist/lib/type-utils.d.ts +25 -0
  102. package/dist/lib/type-utils.js +22 -0
  103. package/dist/lib/utils.d.ts +3 -2
  104. package/dist/lib/utils.js +3 -2
  105. package/dist/mcp/compatible-stdio-transport.js +1 -1
  106. package/dist/mcp/local/local-backend.d.ts +29 -56
  107. package/dist/mcp/local/local-backend.js +808 -1118
  108. package/dist/mcp/resources.js +35 -25
  109. package/dist/mcp/server.d.ts +1 -1
  110. package/dist/mcp/server.js +5 -5
  111. package/dist/mcp/tools.js +24 -25
  112. package/dist/storage/repo-manager.d.ts +2 -12
  113. package/dist/storage/repo-manager.js +1 -47
  114. package/dist/types/pipeline.d.ts +8 -5
  115. package/dist/types/pipeline.js +5 -0
  116. package/package.json +18 -11
  117. package/dist/cli/serve.d.ts +0 -5
  118. package/dist/cli/serve.js +0 -8
  119. package/dist/core/incremental/child-process.d.ts +0 -8
  120. package/dist/core/incremental/child-process.js +0 -649
  121. package/dist/core/incremental/refresh-coordinator.d.ts +0 -32
  122. package/dist/core/incremental/refresh-coordinator.js +0 -147
  123. package/dist/core/lbug/csv-generator.d.ts +0 -28
  124. package/dist/core/lbug/csv-generator.js +0 -355
  125. package/dist/core/lbug/lbug-adapter.d.ts +0 -96
  126. package/dist/core/lbug/lbug-adapter.js +0 -753
  127. package/dist/core/lbug/schema.d.ts +0 -46
  128. package/dist/core/lbug/schema.js +0 -402
  129. package/dist/mcp/core/embedder.d.ts +0 -24
  130. package/dist/mcp/core/embedder.js +0 -168
  131. package/dist/mcp/core/lbug-adapter.d.ts +0 -29
  132. package/dist/mcp/core/lbug-adapter.js +0 -330
  133. package/dist/server/api.d.ts +0 -5
  134. package/dist/server/api.js +0 -340
  135. package/dist/server/mcp-http.d.ts +0 -7
  136. package/dist/server/mcp-http.js +0 -95
  137. package/models/mlx-embedder.py +0 -185
@@ -0,0 +1,108 @@
1
+ /**
2
+ * @file Single source of truth for the code knowledge graph schema.
3
+ *
4
+ * ALL types in the system are derived from these const declarations.
5
+ * The compiler enforces exhaustiveness — adding a new node label or edge
6
+ * type requires updating every switch/map that handles them.
7
+ */
8
+ import { type Brand } from '../../lib/type-utils.js';
9
+ export { assertNever } from '../../lib/type-utils.js';
10
+ export declare const NODE_LABELS: readonly ["File", "Folder", "Function", "Class", "Interface", "Method", "CodeElement", "Community", "Process", "Struct", "Enum", "Macro", "Typedef", "Union", "Namespace", "Trait", "Impl", "TypeAlias", "Const", "Static", "Property", "Record", "Delegate", "Annotation", "Constructor", "Template", "Module"];
11
+ /** Union of all valid node labels — derived from the const tuple */
12
+ export type NodeLabel = typeof NODE_LABELS[number];
13
+ /** Runtime assertion: throws TypeError unless `value` is a valid NodeLabel (also narrows the type for the compiler) */
14
+ export declare function assertNodeLabel(value: string): asserts value is NodeLabel;
15
+ export declare const EDGE_TYPES: readonly ["CONTAINS", "DEFINES", "IMPORTS", "CALLS", "EXTENDS", "IMPLEMENTS", "HAS_METHOD", "OVERRIDES", "MEMBER_OF", "STEP_IN_PROCESS", "DEPENDS_ON", "PROVIDES"];
16
+ /** Union of all valid edge types — derived from the const tuple */
17
+ export type EdgeType = typeof EDGE_TYPES[number];
18
+ /** Runtime assertion: throws TypeError unless `value` is a valid EdgeType (also narrows the type for the compiler) */
19
+ export declare function assertEdgeType(value: string): asserts value is EdgeType;
20
+ /** A node ID (format: "Label:filePath:name") */
21
+ export type NodeId = Brand<string, 'NodeId'>;
22
+ /** An edge ID (format: "sourceId_type_targetId") */
23
+ export type EdgeId = Brand<string, 'EdgeId'>;
24
+ /** Construct a NodeId (runtime validation + compile-time branding) */
25
+ export declare function toNodeId(raw: string): NodeId;
26
+ /** Construct an EdgeId */
27
+ export declare function toEdgeId(raw: string): EdgeId;
28
+ /** A node row as stored in the `nodes` table */
29
+ export interface NodeRow {
30
+ readonly id: NodeId;
31
+ readonly label: NodeLabel;
32
+ readonly name: string;
33
+ readonly filePath: string;
34
+ readonly startLine: number | null;
35
+ readonly endLine: number | null;
36
+ readonly isExported: number | null;
37
+ readonly content: string;
38
+ readonly description: string;
39
+ readonly heuristicLabel: string | null;
40
+ readonly cohesion: number | null;
41
+ readonly symbolCount: number | null;
42
+ readonly keywords: string | null;
43
+ readonly enrichedBy: 'heuristic' | 'llm' | null;
44
+ readonly processType: 'intra_community' | 'cross_community' | null;
45
+ readonly stepCount: number | null;
46
+ readonly communities: string | null;
47
+ readonly entryPointId: string | null;
48
+ readonly terminalId: string | null;
49
+ readonly parameterCount: number | null;
50
+ readonly returnType: string | null;
51
+ readonly nameExpanded: string;
52
+ }
53
+ /** An edge row as stored in the `edges` table */
54
+ export interface EdgeRow {
55
+ readonly id: EdgeId;
56
+ readonly sourceId: NodeId;
57
+ readonly targetId: NodeId;
58
+ readonly type: EdgeType;
59
+ readonly confidence: number;
60
+ readonly reason: string;
61
+ readonly step: number;
62
+ readonly callLine: number | null;
63
+ }
64
+ /** An embedding row as stored in the `embeddings` table */
65
+ export interface EmbeddingRow {
66
+ readonly nodeId: NodeId;
67
+ readonly embedding: Buffer;
68
+ readonly textHash: string | null;
69
+ }
70
+ /** Fields accepted when inserting a node — only `id` and `label` are required; every other field is optional */
71
+ export interface NodeInsert {
72
+ readonly id: NodeId;
73
+ readonly label: NodeLabel;
74
+ readonly name?: string;
75
+ readonly filePath?: string;
76
+ readonly startLine?: number | null;
77
+ readonly endLine?: number | null;
78
+ readonly isExported?: number | null;
79
+ readonly content?: string;
80
+ readonly description?: string;
81
+ readonly heuristicLabel?: string | null;
82
+ readonly cohesion?: number | null;
83
+ readonly symbolCount?: number | null;
84
+ readonly keywords?: string | null;
85
+ readonly enrichedBy?: 'heuristic' | 'llm' | null;
86
+ readonly processType?: 'intra_community' | 'cross_community' | null;
87
+ readonly stepCount?: number | null;
88
+ readonly communities?: string | null;
89
+ readonly entryPointId?: string | null;
90
+ readonly terminalId?: string | null;
91
+ readonly parameterCount?: number | null;
92
+ readonly returnType?: string | null;
93
+ readonly nameExpanded?: string;
94
+ }
95
+ /** Fields accepted when inserting an edge — `id`, `sourceId`, `targetId` and `type` are required; the rest are optional */
96
+ export interface EdgeInsert {
97
+ readonly id: EdgeId;
98
+ readonly sourceId: NodeId;
99
+ readonly targetId: NodeId;
100
+ readonly type: EdgeType;
101
+ readonly confidence?: number;
102
+ readonly reason?: string;
103
+ readonly step?: number;
104
+ readonly callLine?: number | null;
105
+ }
106
+ /** Legacy edge table name constant (kept for compatibility) */
107
+ export declare const REL_TABLE_NAME = "CodeRelation";
108
+ export declare const SCHEMA_SQL = "\n-- Nodes: unified table for all code elements\nCREATE TABLE IF NOT EXISTS nodes (\n id TEXT PRIMARY KEY,\n label TEXT NOT NULL,\n name TEXT NOT NULL DEFAULT '',\n filePath TEXT NOT NULL DEFAULT '',\n startLine INTEGER,\n endLine INTEGER,\n isExported INTEGER,\n content TEXT NOT NULL DEFAULT '',\n description TEXT NOT NULL DEFAULT '',\n heuristicLabel TEXT,\n cohesion REAL,\n symbolCount INTEGER,\n keywords TEXT,\n enrichedBy TEXT,\n processType TEXT,\n stepCount INTEGER,\n communities TEXT,\n entryPointId TEXT,\n terminalId TEXT,\n parameterCount INTEGER,\n returnType TEXT,\n nameExpanded TEXT DEFAULT ''\n);\n\nCREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);\nCREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath ON nodes(filePath);\nCREATE INDEX IF NOT EXISTS idx_nodes_label_name ON nodes(label, name);\nCREATE INDEX IF NOT EXISTS idx_nodes_filePath_lines ON nodes(filePath, startLine, endLine);\n\n-- Edges: single table for all relationships\nCREATE TABLE IF NOT EXISTS edges (\n id TEXT PRIMARY KEY,\n sourceId TEXT NOT NULL,\n targetId TEXT NOT NULL,\n type TEXT NOT NULL,\n confidence REAL NOT NULL DEFAULT 1.0,\n reason TEXT NOT NULL DEFAULT '',\n step INTEGER NOT NULL DEFAULT 0,\n callLine INTEGER\n);\n\nCREATE INDEX IF NOT EXISTS idx_edges_sourceId ON edges(sourceId);\nCREATE INDEX IF NOT EXISTS idx_edges_targetId ON edges(targetId);\nCREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);\nCREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(sourceId, type);\nCREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(targetId, type);\n\n-- Embeddings: vector storage\nCREATE TABLE IF NOT EXISTS embeddings (\n nodeId TEXT PRIMARY KEY,\n embedding BLOB NOT NULL,\n textHash TEXT\n);\n\n-- FTS5 virtual table (auto-updated via triggers)\nCREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(\n name,\n nameExpanded,\n filePath,\n content,\n content='nodes',\n 
content_rowid='rowid'\n);\n\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN\n INSERT INTO nodes_fts(rowid, name, nameExpanded, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.filePath, new.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.filePath, old.content);\nEND;\nCREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.filePath, old.content);\n INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.filePath, new.content);\nEND;\n";
@@ -0,0 +1,136 @@
1
+ // code-mapper/src/core/db/schema.ts
2
+ /**
3
+ * @file Single source of truth for the code knowledge graph schema.
4
+ *
5
+ * ALL types in the system are derived from these const declarations.
6
+ * The compiler enforces exhaustiveness — adding a new node label or edge
7
+ * type requires updating every switch/map that handles them.
8
+ */
9
+ import {} from '../../lib/type-utils.js';
10
+ export { assertNever } from '../../lib/type-utils.js';
11
+ // ---------------------------------------------------------------------------
12
+ // Node labels — const tuple is the single source of truth
13
+ // ---------------------------------------------------------------------------
14
+ export const NODE_LABELS = [
15
+ 'File', 'Folder', 'Function', 'Class', 'Interface', 'Method', 'CodeElement',
16
+ 'Community', 'Process',
17
+ 'Struct', 'Enum', 'Macro', 'Typedef', 'Union', 'Namespace', 'Trait', 'Impl',
18
+ 'TypeAlias', 'Const', 'Static', 'Property', 'Record', 'Delegate', 'Annotation',
19
+ 'Constructor', 'Template', 'Module',
20
+ ];
21
+ /** Runtime guard: throws TypeError (`Invalid node label: …`) unless `value` is one of NODE_LABELS; returns nothing on success */
22
+ export function assertNodeLabel(value) {
23
+ if (!NODE_LABELS.includes(value)) {
24
+ throw new TypeError(`Invalid node label: ${value}`);
25
+ }
26
+ }
27
+ // ---------------------------------------------------------------------------
28
+ // Edge types — const tuple is the single source of truth
29
+ // ---------------------------------------------------------------------------
30
+ export const EDGE_TYPES = [
31
+ 'CONTAINS', 'DEFINES', 'IMPORTS', 'CALLS', 'EXTENDS', 'IMPLEMENTS',
32
+ 'HAS_METHOD', 'OVERRIDES', 'MEMBER_OF', 'STEP_IN_PROCESS',
33
+ 'DEPENDS_ON', 'PROVIDES',
34
+ ];
35
+ /** Runtime guard: throws TypeError (`Invalid edge type: …`) unless `value` is one of EDGE_TYPES; returns nothing on success */
36
+ export function assertEdgeType(value) {
37
+ if (!EDGE_TYPES.includes(value)) {
38
+ throw new TypeError(`Invalid edge type: ${value}`);
39
+ }
40
+ }
41
+ /** Brand a raw string as a NodeId: throws TypeError when `raw` is falsy (e.g. empty string), otherwise returns `raw` unchanged — the branding exists only in the typings */
42
+ export function toNodeId(raw) {
43
+ if (!raw)
44
+ throw new TypeError('NodeId cannot be empty');
45
+ return raw;
46
+ }
47
+ /** Brand a raw string as an EdgeId: throws TypeError when `raw` is falsy (e.g. empty string), otherwise returns `raw` unchanged */
48
+ export function toEdgeId(raw) {
49
+ if (!raw)
50
+ throw new TypeError('EdgeId cannot be empty');
51
+ return raw;
52
+ }
53
+ /** Legacy edge table name constant (kept for compatibility) */
54
+ export const REL_TABLE_NAME = 'CodeRelation';
55
+ // ---------------------------------------------------------------------------
56
+ // SQL schema — the DDL statements that create the SQLite tables
57
+ // ---------------------------------------------------------------------------
58
/**
 * DDL script that creates the SQLite storage for the code knowledge graph.
 *
 * Contents, in order:
 *  - `nodes`      — one row per code element, plus lookup indexes
 *  - `edges`      — one row per relationship, plus lookup indexes
 *  - `embeddings` — one vector blob per node id
 *  - `nodes_fts`  — FTS5 external-content index over `nodes`
 *                   (name, nameExpanded, filePath, content)
 *  - three triggers that keep `nodes_fts` in sync with `nodes`
 *
 * Every statement uses `IF NOT EXISTS`, so the script can be executed
 * repeatedly against the same database.
 *
 * FTS5 sync protocol for external-content tables: a row is removed from the
 * index by inserting the special `'delete'` command into the column named
 * after the table (`nodes_fts`), and added with a plain insert that lists
 * only `rowid` plus the indexed columns. The update trigger therefore does
 * a `'delete'` of the old values followed by a plain insert of the new ones;
 * the plain insert must NOT list the `nodes_fts` command column, otherwise
 * the column/value counts disagree and every UPDATE on `nodes` fails.
 *
 * NOTE(review): because the triggers are created with `IF NOT EXISTS`, a
 * database that already contains an older `nodes_fts_au` definition keeps
 * it — such databases need the trigger dropped and recreated (or a rebuild).
 */
export const SCHEMA_SQL = `
-- Nodes: unified table for all code elements
CREATE TABLE IF NOT EXISTS nodes (
  id TEXT PRIMARY KEY,
  label TEXT NOT NULL,
  name TEXT NOT NULL DEFAULT '',
  filePath TEXT NOT NULL DEFAULT '',
  startLine INTEGER,
  endLine INTEGER,
  isExported INTEGER,
  content TEXT NOT NULL DEFAULT '',
  description TEXT NOT NULL DEFAULT '',
  heuristicLabel TEXT,
  cohesion REAL,
  symbolCount INTEGER,
  keywords TEXT,
  enrichedBy TEXT,
  processType TEXT,
  stepCount INTEGER,
  communities TEXT,
  entryPointId TEXT,
  terminalId TEXT,
  parameterCount INTEGER,
  returnType TEXT,
  nameExpanded TEXT DEFAULT ''
);

CREATE INDEX IF NOT EXISTS idx_nodes_label ON nodes(label);
CREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);
CREATE INDEX IF NOT EXISTS idx_nodes_filePath ON nodes(filePath);
CREATE INDEX IF NOT EXISTS idx_nodes_label_name ON nodes(label, name);
CREATE INDEX IF NOT EXISTS idx_nodes_filePath_lines ON nodes(filePath, startLine, endLine);

-- Edges: single table for all relationships
CREATE TABLE IF NOT EXISTS edges (
  id TEXT PRIMARY KEY,
  sourceId TEXT NOT NULL,
  targetId TEXT NOT NULL,
  type TEXT NOT NULL,
  confidence REAL NOT NULL DEFAULT 1.0,
  reason TEXT NOT NULL DEFAULT '',
  step INTEGER NOT NULL DEFAULT 0,
  callLine INTEGER
);

CREATE INDEX IF NOT EXISTS idx_edges_sourceId ON edges(sourceId);
CREATE INDEX IF NOT EXISTS idx_edges_targetId ON edges(targetId);
CREATE INDEX IF NOT EXISTS idx_edges_type ON edges(type);
CREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(sourceId, type);
CREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(targetId, type);

-- Embeddings: vector storage
CREATE TABLE IF NOT EXISTS embeddings (
  nodeId TEXT PRIMARY KEY,
  embedding BLOB NOT NULL,
  textHash TEXT
);

-- FTS5 virtual table (auto-updated via triggers)
CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(
  name,
  nameExpanded,
  filePath,
  content,
  content='nodes',
  content_rowid='rowid'
);

CREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN
  INSERT INTO nodes_fts(rowid, name, nameExpanded, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.filePath, new.content);
END;
CREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN
  INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.filePath, old.content);
END;
CREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN
  INSERT INTO nodes_fts(nodes_fts, rowid, name, nameExpanded, filePath, content) VALUES ('delete', old.rowid, old.name, old.nameExpanded, old.filePath, old.content);
  INSERT INTO nodes_fts(rowid, name, nameExpanded, filePath, content) VALUES (new.rowid, new.name, new.nameExpanded, new.filePath, new.content);
END;
`;
@@ -2,23 +2,24 @@
2
2
  * @file embedder.ts
3
3
  * @description MLX-accelerated code embedder via Python subprocess
4
4
  *
5
- * Replaces the previous ONNX/transformers.js embedder with Jina Code 1.5B
6
- * running on Apple Silicon Metal via MLX. Fail-fast no fallback.
5
+ * Spawns a persistent Python process running Jina Code 1.5B on Apple Silicon
6
+ * Metal via MLX. Communicates via newline-delimited JSON over stdio.
7
7
  *
8
- * Model: jinaai/jina-code-embeddings-1.5b-mlx (1.54B params, 1536 dims, 32K context)
9
- * Matryoshka truncation to 256 dims for optimal speed/quality tradeoff
8
+ * Architecture: request queue with sequential processing. Each sendAndReceive()
9
+ * waits for its specific response — there is no global resolver that can be stolen by
10
+ * out-of-order messages.
11
+ *
12
+ * Model: jinaai/jina-code-embeddings-1.5b-mlx (1.54B params, 256-dim Matryoshka)
10
13
  */
11
14
  import { type EmbeddingConfig, type ModelProgress } from './types.js';
12
- /** Progress callback for model loading */
13
15
  export type ModelProgressCallback = (progress: ModelProgress) => void;
14
- /** Get the current inference device */
15
16
  export declare const getCurrentDevice: () => string | null;
17
+ export declare const isEmbedderReady: () => boolean;
18
+ export declare const getEmbeddingDims: () => number;
16
19
  /**
17
- * Initialize the MLX embedder (spawns Python subprocess, loads model)
20
+ * Initialize the MLX embedder (spawns Python subprocess, waits for model load)
18
21
  */
19
- export declare const initEmbedder: (_onProgress?: ModelProgressCallback, _config?: Partial<EmbeddingConfig>) => Promise<any>;
20
- /** Check if the embedder is initialized and ready */
21
- export declare const isEmbedderReady: () => boolean;
22
+ export declare const initEmbedder: (_onProgress?: ModelProgressCallback, _config?: Partial<EmbeddingConfig>) => Promise<void>;
22
23
  /** Get the embedder instance — not applicable for MLX, returns null */
23
24
  export declare const getEmbedder: () => any;
24
25
  /**
@@ -26,10 +27,18 @@ export declare const getEmbedder: () => any;
26
27
  */
27
28
  export declare const embedText: (text: string) => Promise<Float32Array>;
28
29
  /**
29
- * Embed multiple texts in a single batch
30
+ * Embed multiple texts in batches.
31
+ *
32
+ * Sends all texts to Python in a single request — Python's internal
33
+ * length-tiered batching optimizes GPU utilization, so no additional
34
+ * chunking is done on the Node side.
30
35
  */
31
36
  export declare const embedBatch: (texts: string[]) => Promise<Float32Array[]>;
32
- /** Convert Float32Array to number[] for LadybugDB storage */
37
+ /**
38
+ * Embed a query text for semantic search (cached, uses "query" prompt type)
39
+ */
40
+ export declare const embedQuery: (query: string) => Promise<number[]>;
41
+ /** Convert Float32Array to number[] for database storage */
33
42
  export declare const embeddingToArray: (embedding: Float32Array) => number[];
34
43
  /** Dispose the embedder subprocess */
35
44
  export declare const disposeEmbedder: () => Promise<void>;
@@ -3,43 +3,42 @@
3
3
  * @file embedder.ts
4
4
  * @description MLX-accelerated code embedder via Python subprocess
5
5
  *
6
- * Replaces the previous ONNX/transformers.js embedder with Jina Code 1.5B
7
- * running on Apple Silicon Metal via MLX. Fail-fast no fallback.
6
+ * Spawns a persistent Python process running Jina Code 1.5B on Apple Silicon
7
+ * Metal via MLX. Communicates via newline-delimited JSON over stdio.
8
8
  *
9
- * Model: jinaai/jina-code-embeddings-1.5b-mlx (1.54B params, 1536 dims, 32K context)
10
- * Matryoshka truncation to 256 dims for optimal speed/quality tradeoff
9
+ * Architecture: request queue with sequential processing. Each sendAndReceive()
10
+ * waits for its specific response — there is no global resolver that can be stolen by
11
+ * out-of-order messages.
12
+ *
13
+ * Model: jinaai/jina-code-embeddings-1.5b-mlx (1.54B params, 256-dim Matryoshka)
11
14
  */
12
- import { spawn, execFileSync } from 'child_process';
15
+ import { spawn } from 'child_process';
13
16
  import path from 'path';
14
17
  import { fileURLToPath } from 'url';
18
+ import { queryEmbeddingCache } from '../search/query-cache.js';
15
19
  import { DEFAULT_EMBEDDING_CONFIG } from './types.js';
16
20
  const __filename = fileURLToPath(import.meta.url);
17
21
  const __dirname = path.dirname(__filename);
18
- // Path to MLX embedder script (relative to compiled dist/)
19
22
  const MLX_SCRIPT = path.resolve(__dirname, '..', '..', '..', 'models', 'mlx-embedder.py');
20
- // Singleton subprocess
23
+ // ---------------------------------------------------------------------------
24
+ // Singleton state
25
+ // ---------------------------------------------------------------------------
21
26
  let mlxProcess = null;
22
- let pendingResolve = null;
23
- let pendingReject = null;
24
- let lineBuffer = '';
25
27
  let ready = false;
26
- /** Get the current inference device */
28
+ let lineBuffer = '';
29
+ /** Queued requests waiting for responses — FIFO order matches Python's processing */
30
+ const responseQueue = [];
31
+ /** Promise that resolves when the process is ready (model loaded) */
32
+ let readyPromise = null;
33
+ let readyResolve = null;
27
34
  export const getCurrentDevice = () => ready ? 'mlx-metal' : null;
35
+ export const isEmbedderReady = () => ready;
36
+ export const getEmbeddingDims = () => DEFAULT_EMBEDDING_CONFIG.dimensions;
28
37
  function ensureProcess() {
29
38
  if (mlxProcess && !mlxProcess.killed)
30
39
  return mlxProcess;
31
- // Check prerequisites
32
- try {
33
- execFileSync('python3', ['-c', 'import mlx; import tokenizers'], {
34
- timeout: 5000,
35
- stdio: ['pipe', 'pipe', 'pipe'],
36
- });
37
- }
38
- catch {
39
- throw new Error('MLX embedder requires Python 3 + MLX on Apple Silicon.\n' +
40
- 'Install: pip3 install mlx tokenizers huggingface_hub\n' +
41
- 'The embedding model will download automatically on first use (~3GB).');
42
- }
40
+ // Create ready promise before spawning so we don't miss the message
41
+ readyPromise = new Promise(resolve => { readyResolve = resolve; });
43
42
  mlxProcess = spawn('python3', [MLX_SCRIPT], {
44
43
  stdio: ['pipe', 'pipe', 'pipe'],
45
44
  env: { ...process.env, TOKENIZERS_PARALLELISM: 'false' },
@@ -54,24 +53,29 @@ function ensureProcess() {
54
53
  continue;
55
54
  try {
56
55
  const msg = JSON.parse(line);
56
+ // Startup ready message — NOT a response to any request
57
57
  if (msg.status === 'ready' && !ready) {
58
58
  ready = true;
59
- console.error(`Code Mapper: MLX embedder ready (${msg.device}, loaded in ${msg.load_ms}ms)`);
59
+ console.error(`Code Mapper: MLX embedder ready (${msg.device ?? 'unknown'}, loaded in ${msg.load_ms ?? '?'}ms)`);
60
+ readyResolve?.();
61
+ readyResolve = null;
62
+ continue; // Don't dispatch to response queue
60
63
  }
61
- if (pendingResolve) {
62
- const resolve = pendingResolve;
63
- pendingResolve = null;
64
- pendingReject = null;
65
- resolve(msg);
64
+ // Response to a queued request — dispatch FIFO
65
+ const pending = responseQueue.shift();
66
+ if (pending) {
67
+ pending.resolve(msg);
68
+ }
69
+ else {
70
+ console.error(`Code Mapper: MLX embedder unexpected message (no pending request): ${line.slice(0, 100)}`);
66
71
  }
67
72
  }
68
73
  catch {
69
- // Non-JSON output — ignore
74
+ // Non-JSON output — ignore (Python progress bars, etc.)
70
75
  }
71
76
  }
72
77
  });
73
78
  mlxProcess.stderr.on('data', (chunk) => {
74
- // Forward stderr for debugging
75
79
  const msg = chunk.toString().trim();
76
80
  if (msg)
77
81
  console.error(`[mlx-embedder] ${msg}`);
@@ -79,49 +83,60 @@ function ensureProcess() {
79
83
  mlxProcess.on('exit', (code) => {
80
84
  ready = false;
81
85
  mlxProcess = null;
82
- if (pendingReject) {
83
- const reject = pendingReject;
84
- pendingResolve = null;
85
- pendingReject = null;
86
- reject(new Error(`MLX embedder exited with code ${code}`));
86
+ // Reject all pending requests
87
+ const err = new Error(`MLX embedder exited with code ${code}`);
88
+ for (const pending of responseQueue) {
89
+ pending.reject(err);
87
90
  }
91
+ responseQueue.length = 0;
92
+ // Also resolve readyPromise so init doesn't hang
93
+ readyResolve?.();
94
+ readyResolve = null;
88
95
  });
89
96
  return mlxProcess;
90
97
  }
98
+ /**
99
+ * Send a request and wait for its response.
100
+ *
101
+ * Requests are queued FIFO — Python processes them in order and sends
102
+ * responses in the same order. Each caller gets exactly its own response.
103
+ */
91
104
  function sendAndReceive(request) {
92
105
  return new Promise((resolve, reject) => {
93
106
  const proc = ensureProcess();
94
- pendingResolve = resolve;
95
- pendingReject = reject;
107
+ responseQueue.push({ resolve, reject });
96
108
  proc.stdin.write(JSON.stringify(request) + '\n');
97
109
  });
98
110
  }
111
+ // ---------------------------------------------------------------------------
112
+ // Public API
113
+ // ---------------------------------------------------------------------------
99
114
  /**
100
- * Initialize the MLX embedder (spawns Python subprocess, loads model)
115
+ * Initialize the MLX embedder (spawns Python subprocess, waits for model load)
101
116
  */
102
117
  export const initEmbedder = async (_onProgress, _config = {}) => {
103
118
  if (ready)
104
119
  return;
105
120
  ensureProcess();
106
- // Wait for the "ready" message from the Python process
107
- const msg = await sendAndReceive({ cmd: 'ping' });
108
- if (msg.error) {
109
- throw new Error(`MLX embedder failed: ${msg.error}`);
121
+ // Wait for the automatic "ready" message from Python (model loaded)
122
+ // No ping needed — Python sends ready on its own after loading the model
123
+ await readyPromise;
124
+ if (!ready) {
125
+ throw new Error('MLX embedder failed to start — process exited before ready');
110
126
  }
111
- return msg;
112
127
  };
113
- /** Check if the embedder is initialized and ready */
114
- export const isEmbedderReady = () => ready;
115
128
  /** Get the embedder instance — not applicable for MLX, returns null */
116
129
  export const getEmbedder = () => {
117
130
  if (!ready)
118
131
  throw new Error('MLX embedder not initialized. Call initEmbedder() first.');
119
- return null; // No JS-side instance — inference happens in Python
132
+ return null;
120
133
  };
121
134
  /**
122
135
  * Embed a single text string
123
136
  */
124
137
  export const embedText = async (text) => {
138
+ if (!ready)
139
+ await initEmbedder();
125
140
  const result = await sendAndReceive({
126
141
  texts: [text],
127
142
  task: 'nl2code',
@@ -133,11 +148,21 @@ export const embedText = async (text) => {
133
148
  return new Float32Array(result.embeddings[0]);
134
149
  };
135
150
  /**
136
- * Embed multiple texts in a single batch
151
+ * Embed multiple texts in batches.
152
+ *
153
+ * Sends all texts to Python in a single request — Python's internal
154
+ * length-tiered batching optimizes GPU utilization, so no additional
155
+ * chunking is done on the Node side.
137
156
  */
138
157
  export const embedBatch = async (texts) => {
139
158
  if (texts.length === 0)
140
159
  return [];
160
+ if (!ready)
161
+ await initEmbedder();
162
+ // Send all texts to Python in one call — Python does optimal length-tiered
163
+ // batching internally for Metal GPU. No need to double-batch at the Node level.
164
+ console.error(`Code Mapper: embedBatch sending ${texts.length} texts to MLX...`);
165
+ const t0 = Date.now();
141
166
  const result = await sendAndReceive({
142
167
  texts,
143
168
  task: 'nl2code',
@@ -146,9 +171,35 @@ export const embedBatch = async (texts) => {
146
171
  });
147
172
  if (result.error)
148
173
  throw new Error(`Batch embedding failed: ${result.error}`);
174
+ if (!result.embeddings || !Array.isArray(result.embeddings)) {
175
+ throw new Error(`Batch embedding returned invalid response: ${JSON.stringify(result).slice(0, 200)}`);
176
+ }
177
+ const elapsed = Date.now() - t0;
178
+ console.error(`Code Mapper: embedBatch complete — ${result.embeddings.length} embeddings in ${elapsed}ms (${result.ms ?? '?'}ms inference)`);
149
179
  return result.embeddings.map((e) => new Float32Array(e));
150
180
  };
151
- /** Convert Float32Array to number[] for LadybugDB storage */
181
+ /**
182
+ * Embed a query text for semantic search (cached, uses "query" prompt type)
183
+ */
184
+ export const embedQuery = async (query) => {
185
+ const cached = queryEmbeddingCache.get(query);
186
+ if (cached)
187
+ return cached;
188
+ if (!ready)
189
+ await initEmbedder();
190
+ const result = await sendAndReceive({
191
+ texts: [query],
192
+ task: 'nl2code',
193
+ type: 'query',
194
+ dims: DEFAULT_EMBEDDING_CONFIG.dimensions,
195
+ });
196
+ if (result.error)
197
+ throw new Error(`Query embedding failed: ${result.error}`);
198
+ const embedding = result.embeddings[0];
199
+ queryEmbeddingCache.set(query, embedding);
200
+ return embedding;
201
+ };
202
+ /** Convert Float32Array to number[] for database storage */
152
203
  export const embeddingToArray = (embedding) => {
153
204
  return Array.from(embedding);
154
205
  };
@@ -157,7 +208,6 @@ export const disposeEmbedder = async () => {
157
208
  if (mlxProcess && !mlxProcess.killed) {
158
209
  try {
159
210
  mlxProcess.stdin.write(JSON.stringify({ cmd: 'quit' }) + '\n');
160
- // Give it a moment to exit gracefully
161
211
  await new Promise(resolve => setTimeout(resolve, 500));
162
212
  }
163
213
  catch { }
@@ -168,4 +218,8 @@ export const disposeEmbedder = async () => {
168
218
  mlxProcess = null;
169
219
  }
170
220
  ready = false;
221
+ readyPromise = null;
222
+ readyResolve = null;
223
+ responseQueue.length = 0;
224
+ queryEmbeddingCache.clear();
171
225
  };
@@ -1,41 +1,67 @@
1
1
  /**
2
2
  * @file embedding-pipeline.ts
3
3
  * @description Orchestrates the background embedding process:
4
- * 1) Query embeddable nodes from LadybugDB
4
+ * 1) Query embeddable nodes from SQLite
5
5
  * 2) Generate text representations
6
6
  * 3) Batch embed using transformers.js
7
- * 4) Store embeddings in LadybugDB
8
- * 5) Create vector index for semantic search
7
+ * 4) Store embeddings in SQLite
8
+ * 5) Vector search via brute-force cosine similarity in adapter.ts
9
9
  */
10
10
  import { type EmbeddingProgress, type EmbeddingConfig, type SemanticSearchResult } from './types.js';
11
+ import type Database from 'better-sqlite3';
11
12
  /** Progress callback type */
12
13
  export type EmbeddingProgressCallback = (progress: EmbeddingProgress) => void;
14
+ /** Graph context for a node: callers, callees, and community module */
15
+ export interface GraphContext {
16
+ callers: string[];
17
+ callees: string[];
18
+ module: string;
19
+ }
13
20
  /**
14
- * Run the full embedding pipeline (load model, embed nodes, create index)
15
- * @param executeQuery - Execute Cypher queries against LadybugDB
16
- * @param executeWithReusedStatement - Execute with reused prepared statement
21
+ * Fetch graph context (callers, callees, community module) for a set of nodes.
22
+ *
23
+ * This enrichment adds relationship context so that embedding text like
24
+ * "import resolution pipeline" matches `processImports` because its caller
25
+ * "runPipelineFromRepo" contains "pipeline".
26
+ *
27
+ * Reusable by both the full analyze pipeline and incremental refresh.
28
+ *
29
+ * @param db - Open SQLite database instance
30
+ * @param nodes - Nodes to fetch context for (must have `id` field)
31
+ * @returns Map from node ID to graph context
32
+ */
33
+ export declare function fetchGraphContext(db: Database.Database, nodes: ReadonlyArray<{
34
+ id: string;
35
+ }>): Map<string, GraphContext>;
36
+ /**
37
+ * Enrich embedding text with graph context (callers, callees, module).
38
+ *
39
+ * Inserts context lines (Module, Called by, Calls) after the header
40
+ * section of the generated text, before the code snippet.
41
+ *
42
+ * @param text - Base embedding text from generateEmbeddingText
43
+ * @param ctx - Graph context for this node
44
+ * @returns Enriched text
45
+ */
46
+ export declare function enrichTextWithGraphContext(text: string, ctx: GraphContext): string;
47
+ /**
48
+ * Run the full embedding pipeline (load model, embed nodes, store in SQLite)
49
+ * @param db - Open SQLite database instance
17
50
  * @param onProgress - Progress callback
18
51
  * @param config - Configuration override
19
52
  * @param skipNodeIds - Node IDs that already have embeddings (incremental mode)
20
53
  */
21
- export declare const runEmbeddingPipeline: (executeQuery: (cypher: string) => Promise<any[]>, executeWithReusedStatement: (cypher: string, paramsList: Array<Record<string, any>>) => Promise<void>, onProgress: EmbeddingProgressCallback, config?: Partial<EmbeddingConfig>, skipNodeIds?: Set<string>) => Promise<void>;
54
+ export declare function runEmbeddingPipeline(db: Database.Database, onProgress: EmbeddingProgressCallback, config?: Partial<EmbeddingConfig>, skipNodeIds?: Set<string>): Promise<void>;
22
55
  /**
23
- * Perform semantic search via the CodeEmbedding vector index
24
- * @param executeQuery - Execute Cypher queries
25
- * @param query - Search query text
26
- * @param k - Number of results (default: 10)
27
- * @param maxDistance - Maximum cosine distance threshold (default: 0.5)
28
- * @returns Search results ordered by relevance
29
- */
30
- export declare const semanticSearch: (executeQuery: (cypher: string) => Promise<any[]>, query: string, k?: number, maxDistance?: number) => Promise<SemanticSearchResult[]>;
31
- /**
32
- * Semantic search with flattened results (graph expansion placeholder)
56
+ * Semantic vector search against a SQLite database.
33
57
  *
34
- * For full graph traversal, use the execute_vector_cypher tool directly
58
+ * Uses brute-force cosine similarity via adapter.searchVector, then
59
+ * enriches results with node metadata. This mirrors the pattern in
60
+ * local-backend.ts but as a standalone function for hybrid search.
35
61
  *
36
- * @param executeQuery - Execute Cypher queries
62
+ * @param db - Open SQLite database instance
37
63
  * @param query - Search query text
38
- * @param k - Number of semantic matches (default: 5)
39
- * @param _hops - Unused, kept for API compatibility
64
+ * @param k - Number of results (default: 10)
65
+ * @param maxDistance - Maximum cosine distance threshold (default: from types.ts)
40
66
  */
41
- export declare const semanticSearchWithContext: (executeQuery: (cypher: string) => Promise<any[]>, query: string, k?: number, _hops?: number) => Promise<any[]>;
67
+ export declare function semanticSearchSqlite(db: Database.Database, query: string, k?: number): Promise<SemanticSearchResult[]>;