@optave/codegraph 3.1.3 → 3.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -19
- package/package.json +10 -7
- package/src/analysis/context.js +408 -0
- package/src/analysis/dependencies.js +341 -0
- package/src/analysis/exports.js +130 -0
- package/src/analysis/impact.js +463 -0
- package/src/analysis/module-map.js +322 -0
- package/src/analysis/roles.js +45 -0
- package/src/analysis/symbol-lookup.js +232 -0
- package/src/ast-analysis/shared.js +5 -4
- package/src/batch.js +2 -1
- package/src/builder/context.js +85 -0
- package/src/builder/helpers.js +218 -0
- package/src/builder/incremental.js +178 -0
- package/src/builder/pipeline.js +130 -0
- package/src/builder/stages/build-edges.js +297 -0
- package/src/builder/stages/build-structure.js +113 -0
- package/src/builder/stages/collect-files.js +44 -0
- package/src/builder/stages/detect-changes.js +413 -0
- package/src/builder/stages/finalize.js +139 -0
- package/src/builder/stages/insert-nodes.js +195 -0
- package/src/builder/stages/parse-files.js +28 -0
- package/src/builder/stages/resolve-imports.js +143 -0
- package/src/builder/stages/run-analyses.js +44 -0
- package/src/builder.js +10 -1485
- package/src/cfg.js +1 -2
- package/src/cli/commands/ast.js +26 -0
- package/src/cli/commands/audit.js +46 -0
- package/src/cli/commands/batch.js +68 -0
- package/src/cli/commands/branch-compare.js +21 -0
- package/src/cli/commands/build.js +26 -0
- package/src/cli/commands/cfg.js +30 -0
- package/src/cli/commands/check.js +79 -0
- package/src/cli/commands/children.js +31 -0
- package/src/cli/commands/co-change.js +65 -0
- package/src/cli/commands/communities.js +23 -0
- package/src/cli/commands/complexity.js +45 -0
- package/src/cli/commands/context.js +34 -0
- package/src/cli/commands/cycles.js +28 -0
- package/src/cli/commands/dataflow.js +32 -0
- package/src/cli/commands/deps.js +16 -0
- package/src/cli/commands/diff-impact.js +30 -0
- package/src/cli/commands/embed.js +30 -0
- package/src/cli/commands/export.js +75 -0
- package/src/cli/commands/exports.js +18 -0
- package/src/cli/commands/flow.js +36 -0
- package/src/cli/commands/fn-impact.js +30 -0
- package/src/cli/commands/impact.js +16 -0
- package/src/cli/commands/info.js +76 -0
- package/src/cli/commands/map.js +19 -0
- package/src/cli/commands/mcp.js +18 -0
- package/src/cli/commands/models.js +19 -0
- package/src/cli/commands/owners.js +25 -0
- package/src/cli/commands/path.js +36 -0
- package/src/cli/commands/plot.js +80 -0
- package/src/cli/commands/query.js +49 -0
- package/src/cli/commands/registry.js +100 -0
- package/src/cli/commands/roles.js +34 -0
- package/src/cli/commands/search.js +42 -0
- package/src/cli/commands/sequence.js +32 -0
- package/src/cli/commands/snapshot.js +61 -0
- package/src/cli/commands/stats.js +15 -0
- package/src/cli/commands/structure.js +32 -0
- package/src/cli/commands/triage.js +78 -0
- package/src/cli/commands/watch.js +12 -0
- package/src/cli/commands/where.js +24 -0
- package/src/cli/index.js +118 -0
- package/src/cli/shared/options.js +39 -0
- package/src/cli/shared/output.js +1 -0
- package/src/cli.js +11 -1522
- package/src/commands/check.js +5 -5
- package/src/commands/manifesto.js +3 -3
- package/src/commands/structure.js +1 -1
- package/src/communities.js +15 -87
- package/src/cycles.js +30 -85
- package/src/dataflow.js +1 -2
- package/src/db/connection.js +4 -4
- package/src/db/migrations.js +41 -0
- package/src/db/query-builder.js +6 -5
- package/src/db/repository/base.js +201 -0
- package/src/db/repository/graph-read.js +5 -2
- package/src/db/repository/in-memory-repository.js +584 -0
- package/src/db/repository/index.js +5 -1
- package/src/db/repository/nodes.js +63 -4
- package/src/db/repository/sqlite-repository.js +219 -0
- package/src/db.js +5 -0
- package/src/embeddings/generator.js +163 -0
- package/src/embeddings/index.js +13 -0
- package/src/embeddings/models.js +218 -0
- package/src/embeddings/search/cli-formatter.js +151 -0
- package/src/embeddings/search/filters.js +46 -0
- package/src/embeddings/search/hybrid.js +121 -0
- package/src/embeddings/search/keyword.js +68 -0
- package/src/embeddings/search/prepare.js +66 -0
- package/src/embeddings/search/semantic.js +145 -0
- package/src/embeddings/stores/fts5.js +27 -0
- package/src/embeddings/stores/sqlite-blob.js +24 -0
- package/src/embeddings/strategies/source.js +14 -0
- package/src/embeddings/strategies/structured.js +43 -0
- package/src/embeddings/strategies/text-utils.js +43 -0
- package/src/errors.js +78 -0
- package/src/export.js +217 -520
- package/src/extractors/csharp.js +10 -2
- package/src/extractors/go.js +3 -1
- package/src/extractors/helpers.js +71 -0
- package/src/extractors/java.js +9 -2
- package/src/extractors/javascript.js +38 -1
- package/src/extractors/php.js +3 -1
- package/src/extractors/python.js +14 -3
- package/src/extractors/rust.js +3 -1
- package/src/graph/algorithms/bfs.js +49 -0
- package/src/graph/algorithms/centrality.js +16 -0
- package/src/graph/algorithms/index.js +5 -0
- package/src/graph/algorithms/louvain.js +26 -0
- package/src/graph/algorithms/shortest-path.js +41 -0
- package/src/graph/algorithms/tarjan.js +49 -0
- package/src/graph/builders/dependency.js +91 -0
- package/src/graph/builders/index.js +3 -0
- package/src/graph/builders/structure.js +40 -0
- package/src/graph/builders/temporal.js +33 -0
- package/src/graph/classifiers/index.js +2 -0
- package/src/graph/classifiers/risk.js +85 -0
- package/src/graph/classifiers/roles.js +64 -0
- package/src/graph/index.js +13 -0
- package/src/graph/model.js +230 -0
- package/src/index.js +33 -210
- package/src/infrastructure/result-formatter.js +2 -21
- package/src/mcp/index.js +2 -0
- package/src/mcp/middleware.js +26 -0
- package/src/mcp/server.js +128 -0
- package/src/mcp/tool-registry.js +801 -0
- package/src/mcp/tools/ast-query.js +14 -0
- package/src/mcp/tools/audit.js +21 -0
- package/src/mcp/tools/batch-query.js +11 -0
- package/src/mcp/tools/branch-compare.js +10 -0
- package/src/mcp/tools/cfg.js +21 -0
- package/src/mcp/tools/check.js +43 -0
- package/src/mcp/tools/co-changes.js +20 -0
- package/src/mcp/tools/code-owners.js +12 -0
- package/src/mcp/tools/communities.js +15 -0
- package/src/mcp/tools/complexity.js +18 -0
- package/src/mcp/tools/context.js +17 -0
- package/src/mcp/tools/dataflow.js +26 -0
- package/src/mcp/tools/diff-impact.js +24 -0
- package/src/mcp/tools/execution-flow.js +26 -0
- package/src/mcp/tools/export-graph.js +57 -0
- package/src/mcp/tools/file-deps.js +12 -0
- package/src/mcp/tools/file-exports.js +13 -0
- package/src/mcp/tools/find-cycles.js +15 -0
- package/src/mcp/tools/fn-impact.js +15 -0
- package/src/mcp/tools/impact-analysis.js +12 -0
- package/src/mcp/tools/index.js +71 -0
- package/src/mcp/tools/list-functions.js +14 -0
- package/src/mcp/tools/list-repos.js +11 -0
- package/src/mcp/tools/module-map.js +6 -0
- package/src/mcp/tools/node-roles.js +14 -0
- package/src/mcp/tools/path.js +12 -0
- package/src/mcp/tools/query.js +30 -0
- package/src/mcp/tools/semantic-search.js +65 -0
- package/src/mcp/tools/sequence.js +17 -0
- package/src/mcp/tools/structure.js +15 -0
- package/src/mcp/tools/symbol-children.js +14 -0
- package/src/mcp/tools/triage.js +35 -0
- package/src/mcp/tools/where.js +13 -0
- package/src/mcp.js +2 -1470
- package/src/native.js +3 -1
- package/src/presentation/colors.js +44 -0
- package/src/presentation/export.js +444 -0
- package/src/presentation/result-formatter.js +21 -0
- package/src/presentation/sequence-renderer.js +43 -0
- package/src/presentation/table.js +47 -0
- package/src/presentation/viewer.js +634 -0
- package/src/queries.js +35 -2276
- package/src/resolve.js +1 -1
- package/src/sequence.js +2 -38
- package/src/shared/file-utils.js +153 -0
- package/src/shared/generators.js +125 -0
- package/src/shared/hierarchy.js +27 -0
- package/src/shared/normalize.js +59 -0
- package/src/snapshot.js +6 -5
- package/src/structure.js +15 -40
- package/src/triage.js +20 -72
- package/src/viewer.js +35 -656
- package/src/watcher.js +8 -148
- package/src/embedder.js +0 -1097
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
import { Repository } from './base.js';
|
|
2
|
+
import { hasCfgTables } from './cfg.js';
|
|
3
|
+
import { getComplexityForNode } from './complexity.js';
|
|
4
|
+
import { hasDataflowTable } from './dataflow.js';
|
|
5
|
+
import {
|
|
6
|
+
countCrossFileCallers,
|
|
7
|
+
findAllIncomingEdges,
|
|
8
|
+
findAllOutgoingEdges,
|
|
9
|
+
findCalleeNames,
|
|
10
|
+
findCallees,
|
|
11
|
+
findCallerNames,
|
|
12
|
+
findCallers,
|
|
13
|
+
findCrossFileCallTargets,
|
|
14
|
+
findDistinctCallers,
|
|
15
|
+
findImportDependents,
|
|
16
|
+
findImportSources,
|
|
17
|
+
findImportTargets,
|
|
18
|
+
findIntraFileCallEdges,
|
|
19
|
+
getClassHierarchy,
|
|
20
|
+
} from './edges.js';
|
|
21
|
+
import { hasEmbeddings } from './embeddings.js';
|
|
22
|
+
import { getCallableNodes, getCallEdges, getFileNodesAll, getImportEdges } from './graph-read.js';
|
|
23
|
+
import {
|
|
24
|
+
bulkNodeIdsByFile,
|
|
25
|
+
countEdges,
|
|
26
|
+
countFiles,
|
|
27
|
+
countNodes,
|
|
28
|
+
findFileNodes,
|
|
29
|
+
findNodeById,
|
|
30
|
+
findNodeByQualifiedName,
|
|
31
|
+
findNodeChildren,
|
|
32
|
+
findNodesByFile,
|
|
33
|
+
findNodesByScope,
|
|
34
|
+
findNodesForTriage,
|
|
35
|
+
findNodesWithFanIn,
|
|
36
|
+
getFunctionNodeId,
|
|
37
|
+
getNodeId,
|
|
38
|
+
iterateFunctionNodes,
|
|
39
|
+
listFunctionNodes,
|
|
40
|
+
} from './nodes.js';
|
|
41
|
+
|
|
42
|
+
/**
 * SqliteRepository — Repository implementation backed by a SQLite handle.
 *
 * Every method forwards to the free function of the same name imported from
 * the sibling repository modules, supplying the wrapped database handle as the
 * first argument. This lets callers write `repo.method(...)` instead of
 * `fn(db, ...)`.
 */
export class SqliteRepository extends Repository {
  /** Wrapped database handle; passed as first argument to every delegate. */
  #db;

  /** @param {object} db - better-sqlite3 Database instance */
  constructor(db) {
    super();
    this.#db = db;
  }

  /** Raw handle, for code paths that have not moved to the repository API. */
  get db() {
    return this.#db;
  }

  // ── Node lookups (delegates to ./nodes.js) ────────────────────────

  findNodeById(id) {
    return findNodeById(this.#db, id);
  }

  findNodesByFile(file) {
    return findNodesByFile(this.#db, file);
  }

  findFileNodes(fileLike) {
    return findFileNodes(this.#db, fileLike);
  }

  findNodesWithFanIn(namePattern, opts) {
    return findNodesWithFanIn(this.#db, namePattern, opts);
  }

  countNodes() {
    return countNodes(this.#db);
  }

  countEdges() {
    return countEdges(this.#db);
  }

  countFiles() {
    return countFiles(this.#db);
  }

  getNodeId(name, kind, file, line) {
    return getNodeId(this.#db, name, kind, file, line);
  }

  getFunctionNodeId(name, file, line) {
    return getFunctionNodeId(this.#db, name, file, line);
  }

  bulkNodeIdsByFile(file) {
    return bulkNodeIdsByFile(this.#db, file);
  }

  findNodeChildren(parentId) {
    return findNodeChildren(this.#db, parentId);
  }

  findNodesByScope(scopeName, opts) {
    return findNodesByScope(this.#db, scopeName, opts);
  }

  findNodeByQualifiedName(qualifiedName, opts) {
    return findNodeByQualifiedName(this.#db, qualifiedName, opts);
  }

  listFunctionNodes(opts) {
    return listFunctionNodes(this.#db, opts);
  }

  iterateFunctionNodes(opts) {
    return iterateFunctionNodes(this.#db, opts);
  }

  findNodesForTriage(opts) {
    return findNodesForTriage(this.#db, opts);
  }

  // ── Edge queries (delegates to ./edges.js) ────────────────────────

  findCallees(nodeId) {
    return findCallees(this.#db, nodeId);
  }

  findCallers(nodeId) {
    return findCallers(this.#db, nodeId);
  }

  findDistinctCallers(nodeId) {
    return findDistinctCallers(this.#db, nodeId);
  }

  findAllOutgoingEdges(nodeId) {
    return findAllOutgoingEdges(this.#db, nodeId);
  }

  findAllIncomingEdges(nodeId) {
    return findAllIncomingEdges(this.#db, nodeId);
  }

  findCalleeNames(nodeId) {
    return findCalleeNames(this.#db, nodeId);
  }

  findCallerNames(nodeId) {
    return findCallerNames(this.#db, nodeId);
  }

  findImportTargets(nodeId) {
    return findImportTargets(this.#db, nodeId);
  }

  findImportSources(nodeId) {
    return findImportSources(this.#db, nodeId);
  }

  findImportDependents(nodeId) {
    return findImportDependents(this.#db, nodeId);
  }

  findCrossFileCallTargets(file) {
    return findCrossFileCallTargets(this.#db, file);
  }

  countCrossFileCallers(nodeId, file) {
    return countCrossFileCallers(this.#db, nodeId, file);
  }

  getClassHierarchy(classNodeId) {
    return getClassHierarchy(this.#db, classNodeId);
  }

  findIntraFileCallEdges(file) {
    return findIntraFileCallEdges(this.#db, file);
  }

  // ── Graph-read queries (delegates to ./graph-read.js) ─────────────

  getCallableNodes() {
    return getCallableNodes(this.#db);
  }

  getCallEdges() {
    return getCallEdges(this.#db);
  }

  getFileNodesAll() {
    return getFileNodesAll(this.#db);
  }

  getImportEdges() {
    return getImportEdges(this.#db);
  }

  // ── Optional table checks and per-node complexity ─────────────────

  hasCfgTables() {
    return hasCfgTables(this.#db);
  }

  hasEmbeddings() {
    return hasEmbeddings(this.#db);
  }

  hasDataflowTable() {
    return hasDataflowTable(this.#db);
  }

  getComplexityForNode(nodeId) {
    return getComplexityForNode(this.#db, nodeId);
  }
}
|
package/src/db.js
CHANGED
|
@@ -29,8 +29,10 @@ export {
|
|
|
29
29
|
findImportTargets,
|
|
30
30
|
findIntraFileCallEdges,
|
|
31
31
|
findNodeById,
|
|
32
|
+
findNodeByQualifiedName,
|
|
32
33
|
findNodeChildren,
|
|
33
34
|
findNodesByFile,
|
|
35
|
+
findNodesByScope,
|
|
34
36
|
findNodesForTriage,
|
|
35
37
|
findNodesWithFanIn,
|
|
36
38
|
getCallableNodes,
|
|
@@ -50,9 +52,12 @@ export {
|
|
|
50
52
|
hasCoChanges,
|
|
51
53
|
hasDataflowTable,
|
|
52
54
|
hasEmbeddings,
|
|
55
|
+
InMemoryRepository,
|
|
53
56
|
iterateFunctionNodes,
|
|
54
57
|
listFunctionNodes,
|
|
55
58
|
purgeFileData,
|
|
56
59
|
purgeFilesData,
|
|
60
|
+
Repository,
|
|
61
|
+
SqliteRepository,
|
|
57
62
|
upsertCoChangeMeta,
|
|
58
63
|
} from './db/repository/index.js';
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { closeDb, findDbPath, openDb } from '../db.js';
|
|
4
|
+
import { DbError } from '../errors.js';
|
|
5
|
+
import { warn } from '../logger.js';
|
|
6
|
+
import { embed, getModelConfig } from './models.js';
|
|
7
|
+
import { buildSourceText } from './strategies/source.js';
|
|
8
|
+
import { buildStructuredText } from './strategies/structured.js';
|
|
9
|
+
|
|
10
|
+
/**
 * Approximate how many tokens `text` occupies, assuming roughly four
 * characters per token for code/English. Intentionally coarse so that no
 * tokenizer dependency is required.
 *
 * @param {string} text - Text to measure.
 * @returns {number} `text.length / 4`, rounded up.
 */
export function estimateTokens(text) {
  const CHARS_PER_TOKEN = 4;
  const charCount = text.length;
  return Math.ceil(charCount / CHARS_PER_TOKEN);
}
|
|
17
|
+
|
|
18
|
+
/**
 * Ensure the tables used by the embeddings subsystem exist.
 *
 * Creates `embeddings` and `embedding_meta` if missing, adds the `full_text`
 * column to `embeddings`, and creates the `fts_index` FTS5 virtual table.
 *
 * @param {object} db - Database handle exposing `exec(sql)`.
 * @throws Re-throws any error from the `ALTER TABLE` step other than the
 *   "duplicate column name" error that signals the column already exists.
 */
function initEmbeddingsSchema(db) {
  db.exec(`
    CREATE TABLE IF NOT EXISTS embeddings (
      node_id INTEGER PRIMARY KEY,
      vector BLOB NOT NULL,
      text_preview TEXT,
      FOREIGN KEY(node_id) REFERENCES nodes(id)
    );
    CREATE TABLE IF NOT EXISTS embedding_meta (
      key TEXT PRIMARY KEY,
      value TEXT
    );
  `);

  // Add full_text column. The statement is made idempotent by tolerating only
  // the "duplicate column name" failure; any other error is a real problem
  // and must not be hidden.
  try {
    db.exec('ALTER TABLE embeddings ADD COLUMN full_text TEXT');
  } catch (err) {
    const message = String(err?.message ?? err);
    if (!/duplicate column name/i.test(message)) throw err;
  }

  // FTS5 virtual table for BM25 keyword search
  db.exec(`
    CREATE VIRTUAL TABLE IF NOT EXISTS fts_index USING fts5(
      name,
      content,
      tokenize='unicode61'
    );
  `);
}
|
|
48
|
+
|
|
49
|
+
/**
 * Build embeddings for all functions/methods/classes in the graph.
 *
 * Reads every `function`, `method` and `class` row from `nodes`, builds one
 * text per symbol with the chosen strategy, embeds the texts, and stores the
 * vectors, previews, full texts, FTS rows and build metadata in the database.
 * Files that cannot be read are skipped with a warning.
 *
 * The previous contents of `embeddings`, `embedding_meta` and `fts_index` are
 * replaced inside the same transaction that writes the new rows, so a failure
 * while loading the model or embedding leaves the earlier index in place.
 *
 * @param {string} rootDir - Project root directory
 * @param {string} modelKey - Model identifier from MODELS registry
 * @param {string} [customDbPath] - Override path to graph.db
 * @param {object} [options] - Embedding options
 * @param {string} [options.strategy='structured'] - 'structured' (graph-enriched) or 'source' (raw code)
 * @returns {Promise<void>}
 * @throws {DbError} When no database file exists at the resolved path.
 */
export async function buildEmbeddings(rootDir, modelKey, customDbPath, options = {}) {
  const strategy = options.strategy || 'structured';
  const dbPath = customDbPath || findDbPath(null);

  if (!fs.existsSync(dbPath)) {
    throw new DbError(
      `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`,
      { file: dbPath },
    );
  }

  // Resolve the model before opening the database so an unknown model key
  // fails without touching any stored data or leaving a handle open.
  const config = getModelConfig(modelKey);
  const contextWindow = config.contextWindow;

  const db = openDb(dbPath);
  try {
    initEmbeddingsSchema(db);

    const nodes = db
      .prepare(
        `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`,
      )
      .all();

    console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`);

    // Group symbols by file so each source file is read once.
    const byFile = new Map();
    for (const node of nodes) {
      if (!byFile.has(node.file)) byFile.set(node.file, []);
      byFile.get(node.file).push(node);
    }

    // Parallel arrays: index i in each describes the same symbol.
    const texts = [];
    const nodeIds = [];
    const nodeNames = [];
    const previews = [];
    let overflowCount = 0;

    for (const [file, fileNodes] of byFile) {
      const fullPath = path.join(rootDir, file);
      let lines;
      try {
        lines = fs.readFileSync(fullPath, 'utf-8').split('\n');
      } catch (err) {
        warn(`Cannot read ${file} for embeddings: ${err.message}`);
        continue;
      }

      for (const node of fileNodes) {
        let text =
          strategy === 'structured'
            ? buildStructuredText(node, file, lines, db)
            : buildSourceText(node, file, lines);

        // Detect and handle context window overflow: cut the text to the
        // character budget implied by the ~4 chars/token estimate.
        const tokens = estimateTokens(text);
        if (tokens > contextWindow) {
          overflowCount++;
          const maxChars = contextWindow * 4;
          text = text.slice(0, maxChars);
        }

        texts.push(text);
        nodeIds.push(node.id);
        nodeNames.push(node.name);
        previews.push(`${node.name} (${node.kind}) -- ${file}:${node.line}`);
      }
    }

    if (overflowCount > 0) {
      warn(
        `${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`,
      );
    }

    console.log(`Embedding ${texts.length} symbols...`);
    const { vectors, dim } = await embed(texts, modelKey);

    const insert = db.prepare(
      'INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview, full_text) VALUES (?, ?, ?, ?)',
    );
    const insertFts = db.prepare('INSERT INTO fts_index(rowid, name, content) VALUES (?, ?, ?)');
    const insertMeta = db.prepare(
      'INSERT OR REPLACE INTO embedding_meta (key, value) VALUES (?, ?)',
    );
    const insertAll = db.transaction(() => {
      // Clear the previous index only now that the new vectors are in hand.
      db.exec('DELETE FROM embeddings');
      db.exec('DELETE FROM embedding_meta');
      db.exec('DELETE FROM fts_index');

      for (let i = 0; i < vectors.length; i++) {
        const vec = vectors[i];
        // Honour byteOffset/byteLength so a vector that is a view onto a
        // larger buffer is stored as exactly its own bytes.
        const blob = Buffer.from(vec.buffer, vec.byteOffset, vec.byteLength);
        insert.run(nodeIds[i], blob, previews[i], texts[i]);
        insertFts.run(nodeIds[i], nodeNames[i], texts[i]);
      }
      insertMeta.run('model', config.name);
      insertMeta.run('dim', String(dim));
      insertMeta.run('count', String(vectors.length));
      insertMeta.run('fts_count', String(vectors.length));
      insertMeta.run('strategy', strategy);
      insertMeta.run('built_at', new Date().toISOString());
      if (overflowCount > 0) {
        insertMeta.run('truncated_count', String(overflowCount));
      }
    });
    insertAll();

    console.log(
      `\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`,
    );
  } finally {
    // Release the handle on every exit path, including failures above.
    closeDb(db);
  }
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embeddings subsystem — public API barrel.
|
|
3
|
+
*
|
|
4
|
+
* Re-exports everything consumers previously imported from `../embedder.js`.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
export { buildEmbeddings, estimateTokens } from './generator.js';
|
|
8
|
+
export { DEFAULT_MODEL, disposeModel, EMBEDDING_STRATEGIES, embed, MODELS } from './models.js';
|
|
9
|
+
export { search } from './search/cli-formatter.js';
|
|
10
|
+
export { hybridSearchData } from './search/hybrid.js';
|
|
11
|
+
export { ftsSearchData } from './search/keyword.js';
|
|
12
|
+
export { multiSearchData, searchData } from './search/semantic.js';
|
|
13
|
+
export { cosineSim } from './stores/sqlite-blob.js';
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import { execFileSync } from 'node:child_process';
|
|
2
|
+
import { createInterface } from 'node:readline';
|
|
3
|
+
import { ConfigError, EngineError } from '../errors.js';
|
|
4
|
+
import { info } from '../logger.js';
|
|
5
|
+
|
|
6
|
+
// Lazy-load transformers (heavy, optional module).
// Module-level state, all initially null until a model is loaded.
let pipeline = null;
let extractor = null;
let activeModel = null;

/**
 * Build one registry entry. Property order matches the shape consumers see:
 * name, dim, contextWindow, desc, quantized.
 */
const modelEntry = (name, dim, contextWindow, desc, quantized = false) => ({
  name,
  dim,
  contextWindow,
  desc,
  quantized,
});

/** Registry of supported embedding models, keyed by short model key. */
export const MODELS = {
  minilm: modelEntry(
    'Xenova/all-MiniLM-L6-v2',
    384,
    256,
    'Smallest, fastest (~23MB). General text.',
    true,
  ),
  'jina-small': modelEntry(
    'Xenova/jina-embeddings-v2-small-en',
    512,
    8192,
    'Small, good quality (~33MB). General text.',
  ),
  'jina-base': modelEntry(
    'Xenova/jina-embeddings-v2-base-en',
    768,
    8192,
    'Good quality (~137MB). General text, 8192 token context.',
  ),
  'jina-code': modelEntry(
    'Xenova/jina-embeddings-v2-base-code',
    768,
    8192,
    'Code-aware (~137MB). Trained on code+text, best for code search.',
  ),
  nomic: modelEntry(
    'Xenova/nomic-embed-text-v1',
    768,
    8192,
    'Good local quality (~137MB). 8192 context.',
  ),
  'nomic-v1.5': modelEntry(
    'nomic-ai/nomic-embed-text-v1.5',
    768,
    8192,
    'Improved nomic (~137MB). Matryoshka dimensions, 8192 context.',
  ),
  'bge-large': modelEntry(
    'Xenova/bge-large-en-v1.5',
    1024,
    512,
    'Best general retrieval (~335MB). Top MTEB scores.',
  ),
};

/** Names of the supported text-building strategies. */
export const EMBEDDING_STRATEGIES = ['structured', 'source'];

/** Model key used when the caller does not specify one. */
export const DEFAULT_MODEL = 'nomic-v1.5';

/** Per-model batch sizes, keyed like MODELS. */
const BATCH_SIZE_MAP = {
  minilm: 32,
  'jina-small': 16,
  'jina-base': 8,
  'jina-code': 8,
  nomic: 8,
  'nomic-v1.5': 8,
  'bge-large': 4,
};

/** Batch size for any key missing from BATCH_SIZE_MAP. */
const DEFAULT_BATCH_SIZE = 32;
|
|
76
|
+
|
|
77
|
+
/**
 * Look up the registry entry for a model key.
 *
 * A falsy `modelKey` selects `DEFAULT_MODEL`. Only keys defined directly on
 * `MODELS` are accepted, so inherited object members such as `constructor`
 * or `toString` are rejected instead of being returned as a config.
 *
 * @internal Used by generator.js — not part of the public barrel.
 * @param {string} [modelKey] - Key into MODELS.
 * @returns {object} The matching MODELS entry.
 * @throws {ConfigError} When the key is not a registered model.
 */
export function getModelConfig(modelKey) {
  const key = modelKey || DEFAULT_MODEL;
  if (!Object.hasOwn(MODELS, key)) {
    throw new ConfigError(`Unknown model: ${key}. Available: ${Object.keys(MODELS).join(', ')}`);
  }
  return MODELS[key];
}
|
|
86
|
+
|
|
87
|
+
/**
 * Ask the user on the terminal whether a missing package should be installed,
 * and run `npm install <packageName>` if they answer "y".
 *
 * When stdin is not a TTY (CI, piped stdin) no prompt is shown and the result
 * is `false`. The question is written to stderr.
 *
 * @internal Not part of the public barrel.
 * @param {string} packageName - Package to install.
 * @returns {Promise<boolean>} `true` when the install command completed
 *   without throwing, `false` when declined, skipped, or failed.
 */
export function promptInstall(packageName) {
  const interactive = Boolean(process.stdin.isTTY);
  if (!interactive) return Promise.resolve(false);

  return new Promise((resolve) => {
    const rl = createInterface({ input: process.stdin, output: process.stderr });

    const onAnswer = (answer) => {
      rl.close();

      const accepted = answer.trim().toLowerCase() === 'y';
      if (!accepted) {
        resolve(false);
        return;
      }

      let installed = true;
      try {
        execFileSync('npm', ['install', packageName], {
          stdio: 'inherit',
          timeout: 300_000,
        });
      } catch {
        // Install failed or timed out; report it as "not installed".
        installed = false;
      }
      resolve(installed);
    };

    rl.question(`Semantic search requires ${packageName}. Install it now? [y/N] `, onAnswer);
  });
}
|
|
113
|
+
|
|
114
|
+
/**
 * Lazy-load @huggingface/transformers.
 *
 * If the first import fails, the user is offered an interactive install via
 * `promptInstall`. After a successful install the import is retried.
 *
 * @internal Not part of the public barrel.
 * @returns {Promise<object>} The imported module namespace.
 * @throws {EngineError} When the package is unavailable and was not
 *   installed, or when it was installed but still fails to load (the load
 *   error is attached as `cause`).
 */
export async function loadTransformers() {
  try {
    return await import('@huggingface/transformers');
  } catch {
    // Import failed — fall through to the install-and-retry path below.
  }

  const pkg = '@huggingface/transformers';
  const installed = await promptInstall(pkg);
  if (!installed) {
    throw new EngineError(`Semantic search requires ${pkg}.\nInstall it with: npm install ${pkg}`);
  }

  try {
    return await import(pkg);
  } catch (loadErr) {
    throw new EngineError(
      `${pkg} was installed but failed to load. Please check your environment.`,
      { cause: loadErr },
    );
  }
}
|
|
139
|
+
|
|
140
|
+
/**
 * Release the currently loaded extractor, if any, and forget which model was
 * active. Calling it with nothing loaded only clears the active-model marker.
 *
 * @returns {Promise<void>}
 */
export async function disposeModel() {
  if (!extractor) {
    activeModel = null;
    return;
  }

  await extractor.dispose();
  extractor = null;
  activeModel = null;
}
|
|
151
|
+
|
|
152
|
+
/**
 * Return a feature-extraction pipeline for `modelKey`, creating it on demand.
 *
 * If an extractor for the same model name is already held it is reused;
 * otherwise the current one is disposed and a new pipeline is created.
 *
 * @param {string} [modelKey] - Key passed to getModelConfig.
 * @returns {Promise<{extractor: Function, config: object}>}
 * @throws {EngineError} When the pipeline cannot be created; failures whose
 *   message mentions "Unauthorized", "401" or "gated" get an
 *   authentication-specific message. The original error is kept as `cause`.
 */
async function loadModel(modelKey) {
  const config = getModelConfig(modelKey);

  const alreadyLoaded = extractor && activeModel === config.name;
  if (alreadyLoaded) return { extractor, config };

  // Dispose previous model before loading a different one
  await disposeModel();

  const transformers = await loadTransformers();
  pipeline = transformers.pipeline;

  info(`Loading embedding model: ${config.name} (${config.dim}d)...`);

  const pipelineOpts = {};
  if (config.quantized) pipelineOpts.quantized = true;

  try {
    extractor = await pipeline('feature-extraction', config.name, pipelineOpts);
  } catch (err) {
    const msg = err.message || String(err);
    const needsAuth = ['Unauthorized', '401', 'gated'].some((marker) => msg.includes(marker));

    const explanation = needsAuth
      ? [
          `Model "${config.name}" requires authentication.`,
          `This model is gated on HuggingFace and needs an access token.`,
          ``,
          `Options:`,
          ` 1. Set HF_TOKEN env var: export HF_TOKEN=hf_...`,
          ` 2. Use a public model instead: codegraph embed --model minilm`,
        ].join('\n')
      : [
          `Failed to load model "${config.name}": ${msg}`,
          `Try a different model: codegraph embed --model minilm`,
        ].join('\n');

    throw new EngineError(explanation, { cause: err });
  }

  activeModel = config.name;
  info('Model loaded.');
  return { extractor, config };
}
|
|
189
|
+
|
|
190
|
+
/**
 * Generate embeddings for an array of texts.
 *
 * Texts are sent to the extractor in batches (size taken from BATCH_SIZE_MAP,
 * else DEFAULT_BATCH_SIZE) with mean pooling and normalization requested.
 * Each row of the flat `output.data` is copied into its own Float32Array.
 * When there is more than one batch, a progress line is written to stdout.
 *
 * @param {string[]} texts - Texts to embed.
 * @param {string} [modelKey] - Key into MODELS; falsy selects DEFAULT_MODEL.
 * @returns {Promise<{vectors: Float32Array[], dim: number}>}
 */
export async function embed(texts, modelKey) {
  const { extractor: ext, config } = await loadModel(modelKey);
  const { dim } = config;
  const batchSize = BATCH_SIZE_MAP[modelKey || DEFAULT_MODEL] || DEFAULT_BATCH_SIZE;
  const vectors = [];

  for (let offset = 0; offset < texts.length; offset += batchSize) {
    const batch = texts.slice(offset, offset + batchSize);
    const output = await ext(batch, { pooling: 'mean', normalize: true });

    for (let row = 0; row < batch.length; row += 1) {
      const base = row * dim;
      // Copy this row's `dim` values out of the flat output buffer.
      const vec = Float32Array.from({ length: dim }, (_, k) => output.data[base + k]);
      vectors.push(vec);
    }

    if (texts.length > batchSize) {
      const done = Math.min(offset + batchSize, texts.length);
      process.stdout.write(` Embedded ${done}/${texts.length}\r`);
    }
  }

  return { vectors, dim };
}
|