@mars167/git-ai 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +364 -0
- package/README.zh-CN.md +361 -0
- package/assets/hooks/post-checkout +28 -0
- package/assets/hooks/post-merge +28 -0
- package/assets/hooks/pre-commit +17 -0
- package/assets/hooks/pre-push +29 -0
- package/dist/bin/git-ai.js +62 -0
- package/dist/src/commands/ai.js +30 -0
- package/dist/src/commands/checkIndex.js +19 -0
- package/dist/src/commands/dsr.js +156 -0
- package/dist/src/commands/graph.js +203 -0
- package/dist/src/commands/hooks.js +125 -0
- package/dist/src/commands/index.js +92 -0
- package/dist/src/commands/pack.js +31 -0
- package/dist/src/commands/query.js +139 -0
- package/dist/src/commands/semantic.js +134 -0
- package/dist/src/commands/serve.js +14 -0
- package/dist/src/commands/status.js +78 -0
- package/dist/src/commands/trae.js +75 -0
- package/dist/src/commands/unpack.js +28 -0
- package/dist/src/core/archive.js +91 -0
- package/dist/src/core/astGraph.js +127 -0
- package/dist/src/core/astGraphQuery.js +142 -0
- package/dist/src/core/cozo.js +266 -0
- package/dist/src/core/cpg/astLayer.js +56 -0
- package/dist/src/core/cpg/callGraph.js +483 -0
- package/dist/src/core/cpg/cfgLayer.js +490 -0
- package/dist/src/core/cpg/dfgLayer.js +237 -0
- package/dist/src/core/cpg/index.js +80 -0
- package/dist/src/core/cpg/types.js +108 -0
- package/dist/src/core/crypto.js +10 -0
- package/dist/src/core/dsr/generate.js +308 -0
- package/dist/src/core/dsr/gitContext.js +74 -0
- package/dist/src/core/dsr/indexMaterialize.js +106 -0
- package/dist/src/core/dsr/paths.js +26 -0
- package/dist/src/core/dsr/query.js +73 -0
- package/dist/src/core/dsr/snapshotParser.js +73 -0
- package/dist/src/core/dsr/state.js +27 -0
- package/dist/src/core/dsr/types.js +2 -0
- package/dist/src/core/embedding/fusion.js +52 -0
- package/dist/src/core/embedding/index.js +43 -0
- package/dist/src/core/embedding/parser.js +14 -0
- package/dist/src/core/embedding/semantic.js +254 -0
- package/dist/src/core/embedding/structural.js +97 -0
- package/dist/src/core/embedding/symbolic.js +117 -0
- package/dist/src/core/embedding/tokenizer.js +91 -0
- package/dist/src/core/embedding/types.js +2 -0
- package/dist/src/core/embedding.js +36 -0
- package/dist/src/core/git.js +49 -0
- package/dist/src/core/gitDiff.js +73 -0
- package/dist/src/core/indexCheck.js +131 -0
- package/dist/src/core/indexer.js +185 -0
- package/dist/src/core/indexerIncremental.js +303 -0
- package/dist/src/core/indexing/config.js +51 -0
- package/dist/src/core/indexing/hnsw.js +568 -0
- package/dist/src/core/indexing/index.js +17 -0
- package/dist/src/core/indexing/monitor.js +82 -0
- package/dist/src/core/indexing/parallel.js +252 -0
- package/dist/src/core/lancedb.js +111 -0
- package/dist/src/core/lfs.js +27 -0
- package/dist/src/core/log.js +62 -0
- package/dist/src/core/manifest.js +88 -0
- package/dist/src/core/parser/adapter.js +2 -0
- package/dist/src/core/parser/c.js +93 -0
- package/dist/src/core/parser/chunkRelations.js +178 -0
- package/dist/src/core/parser/chunker.js +274 -0
- package/dist/src/core/parser/go.js +98 -0
- package/dist/src/core/parser/java.js +80 -0
- package/dist/src/core/parser/markdown.js +76 -0
- package/dist/src/core/parser/python.js +81 -0
- package/dist/src/core/parser/rust.js +103 -0
- package/dist/src/core/parser/typescript.js +98 -0
- package/dist/src/core/parser/utils.js +62 -0
- package/dist/src/core/parser/yaml.js +53 -0
- package/dist/src/core/parser.js +75 -0
- package/dist/src/core/paths.js +10 -0
- package/dist/src/core/repoMap.js +164 -0
- package/dist/src/core/retrieval/cache.js +31 -0
- package/dist/src/core/retrieval/classifier.js +74 -0
- package/dist/src/core/retrieval/expander.js +80 -0
- package/dist/src/core/retrieval/fuser.js +40 -0
- package/dist/src/core/retrieval/index.js +32 -0
- package/dist/src/core/retrieval/reranker.js +304 -0
- package/dist/src/core/retrieval/types.js +2 -0
- package/dist/src/core/retrieval/weights.js +42 -0
- package/dist/src/core/search.js +41 -0
- package/dist/src/core/sq8.js +65 -0
- package/dist/src/core/symbolSearch.js +143 -0
- package/dist/src/core/types.js +2 -0
- package/dist/src/core/workspace.js +116 -0
- package/dist/src/mcp/server.js +794 -0
- package/docs/README.md +44 -0
- package/docs/cross-encoder.md +157 -0
- package/docs/embedding.md +158 -0
- package/docs/logo.png +0 -0
- package/docs/windows-setup.md +67 -0
- package/docs/zh-CN/DESIGN.md +102 -0
- package/docs/zh-CN/README.md +46 -0
- package/docs/zh-CN/advanced.md +26 -0
- package/docs/zh-CN/architecture_explained.md +116 -0
- package/docs/zh-CN/cli.md +109 -0
- package/docs/zh-CN/dsr.md +91 -0
- package/docs/zh-CN/graph_scenarios.md +173 -0
- package/docs/zh-CN/hooks.md +14 -0
- package/docs/zh-CN/manifests.md +136 -0
- package/docs/zh-CN/mcp.md +205 -0
- package/docs/zh-CN/quickstart.md +35 -0
- package/docs/zh-CN/rules.md +7 -0
- package/docs/zh-CN/technical-details.md +454 -0
- package/docs/zh-CN/troubleshooting.md +19 -0
- package/docs/zh-CN/windows-setup.md +67 -0
- package/install.sh +183 -0
- package/package.json +97 -0
- package/skills/git-ai-mcp/SKILL.md +86 -0
- package/skills/git-ai-mcp/references/constraints.md +143 -0
- package/skills/git-ai-mcp/references/tools.md +263 -0
- package/templates/agents/common/documents/Fix EISDIR error and enable multi-language indexing.md +14 -0
- package/templates/agents/common/documents/Fix git-ai index error in CodaGraph directory.md +13 -0
- package/templates/agents/common/skills/git-ai-mcp/SKILL.md +86 -0
- package/templates/agents/common/skills/git-ai-mcp/references/constraints.md +143 -0
- package/templates/agents/common/skills/git-ai-mcp/references/tools.md +263 -0
package/dist/src/core/embedding/fusion.js
@@ -0,0 +1,52 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.WeightedEmbeddingFusion = void 0;
+exports.defaultFusionConfig = defaultFusionConfig;
+function normalize(vec) {
+    let norm = 0;
+    for (const v of vec)
+        norm += v * v;
+    norm = Math.sqrt(norm);
+    if (norm <= 0)
+        return vec.slice();
+    return vec.map((v) => v / norm);
+}
+function scale(vec, weight, targetDim) {
+    const out = new Array(targetDim).fill(0);
+    const len = Math.min(vec.length, targetDim);
+    for (let i = 0; i < len; i++)
+        out[i] = vec[i] * weight;
+    return out;
+}
+class WeightedEmbeddingFusion {
+    constructor(config) {
+        this.config = config;
+    }
+    fuse(semantic, structural, symbolic) {
+        const dim = Math.max(semantic.length, structural.length, symbolic.length);
+        const out = new Array(dim).fill(0);
+        const s0 = scale(semantic, this.config.semanticWeight, dim);
+        const s1 = scale(structural, this.config.structuralWeight, dim);
+        const s2 = scale(symbolic, this.config.symbolicWeight, dim);
+        for (let i = 0; i < dim; i++)
+            out[i] = s0[i] + s1[i] + s2[i];
+        return this.config.normalize ? normalize(out) : out;
+    }
+    fuseBatch(semantic, structural, symbolic) {
+        const count = Math.max(semantic.length, structural.length, symbolic.length);
+        const out = [];
+        for (let i = 0; i < count; i++) {
+            out.push(this.fuse(semantic[i] ?? [], structural[i] ?? [], symbolic[i] ?? []));
+        }
+        return out;
+    }
+}
+exports.WeightedEmbeddingFusion = WeightedEmbeddingFusion;
+function defaultFusionConfig() {
+    return {
+        semanticWeight: 0.5,
+        structuralWeight: 0.3,
+        symbolicWeight: 0.2,
+        normalize: true,
+    };
+}
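The hunk above is late fusion: each of the three vectors is zero-padded to the longest length, scaled by its configured weight (0.5/0.3/0.2 by default), summed element-wise, and optionally L2-normalized. A minimal usage sketch; the deep dist/ require path is an assumption about how the published package resolves, not documented API:

// Hypothetical deep require into the compiled file shown above.
const { WeightedEmbeddingFusion, defaultFusionConfig } = require('@mars167/git-ai/dist/src/core/embedding/fusion.js');

const fusion = new WeightedEmbeddingFusion(defaultFusionConfig());
// 0.5 * [1, 0] + 0.3 * [0, 1] + 0.2 * [] = [0.5, 0.3], then L2-normalized.
console.log(fusion.fuse([1, 0], [0, 1], [])); // ~[0.857, 0.514]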
package/dist/src/core/embedding/index.js
@@ -0,0 +1,43 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.HybridEmbedder = void 0;
+exports.defaultHybridEmbeddingConfig = defaultHybridEmbeddingConfig;
+const semantic_1 = require("./semantic");
+const structural_1 = require("./structural");
+const symbolic_1 = require("./symbolic");
+const fusion_1 = require("./fusion");
+const parser_1 = require("./parser");
+class HybridEmbedder {
+    constructor(config) {
+        this.config = config;
+        this.semantic = new semantic_1.OnnxSemanticEmbedder(config.semantic);
+        this.structural = new structural_1.WlStructuralEmbedder(config.structural);
+        this.symbolic = new symbolic_1.GraphSymbolicEmbedder(config.symbolic);
+        this.fusion = new fusion_1.WeightedEmbeddingFusion(config.fusion);
+    }
+    async embed(code, symbols) {
+        const [semanticVec] = await this.semantic.embedBatch([code]);
+        const structuralVec = this.structural.embed(this.parse(code));
+        const symbolicVec = this.symbolic.embedSymbols(symbols ?? []);
+        return this.fusion.fuse(semanticVec ?? [], structuralVec, symbolicVec);
+    }
+    async embedBatch(codes, symbols) {
+        const semanticVecs = await this.semantic.embedBatch(codes);
+        const structuralVecs = codes.map((code) => this.structural.embed(this.parse(code)));
+        const symbolicVecs = (symbols ?? []).map((s) => this.symbolic.embedSymbols(s ?? []));
+        const paddedSymbolic = codes.map((_, idx) => symbolicVecs[idx] ?? this.symbolic.embedSymbols([]));
+        return this.fusion.fuseBatch(semanticVecs, structuralVecs, paddedSymbolic);
+    }
+    parse(code) {
+        return (0, parser_1.parseCodeToTree)(code);
+    }
+}
+exports.HybridEmbedder = HybridEmbedder;
+function defaultHybridEmbeddingConfig() {
+    return {
+        semantic: (0, semantic_1.defaultSemanticConfig)(),
+        structural: (0, structural_1.defaultStructuralConfig)(),
+        symbolic: (0, symbolic_1.defaultSymbolicConfig)(),
+        fusion: (0, fusion_1.defaultFusionConfig)(),
+    };
+}
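HybridEmbedder wires the three embedders together: ONNX semantic vectors (768-dim by default), Weisfeiler-Lehman structural vectors over a tree-sitter parse (256-dim), and hashed symbol vectors (128-dim), combined by the weighted fusion above. A hedged sketch, assuming the deep dist/ path resolves and the native deps (tree-sitter, onnxruntime-node) are installed; without an ONNX model on disk, the semantic leg logs a warning and falls back to hash embeddings:

const { HybridEmbedder, defaultHybridEmbeddingConfig } = require('@mars167/git-ai/dist/src/core/embedding/index.js');

async function main() {
  const embedder = new HybridEmbedder(defaultHybridEmbeddingConfig());
  const vec = await embedder.embed('function add(a, b) { return a + b; }', [
    { name: 'add', signature: '(a, b)', kind: 'function' },
  ]);
  console.log(vec.length); // 768 with the default CodeBERT config (max of the three dims)
}
main().catch(console.error);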
package/dist/src/core/embedding/parser.js
@@ -0,0 +1,14 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.parseCodeToTree = parseCodeToTree;
+const tree_sitter_1 = __importDefault(require("tree-sitter"));
+const typescript_1 = require("../parser/typescript");
+const adapter = new typescript_1.TypeScriptAdapter(false);
+function parseCodeToTree(code) {
+    const parser = new tree_sitter_1.default();
+    parser.setLanguage(adapter.getTreeSitterLanguage());
+    return parser.parse(code ?? '');
+}
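Note that this helper always parses with the TypeScript grammar (a module-level TypeScriptAdapter(false)), so structural embeddings of non-TypeScript code come from a best-effort TS parse. A hedged sketch, assuming the deep dist/ path resolves and the tree-sitter native module builds:

const { parseCodeToTree } = require('@mars167/git-ai/dist/src/core/embedding/parser.js');
const tree = parseCodeToTree('const x = 1;');
console.log(tree.rootNode.type); // expected 'program' for the tree-sitter TS grammar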
package/dist/src/core/embedding/semantic.js
@@ -0,0 +1,254 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.OnnxSemanticEmbedder = void 0;
+exports.defaultSemanticConfig = defaultSemanticConfig;
+const os_1 = __importDefault(require("os"));
+const path_1 = __importDefault(require("path"));
+const fs_extra_1 = __importDefault(require("fs-extra"));
+const embedding_1 = require("../embedding");
+const crypto_1 = require("../crypto");
+const log_1 = require("../log");
+const log = (0, log_1.createLogger)({ component: 'embedding', kind: 'semantic' });
+class LruCache {
+    constructor(maxSize) {
+        this.maxSize = Math.max(1, maxSize);
+        this.map = new Map();
+    }
+    get(key) {
+        const value = this.map.get(key);
+        if (!value)
+            return undefined;
+        this.map.delete(key);
+        this.map.set(key, value);
+        return value;
+    }
+    set(key, value) {
+        if (this.map.has(key))
+            this.map.delete(key);
+        this.map.set(key, value);
+        if (this.map.size > this.maxSize) {
+            const first = this.map.keys().next().value;
+            if (first)
+                this.map.delete(first);
+        }
+    }
+}
+function normalize(vec) {
+    let norm = 0;
+    for (const v of vec)
+        norm += v * v;
+    norm = Math.sqrt(norm);
+    if (norm <= 0)
+        return vec.slice();
+    return vec.map((v) => v / norm);
+}
+function meanPool(hidden, attention, dims) {
+    const [batch, seqLen, dim] = dims;
+    if (batch !== 1)
+        throw new Error('meanPool expects batch=1');
+    const out = new Float32Array(dim);
+    let count = 0;
+    for (let i = 0; i < seqLen; i++) {
+        const att = Number(attention[i] ?? 0);
+        if (att === 0)
+            continue;
+        const offset = i * dim;
+        for (let d = 0; d < dim; d++)
+            out[d] += hidden[offset + d];
+        count += 1;
+    }
+    if (count === 0)
+        return Array.from(out);
+    for (let d = 0; d < dim; d++)
+        out[d] /= count;
+    return Array.from(out);
+}
+function padBigInt(values, target, pad = 0n) {
+    if (values.length >= target)
+        return values.slice(0, target);
+    const out = values.slice();
+    while (out.length < target)
+        out.push(pad);
+    return out;
+}
+function findModelPath(modelName) {
+    const resolved = path_1.default.isAbsolute(modelName) ? modelName : path_1.default.join(process.cwd(), modelName);
+    const candidates = [
+        resolved,
+        path_1.default.join(resolved, 'model.onnx'),
+        path_1.default.join(resolved, 'onnx', 'model.onnx'),
+    ];
+    for (const c of candidates) {
+        if (fs_extra_1.default.pathExistsSync(c))
+            return c;
+    }
+    return resolved;
+}
+async function loadOnnx() {
+    const moduleName = 'onnxruntime-node';
+    const mod = await import(moduleName);
+    return mod;
+}
+async function loadTokenizerModule() {
+    const moduleName = './tokenizer.js';
+    const mod = await import(moduleName);
+    return mod;
+}
+class OnnxSemanticEmbedder {
+    constructor(config) {
+        this.config = config;
+        this.cache = new LruCache(512);
+        this.onnxPromise = null;
+        this.sessionPromise = null;
+        this.tokenizerPromise = null;
+    }
+    async embed(code) {
+        const batch = await this.embedBatch([code]);
+        return batch[0] ?? new Array(this.config.embeddingDim).fill(0);
+    }
+    async embedBatch(codes) {
+        const clean = codes.map((c) => String(c ?? ''));
+        const results = new Array(clean.length);
+        const pending = [];
+        for (let i = 0; i < clean.length; i++) {
+            const key = (0, crypto_1.sha256Hex)(clean[i]);
+            const cached = this.cache.get(key);
+            if (cached) {
+                results[i] = cached.slice();
+            }
+            else {
+                pending.push({ index: i, code: clean[i], key });
+            }
+        }
+        if (pending.length === 0)
+            return results;
+        try {
+            const session = await this.getSession();
+            const tokenizer = await this.getTokenizer();
+            const batchSize = Math.max(1, this.config.batchSize);
+            for (let i = 0; i < pending.length; i += batchSize) {
+                const slice = pending.slice(i, i + batchSize);
+                const encoded = slice.map((item) => tokenizer.encode(item.code, { maxLength: 512 }));
+                const maxLen = Math.max(2, Math.min(512, Math.max(...encoded.map((e) => e.input_ids.length))));
+                const inputIds = encoded.map((e) => padBigInt(e.input_ids, maxLen, 0n));
+                const attentionMask = encoded.map((e) => padBigInt(e.attention_mask, maxLen, 0n));
+                const feeds = await this.buildFeeds(inputIds, attentionMask, maxLen, session);
+                const outputs = await session.run(feeds);
+                const outputName = Object.keys(outputs)[0];
+                const output = outputs[outputName];
+                if (!output)
+                    throw new Error('ONNX output missing');
+                const outputDims = output.dims;
+                const seqLen = outputDims?.[1] ?? maxLen;
+                const hiddenDim = outputDims?.[2] ?? this.config.embeddingDim;
+                const data = output.data;
+                const batchOut = [];
+                for (let b = 0; b < slice.length; b++) {
+                    const offset = b * seqLen * hiddenDim;
+                    const chunk = data.slice(offset, offset + seqLen * hiddenDim);
+                    const pooled = meanPool(chunk, attentionMask[b], [1, seqLen, hiddenDim]);
+                    const normalized = normalize(pooled);
+                    batchOut.push(this.ensureDim(normalized, this.config.embeddingDim));
+                }
+                for (let j = 0; j < slice.length; j++) {
+                    const out = batchOut[j] ?? new Array(this.config.embeddingDim).fill(0);
+                    results[slice[j].index] = out;
+                    this.cache.set(slice[j].key, out);
+                }
+            }
+            return results;
+        }
+        catch (err) {
+            log.warn('semantic_embed_fallback', { err: String(err?.message ?? err) });
+            for (const item of pending) {
+                const out = this.hashEmbed(item.code);
+                results[item.index] = out;
+                this.cache.set(item.key, out);
+            }
+            return results;
+        }
+    }
+    async getSession() {
+        if (!this.sessionPromise) {
+            this.sessionPromise = (async () => {
+                const onnx = await this.getOnnx();
+                const modelPath = findModelPath(this.config.modelName);
+                const providers = this.config.device === 'gpu' ? ['cuda', 'cpu'] : ['cpu'];
+                const opts = { executionProviders: providers };
+                const session = await onnx.InferenceSession.create(modelPath, opts);
+                log.info('semantic_session_ready', { model: modelPath, device: this.config.device });
+                return session;
+            })();
+        }
+        return this.sessionPromise;
+    }
+    async getTokenizer() {
+        if (!this.tokenizerPromise) {
+            this.tokenizerPromise = (async () => {
+                const mod = await loadTokenizerModule();
+                return mod.loadTokenizer(this.config.modelName);
+            })();
+        }
+        return this.tokenizerPromise;
+    }
+    async getOnnx() {
+        if (!this.onnxPromise)
+            this.onnxPromise = loadOnnx();
+        return this.onnxPromise;
+    }
+    async buildFeeds(inputIds, attentionMask, maxLen, session) {
+        const onnx = await this.getOnnx();
+        const batch = inputIds.length;
+        const flattenIds = inputIds.flat();
+        const flattenMask = attentionMask.flat();
+        const idsTensor = new onnx.Tensor('int64', BigInt64Array.from(flattenIds), [batch, maxLen]);
+        const maskTensor = new onnx.Tensor('int64', BigInt64Array.from(flattenMask), [batch, maxLen]);
+        const feeds = {};
+        const inputNames = ['input_ids', 'attention_mask', 'token_type_ids'];
+        for (const name of inputNames) {
+            if (name === 'input_ids')
+                feeds[name] = idsTensor;
+            if (name === 'attention_mask')
+                feeds[name] = maskTensor;
+            if (name === 'token_type_ids') {
+                const types = new onnx.Tensor('int64', new BigInt64Array(batch * maxLen), [batch, maxLen]);
+                feeds[name] = types;
+            }
+        }
+        return feeds;
+    }
+    ensureDim(vec, dim) {
+        if (vec.length === dim)
+            return vec;
+        if (vec.length > dim)
+            return vec.slice(0, dim);
+        const out = vec.slice();
+        while (out.length < dim)
+            out.push(0);
+        return out;
+    }
+    hashEmbed(text) {
+        const out = (0, embedding_1.hashEmbedding)(text, { dim: this.config.embeddingDim });
+        return out;
+    }
+}
+exports.OnnxSemanticEmbedder = OnnxSemanticEmbedder;
+function defaultSemanticConfig() {
+    // Support environment variable override
+    const modelPath = process.env.GIT_AI_EMBEDDING_MODEL ||
+        path_1.default.join(os_1.default.homedir(), '.cache', 'git-ai', 'models', 'codebert', 'model.onnx');
+    // Auto-detect embedding dimension based on model path
+    let embeddingDim = 768; // Default for CodeBERT
+    if (modelPath.includes('MiniLM')) {
+        embeddingDim = 384; // MiniLM-L6 uses 384 dimensions
+    }
+    return {
+        modelName: modelPath,
+        embeddingDim,
+        device: 'cpu',
+        batchSize: 4,
+    };
+}
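Two details in the hunk above carry the semantics: masked mean pooling (hidden states are averaged only over positions with a non-zero attention mask, then L2-normalized and padded or truncated to embeddingDim), and a hash-embedding fallback whenever the ONNX session or tokenizer fails to load. A self-contained illustration of the pooling arithmetic, independent of the module:

// Mirrors meanPool above: average rows whose attention bit is set.
function meanPoolDemo(hidden, attention, seqLen, dim) {
  const out = new Array(dim).fill(0);
  let count = 0;
  for (let i = 0; i < seqLen; i++) {
    if (Number(attention[i] ?? 0) === 0) continue; // skip padded positions
    for (let d = 0; d < dim; d++) out[d] += hidden[i * dim + d];
    count += 1;
  }
  return count === 0 ? out : out.map((v) => v / count);
}

// Two attended positions [1,2] and [3,4]; the padded row [9,9] is ignored.
console.log(meanPoolDemo([1, 2, 3, 4, 9, 9], [1n, 1n, 0n], 3, 2)); // [2, 3]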
package/dist/src/core/embedding/structural.js
@@ -0,0 +1,97 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.WlStructuralEmbedder = void 0;
+exports.defaultStructuralConfig = defaultStructuralConfig;
+const crypto_1 = require("../crypto");
+function normalize(vec) {
+    let norm = 0;
+    for (const v of vec)
+        norm += v * v;
+    norm = Math.sqrt(norm);
+    if (norm <= 0)
+        return vec.slice();
+    return vec.map((v) => v / norm);
+}
+function hashToIndex(hash, dim) {
+    const idx = parseInt(hash.slice(0, 8), 16) >>> 0;
+    return idx % dim;
+}
+function nodeFeatures(node) {
+    const childTypes = [];
+    for (let i = 0; i < node.namedChildCount; i++) {
+        const child = node.namedChild(i);
+        if (child)
+            childTypes.push(child.type);
+    }
+    let depth = 0;
+    let current = node;
+    while (current) {
+        depth += 1;
+        current = current.parent;
+    }
+    return { type: node.type, childTypes, depth };
+}
+function wlHash(type, neighborHashes, iteration) {
+    const base = [type, iteration.toString(), ...neighborHashes.sort()].join('|');
+    return (0, crypto_1.sha256Hex)(base);
+}
+class WlStructuralEmbedder {
+    constructor(config) {
+        this.config = config;
+    }
+    embed(tree) {
+        return this.embedNode(tree.rootNode);
+    }
+    embedNode(node) {
+        return this.embedSubtree(node);
+    }
+    embedSubtree(node) {
+        const dim = this.config.dim;
+        const iterations = Math.max(1, this.config.wlIterations);
+        const nodes = [];
+        const traverse = (n) => {
+            nodes.push(n);
+            for (let i = 0; i < n.namedChildCount; i++) {
+                const child = n.namedChild(i);
+                if (child)
+                    traverse(child);
+            }
+        };
+        traverse(node);
+        const currentHashes = new Map();
+        for (const n of nodes) {
+            const features = nodeFeatures(n);
+            const base = [features.type, features.childTypes.join(','), features.depth.toString()].join('|');
+            currentHashes.set(n, (0, crypto_1.sha256Hex)(base));
+        }
+        for (let iter = 0; iter < iterations; iter++) {
+            const next = new Map();
+            for (const n of nodes) {
+                const neighborHashes = [];
+                for (let i = 0; i < n.namedChildCount; i++) {
+                    const child = n.namedChild(i);
+                    if (child)
+                        neighborHashes.push(currentHashes.get(child) ?? '');
+                }
+                next.set(n, wlHash(n.type, neighborHashes, iter));
+            }
+            for (const [n, h] of next.entries())
+                currentHashes.set(n, h);
+        }
+        const vec = new Array(dim).fill(0);
+        for (const n of nodes) {
+            const h = currentHashes.get(n) ?? '';
+            const idx = hashToIndex(h, dim);
+            const sign = (parseInt(h.slice(0, 2), 16) & 1) === 0 ? 1 : -1;
+            vec[idx] += sign;
+            const features = nodeFeatures(n);
+            const depthIdx = (features.depth * 7) % dim;
+            vec[depthIdx] += 0.5;
+        }
+        return normalize(vec);
+    }
+}
+exports.WlStructuralEmbedder = WlStructuralEmbedder;
+function defaultStructuralConfig() {
+    return { dim: 256, wlIterations: 2 };
+}
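This is a Weisfeiler-Lehman relabeling scheme: each AST node starts from a hash of (type, child types, depth); each iteration rehashes a node's type together with its children's sorted labels; the final labels are feature-hashed into the 256-dim vector with a ±1 sign bit, plus a small depth term. A self-contained sketch of the sign/bucket assignment, with Node's built-in crypto standing in for the package's sha256Hex:

const crypto = require('crypto');
const sha256Hex = (s) => crypto.createHash('sha256').update(s).digest('hex');

function bucket(label, dim) {
  const h = sha256Hex(label);
  const idx = (parseInt(h.slice(0, 8), 16) >>> 0) % dim;          // bucket from the first 4 bytes
  const sign = (parseInt(h.slice(0, 2), 16) & 1) === 0 ? 1 : -1;  // sign from the first byte's low bit
  return { idx, sign };
}

console.log(bucket('function_declaration|identifier,statement_block|1', 256));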
package/dist/src/core/embedding/symbolic.js
@@ -0,0 +1,117 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.GraphSymbolicEmbedder = void 0;
+exports.defaultSymbolicConfig = defaultSymbolicConfig;
+const crypto_1 = require("../crypto");
+function tokenize(text) {
+    const raw = String(text ?? '')
+        .toLowerCase()
+        .split(/[^a-z0-9_]+/g)
+        .map((t) => t.trim())
+        .filter(Boolean);
+    const out = [];
+    for (const tok of raw) {
+        if (tok.length <= 6) {
+            out.push(tok);
+        }
+        else {
+            out.push(tok.slice(0, 3));
+            out.push(tok.slice(3, 6));
+            out.push(tok.slice(6));
+        }
+    }
+    return out;
+}
+function normalize(vec) {
+    let norm = 0;
+    for (const v of vec)
+        norm += v * v;
+    norm = Math.sqrt(norm);
+    if (norm <= 0)
+        return vec.slice();
+    return vec.map((v) => v / norm);
+}
+function hashToIndex(hash, dim) {
+    const idx = parseInt(hash.slice(0, 8), 16) >>> 0;
+    return idx % dim;
+}
+function addToken(vec, token, dim, weight) {
+    const hash = (0, crypto_1.sha256Hex)(token);
+    const idx = hashToIndex(hash, dim);
+    const sign = (parseInt(hash.slice(8, 10), 16) & 1) === 0 ? 1 : -1;
+    vec[idx] += sign * weight;
+}
+function addRelation(vec, a, b, dim, weight) {
+    const h = (0, crypto_1.sha256Hex)(`${a}=>${b}`);
+    const idx = hashToIndex(h, dim);
+    vec[idx] += weight;
+}
+class GraphSymbolicEmbedder {
+    constructor(config) {
+        this.config = config;
+    }
+    embedSymbols(symbols) {
+        const dim = this.config.dim;
+        const vec = new Array(dim).fill(0);
+        for (const sym of symbols) {
+            const nameTokens = tokenize(sym.name);
+            for (const t of nameTokens)
+                addToken(vec, t, dim, 1);
+            const signatureTokens = tokenize(sym.signature);
+            for (const t of signatureTokens)
+                addToken(vec, t, dim, 0.5);
+            addToken(vec, sym.kind, dim, 0.3);
+            if (sym.container) {
+                const containerTokens = tokenize(sym.container.name);
+                for (const t of containerTokens)
+                    addToken(vec, t, dim, 0.4);
+            }
+            if (sym.extends) {
+                for (const ext of sym.extends)
+                    addToken(vec, ext, dim, 0.6);
+            }
+            if (sym.implements) {
+                for (const iface of sym.implements)
+                    addToken(vec, iface, dim, 0.4);
+            }
+        }
+        return normalize(vec);
+    }
+    embedRelations(relations) {
+        const dim = this.config.dim;
+        const vec = new Array(dim).fill(0);
+        if (this.config.includeCalls) {
+            for (const [caller, callee] of relations.calls) {
+                addRelation(vec, caller, callee, dim, 1);
+                for (const t of tokenize(caller))
+                    addToken(vec, t, dim, 0.2);
+                for (const t of tokenize(callee))
+                    addToken(vec, t, dim, 0.2);
+            }
+        }
+        if (this.config.includeTypes) {
+            for (const [sub, sup] of relations.types) {
+                addRelation(vec, sub, sup, dim, 0.8);
+                for (const t of tokenize(sub))
+                    addToken(vec, t, dim, 0.15);
+                for (const t of tokenize(sup))
+                    addToken(vec, t, dim, 0.15);
+            }
+        }
+        if (this.config.includeImports) {
+            for (const [file, imp] of relations.imports) {
+                addRelation(vec, file, imp, dim, 0.6);
+            }
+        }
+        return normalize(vec);
+    }
+}
+exports.GraphSymbolicEmbedder = GraphSymbolicEmbedder;
+function defaultSymbolicConfig() {
+    return {
+        dim: 128,
+        includeCalls: true,
+        includeTypes: true,
+        includeImports: true,
+    };
+}
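embedSymbols is a weighted bag-of-tokens over symbol metadata: name tokens at weight 1, signature 0.5, kind 0.3, container 0.4, extends 0.6, implements 0.4, each signed-hashed into 128 buckets and L2-normalized. A hedged usage sketch (the deep dist/ require path is an assumption):

const { GraphSymbolicEmbedder, defaultSymbolicConfig } = require('@mars167/git-ai/dist/src/core/embedding/symbolic.js');

const embedder = new GraphSymbolicEmbedder(defaultSymbolicConfig());
const vec = embedder.embedSymbols([
  { name: 'UserService', kind: 'class', signature: 'class UserService', extends: ['BaseService'] },
]);
console.log(vec.length); // 128, unit length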
package/dist/src/core/embedding/tokenizer.js
@@ -0,0 +1,91 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.loadTokenizer = loadTokenizer;
+const fs_extra_1 = __importDefault(require("fs-extra"));
+const path_1 = __importDefault(require("path"));
+class BasicTokenizer {
+    constructor(vocab, unkId, clsId, sepId) {
+        this.vocab = vocab;
+        this.unkId = unkId;
+        this.clsId = clsId;
+        this.sepId = sepId;
+    }
+    encode(text, options = {}) {
+        const maxLength = Math.max(2, Math.min(512, options.maxLength ?? 512));
+        const tokens = tokenize(text);
+        const ids = [BigInt(this.clsId)];
+        for (const tok of tokens) {
+            if (ids.length >= maxLength - 1)
+                break;
+            const id = this.vocab.get(tok) ?? this.unkId;
+            ids.push(BigInt(id));
+        }
+        ids.push(BigInt(this.sepId));
+        const attention = ids.map(() => 1n);
+        return { input_ids: ids, attention_mask: attention };
+    }
+}
+function tokenize(text) {
+    const raw = String(text ?? '')
+        .toLowerCase()
+        .split(/[^a-z0-9_]+/g)
+        .map((t) => t.trim())
+        .filter(Boolean);
+    const out = [];
+    for (const tok of raw) {
+        if (tok.length <= 8) {
+            out.push(tok);
+        }
+        else {
+            out.push(tok.slice(0, 4));
+            out.push(tok.slice(4, 8));
+            out.push(tok.slice(8));
+        }
+    }
+    return out;
+}
+async function loadVocab(vocabPath) {
+    const vocab = new Map();
+    if (!await fs_extra_1.default.pathExists(vocabPath))
+        return vocab;
+    const content = await fs_extra_1.default.readFile(vocabPath, 'utf-8');
+    const lines = content.split(/\r?\n/).filter(Boolean);
+    for (let i = 0; i < lines.length; i++) {
+        vocab.set(lines[i].trim(), i);
+    }
+    return vocab;
+}
+async function loadTokenizer(modelName) {
+    const vocabCandidates = [
+        path_1.default.join(modelName, 'vocab.txt'),
+        path_1.default.join(modelName, 'tokenizer', 'vocab.txt'),
+        path_1.default.join(modelName, 'tokenizer', 'vocab.json'),
+    ];
+    let vocab = new Map();
+    for (const candidate of vocabCandidates) {
+        if (candidate.endsWith('vocab.json')) {
+            if (!await fs_extra_1.default.pathExists(candidate))
+                continue;
+            const json = await fs_extra_1.default.readJSON(candidate).catch(() => null);
+            if (json && typeof json === 'object') {
+                vocab = new Map();
+                for (const [key, value] of Object.entries(json)) {
+                    if (typeof value === 'number')
+                        vocab.set(key, value);
+                }
+            }
+        }
+        else {
+            vocab = await loadVocab(candidate);
+        }
+        if (vocab.size > 0)
+            break;
+    }
+    const unkId = vocab.get('[UNK]') ?? 100;
+    const clsId = vocab.get('[CLS]') ?? 101;
+    const sepId = vocab.get('[SEP]') ?? 102;
+    return new BasicTokenizer(vocab, unkId, clsId, sepId);
+}
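BasicTokenizer is a whole-word vocabulary lookup (tokens longer than 8 characters are split into 4/4/rest pieces), wrapped in [CLS]/[SEP] ids and truncated to maxLength; when no vocab file is found the map stays empty and every token resolves to the hard-coded fallback ids 100/101/102. A hedged sketch of that degenerate path (the deep dist/ require path is an assumption):

const { loadTokenizer } = require('@mars167/git-ai/dist/src/core/embedding/tokenizer.js');

loadTokenizer('/nonexistent/model/dir').then((tok) => {
  const { input_ids, attention_mask } = tok.encode('const total = 0;');
  console.log(input_ids);      // [101n, 100n, 100n, 100n, 102n] -> [CLS] const total 0 [SEP]
  console.log(attention_mask); // [1n, 1n, 1n, 1n, 1n]
});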