@kk-irving/knowledge-mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/analyze-issue.js +392 -0
- package/dist/aosp/chunker.js +221 -0
- package/dist/aosp/embed-aosp.js +56 -0
- package/dist/aosp/indexer.js +186 -0
- package/dist/aosp/module-map-loader.js +176 -0
- package/dist/aosp/search.js +254 -0
- package/dist/config.js +39 -0
- package/dist/db.js +244 -0
- package/dist/embed-pending.js +81 -0
- package/dist/embedder.js +76 -0
- package/dist/index-store.js +175 -0
- package/dist/index.js +166 -0
- package/dist/search.js +274 -0
- package/dist/sources/confluence-sync.js +180 -0
- package/dist/sources/gerrit-sync.js +180 -0
- package/dist/sources/zmind-sync.js +106 -0
- package/package.json +37 -0
package/dist/db.js
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SQLite 数据层(基于 Node.js 22.5+ 内置 `node:sqlite`,无 native 编译)。
|
|
3
|
+
*
|
|
4
|
+
* 三源主表 + 各自 FTS5 + sync_state,与 v1.0 设计一致。
|
|
5
|
+
*/
|
|
6
|
+
import { DatabaseSync } from "node:sqlite";
|
|
7
|
+
import * as path from "node:path";
|
|
8
|
+
import { mkdirSync } from "node:fs";
|
|
9
|
+
import { config } from "./config.js";
|
|
10
|
+
// Process-wide connection handle; stays null until getDb() has fully succeeded.
let _db = null;

/**
 * Returns the shared SQLite connection, opening it on first use.
 *
 * On the first call this creates the parent directory of `config.dbPath`,
 * opens the database, applies the WAL / synchronous / foreign_keys PRAGMAs
 * and runs `migrate`. Later calls return the cached handle.
 *
 * @returns {DatabaseSync} the opened, migrated connection
 * @throws whatever `mkdirSync`, the `DatabaseSync` constructor, a PRAGMA or
 *         `migrate` throws; in the latter two cases the half-initialised
 *         handle is closed first and nothing is cached.
 */
export function getDb() {
    if (_db) {
        return _db;
    }
    mkdirSync(path.dirname(config.dbPath), { recursive: true });
    const db = new DatabaseSync(config.dbPath);
    try {
        db.exec("PRAGMA journal_mode = WAL");
        db.exec("PRAGMA synchronous = NORMAL");
        db.exec("PRAGMA foreign_keys = ON");
        migrate(db);
    }
    catch (err) {
        // Setup failed: release the handle instead of leaking it, then
        // surface the original error (a close failure must not mask it).
        try {
            db.close();
        }
        catch {
            /* keep the original error */
        }
        throw err;
    }
    // Cache only a fully initialised connection.
    _db = db;
    return db;
}
|
|
23
|
+
/**
 * Closes the shared connection, if there is one, and drops the cached handle.
 *
 * Best effort: an error thrown by `close()` is ignored, and the cached handle
 * is cleared either way so the next `getDb()` call opens a fresh connection.
 */
export function closeDb() {
    if (!_db) {
        return;
    }
    try {
        _db.close();
    }
    catch {
        /* ignore */
    }
    finally {
        _db = null;
    }
}
|
|
34
|
+
// =============================================================================
|
|
35
|
+
// 迁移(幂等)
|
|
36
|
+
// =============================================================================
|
|
37
|
+
/**
 * Creates the schema in one idempotent `exec` call (every statement uses
 * IF NOT EXISTS): the `sync_state` table, the zmind / gerrit / confluence
 * tables and the `aosp_chunks` table, each with an external-content FTS5
 * index kept in sync by insert / delete / update triggers.
 *
 * The AFTER UPDATE triggers carry a null-safe WHEN guard so the FTS entry is
 * rewritten only when the rowid or an indexed column actually changes.
 * Without the guard every write of `embedding` / `embedding_updated_at`
 * re-indexed unchanged text.
 *
 * NOTE(review): because of IF NOT EXISTS, a database created before this
 * change keeps its unguarded `*_au` triggers until they are dropped.
 *
 * @param {DatabaseSync} db connection to run the DDL on
 */
function migrate(db) {
    db.exec(`
-- 公共:同步水位
CREATE TABLE IF NOT EXISTS sync_state (
  source TEXT NOT NULL,
  key TEXT NOT NULL,
  value TEXT NOT NULL,
  PRIMARY KEY (source, key)
);

-- ============ Zmind issues ============
CREATE TABLE IF NOT EXISTS zmind_issues (
  id INTEGER PRIMARY KEY,
  tracker TEXT,
  subject TEXT,
  description TEXT,
  status TEXT,
  assigned_to TEXT,
  project_id INTEGER,
  project_name TEXT,
  created_on TEXT,
  updated_on TEXT,
  embedding BLOB,
  embedding_updated_at TEXT
);

CREATE VIRTUAL TABLE IF NOT EXISTS zmind_issues_fts USING fts5(
  subject, description,
  content='zmind_issues', content_rowid='id',
  tokenize='unicode61'
);

CREATE TRIGGER IF NOT EXISTS zmind_issues_ai AFTER INSERT ON zmind_issues BEGIN
  INSERT INTO zmind_issues_fts(rowid, subject, description)
  VALUES (new.id, new.subject, new.description);
END;
CREATE TRIGGER IF NOT EXISTS zmind_issues_ad AFTER DELETE ON zmind_issues BEGIN
  INSERT INTO zmind_issues_fts(zmind_issues_fts, rowid, subject, description)
  VALUES('delete', old.id, old.subject, old.description);
END;
CREATE TRIGGER IF NOT EXISTS zmind_issues_au AFTER UPDATE ON zmind_issues
WHEN old.id IS NOT new.id
  OR old.subject IS NOT new.subject
  OR old.description IS NOT new.description
BEGIN
  INSERT INTO zmind_issues_fts(zmind_issues_fts, rowid, subject, description)
  VALUES('delete', old.id, old.subject, old.description);
  INSERT INTO zmind_issues_fts(rowid, subject, description)
  VALUES (new.id, new.subject, new.description);
END;

-- ============ Gerrit changes ============
CREATE TABLE IF NOT EXISTS gerrit_changes (
  change_id TEXT PRIMARY KEY,
  number INTEGER,
  project TEXT,
  branch TEXT,
  subject TEXT,
  commit_message TEXT,
  owner_name TEXT,
  status TEXT,
  created TEXT,
  updated TEXT,
  embedding BLOB,
  embedding_updated_at TEXT
);

CREATE VIRTUAL TABLE IF NOT EXISTS gerrit_changes_fts USING fts5(
  subject, commit_message,
  content='gerrit_changes', content_rowid='rowid',
  tokenize='unicode61'
);

CREATE TRIGGER IF NOT EXISTS gerrit_changes_ai AFTER INSERT ON gerrit_changes BEGIN
  INSERT INTO gerrit_changes_fts(rowid, subject, commit_message)
  VALUES (new.rowid, new.subject, new.commit_message);
END;
CREATE TRIGGER IF NOT EXISTS gerrit_changes_ad AFTER DELETE ON gerrit_changes BEGIN
  INSERT INTO gerrit_changes_fts(gerrit_changes_fts, rowid, subject, commit_message)
  VALUES('delete', old.rowid, old.subject, old.commit_message);
END;
CREATE TRIGGER IF NOT EXISTS gerrit_changes_au AFTER UPDATE ON gerrit_changes
WHEN old.rowid IS NOT new.rowid
  OR old.subject IS NOT new.subject
  OR old.commit_message IS NOT new.commit_message
BEGIN
  INSERT INTO gerrit_changes_fts(gerrit_changes_fts, rowid, subject, commit_message)
  VALUES('delete', old.rowid, old.subject, old.commit_message);
  INSERT INTO gerrit_changes_fts(rowid, subject, commit_message)
  VALUES (new.rowid, new.subject, new.commit_message);
END;

-- ============ Confluence pages ============
CREATE TABLE IF NOT EXISTS confluence_pages (
  id TEXT PRIMARY KEY,
  space_key TEXT,
  title TEXT,
  body_text TEXT,
  version INTEGER,
  webui TEXT,
  created TEXT,
  updated TEXT,
  embedding BLOB,
  embedding_updated_at TEXT
);

CREATE VIRTUAL TABLE IF NOT EXISTS confluence_pages_fts USING fts5(
  title, body_text,
  content='confluence_pages', content_rowid='rowid',
  tokenize='unicode61'
);

CREATE TRIGGER IF NOT EXISTS confluence_pages_ai AFTER INSERT ON confluence_pages BEGIN
  INSERT INTO confluence_pages_fts(rowid, title, body_text)
  VALUES (new.rowid, new.title, new.body_text);
END;
CREATE TRIGGER IF NOT EXISTS confluence_pages_ad AFTER DELETE ON confluence_pages BEGIN
  INSERT INTO confluence_pages_fts(confluence_pages_fts, rowid, title, body_text)
  VALUES('delete', old.rowid, old.title, old.body_text);
END;
CREATE TRIGGER IF NOT EXISTS confluence_pages_au AFTER UPDATE ON confluence_pages
WHEN old.rowid IS NOT new.rowid
  OR old.title IS NOT new.title
  OR old.body_text IS NOT new.body_text
BEGIN
  INSERT INTO confluence_pages_fts(confluence_pages_fts, rowid, title, body_text)
  VALUES('delete', old.rowid, old.title, old.body_text);
  INSERT INTO confluence_pages_fts(rowid, title, body_text)
  VALUES (new.rowid, new.title, new.body_text);
END;

-- ============ AOSP 代码 chunks (P2) ============
CREATE TABLE IF NOT EXISTS aosp_chunks (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  platform TEXT NOT NULL,
  module TEXT NOT NULL,
  module_path TEXT NOT NULL,
  file_path TEXT NOT NULL,
  line_start INTEGER NOT NULL,
  line_end INTEGER NOT NULL,
  symbol_kind TEXT,
  symbol_name TEXT,
  content TEXT NOT NULL,
  content_hash TEXT,
  embedding BLOB,
  embedding_updated_at TEXT,
  indexed_at TEXT NOT NULL DEFAULT (datetime('now')),
  UNIQUE(platform, module, file_path, line_start, line_end)
);

CREATE INDEX IF NOT EXISTS idx_aosp_pm ON aosp_chunks(platform, module);
CREATE INDEX IF NOT EXISTS idx_aosp_pmp ON aosp_chunks(platform, module_path);
CREATE INDEX IF NOT EXISTS idx_aosp_file ON aosp_chunks(file_path);

CREATE VIRTUAL TABLE IF NOT EXISTS aosp_chunks_fts USING fts5(
  content,
  content='aosp_chunks', content_rowid='id',
  tokenize='unicode61'
);

CREATE TRIGGER IF NOT EXISTS aosp_chunks_ai AFTER INSERT ON aosp_chunks BEGIN
  INSERT INTO aosp_chunks_fts(rowid, content) VALUES (new.id, new.content);
END;
CREATE TRIGGER IF NOT EXISTS aosp_chunks_ad AFTER DELETE ON aosp_chunks BEGIN
  INSERT INTO aosp_chunks_fts(aosp_chunks_fts, rowid, content) VALUES('delete', old.id, old.content);
END;
CREATE TRIGGER IF NOT EXISTS aosp_chunks_au AFTER UPDATE ON aosp_chunks
WHEN old.id IS NOT new.id
  OR old.content IS NOT new.content
BEGIN
  INSERT INTO aosp_chunks_fts(aosp_chunks_fts, rowid, content) VALUES('delete', old.id, old.content);
  INSERT INTO aosp_chunks_fts(rowid, content) VALUES (new.id, new.content);
END;
`);
}
|
|
197
|
+
// =============================================================================
|
|
198
|
+
// 事务封装(node:sqlite 没有 better-sqlite3 的 transaction() helper)
|
|
199
|
+
// =============================================================================
|
|
200
|
+
/**
 * Runs `fn` between BEGIN and COMMIT on `db`.
 *
 * If `fn` or the COMMIT throws, a ROLLBACK is attempted (its own failure is
 * ignored) and the original error is rethrown.
 *
 * @param db object exposing `exec(sql)`
 * @param {Function} fn synchronous callback, invoked with no arguments
 * @returns whatever `fn` returned
 */
export function runInTransaction(db, fn) {
    db.exec("BEGIN");
    let outcome;
    try {
        outcome = fn();
        db.exec("COMMIT");
    }
    catch (failure) {
        try {
            db.exec("ROLLBACK");
        }
        catch {
            /* ignore */
        }
        throw failure;
    }
    return outcome;
}
|
|
217
|
+
// =============================================================================
|
|
218
|
+
// sync_state 帮手
|
|
219
|
+
// =============================================================================
|
|
220
|
+
/**
 * Reads one `sync_state` value.
 *
 * @param source value matched against the `source` column
 * @param key value matched against the `key` column
 * @returns the stored value, or null when no row matches
 */
export function getSyncState(source, key) {
    const lookup = getDb().prepare("SELECT value FROM sync_state WHERE source = ? AND key = ?");
    const hit = lookup.get(source, key);
    return hit?.value ?? null;
}
|
|
226
|
+
/**
 * Upserts one `sync_state` row: inserts (source, key, value), or overwrites
 * `value` when a row with the same (source, key) already exists.
 */
export function setSyncState(source, key, value) {
    const upsert = getDb().prepare("INSERT INTO sync_state(source, key, value) VALUES (?, ?, ?) ON CONFLICT(source, key) DO UPDATE SET value = excluded.value");
    upsert.run(source, key, value);
}
|
|
231
|
+
// =============================================================================
|
|
232
|
+
// Helpers:序列化 / 反序列化向量
|
|
233
|
+
// =============================================================================
|
|
234
|
+
/**
 * Wraps the bytes of a typed-array vector in a Buffer without copying: the
 * Buffer is a view over the same ArrayBuffer region as `vec`.
 *
 * @param vec typed array (its `buffer`, `byteOffset`, `byteLength` are used)
 * @returns {Buffer} view over exactly the bytes of `vec`
 */
export function vectorToBlob(vec) {
    const { buffer, byteOffset, byteLength } = vec;
    return Buffer.from(buffer, byteOffset, byteLength);
}
|
|
237
|
+
/**
 * Decodes a stored blob back into a Float32Array.
 *
 * The bytes are copied into a fresh ArrayBuffer first, so the result does
 * not alias `blob` and does not depend on the alignment of its byte offset.
 *
 * @param blob byte view (its `buffer`, `byteOffset`, `byteLength` are used)
 * @param {number} dim expected number of float32 components
 * @returns {Float32Array} independent copy of the vector
 * @throws {Error} when the decoded length differs from `dim`
 */
export function blobToVector(blob, dim) {
    const from = blob.byteOffset;
    const to = from + blob.byteLength;
    const vector = new Float32Array(blob.buffer.slice(from, to));
    if (vector.length === dim) {
        return vector;
    }
    throw new Error(`embedding dim mismatch: expected ${dim}, got ${vector.length}`);
}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* embed_pending:批量嵌入"未嵌入或已过期"的行。
|
|
3
|
+
*
|
|
4
|
+
* "过期"判定:embedding_updated_at 为 NULL 或 < 主表的 updated 列。
|
|
5
|
+
*/
|
|
6
|
+
import { getDb, runInTransaction, vectorToBlob } from "./db.js";
|
|
7
|
+
import { embedTexts } from "./embedder.js";
|
|
8
|
+
import { invalidateIndex } from "./index-store.js";
|
|
9
|
+
/**
 * Selects up to `limit` rows of one source that still need an embedding.
 *
 * A row qualifies when `embedding` or `embedding_updated_at` is NULL, or when
 * `embedding_updated_at` sorts before the row's update timestamp. Rows are
 * ordered by update timestamp, newest first.
 *
 * @param {string} source "zmind" or "gerrit"; any other value selects from
 *        confluence_pages
 * @param {number} limit bound to the SQL LIMIT placeholder
 * @returns rows shaped `{ _key, text }`, where `text` is the title-like and
 *          body-like columns joined by a newline
 *
 * NOTE(review): the staleness test is a plain SQL comparison of the two
 * timestamp columns; it assumes both are stored in the same format. Verify
 * against the sync writers.
 */
function selectPending(source, limit) {
    let sql;
    switch (source) {
        case "zmind":
            sql = `SELECT id AS _key,
COALESCE(subject, '') || char(10) || COALESCE(description, '') AS text
FROM zmind_issues
WHERE embedding IS NULL
OR embedding_updated_at IS NULL
OR (updated_on IS NOT NULL AND embedding_updated_at < updated_on)
ORDER BY updated_on DESC
LIMIT ?`;
            break;
        case "gerrit":
            sql = `SELECT change_id AS _key,
COALESCE(subject, '') || char(10) || COALESCE(commit_message, '') AS text
FROM gerrit_changes
WHERE embedding IS NULL
OR embedding_updated_at IS NULL
OR (updated IS NOT NULL AND embedding_updated_at < updated)
ORDER BY updated DESC
LIMIT ?`;
            break;
        default:
            sql = `SELECT id AS _key,
COALESCE(title, '') || char(10) || COALESCE(body_text, '') AS text
FROM confluence_pages
WHERE embedding IS NULL
OR embedding_updated_at IS NULL
OR (updated IS NOT NULL AND embedding_updated_at < updated)
ORDER BY updated DESC
LIMIT ?`;
    }
    return getDb().prepare(sql).all(limit);
}
|
|
46
|
+
/**
 * Writes embeddings back to one source table inside a single transaction.
 *
 * @param {string} source "zmind" or "gerrit"; any other value targets
 *        confluence_pages
 * @param rows iterable of `{ blob, ts, key }`: blob goes to `embedding`, ts
 *        to `embedding_updated_at`, key identifies the row
 */
function updateBlobs(source, rows) {
    const db = getDb();
    let sql = "UPDATE confluence_pages SET embedding = ?, embedding_updated_at = ? WHERE id = ?";
    if (source === "zmind") {
        sql = "UPDATE zmind_issues SET embedding = ?, embedding_updated_at = ? WHERE id = ?";
    }
    else if (source === "gerrit") {
        sql = "UPDATE gerrit_changes SET embedding = ?, embedding_updated_at = ? WHERE change_id = ?";
    }
    const writer = db.prepare(sql);
    runInTransaction(db, () => {
        for (const { blob, ts, key } of rows) {
            writer.run(blob, ts, key);
        }
    });
}
|
|
61
|
+
/**
 * Embeds one batch of pending rows for a source and stores the vectors.
 *
 * Steps: select pending rows, embed their text, write blob + timestamp back
 * in one transaction, then invalidate the in-memory index of that source.
 *
 * @param args `{ source, batch_size? }`. `batch_size` defaults to 200, is
 *        truncated to an integer and clamped to 1..5000; a value that is not
 *        a finite number falls back to 200 instead of reaching SQL as NaN.
 * @returns `{ source, embedded, total_pending }`. Both counts are the size of
 *          the batch processed by this call (0 when nothing was pending);
 *          `total_pending` is not a table-wide count.
 * @throws {Error} when the embedder returns a different number of vectors
 *         than texts; also propagates embedder and database errors.
 */
export async function embedPending(args) {
    const requested = Number(args.batch_size ?? 200);
    const limit = Number.isFinite(requested)
        ? Math.max(1, Math.min(5000, Math.trunc(requested)))
        : 200;
    const pending = selectPending(args.source, limit);
    if (pending.length === 0) {
        return { source: args.source, embedded: 0, total_pending: 0 };
    }
    const texts = pending.map((r) => r.text);
    const vectors = await embedTexts(texts);
    if (vectors.length !== pending.length) {
        // Message restored from mojibake ("returned N items, expected M").
        throw new Error(`embedTexts 返回 ${vectors.length} 项,期望 ${pending.length}`);
    }
    // One shared timestamp for the whole batch.
    const ts = new Date().toISOString();
    const writeRows = pending.map((r, i) => ({
        key: r._key,
        blob: vectorToBlob(vectors[i]),
        ts,
    }));
    updateBlobs(args.source, writeRows);
    invalidateIndex(args.source);
    return { source: args.source, embedded: pending.length, total_pending: pending.length };
}
|
package/dist/embedder.js
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 嵌入器:单例 ONNX 模型 + batch encode。
|
|
3
|
+
*
|
|
4
|
+
* 默认模型 BAAI/bge-small-zh-v1.5(中文友好,512 dim,~80MB)。
|
|
5
|
+
* 通过 @xenova/transformers 在 Node 端运行 ONNX,无需 GPU。
|
|
6
|
+
*
|
|
7
|
+
* 首次调用会下载并缓存模型到 KNOWLEDGE_MODEL_CACHE_DIR(默认 ./data/models)。
|
|
8
|
+
*/
|
|
9
|
+
import { config } from "./config.js";
|
|
10
|
+
// Cached promise of the lazily built embedder; null until first requested.
let _embedderPromise = null;

/**
 * Loads the feature-extraction pipeline and returns a batch encode function.
 *
 * Before loading, OMP_NUM_THREADS / MKL_NUM_THREADS are set from
 * `config.embeddingThreads`, and the library's cache directory is pointed at
 * `config.modelCacheDir` with remote model download enabled.
 *
 * @returns {Promise<Function>} async `(texts) => Float32Array[]`, one vector
 *          per input text; non-string inputs are embedded as the empty string
 *          and every text is cut to `config.maxTextChars` characters. The
 *          function throws when the model's output dimension differs from
 *          `config.embeddingDim`.
 */
async function buildEmbedder() {
    // Cap runtime threads so a small machine is not saturated.
    const threadCount = String(config.embeddingThreads);
    process.env.OMP_NUM_THREADS = threadCount;
    process.env.MKL_NUM_THREADS = threadCount;
    const transformers = await import("@xenova/transformers");
    if (transformers?.env) {
        transformers.env.cacheDir = config.modelCacheDir;
        transformers.env.allowRemoteModels = true;
    }
    const { pipeline } = transformers;
    // Quantized weights keep memory usage down.
    const extractor = await pipeline("feature-extraction", config.embeddingModelId, {
        quantized: true,
    });
    return async function encodeBatch(texts) {
        if (texts.length === 0) {
            return [];
        }
        // Truncate up front so inputs stay within the model's length limit.
        const trimmed = texts.map((t) => (typeof t === "string" ? t.slice(0, config.maxTextChars) : ""));
        const { dims, data } = await extractor(trimmed, { pooling: "mean", normalize: true });
        // `data` is read as a flat row-major [N, dim] float32 buffer.
        const dim = dims[dims.length - 1];
        if (dim !== config.embeddingDim) {
            throw new Error(`模型输出维度 ${dim} 与 config.embeddingDim ${config.embeddingDim} 不一致;请调整 KNOWLEDGE_EMBEDDING_DIM`);
        }
        const bytesPerRow = dim * 4;
        // Each row is copied into its own ArrayBuffer so later batches cannot
        // overwrite vectors that were already handed out.
        return trimmed.map((_, row) => {
            const begin = data.byteOffset + row * bytesPerRow;
            return new Float32Array(data.buffer.slice(begin, begin + bytesPerRow));
        });
    };
}
|
|
48
|
+
/**
 * Returns the shared embedder, building it on first use.
 *
 * Concurrent callers share one in-flight build. If the build rejects, the
 * cached promise is cleared so the next call retries; previously the rejected
 * promise stayed cached and every later call failed until restart.
 *
 * @returns {Promise<Function>} the function produced by `buildEmbedder`
 * @throws whatever `buildEmbedder` rejects with
 */
export async function getEmbedder() {
    if (!_embedderPromise) {
        const pending = buildEmbedder();
        _embedderPromise = pending;
        pending.catch(() => {
            // Only clear the slot if it still holds this failed attempt.
            if (_embedderPromise === pending) {
                _embedderPromise = null;
            }
        });
    }
    return _embedderPromise;
}
|
|
53
|
+
/**
 * Embeds a list of texts, calling the embedder on consecutive slices of at
 * most `config.embeddingBatchSize` items (minimum slice size 1).
 *
 * @param {Array} texts inputs, in order
 * @returns {Promise<Array>} the vectors of all slices concatenated in input
 *          order; an empty input returns [] without loading the embedder
 */
export async function embedTexts(texts) {
    if (texts.length === 0) {
        return [];
    }
    const embed = await getEmbedder();
    const vectors = [];
    const step = Math.max(1, config.embeddingBatchSize);
    let cursor = 0;
    while (cursor < texts.length) {
        const chunk = await embed(texts.slice(cursor, cursor + step));
        for (const vector of chunk) {
            vectors.push(vector);
        }
        cursor += step;
    }
    return vectors;
}
|
|
70
|
+
/**
 * Convenience wrapper: embeds a single text via `embedTexts`.
 *
 * @param text the text to embed
 * @returns {Promise} the first (only) vector returned by `embedTexts`
 */
export async function embedOne(text) {
    const [vector] = await embedTexts([text]);
    return vector;
}
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 进程内向量索引:lazy 加载 + invalidate。
|
|
3
|
+
*
|
|
4
|
+
* 每个 source(zmind / gerrit / confluence)维护一份:
|
|
5
|
+
* - ids: 主键数组
|
|
6
|
+
* - matrix: 单一连续 Float32Array(N × dim 行优先),便于一次循环算 cosine
|
|
7
|
+
* - meta: 主键 → 行元信息映射
|
|
8
|
+
*/
|
|
9
|
+
import { config } from "./config.js";
|
|
10
|
+
import { blobToVector, getDb } from "./db.js";
|
|
11
|
+
// In-memory indexes, keyed by source name.
const _cache = new Map();

/**
 * Drops cached in-memory indexes so the next `loadIndex` call rebuilds them.
 *
 * @param {string} [source] source whose index is dropped; when omitted (or
 *        any falsy value) every cached index is dropped
 */
export function invalidateIndex(source) {
    if (!source) {
        _cache.clear();
        return;
    }
    _cache.delete(source);
}
|
|
18
|
+
/**
 * Returns the in-memory vector index of a source, building it when it is not
 * cached (first call, or first call after `invalidateIndex`).
 *
 * Building reads every embedded row of the source and packs the vectors into
 * one contiguous row-major Float32Array of `count * config.embeddingDim`.
 *
 * @param {string} source source name, also used as the cache key
 * @returns `{ ids, matrix, count, meta }`: `ids[i]` is the key of row i,
 *          `meta` maps a key to that row's metadata
 * @throws propagates `blobToVector` errors (e.g. a dimension mismatch); in
 *         that case nothing is cached
 */
export function loadIndex(source) {
    const hit = _cache.get(source);
    if (hit) {
        return hit;
    }
    const db = getDb();
    const dim = config.embeddingDim;
    const rows = selectRowsWithEmbedding(source, db);
    const count = rows.length;
    const matrix = new Float32Array(count * dim);
    const ids = [];
    const meta = new Map();
    for (let row = 0; row < count; row += 1) {
        const { _key, _meta, embedding } = rows[row];
        matrix.set(blobToVector(embedding, dim), row * dim);
        ids.push(_key);
        meta.set(_key, _meta);
    }
    const built = { ids, matrix, count, meta };
    _cache.set(source, built);
    return built;
}
|
|
42
|
+
/**
 * Loads every row of a source that has a non-NULL embedding and converts it
 * to the `{ _key, _meta, embedding }` shape used to build the index.
 *
 * @param {string} source "zmind" or "gerrit"; any other value reads
 *        confluence_pages
 * @param db connection exposing `prepare(sql).all()`
 * @returns {Array} one entry per row: `_key` is the primary key, `_meta` the
 *          display metadata (title, url, snippet, ...), `embedding` the raw
 *          stored blob
 */
function selectRowsWithEmbedding(source, db) {
    const cut = (text) => snippet(text ?? "", config.snippetMaxChars);
    if (source === "zmind") {
        const issues = db
            .prepare(`SELECT id, subject, description, status, project_name, updated_on, embedding
FROM zmind_issues
WHERE embedding IS NOT NULL`)
            .all();
        return issues.map((issue) => {
            const _meta = {
                source: "zmind",
                id: issue.id,
                title: issue.subject ?? "",
                url: zmindUrl(issue.id),
                snippet: cut(issue.description),
                status: issue.status ?? "",
                project: issue.project_name ?? "",
                updated: issue.updated_on ?? "",
            };
            return { _key: issue.id, _meta, embedding: issue.embedding };
        });
    }
    if (source === "gerrit") {
        const changes = db
            .prepare(`SELECT change_id, number, project, subject, commit_message, status, updated, embedding
FROM gerrit_changes
WHERE embedding IS NOT NULL`)
            .all();
        return changes.map((change) => {
            const _meta = {
                source: "gerrit",
                id: change.change_id,
                title: change.subject ?? "",
                url: gerritUrl(change.project, change.number),
                snippet: cut(change.commit_message),
                status: change.status ?? "",
                project: change.project ?? "",
                updated: change.updated ?? "",
                extra: { number: change.number },
            };
            return { _key: change.change_id, _meta, embedding: change.embedding };
        });
    }
    // Every other source value is treated as confluence.
    const pages = db
        .prepare(`SELECT id, space_key, title, body_text, version, webui, updated, embedding
FROM confluence_pages
WHERE embedding IS NOT NULL`)
        .all();
    return pages.map((page) => {
        const _meta = {
            source: "confluence",
            id: page.id,
            title: page.title ?? "",
            url: confluenceUrl(page.webui),
            snippet: cut(page.body_text),
            space: page.space_key ?? "",
            updated: page.updated ?? "",
            extra: { version: page.version },
        };
        return { _key: page.id, _meta, embedding: page.embedding };
    });
}
|
|
107
|
+
// =============================================================================
|
|
108
|
+
// URL 帮手
|
|
109
|
+
// =============================================================================
|
|
110
|
+
/**
 * Builds the web URL of a Zmind issue.
 *
 * The base comes from the ZMIND_URL environment variable (default
 * "https://zmind.whaletv.com"), with trailing slashes removed.
 */
function zmindUrl(id) {
    const configured = process.env.ZMIND_URL ?? "https://zmind.whaletv.com";
    const root = configured.replace(/\/+$/, "");
    return `${root}/issues/${id}`;
}
|
|
114
|
+
/**
 * Builds the web URL of a Gerrit change as `<base>/c/<project>/+/<num>`.
 *
 * The base comes from the GERRIT_URL environment variable with trailing
 * slashes removed. Returns "" when the base is unset/empty or `num` is falsy.
 */
function gerritUrl(project, num) {
    const root = (process.env.GERRIT_URL ?? "").replace(/\/+$/, "");
    return root && num ? `${root}/c/${project}/+/${num}` : "";
}
|
|
120
|
+
/**
 * Resolves a Confluence `webui` link to a URL.
 *
 * - falsy input: ""
 * - already absolute (http:// or https://, case-insensitive): returned as is
 * - otherwise: prefixed with CONFLUENCE_BASE_URL (trailing slashes removed)
 *   when that variable is set and non-empty, else returned unchanged
 */
function confluenceUrl(webui) {
    if (!webui) {
        return "";
    }
    const isAbsolute = /^https?:\/\//i.test(webui);
    if (isAbsolute) {
        return webui;
    }
    const root = (process.env.CONFLUENCE_BASE_URL ?? "").replace(/\/+$/, "");
    if (root) {
        return root + webui;
    }
    return webui;
}
|
|
128
|
+
/**
 * Produces a one-line preview of `text`: whitespace runs are collapsed to a
 * single space and the ends trimmed; a result longer than `maxChars` is cut
 * to `maxChars` characters, right-trimmed, and suffixed with "…".
 *
 * @param {string} text input text
 * @param {number} maxChars maximum length before the ellipsis is appended
 * @returns {string} the preview
 */
function snippet(text, maxChars) {
    const flattened = text.split(/\s+/).join(" ").trim();
    return flattened.length > maxChars
        ? `${flattened.slice(0, maxChars).trimEnd()}…`
        : flattened;
}
|
|
134
|
+
/**
 * Scores a query vector against every row of a source's index with a dot
 * product and returns the best matches. The vectors are expected to be
 * L2-normalised, in which case the dot product equals cosine similarity.
 *
 * @param {string} source source whose index is searched
 * @param query query vector; its length must equal `config.embeddingDim`
 * @param {number} topK maximum number of results
 * @param {Set} [excludeIds] keys to leave out of the ranking
 * @returns {Array} `[{ id, score, meta }, ...]` in descending score order;
 *          [] when the index is empty
 * @throws {Error} when the query length differs from `config.embeddingDim`
 */
export function vectorTopK(source, query, topK, excludeIds) {
    const index = loadIndex(source);
    if (index.count === 0) {
        return [];
    }
    const dim = config.embeddingDim;
    if (query.length !== dim) {
        throw new Error(`query dim ${query.length} != embedding dim ${dim}`);
    }
    const { ids, matrix, meta, count } = index;
    // Brute-force scan: one dot product per indexed row.
    const scores = new Float32Array(count);
    const candidates = [];
    for (let row = 0; row < count; row += 1) {
        const base = row * dim;
        let dot = 0;
        for (let col = 0; col < dim; col += 1) {
            dot += matrix[base + col] * query[col];
        }
        scores[row] = dot;
        const excluded = excludeIds && excludeIds.has(ids[row]);
        if (!excluded) {
            candidates.push(row);
        }
    }
    candidates.sort((a, b) => scores[b] - scores[a]);
    const limit = Math.min(topK, candidates.length);
    const hits = [];
    for (let rank = 0; rank < limit; rank += 1) {
        const row = candidates[rank];
        const id = ids[row];
        const rowMeta = meta.get(id);
        // A ranked row without metadata is skipped, not replaced.
        if (rowMeta) {
            hits.push({ id, score: scores[row], meta: rowMeta });
        }
    }
    return hits;
}
|