@softerist/heuristic-mcp 3.0.15 → 3.0.16
This diff shows the content of publicly available package versions as released to the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registries.
- package/README.md +104 -104
- package/config.jsonc +173 -173
- package/features/ann-config.js +131 -0
- package/features/clear-cache.js +84 -0
- package/features/find-similar-code.js +291 -0
- package/features/hybrid-search.js +544 -0
- package/features/index-codebase.js +3268 -0
- package/features/lifecycle.js +1189 -0
- package/features/package-version.js +302 -0
- package/features/register.js +408 -0
- package/features/resources.js +156 -0
- package/features/set-workspace.js +265 -0
- package/index.js +96 -96
- package/lib/cache-ops.js +22 -22
- package/lib/cache-utils.js +565 -565
- package/lib/cache.js +1870 -1870
- package/lib/call-graph.js +396 -396
- package/lib/cli.js +1 -1
- package/lib/config.js +517 -517
- package/lib/constants.js +39 -39
- package/lib/embed-query-process.js +7 -7
- package/lib/embedding-process.js +7 -7
- package/lib/embedding-worker.js +299 -299
- package/lib/ignore-patterns.js +316 -316
- package/lib/json-worker.js +14 -14
- package/lib/json-writer.js +337 -337
- package/lib/logging.js +164 -164
- package/lib/memory-logger.js +13 -13
- package/lib/onnx-backend.js +193 -193
- package/lib/project-detector.js +84 -84
- package/lib/server-lifecycle.js +165 -165
- package/lib/settings-editor.js +754 -754
- package/lib/tokenizer.js +256 -256
- package/lib/utils.js +428 -428
- package/lib/vector-store-binary.js +627 -627
- package/lib/vector-store-sqlite.js +95 -95
- package/lib/workspace-env.js +28 -28
- package/mcp_config.json +9 -9
- package/package.json +86 -75
- package/scripts/clear-cache.js +20 -0
- package/scripts/download-model.js +43 -0
- package/scripts/mcp-launcher.js +49 -0
- package/scripts/postinstall.js +12 -0
- package/search-configs.js +36 -36
- package/.prettierrc +0 -7
- package/debug-pids.js +0 -30
- package/eslint.config.js +0 -36
- package/specs/plan.md +0 -23
- package/vitest.config.js +0 -39
package/lib/vector-store-binary.js

@@ -1,627 +1,627 @@

The hunk removes all 627 lines and re-adds them; the removed and re-added sides are textually identical, so the rewrite carries no visible source changes and the file is reproduced once below rather than twice.

```js
import fs from 'fs/promises';
import fsSync from 'fs';
import path from 'path';
import os from 'os';
import {
  BINARY_STORE_VERSION as STORE_VERSION,
  BINARY_VECTOR_HEADER_SIZE as VECTOR_HEADER_SIZE,
  BINARY_RECORD_HEADER_SIZE as RECORD_HEADER_SIZE,
  BINARY_CONTENT_HEADER_SIZE as CONTENT_HEADER_SIZE,
  BINARY_RECORD_SIZE as RECORD_SIZE,
} from './constants.js';

const MAGIC_VECTORS = 'HMCV';
const MAGIC_RECORDS = 'HMCR';
const MAGIC_CONTENT = 'HMCC';

const VECTORS_FILE = 'vectors.bin';
const RECORDS_FILE = 'records.bin';
const CONTENT_FILE = 'content.bin';
const FILES_FILE = 'files.json';
const RETRYABLE_RENAME_ERRORS = new Set(['EPERM', 'EACCES', 'EBUSY']);

async function renameWithRetry(source, target, { retries = 5, delayMs = 50 } = {}) {
  let attempt = 0;
  let delay = delayMs;
  while (true) {
    try {
      await fs.rename(source, target);
      return;
    } catch (err) {
      const code = err?.code;
      if (!RETRYABLE_RENAME_ERRORS.has(code) || attempt >= retries) {
        throw err;
      }
      await new Promise((resolve) => setTimeout(resolve, delay));
      attempt += 1;
      delay *= 2;
    }
  }
}

function writeMagic(buffer, magic) {
  buffer.write(magic, 0, 'ascii');
}

function readMagic(buffer) {
  return buffer.toString('ascii', 0, 4);
}

function ensureLittleEndian() {
  if (os.endianness() !== 'LE') {
    throw new Error('Binary vector store requires little-endian architecture');
  }
}

function getDataView(buffer) {
  return new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);
}

function readHeader(buffer, magic, headerSize) {
  if (buffer.length < headerSize) {
    throw new Error('Binary store header is truncated');
  }
  const actualMagic = readMagic(buffer);
  if (actualMagic !== magic) {
    throw new Error(`Invalid binary store magic (${actualMagic})`);
  }
  const view = getDataView(buffer);
  const version = view.getUint32(4, true);
  if (version !== STORE_VERSION) {
    throw new Error(`Unsupported binary store version (${version})`);
  }
  return view;
}

function writeVectorsHeader(buffer, dim, count) {
  writeMagic(buffer, MAGIC_VECTORS);
  const view = getDataView(buffer);
  view.setUint32(4, STORE_VERSION, true);
  view.setUint32(8, dim, true);
  view.setUint32(12, count, true);
  view.setUint32(16, 0, true);
}

function writeRecordsHeader(buffer, count, fileCount) {
  writeMagic(buffer, MAGIC_RECORDS);
  const view = getDataView(buffer);
  view.setUint32(4, STORE_VERSION, true);
  view.setUint32(8, count, true);
  view.setUint32(12, fileCount, true);
  view.setUint32(16, 0, true);
}

function writeContentHeader(buffer, totalBytes) {
  writeMagic(buffer, MAGIC_CONTENT);
  const view = getDataView(buffer);
  view.setUint32(4, STORE_VERSION, true);
  const value = BigInt(totalBytes);
  view.setBigUint64(8, value, true);
  view.setUint32(16, 0, true);
}

function readBigUint(view, offset) {
  const value = view.getBigUint64(offset, true);
  if (value > BigInt(Number.MAX_SAFE_INTEGER)) {
    throw new Error('Binary store content offset exceeds safe integer range');
  }
  return Number(value);
}

function normalizeContent(value) {
  if (value === null || value === undefined) return '';
  if (typeof value !== 'string') return String(value);
  return value;
}

export class BinaryVectorStore {
  constructor({
    vectorsBuffer,
    recordsBuffer,
    vectorsHandle,
    vectorsFd,
    contentHandle,
    contentBuffer,
    contentSize,
    files,
    dim,
    count,
    contentCacheEntries,
    vectorCacheEntries,
  }) {
    this.vectorsBuffer = vectorsBuffer;
    this.recordsBuffer = recordsBuffer;
    this.vectorsHandle = vectorsHandle ?? null;
    this.vectorsFd = Number.isInteger(vectorsFd) ? vectorsFd : null;
    this.contentHandle = contentHandle ?? null;
    this.contentBuffer = contentBuffer ?? null;
    this.contentSize = Number.isFinite(contentSize)
      ? contentSize
      : contentBuffer
        ? Math.max(0, contentBuffer.length - CONTENT_HEADER_SIZE)
        : 0;
    this.files = files;
    this.dim = dim;
    this.count = count;
    this.contentCacheEntries = Number.isInteger(contentCacheEntries) ? contentCacheEntries : 256;
    this.contentCache = new Map();
    this.vectorCacheEntries = Number.isInteger(vectorCacheEntries) ? vectorCacheEntries : 0;
    this.vectorCache = new Map();

    this.vectorDataOffset = VECTOR_HEADER_SIZE;
    this.recordDataOffset = RECORD_HEADER_SIZE;
    this.contentDataOffset = CONTENT_HEADER_SIZE;
  }

  async close() {
    this.contentCache.clear();
    this.vectorCache.clear();
    this.vectorsBuffer = null;
    this.recordsBuffer = null;
    this.contentBuffer = null;
    this.files = null;
    if (this.vectorsHandle) {
      try {
        await this.vectorsHandle.close();
      } catch {
        // ignore close errors
      }
    }
    this.vectorsHandle = null;
    if (Number.isInteger(this.vectorsFd)) {
      try {
        fsSync.closeSync(this.vectorsFd);
      } catch {
        // ignore close errors
      }
    }
    this.vectorsFd = null;
    if (this.contentHandle) {
      try {
        await this.contentHandle.close();
      } catch {
        // ignore close errors
      }
    }
    this.contentHandle = null;
  }

  static getPaths(cacheDir) {
    return {
      vectorsPath: path.join(cacheDir, VECTORS_FILE),
      recordsPath: path.join(cacheDir, RECORDS_FILE),
      contentPath: path.join(cacheDir, CONTENT_FILE),
      filesPath: path.join(cacheDir, FILES_FILE),
    };
  }

  static async load(cacheDir, { contentCacheEntries, vectorCacheEntries, vectorLoadMode } = {}) {
    ensureLittleEndian();
    const { vectorsPath, recordsPath, contentPath, filesPath } =
      BinaryVectorStore.getPaths(cacheDir);

    let contentReadHandle = null;
    let vectorsFd = null;

    try {
      const loadVectorsFromDisk = String(vectorLoadMode).toLowerCase() === 'disk';
      let vectorsBuffer = null;

      const [recordsBuffer, filesRaw] = await Promise.all([
        fs.readFile(recordsPath),
        fs.readFile(filesPath, 'utf-8'),
      ]);

      if (loadVectorsFromDisk) {
        vectorsFd = fsSync.openSync(vectorsPath, 'r');
        const headerBuffer = Buffer.alloc(VECTOR_HEADER_SIZE);
        const bytesRead = fsSync.readSync(vectorsFd, headerBuffer, 0, VECTOR_HEADER_SIZE, 0);
        if (bytesRead < VECTOR_HEADER_SIZE) {
          throw new Error('Binary store vectors header is truncated');
        }
        vectorsBuffer = headerBuffer;
      } else {
        vectorsBuffer = await fs.readFile(vectorsPath);
      }

      const vectorsView = readHeader(vectorsBuffer, MAGIC_VECTORS, VECTOR_HEADER_SIZE);
      const dim = vectorsView.getUint32(8, true);
      const count = vectorsView.getUint32(12, true);

      const recordsView = readHeader(recordsBuffer, MAGIC_RECORDS, RECORD_HEADER_SIZE);
      const recordCount = recordsView.getUint32(8, true);
      const fileCount = recordsView.getUint32(12, true);

      if (recordCount !== count) {
        throw new Error(`Binary store count mismatch (${recordCount} != ${count})`);
      }

      contentReadHandle = await fs.open(contentPath, 'r');
      let totalContentBytes = 0;

      const headerBuffer = Buffer.alloc(CONTENT_HEADER_SIZE);
      const { bytesRead } = await contentReadHandle.read(headerBuffer, 0, CONTENT_HEADER_SIZE, 0);
      if (bytesRead < CONTENT_HEADER_SIZE) {
        throw new Error('Binary store content header is truncated');
      }
      const contentView = readHeader(headerBuffer, MAGIC_CONTENT, CONTENT_HEADER_SIZE);
      totalContentBytes = readBigUint(contentView, 8);
      const stats = await contentReadHandle.stat();
      const expectedContentSize = CONTENT_HEADER_SIZE + totalContentBytes;
      if (stats.size < expectedContentSize) {
        throw new Error('Binary store content file truncated');
      }

      const files = JSON.parse(filesRaw);
      if (!Array.isArray(files) || files.length !== fileCount) {
        throw new Error('Binary store file table is invalid');
      }

      return new BinaryVectorStore({
        vectorsBuffer,
        recordsBuffer,
        vectorsHandle: null,
        vectorsFd,
        contentHandle: contentReadHandle,
        contentSize: totalContentBytes,
        files,
        dim,
        count,
        contentCacheEntries,
        vectorCacheEntries,
      });
    } catch (err) {
      if (contentReadHandle) await contentReadHandle.close().catch(() => {});
      if (Number.isInteger(vectorsFd)) {
        try {
          fsSync.closeSync(vectorsFd);
        } catch {
          // ignore close errors
        }
      }
      throw err;
    }
  }

  get length() {
    return this.count;
  }

  getRecord(index) {
    if (index < 0 || index >= this.count) return null;
    const offset = this.recordDataOffset + index * RECORD_SIZE;
    const view = getDataView(this.recordsBuffer);

    const fileId = view.getUint32(offset, true);
    const startLine = view.getUint32(offset + 4, true);
    const endLine = view.getUint32(offset + 8, true);
    const contentOffset = readBigUint(view, offset + 12);
    const contentLength = view.getUint32(offset + 20, true);

    return {
      fileId,
      file: this.files[fileId],
      startLine,
      endLine,
      contentOffset,
      contentLength,
    };
  }

  getVector(index) {
    if (index < 0 || index >= this.count) return null;
    if (this.vectorCacheEntries > 0) {
      const cached = this.vectorCache.get(index);
      if (cached) {
        this.vectorCache.delete(index);
        this.vectorCache.set(index, cached);
        return cached;
      }
    }

    const offset = this.vectorDataOffset + index * this.dim * 4;
    const byteLength = this.dim * 4;
    let vector = null;

    if (this.vectorsBuffer && this.vectorsBuffer.length >= this.vectorDataOffset + byteLength) {
      vector = new Float32Array(
        this.vectorsBuffer.buffer,
        this.vectorsBuffer.byteOffset + offset,
        this.dim
      );
    } else if (Number.isInteger(this.vectorsFd)) {
      // Use Buffer.alloc (not allocUnsafe) for safety - prevents potential
      // information leak if read is partial or fails silently
      const buffer = Buffer.alloc(byteLength);
      const bytesRead = fsSync.readSync(this.vectorsFd, buffer, 0, byteLength, offset);
      if (bytesRead === byteLength) {
        vector = new Float32Array(buffer.buffer, buffer.byteOffset, this.dim);
      }
    }

    if (vector && this.vectorCacheEntries > 0) {
      this.vectorCache.set(index, vector);
      if (this.vectorCache.size > this.vectorCacheEntries) {
        const firstKey = this.vectorCache.keys().next().value;
        this.vectorCache.delete(firstKey);
      }
    }

    return vector;
  }

  async getContent(index) {
    if (index < 0 || index >= this.count) return null;
    if (this.contentCacheEntries > 0) {
      const cached = this.contentCache.get(index);
      if (cached !== undefined) {
        this.contentCache.delete(index);
        this.contentCache.set(index, cached);
        return cached;
      }
    }

    const record = this.getRecord(index);
    if (!record || record.contentLength === 0) return '';
    const contentLimit = record.contentOffset + record.contentLength;
    if (Number.isFinite(this.contentSize) && contentLimit > this.contentSize) {
      return '';
    }

    let content = '';
    if (this.contentBuffer) {
      const start = this.contentDataOffset + record.contentOffset;
      const end = start + record.contentLength;
      content = this.contentBuffer.slice(start, end).toString('utf-8');
    } else if (this.contentHandle) {
      const start = this.contentDataOffset + record.contentOffset;
      const length = record.contentLength;
      const buffer = Buffer.alloc(length);
      const { bytesRead } = await this.contentHandle.read(buffer, 0, length, start);
      content = buffer.slice(0, bytesRead).toString('utf-8');
    } else {
      return '';
    }

    if (this.contentCacheEntries > 0) {
      this.contentCache.set(index, content);
      if (this.contentCache.size > this.contentCacheEntries) {
        const firstKey = this.contentCache.keys().next().value;
        this.contentCache.delete(firstKey);
      }
    }

    return content;
  }

  async toChunkViews({ includeContent = false, includeVector = true } = {}) {
    const chunks = new Array(this.count);
    for (let i = 0; i < this.count; i += 1) {
      const record = this.getRecord(i);
      if (!record) continue;
      const chunk = {
        file: record.file,
        startLine: record.startLine,
        endLine: record.endLine,
        _index: i,
        _binaryIndex: i,
      };
      if (includeVector) {
        chunk.vector = this.getVector(i);
      }
      if (includeContent) {
        chunk.content = await this.getContent(i);
      }
      chunks[i] = chunk;
    }
    return chunks;
  }

  getAllFileIndices() {
    const map = new Map();
    for (let i = 0; i < this.count; i++) {
      const record = this.getRecord(i);
      if (record) {
        let list = map.get(record.file);
        if (!list) {
          list = [];
          map.set(record.file, list);
        }
        list.push(i);
      }
    }
    return map;
  }

  static async write(
    cacheDir,
    chunks,
    {
      contentCacheEntries,
      vectorCacheEntries,
      vectorLoadMode,
      getContent,
      getVector,
      preRename,
    } = {}
  ) {
    ensureLittleEndian();
    const { vectorsPath, recordsPath, contentPath, filesPath } =
      BinaryVectorStore.getPaths(cacheDir);

    const tmpSuffix = `.tmp-${process.pid}`;
    const vectorsTmp = `${vectorsPath}${tmpSuffix}`;
    const recordsTmp = `${recordsPath}${tmpSuffix}`;
    const contentTmp = `${contentPath}${tmpSuffix}`;
    const filesTmp = `${filesPath}${tmpSuffix}`;

    const fileIds = new Map();
    const files = [];
    const denseChunks = [];
    const denseSourceIndices = [];
    for (let i = 0; i < chunks.length; i += 1) {
      const chunk = chunks[i];
      if (!chunk) continue;
      denseChunks.push(chunk);
      denseSourceIndices.push(i);
    }

    const resolveVector = async (chunk, sourceIndex) => {
      let vectorSource = chunk.vector;
      if (
        (vectorSource === undefined || vectorSource === null) &&
        typeof getVector === 'function'
      ) {
        vectorSource = getVector(chunk, sourceIndex);
        if (vectorSource && typeof vectorSource.then === 'function') {
          vectorSource = await vectorSource;
        }
      }
      if (vectorSource === undefined || vectorSource === null) {
        throw new Error(`Missing vector data for binary cache write at index ${sourceIndex}`);
      }
      const vector =
        vectorSource instanceof Float32Array
          ? vectorSource
          : ArrayBuffer.isView(vectorSource)
            ? Float32Array.from(vectorSource)
            : new Float32Array(vectorSource);
      if (!vector || vector.length === 0) {
        throw new Error(`Empty vector data for binary cache write at index ${sourceIndex}`);
      }
      return vector;
    };

    const resolveContent = async (chunk, sourceIndex) => {
      const contentSource =
        chunk.content !== undefined && chunk.content !== null
          ? chunk.content
          : getContent
            ? await getContent(chunk, sourceIndex)
            : '';
      return normalizeContent(contentSource);
    };

    const recordEntries = new Array(denseChunks.length);
    let contentOffset = 0;

    for (let i = 0; i < denseChunks.length; i += 1) {
      const chunk = denseChunks[i];
      const sourceIndex = denseSourceIndices[i];

      const file = chunk.file;
      if (!fileIds.has(file)) {
        fileIds.set(file, files.length);
        files.push(file);
      }

      const contentValue = await resolveContent(chunk, sourceIndex);
      const contentLength = Buffer.byteLength(contentValue, 'utf-8');

      recordEntries[i] = {
        fileId: fileIds.get(file),
        startLine: chunk.startLine ?? 0,
        endLine: chunk.endLine ?? 0,
        contentOffset,
        contentLength,
      };

      contentOffset += contentLength;
    }

    const count = denseChunks.length;
    const dim =
      count > 0 ? (await resolveVector(denseChunks[0], denseSourceIndices[0])).length : 0;

    await fs.writeFile(filesTmp, JSON.stringify(files));

    let vectorsHandle = null;
    let recordsHandle = null;
    let contentHandle = null;

    try {
      vectorsHandle = await fs.open(vectorsTmp, 'w');
      recordsHandle = await fs.open(recordsTmp, 'w');
      contentHandle = await fs.open(contentTmp, 'w');

      const vectorsHeader = Buffer.alloc(VECTOR_HEADER_SIZE);
      writeVectorsHeader(vectorsHeader, dim, count);
      await vectorsHandle.write(vectorsHeader, 0, vectorsHeader.length, 0);

      const recordsHeader = Buffer.alloc(RECORD_HEADER_SIZE);
      writeRecordsHeader(recordsHeader, count, files.length);
      await recordsHandle.write(recordsHeader, 0, recordsHeader.length, 0);

      const contentHeader = Buffer.alloc(CONTENT_HEADER_SIZE);
      writeContentHeader(contentHeader, contentOffset);
      await contentHandle.write(contentHeader, 0, contentHeader.length, 0);

      let vectorPos = VECTOR_HEADER_SIZE;
      let recordPos = RECORD_HEADER_SIZE;
      let contentPos = CONTENT_HEADER_SIZE;

      for (let i = 0; i < count; i += 1) {
        const entry = recordEntries[i];
        if (!entry) continue;

        const recordBuffer = Buffer.alloc(RECORD_SIZE);
        const view = getDataView(recordBuffer);
        view.setUint32(0, entry.fileId, true);
        view.setUint32(4, entry.startLine, true);
        view.setUint32(8, entry.endLine, true);
        view.setBigUint64(12, BigInt(entry.contentOffset), true);
        view.setUint32(20, entry.contentLength, true);
        view.setUint32(24, 0, true);
        view.setUint32(28, 0, true);

        await recordsHandle.write(recordBuffer, 0, recordBuffer.length, recordPos);
        recordPos += recordBuffer.length;

        const chunk = denseChunks[i];
        const sourceIndex = denseSourceIndices[i];
        const vector = await resolveVector(chunk, sourceIndex);
        if (vector.length !== dim) {
          throw new Error('Vector dimension mismatch in binary cache write');
        }
        const vectorBuffer = Buffer.from(
          vector.buffer,
          vector.byteOffset,
          vector.byteLength
        );
        await vectorsHandle.write(vectorBuffer, 0, vectorBuffer.length, vectorPos);
        vectorPos += vectorBuffer.length;

        if (entry.contentLength > 0) {
          // Re-fetch content to avoid holding all strings in memory
          const val = await resolveContent(chunk, sourceIndex);
          const contentBuffer = Buffer.from(val, 'utf-8');
          await contentHandle.write(contentBuffer, 0, contentBuffer.length, contentPos);
          contentPos += contentBuffer.length;
        }
      }
    } finally {
      const closes = [];
      if (vectorsHandle) closes.push(vectorsHandle.close().catch(() => {}));
      if (recordsHandle) closes.push(recordsHandle.close().catch(() => {}));
      if (contentHandle) closes.push(contentHandle.close().catch(() => {}));
      await Promise.all(closes);
    }

    if (preRename) {
      await preRename();
    }

    await Promise.all([
      renameWithRetry(vectorsTmp, vectorsPath),
      renameWithRetry(recordsTmp, recordsPath),
      renameWithRetry(contentTmp, contentPath),
      renameWithRetry(filesTmp, filesPath),
    ]);

    return BinaryVectorStore.load(cacheDir, {
      contentCacheEntries,
      vectorCacheEntries,
      vectorLoadMode,
    });
  }
}
```
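For orientation, the on-disk layouts implied by the reader and writer above are sketched below. The concrete sizes (VECTOR_HEADER_SIZE, RECORD_HEADER_SIZE, CONTENT_HEADER_SIZE, RECORD_SIZE) live in constants.js, which this diff does not show, so the byte totals here are inferences from the field offsets in the code; the real constants may pad the headers further.

```js
// Inferred on-disk layouts (all integers little-endian). Offsets are taken
// from the code above; total sizes are assumptions since constants.js is
// not part of this diff.
//
// vectors.bin header: 'HMCV' | version u32 | dim u32   | count u32     | pad u32  (>= 20 bytes)
// records.bin header: 'HMCR' | version u32 | count u32 | fileCount u32 | pad u32  (>= 20 bytes)
// content.bin header: 'HMCC' | version u32 | totalBytes u64            | pad u32  (>= 20 bytes)
//
// One records.bin record (inferred RECORD_SIZE = 32 bytes):
//   offset  0  u32    fileId         index into files.json
//   offset  4  u32    startLine
//   offset  8  u32    endLine
//   offset 12  u64    contentOffset  byte offset into the content.bin payload
//   offset 20  u32    contentLength  UTF-8 byte length
//   offset 24  2xu32  reserved       written as zeros
```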
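A minimal usage sketch follows, assuming a little-endian machine (write() and load() both enforce this). The relative import path and the '.cache' directory are assumptions for illustration; the option names and chunk fields come straight from the source above, not from the package README.

```js
// Sketch: write one chunk to a cache directory, then read it back.
import fs from 'fs/promises';
import { BinaryVectorStore } from './lib/vector-store-binary.js';

// write() opens its .tmp-<pid> files inside cacheDir, so the directory
// must exist first.
await fs.mkdir('.cache', { recursive: true });

const chunks = [
  {
    file: 'src/app.js',
    startLine: 1,
    endLine: 3,
    content: 'export const hello = () => "hi";',
    vector: new Float32Array([0.1, 0.2, 0.3]),
  },
];

// write() streams vectors.bin / records.bin / content.bin / files.json to
// temp files, renames them into place, then returns a freshly loaded store.
const store = await BinaryVectorStore.write('.cache', chunks, {
  vectorLoadMode: 'memory', // any value other than 'disk' reads vectors.bin fully into memory
});

console.log(store.length);              // 1
console.log(store.getVector(0));        // Float32Array(3) [~0.1, ~0.2, ~0.3] (float32 rounding)
console.log(await store.getContent(0)); // 'export const hello = () => "hi";'

await store.close(); // releases the content.bin handle (and the vectors fd in 'disk' mode)
```

The temp-file-then-rename pattern, with renameWithRetry backing off on EPERM/EACCES/EBUSY, keeps a concurrent reader from ever observing a half-written store; the retry set targets the transient rename failures Windows produces when a destination file is briefly held open.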