@softerist/heuristic-mcp 3.2.1 → 3.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/features/ann-config.js +26 -15
- package/features/index-codebase.js +17 -11
- package/features/lifecycle.js +83 -9
- package/features/register.js +11 -11
- package/features/set-workspace.js +11 -7
- package/index.js +95 -22
- package/lib/cache.js +79 -30
- package/lib/constants.js +8 -7
- package/lib/vector-store-binary.js +599 -43
- package/package.json +1 -1
|
@@ -2,6 +2,8 @@ import fs from 'fs/promises';
|
|
|
2
2
|
import fsSync from 'fs';
|
|
3
3
|
import path from 'path';
|
|
4
4
|
import os from 'os';
|
|
5
|
+
import crypto from 'crypto';
|
|
6
|
+
import { crc32 } from 'zlib';
|
|
5
7
|
import {
|
|
6
8
|
BINARY_STORE_VERSION as STORE_VERSION,
|
|
7
9
|
BINARY_VECTOR_HEADER_SIZE as VECTOR_HEADER_SIZE,
|
|
@@ -18,24 +20,376 @@ const VECTORS_FILE = 'vectors.bin';
|
|
|
18
20
|
// File names of the artifacts that together form one binary store snapshot.
const RECORDS_FILE = 'records.bin';
const CONTENT_FILE = 'content.bin';
const FILES_FILE = 'files.json';

// Telemetry counters for the store are persisted under this file name.
const TELEMETRY_FILE = 'binary-store-telemetry.json';

// Error codes for which a failed rename is treated as retryable.
const RETRYABLE_RENAME_ERRORS = new Set(['EPERM', 'EACCES', 'EBUSY']);

// Base names whose `<name>.tmp-*` / `<name>.bak-*` siblings are temp artifacts.
const BINARY_ARTIFACT_BASE_FILES = [VECTORS_FILE, RECORDS_FILE, CONTENT_FILE, FILES_FILE];

// Default minimum age (two minutes) before a temp artifact may be removed.
const STARTUP_TMP_CLEANUP_MIN_AGE_MS = 2 * 60 * 1000;

// Version stamp written into the telemetry file.
const TELEMETRY_VERSION = 1;
|
|
28
|
+
|
|
29
|
+
/**
 * Build a fresh set of telemetry counters, every one initialised to zero.
 *
 * A new object is created on each call, so callers may mutate the result.
 *
 * @returns {Record<string, number>} Zeroed counters keyed by metric name.
 */
function createTelemetryTotals() {
  // Order matters: it is the key order of the returned object.
  const counterNames = [
    'atomicReplaceAttempts',
    'atomicReplaceSuccesses',
    'atomicReplaceFailures',
    'renameRetryCount',
    'fallbackCopyCount',
    'rollbackCount',
    'rollbackRestoreFailureCount',
    'startupCleanupRuns',
    'staleTempFilesRemoved',
    'staleTempFilesSkippedActive',
    'corruptionDetected',
    'corruptionAutoCleared',
    'corruptionSecondaryReadonlyBlocked',
  ];
  const totals = {};
  for (const counterName of counterNames) {
    totals[counterName] = 0;
  }
  return totals;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Coerce an arbitrary (possibly missing or malformed) telemetry value into
 * the canonical telemetry shape.
 *
 * Known counters are copied only when they are finite numbers; everything
 * else falls back to zero. String fields fall back to `null`, and the
 * `last*` detail objects are shallow-copied or replaced by `null`.
 *
 * @param {*} raw - Parsed telemetry content, or `null`/`undefined`.
 * @returns {object} Telemetry object stamped with the current TELEMETRY_VERSION.
 */
function normalizeTelemetry(raw) {
  const isObjectValue = (value) => Boolean(value) && typeof value === 'object';
  const stringOrNull = (value) => (typeof value === 'string' ? value : null);
  const shallowCopyOrNull = (value) => (isObjectValue(value) ? { ...value } : null);

  const totals = createTelemetryTotals();
  const storedTotals = raw?.totals;
  if (isObjectValue(storedTotals)) {
    for (const key of Object.keys(totals)) {
      const stored = storedTotals[key];
      if (Number.isFinite(stored)) {
        totals[key] = stored;
      }
    }
  }

  const storedError = raw?.lastError;
  let lastError = null;
  if (isObjectValue(storedError)) {
    lastError = {
      at: stringOrNull(storedError.at),
      message: stringOrNull(storedError.message),
    };
  }

  return {
    version: TELEMETRY_VERSION,
    totals,
    updatedAt: stringOrNull(raw?.updatedAt),
    lastError,
    lastAtomicReplace: shallowCopyOrNull(raw?.lastAtomicReplace),
    lastStartupCleanup: shallowCopyOrNull(raw?.lastStartupCleanup),
    lastCorruption: shallowCopyOrNull(raw?.lastCorruption),
  };
}
|
|
82
|
+
|
|
83
|
+
/**
 * Load and normalise the telemetry file stored in `cacheDir`.
 *
 * Any failure while reading or parsing the file yields default telemetry
 * instead of an error.
 *
 * @param {string} cacheDir - Directory that holds the telemetry file.
 * @returns {Promise<object>} Normalised telemetry.
 */
async function readTelemetryFile(cacheDir) {
  const telemetryPath = path.join(cacheDir, TELEMETRY_FILE);
  try {
    const fileText = await fs.readFile(telemetryPath, 'utf-8');
    const parsed = JSON.parse(fileText);
    return normalizeTelemetry(parsed);
  } catch {
    // Missing or unreadable telemetry is not an error; use defaults below.
  }
  return normalizeTelemetry(null);
}
|
|
92
|
+
|
|
93
|
+
/**
 * Serialise `telemetry` as pretty-printed JSON into the telemetry file.
 *
 * Directory creation is best-effort; a failure there is ignored and the
 * subsequent write reports the real problem, if any.
 *
 * @param {string} cacheDir - Directory that holds the telemetry file.
 * @param {object} telemetry - Telemetry object to persist.
 * @returns {Promise<void>}
 */
async function writeTelemetryFile(cacheDir, telemetry) {
  const destination = path.join(cacheDir, TELEMETRY_FILE);
  try {
    await fs.mkdir(cacheDir, { recursive: true });
  } catch {
    // Ignored on purpose: writeFile below surfaces any real failure.
  }
  const serialized = JSON.stringify(telemetry, null, 2);
  await fs.writeFile(destination, serialized);
}
|
|
98
|
+
|
|
99
|
+
/**
 * Read-modify-write helper for the telemetry file.
 *
 * Loads the current telemetry, lets `mutate` change it in place, stamps
 * `updatedAt`, and writes it back. Does nothing when `cacheDir` is falsy.
 * Every error (including one thrown by `mutate`) is swallowed.
 *
 * @param {string|null} cacheDir - Directory that holds the telemetry file.
 * @param {(telemetry: object) => void} mutate - In-place mutation callback.
 * @returns {Promise<void>}
 */
async function updateTelemetry(cacheDir, mutate) {
  if (cacheDir) {
    try {
      const snapshot = await readTelemetryFile(cacheDir);
      mutate(snapshot);
      snapshot.updatedAt = new Date().toISOString();
      await writeTelemetryFile(cacheDir, snapshot);
    } catch {
      // Telemetry is best-effort; failures are deliberately ignored.
    }
  }
}
|
|
110
|
+
|
|
111
|
+
/**
 * Report whether a process with the given PID appears to be alive.
 *
 * Sends signal 0 via `process.kill`. An `EPERM` failure is treated as
 * "running"; any other failure as "not running".
 *
 * @param {*} pid - Candidate process id; anything but a positive integer
 *   yields `false`.
 * @returns {boolean}
 */
function isProcessRunning(pid) {
  const isUsablePid = Number.isInteger(pid) && pid > 0;
  if (!isUsablePid) {
    return false;
  }

  let alive;
  try {
    process.kill(pid, 0);
    alive = true;
  } catch (err) {
    alive = err?.code === 'EPERM';
  }
  return alive;
}
|
|
120
|
+
|
|
121
|
+
/**
 * Extract the process id embedded in a temp/backup artifact name.
 *
 * Looks for `.tmp-<digits>` or `.bak-<digits>` followed by `-` or the end of
 * the name.
 *
 * @param {string} fileName - Artifact file name (no directory part needed).
 * @returns {number|null} The parsed PID, or `null` when none is present.
 */
function parsePidFromBinaryArtifact(fileName) {
  const pidDigits = fileName.match(/\.(?:tmp|bak)-(\d+)(?:-|$)/)?.[1];
  if (pidDigits === undefined) {
    return null;
  }
  const parsedPid = Number.parseInt(pidDigits, 10);
  if (!Number.isInteger(parsedPid)) {
    return null;
  }
  return parsedPid;
}
|
|
127
|
+
|
|
128
|
+
/**
 * Tell whether `fileName` is a temp (`.tmp-`) or backup (`.bak-`) sibling of
 * one of the binary store base files.
 *
 * @param {string} fileName - File name to classify.
 * @returns {boolean}
 */
function isBinaryTempArtifact(fileName) {
  for (const baseFile of BINARY_ARTIFACT_BASE_FILES) {
    if (fileName.startsWith(`${baseFile}.tmp-`)) {
      return true;
    }
    if (fileName.startsWith(`${baseFile}.bak-`)) {
      return true;
    }
  }
  return false;
}
|
|
134
|
+
|
|
135
|
+
/**
 * Add `value` to `metrics[key]`, treating a missing/falsy counter as zero.
 *
 * The call is a no-op when `metrics` is falsy or `value` is not a finite
 * number greater than zero.
 *
 * @param {object|null} metrics - Counter object mutated in place.
 * @param {string} key - Counter name.
 * @param {number} [value=1] - Amount to add.
 * @returns {void}
 */
function addToMetric(metrics, key, value = 1) {
  const isPositiveAmount = Number.isFinite(value) && value > 0;
  if (metrics && isPositiveAmount) {
    const current = metrics[key] || 0;
    metrics[key] = current + value;
  }
}
|
|
139
|
+
|
|
140
|
+
/**
 * Public accessor for the binary store telemetry kept in `cacheDir`.
 *
 * @param {string} cacheDir - Directory that holds the telemetry file.
 * @returns {Promise<object>} Normalised telemetry as produced by
 *   `readTelemetryFile`.
 */
export async function readBinaryStoreTelemetry(cacheDir) {
  const telemetry = await readTelemetryFile(cacheDir);
  return telemetry;
}
|
|
143
|
+
|
|
144
|
+
/**
 * Record a corruption event in the telemetry file.
 *
 * Bumps the counter matching `action` (unknown actions bump nothing) and
 * always overwrites `lastCorruption` with the details of this event.
 *
 * @param {string} cacheDir - Directory that holds the telemetry file.
 * @param {object} [details]
 * @param {string|null} [details.message] - Stored only when it is a string.
 * @param {string|null} [details.context] - Stored only when it is a string.
 * @param {string} [details.action='detected'] - One of `'detected'`,
 *   `'auto-cleared'`, `'secondary-readonly-blocked'`.
 * @returns {Promise<void>}
 */
export async function recordBinaryStoreCorruption(
  cacheDir,
  { message = null, context = null, action = 'detected' } = {}
) {
  const applyCorruptionEvent = (telemetry) => {
    const totals = telemetry?.totals;
    if (!totals || typeof totals !== 'object') return;

    switch (action) {
      case 'detected':
        totals.corruptionDetected += 1;
        break;
      case 'auto-cleared':
        totals.corruptionAutoCleared += 1;
        break;
      case 'secondary-readonly-blocked':
        totals.corruptionSecondaryReadonlyBlocked += 1;
        break;
      default:
        break;
    }

    telemetry.lastCorruption = {
      at: new Date().toISOString(),
      action,
      context: typeof context === 'string' ? context : null,
      message: typeof message === 'string' ? message : null,
    };
  };

  await updateTelemetry(cacheDir, applyCorruptionEvent);
}
|
|
163
|
+
|
|
164
|
+
/**
 * Remove stale `.tmp-*` / `.bak-*` binary store artifacts from `cacheDir`.
 *
 * An artifact is left alone when the PID embedded in its name belongs to a
 * running process (counted as `skippedActive`), or when its mtime is younger
 * than `minAgeMs`. Everything else is deleted. The outcome is also recorded
 * in the telemetry file on a best-effort basis.
 *
 * Only deletions that actually succeed are counted in `removed` and listed in
 * `removedFiles`; a file that could not be deleted is left for a later run.
 *
 * @param {string} cacheDir - Directory to scan. If it cannot be listed, an
 *   all-zero result is returned and telemetry is not touched.
 * @param {object} [options]
 * @param {number} [options.minAgeMs] - Minimum age in milliseconds before an
 *   artifact may be removed.
 * @param {{ info: Function }|null} [options.logger] - Optional logger; `info`
 *   is called once when at least one artifact was removed.
 * @returns {Promise<{cacheDir: string, scanned: number, removed: number,
 *   skippedActive: number, removedFiles: string[]}>}
 */
export async function cleanupStaleBinaryArtifacts(
  cacheDir,
  { minAgeMs = STARTUP_TMP_CLEANUP_MIN_AGE_MS, logger = null } = {}
) {
  const result = {
    cacheDir,
    scanned: 0,
    removed: 0,
    skippedActive: 0,
    removedFiles: [],
  };

  let entries = [];
  try {
    entries = await fs.readdir(cacheDir, { withFileTypes: true });
  } catch {
    return result;
  }

  const now = Date.now();
  for (const entry of entries) {
    // Tolerate both Dirent objects and plain string names.
    const fileName = typeof entry === 'string' ? entry : entry?.name;
    if (!fileName) continue;
    const isFileEntry = typeof entry === 'string' ? true : entry?.isFile?.() === true;
    if (!isFileEntry) continue;
    if (!isBinaryTempArtifact(fileName)) continue;
    result.scanned += 1;

    const fullPath = path.join(cacheDir, fileName);
    // The file may vanish between readdir and stat; treat that as "nothing to do".
    const stats = await fs.stat(fullPath).catch(() => null);
    if (!stats) continue;

    const ageMs = now - stats.mtimeMs;
    const ownerPid = parsePidFromBinaryArtifact(fileName);
    if (ownerPid && isProcessRunning(ownerPid)) {
      result.skippedActive += 1;
      continue;
    }
    if (ageMs < minAgeMs) continue;

    try {
      await fs.rm(fullPath, { force: true });
    } catch {
      // Deletion failed (for example the file is locked). Do not report it as
      // removed; a later cleanup run can retry.
      continue;
    }
    result.removed += 1;
    result.removedFiles.push(fileName);
  }

  await updateTelemetry(cacheDir, (telemetry) => {
    telemetry.totals.startupCleanupRuns += 1;
    telemetry.totals.staleTempFilesRemoved += result.removed;
    telemetry.totals.staleTempFilesSkippedActive += result.skippedActive;
    telemetry.lastStartupCleanup = {
      at: new Date().toISOString(),
      scanned: result.scanned,
      removed: result.removed,
      skippedActive: result.skippedActive,
    };
  });

  if (logger && result.removed > 0) {
    logger.info(
      `[Cache] Startup temp cleanup removed ${result.removed} stale artifact(s) from ${cacheDir}`
    );
  }

  return result;
}
|
|
22
229
|
|
|
23
|
-
|
|
230
|
+
/**
 * Check whether an error's `code` is one of the retryable rename codes.
 *
 * @param {*} err - Error-like value; `null`/`undefined` yield `false`.
 * @returns {boolean}
 */
function isRetryableRenameError(err) {
  const errorCode = err?.code;
  return RETRYABLE_RENAME_ERRORS.has(errorCode);
}
|
|
233
|
+
|
|
234
|
+
/**
 * Rename `source` to `target`, retrying retryable failures with a doubling
 * delay that is capped at `maxDelayMs`.
 *
 * @param {string} source - Existing path.
 * @param {string} target - Destination path.
 * @param {object} [options]
 * @param {number} [options.retries=12] - Maximum number of retries.
 * @param {number} [options.delayMs=50] - Delay before the first retry.
 * @param {number} [options.maxDelayMs=1000] - Upper bound for the delay.
 * @returns {Promise<number>} Number of retries that were needed.
 * @throws The last rename error, annotated with `renameRetryCount`, when the
 *   error is not retryable or the retries are exhausted.
 */
async function renameWithRetry(
  source,
  target,
  { retries = 12, delayMs = 50, maxDelayMs = 1000 } = {}
) {
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  let retriesUsed = 0;
  let nextDelay = delayMs;

  for (;;) {
    try {
      await fs.rename(source, target);
      return retriesUsed;
    } catch (err) {
      const outOfRetries = retriesUsed >= retries;
      if (outOfRetries || !isRetryableRenameError(err)) {
        // Let callers account for the retries spent before giving up.
        err.renameRetryCount = retriesUsed;
        throw err;
      }
    }

    await pause(nextDelay);
    retriesUsed += 1;
    nextDelay = Math.min(nextDelay * 2, maxDelayMs);
  }
}
|
|
256
|
+
|
|
257
|
+
/**
 * Resolve to `true` when `fs.access` succeeds for `filePath`, else `false`.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>}
 */
async function pathExists(filePath) {
  let accessible = true;
  try {
    await fs.access(filePath);
  } catch {
    accessible = false;
  }
  return accessible;
}
|
|
265
|
+
|
|
266
|
+
/**
 * Best-effort removal of `filePath`; never rejects.
 *
 * @param {string} filePath - Path to remove.
 * @returns {Promise<void>}
 */
async function removeIfExists(filePath) {
  try {
    await fs.rm(filePath, { force: true });
  } catch {
    // Intentionally ignored: removal is best-effort.
  }
}
|
|
269
|
+
|
|
270
|
+
/**
 * Move `source` onto `target`, preferring rename and falling back to
 * copy-then-delete when the rename keeps failing with a retryable error.
 *
 * Retry and fallback usage is accumulated into `metrics` when provided.
 *
 * @param {string} source - Existing file to promote.
 * @param {string} target - Destination path.
 * @param {object} [renameOptions] - Options forwarded to `renameWithRetry`.
 * @param {object|null} [metrics] - Counter object updated in place
 *   (`renameRetryCount`, `fallbackCopyCount`).
 * @returns {Promise<void>}
 * @throws The rename error when it is not retryable; otherwise, if the copy
 *   fallback also fails, a combined error whose `code` comes from the copy
 *   error (or the rename error) and whose `cause` is the copy error.
 */
async function promoteFileWithFallback(source, target, renameOptions = {}, metrics = null) {
  try {
    const retriesUsed = await renameWithRetry(source, target, renameOptions);
    addToMetric(metrics, 'renameRetryCount', retriesUsed);
    return;
  } catch (renameError) {
    const retriesUsed = Number.isFinite(renameError?.renameRetryCount)
      ? renameError.renameRetryCount
      : 0;
    addToMetric(metrics, 'renameRetryCount', retriesUsed);
    if (!isRetryableRenameError(renameError)) {
      throw renameError;
    }

    try {
      // Rename stayed blocked; copying may still succeed.
      await fs.copyFile(source, target);
      await removeIfExists(source);
      addToMetric(metrics, 'fallbackCopyCount', 1);
      return;
    } catch (copyError) {
      // Keep the underlying error (and its stack) reachable via `cause`.
      const wrapped = new Error(
        `rename failed (${renameError.message}); fallback copy failed (${copyError.message})`,
        { cause: copyError }
      );
      wrapped.code = copyError?.code || renameError?.code;
      throw wrapped;
    }
  }
}
|
|
298
|
+
|
|
299
|
+
/**
 * Replace a group of target files with freshly written source files as one
 * all-or-nothing operation.
 *
 * Steps:
 *   1. Move every existing target to a `.bak-<pid>-<timestamp>` backup.
 *   2. Promote every source onto its target.
 *   3. On any failure, remove the targets already replaced, restore the
 *      backups, delete the leftover sources, and rethrow the original error
 *      (with rollback problems appended to its message).
 *   4. Finally, delete backups that are no longer needed and record the
 *      outcome in telemetry.
 *
 * A backup whose restore failed is NOT deleted: at that point it is the only
 * remaining copy of the original file, so it is left on disk for recovery.
 *
 * @param {Array<{source: string, target: string}>} filePairs - Files to swap in.
 * @param {object} [renameOptions] - Options forwarded to `renameWithRetry`.
 * @returns {Promise<void>}
 * @throws The first error raised while staging backups or promoting sources.
 */
async function replaceFilesAtomically(filePairs, renameOptions = {}) {
  const metrics = createTelemetryTotals();
  metrics.atomicReplaceAttempts = 1;
  const cacheDir = filePairs.length > 0 ? path.dirname(filePairs[0].target) : null;
  const backupSuffix = `.bak-${process.pid}-${Date.now()}`;
  const backups = [];
  const replacedTargets = [];
  // Backup paths that could not be restored and must survive final cleanup.
  const unrestoredBackupPaths = new Set();
  let operationError = null;

  try {
    // Stage current files as backups first. If this fails, nothing is replaced.
    for (const pair of filePairs) {
      if (!(await pathExists(pair.target))) continue;
      const backupPath = `${pair.target}${backupSuffix}`;
      await removeIfExists(backupPath);
      await promoteFileWithFallback(pair.target, backupPath, renameOptions, metrics);
      backups.push({ target: pair.target, backupPath });
    }

    // Replace targets with new temp files.
    for (const pair of filePairs) {
      await promoteFileWithFallback(pair.source, pair.target, renameOptions, metrics);
      replacedTargets.push(pair.target);
    }
    metrics.atomicReplaceSuccesses = 1;
  } catch (error) {
    operationError = error;
    metrics.atomicReplaceFailures = 1;
    metrics.rollbackCount = 1;
    const rollbackErrors = [];

    // Remove any partially replaced files before restoring backups.
    for (const target of [...replacedTargets].reverse()) {
      await removeIfExists(target);
    }

    // Restore original files from backups.
    for (const backup of [...backups].reverse()) {
      try {
        await promoteFileWithFallback(backup.backupPath, backup.target, renameOptions, metrics);
      } catch (restoreErr) {
        unrestoredBackupPaths.add(backup.backupPath);
        rollbackErrors.push(
          `restore ${path.basename(backup.target)} failed: ${restoreErr.message}` +
            ` (original kept as ${path.basename(backup.backupPath)})`
        );
      }
    }
    if (rollbackErrors.length > 0) {
      metrics.rollbackRestoreFailureCount = rollbackErrors.length;
    }

    // Clean up temp files left from this failed write attempt.
    await Promise.all(filePairs.map((pair) => removeIfExists(pair.source)));

    if (rollbackErrors.length > 0) {
      error.message = `${error.message}. Rollback issues: ${rollbackErrors.join('; ')}`;
    }
    throw error;
  } finally {
    // Best-effort cleanup of backup remnants after success/rollback. Backups
    // that failed to restore are skipped so the original data is not lost.
    const disposableBackups = backups.filter(
      (backup) => !unrestoredBackupPaths.has(backup.backupPath)
    );
    await Promise.all(disposableBackups.map((backup) => removeIfExists(backup.backupPath)));
    await updateTelemetry(cacheDir, (telemetry) => {
      telemetry.totals.atomicReplaceAttempts += metrics.atomicReplaceAttempts;
      telemetry.totals.atomicReplaceSuccesses += metrics.atomicReplaceSuccesses;
      telemetry.totals.atomicReplaceFailures += metrics.atomicReplaceFailures;
      telemetry.totals.renameRetryCount += metrics.renameRetryCount;
      telemetry.totals.fallbackCopyCount += metrics.fallbackCopyCount;
      telemetry.totals.rollbackCount += metrics.rollbackCount;
      telemetry.totals.rollbackRestoreFailureCount += metrics.rollbackRestoreFailureCount;
      telemetry.lastAtomicReplace = {
        at: new Date().toISOString(),
        success: metrics.atomicReplaceSuccesses > 0,
        renameRetryCount: metrics.renameRetryCount,
        fallbackCopyCount: metrics.fallbackCopyCount,
        rollbackCount: metrics.rollbackCount,
        rollbackRestoreFailureCount: metrics.rollbackRestoreFailureCount,
      };
      if (operationError) {
        telemetry.lastError = {
          at: new Date().toISOString(),
          message: operationError.message,
        };
      }
    });
  }
}
|
|
384
|
+
|
|
385
|
+
/**
 * Error type raised when the on-disk binary store fails validation.
 * Having a dedicated class lets callers tell corruption apart from other
 * load failures by checking `instanceof` or `name`.
 */
export class BinaryStoreCorruptionError extends Error {
  // Own, enumerable instance property, like the assignment-in-constructor form.
  name = 'BinaryStoreCorruptionError';

  /**
   * @param {string} message - Human-readable description of the corruption.
   */
  constructor(message) {
    super(message);
  }
}
|
|
41
395
|
|
|
@@ -57,13 +411,86 @@ function getDataView(buffer) {
|
|
|
57
411
|
return new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength);
|
|
58
412
|
}
|
|
59
413
|
|
|
414
|
+
/**
 * Produce a random, non-zero integer identifier for one write operation.
 * The same id is stamped into every file written by that operation.
 *
 * @returns {number} Integer from `crypto.randomInt(1, 0xFFFFFFFF)`, i.e. at
 *   least 1 and strictly below 0xFFFFFFFF.
 */
function generateWriteId() {
  const lowestId = 1;
  const upperBoundExclusive = 0xFFFFFFFF;
  return crypto.randomInt(lowestId, upperBoundExclusive);
}
|
|
420
|
+
|
|
421
|
+
/**
|
|
422
|
+
* Compute CRC32 checksum over a buffer.
|
|
423
|
+
*/
|
|
424
|
+
function computeCrc32(buffer, initial) {
|
|
425
|
+
return initial !== undefined ? crc32(buffer, initial) >>> 0 : crc32(buffer) >>> 0;
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
function updateCrc32(checksum, buffer) {
|
|
429
|
+
return crc32(buffer, checksum >>> 0) >>> 0;
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
/**
 * Compute the CRC32 of a byte range read through a file handle, in chunks of
 * at most 1 MiB.
 *
 * @param {object} handle - Object exposing
 *   `read(buffer, offset, length, position)` that resolves to `{ bytesRead }`.
 * @param {number} startOffset - File position of the first byte to checksum.
 * @param {number} totalBytes - Number of bytes to checksum; a non-finite or
 *   non-positive value yields 0 without reading.
 * @returns {Promise<number>} Unsigned 32-bit checksum.
 * @throws {BinaryStoreCorruptionError} When a read returns fewer bytes than
 *   requested.
 */
async function computeHandleCrc32(handle, startOffset, totalBytes) {
  const hasPayload = Number.isFinite(totalBytes) && totalBytes > 0;
  if (!hasPayload) return 0;

  const scratch = Buffer.allocUnsafe(Math.min(1024 * 1024, totalBytes));
  let runningCrc = 0;
  let bytesLeft = totalBytes;
  let filePosition = startOffset;

  // bytesLeft is known to be > 0 on entry, so the body runs at least once.
  do {
    const wanted = Math.min(scratch.length, bytesLeft);
    const { bytesRead } = await handle.read(scratch, 0, wanted, filePosition);
    if (bytesRead !== wanted) {
      throw new BinaryStoreCorruptionError(
        'Binary store content file truncated during CRC validation'
      );
    }
    runningCrc = updateCrc32(runningCrc, scratch.subarray(0, bytesRead));
    bytesLeft -= bytesRead;
    filePosition += bytesRead;
  } while (bytesLeft > 0);

  return runningCrc >>> 0;
}
|
|
455
|
+
|
|
456
|
+
/**
 * Synchronously compute the CRC32 of a byte range of an open file
 * descriptor, in chunks of at most 1 MiB.
 *
 * @param {number} fd - Open file descriptor passed to `fs.readSync`.
 * @param {number} startOffset - File position of the first byte to checksum.
 * @param {number} totalBytes - Number of bytes to checksum; a non-finite or
 *   non-positive value yields 0 without reading.
 * @returns {number} Unsigned 32-bit checksum.
 * @throws {BinaryStoreCorruptionError} When a read returns fewer bytes than
 *   requested.
 */
function computeFdCrc32Sync(fd, startOffset, totalBytes) {
  const hasPayload = Number.isFinite(totalBytes) && totalBytes > 0;
  if (!hasPayload) return 0;

  const scratch = Buffer.allocUnsafe(Math.min(1024 * 1024, totalBytes));
  let runningCrc = 0;
  let bytesLeft = totalBytes;
  let filePosition = startOffset;

  // bytesLeft is known to be > 0 on entry, so the body runs at least once.
  do {
    const wanted = Math.min(scratch.length, bytesLeft);
    const bytesRead = fsSync.readSync(fd, scratch, 0, wanted, filePosition);
    if (bytesRead !== wanted) {
      throw new BinaryStoreCorruptionError(
        'Binary store vectors file truncated during CRC validation'
      );
    }
    runningCrc = updateCrc32(runningCrc, scratch.subarray(0, bytesRead));
    bytesLeft -= bytesRead;
    filePosition += bytesRead;
  } while (bytesLeft > 0);

  return runningCrc >>> 0;
}
|
|
479
|
+
|
|
480
|
+
/**
 * Write a CRC32 value into a file header: 4 bytes, little-endian, at file
 * position 20.
 *
 * @param {object} handle - Object exposing
 *   `write(buffer, offset, length, position)`.
 * @param {number} crcValue - Checksum to store; coerced to unsigned 32-bit.
 * @returns {Promise<void>}
 */
async function writeHeaderCrc(handle, crcValue) {
  const crcFieldPosition = 20;
  const encoded = Buffer.alloc(4);
  const encodedView = new DataView(encoded.buffer, encoded.byteOffset, encoded.byteLength);
  encodedView.setUint32(0, crcValue >>> 0, true);
  await handle.write(encoded, 0, encoded.length, crcFieldPosition);
}
|
|
486
|
+
|
|
60
487
|
function readHeader(buffer, magic, headerSize) {
|
|
61
488
|
if (buffer.length < headerSize) {
|
|
62
|
-
throw new
|
|
489
|
+
throw new BinaryStoreCorruptionError('Binary store header is truncated');
|
|
63
490
|
}
|
|
64
491
|
const actualMagic = readMagic(buffer);
|
|
65
492
|
if (actualMagic !== magic) {
|
|
66
|
-
throw new
|
|
493
|
+
throw new BinaryStoreCorruptionError(`Invalid binary store magic (${actualMagic})`);
|
|
67
494
|
}
|
|
68
495
|
const view = getDataView(buffer);
|
|
69
496
|
const version = view.getUint32(4, true);
|
|
@@ -73,31 +500,37 @@ function readHeader(buffer, magic, headerSize) {
|
|
|
73
500
|
return view;
|
|
74
501
|
}
|
|
75
502
|
|
|
76
|
-
/**
 * Fill in the header of the vectors file.
 *
 * Fields (little-endian uint32) after the magic: version at byte 4, `dim` at
 * 8, `count` at 12, `writeId` at 16, and a zero CRC32 placeholder at 20 that
 * is filled in after the payload has been written. Bytes 24-31 are reserved.
 *
 * @param {Buffer} buffer - Header buffer to write into.
 * @param {number} dim - Vector dimension.
 * @param {number} count - Number of vectors.
 * @param {number} writeId - Identifier of the current write operation.
 * @returns {void}
 */
function writeVectorsHeader(buffer, dim, count, writeId) {
  writeMagic(buffer, MAGIC_VECTORS);
  const header = getDataView(buffer);
  const uint32Fields = [
    [4, STORE_VERSION],
    [8, dim],
    [12, count],
    [16, writeId],
    [20, 0], // CRC32 placeholder
  ];
  for (const [offset, value] of uint32Fields) {
    header.setUint32(offset, value, true);
  }
}
|
|
84
513
|
|
|
85
|
-
/**
 * Fill in the header of the records file.
 *
 * Fields (little-endian uint32) after the magic: version at byte 4, `count`
 * at 8, `fileCount` at 12, `writeId` at 16, and a zero CRC32 placeholder at
 * 20. Bytes 24-31 are reserved.
 *
 * @param {Buffer} buffer - Header buffer to write into.
 * @param {number} count - Number of records.
 * @param {number} fileCount - Number of entries in the file table.
 * @param {number} writeId - Identifier of the current write operation.
 * @returns {void}
 */
function writeRecordsHeader(buffer, count, fileCount, writeId) {
  writeMagic(buffer, MAGIC_RECORDS);
  const header = getDataView(buffer);
  const uint32Fields = [
    [4, STORE_VERSION],
    [8, count],
    [12, fileCount],
    [16, writeId],
    [20, 0], // CRC32 placeholder
  ];
  for (const [offset, value] of uint32Fields) {
    header.setUint32(offset, value, true);
  }
}
|
|
93
524
|
|
|
94
|
-
/**
 * Fill in the header of the content file.
 *
 * After the magic: version (uint32) at byte 4, `totalBytes` as a
 * little-endian uint64 at byte 8, `writeId` (uint32) at 16, and a zero CRC32
 * placeholder at 20. Bytes 24-31 are reserved.
 *
 * @param {Buffer} buffer - Header buffer to write into.
 * @param {number|bigint} totalBytes - Payload size; converted with `BigInt()`.
 * @param {number} writeId - Identifier of the current write operation.
 * @returns {void}
 */
function writeContentHeader(buffer, totalBytes, writeId) {
  writeMagic(buffer, MAGIC_CONTENT);
  const header = getDataView(buffer);
  header.setUint32(4, STORE_VERSION, true);
  header.setBigUint64(8, BigInt(totalBytes), true);
  header.setUint32(16, writeId, true);
  header.setUint32(20, 0, true); // CRC32 placeholder
}
|
|
102
535
|
|
|
103
536
|
function readBigUint(view, offset) {
|
|
@@ -224,16 +657,28 @@ export class BinaryVectorStore {
|
|
|
224
657
|
vectorsBuffer = await fs.readFile(vectorsPath);
|
|
225
658
|
}
|
|
226
659
|
|
|
227
|
-
const vectorsView = readHeader(vectorsBuffer, MAGIC_VECTORS, VECTOR_HEADER_SIZE);
|
|
228
|
-
const dim = vectorsView.getUint32(8, true);
|
|
229
|
-
const count = vectorsView.getUint32(12, true);
|
|
230
|
-
|
|
231
|
-
const
|
|
660
|
+
const vectorsView = readHeader(vectorsBuffer, MAGIC_VECTORS, VECTOR_HEADER_SIZE);
|
|
661
|
+
const dim = vectorsView.getUint32(8, true);
|
|
662
|
+
const count = vectorsView.getUint32(12, true);
|
|
663
|
+
const vectorsWriteId = vectorsView.getUint32(16, true);
|
|
664
|
+
const vectorsExpectedCrc = vectorsView.getUint32(20, true);
|
|
665
|
+
const vectorsPayloadBytes = count * dim * 4;
|
|
666
|
+
|
|
667
|
+
const recordsView = readHeader(recordsBuffer, MAGIC_RECORDS, RECORD_HEADER_SIZE);
|
|
232
668
|
const recordCount = recordsView.getUint32(8, true);
|
|
233
669
|
const fileCount = recordsView.getUint32(12, true);
|
|
670
|
+
const recordsWriteId = recordsView.getUint32(16, true);
|
|
671
|
+
const recordsExpectedCrc = recordsView.getUint32(20, true);
|
|
234
672
|
|
|
235
673
|
if (recordCount !== count) {
|
|
236
|
-
throw new
|
|
674
|
+
throw new BinaryStoreCorruptionError(`Binary store count mismatch (${recordCount} != ${count})`);
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
// Validate writeId consistency between vectors and records
|
|
678
|
+
if (vectorsWriteId !== recordsWriteId) {
|
|
679
|
+
throw new BinaryStoreCorruptionError(
|
|
680
|
+
`Binary store writeId mismatch: vectors=${vectorsWriteId}, records=${recordsWriteId}`
|
|
681
|
+
);
|
|
237
682
|
}
|
|
238
683
|
|
|
239
684
|
contentReadHandle = await fs.open(contentPath, 'r');
|
|
@@ -242,19 +687,110 @@ export class BinaryVectorStore {
|
|
|
242
687
|
const headerBuffer = Buffer.alloc(CONTENT_HEADER_SIZE);
|
|
243
688
|
const { bytesRead } = await contentReadHandle.read(headerBuffer, 0, CONTENT_HEADER_SIZE, 0);
|
|
244
689
|
if (bytesRead < CONTENT_HEADER_SIZE) {
|
|
245
|
-
throw new
|
|
690
|
+
throw new BinaryStoreCorruptionError('Binary store content header is truncated');
|
|
246
691
|
}
|
|
247
692
|
const contentView = readHeader(headerBuffer, MAGIC_CONTENT, CONTENT_HEADER_SIZE);
|
|
248
693
|
totalContentBytes = readBigUint(contentView, 8);
|
|
694
|
+
const contentWriteId = contentView.getUint32(16, true);
|
|
695
|
+
const contentExpectedCrc = contentView.getUint32(20, true);
|
|
249
696
|
const stats = await contentReadHandle.stat();
|
|
250
697
|
const expectedContentSize = CONTENT_HEADER_SIZE + totalContentBytes;
|
|
251
698
|
if (stats.size < expectedContentSize) {
|
|
252
|
-
throw new
|
|
699
|
+
throw new BinaryStoreCorruptionError('Binary store content file truncated');
|
|
253
700
|
}
|
|
254
701
|
|
|
255
|
-
|
|
256
|
-
if (
|
|
257
|
-
throw new
|
|
702
|
+
// Validate writeId consistency across all three files
|
|
703
|
+
if (vectorsWriteId !== contentWriteId) {
|
|
704
|
+
throw new BinaryStoreCorruptionError(
|
|
705
|
+
`Binary store writeId mismatch: vectors=${vectorsWriteId}, content=${contentWriteId}`
|
|
706
|
+
);
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
// Validate CRC32 for records payload
|
|
710
|
+
const recordsPayload = recordsBuffer.subarray(RECORD_HEADER_SIZE);
|
|
711
|
+
const recordsActualCrc = computeCrc32(recordsPayload);
|
|
712
|
+
if (recordsActualCrc !== recordsExpectedCrc) {
|
|
713
|
+
throw new BinaryStoreCorruptionError(
|
|
714
|
+
`Binary store records CRC32 mismatch (expected ${recordsExpectedCrc}, got ${recordsActualCrc})`
|
|
715
|
+
);
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
// Validate CRC32 for vectors payload (only when fully loaded into memory)
|
|
719
|
+
if (!loadVectorsFromDisk) {
|
|
720
|
+
const expectedVectorsSize = VECTOR_HEADER_SIZE + vectorsPayloadBytes;
|
|
721
|
+
if (vectorsBuffer.length < expectedVectorsSize) {
|
|
722
|
+
throw new BinaryStoreCorruptionError('Binary store vectors file truncated');
|
|
723
|
+
}
|
|
724
|
+
const vectorsPayload = vectorsBuffer.subarray(VECTOR_HEADER_SIZE);
|
|
725
|
+
const vectorsActualCrc = computeCrc32(vectorsPayload);
|
|
726
|
+
if (vectorsActualCrc !== vectorsExpectedCrc) {
|
|
727
|
+
throw new BinaryStoreCorruptionError(
|
|
728
|
+
`Binary store vectors CRC32 mismatch (expected ${vectorsExpectedCrc}, got ${vectorsActualCrc})`
|
|
729
|
+
);
|
|
730
|
+
}
|
|
731
|
+
} else if (vectorsPayloadBytes > 0) {
|
|
732
|
+
const vectorsStats = fsSync.fstatSync(vectorsFd);
|
|
733
|
+
const expectedVectorsSize = VECTOR_HEADER_SIZE + vectorsPayloadBytes;
|
|
734
|
+
if (vectorsStats.size < expectedVectorsSize) {
|
|
735
|
+
throw new BinaryStoreCorruptionError('Binary store vectors file truncated');
|
|
736
|
+
}
|
|
737
|
+
const vectorsActualCrc = computeFdCrc32Sync(
|
|
738
|
+
vectorsFd,
|
|
739
|
+
VECTOR_HEADER_SIZE,
|
|
740
|
+
vectorsPayloadBytes
|
|
741
|
+
);
|
|
742
|
+
if (vectorsActualCrc !== vectorsExpectedCrc) {
|
|
743
|
+
throw new BinaryStoreCorruptionError(
|
|
744
|
+
`Binary store vectors CRC32 mismatch (expected ${vectorsExpectedCrc}, got ${vectorsActualCrc})`
|
|
745
|
+
);
|
|
746
|
+
}
|
|
747
|
+
} else if (vectorsExpectedCrc !== 0) {
|
|
748
|
+
throw new BinaryStoreCorruptionError(
|
|
749
|
+
`Binary store vectors CRC32 mismatch (expected ${vectorsExpectedCrc}, got 0)`
|
|
750
|
+
);
|
|
751
|
+
}
|
|
752
|
+
|
|
753
|
+
if (totalContentBytes > 0) {
|
|
754
|
+
const contentActualCrc = await computeHandleCrc32(
|
|
755
|
+
contentReadHandle,
|
|
756
|
+
CONTENT_HEADER_SIZE,
|
|
757
|
+
totalContentBytes
|
|
758
|
+
);
|
|
759
|
+
if (contentActualCrc !== contentExpectedCrc) {
|
|
760
|
+
throw new BinaryStoreCorruptionError(
|
|
761
|
+
`Binary store content CRC32 mismatch (expected ${contentExpectedCrc}, got ${contentActualCrc})`
|
|
762
|
+
);
|
|
763
|
+
}
|
|
764
|
+
} else if (contentExpectedCrc !== 0) {
|
|
765
|
+
throw new BinaryStoreCorruptionError(
|
|
766
|
+
`Binary store content CRC32 mismatch (expected ${contentExpectedCrc}, got 0)`
|
|
767
|
+
);
|
|
768
|
+
}
|
|
769
|
+
|
|
770
|
+
const filesData = JSON.parse(filesRaw);
|
|
771
|
+
// Support new format { writeId, files } and legacy raw array
|
|
772
|
+
let files;
|
|
773
|
+
let filesWriteId = null;
|
|
774
|
+
if (filesData && !Array.isArray(filesData) && Array.isArray(filesData.files)) {
|
|
775
|
+
files = filesData.files;
|
|
776
|
+
filesWriteId = filesData.writeId ?? null;
|
|
777
|
+
} else if (Array.isArray(filesData)) {
|
|
778
|
+
files = filesData;
|
|
779
|
+
} else {
|
|
780
|
+
throw new BinaryStoreCorruptionError('Binary store file table is invalid');
|
|
781
|
+
}
|
|
782
|
+
|
|
783
|
+
if (files.length !== fileCount) {
|
|
784
|
+
throw new BinaryStoreCorruptionError(
|
|
785
|
+
`Binary store file table count mismatch (${files.length} != ${fileCount})`
|
|
786
|
+
);
|
|
787
|
+
}
|
|
788
|
+
|
|
789
|
+
// Validate writeId from files.json if present
|
|
790
|
+
if (filesWriteId !== null && filesWriteId !== vectorsWriteId) {
|
|
791
|
+
throw new BinaryStoreCorruptionError(
|
|
792
|
+
`Binary store writeId mismatch: vectors=${vectorsWriteId}, files.json=${filesWriteId}`
|
|
793
|
+
);
|
|
258
794
|
}
|
|
259
795
|
|
|
260
796
|
return new BinaryVectorStore({
|
|
@@ -443,6 +979,7 @@ export class BinaryVectorStore {
|
|
|
443
979
|
getContent,
|
|
444
980
|
getVector,
|
|
445
981
|
preRename,
|
|
982
|
+
renameOptions,
|
|
446
983
|
} = {}
|
|
447
984
|
) {
|
|
448
985
|
ensureLittleEndian();
|
|
@@ -533,7 +1070,9 @@ export class BinaryVectorStore {
|
|
|
533
1070
|
const dim =
|
|
534
1071
|
count > 0 ? (await resolveVector(denseChunks[0], denseSourceIndices[0])).length : 0;
|
|
535
1072
|
|
|
536
|
-
|
|
1073
|
+
const writeId = generateWriteId();
|
|
1074
|
+
|
|
1075
|
+
await fs.writeFile(filesTmp, JSON.stringify({ writeId, files }));
|
|
537
1076
|
|
|
538
1077
|
let vectorsHandle = null;
|
|
539
1078
|
let recordsHandle = null;
|
|
@@ -545,17 +1084,22 @@ export class BinaryVectorStore {
|
|
|
545
1084
|
contentHandle = await fs.open(contentTmp, 'w');
|
|
546
1085
|
|
|
547
1086
|
const vectorsHeader = Buffer.alloc(VECTOR_HEADER_SIZE);
|
|
548
|
-
writeVectorsHeader(vectorsHeader, dim, count);
|
|
1087
|
+
writeVectorsHeader(vectorsHeader, dim, count, writeId);
|
|
549
1088
|
await vectorsHandle.write(vectorsHeader, 0, vectorsHeader.length, 0);
|
|
550
1089
|
|
|
551
1090
|
const recordsHeader = Buffer.alloc(RECORD_HEADER_SIZE);
|
|
552
|
-
writeRecordsHeader(recordsHeader, count, files.length);
|
|
1091
|
+
writeRecordsHeader(recordsHeader, count, files.length, writeId);
|
|
553
1092
|
await recordsHandle.write(recordsHeader, 0, recordsHeader.length, 0);
|
|
554
1093
|
|
|
555
1094
|
const contentHeader = Buffer.alloc(CONTENT_HEADER_SIZE);
|
|
556
|
-
writeContentHeader(contentHeader, contentOffset);
|
|
1095
|
+
writeContentHeader(contentHeader, contentOffset, writeId);
|
|
557
1096
|
await contentHandle.write(contentHeader, 0, contentHeader.length, 0);
|
|
558
1097
|
|
|
1098
|
+
// Incremental CRC32 accumulators (zero-alloc — no read-back needed)
|
|
1099
|
+
let vectorsCrc = 0;
|
|
1100
|
+
let recordsCrc = 0;
|
|
1101
|
+
let contentCrc = 0;
|
|
1102
|
+
|
|
559
1103
|
let vectorPos = VECTOR_HEADER_SIZE;
|
|
560
1104
|
let recordPos = RECORD_HEADER_SIZE;
|
|
561
1105
|
let contentPos = CONTENT_HEADER_SIZE;
|
|
@@ -574,8 +1118,9 @@ export class BinaryVectorStore {
|
|
|
574
1118
|
view.setUint32(24, 0, true);
|
|
575
1119
|
view.setUint32(28, 0, true);
|
|
576
1120
|
|
|
577
|
-
await recordsHandle.write(recordBuffer, 0, recordBuffer.length, recordPos);
|
|
578
|
-
recordPos += recordBuffer.length;
|
|
1121
|
+
await recordsHandle.write(recordBuffer, 0, recordBuffer.length, recordPos);
|
|
1122
|
+
recordPos += recordBuffer.length;
|
|
1123
|
+
recordsCrc = updateCrc32(recordsCrc, recordBuffer);
|
|
579
1124
|
|
|
580
1125
|
const chunk = denseChunks[i];
|
|
581
1126
|
const sourceIndex = denseSourceIndices[i];
|
|
@@ -588,17 +1133,25 @@ export class BinaryVectorStore {
|
|
|
588
1133
|
vector.byteOffset,
|
|
589
1134
|
vector.byteLength
|
|
590
1135
|
);
|
|
591
|
-
await vectorsHandle.write(vectorBuffer, 0, vectorBuffer.length, vectorPos);
|
|
592
|
-
vectorPos += vectorBuffer.length;
|
|
1136
|
+
await vectorsHandle.write(vectorBuffer, 0, vectorBuffer.length, vectorPos);
|
|
1137
|
+
vectorPos += vectorBuffer.length;
|
|
1138
|
+
vectorsCrc = updateCrc32(vectorsCrc, vectorBuffer);
|
|
593
1139
|
|
|
594
1140
|
if (entry.contentLength > 0) {
|
|
595
1141
|
|
|
596
1142
|
const val = await resolveContent(chunk, sourceIndex);
|
|
597
|
-
const contentBuffer = Buffer.from(val, 'utf-8');
|
|
598
|
-
await contentHandle.write(contentBuffer, 0, contentBuffer.length, contentPos);
|
|
599
|
-
contentPos += contentBuffer.length;
|
|
600
|
-
|
|
601
|
-
|
|
1143
|
+
const contentBuffer = Buffer.from(val, 'utf-8');
|
|
1144
|
+
await contentHandle.write(contentBuffer, 0, contentBuffer.length, contentPos);
|
|
1145
|
+
contentPos += contentBuffer.length;
|
|
1146
|
+
contentCrc = updateCrc32(contentCrc, contentBuffer);
|
|
1147
|
+
}
|
|
1148
|
+
}
|
|
1149
|
+
|
|
1150
|
+
if (count > 0) {
|
|
1151
|
+
await writeHeaderCrc(vectorsHandle, vectorsCrc);
|
|
1152
|
+
await writeHeaderCrc(recordsHandle, recordsCrc);
|
|
1153
|
+
}
|
|
1154
|
+
await writeHeaderCrc(contentHandle, contentCrc);
|
|
602
1155
|
} finally {
|
|
603
1156
|
const closes = [];
|
|
604
1157
|
if (vectorsHandle) closes.push(vectorsHandle.close().catch(() => {}));
|
|
@@ -611,12 +1164,15 @@ export class BinaryVectorStore {
|
|
|
611
1164
|
await preRename();
|
|
612
1165
|
}
|
|
613
1166
|
|
|
614
|
-
await
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
1167
|
+
await replaceFilesAtomically(
|
|
1168
|
+
[
|
|
1169
|
+
{ source: vectorsTmp, target: vectorsPath },
|
|
1170
|
+
{ source: recordsTmp, target: recordsPath },
|
|
1171
|
+
{ source: contentTmp, target: contentPath },
|
|
1172
|
+
{ source: filesTmp, target: filesPath },
|
|
1173
|
+
],
|
|
1174
|
+
renameOptions
|
|
1175
|
+
);
|
|
620
1176
|
|
|
621
1177
|
return BinaryVectorStore.load(cacheDir, {
|
|
622
1178
|
contentCacheEntries,
|