@soulcraft/cortex 2.7.0 → 2.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +0 -0
- package/dist/hnsw/NativeHNSWWrapper.d.ts +7 -0
- package/dist/hnsw/NativeHNSWWrapper.js +26 -8
- package/package.json +1 -1
- package/dist/hnsw/NativeDiskAnnWrapper.d.ts +0 -161
- package/dist/hnsw/NativeDiskAnnWrapper.js +0 -329
- package/dist/utils/nativeBinaryEntityIdMapper.d.ts +0 -194
- package/dist/utils/nativeBinaryEntityIdMapper.js +0 -358
package/dist/cli.js
CHANGED
|
File without changes
|
|
@@ -37,6 +37,13 @@ export declare class NativeHNSWWrapper implements HnswProvider {
|
|
|
37
37
|
storage?: StorageAdapter | null;
|
|
38
38
|
persistMode?: 'immediate' | 'deferred';
|
|
39
39
|
});
|
|
40
|
+
/**
|
|
41
|
+
* Resolve the storage root directory for the `_hnsw.bin` mmap file. brainy
|
|
42
|
+
* >= 7.31.8 exposes a public `rootDirectory` getter; earlier 7.x kept the
|
|
43
|
+
* value in the (compile-time-protected, runtime-present) `rootDir` field.
|
|
44
|
+
* Accept either so the mmap fast-path engages on every filesystem brainy.
|
|
45
|
+
*/
|
|
46
|
+
private resolveRootDir;
|
|
40
47
|
addItem(item: VectorDocument): Promise<string>;
|
|
41
48
|
/**
|
|
42
49
|
* Batch insert — inserts all items in a single native call, then persists
|
|
@@ -56,8 +56,11 @@ export class NativeHNSWWrapper {
|
|
|
56
56
|
ml: this.config.ml,
|
|
57
57
|
};
|
|
58
58
|
this.native = new bindings.NativeHNSWIndex(nativeConfig);
|
|
59
|
-
// Initialize mmap binary store if storage
|
|
60
|
-
|
|
59
|
+
// Initialize mmap binary store if storage exposes its root directory
|
|
60
|
+
// (filesystem storage). Accept brainy >= 7.31.8's `rootDirectory` getter
|
|
61
|
+
// OR older 7.x's `rootDir` field, so the mmap fast-path engages on every
|
|
62
|
+
// filesystem deployment — not only the newest brainy.
|
|
63
|
+
const rootDir = this.resolveRootDir();
|
|
61
64
|
if (rootDir && bindings.MmapHnswStore) {
|
|
62
65
|
const { join } = require('node:path');
|
|
63
66
|
this.mmapStore = new bindings.MmapHnswStore(join(rootDir, '_hnsw.bin'));
|
|
@@ -65,6 +68,16 @@ export class NativeHNSWWrapper {
|
|
|
65
68
|
prodLog.info(`NativeHNSWWrapper initialized (M=${this.config.M}, ef=${this.config.efSearch}, ` +
|
|
66
69
|
`persist=${this.persistMode}, mmap=${!!this.mmapStore})`);
|
|
67
70
|
}
|
|
71
|
+
/**
|
|
72
|
+
* Resolve the storage root directory for the `_hnsw.bin` mmap file. brainy
|
|
73
|
+
* >= 7.31.8 exposes a public `rootDirectory` getter; earlier 7.x kept the
|
|
74
|
+
* value in the (compile-time-protected, runtime-present) `rootDir` field.
|
|
75
|
+
* Accept either so the mmap fast-path engages on every filesystem brainy.
|
|
76
|
+
*/
|
|
77
|
+
resolveRootDir() {
|
|
78
|
+
const s = this.storage;
|
|
79
|
+
return s?.rootDirectory ?? s?.rootDir ?? undefined;
|
|
80
|
+
}
|
|
68
81
|
// ---------------------------------------------------------------------------
|
|
69
82
|
// Core CRUD
|
|
70
83
|
// ---------------------------------------------------------------------------
|
|
@@ -227,7 +240,14 @@ export class NativeHNSWWrapper {
|
|
|
227
240
|
async flush() {
|
|
228
241
|
if (!this.storage)
|
|
229
242
|
return 0;
|
|
230
|
-
|
|
243
|
+
// The mmap binary is a FULL snapshot of the in-memory graph, independent of
|
|
244
|
+
// the per-node dirty set. In 'immediate' mode (brainy's filesystem default)
|
|
245
|
+
// nothing is ever marked dirty — nodes persist inline on add — so flush()'s
|
|
246
|
+
// real job there is to write that snapshot. It must NOT be skipped just
|
|
247
|
+
// because the dirty set is empty; only fast-return when there is genuinely
|
|
248
|
+
// nothing to write (no dirty data AND no graph to snapshot).
|
|
249
|
+
const needsMmapSnapshot = !!this.mmapStore && this.native.size() > 0;
|
|
250
|
+
if (this.dirtyNodes.size === 0 && !this.dirtySystem && !needsMmapSnapshot)
|
|
231
251
|
return 0;
|
|
232
252
|
const startTime = Date.now();
|
|
233
253
|
const nodeCount = this.dirtyNodes.size;
|
|
@@ -264,7 +284,7 @@ export class NativeHNSWWrapper {
|
|
|
264
284
|
try {
|
|
265
285
|
this.writeMmapFile();
|
|
266
286
|
// Switch to mmap backend — search now reads from kernel page cache
|
|
267
|
-
const rootDir = this.
|
|
287
|
+
const rootDir = this.resolveRootDir();
|
|
268
288
|
if (rootDir) {
|
|
269
289
|
const { join } = require('node:path');
|
|
270
290
|
this.native.setMmapBackend(join(rootDir, '_hnsw.bin'));
|
|
@@ -350,11 +370,9 @@ export class NativeHNSWWrapper {
|
|
|
350
370
|
loadFromMmap() {
|
|
351
371
|
if (!this.mmapStore)
|
|
352
372
|
return false;
|
|
353
|
-
const mmapPath = this.mmapStore.path ?? this.mmapStore.path;
|
|
354
373
|
// Use the native index's mmap backend directly
|
|
355
|
-
const
|
|
356
|
-
|
|
357
|
-
: '');
|
|
374
|
+
const rootDir = this.resolveRootDir();
|
|
375
|
+
const loaded = this.native.setMmapBackend(rootDir ? require('node:path').join(rootDir, '_hnsw.bin') : '');
|
|
358
376
|
if (!loaded)
|
|
359
377
|
return false;
|
|
360
378
|
const size = this.native.size();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@soulcraft/cortex",
|
|
3
|
-
"version": "2.7.0",
|
|
3
|
+
"version": "2.7.1",
|
|
4
4
|
"description": "Native Rust acceleration for Brainy — SIMD distance, vector quantization, zero-copy mmap, native embeddings. Free tier for storage, Pro license for compute acceleration.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -1,161 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* @module hnsw/NativeDiskAnnWrapper
|
|
3
|
-
* @description TypeScript wrapper around cortex's native DiskANN engine
|
|
4
|
-
* that satisfies brainy's `HnswProvider` contract. From brainy's
|
|
5
|
-
* perspective this is interchangeable with `NativeHNSWWrapper` — same
|
|
6
|
-
* `addItem` / `search` / `rebuild` surface — but underneath it drives
|
|
7
|
-
* the billion-scale Vamana + PQ index.
|
|
8
|
-
*
|
|
9
|
-
* @example
|
|
10
|
-
* ```typescript
|
|
11
|
-
* import { BrainyData } from '@soulcraft/brainy'
|
|
12
|
-
* import { register as registerCortex } from '@soulcraft/cortex'
|
|
13
|
-
*
|
|
14
|
-
* const brain = new BrainyData({
|
|
15
|
-
* storage: { type: 'filesystem', rootDirectory: '/data/idx' }
|
|
16
|
-
* })
|
|
17
|
-
* await registerCortex(brain)
|
|
18
|
-
* await brain.init() // [brainy] DiskANN engaged (path=..., dim=384)
|
|
19
|
-
*
|
|
20
|
-
* await brain.add({ data: 'native rust acceleration', type: 'concept' })
|
|
21
|
-
* const hits = await brain.search('billion scale ann', 10)
|
|
22
|
-
* ```
|
|
23
|
-
*
|
|
24
|
-
* @example
|
|
25
|
-
* ```typescript
|
|
26
|
-
* // Explicit billion-scale build config
|
|
27
|
-
* const brain = new BrainyData({
|
|
28
|
-
* storage: { type: 'filesystem', rootDirectory: '/data/idx' },
|
|
29
|
-
* index: {
|
|
30
|
-
* type: 'diskann',
|
|
31
|
-
* diskann: {
|
|
32
|
-
* pqM: 16,
|
|
33
|
-
* maxDegree: 64,
|
|
34
|
-
* searchListSize: 100,
|
|
35
|
-
* useMmapAdjacency: true, // required >100M nodes
|
|
36
|
-
* mmapAdjacencyPath: '/data/scratch/diskann-build.adj'
|
|
37
|
-
* }
|
|
38
|
-
* }
|
|
39
|
-
* })
|
|
40
|
-
* ```
|
|
41
|
-
*
|
|
42
|
-
* ## Operating model
|
|
43
|
-
*
|
|
44
|
-
* DiskANN is build-once, query-many by design: the on-disk file
|
|
45
|
-
* embeds the Vamana graph, PQ codebook, codes, and full vectors in a
|
|
46
|
-
* single contiguous mmap-able layout. Dynamic insertions go to a
|
|
47
|
-
* small **delta buffer** that brute-force-searches alongside the main
|
|
48
|
-
* index until the next `rebuild()` folds them in. This matches
|
|
49
|
-
* FreshDiskANN's published online-update model.
|
|
50
|
-
*
|
|
51
|
-
* ## Search path
|
|
52
|
-
*
|
|
53
|
-
* 1. Query the main index via the native DiskANN searcher: PQ-greedy
|
|
54
|
-
* walk in RAM, full-vector re-rank on the candidate set.
|
|
55
|
-
* 2. Brute-force the delta buffer (typically <0.1% of total size after
|
|
56
|
-
* a recent rebuild).
|
|
57
|
-
* 3. Merge + sort + truncate to `k`.
|
|
58
|
-
*
|
|
59
|
-
* ## When this wrapper engages
|
|
60
|
-
*
|
|
61
|
-
* Brainy's `wireDiskAnn()` decides at init time whether to instantiate
|
|
62
|
-
* this wrapper or the standard HNSW one. The criteria
|
|
63
|
-
* ([ADR-002](../../docs/ADR-002-diskann-100-percent-rust.md)):
|
|
64
|
-
* - Cortex's `index:diskann` provider is registered (this file).
|
|
65
|
-
* - The storage adapter exposes a local filesystem path
|
|
66
|
-
* (`getBinaryBlobPath` is the canonical check).
|
|
67
|
-
* - The metadata index has a stable `idMapper` (the cortex 2.4.0 #23
|
|
68
|
-
* foundation).
|
|
69
|
-
* - `config.index.type !== 'hnsw'` (opt-out path).
|
|
70
|
-
*/
|
|
71
|
-
import type { Vector, VectorDocument, DistanceFunction, StorageAdapter } from '@soulcraft/brainy';
|
|
72
|
-
import type { HnswProvider } from '../providerContracts.js';
|
|
73
|
-
export interface DiskAnnIndexConfig {
|
|
74
|
-
/** Vector dimension (e.g. 384 for all-MiniLM-L6-v2). */
|
|
75
|
-
dimensions: number;
|
|
76
|
-
/** Output path for the on-disk DiskANN file. */
|
|
77
|
-
indexPath: string;
|
|
78
|
-
/** PQ subspaces. Default 16. dim must be divisible by m. */
|
|
79
|
-
pqM?: number;
|
|
80
|
-
/** Centroids per subspace. Default 256 (8-bit codes). */
|
|
81
|
-
pqKsub?: number;
|
|
82
|
-
/** Vamana max degree (R). Default 64. */
|
|
83
|
-
maxDegree?: number;
|
|
84
|
-
/** Build-time candidate list size (L). Default 100. */
|
|
85
|
-
searchListSize?: number;
|
|
86
|
-
/** α-pruning density factor. Default 1.2. */
|
|
87
|
-
alpha?: number;
|
|
88
|
-
/** Default search-time candidate list size. `2*k` is a good baseline. */
|
|
89
|
-
defaultLSearch?: number;
|
|
90
|
-
/** Default padding factor for re-rank over-fetch. Default 1.2. */
|
|
91
|
-
defaultPaddingFactor?: number;
|
|
92
|
-
/** Use a file-backed adjacency during build. Required >~100M nodes. */
|
|
93
|
-
useMmapAdjacency?: boolean;
|
|
94
|
-
/** Scratch file path when `useMmapAdjacency` is true. */
|
|
95
|
-
mmapAdjacencyPath?: string;
|
|
96
|
-
}
|
|
97
|
-
export declare class NativeDiskAnnWrapper implements HnswProvider {
|
|
98
|
-
private config;
|
|
99
|
-
private distanceFunction;
|
|
100
|
-
private storage;
|
|
101
|
-
private persistMode;
|
|
102
|
-
/** Live searcher instance — null until the first build. */
|
|
103
|
-
private native;
|
|
104
|
-
/** Newly added entries since the last build. Brute-force searched. */
|
|
105
|
-
private delta;
|
|
106
|
-
/** Removed entries — filtered out at search time. */
|
|
107
|
-
private tombstones;
|
|
108
|
-
/** Bidirectional UUID ↔ slot map for the main index. */
|
|
109
|
-
private slotByUuid;
|
|
110
|
-
private uuidBySlot;
|
|
111
|
-
constructor(config: DiskAnnIndexConfig & {
|
|
112
|
-
distanceFunction?: DistanceFunction;
|
|
113
|
-
}, distanceFunction: DistanceFunction, options?: {
|
|
114
|
-
storage?: StorageAdapter | null;
|
|
115
|
-
persistMode?: 'immediate' | 'deferred';
|
|
116
|
-
});
|
|
117
|
-
/**
|
|
118
|
-
* Append an entry to the delta buffer. Persisted by the next
|
|
119
|
-
* `rebuild()` call, which folds the delta into the main index.
|
|
120
|
-
*/
|
|
121
|
-
addItem(item: VectorDocument): Promise<string>;
|
|
122
|
-
/**
|
|
123
|
-
* Mark an entry as removed. Filtered out at search time; physically
|
|
124
|
-
* removed at the next `rebuild()`.
|
|
125
|
-
*/
|
|
126
|
-
removeItem(id: string): Promise<boolean>;
|
|
127
|
-
search(queryVector: Vector, k?: number, filter?: (id: string) => Promise<boolean>, options?: {
|
|
128
|
-
rerank?: {
|
|
129
|
-
multiplier: number;
|
|
130
|
-
};
|
|
131
|
-
candidateIds?: string[];
|
|
132
|
-
}): Promise<Array<[string, number]>>;
|
|
133
|
-
size(): number;
|
|
134
|
-
clear(): void;
|
|
135
|
-
/**
|
|
136
|
-
* Rebuild the main index from scratch: concatenate (current main −
|
|
137
|
-
* tombstones) ∪ delta, run a full DiskANN build, swap the searcher
|
|
138
|
-
* atomically.
|
|
139
|
-
*
|
|
140
|
-
* At billion-scale this is the expensive operation (hours of build
|
|
141
|
-
* time). Operators schedule it during off-peak; the delta buffer
|
|
142
|
-
* absorbs writes in between.
|
|
143
|
-
*/
|
|
144
|
-
rebuild(options?: {
|
|
145
|
-
pqM?: number;
|
|
146
|
-
pqKsub?: number;
|
|
147
|
-
maxDegree?: number;
|
|
148
|
-
searchListSize?: number;
|
|
149
|
-
alpha?: number;
|
|
150
|
-
}): Promise<void>;
|
|
151
|
-
/**
|
|
152
|
-
* Flush the delta buffer to disk. For DiskANN the delta is in-memory
|
|
153
|
-
* by design (a few MB at most between rebuilds); returns the buffer
|
|
154
|
-
* size for parity with HNSW's flush contract.
|
|
155
|
-
*/
|
|
156
|
-
flush(): Promise<number>;
|
|
157
|
-
getPersistMode(): 'immediate' | 'deferred';
|
|
158
|
-
private tryOpenExisting;
|
|
159
|
-
private countMainTombstones;
|
|
160
|
-
}
|
|
161
|
-
//# sourceMappingURL=NativeDiskAnnWrapper.d.ts.map
|
|
@@ -1,329 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* @module hnsw/NativeDiskAnnWrapper
|
|
3
|
-
* @description TypeScript wrapper around cortex's native DiskANN engine
|
|
4
|
-
* that satisfies brainy's `HnswProvider` contract. From brainy's
|
|
5
|
-
* perspective this is interchangeable with `NativeHNSWWrapper` — same
|
|
6
|
-
* `addItem` / `search` / `rebuild` surface — but underneath it drives
|
|
7
|
-
* the billion-scale Vamana + PQ index.
|
|
8
|
-
*
|
|
9
|
-
* @example
|
|
10
|
-
* ```typescript
|
|
11
|
-
* import { BrainyData } from '@soulcraft/brainy'
|
|
12
|
-
* import { register as registerCortex } from '@soulcraft/cortex'
|
|
13
|
-
*
|
|
14
|
-
* const brain = new BrainyData({
|
|
15
|
-
* storage: { type: 'filesystem', rootDirectory: '/data/idx' }
|
|
16
|
-
* })
|
|
17
|
-
* await registerCortex(brain)
|
|
18
|
-
* await brain.init() // [brainy] DiskANN engaged (path=..., dim=384)
|
|
19
|
-
*
|
|
20
|
-
* await brain.add({ data: 'native rust acceleration', type: 'concept' })
|
|
21
|
-
* const hits = await brain.search('billion scale ann', 10)
|
|
22
|
-
* ```
|
|
23
|
-
*
|
|
24
|
-
* @example
|
|
25
|
-
* ```typescript
|
|
26
|
-
* // Explicit billion-scale build config
|
|
27
|
-
* const brain = new BrainyData({
|
|
28
|
-
* storage: { type: 'filesystem', rootDirectory: '/data/idx' },
|
|
29
|
-
* index: {
|
|
30
|
-
* type: 'diskann',
|
|
31
|
-
* diskann: {
|
|
32
|
-
* pqM: 16,
|
|
33
|
-
* maxDegree: 64,
|
|
34
|
-
* searchListSize: 100,
|
|
35
|
-
* useMmapAdjacency: true, // required >100M nodes
|
|
36
|
-
* mmapAdjacencyPath: '/data/scratch/diskann-build.adj'
|
|
37
|
-
* }
|
|
38
|
-
* }
|
|
39
|
-
* })
|
|
40
|
-
* ```
|
|
41
|
-
*
|
|
42
|
-
* ## Operating model
|
|
43
|
-
*
|
|
44
|
-
* DiskANN is build-once, query-many by design: the on-disk file
|
|
45
|
-
* embeds the Vamana graph, PQ codebook, codes, and full vectors in a
|
|
46
|
-
* single contiguous mmap-able layout. Dynamic insertions go to a
|
|
47
|
-
* small **delta buffer** that brute-force-searches alongside the main
|
|
48
|
-
* index until the next `rebuild()` folds them in. This matches
|
|
49
|
-
* FreshDiskANN's published online-update model.
|
|
50
|
-
*
|
|
51
|
-
* ## Search path
|
|
52
|
-
*
|
|
53
|
-
* 1. Query the main index via the native DiskANN searcher: PQ-greedy
|
|
54
|
-
* walk in RAM, full-vector re-rank on the candidate set.
|
|
55
|
-
* 2. Brute-force the delta buffer (typically <0.1% of total size after
|
|
56
|
-
* a recent rebuild).
|
|
57
|
-
* 3. Merge + sort + truncate to `k`.
|
|
58
|
-
*
|
|
59
|
-
* ## When this wrapper engages
|
|
60
|
-
*
|
|
61
|
-
* Brainy's `wireDiskAnn()` decides at init time whether to instantiate
|
|
62
|
-
* this wrapper or the standard HNSW one. The criteria
|
|
63
|
-
* ([ADR-002](../../docs/ADR-002-diskann-100-percent-rust.md)):
|
|
64
|
-
* - Cortex's `index:diskann` provider is registered (this file).
|
|
65
|
-
* - The storage adapter exposes a local filesystem path
|
|
66
|
-
* (`getBinaryBlobPath` is the canonical check).
|
|
67
|
-
* - The metadata index has a stable `idMapper` (the cortex 2.4.0 #23
|
|
68
|
-
* foundation).
|
|
69
|
-
* - `config.index.type !== 'hnsw'` (opt-out path).
|
|
70
|
-
*/
|
|
71
|
-
import { loadNativeModule } from '../native/index.js';
|
|
72
|
-
import { prodLog } from '@soulcraft/brainy/internals';
|
|
73
|
-
const DEFAULTS = {
|
|
74
|
-
pqM: 16,
|
|
75
|
-
pqKsub: 256,
|
|
76
|
-
maxDegree: 64,
|
|
77
|
-
searchListSize: 100,
|
|
78
|
-
alpha: 1.2,
|
|
79
|
-
defaultLSearch: 100,
|
|
80
|
-
defaultPaddingFactor: 1.2,
|
|
81
|
-
useMmapAdjacency: false,
|
|
82
|
-
};
|
|
83
|
-
export class NativeDiskAnnWrapper {
|
|
84
|
-
config;
|
|
85
|
-
distanceFunction;
|
|
86
|
-
storage;
|
|
87
|
-
persistMode;
|
|
88
|
-
/** Live searcher instance — null until the first build. */
|
|
89
|
-
native = null;
|
|
90
|
-
/** Newly added entries since the last build. Brute-force searched. */
|
|
91
|
-
delta = new Map();
|
|
92
|
-
/** Removed entries — filtered out at search time. */
|
|
93
|
-
tombstones = new Set();
|
|
94
|
-
/** Bidirectional UUID ↔ slot map for the main index. */
|
|
95
|
-
slotByUuid = new Map();
|
|
96
|
-
uuidBySlot = new Map();
|
|
97
|
-
constructor(config, distanceFunction, options = {}) {
|
|
98
|
-
this.config = { ...DEFAULTS, ...config };
|
|
99
|
-
this.distanceFunction = distanceFunction;
|
|
100
|
-
this.storage = options.storage ?? null;
|
|
101
|
-
this.persistMode = options.persistMode ?? 'immediate';
|
|
102
|
-
// Try to open an existing file. If absent, the index stays
|
|
103
|
-
// empty until the first rebuild() flushes the delta buffer.
|
|
104
|
-
this.tryOpenExisting();
|
|
105
|
-
}
|
|
106
|
-
/**
|
|
107
|
-
* Append an entry to the delta buffer. Persisted by the next
|
|
108
|
-
* `rebuild()` call, which folds the delta into the main index.
|
|
109
|
-
*/
|
|
110
|
-
async addItem(item) {
|
|
111
|
-
if (this.tombstones.has(item.id)) {
|
|
112
|
-
this.tombstones.delete(item.id);
|
|
113
|
-
}
|
|
114
|
-
this.delta.set(item.id, item.vector);
|
|
115
|
-
return item.id;
|
|
116
|
-
}
|
|
117
|
-
/**
|
|
118
|
-
* Mark an entry as removed. Filtered out at search time; physically
|
|
119
|
-
* removed at the next `rebuild()`.
|
|
120
|
-
*/
|
|
121
|
-
async removeItem(id) {
|
|
122
|
-
const inDelta = this.delta.delete(id);
|
|
123
|
-
const inMain = this.slotByUuid.has(id);
|
|
124
|
-
if (inMain)
|
|
125
|
-
this.tombstones.add(id);
|
|
126
|
-
return inDelta || inMain;
|
|
127
|
-
}
|
|
128
|
-
async search(queryVector, k = 10, filter, options) {
|
|
129
|
-
const lSearch = Math.max(this.config.defaultLSearch, k * 2);
|
|
130
|
-
const padding = options?.rerank?.multiplier ?? this.config.defaultPaddingFactor;
|
|
131
|
-
// 1. Main-index PQ-greedy walk (returns slot ids).
|
|
132
|
-
const mainHits = this.native
|
|
133
|
-
? this.native.search(Array.from(queryVector), k * 2, // over-fetch so filter / tombstone losses don't starve final result
|
|
134
|
-
lSearch, padding)
|
|
135
|
-
: [];
|
|
136
|
-
// 2. Hydrate slot → uuid; drop tombstoned + filter-rejected.
|
|
137
|
-
const merged = [];
|
|
138
|
-
for (const hit of mainHits) {
|
|
139
|
-
const uuid = this.uuidBySlot.get(hit.slot);
|
|
140
|
-
if (!uuid)
|
|
141
|
-
continue;
|
|
142
|
-
if (this.tombstones.has(uuid))
|
|
143
|
-
continue;
|
|
144
|
-
if (filter && !(await filter(uuid)))
|
|
145
|
-
continue;
|
|
146
|
-
merged.push([uuid, hit.distance]);
|
|
147
|
-
}
|
|
148
|
-
// 3. Brute-force the delta buffer.
|
|
149
|
-
for (const [id, v] of this.delta) {
|
|
150
|
-
if (filter && !(await filter(id)))
|
|
151
|
-
continue;
|
|
152
|
-
const d = this.distanceFunction(queryVector, v);
|
|
153
|
-
merged.push([id, d]);
|
|
154
|
-
}
|
|
155
|
-
// 4. Sort ascending by distance, truncate to k.
|
|
156
|
-
merged.sort((a, b) => a[1] - b[1]);
|
|
157
|
-
return merged.slice(0, k);
|
|
158
|
-
}
|
|
159
|
-
size() {
|
|
160
|
-
const mainSize = this.native ? this.native.size() : 0;
|
|
161
|
-
return (mainSize +
|
|
162
|
-
this.delta.size -
|
|
163
|
-
// Tombstones from the main index reduce effective size.
|
|
164
|
-
this.countMainTombstones());
|
|
165
|
-
}
|
|
166
|
-
clear() {
|
|
167
|
-
this.delta.clear();
|
|
168
|
-
this.tombstones.clear();
|
|
169
|
-
this.slotByUuid.clear();
|
|
170
|
-
this.uuidBySlot.clear();
|
|
171
|
-
this.native = null;
|
|
172
|
-
}
|
|
173
|
-
/**
|
|
174
|
-
* Rebuild the main index from scratch: concatenate (current main −
|
|
175
|
-
* tombstones) ∪ delta, run a full DiskANN build, swap the searcher
|
|
176
|
-
* atomically.
|
|
177
|
-
*
|
|
178
|
-
* At billion-scale this is the expensive operation (hours of build
|
|
179
|
-
* time). Operators schedule it during off-peak; the delta buffer
|
|
180
|
-
* absorbs writes in between.
|
|
181
|
-
*/
|
|
182
|
-
async rebuild(options) {
|
|
183
|
-
const bindings = loadNativeModule();
|
|
184
|
-
// napi-rs exports the class as `NativeDiskAnn` (PascalCase
|
|
185
|
-
// normalization of the Rust ident `NativeDiskANN`). The TS type
|
|
186
|
-
// alias `NativeDiskANN = NativeDiskAnn` in `native/index.d.ts` is
|
|
187
|
-
// for backwards-compat in *types* only — at runtime there's a
|
|
188
|
-
// single export under the napi-normalized name.
|
|
189
|
-
const NativeDiskANN = bindings.NativeDiskAnn;
|
|
190
|
-
if (!NativeDiskANN) {
|
|
191
|
-
throw new Error('NativeDiskANN binding missing — rebuild requires the cortex native module');
|
|
192
|
-
}
|
|
193
|
-
// Build the new logical slot ordering: (live old slots) + (delta).
|
|
194
|
-
// **Critical for billion-scale correctness**: the old vectors stay
|
|
195
|
-
// mmap'd inside the native module — we only pass slot IDs across
|
|
196
|
-
// the FFI boundary, not the vector data itself. At 1B × 1536 × 4
|
|
197
|
-
// bytes = ~6 TB this is the difference between "rebuild works" and
|
|
198
|
-
// "rebuild OOMs."
|
|
199
|
-
const liveOldSlots = [];
|
|
200
|
-
const newUuids = [];
|
|
201
|
-
if (this.native) {
|
|
202
|
-
// Iterate in slot order so the new index's first n_live slots
|
|
203
|
-
// mirror the OLD index's surviving subset in deterministic order.
|
|
204
|
-
// We deliberately iterate by sorted slot id rather than uuidBySlot
|
|
205
|
-
// insertion order — sorting keeps the Vamana entry point stable
|
|
206
|
-
// and the on-disk vector section's locality similar to the
|
|
207
|
-
// pre-rebuild file (less page-cache turnover during the post-
|
|
208
|
-
// rebuild warm-up).
|
|
209
|
-
const sortedSlots = Array.from(this.uuidBySlot.keys()).sort((a, b) => a - b);
|
|
210
|
-
for (const slot of sortedSlots) {
|
|
211
|
-
const uuid = this.uuidBySlot.get(slot);
|
|
212
|
-
if (this.tombstones.has(uuid))
|
|
213
|
-
continue;
|
|
214
|
-
liveOldSlots.push(slot);
|
|
215
|
-
newUuids.push(uuid);
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
const dim = this.config.dimensions;
|
|
219
|
-
const deltaCount = this.delta.size;
|
|
220
|
-
let deltaBuf = null;
|
|
221
|
-
if (deltaCount > 0) {
|
|
222
|
-
deltaBuf = new Float32Array(deltaCount * dim);
|
|
223
|
-
let idx = 0;
|
|
224
|
-
for (const [uuid, vector] of this.delta) {
|
|
225
|
-
if (vector.length !== dim) {
|
|
226
|
-
throw new Error(`NativeDiskAnnWrapper.rebuild: vector dim ${vector.length} ≠ index dim ${dim}`);
|
|
227
|
-
}
|
|
228
|
-
deltaBuf.set(vector, idx * dim);
|
|
229
|
-
newUuids.push(uuid);
|
|
230
|
-
idx++;
|
|
231
|
-
}
|
|
232
|
-
}
|
|
233
|
-
if (liveOldSlots.length + deltaCount === 0) {
|
|
234
|
-
prodLog?.warn?.('NativeDiskAnnWrapper.rebuild: nothing to build');
|
|
235
|
-
return;
|
|
236
|
-
}
|
|
237
|
-
const totalCount = liveOldSlots.length + deltaCount;
|
|
238
|
-
const cfg = {
|
|
239
|
-
vamana: {
|
|
240
|
-
maxDegree: options?.maxDegree ?? this.config.maxDegree,
|
|
241
|
-
searchListSize: options?.searchListSize ?? this.config.searchListSize,
|
|
242
|
-
alpha: options?.alpha ?? this.config.alpha,
|
|
243
|
-
seed: BigInt(0xd15ca4440ffff00dn),
|
|
244
|
-
parallel: true,
|
|
245
|
-
parallelBatch: 64,
|
|
246
|
-
},
|
|
247
|
-
pq: {
|
|
248
|
-
m: options?.pqM ?? this.config.pqM,
|
|
249
|
-
ksub: options?.pqKsub ?? this.config.pqKsub,
|
|
250
|
-
iterations: 25,
|
|
251
|
-
trainingSample: Math.min(200_000, totalCount),
|
|
252
|
-
},
|
|
253
|
-
adjacency: this.config.useMmapAdjacency
|
|
254
|
-
? {
|
|
255
|
-
kind: 'mmap',
|
|
256
|
-
mmapPath: this.config.mmapAdjacencyPath ?? `${this.config.indexPath}.adj`,
|
|
257
|
-
}
|
|
258
|
-
: { kind: 'ram' },
|
|
259
|
-
};
|
|
260
|
-
const newNative = NativeDiskANN.rebuildFromExisting({
|
|
261
|
-
existingPath: this.native ? this.config.indexPath : undefined,
|
|
262
|
-
liveOldSlots,
|
|
263
|
-
deltaVectors: deltaBuf != null
|
|
264
|
-
? Buffer.from(deltaBuf.buffer, deltaBuf.byteOffset, deltaBuf.byteLength)
|
|
265
|
-
: undefined,
|
|
266
|
-
deltaCount,
|
|
267
|
-
dim,
|
|
268
|
-
outputPath: this.config.indexPath,
|
|
269
|
-
cfg,
|
|
270
|
-
});
|
|
271
|
-
// Rebuild the bidirectional UUID↔slot maps from `newUuids`. New
|
|
272
|
-
// slot `i` corresponds to `newUuids[i]` — this matches the napi
|
|
273
|
-
// layout invariant (live old slots first, delta tail second).
|
|
274
|
-
const newSlotByUuid = new Map();
|
|
275
|
-
const newUuidBySlot = new Map();
|
|
276
|
-
for (let i = 0; i < newUuids.length; i++) {
|
|
277
|
-
newSlotByUuid.set(newUuids[i], i);
|
|
278
|
-
newUuidBySlot.set(i, newUuids[i]);
|
|
279
|
-
}
|
|
280
|
-
// Atomic swap.
|
|
281
|
-
this.native = newNative;
|
|
282
|
-
this.slotByUuid = newSlotByUuid;
|
|
283
|
-
this.uuidBySlot = newUuidBySlot;
|
|
284
|
-
this.delta.clear();
|
|
285
|
-
this.tombstones.clear();
|
|
286
|
-
}
|
|
287
|
-
/**
|
|
288
|
-
* Flush the delta buffer to disk. For DiskANN the delta is in-memory
|
|
289
|
-
* by design (a few MB at most between rebuilds); returns the buffer
|
|
290
|
-
* size for parity with HNSW's flush contract.
|
|
291
|
-
*/
|
|
292
|
-
async flush() {
|
|
293
|
-
return this.delta.size;
|
|
294
|
-
}
|
|
295
|
-
getPersistMode() {
|
|
296
|
-
return this.persistMode;
|
|
297
|
-
}
|
|
298
|
-
tryOpenExisting() {
|
|
299
|
-
try {
|
|
300
|
-
const bindings = loadNativeModule();
|
|
301
|
-
// napi-rs exports the class as `NativeDiskAnn` (PascalCase
|
|
302
|
-
// normalization of the Rust ident `NativeDiskANN`). The TS type
|
|
303
|
-
// alias `NativeDiskANN = NativeDiskAnn` in `native/index.d.ts` is
|
|
304
|
-
// for backwards-compat in *types* only — at runtime there's a
|
|
305
|
-
// single export under the napi-normalized name.
|
|
306
|
-
const NativeDiskANN = bindings.NativeDiskAnn;
|
|
307
|
-
if (!NativeDiskANN)
|
|
308
|
-
return;
|
|
309
|
-
this.native = NativeDiskANN.openExisting(this.config.indexPath);
|
|
310
|
-
// Populate slot maps from the storage adapter — these are persisted
|
|
311
|
-
// alongside the index file in production. For 35c we read from a
|
|
312
|
-
// sibling `.slots.json` that rebuild() writes.
|
|
313
|
-
// (Stub for now; the real path lands when storage integration ships.)
|
|
314
|
-
}
|
|
315
|
-
catch {
|
|
316
|
-
// No existing file — index stays empty until first rebuild().
|
|
317
|
-
this.native = null;
|
|
318
|
-
}
|
|
319
|
-
}
|
|
320
|
-
countMainTombstones() {
|
|
321
|
-
let n = 0;
|
|
322
|
-
for (const uuid of this.tombstones) {
|
|
323
|
-
if (this.slotByUuid.has(uuid))
|
|
324
|
-
n++;
|
|
325
|
-
}
|
|
326
|
-
return n;
|
|
327
|
-
}
|
|
328
|
-
}
|
|
329
|
-
//# sourceMappingURL=NativeDiskAnnWrapper.js.map
|
|
@@ -1,194 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* @module utils/nativeBinaryEntityIdMapper
|
|
3
|
-
* @description TypeScript wrapper around cortex's native binary
|
|
4
|
-
* `BinaryIdMapper`. Implements brainy's `EntityIdMapperProvider` so the
|
|
5
|
-
* mmap-backed billion-scale mapper is a drop-in for the existing
|
|
6
|
-
* JSON-persisted one.
|
|
7
|
-
*
|
|
8
|
-
* ## When this engages
|
|
9
|
-
*
|
|
10
|
-
* The cortex plugin registers this wrapper as the `'entityIdMapper'`
|
|
11
|
-
* provider when the storage adapter exposes `getBinaryBlobPath()` (i.e.
|
|
12
|
-
* filesystem-backed storage with cortex's 2.4.0 #2 mmap-vector layer).
|
|
13
|
-
* Cloud-storage adapters fall back to the JSON variant
|
|
14
|
-
* (`NativeEntityIdMapperWrapper`) since they have no local-path concept.
|
|
15
|
-
*
|
|
16
|
-
* ## UUID format conversion
|
|
17
|
-
*
|
|
18
|
-
* Brainy passes UUIDs as strings (typically the canonical 36-char
|
|
19
|
-
* `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`). The native side works in
|
|
20
|
-
* 16-byte Buffers. This wrapper converts at the boundary. Non-canonical
|
|
21
|
-
* UUID strings (any other 32-hex-digit form) are also accepted.
|
|
22
|
-
*
|
|
23
|
-
* ## Concurrency
|
|
24
|
-
*
|
|
25
|
-
* `getOrAssign` is atomic across concurrent callers for the same UUID
|
|
26
|
-
* (256 sharded per-UUID mutexes in the native layer). Lookups are
|
|
27
|
-
* lock-free. The wrapper holds no JS-side mutable state besides the
|
|
28
|
-
* native handle.
|
|
29
|
-
*
|
|
30
|
-
* ## IdSpace (Piece 10)
|
|
31
|
-
*
|
|
32
|
-
* The wrapper supports two entity-int wire widths:
|
|
33
|
-
*
|
|
34
|
-
* - `'u32'` (default): cortex 2.x compatible — JS `number` throughout,
|
|
35
|
-
* capped at 4.29 B entities. Persists in the legacy `int_to_uuid.bin`
|
|
36
|
-
* v1 header.
|
|
37
|
-
* - `'u64'`: opt-in via `idSpace: 'u64'`. The native layer's `number`
|
|
38
|
-
* surface throws in this mode, so the wrapper transparently routes
|
|
39
|
-
* the `EntityIdMapperProvider` methods through the BigInt napi
|
|
40
|
-
* siblings and converts BigInt → number at the boundary. Entity ints
|
|
41
|
-
* above `Number.MAX_SAFE_INTEGER` (2^53 - 1 = ~9 PB of entities)
|
|
42
|
-
* throw a clear `EntityIdSpaceExceeded` error so callers get a loud
|
|
43
|
-
* failure rather than a silent precision loss.
|
|
44
|
-
*
|
|
45
|
-
* The `getOrAssignBig` / `getIntBig` / `getUuidBig` / `sizeBig`
|
|
46
|
-
* sibling methods are available on both modes and always return
|
|
47
|
-
* `bigint` losslessly — use them in u64-aware code paths that need
|
|
48
|
-
* the full u64 range.
|
|
49
|
-
*/
|
|
50
|
-
import type { StorageAdapter } from '@soulcraft/brainy';
|
|
51
|
-
import type { EntityIdMapperProvider } from '../providerContracts.js';
|
|
52
|
-
export interface NativeBinaryEntityIdMapperOptions {
|
|
53
|
-
/** Storage adapter — required for binary blob path resolution. */
|
|
54
|
-
storage: StorageAdapter;
|
|
55
|
-
/**
|
|
56
|
-
* Override the relative path under storage for the uuid_to_int file.
|
|
57
|
-
* Default `_id_mapper/uuid_to_int.mkv`.
|
|
58
|
-
*/
|
|
59
|
-
uuidToIntKey?: string;
|
|
60
|
-
/**
|
|
61
|
-
* Override the relative path under storage for the int_to_uuid file.
|
|
62
|
-
* Default `_id_mapper/int_to_uuid.bin`.
|
|
63
|
-
*/
|
|
64
|
-
intToUuidKey?: string;
|
|
65
|
-
/** Sparse file size for int_to_uuid. Default 32 GB. */
|
|
66
|
-
intToUuidSize?: bigint;
|
|
67
|
-
/** Sparse file size for uuid_to_int. Default 32 GB. */
|
|
68
|
-
uuidToIntSize?: bigint;
|
|
69
|
-
/** Bucket capacity in the MmapKv. Default 16. */
|
|
70
|
-
bucketCapacity?: number;
|
|
71
|
-
/** Maximum extendible-hash directory depth. Default 28. */
|
|
72
|
-
maxGlobalDepth?: number;
|
|
73
|
-
/**
|
|
74
|
-
* Entity-int wire width. `'u32'` (default) caps at 4.29 B entities
|
|
75
|
-
* and is cortex 2.x compatible. `'u64'` opts into the Piece 10 U64
|
|
76
|
-
* IdSpace — required when targeting >4.29 B entities. The
|
|
77
|
-
* `EntityIdMapperProvider` `number`-typed methods still work in U64
|
|
78
|
-
* mode by routing through the BigInt napi siblings; entity ints
|
|
79
|
-
* above `Number.MAX_SAFE_INTEGER` (2^53 - 1) throw an explicit
|
|
80
|
-
* `EntityIdSpaceExceeded` error.
|
|
81
|
-
*
|
|
82
|
-
* Ignored on open when the underlying file's header disagrees: the
|
|
83
|
-
* on-disk format wins and any mismatch is surfaced as a hard error.
|
|
84
|
-
*/
|
|
85
|
-
idSpace?: 'u32' | 'u64';
|
|
86
|
-
}
|
|
87
|
-
/**
|
|
88
|
-
* Thrown when a U64-mode mapper allocates or returns an entity int
|
|
89
|
-
* above `Number.MAX_SAFE_INTEGER` (2^53 - 1). At this point a JS
|
|
90
|
-
* `number` can no longer represent the value losslessly; callers must
|
|
91
|
-
* switch to the BigInt sibling methods (`getOrAssignBig`,
|
|
92
|
-
* `getIntBig`, `getUuidBig`).
|
|
93
|
-
*/
|
|
94
|
-
export declare class EntityIdSpaceExceeded extends Error {
|
|
95
|
-
/** The u64 entity int that exceeded the safe-integer ceiling. */
|
|
96
|
-
readonly value: bigint;
|
|
97
|
-
/** The method that was called (`'getOrAssign'`, `'getInt'`, etc.). */
|
|
98
|
-
readonly method: string;
|
|
99
|
-
constructor(method: string, value: bigint);
|
|
100
|
-
}
|
|
101
|
-
/**
|
|
102
|
-
* Drop-in `EntityIdMapperProvider` backed by the native `BinaryIdMapper`.
|
|
103
|
-
*
|
|
104
|
-
* @example
|
|
105
|
-
* ```typescript
|
|
106
|
-
* const mapper = new NativeBinaryEntityIdMapperWrapper({ storage })
|
|
107
|
-
* await mapper.init()
|
|
108
|
-
* const intId = mapper.getOrAssign('12345678-1234-5678-1234-567812345678')
|
|
109
|
-
* const uuid = mapper.getUuid(intId)
|
|
110
|
-
* ```
|
|
111
|
-
*/
|
|
112
|
-
export declare class NativeBinaryEntityIdMapperWrapper implements EntityIdMapperProvider {
|
|
113
|
-
private storage;
|
|
114
|
-
private uuidToIntKey;
|
|
115
|
-
private intToUuidKey;
|
|
116
|
-
private intToUuidSize;
|
|
117
|
-
private uuidToIntSize;
|
|
118
|
-
private bucketCapacity;
|
|
119
|
-
private maxGlobalDepth;
|
|
120
|
-
private requestedIdSpace;
|
|
121
|
-
/**
|
|
122
|
-
* The actual IdSpace of the open mapper, sourced from the native
|
|
123
|
-
* binding's `idSpace()` reflection after `init()`. The on-disk
|
|
124
|
-
* header wins over `requestedIdSpace` (which may be ignored at
|
|
125
|
-
* `openExisting` time).
|
|
126
|
-
*/
|
|
127
|
-
private resolvedIdSpace;
|
|
128
|
-
private native;
|
|
129
|
-
private initialized;
|
|
130
|
-
constructor(options: NativeBinaryEntityIdMapperOptions);
|
|
131
|
-
init(): Promise<void>;
|
|
132
|
-
/**
|
|
133
|
-
* Report the mapper's actual IdSpace mode. Returns `'u32'` before
|
|
134
|
-
* `init()` (the default the wrapper assumes); after init, returns the
|
|
135
|
-
* mode reported by the native binding (which is authoritative).
|
|
136
|
-
*/
|
|
137
|
-
getIdSpace(): 'u32' | 'u64';
|
|
138
|
-
/**
|
|
139
|
-
* Allocate or retrieve the entity int for `uuid`. Returns a JS
|
|
140
|
-
* `number`. In U64 mode, routes through the BigInt sibling and
|
|
141
|
-
* throws {@link EntityIdSpaceExceeded} if the allocated int exceeds
|
|
142
|
-
* `Number.MAX_SAFE_INTEGER` — at that point the caller MUST switch
|
|
143
|
-
* to `getOrAssignBig` for the full u64 range.
|
|
144
|
-
*/
|
|
145
|
-
getOrAssign(uuid: string): number;
|
|
146
|
-
/**
|
|
147
|
-
* Look up the UUID for `intId`. Accepts a JS `number` — in U64 mode
|
|
148
|
-
* this is a lossy conversion above 2^53; use {@link getUuidBig} for
|
|
149
|
-
* the full u64 range.
|
|
150
|
-
*/
|
|
151
|
-
getUuid(intId: number): string | undefined;
|
|
152
|
-
/**
|
|
153
|
-
* Look up the entity int for `uuid`. Returns a JS `number`. In U64
|
|
154
|
-
* mode throws {@link EntityIdSpaceExceeded} if the int exceeds
|
|
155
|
-
* `Number.MAX_SAFE_INTEGER`.
|
|
156
|
-
*/
|
|
157
|
-
getInt(uuid: string): number | undefined;
|
|
158
|
-
remove(uuid: string): boolean;
|
|
159
|
-
flush(): Promise<void>;
|
|
160
|
-
clear(): Promise<void>;
|
|
161
|
-
/**
|
|
162
|
-
* Materialise every live int id into a JS `number[]`. **U32 mode
|
|
163
|
-
* only.** U64 mode throws — the native binding refuses to allocate
|
|
164
|
-
* a giant JS array at the scale a U64 brain implies. Iterate via
|
|
165
|
-
* the BigInt sibling iterator (TBD — a follow-up surfaces it on the
|
|
166
|
-
* wrapper) for U64 brains.
|
|
167
|
-
*/
|
|
168
|
-
getAllIntIds(): number[];
|
|
169
|
-
intsIterableToUuids(ints: Iterable<number>): string[];
|
|
170
|
-
get size(): number;
|
|
171
|
-
/**
|
|
172
|
-
* Allocate or retrieve the entity int for `uuid` as a `bigint`.
|
|
173
|
-
* Lossless across the full u64 range; safe to call in either mode.
|
|
174
|
-
*/
|
|
175
|
-
getOrAssignBig(uuid: string): bigint;
|
|
176
|
-
/** Look up the entity int for `uuid` as a `bigint`. */
|
|
177
|
-
getIntBig(uuid: string): bigint | undefined;
|
|
178
|
-
/** Look up the UUID for `int` (passed as a `bigint`). */
|
|
179
|
-
getUuidBig(int: bigint): string | undefined;
|
|
180
|
-
/** Live (non-tombstone) entry count as a `bigint`. */
|
|
181
|
-
sizeBig(): bigint;
|
|
182
|
-
/** Largest int ever assigned + 1, as a `bigint`. */
|
|
183
|
-
nextIntBig(): bigint;
|
|
184
|
-
/**
|
|
185
|
-
* Encode a UUID string into a 16-byte Buffer. Accepts canonical
|
|
186
|
-
* 36-char form (with hyphens) or any 32-hex-digit form. Throws on
|
|
187
|
-
* malformed input.
|
|
188
|
-
*/
|
|
189
|
-
private encode;
|
|
190
|
-
/** Decode a 16-byte Buffer back to canonical UUID string. */
|
|
191
|
-
private decode;
|
|
192
|
-
private ensure;
|
|
193
|
-
}
|
|
194
|
-
//# sourceMappingURL=nativeBinaryEntityIdMapper.d.ts.map
|
|
@@ -1,358 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* @module utils/nativeBinaryEntityIdMapper
|
|
3
|
-
* @description TypeScript wrapper around cortex's native binary
|
|
4
|
-
* `BinaryIdMapper`. Implements brainy's `EntityIdMapperProvider` so the
|
|
5
|
-
* mmap-backed billion-scale mapper is a drop-in for the existing
|
|
6
|
-
* JSON-persisted one.
|
|
7
|
-
*
|
|
8
|
-
* ## When this engages
|
|
9
|
-
*
|
|
10
|
-
* The cortex plugin registers this wrapper as the `'entityIdMapper'`
|
|
11
|
-
* provider when the storage adapter exposes `getBinaryBlobPath()` (i.e.
|
|
12
|
-
* filesystem-backed storage with cortex's 2.4.0 #2 mmap-vector layer).
|
|
13
|
-
* Cloud-storage adapters fall back to the JSON variant
|
|
14
|
-
* (`NativeEntityIdMapperWrapper`) since they have no local-path concept.
|
|
15
|
-
*
|
|
16
|
-
* ## UUID format conversion
|
|
17
|
-
*
|
|
18
|
-
* Brainy passes UUIDs as strings (typically the canonical 36-char
|
|
19
|
-
* `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`). The native side works in
|
|
20
|
-
* 16-byte Buffers. This wrapper converts at the boundary. Non-canonical
|
|
21
|
-
* UUID strings (any other 32-hex-digit form) are also accepted.
|
|
22
|
-
*
|
|
23
|
-
* ## Concurrency
|
|
24
|
-
*
|
|
25
|
-
* `getOrAssign` is atomic across concurrent callers for the same UUID
|
|
26
|
-
* (256 sharded per-UUID mutexes in the native layer). Lookups are
|
|
27
|
-
* lock-free. The wrapper holds no JS-side mutable state besides the
|
|
28
|
-
* native handle.
|
|
29
|
-
*
|
|
30
|
-
* ## IdSpace (Piece 10)
|
|
31
|
-
*
|
|
32
|
-
* The wrapper supports two entity-int wire widths:
|
|
33
|
-
*
|
|
34
|
-
* - `'u32'` (default): cortex 2.x compatible — JS `number` throughout,
|
|
35
|
-
* capped at 4.29 B entities. Persists in the legacy `int_to_uuid.bin`
|
|
36
|
-
* v1 header.
|
|
37
|
-
* - `'u64'`: opt-in via `idSpace: 'u64'`. The native layer's `number`
|
|
38
|
-
* surface throws in this mode, so the wrapper transparently routes
|
|
39
|
-
* the `EntityIdMapperProvider` methods through the BigInt napi
|
|
40
|
-
* siblings and converts BigInt → number at the boundary. Entity ints
|
|
41
|
-
* above `Number.MAX_SAFE_INTEGER` (2^53 - 1 = ~9 PB of entities)
|
|
42
|
-
* throw a clear `EntityIdSpaceExceeded` error so callers get a loud
|
|
43
|
-
* failure rather than a silent precision loss.
|
|
44
|
-
*
|
|
45
|
-
* The `getOrAssignBig` / `getIntBig` / `getUuidBig` / `sizeBig`
|
|
46
|
-
* sibling methods are available on both modes and always return
|
|
47
|
-
* `bigint` losslessly — use them in u64-aware code paths that need
|
|
48
|
-
* the full u64 range.
|
|
49
|
-
*/
|
|
50
|
-
import { existsSync } from 'node:fs';
|
|
51
|
-
import { loadNativeModule } from '../native/index.js';
|
|
52
|
-
import { prodLog } from '@soulcraft/brainy/internals';
|
|
53
|
-
const UUID_BYTES = 16;
|
|
54
|
-
/**
|
|
55
|
-
* Thrown when a U64-mode mapper allocates or returns an entity int
|
|
56
|
-
* above `Number.MAX_SAFE_INTEGER` (2^53 - 1). At this point a JS
|
|
57
|
-
* `number` can no longer represent the value losslessly; callers must
|
|
58
|
-
* switch to the BigInt sibling methods (`getOrAssignBig`,
|
|
59
|
-
* `getIntBig`, `getUuidBig`).
|
|
60
|
-
*/
|
|
61
|
-
export class EntityIdSpaceExceeded extends Error {
|
|
62
|
-
/** The u64 entity int that exceeded the safe-integer ceiling. */
|
|
63
|
-
value;
|
|
64
|
-
/** The method that was called (`'getOrAssign'`, `'getInt'`, etc.). */
|
|
65
|
-
method;
|
|
66
|
-
constructor(method, value) {
|
|
67
|
-
super(`${method}: entity int ${value} exceeds Number.MAX_SAFE_INTEGER ` +
|
|
68
|
-
`(2^53 - 1). Switch to the BigInt sibling method (${method}Big) ` +
|
|
69
|
-
`for entity ints above 9.007 PB.`);
|
|
70
|
-
this.name = 'EntityIdSpaceExceeded';
|
|
71
|
-
this.method = method;
|
|
72
|
-
this.value = value;
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
const MAX_SAFE_INTEGER_BIG = BigInt(Number.MAX_SAFE_INTEGER);
|
|
76
|
-
/**
|
|
77
|
-
* Convert a `bigint` entity int to a JS `number`. Throws
|
|
78
|
-
* {@link EntityIdSpaceExceeded} if the value exceeds the JS safe-integer
|
|
79
|
-
* range.
|
|
80
|
-
*/
|
|
81
|
-
function bigToSafeNumber(value, method) {
|
|
82
|
-
if (value > MAX_SAFE_INTEGER_BIG) {
|
|
83
|
-
throw new EntityIdSpaceExceeded(method, value);
|
|
84
|
-
}
|
|
85
|
-
return Number(value);
|
|
86
|
-
}
|
|
87
|
-
const DEFAULT_UUID_TO_INT_KEY = '_id_mapper/uuid_to_int.mkv';
|
|
88
|
-
const DEFAULT_INT_TO_UUID_KEY = '_id_mapper/int_to_uuid.bin';
|
|
89
|
-
/**
|
|
90
|
-
* Drop-in `EntityIdMapperProvider` backed by the native `BinaryIdMapper`.
|
|
91
|
-
*
|
|
92
|
-
* @example
|
|
93
|
-
* ```typescript
|
|
94
|
-
* const mapper = new NativeBinaryEntityIdMapperWrapper({ storage })
|
|
95
|
-
* await mapper.init()
|
|
96
|
-
* const intId = mapper.getOrAssign('12345678-1234-5678-1234-567812345678')
|
|
97
|
-
* const uuid = mapper.getUuid(intId)
|
|
98
|
-
* ```
|
|
99
|
-
*/
|
|
100
|
-
export class NativeBinaryEntityIdMapperWrapper {
|
|
101
|
-
storage;
|
|
102
|
-
uuidToIntKey;
|
|
103
|
-
intToUuidKey;
|
|
104
|
-
intToUuidSize;
|
|
105
|
-
uuidToIntSize;
|
|
106
|
-
bucketCapacity;
|
|
107
|
-
maxGlobalDepth;
|
|
108
|
-
requestedIdSpace;
|
|
109
|
-
/**
|
|
110
|
-
* The actual IdSpace of the open mapper, sourced from the native
|
|
111
|
-
* binding's `idSpace()` reflection after `init()`. The on-disk
|
|
112
|
-
* header wins over `requestedIdSpace` (which may be ignored at
|
|
113
|
-
* `openExisting` time).
|
|
114
|
-
*/
|
|
115
|
-
resolvedIdSpace = 'u32';
|
|
116
|
-
native = null;
|
|
117
|
-
initialized = false;
|
|
118
|
-
constructor(options) {
|
|
119
|
-
this.storage = options.storage;
|
|
120
|
-
this.uuidToIntKey = options.uuidToIntKey ?? DEFAULT_UUID_TO_INT_KEY;
|
|
121
|
-
this.intToUuidKey = options.intToUuidKey ?? DEFAULT_INT_TO_UUID_KEY;
|
|
122
|
-
this.intToUuidSize = options.intToUuidSize ?? BigInt(32) * BigInt(1024) ** BigInt(3);
|
|
123
|
-
this.uuidToIntSize = options.uuidToIntSize ?? BigInt(32) * BigInt(1024) ** BigInt(3);
|
|
124
|
-
this.bucketCapacity = options.bucketCapacity ?? 16;
|
|
125
|
-
this.maxGlobalDepth = options.maxGlobalDepth ?? 28;
|
|
126
|
-
this.requestedIdSpace = options.idSpace ?? 'u32';
|
|
127
|
-
}
|
|
128
|
-
async init() {
|
|
129
|
-
if (this.initialized)
|
|
130
|
-
return;
|
|
131
|
-
const storage = this.storage;
|
|
132
|
-
if (!storage.getBinaryBlobPath) {
|
|
133
|
-
throw new Error('NativeBinaryEntityIdMapperWrapper requires a storage adapter that ' +
|
|
134
|
-
'exposes getBinaryBlobPath() (filesystem-backed). For cloud adapters, ' +
|
|
135
|
-
'use NativeEntityIdMapperWrapper (JSON variant) instead.');
|
|
136
|
-
}
|
|
137
|
-
const uuidToIntPath = storage.getBinaryBlobPath(this.uuidToIntKey);
|
|
138
|
-
const intToUuidPath = storage.getBinaryBlobPath(this.intToUuidKey);
|
|
139
|
-
if (!uuidToIntPath || !intToUuidPath) {
|
|
140
|
-
throw new Error(`NativeBinaryEntityIdMapperWrapper: getBinaryBlobPath returned null for ` +
|
|
141
|
-
`${this.uuidToIntKey} or ${this.intToUuidKey}`);
|
|
142
|
-
}
|
|
143
|
-
const bindings = loadNativeModule();
|
|
144
|
-
const NativeBinaryIdMapper = bindings.NativeBinaryIdMapper;
|
|
145
|
-
if (!NativeBinaryIdMapper) {
|
|
146
|
-
throw new Error('NativeBinaryIdMapper binding missing from cortex native module — ' +
|
|
147
|
-
'this build of cortex is older than the BinaryIdMapper feature');
|
|
148
|
-
}
|
|
149
|
-
const config = {
|
|
150
|
-
uuidToIntPath,
|
|
151
|
-
intToUuidPath,
|
|
152
|
-
intToUuidSize: this.intToUuidSize,
|
|
153
|
-
uuidToIntSize: this.uuidToIntSize,
|
|
154
|
-
bucketCapacity: this.bucketCapacity,
|
|
155
|
-
maxGlobalDepth: this.maxGlobalDepth,
|
|
156
|
-
idSpace: this.requestedIdSpace,
|
|
157
|
-
};
|
|
158
|
-
// Explicitly distinguish "fresh install" from "existing files".
|
|
159
|
-
// Both files must exist together (paired write semantics) — a
|
|
160
|
-
// half-present state is corruption from a crash between file
|
|
161
|
-
// creations and is surfaced as an error rather than silently
|
|
162
|
-
// recreated.
|
|
163
|
-
const uuidFileExists = existsSync(uuidToIntPath);
|
|
164
|
-
const intFileExists = existsSync(intToUuidPath);
|
|
165
|
-
if (uuidFileExists && intFileExists) {
|
|
166
|
-
this.native = NativeBinaryIdMapper.openExisting(config);
|
|
167
|
-
}
|
|
168
|
-
else if (!uuidFileExists && !intFileExists) {
|
|
169
|
-
this.native = NativeBinaryIdMapper.create(config);
|
|
170
|
-
}
|
|
171
|
-
else {
|
|
172
|
-
throw new Error(`NativeBinaryEntityIdMapperWrapper: half-present file pair — ` +
|
|
173
|
-
`${this.uuidToIntKey} ${uuidFileExists ? 'exists' : 'missing'}, ` +
|
|
174
|
-
`${this.intToUuidKey} ${intFileExists ? 'exists' : 'missing'}. ` +
|
|
175
|
-
`Refusing to silently recreate; investigate manually.`);
|
|
176
|
-
}
|
|
177
|
-
// Reflect the on-disk IdSpace — authoritative over the requested
|
|
178
|
-
// value when openExisting opens a file with a different mode.
|
|
179
|
-
this.resolvedIdSpace = this.native.idSpace();
|
|
180
|
-
this.initialized = true;
|
|
181
|
-
if (prodLog?.debug) {
|
|
182
|
-
prodLog.debug(`[cortex] BinaryIdMapper wired: paths=[${uuidToIntPath}, ${intToUuidPath}], idSpace=${this.resolvedIdSpace}`);
|
|
183
|
-
}
|
|
184
|
-
}
|
|
185
|
-
/**
|
|
186
|
-
* Report the mapper's actual IdSpace mode. Returns `'u32'` before
|
|
187
|
-
* `init()` (the default the wrapper assumes); after init, returns the
|
|
188
|
-
* mode reported by the native binding (which is authoritative).
|
|
189
|
-
*/
|
|
190
|
-
getIdSpace() {
|
|
191
|
-
return this.resolvedIdSpace;
|
|
192
|
-
}
|
|
193
|
-
// -- EntityIdMapperProvider surface (number-typed, brainy contract) --
|
|
194
|
-
/**
|
|
195
|
-
* Allocate or retrieve the entity int for `uuid`. Returns a JS
|
|
196
|
-
* `number`. In U64 mode, routes through the BigInt sibling and
|
|
197
|
-
* throws {@link EntityIdSpaceExceeded} if the allocated int exceeds
|
|
198
|
-
* `Number.MAX_SAFE_INTEGER` — at that point the caller MUST switch
|
|
199
|
-
* to `getOrAssignBig` for the full u64 range.
|
|
200
|
-
*/
|
|
201
|
-
getOrAssign(uuid) {
|
|
202
|
-
const native = this.ensure();
|
|
203
|
-
if (this.resolvedIdSpace === 'u64') {
|
|
204
|
-
return bigToSafeNumber(native.getOrAssignBig(this.encode(uuid)), 'getOrAssign');
|
|
205
|
-
}
|
|
206
|
-
return native.getOrAssign(this.encode(uuid));
|
|
207
|
-
}
|
|
208
|
-
/**
|
|
209
|
-
* Look up the UUID for `intId`. Accepts a JS `number` — in U64 mode
|
|
210
|
-
* this is a lossy conversion above 2^53; use {@link getUuidBig} for
|
|
211
|
-
* the full u64 range.
|
|
212
|
-
*/
|
|
213
|
-
getUuid(intId) {
|
|
214
|
-
const native = this.ensure();
|
|
215
|
-
const buf = this.resolvedIdSpace === 'u64'
|
|
216
|
-
? native.getUuidBig(BigInt(intId))
|
|
217
|
-
: native.getUuid(intId);
|
|
218
|
-
if (!buf)
|
|
219
|
-
return undefined;
|
|
220
|
-
return this.decode(buf);
|
|
221
|
-
}
|
|
222
|
-
/**
|
|
223
|
-
* Look up the entity int for `uuid`. Returns a JS `number`. In U64
|
|
224
|
-
* mode throws {@link EntityIdSpaceExceeded} if the int exceeds
|
|
225
|
-
* `Number.MAX_SAFE_INTEGER`.
|
|
226
|
-
*/
|
|
227
|
-
getInt(uuid) {
|
|
228
|
-
const native = this.ensure();
|
|
229
|
-
if (this.resolvedIdSpace === 'u64') {
|
|
230
|
-
const big = native.getIntBig(this.encode(uuid));
|
|
231
|
-
return big == null ? undefined : bigToSafeNumber(big, 'getInt');
|
|
232
|
-
}
|
|
233
|
-
const out = native.getInt(this.encode(uuid));
|
|
234
|
-
return out == null ? undefined : out;
|
|
235
|
-
}
|
|
236
|
-
remove(uuid) {
|
|
237
|
-
const native = this.ensure();
|
|
238
|
-
return native.remove(this.encode(uuid));
|
|
239
|
-
}
|
|
240
|
-
async flush() {
|
|
241
|
-
const native = this.ensure();
|
|
242
|
-
native.flush();
|
|
243
|
-
}
|
|
244
|
-
async clear() {
|
|
245
|
-
// Reset by recreating the files. Atomicity caveat: any concurrent
|
|
246
|
-
// reader holds a stale mmap. Brainy calls clear() during clear()
|
|
247
|
-
// operations that already block other access; this is fine.
|
|
248
|
-
this.initialized = false;
|
|
249
|
-
this.native = null;
|
|
250
|
-
await this.init();
|
|
251
|
-
}
|
|
252
|
-
/**
|
|
253
|
-
* Materialise every live int id into a JS `number[]`. **U32 mode
|
|
254
|
-
* only.** U64 mode throws — the native binding refuses to allocate
|
|
255
|
-
* a giant JS array at the scale a U64 brain implies. Iterate via
|
|
256
|
-
* the BigInt sibling iterator (TBD — a follow-up surfaces it on the
|
|
257
|
-
* wrapper) for U64 brains.
|
|
258
|
-
*/
|
|
259
|
-
getAllIntIds() {
|
|
260
|
-
const native = this.ensure();
|
|
261
|
-
if (this.resolvedIdSpace === 'u64') {
|
|
262
|
-
throw new Error('getAllIntIds: not supported in U64 mode — the materialised ' +
|
|
263
|
-
'number[] would risk OOM at billion scale. Use the BigInt ' +
|
|
264
|
-
'streaming iterator on the underlying native binding instead.');
|
|
265
|
-
}
|
|
266
|
-
return native.getAllIntIds();
|
|
267
|
-
}
|
|
268
|
-
intsIterableToUuids(ints) {
|
|
269
|
-
const native = this.ensure();
|
|
270
|
-
const u64 = this.resolvedIdSpace === 'u64';
|
|
271
|
-
const out = [];
|
|
272
|
-
for (const i of ints) {
|
|
273
|
-
const buf = u64 ? native.getUuidBig(BigInt(i)) : native.getUuid(i);
|
|
274
|
-
if (buf)
|
|
275
|
-
out.push(this.decode(buf));
|
|
276
|
-
}
|
|
277
|
-
return out;
|
|
278
|
-
}
|
|
279
|
-
get size() {
|
|
280
|
-
if (!this.initialized || !this.native)
|
|
281
|
-
return 0;
|
|
282
|
-
if (this.resolvedIdSpace === 'u64') {
|
|
283
|
-
return bigToSafeNumber(this.native.sizeBig(), 'size');
|
|
284
|
-
}
|
|
285
|
-
return this.native.size();
|
|
286
|
-
}
|
|
287
|
-
// -- BigInt sibling surface (u64-safe, works in both modes) -------
|
|
288
|
-
/**
|
|
289
|
-
* Allocate or retrieve the entity int for `uuid` as a `bigint`.
|
|
290
|
-
* Lossless across the full u64 range; safe to call in either mode.
|
|
291
|
-
*/
|
|
292
|
-
getOrAssignBig(uuid) {
|
|
293
|
-
const native = this.ensure();
|
|
294
|
-
return native.getOrAssignBig(this.encode(uuid));
|
|
295
|
-
}
|
|
296
|
-
/** Look up the entity int for `uuid` as a `bigint`. */
|
|
297
|
-
getIntBig(uuid) {
|
|
298
|
-
const native = this.ensure();
|
|
299
|
-
const out = native.getIntBig(this.encode(uuid));
|
|
300
|
-
return out == null ? undefined : out;
|
|
301
|
-
}
|
|
302
|
-
/** Look up the UUID for `int` (passed as a `bigint`). */
|
|
303
|
-
getUuidBig(int) {
|
|
304
|
-
const native = this.ensure();
|
|
305
|
-
const buf = native.getUuidBig(int);
|
|
306
|
-
if (!buf)
|
|
307
|
-
return undefined;
|
|
308
|
-
return this.decode(buf);
|
|
309
|
-
}
|
|
310
|
-
/** Live (non-tombstone) entry count as a `bigint`. */
|
|
311
|
-
sizeBig() {
|
|
312
|
-
if (!this.initialized || !this.native)
|
|
313
|
-
return 0n;
|
|
314
|
-
return this.native.sizeBig();
|
|
315
|
-
}
|
|
316
|
-
/** Largest int ever assigned + 1, as a `bigint`. */
|
|
317
|
-
nextIntBig() {
|
|
318
|
-
return this.ensure().nextIntBig();
|
|
319
|
-
}
|
|
320
|
-
// ---------------------------------------------------------------
|
|
321
|
-
// UUID string ↔ Buffer conversion
|
|
322
|
-
// ---------------------------------------------------------------
|
|
323
|
-
/**
|
|
324
|
-
* Encode a UUID string into a 16-byte Buffer. Accepts canonical
|
|
325
|
-
* 36-char form (with hyphens) or any 32-hex-digit form. Throws on
|
|
326
|
-
* malformed input.
|
|
327
|
-
*/
|
|
328
|
-
encode(uuid) {
|
|
329
|
-
const hex = uuid.replace(/-/g, '').toLowerCase();
|
|
330
|
-
if (hex.length !== 32 || !/^[0-9a-f]{32}$/.test(hex)) {
|
|
331
|
-
throw new Error(`NativeBinaryEntityIdMapperWrapper: invalid UUID string "${uuid}"`);
|
|
332
|
-
}
|
|
333
|
-
return Buffer.from(hex, 'hex');
|
|
334
|
-
}
|
|
335
|
-
/** Decode a 16-byte Buffer back to canonical UUID string. */
|
|
336
|
-
decode(buf) {
|
|
337
|
-
if (buf.length !== UUID_BYTES) {
|
|
338
|
-
throw new Error(`NativeBinaryEntityIdMapperWrapper: native returned ${buf.length}-byte uuid (expected ${UUID_BYTES})`);
|
|
339
|
-
}
|
|
340
|
-
const hex = buf.toString('hex');
|
|
341
|
-
return (hex.slice(0, 8) +
|
|
342
|
-
'-' +
|
|
343
|
-
hex.slice(8, 12) +
|
|
344
|
-
'-' +
|
|
345
|
-
hex.slice(12, 16) +
|
|
346
|
-
'-' +
|
|
347
|
-
hex.slice(16, 20) +
|
|
348
|
-
'-' +
|
|
349
|
-
hex.slice(20, 32));
|
|
350
|
-
}
|
|
351
|
-
ensure() {
|
|
352
|
-
if (!this.initialized || !this.native) {
|
|
353
|
-
throw new Error('NativeBinaryEntityIdMapperWrapper: call init() before any operation');
|
|
354
|
-
}
|
|
355
|
-
return this.native;
|
|
356
|
-
}
|
|
357
|
-
}
|
|
358
|
-
//# sourceMappingURL=nativeBinaryEntityIdMapper.js.map
|