@soulcraft/brainy 3.41.1 → 3.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Metadata Index Chunking System
|
|
3
|
+
*
|
|
4
|
+
* Implements Adaptive Chunked Sparse Indexing inspired by ClickHouse MergeTree.
|
|
5
|
+
* Reduces file count from 560k to ~89 files (630x reduction) while maintaining performance.
|
|
6
|
+
*
|
|
7
|
+
* Key Components:
|
|
8
|
+
* - BloomFilter: Probabilistic membership testing (fast negative lookups)
|
|
9
|
+
* - SparseIndex: Directory of chunks with zone maps (range query optimization)
|
|
10
|
+
* - ChunkManager: Chunk lifecycle management (create/split/merge)
|
|
11
|
+
* - AdaptiveChunkingStrategy: Field-specific optimization strategies
|
|
12
|
+
*
|
|
13
|
+
* Architecture:
|
|
14
|
+
* - Each high-cardinality field gets a sparse index (directory)
|
|
15
|
+
* - Values are grouped into chunks (~50 values per chunk)
|
|
16
|
+
* - Each chunk has a bloom filter for fast negative lookups
|
|
17
|
+
* - Zone maps enable range query optimization
|
|
18
|
+
* - Backward compatible with existing flat file indexes
|
|
19
|
+
*/
|
|
20
|
+
import { StorageAdapter } from '../coreTypes.js';
|
|
21
|
+
/**
 * Zone Map for range query optimization.
 * Tracks min/max values in a chunk for fast range filtering: a chunk whose
 * zone map cannot contain the queried value or range can be skipped without
 * loading its data.
 */
export interface ZoneMap {
    /** Minimum value observed in the chunk. */
    min: any;
    /** Maximum value observed in the chunk. */
    max: any;
    /** Number of values covered by this zone map. */
    count: number;
    /** Whether the chunk contains any null values. */
    hasNulls: boolean;
}
|
|
31
|
+
/**
 * Chunk Descriptor.
 * Metadata about a chunk including its location, zone map, and bloom filter.
 * Held in the sparse index directory so candidate chunks can be selected
 * without loading their contents.
 */
export interface ChunkDescriptor {
    /** Identifier of the chunk (unique within its field). */
    chunkId: number;
    /** Metadata field this chunk indexes. */
    field: string;
    /** Number of distinct values stored in the chunk. */
    valueCount: number;
    /** Total number of IDs referenced by the chunk. */
    idCount: number;
    /** Min/max/count statistics used for range pruning. */
    zoneMap: ZoneMap;
    /** Storage path of the serialized bloom filter, if one exists. */
    bloomFilterPath?: string;
    /** Timestamp of the last modification (assumed Date.now() ms — confirm). */
    lastUpdated: number;
    /** Size above which the chunk should be split. */
    splitThreshold: number;
    /** Size below which the chunk becomes a merge candidate. */
    mergeThreshold: number;
}
|
|
46
|
+
/**
 * Sparse Index Data.
 * Directory structure mapping value ranges to chunks for a single field.
 */
export interface SparseIndexData {
    /** Field this sparse index covers. */
    field: string;
    /** Chunking strategy used for this field. */
    strategy: 'hash' | 'sorted' | 'adaptive';
    /** Descriptors for every chunk belonging to this field. */
    chunks: ChunkDescriptor[];
    /** Total distinct values across all chunks. */
    totalValues: number;
    /** Total IDs across all chunks. */
    totalIds: number;
    /** Timestamp of the last modification (assumed Date.now() ms — confirm). */
    lastUpdated: number;
    /** Target number of values per chunk. */
    chunkSize: number;
    /** Serialization format version. */
    version: number;
}
|
|
60
|
+
/**
 * Chunk Data.
 * Actual storage of field:value -> IDs mappings for one chunk.
 */
export interface ChunkData {
    /** Identifier matching the chunk's ChunkDescriptor. */
    chunkId: number;
    /** Field this chunk belongs to. */
    field: string;
    /** Map from stringified field value to the set of matching entity IDs. */
    entries: Map<string, Set<string>>;
    /** Timestamp of the last modification (assumed Date.now() ms — confirm). */
    lastUpdated: number;
}
|
|
70
|
+
/**
 * Bloom Filter for probabilistic membership testing.
 *
 * Uses multiple hash functions to achieve ~1% false positive rate.
 * Memory efficient: ~10 bits per element for 1% FPR.
 *
 * Properties:
 * - Never produces false negatives (if returns false, definitely not in set)
 * - May produce false positives (~1% with default config)
 * - Space efficient compared to hash sets
 * - Fast O(k) lookup where k = number of hash functions
 */
export declare class BloomFilter {
    /** Backing bit array. */
    private bits;
    /** Total number of bits in the filter (m). */
    private numBits;
    /** Number of hash functions applied per item (k). */
    private numHashFunctions;
    /** Number of items added so far. */
    private itemCount;
    /**
     * Create a Bloom filter sized for the expected load.
     * @param expectedItems Expected number of items to store
     * @param falsePositiveRate Target false positive rate (default: 0.01 = 1%)
     */
    constructor(expectedItems: number, falsePositiveRate?: number);
    /**
     * Add an item to the bloom filter.
     */
    add(item: string): void;
    /**
     * Test if an item might be in the set.
     * @returns false = definitely not in set, true = might be in set
     */
    mightContain(item: string): boolean;
    /**
     * Get multiple hash positions for an item.
     * Uses double hashing technique: h(i) = (h1 + i*h2) mod m
     */
    private getHashPositions;
    /**
     * First hash function (FNV-1a variant).
     */
    private hash1;
    /**
     * Second hash function (DJB2).
     */
    private hash2;
    /**
     * Set a bit in the bit array.
     */
    private setBit;
    /**
     * Get a bit from the bit array.
     */
    private getBit;
    /**
     * Serialize to JSON for storage.
     */
    toJSON(): any;
    /**
     * Deserialize from JSON (inverse of toJSON).
     */
    static fromJSON(data: any): BloomFilter;
    /**
     * Get estimated false positive rate based on current fill.
     */
    getEstimatedFPR(): number;
    /**
     * Count number of set bits across the whole bit array.
     */
    private countSetBits;
    /**
     * Count set bits in a byte (population count).
     */
    private popcount;
}
|
|
144
|
+
/**
 * Sparse Index manages the directory of chunks for a field.
 *
 * Inspired by ClickHouse MergeTree sparse primary index:
 * - Maintains sorted list of chunk descriptors
 * - Uses zone maps for range query optimization
 * - Enables fast chunk selection without loading all data
 *
 * Query Flow:
 * 1. Check zone maps to find candidate chunks
 * 2. Load bloom filters for candidate chunks (fast negative lookup)
 * 3. Load only the chunks that likely contain the value
 */
export declare class SparseIndex {
    /** Directory data: descriptors, totals, strategy (see SparseIndexData). */
    private data;
    /** In-memory bloom filters for registered chunks. */
    private bloomFilters;
    constructor(field: string, chunkSize?: number);
    /**
     * Find chunks that might contain a specific value.
     */
    findChunksForValue(value: any): number[];
    /**
     * Find chunks that overlap with a value range.
     */
    findChunksForRange(min?: any, max?: any): number[];
    /**
     * Check if a value falls within a zone map's range.
     */
    private isValueInZoneMap;
    /**
     * Check if a range overlaps with a zone map.
     */
    private doesRangeOverlap;
    /**
     * Register a chunk in the sparse index.
     */
    registerChunk(descriptor: ChunkDescriptor, bloomFilter?: BloomFilter): void;
    /**
     * Update a chunk descriptor with partial changes.
     */
    updateChunk(chunkId: number, updates: Partial<ChunkDescriptor>): void;
    /**
     * Remove a chunk from the sparse index.
     */
    removeChunk(chunkId: number): void;
    /**
     * Get chunk descriptor by ID.
     */
    getChunk(chunkId: number): ChunkDescriptor | undefined;
    /**
     * Get all chunk IDs.
     */
    getAllChunkIds(): number[];
    /**
     * Sort chunks by zone map min value.
     */
    private sortChunks;
    /**
     * Get sparse index statistics.
     */
    getStats(): {
        field: string;
        chunkCount: number;
        avgValuesPerChunk: number;
        avgIdsPerChunk: number;
        totalValues: number;
        totalIds: number;
        estimatedFPR: number;
    };
    /**
     * Serialize to JSON for storage.
     */
    toJSON(): any;
    /**
     * Deserialize from JSON (inverse of toJSON).
     */
    static fromJSON(data: any): SparseIndex;
}
|
|
222
|
+
/**
 * ChunkManager handles chunk operations: create, split, merge, compact.
 *
 * Responsibilities:
 * - Maintain optimal chunk sizes (~50 values per chunk)
 * - Split chunks that grow too large (> 80 values)
 * - Merge chunks that become too small (< 20 values)
 * - Update zone maps and bloom filters
 * - Coordinate with storage adapter
 */
export declare class ChunkManager {
    /** Storage backend used to persist chunks. */
    private storage;
    /** Cache of loaded chunks. */
    private chunkCache;
    /** Counter for allocating chunk IDs. */
    private nextChunkId;
    constructor(storage: StorageAdapter);
    /**
     * Create a new chunk for a field, optionally seeded with entries.
     */
    createChunk(field: string, initialEntries?: Map<string, Set<string>>): Promise<ChunkData>;
    /**
     * Load a chunk from storage.
     * @returns the chunk data, or null when unavailable
     */
    loadChunk(field: string, chunkId: number): Promise<ChunkData | null>;
    /**
     * Save a chunk to storage.
     */
    saveChunk(chunk: ChunkData): Promise<void>;
    /**
     * Add a value-ID mapping to a chunk.
     */
    addToChunk(chunk: ChunkData, value: string, id: string): Promise<void>;
    /**
     * Remove an ID from a chunk.
     */
    removeFromChunk(chunk: ChunkData, value: string, id: string): Promise<void>;
    /**
     * Calculate zone map (min/max/count statistics) for a chunk.
     */
    calculateZoneMap(chunk: ChunkData): ZoneMap;
    /**
     * Create bloom filter for a chunk.
     */
    createBloomFilter(chunk: ChunkData): BloomFilter;
    /**
     * Split a chunk if it's too large, producing two chunks.
     */
    splitChunk(chunk: ChunkData, sparseIndex: SparseIndex): Promise<{
        chunk1: ChunkData;
        chunk2: ChunkData;
    }>;
    /**
     * Delete a chunk.
     */
    deleteChunk(field: string, chunkId: number): Promise<void>;
    /**
     * Get chunk storage path.
     */
    private getChunkPath;
    /**
     * Get next available chunk ID for a field.
     */
    private getNextChunkId;
    /**
     * Clear chunk cache (for testing/maintenance).
     */
    clearCache(): void;
}
|
|
289
|
+
/**
 * Determines optimal chunking strategy based on field characteristics
 * (cardinality, value distribution, average IDs per value).
 */
export declare class AdaptiveChunkingStrategy {
    /**
     * Determine if a field should use chunking at all.
     */
    shouldUseChunking(fieldStats: {
        uniqueValues: number;
        totalValues: number;
        distribution: 'uniform' | 'skewed' | 'sparse';
    }): boolean;
    /**
     * Determine optimal chunk size for a field.
     */
    getOptimalChunkSize(fieldStats: {
        uniqueValues: number;
        distribution: 'uniform' | 'skewed' | 'sparse';
        avgIdsPerValue: number;
    }): number;
    /**
     * Determine if a chunk should be split, given a size threshold.
     */
    shouldSplit(chunk: {
        valueCount: number;
        idCount: number;
    }, threshold: number): boolean;
    /**
     * Determine if a group of chunks should be merged, given a size threshold.
     */
    shouldMerge(chunks: Array<{
        valueCount: number;
    }>, threshold: number): boolean;
}
|