@soulcraft/brainy 3.41.1 → 3.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,322 @@
1
+ /**
2
+ * Metadata Index Chunking System
3
+ *
4
+ * Implements Adaptive Chunked Sparse Indexing inspired by ClickHouse MergeTree.
5
+ * Reduces file count from 560k to ~89 files (~6,300x reduction) while maintaining performance.
6
+ *
7
+ * Key Components:
8
+ * - BloomFilter: Probabilistic membership testing (fast negative lookups)
9
+ * - SparseIndex: Directory of chunks with zone maps (range query optimization)
10
+ * - ChunkManager: Chunk lifecycle management (create/split/merge)
11
+ * - AdaptiveChunkingStrategy: Field-specific optimization strategies
12
+ *
13
+ * Architecture:
14
+ * - Each high-cardinality field gets a sparse index (directory)
15
+ * - Values are grouped into chunks (~50 values per chunk)
16
+ * - Each chunk has a bloom filter for fast negative lookups
17
+ * - Zone maps enable range query optimization
18
+ * - Backward compatible with existing flat file indexes
19
+ */
20
+ import { StorageAdapter } from '../coreTypes.js';
21
+ /**
22
+ * Zone Map for range query optimization
23
+ * Tracks min/max values in a chunk for fast range filtering
24
+ */
25
+ export interface ZoneMap {
26
+ min: any;
27
+ max: any;
28
+ count: number;
29
+ hasNulls: boolean;
30
+ }
31
+ /**
32
+ * Chunk Descriptor
33
+ * Metadata about a chunk including its location, zone map, and bloom filter
34
+ */
35
+ export interface ChunkDescriptor {
36
+ chunkId: number;
37
+ field: string;
38
+ valueCount: number;
39
+ idCount: number;
40
+ zoneMap: ZoneMap;
41
+ bloomFilterPath?: string;
42
+ lastUpdated: number;
43
+ splitThreshold: number;
44
+ mergeThreshold: number;
45
+ }
46
+ /**
47
+ * Sparse Index Data
48
+ * Directory structure mapping value ranges to chunks
49
+ */
50
+ export interface SparseIndexData {
51
+ field: string;
52
+ strategy: 'hash' | 'sorted' | 'adaptive';
53
+ chunks: ChunkDescriptor[];
54
+ totalValues: number;
55
+ totalIds: number;
56
+ lastUpdated: number;
57
+ chunkSize: number;
58
+ version: number;
59
+ }
60
+ /**
61
+ * Chunk Data
62
+ * Actual storage of field:value -> IDs mappings
63
+ */
64
+ export interface ChunkData {
65
+ chunkId: number;
66
+ field: string;
67
+ entries: Map<string, Set<string>>;
68
+ lastUpdated: number;
69
+ }
70
+ /**
71
+ * Bloom Filter for probabilistic membership testing
72
+ *
73
+ * Uses multiple hash functions to achieve ~1% false positive rate.
74
+ * Memory efficient: ~10 bits per element for 1% FPR.
75
+ *
76
+ * Properties:
77
+ * - Never produces false negatives (if returns false, definitely not in set)
78
+ * - May produce false positives (~1% with default config)
79
+ * - Space efficient compared to hash sets
80
+ * - Fast O(k) lookup where k = number of hash functions
81
+ */
82
+ export declare class BloomFilter {
83
+ private bits;
84
+ private numBits;
85
+ private numHashFunctions;
86
+ private itemCount;
87
+ /**
88
+ * Create a Bloom filter
89
+ * @param expectedItems Expected number of items to store
90
+ * @param falsePositiveRate Target false positive rate (default: 0.01 = 1%)
91
+ */
92
+ constructor(expectedItems: number, falsePositiveRate?: number);
93
+ /**
94
+ * Add an item to the bloom filter
95
+ */
96
+ add(item: string): void;
97
+ /**
98
+ * Test if an item might be in the set
99
+ * @returns false = definitely not in set, true = might be in set
100
+ */
101
+ mightContain(item: string): boolean;
102
+ /**
103
+ * Get multiple hash positions for an item
104
+ * Uses double hashing technique: h(i) = (h1 + i*h2) mod m
105
+ */
106
+ private getHashPositions;
107
+ /**
108
+ * First hash function (FNV-1a variant)
109
+ */
110
+ private hash1;
111
+ /**
112
+ * Second hash function (DJB2)
113
+ */
114
+ private hash2;
115
+ /**
116
+ * Set a bit in the bit array
117
+ */
118
+ private setBit;
119
+ /**
120
+ * Get a bit from the bit array
121
+ */
122
+ private getBit;
123
+ /**
124
+ * Serialize to JSON for storage
125
+ */
126
+ toJSON(): any;
127
+ /**
128
+ * Deserialize from JSON
129
+ */
130
+ static fromJSON(data: any): BloomFilter;
131
+ /**
132
+ * Get estimated false positive rate based on current fill
133
+ */
134
+ getEstimatedFPR(): number;
135
+ /**
136
+ * Count number of set bits
137
+ */
138
+ private countSetBits;
139
+ /**
140
+ * Count set bits in a byte (population count)
141
+ */
142
+ private popcount;
143
+ }
144
+ /**
145
+ * Sparse Index manages the directory of chunks for a field
146
+ *
147
+ * Inspired by ClickHouse MergeTree sparse primary index:
148
+ * - Maintains sorted list of chunk descriptors
149
+ * - Uses zone maps for range query optimization
150
+ * - Enables fast chunk selection without loading all data
151
+ *
152
+ * Query Flow:
153
+ * 1. Check zone maps to find candidate chunks
154
+ * 2. Load bloom filters for candidate chunks (fast negative lookup)
155
+ * 3. Load only the chunks that likely contain the value
156
+ */
157
+ export declare class SparseIndex {
158
+ private data;
159
+ private bloomFilters;
160
+ constructor(field: string, chunkSize?: number);
161
+ /**
162
+ * Find chunks that might contain a specific value
163
+ */
164
+ findChunksForValue(value: any): number[];
165
+ /**
166
+ * Find chunks that overlap with a value range
167
+ */
168
+ findChunksForRange(min?: any, max?: any): number[];
169
+ /**
170
+ * Check if a value falls within a zone map's range
171
+ */
172
+ private isValueInZoneMap;
173
+ /**
174
+ * Check if a range overlaps with a zone map
175
+ */
176
+ private doesRangeOverlap;
177
+ /**
178
+ * Register a chunk in the sparse index
179
+ */
180
+ registerChunk(descriptor: ChunkDescriptor, bloomFilter?: BloomFilter): void;
181
+ /**
182
+ * Update a chunk descriptor
183
+ */
184
+ updateChunk(chunkId: number, updates: Partial<ChunkDescriptor>): void;
185
+ /**
186
+ * Remove a chunk from the sparse index
187
+ */
188
+ removeChunk(chunkId: number): void;
189
+ /**
190
+ * Get chunk descriptor by ID
191
+ */
192
+ getChunk(chunkId: number): ChunkDescriptor | undefined;
193
+ /**
194
+ * Get all chunk IDs
195
+ */
196
+ getAllChunkIds(): number[];
197
+ /**
198
+ * Sort chunks by zone map min value
199
+ */
200
+ private sortChunks;
201
+ /**
202
+ * Get sparse index statistics
203
+ */
204
+ getStats(): {
205
+ field: string;
206
+ chunkCount: number;
207
+ avgValuesPerChunk: number;
208
+ avgIdsPerChunk: number;
209
+ totalValues: number;
210
+ totalIds: number;
211
+ estimatedFPR: number;
212
+ };
213
+ /**
214
+ * Serialize to JSON for storage
215
+ */
216
+ toJSON(): any;
217
+ /**
218
+ * Deserialize from JSON
219
+ */
220
+ static fromJSON(data: any): SparseIndex;
221
+ }
222
+ /**
223
+ * ChunkManager handles chunk operations: create, split, merge, compact
224
+ *
225
+ * Responsibilities:
226
+ * - Maintain optimal chunk sizes (~50 values per chunk)
227
+ * - Split chunks that grow too large (> 80 values)
228
+ * - Merge chunks that become too small (< 20 values)
229
+ * - Update zone maps and bloom filters
230
+ * - Coordinate with storage adapter
231
+ */
232
+ export declare class ChunkManager {
233
+ private storage;
234
+ private chunkCache;
235
+ private nextChunkId;
236
+ constructor(storage: StorageAdapter);
237
+ /**
238
+ * Create a new chunk for a field
239
+ */
240
+ createChunk(field: string, initialEntries?: Map<string, Set<string>>): Promise<ChunkData>;
241
+ /**
242
+ * Load a chunk from storage
243
+ */
244
+ loadChunk(field: string, chunkId: number): Promise<ChunkData | null>;
245
+ /**
246
+ * Save a chunk to storage
247
+ */
248
+ saveChunk(chunk: ChunkData): Promise<void>;
249
+ /**
250
+ * Add a value-ID mapping to a chunk
251
+ */
252
+ addToChunk(chunk: ChunkData, value: string, id: string): Promise<void>;
253
+ /**
254
+ * Remove an ID from a chunk
255
+ */
256
+ removeFromChunk(chunk: ChunkData, value: string, id: string): Promise<void>;
257
+ /**
258
+ * Calculate zone map for a chunk
259
+ */
260
+ calculateZoneMap(chunk: ChunkData): ZoneMap;
261
+ /**
262
+ * Create bloom filter for a chunk
263
+ */
264
+ createBloomFilter(chunk: ChunkData): BloomFilter;
265
+ /**
266
+ * Split a chunk if it's too large
267
+ */
268
+ splitChunk(chunk: ChunkData, sparseIndex: SparseIndex): Promise<{
269
+ chunk1: ChunkData;
270
+ chunk2: ChunkData;
271
+ }>;
272
+ /**
273
+ * Delete a chunk
274
+ */
275
+ deleteChunk(field: string, chunkId: number): Promise<void>;
276
+ /**
277
+ * Get chunk storage path
278
+ */
279
+ private getChunkPath;
280
+ /**
281
+ * Get next available chunk ID for a field
282
+ */
283
+ private getNextChunkId;
284
+ /**
285
+ * Clear chunk cache (for testing/maintenance)
286
+ */
287
+ clearCache(): void;
288
+ }
289
+ /**
290
+ * Determines optimal chunking strategy based on field characteristics
291
+ */
292
+ export declare class AdaptiveChunkingStrategy {
293
+ /**
294
+ * Determine if a field should use chunking
295
+ */
296
+ shouldUseChunking(fieldStats: {
297
+ uniqueValues: number;
298
+ totalValues: number;
299
+ distribution: 'uniform' | 'skewed' | 'sparse';
300
+ }): boolean;
301
+ /**
302
+ * Determine optimal chunk size for a field
303
+ */
304
+ getOptimalChunkSize(fieldStats: {
305
+ uniqueValues: number;
306
+ distribution: 'uniform' | 'skewed' | 'sparse';
307
+ avgIdsPerValue: number;
308
+ }): number;
309
+ /**
310
+ * Determine if a chunk should be split
311
+ */
312
+ shouldSplit(chunk: {
313
+ valueCount: number;
314
+ idCount: number;
315
+ }, threshold: number): boolean;
316
+ /**
317
+ * Determine if chunks should be merged
318
+ */
319
+ shouldMerge(chunks: Array<{
320
+ valueCount: number;
321
+ }>, threshold: number): boolean;
322
+ }