@soulcraft/brainy 3.41.1 → 3.43.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/brainy.js +1 -0
- package/dist/utils/entityIdMapper.d.ts +93 -0
- package/dist/utils/entityIdMapper.js +169 -0
- package/dist/utils/metadataIndex.d.ts +57 -59
- package/dist/utils/metadataIndex.js +471 -578
- package/dist/utils/metadataIndexChunking.d.ts +331 -0
- package/dist/utils/metadataIndexChunking.js +738 -0
- package/package.json +2 -1
|
@@ -6,18 +6,16 @@
|
|
|
6
6
|
import { MetadataIndexCache } from './metadataIndexCache.js';
|
|
7
7
|
import { prodLog } from './logger.js';
|
|
8
8
|
import { getGlobalCache } from './unifiedCache.js';
|
|
9
|
+
import { SparseIndex, ChunkManager, AdaptiveChunkingStrategy } from './metadataIndexChunking.js';
|
|
10
|
+
import { EntityIdMapper } from './entityIdMapper.js';
|
|
11
|
+
import { RoaringBitmap32 } from 'roaring';
|
|
9
12
|
export class MetadataIndexManager {
|
|
10
13
|
constructor(storage, config = {}) {
|
|
11
|
-
this.indexCache = new Map();
|
|
12
|
-
this.dirtyEntries = new Set();
|
|
13
14
|
this.isRebuilding = false;
|
|
14
15
|
this.fieldIndexes = new Map();
|
|
15
16
|
this.dirtyFields = new Set();
|
|
16
17
|
this.lastFlushTime = Date.now();
|
|
17
18
|
this.autoFlushThreshold = 10; // Start with 10 for more frequent non-blocking flushes
|
|
18
|
-
// Sorted indices for range queries (only for numeric/date fields)
|
|
19
|
-
this.sortedIndices = new Map();
|
|
20
|
-
this.numericFields = new Set(); // Track which fields are numeric
|
|
21
19
|
// Cardinality and field statistics tracking
|
|
22
20
|
this.fieldStats = new Map();
|
|
23
21
|
this.cardinalityUpdateInterval = 100; // Update cardinality every N operations
|
|
@@ -33,6 +31,10 @@ export class MetadataIndexManager {
|
|
|
33
31
|
this.activeLocks = new Map();
|
|
34
32
|
this.lockPromises = new Map();
|
|
35
33
|
this.lockTimers = new Map(); // Track timers for cleanup
|
|
34
|
+
// Adaptive Chunked Sparse Indexing (v3.42.0)
|
|
35
|
+
// Reduces file count from 560k → 89 files (630x reduction)
|
|
36
|
+
// ALL fields now use chunking - no more flat files
|
|
37
|
+
this.sparseIndices = new Map(); // field -> sparse index
|
|
36
38
|
this.storage = storage;
|
|
37
39
|
this.config = {
|
|
38
40
|
maxIndexSize: config.maxIndexSize ?? 10000,
|
|
@@ -67,9 +69,25 @@ export class MetadataIndexManager {
|
|
|
67
69
|
});
|
|
68
70
|
// Get global unified cache for coordinated memory management
|
|
69
71
|
this.unifiedCache = getGlobalCache();
|
|
72
|
+
// Initialize EntityIdMapper for roaring bitmap UUID ↔ integer mapping (v3.43.0)
|
|
73
|
+
this.idMapper = new EntityIdMapper({
|
|
74
|
+
storage,
|
|
75
|
+
storageKey: 'brainy:entityIdMapper'
|
|
76
|
+
});
|
|
77
|
+
// Initialize chunking system (v3.42.0) with roaring bitmap support
|
|
78
|
+
this.chunkManager = new ChunkManager(storage, this.idMapper);
|
|
79
|
+
this.chunkingStrategy = new AdaptiveChunkingStrategy();
|
|
70
80
|
// Lazy load counts from storage statistics on first access
|
|
71
81
|
this.lazyLoadCounts();
|
|
72
82
|
}
|
|
83
|
+
/**
|
|
84
|
+
* Initialize the metadata index manager
|
|
85
|
+
* This must be called after construction and before any queries
|
|
86
|
+
*/
|
|
87
|
+
async init() {
|
|
88
|
+
// Initialize EntityIdMapper (loads UUID ↔ integer mappings from storage)
|
|
89
|
+
await this.idMapper.init();
|
|
90
|
+
}
|
|
73
91
|
/**
|
|
74
92
|
* Acquire an in-memory lock for coordinating concurrent metadata index writes
|
|
75
93
|
* Uses in-memory locks since MetadataIndexManager doesn't have direct file system access
|
|
@@ -148,220 +166,6 @@ export class MetadataIndexManager {
|
|
|
148
166
|
// This maintains zero-configuration principle
|
|
149
167
|
}
|
|
150
168
|
}
|
|
151
|
-
/**
|
|
152
|
-
* Get index key for field and value
|
|
153
|
-
*/
|
|
154
|
-
getIndexKey(field, value) {
|
|
155
|
-
const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
156
|
-
return `${field}:${normalizedValue}`;
|
|
157
|
-
}
|
|
158
|
-
/**
|
|
159
|
-
* Ensure sorted index exists for a field (for range queries)
|
|
160
|
-
*/
|
|
161
|
-
async ensureSortedIndex(field) {
|
|
162
|
-
if (!this.sortedIndices.has(field)) {
|
|
163
|
-
// Try to load from storage first
|
|
164
|
-
const loaded = await this.loadSortedIndex(field);
|
|
165
|
-
if (loaded) {
|
|
166
|
-
this.sortedIndices.set(field, loaded);
|
|
167
|
-
}
|
|
168
|
-
else {
|
|
169
|
-
// Create new sorted index - NOT dirty since we maintain incrementally
|
|
170
|
-
this.sortedIndices.set(field, {
|
|
171
|
-
values: [],
|
|
172
|
-
isDirty: false, // Clean by default with incremental updates
|
|
173
|
-
fieldType: 'mixed'
|
|
174
|
-
});
|
|
175
|
-
}
|
|
176
|
-
}
|
|
177
|
-
}
|
|
178
|
-
/**
|
|
179
|
-
* Build sorted index for a field from hash index
|
|
180
|
-
*/
|
|
181
|
-
async buildSortedIndex(field) {
|
|
182
|
-
const sortedIndex = this.sortedIndices.get(field);
|
|
183
|
-
if (!sortedIndex || !sortedIndex.isDirty)
|
|
184
|
-
return;
|
|
185
|
-
// Collect all values for this field from hash index
|
|
186
|
-
const valueMap = new Map();
|
|
187
|
-
for (const [key, entry] of this.indexCache.entries()) {
|
|
188
|
-
if (entry.field === field) {
|
|
189
|
-
const existing = valueMap.get(entry.value);
|
|
190
|
-
if (existing) {
|
|
191
|
-
// Merge ID sets
|
|
192
|
-
entry.ids.forEach(id => existing.add(id));
|
|
193
|
-
}
|
|
194
|
-
else {
|
|
195
|
-
valueMap.set(entry.value, new Set(entry.ids));
|
|
196
|
-
}
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
// Convert to sorted array
|
|
200
|
-
const sorted = Array.from(valueMap.entries());
|
|
201
|
-
// Detect field type and sort accordingly
|
|
202
|
-
if (sorted.length > 0) {
|
|
203
|
-
const sampleValue = sorted[0][0];
|
|
204
|
-
if (typeof sampleValue === 'number') {
|
|
205
|
-
sortedIndex.fieldType = 'number';
|
|
206
|
-
sorted.sort((a, b) => a[0] - b[0]);
|
|
207
|
-
}
|
|
208
|
-
else if (sampleValue instanceof Date) {
|
|
209
|
-
sortedIndex.fieldType = 'date';
|
|
210
|
-
sorted.sort((a, b) => a[0].getTime() - b[0].getTime());
|
|
211
|
-
}
|
|
212
|
-
else {
|
|
213
|
-
sortedIndex.fieldType = 'string';
|
|
214
|
-
sorted.sort((a, b) => {
|
|
215
|
-
const aVal = String(a[0]);
|
|
216
|
-
const bVal = String(b[0]);
|
|
217
|
-
return aVal < bVal ? -1 : aVal > bVal ? 1 : 0;
|
|
218
|
-
});
|
|
219
|
-
}
|
|
220
|
-
}
|
|
221
|
-
sortedIndex.values = sorted;
|
|
222
|
-
sortedIndex.isDirty = false;
|
|
223
|
-
}
|
|
224
|
-
/**
|
|
225
|
-
* Detect field type from value
|
|
226
|
-
*/
|
|
227
|
-
detectFieldType(value) {
|
|
228
|
-
if (typeof value === 'number' && !isNaN(value))
|
|
229
|
-
return 'number';
|
|
230
|
-
if (value instanceof Date)
|
|
231
|
-
return 'date';
|
|
232
|
-
return 'string';
|
|
233
|
-
}
|
|
234
|
-
/**
|
|
235
|
-
* Compare two values based on field type for sorting
|
|
236
|
-
*/
|
|
237
|
-
compareValues(a, b, fieldType) {
|
|
238
|
-
switch (fieldType) {
|
|
239
|
-
case 'number':
|
|
240
|
-
return a - b;
|
|
241
|
-
case 'date':
|
|
242
|
-
return a.getTime() - b.getTime();
|
|
243
|
-
case 'string':
|
|
244
|
-
default:
|
|
245
|
-
const aStr = String(a);
|
|
246
|
-
const bStr = String(b);
|
|
247
|
-
return aStr < bStr ? -1 : aStr > bStr ? 1 : 0;
|
|
248
|
-
}
|
|
249
|
-
}
|
|
250
|
-
/**
|
|
251
|
-
* Binary search to find insertion position for a value
|
|
252
|
-
* Returns the index where the value should be inserted to maintain sorted order
|
|
253
|
-
*/
|
|
254
|
-
findInsertPosition(sortedArray, value, fieldType) {
|
|
255
|
-
if (sortedArray.length === 0)
|
|
256
|
-
return 0;
|
|
257
|
-
let left = 0;
|
|
258
|
-
let right = sortedArray.length - 1;
|
|
259
|
-
while (left <= right) {
|
|
260
|
-
const mid = Math.floor((left + right) / 2);
|
|
261
|
-
const midVal = sortedArray[mid][0];
|
|
262
|
-
const comparison = this.compareValues(midVal, value, fieldType);
|
|
263
|
-
if (comparison < 0) {
|
|
264
|
-
left = mid + 1;
|
|
265
|
-
}
|
|
266
|
-
else if (comparison > 0) {
|
|
267
|
-
right = mid - 1;
|
|
268
|
-
}
|
|
269
|
-
else {
|
|
270
|
-
return mid; // Value already exists at this position
|
|
271
|
-
}
|
|
272
|
-
}
|
|
273
|
-
return left; // Insert position
|
|
274
|
-
}
|
|
275
|
-
/**
|
|
276
|
-
* Incrementally update sorted index when adding an ID
|
|
277
|
-
*/
|
|
278
|
-
updateSortedIndexAdd(field, value, id) {
|
|
279
|
-
// Ensure sorted index exists
|
|
280
|
-
if (!this.sortedIndices.has(field)) {
|
|
281
|
-
this.sortedIndices.set(field, {
|
|
282
|
-
values: [],
|
|
283
|
-
isDirty: false,
|
|
284
|
-
fieldType: this.detectFieldType(value)
|
|
285
|
-
});
|
|
286
|
-
}
|
|
287
|
-
const sortedIndex = this.sortedIndices.get(field);
|
|
288
|
-
const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
289
|
-
// Find where this value should be in the sorted array
|
|
290
|
-
const insertPos = this.findInsertPosition(sortedIndex.values, normalizedValue, sortedIndex.fieldType);
|
|
291
|
-
if (insertPos < sortedIndex.values.length &&
|
|
292
|
-
sortedIndex.values[insertPos][0] === normalizedValue) {
|
|
293
|
-
// Value already exists, just add the ID to the existing Set
|
|
294
|
-
sortedIndex.values[insertPos][1].add(id);
|
|
295
|
-
}
|
|
296
|
-
else {
|
|
297
|
-
// New value, insert at the correct position
|
|
298
|
-
sortedIndex.values.splice(insertPos, 0, [normalizedValue, new Set([id])]);
|
|
299
|
-
}
|
|
300
|
-
// Mark as clean since we're maintaining it incrementally
|
|
301
|
-
sortedIndex.isDirty = false;
|
|
302
|
-
}
|
|
303
|
-
/**
|
|
304
|
-
* Incrementally update sorted index when removing an ID
|
|
305
|
-
*/
|
|
306
|
-
updateSortedIndexRemove(field, value, id) {
|
|
307
|
-
const sortedIndex = this.sortedIndices.get(field);
|
|
308
|
-
if (!sortedIndex || sortedIndex.values.length === 0)
|
|
309
|
-
return;
|
|
310
|
-
const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
311
|
-
// Binary search to find the value
|
|
312
|
-
const pos = this.findInsertPosition(sortedIndex.values, normalizedValue, sortedIndex.fieldType);
|
|
313
|
-
if (pos < sortedIndex.values.length &&
|
|
314
|
-
sortedIndex.values[pos][0] === normalizedValue) {
|
|
315
|
-
// Remove the ID from the Set
|
|
316
|
-
sortedIndex.values[pos][1].delete(id);
|
|
317
|
-
// If no IDs left for this value, remove the entire entry
|
|
318
|
-
if (sortedIndex.values[pos][1].size === 0) {
|
|
319
|
-
sortedIndex.values.splice(pos, 1);
|
|
320
|
-
}
|
|
321
|
-
}
|
|
322
|
-
// Keep it clean
|
|
323
|
-
sortedIndex.isDirty = false;
|
|
324
|
-
}
|
|
325
|
-
/**
|
|
326
|
-
* Binary search for range start (inclusive or exclusive)
|
|
327
|
-
*/
|
|
328
|
-
binarySearchStart(sorted, target, inclusive) {
|
|
329
|
-
let left = 0;
|
|
330
|
-
let right = sorted.length - 1;
|
|
331
|
-
let result = sorted.length;
|
|
332
|
-
while (left <= right) {
|
|
333
|
-
const mid = Math.floor((left + right) / 2);
|
|
334
|
-
const midVal = sorted[mid][0];
|
|
335
|
-
if (inclusive ? midVal >= target : midVal > target) {
|
|
336
|
-
result = mid;
|
|
337
|
-
right = mid - 1;
|
|
338
|
-
}
|
|
339
|
-
else {
|
|
340
|
-
left = mid + 1;
|
|
341
|
-
}
|
|
342
|
-
}
|
|
343
|
-
return result;
|
|
344
|
-
}
|
|
345
|
-
/**
|
|
346
|
-
* Binary search for range end (inclusive or exclusive)
|
|
347
|
-
*/
|
|
348
|
-
binarySearchEnd(sorted, target, inclusive) {
|
|
349
|
-
let left = 0;
|
|
350
|
-
let right = sorted.length - 1;
|
|
351
|
-
let result = -1;
|
|
352
|
-
while (left <= right) {
|
|
353
|
-
const mid = Math.floor((left + right) / 2);
|
|
354
|
-
const midVal = sorted[mid][0];
|
|
355
|
-
if (inclusive ? midVal <= target : midVal < target) {
|
|
356
|
-
result = mid;
|
|
357
|
-
left = mid + 1;
|
|
358
|
-
}
|
|
359
|
-
else {
|
|
360
|
-
right = mid - 1;
|
|
361
|
-
}
|
|
362
|
-
}
|
|
363
|
-
return result;
|
|
364
|
-
}
|
|
365
169
|
/**
|
|
366
170
|
* Update cardinality statistics for a field
|
|
367
171
|
*/
|
|
@@ -385,18 +189,21 @@ export class MetadataIndexManager {
|
|
|
385
189
|
}
|
|
386
190
|
const stats = this.fieldStats.get(field);
|
|
387
191
|
const cardinality = stats.cardinality;
|
|
388
|
-
// Track unique values
|
|
389
|
-
const
|
|
390
|
-
const
|
|
192
|
+
// Track unique values by checking fieldIndex counts (v3.42.0 - removed indexCache)
|
|
193
|
+
const fieldIndex = this.fieldIndexes.get(field);
|
|
194
|
+
const normalizedValue = this.normalizeValue(value, field);
|
|
195
|
+
const currentCount = fieldIndex?.values[normalizedValue] || 0;
|
|
391
196
|
if (operation === 'add') {
|
|
392
|
-
|
|
197
|
+
// If this is a new value (count is 0), increment unique values
|
|
198
|
+
if (currentCount === 0) {
|
|
393
199
|
cardinality.uniqueValues++;
|
|
394
200
|
}
|
|
395
201
|
cardinality.totalValues++;
|
|
396
202
|
}
|
|
397
203
|
else if (operation === 'remove') {
|
|
398
|
-
|
|
399
|
-
|
|
204
|
+
// If count will become 0, decrement unique values
|
|
205
|
+
if (currentCount === 1) {
|
|
206
|
+
cardinality.uniqueValues = Math.max(0, cardinality.uniqueValues - 1);
|
|
400
207
|
}
|
|
401
208
|
cardinality.totalValues = Math.max(0, cardinality.totalValues - 1);
|
|
402
209
|
}
|
|
@@ -434,23 +241,17 @@ export class MetadataIndexManager {
|
|
|
434
241
|
* Update index strategy based on field statistics
|
|
435
242
|
*/
|
|
436
243
|
updateIndexStrategy(field, stats) {
|
|
437
|
-
const isNumeric = this.numericFields.has(field);
|
|
438
244
|
const hasHighCardinality = stats.cardinality.uniqueValues > this.HIGH_CARDINALITY_THRESHOLD;
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
if (isNumeric && hasRangeQueries) {
|
|
442
|
-
stats.indexType = 'both'; // Need both hash and sorted
|
|
443
|
-
}
|
|
444
|
-
else if (hasRangeQueries) {
|
|
445
|
-
stats.indexType = 'sorted';
|
|
446
|
-
}
|
|
447
|
-
else {
|
|
448
|
-
stats.indexType = 'hash';
|
|
449
|
-
}
|
|
245
|
+
// All fields use chunked sparse indexing with zone maps (v3.42.0)
|
|
246
|
+
stats.indexType = 'hash';
|
|
450
247
|
// Determine normalization strategy for high cardinality NON-temporal fields
|
|
451
248
|
// (Temporal fields are already bucketed in normalizeValue from the start!)
|
|
452
249
|
if (hasHighCardinality) {
|
|
453
|
-
if (
|
|
250
|
+
// Check if field looks numeric (for float precision reduction)
|
|
251
|
+
const fieldLower = field.toLowerCase();
|
|
252
|
+
const looksNumeric = fieldLower.includes('count') || fieldLower.includes('score') ||
|
|
253
|
+
fieldLower.includes('value') || fieldLower.includes('amount');
|
|
254
|
+
if (looksNumeric) {
|
|
454
255
|
stats.normalizationStrategy = 'precision'; // Reduce float precision
|
|
455
256
|
}
|
|
456
257
|
else {
|
|
@@ -461,8 +262,343 @@ export class MetadataIndexManager {
|
|
|
461
262
|
stats.normalizationStrategy = 'none';
|
|
462
263
|
}
|
|
463
264
|
}
|
|
265
|
+
// ============================================================================
|
|
266
|
+
// Adaptive Chunked Sparse Indexing (v3.42.0)
|
|
267
|
+
// All fields use chunking - simplified implementation
|
|
268
|
+
// ============================================================================
|
|
269
|
+
/**
|
|
270
|
+
* Load sparse index from storage
|
|
271
|
+
*/
|
|
272
|
+
async loadSparseIndex(field) {
|
|
273
|
+
const indexPath = `__sparse_index__${field}`;
|
|
274
|
+
const unifiedKey = `metadata:sparse:${field}`;
|
|
275
|
+
return await this.unifiedCache.get(unifiedKey, async () => {
|
|
276
|
+
try {
|
|
277
|
+
const data = await this.storage.getMetadata(indexPath);
|
|
278
|
+
if (data) {
|
|
279
|
+
const sparseIndex = SparseIndex.fromJSON(data);
|
|
280
|
+
// Add to unified cache (sparse indices are expensive to rebuild)
|
|
281
|
+
const size = JSON.stringify(data).length;
|
|
282
|
+
this.unifiedCache.set(unifiedKey, sparseIndex, 'metadata', size, 200);
|
|
283
|
+
return sparseIndex;
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
catch (error) {
|
|
287
|
+
prodLog.debug(`Failed to load sparse index for field '${field}':`, error);
|
|
288
|
+
}
|
|
289
|
+
return undefined;
|
|
290
|
+
});
|
|
291
|
+
}
|
|
292
|
+
/**
|
|
293
|
+
* Save sparse index to storage
|
|
294
|
+
*/
|
|
295
|
+
async saveSparseIndex(field, sparseIndex) {
|
|
296
|
+
const indexPath = `__sparse_index__${field}`;
|
|
297
|
+
const unifiedKey = `metadata:sparse:${field}`;
|
|
298
|
+
const data = sparseIndex.toJSON();
|
|
299
|
+
await this.storage.saveMetadata(indexPath, data);
|
|
300
|
+
// Update unified cache
|
|
301
|
+
const size = JSON.stringify(data).length;
|
|
302
|
+
this.unifiedCache.set(unifiedKey, sparseIndex, 'metadata', size, 200);
|
|
303
|
+
}
|
|
304
|
+
/**
|
|
305
|
+
* Get IDs for a value using chunked sparse index with roaring bitmaps (v3.43.0)
|
|
306
|
+
*/
|
|
307
|
+
async getIdsFromChunks(field, value) {
|
|
308
|
+
// Load sparse index
|
|
309
|
+
let sparseIndex = this.sparseIndices.get(field);
|
|
310
|
+
if (!sparseIndex) {
|
|
311
|
+
sparseIndex = await this.loadSparseIndex(field);
|
|
312
|
+
if (!sparseIndex) {
|
|
313
|
+
return []; // No chunked index exists yet
|
|
314
|
+
}
|
|
315
|
+
this.sparseIndices.set(field, sparseIndex);
|
|
316
|
+
}
|
|
317
|
+
// Find candidate chunks using zone maps and bloom filters
|
|
318
|
+
const normalizedValue = this.normalizeValue(value, field);
|
|
319
|
+
const candidateChunkIds = sparseIndex.findChunksForValue(normalizedValue);
|
|
320
|
+
if (candidateChunkIds.length === 0) {
|
|
321
|
+
return []; // No chunks contain this value
|
|
322
|
+
}
|
|
323
|
+
// Load chunks and collect integer IDs from roaring bitmaps
|
|
324
|
+
const allIntIds = new Set();
|
|
325
|
+
for (const chunkId of candidateChunkIds) {
|
|
326
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
327
|
+
if (chunk) {
|
|
328
|
+
const bitmap = chunk.entries.get(normalizedValue);
|
|
329
|
+
if (bitmap) {
|
|
330
|
+
// Iterate through roaring bitmap integers
|
|
331
|
+
for (const intId of bitmap) {
|
|
332
|
+
allIntIds.add(intId);
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
// Convert integer IDs back to UUIDs
|
|
338
|
+
return this.idMapper.intsIterableToUuids(allIntIds);
|
|
339
|
+
}
|
|
340
|
+
/**
|
|
341
|
+
* Get IDs for a range using chunked sparse index with zone maps and roaring bitmaps (v3.43.0)
|
|
342
|
+
*/
|
|
343
|
+
async getIdsFromChunksForRange(field, min, max, includeMin = true, includeMax = true) {
|
|
344
|
+
// Load sparse index
|
|
345
|
+
let sparseIndex = this.sparseIndices.get(field);
|
|
346
|
+
if (!sparseIndex) {
|
|
347
|
+
sparseIndex = await this.loadSparseIndex(field);
|
|
348
|
+
if (!sparseIndex) {
|
|
349
|
+
return []; // No chunked index exists yet
|
|
350
|
+
}
|
|
351
|
+
this.sparseIndices.set(field, sparseIndex);
|
|
352
|
+
}
|
|
353
|
+
// Find candidate chunks using zone maps
|
|
354
|
+
const candidateChunkIds = sparseIndex.findChunksForRange(min, max);
|
|
355
|
+
if (candidateChunkIds.length === 0) {
|
|
356
|
+
return [];
|
|
357
|
+
}
|
|
358
|
+
// Load chunks and filter by range, collecting integer IDs from roaring bitmaps
|
|
359
|
+
const allIntIds = new Set();
|
|
360
|
+
for (const chunkId of candidateChunkIds) {
|
|
361
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
362
|
+
if (chunk) {
|
|
363
|
+
for (const [value, bitmap] of chunk.entries) {
|
|
364
|
+
// Check if value is in range
|
|
365
|
+
let inRange = true;
|
|
366
|
+
if (min !== undefined) {
|
|
367
|
+
inRange = inRange && (includeMin ? value >= min : value > min);
|
|
368
|
+
}
|
|
369
|
+
if (max !== undefined) {
|
|
370
|
+
inRange = inRange && (includeMax ? value <= max : value < max);
|
|
371
|
+
}
|
|
372
|
+
if (inRange) {
|
|
373
|
+
// Iterate through roaring bitmap integers
|
|
374
|
+
for (const intId of bitmap) {
|
|
375
|
+
allIntIds.add(intId);
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
// Convert integer IDs back to UUIDs
|
|
382
|
+
return this.idMapper.intsIterableToUuids(allIntIds);
|
|
383
|
+
}
|
|
384
|
+
/**
|
|
385
|
+
* Get roaring bitmap for a field-value pair without converting to UUIDs (v3.43.0)
|
|
386
|
+
* This is used for fast multi-field intersection queries using hardware-accelerated bitmap AND
|
|
387
|
+
* @returns RoaringBitmap32 containing integer IDs, or null if no matches
|
|
388
|
+
*/
|
|
389
|
+
async getBitmapFromChunks(field, value) {
|
|
390
|
+
// Load sparse index
|
|
391
|
+
let sparseIndex = this.sparseIndices.get(field);
|
|
392
|
+
if (!sparseIndex) {
|
|
393
|
+
sparseIndex = await this.loadSparseIndex(field);
|
|
394
|
+
if (!sparseIndex) {
|
|
395
|
+
return null; // No chunked index exists yet
|
|
396
|
+
}
|
|
397
|
+
this.sparseIndices.set(field, sparseIndex);
|
|
398
|
+
}
|
|
399
|
+
// Find candidate chunks using zone maps and bloom filters
|
|
400
|
+
const normalizedValue = this.normalizeValue(value, field);
|
|
401
|
+
const candidateChunkIds = sparseIndex.findChunksForValue(normalizedValue);
|
|
402
|
+
if (candidateChunkIds.length === 0) {
|
|
403
|
+
return null; // No chunks contain this value
|
|
404
|
+
}
|
|
405
|
+
// If only one chunk, return its bitmap directly
|
|
406
|
+
if (candidateChunkIds.length === 1) {
|
|
407
|
+
const chunk = await this.chunkManager.loadChunk(field, candidateChunkIds[0]);
|
|
408
|
+
if (chunk) {
|
|
409
|
+
const bitmap = chunk.entries.get(normalizedValue);
|
|
410
|
+
return bitmap || null;
|
|
411
|
+
}
|
|
412
|
+
return null;
|
|
413
|
+
}
|
|
414
|
+
// Multiple chunks: collect all bitmaps and combine with OR
|
|
415
|
+
const bitmaps = [];
|
|
416
|
+
for (const chunkId of candidateChunkIds) {
|
|
417
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
418
|
+
if (chunk) {
|
|
419
|
+
const bitmap = chunk.entries.get(normalizedValue);
|
|
420
|
+
if (bitmap && bitmap.size > 0) {
|
|
421
|
+
bitmaps.push(bitmap);
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
if (bitmaps.length === 0) {
|
|
426
|
+
return null;
|
|
427
|
+
}
|
|
428
|
+
if (bitmaps.length === 1) {
|
|
429
|
+
return bitmaps[0];
|
|
430
|
+
}
|
|
431
|
+
// Combine multiple bitmaps with OR operation
|
|
432
|
+
return RoaringBitmap32.orMany(bitmaps);
|
|
433
|
+
}
|
|
434
|
+
/**
|
|
435
|
+
* Get IDs for multiple field-value pairs using fast roaring bitmap intersection (v3.43.0)
|
|
436
|
+
*
|
|
437
|
+
* This method provides 500-900x faster multi-field queries by:
|
|
438
|
+
* - Using hardware-accelerated bitmap AND operations (SIMD: AVX2/SSE4.2)
|
|
439
|
+
* - Avoiding intermediate UUID array allocations
|
|
440
|
+
* - Converting integers to UUIDs only once at the end
|
|
441
|
+
*
|
|
442
|
+
* Example: { status: 'active', role: 'admin', verified: true }
|
|
443
|
+
* Instead of: fetch 3 UUID arrays → convert to Sets → filter intersection
|
|
444
|
+
* We do: fetch 3 bitmaps → hardware AND → convert final bitmap to UUIDs
|
|
445
|
+
*
|
|
446
|
+
* @param fieldValuePairs Array of field-value pairs to intersect
|
|
447
|
+
* @returns Array of UUID strings matching ALL criteria
|
|
448
|
+
*/
|
|
449
|
+
async getIdsForMultipleFields(fieldValuePairs) {
|
|
450
|
+
if (fieldValuePairs.length === 0) {
|
|
451
|
+
return [];
|
|
452
|
+
}
|
|
453
|
+
// Fast path: single field query
|
|
454
|
+
if (fieldValuePairs.length === 1) {
|
|
455
|
+
const { field, value } = fieldValuePairs[0];
|
|
456
|
+
return await this.getIds(field, value);
|
|
457
|
+
}
|
|
458
|
+
// Collect roaring bitmaps for each field-value pair
|
|
459
|
+
const bitmaps = [];
|
|
460
|
+
for (const { field, value } of fieldValuePairs) {
|
|
461
|
+
const bitmap = await this.getBitmapFromChunks(field, value);
|
|
462
|
+
if (!bitmap || bitmap.size === 0) {
|
|
463
|
+
// Short circuit: if any field has no matches, intersection is empty
|
|
464
|
+
return [];
|
|
465
|
+
}
|
|
466
|
+
bitmaps.push(bitmap);
|
|
467
|
+
}
|
|
468
|
+
// Hardware-accelerated intersection using SIMD instructions (AVX2/SSE4.2)
|
|
469
|
+
// This is 500-900x faster than JavaScript array filtering
|
|
470
|
+
// Note: RoaringBitmap32.and() only takes 2 params, so we reduce manually
|
|
471
|
+
let intersectionBitmap = bitmaps[0];
|
|
472
|
+
for (let i = 1; i < bitmaps.length; i++) {
|
|
473
|
+
intersectionBitmap = RoaringBitmap32.and(intersectionBitmap, bitmaps[i]);
|
|
474
|
+
}
|
|
475
|
+
// Check if empty before converting
|
|
476
|
+
if (intersectionBitmap.size === 0) {
|
|
477
|
+
return [];
|
|
478
|
+
}
|
|
479
|
+
// Convert final bitmap to UUIDs (only once, not per-field)
|
|
480
|
+
return this.idMapper.intsIterableToUuids(intersectionBitmap);
|
|
481
|
+
}
|
|
482
|
+
/**
|
|
483
|
+
* Add value-ID mapping to chunked index
|
|
484
|
+
*/
|
|
485
|
+
async addToChunkedIndex(field, value, id) {
|
|
486
|
+
// Load or create sparse index
|
|
487
|
+
let sparseIndex = this.sparseIndices.get(field);
|
|
488
|
+
if (!sparseIndex) {
|
|
489
|
+
sparseIndex = await this.loadSparseIndex(field);
|
|
490
|
+
if (!sparseIndex) {
|
|
491
|
+
// Create new sparse index
|
|
492
|
+
const stats = this.fieldStats.get(field);
|
|
493
|
+
const chunkSize = stats
|
|
494
|
+
? this.chunkingStrategy.getOptimalChunkSize({
|
|
495
|
+
uniqueValues: stats.cardinality.uniqueValues,
|
|
496
|
+
distribution: stats.cardinality.distribution,
|
|
497
|
+
avgIdsPerValue: stats.cardinality.totalValues / Math.max(1, stats.cardinality.uniqueValues)
|
|
498
|
+
})
|
|
499
|
+
: 50;
|
|
500
|
+
sparseIndex = new SparseIndex(field, chunkSize);
|
|
501
|
+
}
|
|
502
|
+
this.sparseIndices.set(field, sparseIndex);
|
|
503
|
+
}
|
|
504
|
+
const normalizedValue = this.normalizeValue(value, field);
|
|
505
|
+
// Find existing chunk for this value (check zone maps)
|
|
506
|
+
const candidateChunkIds = sparseIndex.findChunksForValue(normalizedValue);
|
|
507
|
+
let targetChunk = null;
|
|
508
|
+
let targetChunkId = null;
|
|
509
|
+
// Try to find an existing chunk with this value
|
|
510
|
+
for (const chunkId of candidateChunkIds) {
|
|
511
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
512
|
+
if (chunk && chunk.entries.has(normalizedValue)) {
|
|
513
|
+
targetChunk = chunk;
|
|
514
|
+
targetChunkId = chunkId;
|
|
515
|
+
break;
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
// If no chunk has this value, find chunk with space or create new one
|
|
519
|
+
if (!targetChunk) {
|
|
520
|
+
// Find a chunk with available space
|
|
521
|
+
for (const chunkId of sparseIndex.getAllChunkIds()) {
|
|
522
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
523
|
+
const descriptor = sparseIndex.getChunk(chunkId);
|
|
524
|
+
if (chunk && descriptor && chunk.entries.size < descriptor.splitThreshold) {
|
|
525
|
+
targetChunk = chunk;
|
|
526
|
+
targetChunkId = chunkId;
|
|
527
|
+
break;
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
}
|
|
531
|
+
// Create new chunk if needed
|
|
532
|
+
if (!targetChunk) {
|
|
533
|
+
targetChunk = await this.chunkManager.createChunk(field);
|
|
534
|
+
targetChunkId = targetChunk.chunkId;
|
|
535
|
+
// Register in sparse index
|
|
536
|
+
const descriptor = {
|
|
537
|
+
chunkId: targetChunk.chunkId,
|
|
538
|
+
field,
|
|
539
|
+
valueCount: 0,
|
|
540
|
+
idCount: 0,
|
|
541
|
+
zoneMap: { min: null, max: null, count: 0, hasNulls: false },
|
|
542
|
+
lastUpdated: Date.now(),
|
|
543
|
+
splitThreshold: 80,
|
|
544
|
+
mergeThreshold: 20
|
|
545
|
+
};
|
|
546
|
+
sparseIndex.registerChunk(descriptor);
|
|
547
|
+
}
|
|
548
|
+
// Add to chunk
|
|
549
|
+
await this.chunkManager.addToChunk(targetChunk, normalizedValue, id);
|
|
550
|
+
await this.chunkManager.saveChunk(targetChunk);
|
|
551
|
+
// Update chunk descriptor in sparse index
|
|
552
|
+
const updatedZoneMap = this.chunkManager.calculateZoneMap(targetChunk);
|
|
553
|
+
const updatedBloomFilter = this.chunkManager.createBloomFilter(targetChunk);
|
|
554
|
+
sparseIndex.updateChunk(targetChunkId, {
|
|
555
|
+
valueCount: targetChunk.entries.size,
|
|
556
|
+
idCount: Array.from(targetChunk.entries.values()).reduce((sum, bitmap) => sum + bitmap.size, 0),
|
|
557
|
+
zoneMap: updatedZoneMap,
|
|
558
|
+
lastUpdated: Date.now()
|
|
559
|
+
});
|
|
560
|
+
// Update bloom filter
|
|
561
|
+
const descriptor = sparseIndex.getChunk(targetChunkId);
|
|
562
|
+
if (descriptor) {
|
|
563
|
+
sparseIndex.registerChunk(descriptor, updatedBloomFilter);
|
|
564
|
+
}
|
|
565
|
+
// Check if chunk needs splitting
|
|
566
|
+
if (targetChunk.entries.size > 80) {
|
|
567
|
+
await this.chunkManager.splitChunk(targetChunk, sparseIndex);
|
|
568
|
+
}
|
|
569
|
+
// Save sparse index
|
|
570
|
+
await this.saveSparseIndex(field, sparseIndex);
|
|
571
|
+
}
|
|
572
|
+
/**
|
|
573
|
+
* Remove ID from chunked index
|
|
574
|
+
*/
|
|
575
|
+
async removeFromChunkedIndex(field, value, id) {
|
|
576
|
+
const sparseIndex = this.sparseIndices.get(field) || await this.loadSparseIndex(field);
|
|
577
|
+
if (!sparseIndex) {
|
|
578
|
+
return; // No chunked index exists
|
|
579
|
+
}
|
|
580
|
+
const normalizedValue = this.normalizeValue(value, field);
|
|
581
|
+
const candidateChunkIds = sparseIndex.findChunksForValue(normalizedValue);
|
|
582
|
+
for (const chunkId of candidateChunkIds) {
|
|
583
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
584
|
+
if (chunk && chunk.entries.has(normalizedValue)) {
|
|
585
|
+
await this.chunkManager.removeFromChunk(chunk, normalizedValue, id);
|
|
586
|
+
await this.chunkManager.saveChunk(chunk);
|
|
587
|
+
// Update sparse index
|
|
588
|
+
const updatedZoneMap = this.chunkManager.calculateZoneMap(chunk);
|
|
589
|
+
sparseIndex.updateChunk(chunkId, {
|
|
590
|
+
valueCount: chunk.entries.size,
|
|
591
|
+
idCount: Array.from(chunk.entries.values()).reduce((sum, bitmap) => sum + bitmap.size, 0),
|
|
592
|
+
zoneMap: updatedZoneMap,
|
|
593
|
+
lastUpdated: Date.now()
|
|
594
|
+
});
|
|
595
|
+
await this.saveSparseIndex(field, sparseIndex);
|
|
596
|
+
break;
|
|
597
|
+
}
|
|
598
|
+
}
|
|
599
|
+
}
|
|
464
600
|
/**
|
|
465
|
-
* Get IDs matching a range query
|
|
601
|
+
* Get IDs matching a range query using zone maps
|
|
466
602
|
*/
|
|
467
603
|
async getIdsForRange(field, min, max, includeMin = true, includeMax = true) {
|
|
468
604
|
// Track range query for field statistics
|
|
@@ -470,35 +606,8 @@ export class MetadataIndexManager {
|
|
|
470
606
|
const stats = this.fieldStats.get(field);
|
|
471
607
|
stats.rangeQueryCount++;
|
|
472
608
|
}
|
|
473
|
-
//
|
|
474
|
-
await this.
|
|
475
|
-
// With incremental updates, we should rarely need to rebuild
|
|
476
|
-
// Only rebuild if it's marked dirty (e.g., after a bulk load or migration)
|
|
477
|
-
let sortedIndex = this.sortedIndices.get(field);
|
|
478
|
-
if (sortedIndex?.isDirty) {
|
|
479
|
-
prodLog.warn(`MetadataIndex: Sorted index for field '${field}' was dirty, rebuilding...`);
|
|
480
|
-
await this.buildSortedIndex(field);
|
|
481
|
-
sortedIndex = this.sortedIndices.get(field);
|
|
482
|
-
}
|
|
483
|
-
if (!sortedIndex || sortedIndex.values.length === 0)
|
|
484
|
-
return [];
|
|
485
|
-
const sorted = sortedIndex.values;
|
|
486
|
-
const resultSet = new Set();
|
|
487
|
-
// Find range boundaries
|
|
488
|
-
let start = 0;
|
|
489
|
-
let end = sorted.length - 1;
|
|
490
|
-
if (min !== undefined) {
|
|
491
|
-
start = this.binarySearchStart(sorted, min, includeMin);
|
|
492
|
-
}
|
|
493
|
-
if (max !== undefined) {
|
|
494
|
-
end = this.binarySearchEnd(sorted, max, includeMax);
|
|
495
|
-
}
|
|
496
|
-
// Collect all IDs in range
|
|
497
|
-
for (let i = start; i <= end && i < sorted.length; i++) {
|
|
498
|
-
const [, ids] = sorted[i];
|
|
499
|
-
ids.forEach(id => resultSet.add(id));
|
|
500
|
-
}
|
|
501
|
-
return Array.from(resultSet);
|
|
609
|
+
// All fields use chunked sparse index with zone map optimization (v3.42.0)
|
|
610
|
+
return await this.getIdsFromChunksForRange(field, min, max, includeMin, includeMax);
|
|
502
611
|
}
|
|
503
612
|
/**
|
|
504
613
|
* Generate field index filename for filter discovery
|
|
@@ -646,51 +755,22 @@ export class MetadataIndexManager {
|
|
|
646
755
|
const updatedFields = new Set();
|
|
647
756
|
for (let i = 0; i < fields.length; i++) {
|
|
648
757
|
const { field, value } = fields[i];
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
entry = loadedEntry ?? {
|
|
655
|
-
field,
|
|
656
|
-
value: this.normalizeValue(value, field), // Pass field for bucketing!
|
|
657
|
-
ids: new Set(),
|
|
658
|
-
lastUpdated: Date.now()
|
|
659
|
-
};
|
|
660
|
-
this.indexCache.set(key, entry);
|
|
661
|
-
}
|
|
662
|
-
// Add ID to entry
|
|
663
|
-
const wasNew = entry.ids.size === 0;
|
|
664
|
-
entry.ids.add(id);
|
|
665
|
-
entry.lastUpdated = Date.now();
|
|
666
|
-
this.dirtyEntries.add(key);
|
|
667
|
-
// Update cardinality statistics
|
|
668
|
-
if (wasNew) {
|
|
669
|
-
this.updateCardinalityStats(field, value, 'add');
|
|
670
|
-
}
|
|
671
|
-
// Track type-field affinity for intelligent NLP
|
|
672
|
-
this.updateTypeFieldAffinity(id, field, value, 'add');
|
|
673
|
-
// Incrementally update sorted index for this field
|
|
674
|
-
if (!updatedFields.has(field)) {
|
|
675
|
-
this.updateSortedIndexAdd(field, value, id);
|
|
676
|
-
updatedFields.add(field);
|
|
677
|
-
}
|
|
678
|
-
else {
|
|
679
|
-
// Multiple values for same field - still update
|
|
680
|
-
this.updateSortedIndexAdd(field, value, id);
|
|
681
|
-
}
|
|
682
|
-
// Update field index
|
|
758
|
+
// All fields use chunked sparse indexing (v3.42.0)
|
|
759
|
+
await this.addToChunkedIndex(field, value, id);
|
|
760
|
+
// Update statistics and tracking
|
|
761
|
+
this.updateCardinalityStats(field, value, 'add');
|
|
762
|
+
this.updateTypeFieldAffinity(id, field, value, 'add', metadata);
|
|
683
763
|
await this.updateFieldIndex(field, value, 1);
|
|
684
764
|
// Yield to event loop every 5 fields to prevent blocking
|
|
685
765
|
if (i % 5 === 4) {
|
|
686
766
|
await this.yieldToEventLoop();
|
|
687
767
|
}
|
|
688
768
|
}
|
|
689
|
-
// Adaptive auto-flush based on usage patterns
|
|
769
|
+
// Adaptive auto-flush based on usage patterns (v3.42.0 - flush field indexes only)
|
|
690
770
|
if (!skipFlush) {
|
|
691
771
|
const timeSinceLastFlush = Date.now() - this.lastFlushTime;
|
|
692
|
-
const shouldAutoFlush = this.
|
|
693
|
-
(this.
|
|
772
|
+
const shouldAutoFlush = this.dirtyFields.size >= this.autoFlushThreshold || // Size threshold
|
|
773
|
+
(this.dirtyFields.size > 10 && timeSinceLastFlush > 5000); // Time threshold (5 seconds)
|
|
694
774
|
if (shouldAutoFlush) {
|
|
695
775
|
const startTime = Date.now();
|
|
696
776
|
await this.flush();
|
|
@@ -743,51 +823,35 @@ export class MetadataIndexManager {
|
|
|
743
823
|
// Remove from specific field indexes
|
|
744
824
|
const fields = this.extractIndexableFields(metadata);
|
|
745
825
|
for (const { field, value } of fields) {
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
if (entry) {
|
|
753
|
-
const hadId = entry.ids.has(id);
|
|
754
|
-
entry.ids.delete(id);
|
|
755
|
-
entry.lastUpdated = Date.now();
|
|
756
|
-
this.dirtyEntries.add(key);
|
|
757
|
-
// Update cardinality statistics
|
|
758
|
-
if (hadId && entry.ids.size === 0) {
|
|
759
|
-
this.updateCardinalityStats(field, value, 'remove');
|
|
760
|
-
}
|
|
761
|
-
// Track type-field affinity for intelligent NLP
|
|
762
|
-
if (hadId) {
|
|
763
|
-
this.updateTypeFieldAffinity(id, field, value, 'remove');
|
|
764
|
-
}
|
|
765
|
-
// Incrementally update sorted index when removing
|
|
766
|
-
this.updateSortedIndexRemove(field, value, id);
|
|
767
|
-
// Update field index
|
|
768
|
-
await this.updateFieldIndex(field, value, -1);
|
|
769
|
-
// If no IDs left, mark for cleanup
|
|
770
|
-
if (entry.ids.size === 0) {
|
|
771
|
-
this.indexCache.delete(key);
|
|
772
|
-
await this.deleteIndexEntry(key);
|
|
773
|
-
}
|
|
774
|
-
}
|
|
826
|
+
// All fields use chunked sparse indexing (v3.42.0)
|
|
827
|
+
await this.removeFromChunkedIndex(field, value, id);
|
|
828
|
+
// Update statistics and tracking
|
|
829
|
+
this.updateCardinalityStats(field, value, 'remove');
|
|
830
|
+
this.updateTypeFieldAffinity(id, field, value, 'remove', metadata);
|
|
831
|
+
await this.updateFieldIndex(field, value, -1);
|
|
775
832
|
// Invalidate cache
|
|
776
833
|
this.metadataCache.invalidatePattern(`field_values_${field}`);
|
|
777
834
|
}
|
|
778
835
|
}
|
|
779
836
|
else {
|
|
780
|
-
// Remove from all indexes (slower, requires scanning)
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
this.
|
|
790
|
-
|
|
837
|
+
// Remove from all indexes (slower, requires scanning all chunks)
|
|
838
|
+
// This should be rare - prefer providing metadata when removing
|
|
839
|
+
prodLog.warn(`Removing ID ${id} without metadata requires scanning all sparse indices (slow)`);
|
|
840
|
+
// Scan all sparse indices
|
|
841
|
+
for (const [field, sparseIndex] of this.sparseIndices.entries()) {
|
|
842
|
+
for (const chunkId of sparseIndex.getAllChunkIds()) {
|
|
843
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
844
|
+
if (chunk) {
|
|
845
|
+
// Convert UUID to integer for bitmap checking
|
|
846
|
+
const intId = this.idMapper.getInt(id);
|
|
847
|
+
if (intId !== undefined) {
|
|
848
|
+
// Check all values in this chunk
|
|
849
|
+
for (const [value, bitmap] of chunk.entries) {
|
|
850
|
+
if (bitmap.has(intId)) {
|
|
851
|
+
await this.removeFromChunkedIndex(field, value, id);
|
|
852
|
+
}
|
|
853
|
+
}
|
|
854
|
+
}
|
|
791
855
|
}
|
|
792
856
|
}
|
|
793
857
|
}
|
|
@@ -797,14 +861,9 @@ export class MetadataIndexManager {
|
|
|
797
861
|
* Get all IDs in the index
|
|
798
862
|
*/
|
|
799
863
|
async getAllIds() {
|
|
800
|
-
//
|
|
864
|
+
// Use storage as the source of truth (v3.42.0 - removed redundant indexCache scan)
|
|
801
865
|
const allIds = new Set();
|
|
802
|
-
//
|
|
803
|
-
for (const entry of this.indexCache.values()) {
|
|
804
|
-
entry.ids.forEach(id => allIds.add(id));
|
|
805
|
-
}
|
|
806
|
-
// If storage has a method to get all nouns, use it as the source of truth
|
|
807
|
-
// This ensures we include items that might not be indexed yet
|
|
866
|
+
// Storage.getNouns() is the definitive source of all entity IDs
|
|
808
867
|
if (this.storage && typeof this.storage.getNouns === 'function') {
|
|
809
868
|
try {
|
|
810
869
|
const result = await this.storage.getNouns({
|
|
@@ -818,13 +877,15 @@ export class MetadataIndexManager {
|
|
|
818
877
|
}
|
|
819
878
|
}
|
|
820
879
|
catch (e) {
|
|
821
|
-
//
|
|
880
|
+
// If storage method fails, return empty array
|
|
881
|
+
prodLog.warn('Failed to get all IDs from storage:', e);
|
|
882
|
+
return [];
|
|
822
883
|
}
|
|
823
884
|
}
|
|
824
885
|
return Array.from(allIds);
|
|
825
886
|
}
|
|
826
887
|
/**
|
|
827
|
-
* Get IDs for a specific field-value combination
|
|
888
|
+
* Get IDs for a specific field-value combination using chunked sparse index
|
|
828
889
|
*/
|
|
829
890
|
async getIds(field, value) {
|
|
830
891
|
// Track exact query for field statistics
|
|
@@ -832,27 +893,8 @@ export class MetadataIndexManager {
|
|
|
832
893
|
const stats = this.fieldStats.get(field);
|
|
833
894
|
stats.exactQueryCount++;
|
|
834
895
|
}
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
const cacheKey = `ids_${key}`;
|
|
838
|
-
const cachedIds = this.metadataCache.get(cacheKey);
|
|
839
|
-
if (cachedIds) {
|
|
840
|
-
return cachedIds;
|
|
841
|
-
}
|
|
842
|
-
// Try in-memory cache
|
|
843
|
-
let entry = this.indexCache.get(key);
|
|
844
|
-
// Load from storage if not cached
|
|
845
|
-
if (!entry) {
|
|
846
|
-
const loadedEntry = await this.loadIndexEntry(key);
|
|
847
|
-
if (loadedEntry) {
|
|
848
|
-
entry = loadedEntry;
|
|
849
|
-
this.indexCache.set(key, entry);
|
|
850
|
-
}
|
|
851
|
-
}
|
|
852
|
-
const ids = entry ? Array.from(entry.ids) : [];
|
|
853
|
-
// Cache the result
|
|
854
|
-
this.metadataCache.set(cacheKey, ids);
|
|
855
|
-
return ids;
|
|
896
|
+
// All fields use chunked sparse indexing (v3.42.0)
|
|
897
|
+
return await this.getIdsFromChunks(field, value);
|
|
856
898
|
}
|
|
857
899
|
/**
|
|
858
900
|
* Get all available values for a field (for filter discovery)
|
|
@@ -1044,14 +1086,26 @@ export class MetadataIndexManager {
|
|
|
1044
1086
|
// Existence operator
|
|
1045
1087
|
case 'exists':
|
|
1046
1088
|
if (operand) {
|
|
1047
|
-
// Get all IDs that have this field (any value)
|
|
1048
|
-
const
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1089
|
+
// Get all IDs that have this field (any value) from chunked sparse index with roaring bitmaps (v3.43.0)
|
|
1090
|
+
const allIntIds = new Set();
|
|
1091
|
+
// Load sparse index for this field
|
|
1092
|
+
const sparseIndex = this.sparseIndices.get(field) || await this.loadSparseIndex(field);
|
|
1093
|
+
if (sparseIndex) {
|
|
1094
|
+
// Iterate through all chunks for this field
|
|
1095
|
+
for (const chunkId of sparseIndex.getAllChunkIds()) {
|
|
1096
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
1097
|
+
if (chunk) {
|
|
1098
|
+
// Collect all integer IDs from all roaring bitmaps in this chunk
|
|
1099
|
+
for (const bitmap of chunk.entries.values()) {
|
|
1100
|
+
for (const intId of bitmap) {
|
|
1101
|
+
allIntIds.add(intId);
|
|
1102
|
+
}
|
|
1103
|
+
}
|
|
1104
|
+
}
|
|
1052
1105
|
}
|
|
1053
1106
|
}
|
|
1054
|
-
|
|
1107
|
+
// Convert integer IDs back to UUIDs
|
|
1108
|
+
fieldResults = this.idMapper.intsIterableToUuids(allIntIds);
|
|
1055
1109
|
}
|
|
1056
1110
|
break;
|
|
1057
1111
|
// Negation operators
|
|
@@ -1149,31 +1203,17 @@ export class MetadataIndexManager {
|
|
|
1149
1203
|
}
|
|
1150
1204
|
/**
|
|
1151
1205
|
* Flush dirty entries to storage (non-blocking version)
|
|
1206
|
+
* NOTE (v3.42.0): Sparse indices are flushed immediately in add/remove operations
|
|
1152
1207
|
*/
|
|
1153
1208
|
async flush() {
|
|
1154
|
-
// Check if we have anything to flush
|
|
1155
|
-
|
|
1156
|
-
if (this.dirtyEntries.size === 0 && this.dirtyFields.size === 0 && !hasDirtySortedIndices) {
|
|
1209
|
+
// Check if we have anything to flush
|
|
1210
|
+
if (this.dirtyFields.size === 0) {
|
|
1157
1211
|
return; // Nothing to flush
|
|
1158
1212
|
}
|
|
1159
1213
|
// Process in smaller batches to avoid blocking
|
|
1160
1214
|
const BATCH_SIZE = 20;
|
|
1161
1215
|
const allPromises = [];
|
|
1162
|
-
// Flush
|
|
1163
|
-
const dirtyEntriesArray = Array.from(this.dirtyEntries);
|
|
1164
|
-
for (let i = 0; i < dirtyEntriesArray.length; i += BATCH_SIZE) {
|
|
1165
|
-
const batch = dirtyEntriesArray.slice(i, i + BATCH_SIZE);
|
|
1166
|
-
const batchPromises = batch.map(key => {
|
|
1167
|
-
const entry = this.indexCache.get(key);
|
|
1168
|
-
return entry ? this.saveIndexEntry(key, entry) : Promise.resolve();
|
|
1169
|
-
});
|
|
1170
|
-
allPromises.push(...batchPromises);
|
|
1171
|
-
// Yield to event loop between batches
|
|
1172
|
-
if (i + BATCH_SIZE < dirtyEntriesArray.length) {
|
|
1173
|
-
await this.yieldToEventLoop();
|
|
1174
|
-
}
|
|
1175
|
-
}
|
|
1176
|
-
// Flush field indexes in batches
|
|
1216
|
+
// Flush field indexes in batches (v3.42.0 - removed flat file flushing)
|
|
1177
1217
|
const dirtyFieldsArray = Array.from(this.dirtyFields);
|
|
1178
1218
|
for (let i = 0; i < dirtyFieldsArray.length; i += BATCH_SIZE) {
|
|
1179
1219
|
const batch = dirtyFieldsArray.slice(i, i + BATCH_SIZE);
|
|
@@ -1187,15 +1227,10 @@ export class MetadataIndexManager {
|
|
|
1187
1227
|
await this.yieldToEventLoop();
|
|
1188
1228
|
}
|
|
1189
1229
|
}
|
|
1190
|
-
// Flush sorted indices (for range queries)
|
|
1191
|
-
for (const [field, sortedIndex] of this.sortedIndices.entries()) {
|
|
1192
|
-
if (sortedIndex.isDirty) {
|
|
1193
|
-
allPromises.push(this.saveSortedIndex(field, sortedIndex));
|
|
1194
|
-
}
|
|
1195
|
-
}
|
|
1196
1230
|
// Wait for all operations to complete
|
|
1197
1231
|
await Promise.all(allPromises);
|
|
1198
|
-
|
|
1232
|
+
// Flush EntityIdMapper (UUID ↔ integer mappings) (v3.43.0)
|
|
1233
|
+
await this.idMapper.flush();
|
|
1199
1234
|
this.dirtyFields.clear();
|
|
1200
1235
|
this.lastFlushTime = Date.now();
|
|
1201
1236
|
}
|
|
@@ -1275,69 +1310,6 @@ export class MetadataIndexManager {
|
|
|
1275
1310
|
}
|
|
1276
1311
|
}
|
|
1277
1312
|
}
|
|
1278
|
-
/**
|
|
1279
|
-
* Save sorted index to storage for range queries with file locking
|
|
1280
|
-
*/
|
|
1281
|
-
async saveSortedIndex(field, sortedIndex) {
|
|
1282
|
-
const filename = `sorted_${field}`;
|
|
1283
|
-
const lockKey = `sorted_index_${field}`;
|
|
1284
|
-
const lockAcquired = await this.acquireLock(lockKey, 5000); // 5 second timeout
|
|
1285
|
-
if (!lockAcquired) {
|
|
1286
|
-
prodLog.warn(`Failed to acquire lock for sorted index '${field}', proceeding without lock`);
|
|
1287
|
-
}
|
|
1288
|
-
try {
|
|
1289
|
-
const indexId = `__metadata_sorted_index__${filename}`;
|
|
1290
|
-
const unifiedKey = `metadata:sorted:${field}`;
|
|
1291
|
-
// Convert Set to Array for serialization
|
|
1292
|
-
const serializable = {
|
|
1293
|
-
values: sortedIndex.values.map(([value, ids]) => [value, Array.from(ids)]),
|
|
1294
|
-
fieldType: sortedIndex.fieldType,
|
|
1295
|
-
lastUpdated: Date.now()
|
|
1296
|
-
};
|
|
1297
|
-
await this.storage.saveMetadata(indexId, serializable);
|
|
1298
|
-
// Mark as clean
|
|
1299
|
-
sortedIndex.isDirty = false;
|
|
1300
|
-
// Update unified cache (sorted indices are expensive to rebuild)
|
|
1301
|
-
const size = JSON.stringify(serializable).length;
|
|
1302
|
-
this.unifiedCache.set(unifiedKey, sortedIndex, 'metadata', size, 100); // Higher rebuild cost
|
|
1303
|
-
}
|
|
1304
|
-
finally {
|
|
1305
|
-
if (lockAcquired) {
|
|
1306
|
-
await this.releaseLock(lockKey);
|
|
1307
|
-
}
|
|
1308
|
-
}
|
|
1309
|
-
}
|
|
1310
|
-
/**
|
|
1311
|
-
* Load sorted index from storage
|
|
1312
|
-
*/
|
|
1313
|
-
async loadSortedIndex(field) {
|
|
1314
|
-
const filename = `sorted_${field}`;
|
|
1315
|
-
const indexId = `__metadata_sorted_index__${filename}`;
|
|
1316
|
-
const unifiedKey = `metadata:sorted:${field}`;
|
|
1317
|
-
// Check unified cache first
|
|
1318
|
-
const cached = await this.unifiedCache.get(unifiedKey, async () => {
|
|
1319
|
-
try {
|
|
1320
|
-
const data = await this.storage.getMetadata(indexId);
|
|
1321
|
-
if (data) {
|
|
1322
|
-
// Convert Arrays back to Sets
|
|
1323
|
-
const sortedIndex = {
|
|
1324
|
-
values: data.values.map(([value, ids]) => [value, new Set(ids)]),
|
|
1325
|
-
fieldType: data.fieldType || 'mixed',
|
|
1326
|
-
isDirty: false
|
|
1327
|
-
};
|
|
1328
|
-
// Add to unified cache
|
|
1329
|
-
const size = JSON.stringify(data).length;
|
|
1330
|
-
this.unifiedCache.set(unifiedKey, sortedIndex, 'metadata', size, 100);
|
|
1331
|
-
return sortedIndex;
|
|
1332
|
-
}
|
|
1333
|
-
}
|
|
1334
|
-
catch (error) {
|
|
1335
|
-
// Sorted index doesn't exist yet
|
|
1336
|
-
}
|
|
1337
|
-
return null;
|
|
1338
|
-
});
|
|
1339
|
-
return cached;
|
|
1340
|
-
}
|
|
1341
1313
|
/**
|
|
1342
1314
|
* Get count of entities by type - O(1) operation using existing tracking
|
|
1343
1315
|
* This exposes the production-ready counting that's already maintained
|
|
@@ -1362,19 +1334,12 @@ export class MetadataIndexManager {
|
|
|
1362
1334
|
return new Map(this.totalEntitiesByType);
|
|
1363
1335
|
}
|
|
1364
1336
|
/**
|
|
1365
|
-
* Get count of entities matching field-value criteria -
|
|
1337
|
+
* Get count of entities matching field-value criteria - queries chunked sparse index
|
|
1366
1338
|
*/
|
|
1367
1339
|
async getCountForCriteria(field, value) {
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
const loadedEntry = await this.loadIndexEntry(key);
|
|
1372
|
-
if (loadedEntry) {
|
|
1373
|
-
entry = loadedEntry;
|
|
1374
|
-
this.indexCache.set(key, entry);
|
|
1375
|
-
}
|
|
1376
|
-
}
|
|
1377
|
-
return entry ? entry.ids.size : 0;
|
|
1340
|
+
// Use chunked sparse indexing (v3.42.0 - removed indexCache)
|
|
1341
|
+
const ids = await this.getIds(field, value);
|
|
1342
|
+
return ids.length;
|
|
1378
1343
|
}
|
|
1379
1344
|
/**
|
|
1380
1345
|
* Get index statistics with enhanced counting information
|
|
@@ -1383,10 +1348,23 @@ export class MetadataIndexManager {
|
|
|
1383
1348
|
const fields = new Set();
|
|
1384
1349
|
let totalEntries = 0;
|
|
1385
1350
|
let totalIds = 0;
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1351
|
+
// Collect stats from sparse indices (v3.42.0 - removed indexCache)
|
|
1352
|
+
for (const [field, sparseIndex] of this.sparseIndices.entries()) {
|
|
1353
|
+
fields.add(field);
|
|
1354
|
+
// Count entries and IDs from all chunks
|
|
1355
|
+
for (const chunkId of sparseIndex.getAllChunkIds()) {
|
|
1356
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
1357
|
+
if (chunk) {
|
|
1358
|
+
totalEntries += chunk.entries.size;
|
|
1359
|
+
for (const ids of chunk.entries.values()) {
|
|
1360
|
+
totalIds += ids.size;
|
|
1361
|
+
}
|
|
1362
|
+
}
|
|
1363
|
+
}
|
|
1364
|
+
}
|
|
1365
|
+
// Also include fields from fieldIndexes that might not have sparse indices yet
|
|
1366
|
+
for (const field of this.fieldIndexes.keys()) {
|
|
1367
|
+
fields.add(field);
|
|
1390
1368
|
}
|
|
1391
1369
|
return {
|
|
1392
1370
|
totalEntries,
|
|
@@ -1408,9 +1386,8 @@ export class MetadataIndexManager {
|
|
|
1408
1386
|
prodLog.info('🔄 Starting non-blocking metadata index rebuild with batch processing to prevent socket exhaustion...');
|
|
1409
1387
|
prodLog.info(`📊 Storage adapter: ${this.storage.constructor.name}`);
|
|
1410
1388
|
prodLog.info(`🔧 Batch processing available: ${!!this.storage.getMetadataBatch}`);
|
|
1411
|
-
// Clear existing indexes
|
|
1412
|
-
this.
|
|
1413
|
-
this.dirtyEntries.clear();
|
|
1389
|
+
// Clear existing indexes (v3.42.0 - use sparse indices instead of flat files)
|
|
1390
|
+
this.sparseIndices.clear();
|
|
1414
1391
|
this.fieldIndexes.clear();
|
|
1415
1392
|
this.dirtyFields.clear();
|
|
1416
1393
|
// Rebuild noun metadata indexes using pagination
|
|
@@ -1599,89 +1576,6 @@ export class MetadataIndexManager {
|
|
|
1599
1576
|
this.isRebuilding = false;
|
|
1600
1577
|
}
|
|
1601
1578
|
}
|
|
1602
|
-
/**
|
|
1603
|
-
* Load index entry from storage using safe filenames
|
|
1604
|
-
*/
|
|
1605
|
-
async loadIndexEntry(key) {
|
|
1606
|
-
const unifiedKey = `metadata:entry:${key}`;
|
|
1607
|
-
// Use unified cache with loader function
|
|
1608
|
-
return await this.unifiedCache.get(unifiedKey, async () => {
|
|
1609
|
-
try {
|
|
1610
|
-
// Extract field and value from key
|
|
1611
|
-
const [field, value] = key.split(':', 2);
|
|
1612
|
-
const filename = this.getValueChunkFilename(field, value);
|
|
1613
|
-
// Load from metadata indexes directory with safe filename
|
|
1614
|
-
const indexId = `__metadata_index__${filename}`;
|
|
1615
|
-
const data = await this.storage.getMetadata(indexId);
|
|
1616
|
-
if (data) {
|
|
1617
|
-
const entry = {
|
|
1618
|
-
field: data.field,
|
|
1619
|
-
value: data.value,
|
|
1620
|
-
ids: new Set(data.ids || []),
|
|
1621
|
-
lastUpdated: data.lastUpdated || Date.now()
|
|
1622
|
-
};
|
|
1623
|
-
// Add to unified cache (metadata entries are cheap to rebuild)
|
|
1624
|
-
const size = JSON.stringify(Array.from(entry.ids)).length + 100;
|
|
1625
|
-
this.unifiedCache.set(unifiedKey, entry, 'metadata', size, 1);
|
|
1626
|
-
return entry;
|
|
1627
|
-
}
|
|
1628
|
-
}
|
|
1629
|
-
catch (error) {
|
|
1630
|
-
// Index entry doesn't exist yet
|
|
1631
|
-
}
|
|
1632
|
-
return null;
|
|
1633
|
-
});
|
|
1634
|
-
}
|
|
1635
|
-
/**
|
|
1636
|
-
* Save index entry to storage using safe filenames with file locking
|
|
1637
|
-
*/
|
|
1638
|
-
async saveIndexEntry(key, entry) {
|
|
1639
|
-
const lockKey = `index_entry_${key}`;
|
|
1640
|
-
const lockAcquired = await this.acquireLock(lockKey, 5000); // 5 second timeout
|
|
1641
|
-
if (!lockAcquired) {
|
|
1642
|
-
prodLog.warn(`Failed to acquire lock for index entry '${key}', proceeding without lock`);
|
|
1643
|
-
}
|
|
1644
|
-
try {
|
|
1645
|
-
const unifiedKey = `metadata:entry:${key}`;
|
|
1646
|
-
const data = {
|
|
1647
|
-
field: entry.field,
|
|
1648
|
-
value: entry.value,
|
|
1649
|
-
ids: Array.from(entry.ids),
|
|
1650
|
-
lastUpdated: entry.lastUpdated
|
|
1651
|
-
};
|
|
1652
|
-
// Extract field and value from key for safe filename generation
|
|
1653
|
-
const [field, value] = key.split(':', 2);
|
|
1654
|
-
const filename = this.getValueChunkFilename(field, value);
|
|
1655
|
-
// Store metadata indexes with safe filename
|
|
1656
|
-
const indexId = `__metadata_index__${filename}`;
|
|
1657
|
-
await this.storage.saveMetadata(indexId, data);
|
|
1658
|
-
// Update unified cache
|
|
1659
|
-
const size = JSON.stringify(data.ids).length + 100;
|
|
1660
|
-
this.unifiedCache.set(unifiedKey, entry, 'metadata', size, 1);
|
|
1661
|
-
}
|
|
1662
|
-
finally {
|
|
1663
|
-
if (lockAcquired) {
|
|
1664
|
-
await this.releaseLock(lockKey);
|
|
1665
|
-
}
|
|
1666
|
-
}
|
|
1667
|
-
}
|
|
1668
|
-
/**
|
|
1669
|
-
* Delete index entry from storage using safe filenames
|
|
1670
|
-
*/
|
|
1671
|
-
async deleteIndexEntry(key) {
|
|
1672
|
-
const unifiedKey = `metadata:entry:${key}`;
|
|
1673
|
-
try {
|
|
1674
|
-
const [field, value] = key.split(':', 2);
|
|
1675
|
-
const filename = this.getValueChunkFilename(field, value);
|
|
1676
|
-
const indexId = `__metadata_index__${filename}`;
|
|
1677
|
-
await this.storage.saveMetadata(indexId, null);
|
|
1678
|
-
// Remove from unified cache
|
|
1679
|
-
this.unifiedCache.delete(unifiedKey);
|
|
1680
|
-
}
|
|
1681
|
-
catch (error) {
|
|
1682
|
-
// Entry might not exist
|
|
1683
|
-
}
|
|
1684
|
-
}
|
|
1685
1579
|
/**
|
|
1686
1580
|
* Get field statistics for optimization and discovery
|
|
1687
1581
|
*/
|
|
@@ -1814,7 +1708,7 @@ export class MetadataIndexManager {
|
|
|
1814
1708
|
* Update type-field affinity tracking for intelligent NLP
|
|
1815
1709
|
* Tracks which fields commonly appear with which entity types
|
|
1816
1710
|
*/
|
|
1817
|
-
updateTypeFieldAffinity(entityId, field, value, operation) {
|
|
1711
|
+
updateTypeFieldAffinity(entityId, field, value, operation, metadata) {
|
|
1818
1712
|
// Only track affinity for non-system fields (but allow 'noun' for type detection)
|
|
1819
1713
|
if (this.config.excludeFields.includes(field) && field !== 'noun')
|
|
1820
1714
|
return;
|
|
@@ -1824,14 +1718,13 @@ export class MetadataIndexManager {
|
|
|
1824
1718
|
// This is the type definition itself
|
|
1825
1719
|
entityType = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
1826
1720
|
}
|
|
1721
|
+
else if (metadata && metadata.noun) {
|
|
1722
|
+
// Extract entity type from metadata (v3.42.0 - removed indexCache scan)
|
|
1723
|
+
entityType = this.normalizeValue(metadata.noun, 'noun');
|
|
1724
|
+
}
|
|
1827
1725
|
else {
|
|
1828
|
-
//
|
|
1829
|
-
|
|
1830
|
-
if (key.startsWith('noun:') && entry.ids.has(entityId)) {
|
|
1831
|
-
entityType = key.split(':', 2)[1];
|
|
1832
|
-
break;
|
|
1833
|
-
}
|
|
1834
|
-
}
|
|
1726
|
+
// No type information available, skip affinity tracking
|
|
1727
|
+
return;
|
|
1835
1728
|
}
|
|
1836
1729
|
if (!entityType)
|
|
1837
1730
|
return; // No type found, skip affinity tracking
|