@soulcraft/brainy 3.41.0 → 3.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/utils/metadataIndex.d.ts +30 -63
- package/dist/utils/metadataIndex.js +339 -576
- package/dist/utils/metadataIndexChunking.d.ts +322 -0
- package/dist/utils/metadataIndexChunking.js +709 -0
- package/package.json +1 -1
|
@@ -6,18 +6,14 @@
|
|
|
6
6
|
import { MetadataIndexCache } from './metadataIndexCache.js';
|
|
7
7
|
import { prodLog } from './logger.js';
|
|
8
8
|
import { getGlobalCache } from './unifiedCache.js';
|
|
9
|
+
import { SparseIndex, ChunkManager, AdaptiveChunkingStrategy } from './metadataIndexChunking.js';
|
|
9
10
|
export class MetadataIndexManager {
|
|
10
11
|
constructor(storage, config = {}) {
|
|
11
|
-
this.indexCache = new Map();
|
|
12
|
-
this.dirtyEntries = new Set();
|
|
13
12
|
this.isRebuilding = false;
|
|
14
13
|
this.fieldIndexes = new Map();
|
|
15
14
|
this.dirtyFields = new Set();
|
|
16
15
|
this.lastFlushTime = Date.now();
|
|
17
16
|
this.autoFlushThreshold = 10; // Start with 10 for more frequent non-blocking flushes
|
|
18
|
-
// Sorted indices for range queries (only for numeric/date fields)
|
|
19
|
-
this.sortedIndices = new Map();
|
|
20
|
-
this.numericFields = new Set(); // Track which fields are numeric
|
|
21
17
|
// Cardinality and field statistics tracking
|
|
22
18
|
this.fieldStats = new Map();
|
|
23
19
|
this.cardinalityUpdateInterval = 100; // Update cardinality every N operations
|
|
@@ -33,6 +29,10 @@ export class MetadataIndexManager {
|
|
|
33
29
|
this.activeLocks = new Map();
|
|
34
30
|
this.lockPromises = new Map();
|
|
35
31
|
this.lockTimers = new Map(); // Track timers for cleanup
|
|
32
|
+
// Adaptive Chunked Sparse Indexing (v3.42.0)
|
|
33
|
+
// Reduces file count from 560k → 89 files (630x reduction)
|
|
34
|
+
// ALL fields now use chunking - no more flat files
|
|
35
|
+
this.sparseIndices = new Map(); // field -> sparse index
|
|
36
36
|
this.storage = storage;
|
|
37
37
|
this.config = {
|
|
38
38
|
maxIndexSize: config.maxIndexSize ?? 10000,
|
|
@@ -67,6 +67,9 @@ export class MetadataIndexManager {
|
|
|
67
67
|
});
|
|
68
68
|
// Get global unified cache for coordinated memory management
|
|
69
69
|
this.unifiedCache = getGlobalCache();
|
|
70
|
+
// Initialize chunking system (v3.42.0)
|
|
71
|
+
this.chunkManager = new ChunkManager(storage);
|
|
72
|
+
this.chunkingStrategy = new AdaptiveChunkingStrategy();
|
|
70
73
|
// Lazy load counts from storage statistics on first access
|
|
71
74
|
this.lazyLoadCounts();
|
|
72
75
|
}
|
|
@@ -148,220 +151,6 @@ export class MetadataIndexManager {
|
|
|
148
151
|
// This maintains zero-configuration principle
|
|
149
152
|
}
|
|
150
153
|
}
|
|
151
|
-
/**
|
|
152
|
-
* Get index key for field and value
|
|
153
|
-
*/
|
|
154
|
-
getIndexKey(field, value) {
|
|
155
|
-
const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
156
|
-
return `${field}:${normalizedValue}`;
|
|
157
|
-
}
|
|
158
|
-
/**
|
|
159
|
-
* Ensure sorted index exists for a field (for range queries)
|
|
160
|
-
*/
|
|
161
|
-
async ensureSortedIndex(field) {
|
|
162
|
-
if (!this.sortedIndices.has(field)) {
|
|
163
|
-
// Try to load from storage first
|
|
164
|
-
const loaded = await this.loadSortedIndex(field);
|
|
165
|
-
if (loaded) {
|
|
166
|
-
this.sortedIndices.set(field, loaded);
|
|
167
|
-
}
|
|
168
|
-
else {
|
|
169
|
-
// Create new sorted index - NOT dirty since we maintain incrementally
|
|
170
|
-
this.sortedIndices.set(field, {
|
|
171
|
-
values: [],
|
|
172
|
-
isDirty: false, // Clean by default with incremental updates
|
|
173
|
-
fieldType: 'mixed'
|
|
174
|
-
});
|
|
175
|
-
}
|
|
176
|
-
}
|
|
177
|
-
}
|
|
178
|
-
/**
|
|
179
|
-
* Build sorted index for a field from hash index
|
|
180
|
-
*/
|
|
181
|
-
async buildSortedIndex(field) {
|
|
182
|
-
const sortedIndex = this.sortedIndices.get(field);
|
|
183
|
-
if (!sortedIndex || !sortedIndex.isDirty)
|
|
184
|
-
return;
|
|
185
|
-
// Collect all values for this field from hash index
|
|
186
|
-
const valueMap = new Map();
|
|
187
|
-
for (const [key, entry] of this.indexCache.entries()) {
|
|
188
|
-
if (entry.field === field) {
|
|
189
|
-
const existing = valueMap.get(entry.value);
|
|
190
|
-
if (existing) {
|
|
191
|
-
// Merge ID sets
|
|
192
|
-
entry.ids.forEach(id => existing.add(id));
|
|
193
|
-
}
|
|
194
|
-
else {
|
|
195
|
-
valueMap.set(entry.value, new Set(entry.ids));
|
|
196
|
-
}
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
// Convert to sorted array
|
|
200
|
-
const sorted = Array.from(valueMap.entries());
|
|
201
|
-
// Detect field type and sort accordingly
|
|
202
|
-
if (sorted.length > 0) {
|
|
203
|
-
const sampleValue = sorted[0][0];
|
|
204
|
-
if (typeof sampleValue === 'number') {
|
|
205
|
-
sortedIndex.fieldType = 'number';
|
|
206
|
-
sorted.sort((a, b) => a[0] - b[0]);
|
|
207
|
-
}
|
|
208
|
-
else if (sampleValue instanceof Date) {
|
|
209
|
-
sortedIndex.fieldType = 'date';
|
|
210
|
-
sorted.sort((a, b) => a[0].getTime() - b[0].getTime());
|
|
211
|
-
}
|
|
212
|
-
else {
|
|
213
|
-
sortedIndex.fieldType = 'string';
|
|
214
|
-
sorted.sort((a, b) => {
|
|
215
|
-
const aVal = String(a[0]);
|
|
216
|
-
const bVal = String(b[0]);
|
|
217
|
-
return aVal < bVal ? -1 : aVal > bVal ? 1 : 0;
|
|
218
|
-
});
|
|
219
|
-
}
|
|
220
|
-
}
|
|
221
|
-
sortedIndex.values = sorted;
|
|
222
|
-
sortedIndex.isDirty = false;
|
|
223
|
-
}
|
|
224
|
-
/**
|
|
225
|
-
* Detect field type from value
|
|
226
|
-
*/
|
|
227
|
-
detectFieldType(value) {
|
|
228
|
-
if (typeof value === 'number' && !isNaN(value))
|
|
229
|
-
return 'number';
|
|
230
|
-
if (value instanceof Date)
|
|
231
|
-
return 'date';
|
|
232
|
-
return 'string';
|
|
233
|
-
}
|
|
234
|
-
/**
|
|
235
|
-
* Compare two values based on field type for sorting
|
|
236
|
-
*/
|
|
237
|
-
compareValues(a, b, fieldType) {
|
|
238
|
-
switch (fieldType) {
|
|
239
|
-
case 'number':
|
|
240
|
-
return a - b;
|
|
241
|
-
case 'date':
|
|
242
|
-
return a.getTime() - b.getTime();
|
|
243
|
-
case 'string':
|
|
244
|
-
default:
|
|
245
|
-
const aStr = String(a);
|
|
246
|
-
const bStr = String(b);
|
|
247
|
-
return aStr < bStr ? -1 : aStr > bStr ? 1 : 0;
|
|
248
|
-
}
|
|
249
|
-
}
|
|
250
|
-
/**
|
|
251
|
-
* Binary search to find insertion position for a value
|
|
252
|
-
* Returns the index where the value should be inserted to maintain sorted order
|
|
253
|
-
*/
|
|
254
|
-
findInsertPosition(sortedArray, value, fieldType) {
|
|
255
|
-
if (sortedArray.length === 0)
|
|
256
|
-
return 0;
|
|
257
|
-
let left = 0;
|
|
258
|
-
let right = sortedArray.length - 1;
|
|
259
|
-
while (left <= right) {
|
|
260
|
-
const mid = Math.floor((left + right) / 2);
|
|
261
|
-
const midVal = sortedArray[mid][0];
|
|
262
|
-
const comparison = this.compareValues(midVal, value, fieldType);
|
|
263
|
-
if (comparison < 0) {
|
|
264
|
-
left = mid + 1;
|
|
265
|
-
}
|
|
266
|
-
else if (comparison > 0) {
|
|
267
|
-
right = mid - 1;
|
|
268
|
-
}
|
|
269
|
-
else {
|
|
270
|
-
return mid; // Value already exists at this position
|
|
271
|
-
}
|
|
272
|
-
}
|
|
273
|
-
return left; // Insert position
|
|
274
|
-
}
|
|
275
|
-
/**
|
|
276
|
-
* Incrementally update sorted index when adding an ID
|
|
277
|
-
*/
|
|
278
|
-
updateSortedIndexAdd(field, value, id) {
|
|
279
|
-
// Ensure sorted index exists
|
|
280
|
-
if (!this.sortedIndices.has(field)) {
|
|
281
|
-
this.sortedIndices.set(field, {
|
|
282
|
-
values: [],
|
|
283
|
-
isDirty: false,
|
|
284
|
-
fieldType: this.detectFieldType(value)
|
|
285
|
-
});
|
|
286
|
-
}
|
|
287
|
-
const sortedIndex = this.sortedIndices.get(field);
|
|
288
|
-
const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
289
|
-
// Find where this value should be in the sorted array
|
|
290
|
-
const insertPos = this.findInsertPosition(sortedIndex.values, normalizedValue, sortedIndex.fieldType);
|
|
291
|
-
if (insertPos < sortedIndex.values.length &&
|
|
292
|
-
sortedIndex.values[insertPos][0] === normalizedValue) {
|
|
293
|
-
// Value already exists, just add the ID to the existing Set
|
|
294
|
-
sortedIndex.values[insertPos][1].add(id);
|
|
295
|
-
}
|
|
296
|
-
else {
|
|
297
|
-
// New value, insert at the correct position
|
|
298
|
-
sortedIndex.values.splice(insertPos, 0, [normalizedValue, new Set([id])]);
|
|
299
|
-
}
|
|
300
|
-
// Mark as clean since we're maintaining it incrementally
|
|
301
|
-
sortedIndex.isDirty = false;
|
|
302
|
-
}
|
|
303
|
-
/**
|
|
304
|
-
* Incrementally update sorted index when removing an ID
|
|
305
|
-
*/
|
|
306
|
-
updateSortedIndexRemove(field, value, id) {
|
|
307
|
-
const sortedIndex = this.sortedIndices.get(field);
|
|
308
|
-
if (!sortedIndex || sortedIndex.values.length === 0)
|
|
309
|
-
return;
|
|
310
|
-
const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
311
|
-
// Binary search to find the value
|
|
312
|
-
const pos = this.findInsertPosition(sortedIndex.values, normalizedValue, sortedIndex.fieldType);
|
|
313
|
-
if (pos < sortedIndex.values.length &&
|
|
314
|
-
sortedIndex.values[pos][0] === normalizedValue) {
|
|
315
|
-
// Remove the ID from the Set
|
|
316
|
-
sortedIndex.values[pos][1].delete(id);
|
|
317
|
-
// If no IDs left for this value, remove the entire entry
|
|
318
|
-
if (sortedIndex.values[pos][1].size === 0) {
|
|
319
|
-
sortedIndex.values.splice(pos, 1);
|
|
320
|
-
}
|
|
321
|
-
}
|
|
322
|
-
// Keep it clean
|
|
323
|
-
sortedIndex.isDirty = false;
|
|
324
|
-
}
|
|
325
|
-
/**
|
|
326
|
-
* Binary search for range start (inclusive or exclusive)
|
|
327
|
-
*/
|
|
328
|
-
binarySearchStart(sorted, target, inclusive) {
|
|
329
|
-
let left = 0;
|
|
330
|
-
let right = sorted.length - 1;
|
|
331
|
-
let result = sorted.length;
|
|
332
|
-
while (left <= right) {
|
|
333
|
-
const mid = Math.floor((left + right) / 2);
|
|
334
|
-
const midVal = sorted[mid][0];
|
|
335
|
-
if (inclusive ? midVal >= target : midVal > target) {
|
|
336
|
-
result = mid;
|
|
337
|
-
right = mid - 1;
|
|
338
|
-
}
|
|
339
|
-
else {
|
|
340
|
-
left = mid + 1;
|
|
341
|
-
}
|
|
342
|
-
}
|
|
343
|
-
return result;
|
|
344
|
-
}
|
|
345
|
-
/**
|
|
346
|
-
* Binary search for range end (inclusive or exclusive)
|
|
347
|
-
*/
|
|
348
|
-
binarySearchEnd(sorted, target, inclusive) {
|
|
349
|
-
let left = 0;
|
|
350
|
-
let right = sorted.length - 1;
|
|
351
|
-
let result = -1;
|
|
352
|
-
while (left <= right) {
|
|
353
|
-
const mid = Math.floor((left + right) / 2);
|
|
354
|
-
const midVal = sorted[mid][0];
|
|
355
|
-
if (inclusive ? midVal <= target : midVal < target) {
|
|
356
|
-
result = mid;
|
|
357
|
-
left = mid + 1;
|
|
358
|
-
}
|
|
359
|
-
else {
|
|
360
|
-
right = mid - 1;
|
|
361
|
-
}
|
|
362
|
-
}
|
|
363
|
-
return result;
|
|
364
|
-
}
|
|
365
154
|
/**
|
|
366
155
|
* Update cardinality statistics for a field
|
|
367
156
|
*/
|
|
@@ -385,18 +174,21 @@ export class MetadataIndexManager {
|
|
|
385
174
|
}
|
|
386
175
|
const stats = this.fieldStats.get(field);
|
|
387
176
|
const cardinality = stats.cardinality;
|
|
388
|
-
// Track unique values
|
|
389
|
-
const
|
|
390
|
-
const
|
|
177
|
+
// Track unique values by checking fieldIndex counts (v3.42.0 - removed indexCache)
|
|
178
|
+
const fieldIndex = this.fieldIndexes.get(field);
|
|
179
|
+
const normalizedValue = this.normalizeValue(value, field);
|
|
180
|
+
const currentCount = fieldIndex?.values[normalizedValue] || 0;
|
|
391
181
|
if (operation === 'add') {
|
|
392
|
-
|
|
182
|
+
// If this is a new value (count is 0), increment unique values
|
|
183
|
+
if (currentCount === 0) {
|
|
393
184
|
cardinality.uniqueValues++;
|
|
394
185
|
}
|
|
395
186
|
cardinality.totalValues++;
|
|
396
187
|
}
|
|
397
188
|
else if (operation === 'remove') {
|
|
398
|
-
|
|
399
|
-
|
|
189
|
+
// If count will become 0, decrement unique values
|
|
190
|
+
if (currentCount === 1) {
|
|
191
|
+
cardinality.uniqueValues = Math.max(0, cardinality.uniqueValues - 1);
|
|
400
192
|
}
|
|
401
193
|
cardinality.totalValues = Math.max(0, cardinality.totalValues - 1);
|
|
402
194
|
}
|
|
@@ -434,23 +226,17 @@ export class MetadataIndexManager {
|
|
|
434
226
|
* Update index strategy based on field statistics
|
|
435
227
|
*/
|
|
436
228
|
updateIndexStrategy(field, stats) {
|
|
437
|
-
const isNumeric = this.numericFields.has(field);
|
|
438
229
|
const hasHighCardinality = stats.cardinality.uniqueValues > this.HIGH_CARDINALITY_THRESHOLD;
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
if (isNumeric && hasRangeQueries) {
|
|
442
|
-
stats.indexType = 'both'; // Need both hash and sorted
|
|
443
|
-
}
|
|
444
|
-
else if (hasRangeQueries) {
|
|
445
|
-
stats.indexType = 'sorted';
|
|
446
|
-
}
|
|
447
|
-
else {
|
|
448
|
-
stats.indexType = 'hash';
|
|
449
|
-
}
|
|
230
|
+
// All fields use chunked sparse indexing with zone maps (v3.42.0)
|
|
231
|
+
stats.indexType = 'hash';
|
|
450
232
|
// Determine normalization strategy for high cardinality NON-temporal fields
|
|
451
233
|
// (Temporal fields are already bucketed in normalizeValue from the start!)
|
|
452
234
|
if (hasHighCardinality) {
|
|
453
|
-
if (
|
|
235
|
+
// Check if field looks numeric (for float precision reduction)
|
|
236
|
+
const fieldLower = field.toLowerCase();
|
|
237
|
+
const looksNumeric = fieldLower.includes('count') || fieldLower.includes('score') ||
|
|
238
|
+
fieldLower.includes('value') || fieldLower.includes('amount');
|
|
239
|
+
if (looksNumeric) {
|
|
454
240
|
stats.normalizationStrategy = 'precision'; // Reduce float precision
|
|
455
241
|
}
|
|
456
242
|
else {
|
|
@@ -461,8 +247,237 @@ export class MetadataIndexManager {
|
|
|
461
247
|
stats.normalizationStrategy = 'none';
|
|
462
248
|
}
|
|
463
249
|
}
|
|
250
|
+
// ============================================================================
|
|
251
|
+
// Adaptive Chunked Sparse Indexing (v3.42.0)
|
|
252
|
+
// All fields use chunking - simplified implementation
|
|
253
|
+
// ============================================================================
|
|
254
|
+
/**
|
|
255
|
+
* Load sparse index from storage
|
|
256
|
+
*/
|
|
257
|
+
async loadSparseIndex(field) {
|
|
258
|
+
const indexPath = `__sparse_index__${field}`;
|
|
259
|
+
const unifiedKey = `metadata:sparse:${field}`;
|
|
260
|
+
return await this.unifiedCache.get(unifiedKey, async () => {
|
|
261
|
+
try {
|
|
262
|
+
const data = await this.storage.getMetadata(indexPath);
|
|
263
|
+
if (data) {
|
|
264
|
+
const sparseIndex = SparseIndex.fromJSON(data);
|
|
265
|
+
// Add to unified cache (sparse indices are expensive to rebuild)
|
|
266
|
+
const size = JSON.stringify(data).length;
|
|
267
|
+
this.unifiedCache.set(unifiedKey, sparseIndex, 'metadata', size, 200);
|
|
268
|
+
return sparseIndex;
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
catch (error) {
|
|
272
|
+
prodLog.debug(`Failed to load sparse index for field '${field}':`, error);
|
|
273
|
+
}
|
|
274
|
+
return undefined;
|
|
275
|
+
});
|
|
276
|
+
}
|
|
277
|
+
/**
|
|
278
|
+
* Save sparse index to storage
|
|
279
|
+
*/
|
|
280
|
+
async saveSparseIndex(field, sparseIndex) {
|
|
281
|
+
const indexPath = `__sparse_index__${field}`;
|
|
282
|
+
const unifiedKey = `metadata:sparse:${field}`;
|
|
283
|
+
const data = sparseIndex.toJSON();
|
|
284
|
+
await this.storage.saveMetadata(indexPath, data);
|
|
285
|
+
// Update unified cache
|
|
286
|
+
const size = JSON.stringify(data).length;
|
|
287
|
+
this.unifiedCache.set(unifiedKey, sparseIndex, 'metadata', size, 200);
|
|
288
|
+
}
|
|
289
|
+
/**
|
|
290
|
+
* Get IDs for a value using chunked sparse index
|
|
291
|
+
*/
|
|
292
|
+
async getIdsFromChunks(field, value) {
|
|
293
|
+
// Load sparse index
|
|
294
|
+
let sparseIndex = this.sparseIndices.get(field);
|
|
295
|
+
if (!sparseIndex) {
|
|
296
|
+
sparseIndex = await this.loadSparseIndex(field);
|
|
297
|
+
if (!sparseIndex) {
|
|
298
|
+
return []; // No chunked index exists yet
|
|
299
|
+
}
|
|
300
|
+
this.sparseIndices.set(field, sparseIndex);
|
|
301
|
+
}
|
|
302
|
+
// Find candidate chunks using zone maps and bloom filters
|
|
303
|
+
const normalizedValue = this.normalizeValue(value, field);
|
|
304
|
+
const candidateChunkIds = sparseIndex.findChunksForValue(normalizedValue);
|
|
305
|
+
if (candidateChunkIds.length === 0) {
|
|
306
|
+
return []; // No chunks contain this value
|
|
307
|
+
}
|
|
308
|
+
// Load chunks and collect IDs
|
|
309
|
+
const allIds = new Set();
|
|
310
|
+
for (const chunkId of candidateChunkIds) {
|
|
311
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
312
|
+
if (chunk) {
|
|
313
|
+
const ids = chunk.entries.get(normalizedValue);
|
|
314
|
+
if (ids) {
|
|
315
|
+
ids.forEach(id => allIds.add(id));
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
return Array.from(allIds);
|
|
320
|
+
}
|
|
321
|
+
/**
|
|
322
|
+
* Get IDs for a range using chunked sparse index with zone maps
|
|
323
|
+
*/
|
|
324
|
+
async getIdsFromChunksForRange(field, min, max, includeMin = true, includeMax = true) {
|
|
325
|
+
// Load sparse index
|
|
326
|
+
let sparseIndex = this.sparseIndices.get(field);
|
|
327
|
+
if (!sparseIndex) {
|
|
328
|
+
sparseIndex = await this.loadSparseIndex(field);
|
|
329
|
+
if (!sparseIndex) {
|
|
330
|
+
return []; // No chunked index exists yet
|
|
331
|
+
}
|
|
332
|
+
this.sparseIndices.set(field, sparseIndex);
|
|
333
|
+
}
|
|
334
|
+
// Find candidate chunks using zone maps
|
|
335
|
+
const candidateChunkIds = sparseIndex.findChunksForRange(min, max);
|
|
336
|
+
if (candidateChunkIds.length === 0) {
|
|
337
|
+
return [];
|
|
338
|
+
}
|
|
339
|
+
// Load chunks and filter by range
|
|
340
|
+
const allIds = new Set();
|
|
341
|
+
for (const chunkId of candidateChunkIds) {
|
|
342
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
343
|
+
if (chunk) {
|
|
344
|
+
for (const [value, ids] of chunk.entries) {
|
|
345
|
+
// Check if value is in range
|
|
346
|
+
let inRange = true;
|
|
347
|
+
if (min !== undefined) {
|
|
348
|
+
inRange = inRange && (includeMin ? value >= min : value > min);
|
|
349
|
+
}
|
|
350
|
+
if (max !== undefined) {
|
|
351
|
+
inRange = inRange && (includeMax ? value <= max : value < max);
|
|
352
|
+
}
|
|
353
|
+
if (inRange) {
|
|
354
|
+
ids.forEach(id => allIds.add(id));
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
}
|
|
359
|
+
return Array.from(allIds);
|
|
360
|
+
}
|
|
361
|
+
/**
|
|
362
|
+
* Add value-ID mapping to chunked index
|
|
363
|
+
*/
|
|
364
|
+
async addToChunkedIndex(field, value, id) {
|
|
365
|
+
// Load or create sparse index
|
|
366
|
+
let sparseIndex = this.sparseIndices.get(field);
|
|
367
|
+
if (!sparseIndex) {
|
|
368
|
+
sparseIndex = await this.loadSparseIndex(field);
|
|
369
|
+
if (!sparseIndex) {
|
|
370
|
+
// Create new sparse index
|
|
371
|
+
const stats = this.fieldStats.get(field);
|
|
372
|
+
const chunkSize = stats
|
|
373
|
+
? this.chunkingStrategy.getOptimalChunkSize({
|
|
374
|
+
uniqueValues: stats.cardinality.uniqueValues,
|
|
375
|
+
distribution: stats.cardinality.distribution,
|
|
376
|
+
avgIdsPerValue: stats.cardinality.totalValues / Math.max(1, stats.cardinality.uniqueValues)
|
|
377
|
+
})
|
|
378
|
+
: 50;
|
|
379
|
+
sparseIndex = new SparseIndex(field, chunkSize);
|
|
380
|
+
}
|
|
381
|
+
this.sparseIndices.set(field, sparseIndex);
|
|
382
|
+
}
|
|
383
|
+
const normalizedValue = this.normalizeValue(value, field);
|
|
384
|
+
// Find existing chunk for this value (check zone maps)
|
|
385
|
+
const candidateChunkIds = sparseIndex.findChunksForValue(normalizedValue);
|
|
386
|
+
let targetChunk = null;
|
|
387
|
+
let targetChunkId = null;
|
|
388
|
+
// Try to find an existing chunk with this value
|
|
389
|
+
for (const chunkId of candidateChunkIds) {
|
|
390
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
391
|
+
if (chunk && chunk.entries.has(normalizedValue)) {
|
|
392
|
+
targetChunk = chunk;
|
|
393
|
+
targetChunkId = chunkId;
|
|
394
|
+
break;
|
|
395
|
+
}
|
|
396
|
+
}
|
|
397
|
+
// If no chunk has this value, find chunk with space or create new one
|
|
398
|
+
if (!targetChunk) {
|
|
399
|
+
// Find a chunk with available space
|
|
400
|
+
for (const chunkId of sparseIndex.getAllChunkIds()) {
|
|
401
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
402
|
+
const descriptor = sparseIndex.getChunk(chunkId);
|
|
403
|
+
if (chunk && descriptor && chunk.entries.size < descriptor.splitThreshold) {
|
|
404
|
+
targetChunk = chunk;
|
|
405
|
+
targetChunkId = chunkId;
|
|
406
|
+
break;
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
// Create new chunk if needed
|
|
411
|
+
if (!targetChunk) {
|
|
412
|
+
targetChunk = await this.chunkManager.createChunk(field);
|
|
413
|
+
targetChunkId = targetChunk.chunkId;
|
|
414
|
+
// Register in sparse index
|
|
415
|
+
const descriptor = {
|
|
416
|
+
chunkId: targetChunk.chunkId,
|
|
417
|
+
field,
|
|
418
|
+
valueCount: 0,
|
|
419
|
+
idCount: 0,
|
|
420
|
+
zoneMap: { min: null, max: null, count: 0, hasNulls: false },
|
|
421
|
+
lastUpdated: Date.now(),
|
|
422
|
+
splitThreshold: 80,
|
|
423
|
+
mergeThreshold: 20
|
|
424
|
+
};
|
|
425
|
+
sparseIndex.registerChunk(descriptor);
|
|
426
|
+
}
|
|
427
|
+
// Add to chunk
|
|
428
|
+
await this.chunkManager.addToChunk(targetChunk, normalizedValue, id);
|
|
429
|
+
await this.chunkManager.saveChunk(targetChunk);
|
|
430
|
+
// Update chunk descriptor in sparse index
|
|
431
|
+
const updatedZoneMap = this.chunkManager.calculateZoneMap(targetChunk);
|
|
432
|
+
const updatedBloomFilter = this.chunkManager.createBloomFilter(targetChunk);
|
|
433
|
+
sparseIndex.updateChunk(targetChunkId, {
|
|
434
|
+
valueCount: targetChunk.entries.size,
|
|
435
|
+
idCount: Array.from(targetChunk.entries.values()).reduce((sum, ids) => sum + ids.size, 0),
|
|
436
|
+
zoneMap: updatedZoneMap,
|
|
437
|
+
lastUpdated: Date.now()
|
|
438
|
+
});
|
|
439
|
+
// Update bloom filter
|
|
440
|
+
const descriptor = sparseIndex.getChunk(targetChunkId);
|
|
441
|
+
if (descriptor) {
|
|
442
|
+
sparseIndex.registerChunk(descriptor, updatedBloomFilter);
|
|
443
|
+
}
|
|
444
|
+
// Check if chunk needs splitting
|
|
445
|
+
if (targetChunk.entries.size > 80) {
|
|
446
|
+
await this.chunkManager.splitChunk(targetChunk, sparseIndex);
|
|
447
|
+
}
|
|
448
|
+
// Save sparse index
|
|
449
|
+
await this.saveSparseIndex(field, sparseIndex);
|
|
450
|
+
}
|
|
464
451
|
/**
|
|
465
|
-
*
|
|
452
|
+
* Remove ID from chunked index
|
|
453
|
+
*/
|
|
454
|
+
async removeFromChunkedIndex(field, value, id) {
|
|
455
|
+
const sparseIndex = this.sparseIndices.get(field) || await this.loadSparseIndex(field);
|
|
456
|
+
if (!sparseIndex) {
|
|
457
|
+
return; // No chunked index exists
|
|
458
|
+
}
|
|
459
|
+
const normalizedValue = this.normalizeValue(value, field);
|
|
460
|
+
const candidateChunkIds = sparseIndex.findChunksForValue(normalizedValue);
|
|
461
|
+
for (const chunkId of candidateChunkIds) {
|
|
462
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
463
|
+
if (chunk && chunk.entries.has(normalizedValue)) {
|
|
464
|
+
await this.chunkManager.removeFromChunk(chunk, normalizedValue, id);
|
|
465
|
+
await this.chunkManager.saveChunk(chunk);
|
|
466
|
+
// Update sparse index
|
|
467
|
+
const updatedZoneMap = this.chunkManager.calculateZoneMap(chunk);
|
|
468
|
+
sparseIndex.updateChunk(chunkId, {
|
|
469
|
+
valueCount: chunk.entries.size,
|
|
470
|
+
idCount: Array.from(chunk.entries.values()).reduce((sum, ids) => sum + ids.size, 0),
|
|
471
|
+
zoneMap: updatedZoneMap,
|
|
472
|
+
lastUpdated: Date.now()
|
|
473
|
+
});
|
|
474
|
+
await this.saveSparseIndex(field, sparseIndex);
|
|
475
|
+
break;
|
|
476
|
+
}
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
/**
|
|
480
|
+
* Get IDs matching a range query using zone maps
|
|
466
481
|
*/
|
|
467
482
|
async getIdsForRange(field, min, max, includeMin = true, includeMax = true) {
|
|
468
483
|
// Track range query for field statistics
|
|
@@ -470,35 +485,8 @@ export class MetadataIndexManager {
|
|
|
470
485
|
const stats = this.fieldStats.get(field);
|
|
471
486
|
stats.rangeQueryCount++;
|
|
472
487
|
}
|
|
473
|
-
//
|
|
474
|
-
await this.
|
|
475
|
-
// With incremental updates, we should rarely need to rebuild
|
|
476
|
-
// Only rebuild if it's marked dirty (e.g., after a bulk load or migration)
|
|
477
|
-
let sortedIndex = this.sortedIndices.get(field);
|
|
478
|
-
if (sortedIndex?.isDirty) {
|
|
479
|
-
prodLog.warn(`MetadataIndex: Sorted index for field '${field}' was dirty, rebuilding...`);
|
|
480
|
-
await this.buildSortedIndex(field);
|
|
481
|
-
sortedIndex = this.sortedIndices.get(field);
|
|
482
|
-
}
|
|
483
|
-
if (!sortedIndex || sortedIndex.values.length === 0)
|
|
484
|
-
return [];
|
|
485
|
-
const sorted = sortedIndex.values;
|
|
486
|
-
const resultSet = new Set();
|
|
487
|
-
// Find range boundaries
|
|
488
|
-
let start = 0;
|
|
489
|
-
let end = sorted.length - 1;
|
|
490
|
-
if (min !== undefined) {
|
|
491
|
-
start = this.binarySearchStart(sorted, min, includeMin);
|
|
492
|
-
}
|
|
493
|
-
if (max !== undefined) {
|
|
494
|
-
end = this.binarySearchEnd(sorted, max, includeMax);
|
|
495
|
-
}
|
|
496
|
-
// Collect all IDs in range
|
|
497
|
-
for (let i = start; i <= end && i < sorted.length; i++) {
|
|
498
|
-
const [, ids] = sorted[i];
|
|
499
|
-
ids.forEach(id => resultSet.add(id));
|
|
500
|
-
}
|
|
501
|
-
return Array.from(resultSet);
|
|
488
|
+
// All fields use chunked sparse index with zone map optimization (v3.42.0)
|
|
489
|
+
return await this.getIdsFromChunksForRange(field, min, max, includeMin, includeMax);
|
|
502
490
|
}
|
|
503
491
|
/**
|
|
504
492
|
* Generate field index filename for filter discovery
|
|
@@ -646,51 +634,22 @@ export class MetadataIndexManager {
|
|
|
646
634
|
const updatedFields = new Set();
|
|
647
635
|
for (let i = 0; i < fields.length; i++) {
|
|
648
636
|
const { field, value } = fields[i];
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
entry = loadedEntry ?? {
|
|
655
|
-
field,
|
|
656
|
-
value: this.normalizeValue(value, field), // Pass field for bucketing!
|
|
657
|
-
ids: new Set(),
|
|
658
|
-
lastUpdated: Date.now()
|
|
659
|
-
};
|
|
660
|
-
this.indexCache.set(key, entry);
|
|
661
|
-
}
|
|
662
|
-
// Add ID to entry
|
|
663
|
-
const wasNew = entry.ids.size === 0;
|
|
664
|
-
entry.ids.add(id);
|
|
665
|
-
entry.lastUpdated = Date.now();
|
|
666
|
-
this.dirtyEntries.add(key);
|
|
667
|
-
// Update cardinality statistics
|
|
668
|
-
if (wasNew) {
|
|
669
|
-
this.updateCardinalityStats(field, value, 'add');
|
|
670
|
-
}
|
|
671
|
-
// Track type-field affinity for intelligent NLP
|
|
672
|
-
this.updateTypeFieldAffinity(id, field, value, 'add');
|
|
673
|
-
// Incrementally update sorted index for this field
|
|
674
|
-
if (!updatedFields.has(field)) {
|
|
675
|
-
this.updateSortedIndexAdd(field, value, id);
|
|
676
|
-
updatedFields.add(field);
|
|
677
|
-
}
|
|
678
|
-
else {
|
|
679
|
-
// Multiple values for same field - still update
|
|
680
|
-
this.updateSortedIndexAdd(field, value, id);
|
|
681
|
-
}
|
|
682
|
-
// Update field index
|
|
637
|
+
// All fields use chunked sparse indexing (v3.42.0)
|
|
638
|
+
await this.addToChunkedIndex(field, value, id);
|
|
639
|
+
// Update statistics and tracking
|
|
640
|
+
this.updateCardinalityStats(field, value, 'add');
|
|
641
|
+
this.updateTypeFieldAffinity(id, field, value, 'add', metadata);
|
|
683
642
|
await this.updateFieldIndex(field, value, 1);
|
|
684
643
|
// Yield to event loop every 5 fields to prevent blocking
|
|
685
644
|
if (i % 5 === 4) {
|
|
686
645
|
await this.yieldToEventLoop();
|
|
687
646
|
}
|
|
688
647
|
}
|
|
689
|
-
// Adaptive auto-flush based on usage patterns
|
|
648
|
+
// Adaptive auto-flush based on usage patterns (v3.42.0 - flush field indexes only)
|
|
690
649
|
if (!skipFlush) {
|
|
691
650
|
const timeSinceLastFlush = Date.now() - this.lastFlushTime;
|
|
692
|
-
const shouldAutoFlush = this.
|
|
693
|
-
(this.
|
|
651
|
+
const shouldAutoFlush = this.dirtyFields.size >= this.autoFlushThreshold || // Size threshold
|
|
652
|
+
(this.dirtyFields.size > 10 && timeSinceLastFlush > 5000); // Time threshold (5 seconds)
|
|
694
653
|
if (shouldAutoFlush) {
|
|
695
654
|
const startTime = Date.now();
|
|
696
655
|
await this.flush();
|
|
@@ -743,51 +702,31 @@ export class MetadataIndexManager {
|
|
|
743
702
|
// Remove from specific field indexes
|
|
744
703
|
const fields = this.extractIndexableFields(metadata);
|
|
745
704
|
for (const { field, value } of fields) {
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
if (entry) {
|
|
753
|
-
const hadId = entry.ids.has(id);
|
|
754
|
-
entry.ids.delete(id);
|
|
755
|
-
entry.lastUpdated = Date.now();
|
|
756
|
-
this.dirtyEntries.add(key);
|
|
757
|
-
// Update cardinality statistics
|
|
758
|
-
if (hadId && entry.ids.size === 0) {
|
|
759
|
-
this.updateCardinalityStats(field, value, 'remove');
|
|
760
|
-
}
|
|
761
|
-
// Track type-field affinity for intelligent NLP
|
|
762
|
-
if (hadId) {
|
|
763
|
-
this.updateTypeFieldAffinity(id, field, value, 'remove');
|
|
764
|
-
}
|
|
765
|
-
// Incrementally update sorted index when removing
|
|
766
|
-
this.updateSortedIndexRemove(field, value, id);
|
|
767
|
-
// Update field index
|
|
768
|
-
await this.updateFieldIndex(field, value, -1);
|
|
769
|
-
// If no IDs left, mark for cleanup
|
|
770
|
-
if (entry.ids.size === 0) {
|
|
771
|
-
this.indexCache.delete(key);
|
|
772
|
-
await this.deleteIndexEntry(key);
|
|
773
|
-
}
|
|
774
|
-
}
|
|
705
|
+
// All fields use chunked sparse indexing (v3.42.0)
|
|
706
|
+
await this.removeFromChunkedIndex(field, value, id);
|
|
707
|
+
// Update statistics and tracking
|
|
708
|
+
this.updateCardinalityStats(field, value, 'remove');
|
|
709
|
+
this.updateTypeFieldAffinity(id, field, value, 'remove', metadata);
|
|
710
|
+
await this.updateFieldIndex(field, value, -1);
|
|
775
711
|
// Invalidate cache
|
|
776
712
|
this.metadataCache.invalidatePattern(`field_values_${field}`);
|
|
777
713
|
}
|
|
778
714
|
}
|
|
779
715
|
else {
|
|
780
|
-
// Remove from all indexes (slower, requires scanning)
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
716
|
+
// Remove from all indexes (slower, requires scanning all chunks)
|
|
717
|
+
// This should be rare - prefer providing metadata when removing
|
|
718
|
+
prodLog.warn(`Removing ID ${id} without metadata requires scanning all sparse indices (slow)`);
|
|
719
|
+
// Scan all sparse indices
|
|
720
|
+
for (const [field, sparseIndex] of this.sparseIndices.entries()) {
|
|
721
|
+
for (const chunkId of sparseIndex.getAllChunkIds()) {
|
|
722
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
723
|
+
if (chunk) {
|
|
724
|
+
// Check all values in this chunk
|
|
725
|
+
for (const [value, ids] of chunk.entries) {
|
|
726
|
+
if (ids.has(id)) {
|
|
727
|
+
await this.removeFromChunkedIndex(field, value, id);
|
|
728
|
+
}
|
|
729
|
+
}
|
|
791
730
|
}
|
|
792
731
|
}
|
|
793
732
|
}
|
|
@@ -797,14 +736,9 @@ export class MetadataIndexManager {
|
|
|
797
736
|
* Get all IDs in the index
|
|
798
737
|
*/
|
|
799
738
|
async getAllIds() {
|
|
800
|
-
//
|
|
739
|
+
// Use storage as the source of truth (v3.42.0 - removed redundant indexCache scan)
|
|
801
740
|
const allIds = new Set();
|
|
802
|
-
//
|
|
803
|
-
for (const entry of this.indexCache.values()) {
|
|
804
|
-
entry.ids.forEach(id => allIds.add(id));
|
|
805
|
-
}
|
|
806
|
-
// If storage has a method to get all nouns, use it as the source of truth
|
|
807
|
-
// This ensures we include items that might not be indexed yet
|
|
741
|
+
// Storage.getNouns() is the definitive source of all entity IDs
|
|
808
742
|
if (this.storage && typeof this.storage.getNouns === 'function') {
|
|
809
743
|
try {
|
|
810
744
|
const result = await this.storage.getNouns({
|
|
@@ -818,13 +752,15 @@ export class MetadataIndexManager {
|
|
|
818
752
|
}
|
|
819
753
|
}
|
|
820
754
|
catch (e) {
|
|
821
|
-
//
|
|
755
|
+
// If storage method fails, return empty array
|
|
756
|
+
prodLog.warn('Failed to get all IDs from storage:', e);
|
|
757
|
+
return [];
|
|
822
758
|
}
|
|
823
759
|
}
|
|
824
760
|
return Array.from(allIds);
|
|
825
761
|
}
|
|
826
762
|
/**
|
|
827
|
-
* Get IDs for a specific field-value combination
|
|
763
|
+
* Get IDs for a specific field-value combination using chunked sparse index
|
|
828
764
|
*/
|
|
829
765
|
async getIds(field, value) {
|
|
830
766
|
// Track exact query for field statistics
|
|
@@ -832,27 +768,8 @@ export class MetadataIndexManager {
|
|
|
832
768
|
const stats = this.fieldStats.get(field);
|
|
833
769
|
stats.exactQueryCount++;
|
|
834
770
|
}
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
const cacheKey = `ids_${key}`;
|
|
838
|
-
const cachedIds = this.metadataCache.get(cacheKey);
|
|
839
|
-
if (cachedIds) {
|
|
840
|
-
return cachedIds;
|
|
841
|
-
}
|
|
842
|
-
// Try in-memory cache
|
|
843
|
-
let entry = this.indexCache.get(key);
|
|
844
|
-
// Load from storage if not cached
|
|
845
|
-
if (!entry) {
|
|
846
|
-
const loadedEntry = await this.loadIndexEntry(key);
|
|
847
|
-
if (loadedEntry) {
|
|
848
|
-
entry = loadedEntry;
|
|
849
|
-
this.indexCache.set(key, entry);
|
|
850
|
-
}
|
|
851
|
-
}
|
|
852
|
-
const ids = entry ? Array.from(entry.ids) : [];
|
|
853
|
-
// Cache the result
|
|
854
|
-
this.metadataCache.set(cacheKey, ids);
|
|
855
|
-
return ids;
|
|
771
|
+
// All fields use chunked sparse indexing (v3.42.0)
|
|
772
|
+
return await this.getIdsFromChunks(field, value);
|
|
856
773
|
}
|
|
857
774
|
/**
|
|
858
775
|
* Get all available values for a field (for filter discovery)
|
|
@@ -1044,11 +961,20 @@ export class MetadataIndexManager {
|
|
|
1044
961
|
// Existence operator
|
|
1045
962
|
case 'exists':
|
|
1046
963
|
if (operand) {
|
|
1047
|
-
// Get all IDs that have this field (any value)
|
|
964
|
+
// Get all IDs that have this field (any value) from chunked sparse index (v3.42.0)
|
|
1048
965
|
const allIds = new Set();
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
966
|
+
// Load sparse index for this field
|
|
967
|
+
const sparseIndex = this.sparseIndices.get(field) || await this.loadSparseIndex(field);
|
|
968
|
+
if (sparseIndex) {
|
|
969
|
+
// Iterate through all chunks for this field
|
|
970
|
+
for (const chunkId of sparseIndex.getAllChunkIds()) {
|
|
971
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
972
|
+
if (chunk) {
|
|
973
|
+
// Collect all IDs from all values in this chunk
|
|
974
|
+
for (const ids of chunk.entries.values()) {
|
|
975
|
+
ids.forEach(id => allIds.add(id));
|
|
976
|
+
}
|
|
977
|
+
}
|
|
1052
978
|
}
|
|
1053
979
|
}
|
|
1054
980
|
fieldResults = Array.from(allIds);
|
|
@@ -1149,31 +1075,17 @@ export class MetadataIndexManager {
|
|
|
1149
1075
|
}
|
|
1150
1076
|
/**
|
|
1151
1077
|
* Flush dirty entries to storage (non-blocking version)
|
|
1078
|
+
* NOTE (v3.42.0): Sparse indices are flushed immediately in add/remove operations
|
|
1152
1079
|
*/
|
|
1153
1080
|
async flush() {
|
|
1154
|
-
// Check if we have anything to flush
|
|
1155
|
-
|
|
1156
|
-
if (this.dirtyEntries.size === 0 && this.dirtyFields.size === 0 && !hasDirtySortedIndices) {
|
|
1081
|
+
// Check if we have anything to flush
|
|
1082
|
+
if (this.dirtyFields.size === 0) {
|
|
1157
1083
|
return; // Nothing to flush
|
|
1158
1084
|
}
|
|
1159
1085
|
// Process in smaller batches to avoid blocking
|
|
1160
1086
|
const BATCH_SIZE = 20;
|
|
1161
1087
|
const allPromises = [];
|
|
1162
|
-
// Flush
|
|
1163
|
-
const dirtyEntriesArray = Array.from(this.dirtyEntries);
|
|
1164
|
-
for (let i = 0; i < dirtyEntriesArray.length; i += BATCH_SIZE) {
|
|
1165
|
-
const batch = dirtyEntriesArray.slice(i, i + BATCH_SIZE);
|
|
1166
|
-
const batchPromises = batch.map(key => {
|
|
1167
|
-
const entry = this.indexCache.get(key);
|
|
1168
|
-
return entry ? this.saveIndexEntry(key, entry) : Promise.resolve();
|
|
1169
|
-
});
|
|
1170
|
-
allPromises.push(...batchPromises);
|
|
1171
|
-
// Yield to event loop between batches
|
|
1172
|
-
if (i + BATCH_SIZE < dirtyEntriesArray.length) {
|
|
1173
|
-
await this.yieldToEventLoop();
|
|
1174
|
-
}
|
|
1175
|
-
}
|
|
1176
|
-
// Flush field indexes in batches
|
|
1088
|
+
// Flush field indexes in batches (v3.42.0 - removed flat file flushing)
|
|
1177
1089
|
const dirtyFieldsArray = Array.from(this.dirtyFields);
|
|
1178
1090
|
for (let i = 0; i < dirtyFieldsArray.length; i += BATCH_SIZE) {
|
|
1179
1091
|
const batch = dirtyFieldsArray.slice(i, i + BATCH_SIZE);
|
|
@@ -1187,15 +1099,8 @@ export class MetadataIndexManager {
|
|
|
1187
1099
|
await this.yieldToEventLoop();
|
|
1188
1100
|
}
|
|
1189
1101
|
}
|
|
1190
|
-
// Flush sorted indices (for range queries)
|
|
1191
|
-
for (const [field, sortedIndex] of this.sortedIndices.entries()) {
|
|
1192
|
-
if (sortedIndex.isDirty) {
|
|
1193
|
-
allPromises.push(this.saveSortedIndex(field, sortedIndex));
|
|
1194
|
-
}
|
|
1195
|
-
}
|
|
1196
1102
|
// Wait for all operations to complete
|
|
1197
1103
|
await Promise.all(allPromises);
|
|
1198
|
-
this.dirtyEntries.clear();
|
|
1199
1104
|
this.dirtyFields.clear();
|
|
1200
1105
|
this.lastFlushTime = Date.now();
|
|
1201
1106
|
}
|
|
@@ -1275,69 +1180,6 @@ export class MetadataIndexManager {
|
|
|
1275
1180
|
}
|
|
1276
1181
|
}
|
|
1277
1182
|
}
|
|
1278
|
-
/**
|
|
1279
|
-
* Save sorted index to storage for range queries with file locking
|
|
1280
|
-
*/
|
|
1281
|
-
async saveSortedIndex(field, sortedIndex) {
|
|
1282
|
-
const filename = `sorted_${field}`;
|
|
1283
|
-
const lockKey = `sorted_index_${field}`;
|
|
1284
|
-
const lockAcquired = await this.acquireLock(lockKey, 5000); // 5 second timeout
|
|
1285
|
-
if (!lockAcquired) {
|
|
1286
|
-
prodLog.warn(`Failed to acquire lock for sorted index '${field}', proceeding without lock`);
|
|
1287
|
-
}
|
|
1288
|
-
try {
|
|
1289
|
-
const indexId = `__metadata_sorted_index__${filename}`;
|
|
1290
|
-
const unifiedKey = `metadata:sorted:${field}`;
|
|
1291
|
-
// Convert Set to Array for serialization
|
|
1292
|
-
const serializable = {
|
|
1293
|
-
values: sortedIndex.values.map(([value, ids]) => [value, Array.from(ids)]),
|
|
1294
|
-
fieldType: sortedIndex.fieldType,
|
|
1295
|
-
lastUpdated: Date.now()
|
|
1296
|
-
};
|
|
1297
|
-
await this.storage.saveMetadata(indexId, serializable);
|
|
1298
|
-
// Mark as clean
|
|
1299
|
-
sortedIndex.isDirty = false;
|
|
1300
|
-
// Update unified cache (sorted indices are expensive to rebuild)
|
|
1301
|
-
const size = JSON.stringify(serializable).length;
|
|
1302
|
-
this.unifiedCache.set(unifiedKey, sortedIndex, 'metadata', size, 100); // Higher rebuild cost
|
|
1303
|
-
}
|
|
1304
|
-
finally {
|
|
1305
|
-
if (lockAcquired) {
|
|
1306
|
-
await this.releaseLock(lockKey);
|
|
1307
|
-
}
|
|
1308
|
-
}
|
|
1309
|
-
}
|
|
1310
|
-
/**
|
|
1311
|
-
* Load sorted index from storage
|
|
1312
|
-
*/
|
|
1313
|
-
async loadSortedIndex(field) {
|
|
1314
|
-
const filename = `sorted_${field}`;
|
|
1315
|
-
const indexId = `__metadata_sorted_index__${filename}`;
|
|
1316
|
-
const unifiedKey = `metadata:sorted:${field}`;
|
|
1317
|
-
// Check unified cache first
|
|
1318
|
-
const cached = await this.unifiedCache.get(unifiedKey, async () => {
|
|
1319
|
-
try {
|
|
1320
|
-
const data = await this.storage.getMetadata(indexId);
|
|
1321
|
-
if (data) {
|
|
1322
|
-
// Convert Arrays back to Sets
|
|
1323
|
-
const sortedIndex = {
|
|
1324
|
-
values: data.values.map(([value, ids]) => [value, new Set(ids)]),
|
|
1325
|
-
fieldType: data.fieldType || 'mixed',
|
|
1326
|
-
isDirty: false
|
|
1327
|
-
};
|
|
1328
|
-
// Add to unified cache
|
|
1329
|
-
const size = JSON.stringify(data).length;
|
|
1330
|
-
this.unifiedCache.set(unifiedKey, sortedIndex, 'metadata', size, 100);
|
|
1331
|
-
return sortedIndex;
|
|
1332
|
-
}
|
|
1333
|
-
}
|
|
1334
|
-
catch (error) {
|
|
1335
|
-
// Sorted index doesn't exist yet
|
|
1336
|
-
}
|
|
1337
|
-
return null;
|
|
1338
|
-
});
|
|
1339
|
-
return cached;
|
|
1340
|
-
}
|
|
1341
1183
|
/**
|
|
1342
1184
|
* Get count of entities by type - O(1) operation using existing tracking
|
|
1343
1185
|
* This exposes the production-ready counting that's already maintained
|
|
@@ -1362,19 +1204,12 @@ export class MetadataIndexManager {
|
|
|
1362
1204
|
return new Map(this.totalEntitiesByType);
|
|
1363
1205
|
}
|
|
1364
1206
|
/**
|
|
1365
|
-
* Get count of entities matching field-value criteria -
|
|
1207
|
+
* Get count of entities matching field-value criteria - queries chunked sparse index
|
|
1366
1208
|
*/
|
|
1367
1209
|
async getCountForCriteria(field, value) {
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
const loadedEntry = await this.loadIndexEntry(key);
|
|
1372
|
-
if (loadedEntry) {
|
|
1373
|
-
entry = loadedEntry;
|
|
1374
|
-
this.indexCache.set(key, entry);
|
|
1375
|
-
}
|
|
1376
|
-
}
|
|
1377
|
-
return entry ? entry.ids.size : 0;
|
|
1210
|
+
// Use chunked sparse indexing (v3.42.0 - removed indexCache)
|
|
1211
|
+
const ids = await this.getIds(field, value);
|
|
1212
|
+
return ids.length;
|
|
1378
1213
|
}
|
|
1379
1214
|
/**
|
|
1380
1215
|
* Get index statistics with enhanced counting information
|
|
@@ -1383,10 +1218,23 @@ export class MetadataIndexManager {
|
|
|
1383
1218
|
const fields = new Set();
|
|
1384
1219
|
let totalEntries = 0;
|
|
1385
1220
|
let totalIds = 0;
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1221
|
+
// Collect stats from sparse indices (v3.42.0 - removed indexCache)
|
|
1222
|
+
for (const [field, sparseIndex] of this.sparseIndices.entries()) {
|
|
1223
|
+
fields.add(field);
|
|
1224
|
+
// Count entries and IDs from all chunks
|
|
1225
|
+
for (const chunkId of sparseIndex.getAllChunkIds()) {
|
|
1226
|
+
const chunk = await this.chunkManager.loadChunk(field, chunkId);
|
|
1227
|
+
if (chunk) {
|
|
1228
|
+
totalEntries += chunk.entries.size;
|
|
1229
|
+
for (const ids of chunk.entries.values()) {
|
|
1230
|
+
totalIds += ids.size;
|
|
1231
|
+
}
|
|
1232
|
+
}
|
|
1233
|
+
}
|
|
1234
|
+
}
|
|
1235
|
+
// Also include fields from fieldIndexes that might not have sparse indices yet
|
|
1236
|
+
for (const field of this.fieldIndexes.keys()) {
|
|
1237
|
+
fields.add(field);
|
|
1390
1238
|
}
|
|
1391
1239
|
return {
|
|
1392
1240
|
totalEntries,
|
|
@@ -1408,9 +1256,8 @@ export class MetadataIndexManager {
|
|
|
1408
1256
|
prodLog.info('🔄 Starting non-blocking metadata index rebuild with batch processing to prevent socket exhaustion...');
|
|
1409
1257
|
prodLog.info(`📊 Storage adapter: ${this.storage.constructor.name}`);
|
|
1410
1258
|
prodLog.info(`🔧 Batch processing available: ${!!this.storage.getMetadataBatch}`);
|
|
1411
|
-
// Clear existing indexes
|
|
1412
|
-
this.
|
|
1413
|
-
this.dirtyEntries.clear();
|
|
1259
|
+
// Clear existing indexes (v3.42.0 - use sparse indices instead of flat files)
|
|
1260
|
+
this.sparseIndices.clear();
|
|
1414
1261
|
this.fieldIndexes.clear();
|
|
1415
1262
|
this.dirtyFields.clear();
|
|
1416
1263
|
// Rebuild noun metadata indexes using pagination
|
|
@@ -1599,89 +1446,6 @@ export class MetadataIndexManager {
|
|
|
1599
1446
|
this.isRebuilding = false;
|
|
1600
1447
|
}
|
|
1601
1448
|
}
|
|
1602
|
-
/**
|
|
1603
|
-
* Load index entry from storage using safe filenames
|
|
1604
|
-
*/
|
|
1605
|
-
async loadIndexEntry(key) {
|
|
1606
|
-
const unifiedKey = `metadata:entry:${key}`;
|
|
1607
|
-
// Use unified cache with loader function
|
|
1608
|
-
return await this.unifiedCache.get(unifiedKey, async () => {
|
|
1609
|
-
try {
|
|
1610
|
-
// Extract field and value from key
|
|
1611
|
-
const [field, value] = key.split(':', 2);
|
|
1612
|
-
const filename = this.getValueChunkFilename(field, value);
|
|
1613
|
-
// Load from metadata indexes directory with safe filename
|
|
1614
|
-
const indexId = `__metadata_index__${filename}`;
|
|
1615
|
-
const data = await this.storage.getMetadata(indexId);
|
|
1616
|
-
if (data) {
|
|
1617
|
-
const entry = {
|
|
1618
|
-
field: data.field,
|
|
1619
|
-
value: data.value,
|
|
1620
|
-
ids: new Set(data.ids || []),
|
|
1621
|
-
lastUpdated: data.lastUpdated || Date.now()
|
|
1622
|
-
};
|
|
1623
|
-
// Add to unified cache (metadata entries are cheap to rebuild)
|
|
1624
|
-
const size = JSON.stringify(Array.from(entry.ids)).length + 100;
|
|
1625
|
-
this.unifiedCache.set(unifiedKey, entry, 'metadata', size, 1);
|
|
1626
|
-
return entry;
|
|
1627
|
-
}
|
|
1628
|
-
}
|
|
1629
|
-
catch (error) {
|
|
1630
|
-
// Index entry doesn't exist yet
|
|
1631
|
-
}
|
|
1632
|
-
return null;
|
|
1633
|
-
});
|
|
1634
|
-
}
|
|
1635
|
-
/**
|
|
1636
|
-
* Save index entry to storage using safe filenames with file locking
|
|
1637
|
-
*/
|
|
1638
|
-
async saveIndexEntry(key, entry) {
|
|
1639
|
-
const lockKey = `index_entry_${key}`;
|
|
1640
|
-
const lockAcquired = await this.acquireLock(lockKey, 5000); // 5 second timeout
|
|
1641
|
-
if (!lockAcquired) {
|
|
1642
|
-
prodLog.warn(`Failed to acquire lock for index entry '${key}', proceeding without lock`);
|
|
1643
|
-
}
|
|
1644
|
-
try {
|
|
1645
|
-
const unifiedKey = `metadata:entry:${key}`;
|
|
1646
|
-
const data = {
|
|
1647
|
-
field: entry.field,
|
|
1648
|
-
value: entry.value,
|
|
1649
|
-
ids: Array.from(entry.ids),
|
|
1650
|
-
lastUpdated: entry.lastUpdated
|
|
1651
|
-
};
|
|
1652
|
-
// Extract field and value from key for safe filename generation
|
|
1653
|
-
const [field, value] = key.split(':', 2);
|
|
1654
|
-
const filename = this.getValueChunkFilename(field, value);
|
|
1655
|
-
// Store metadata indexes with safe filename
|
|
1656
|
-
const indexId = `__metadata_index__${filename}`;
|
|
1657
|
-
await this.storage.saveMetadata(indexId, data);
|
|
1658
|
-
// Update unified cache
|
|
1659
|
-
const size = JSON.stringify(data.ids).length + 100;
|
|
1660
|
-
this.unifiedCache.set(unifiedKey, entry, 'metadata', size, 1);
|
|
1661
|
-
}
|
|
1662
|
-
finally {
|
|
1663
|
-
if (lockAcquired) {
|
|
1664
|
-
await this.releaseLock(lockKey);
|
|
1665
|
-
}
|
|
1666
|
-
}
|
|
1667
|
-
}
|
|
1668
|
-
/**
|
|
1669
|
-
* Delete index entry from storage using safe filenames
|
|
1670
|
-
*/
|
|
1671
|
-
async deleteIndexEntry(key) {
|
|
1672
|
-
const unifiedKey = `metadata:entry:${key}`;
|
|
1673
|
-
try {
|
|
1674
|
-
const [field, value] = key.split(':', 2);
|
|
1675
|
-
const filename = this.getValueChunkFilename(field, value);
|
|
1676
|
-
const indexId = `__metadata_index__${filename}`;
|
|
1677
|
-
await this.storage.saveMetadata(indexId, null);
|
|
1678
|
-
// Remove from unified cache
|
|
1679
|
-
this.unifiedCache.delete(unifiedKey);
|
|
1680
|
-
}
|
|
1681
|
-
catch (error) {
|
|
1682
|
-
// Entry might not exist
|
|
1683
|
-
}
|
|
1684
|
-
}
|
|
1685
1449
|
/**
|
|
1686
1450
|
* Get field statistics for optimization and discovery
|
|
1687
1451
|
*/
|
|
@@ -1814,7 +1578,7 @@ export class MetadataIndexManager {
|
|
|
1814
1578
|
* Update type-field affinity tracking for intelligent NLP
|
|
1815
1579
|
* Tracks which fields commonly appear with which entity types
|
|
1816
1580
|
*/
|
|
1817
|
-
updateTypeFieldAffinity(entityId, field, value, operation) {
|
|
1581
|
+
updateTypeFieldAffinity(entityId, field, value, operation, metadata) {
|
|
1818
1582
|
// Only track affinity for non-system fields (but allow 'noun' for type detection)
|
|
1819
1583
|
if (this.config.excludeFields.includes(field) && field !== 'noun')
|
|
1820
1584
|
return;
|
|
@@ -1824,14 +1588,13 @@ export class MetadataIndexManager {
|
|
|
1824
1588
|
// This is the type definition itself
|
|
1825
1589
|
entityType = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
1826
1590
|
}
|
|
1591
|
+
else if (metadata && metadata.noun) {
|
|
1592
|
+
// Extract entity type from metadata (v3.42.0 - removed indexCache scan)
|
|
1593
|
+
entityType = this.normalizeValue(metadata.noun, 'noun');
|
|
1594
|
+
}
|
|
1827
1595
|
else {
|
|
1828
|
-
//
|
|
1829
|
-
|
|
1830
|
-
if (key.startsWith('noun:') && entry.ids.has(entityId)) {
|
|
1831
|
-
entityType = key.split(':', 2)[1];
|
|
1832
|
-
break;
|
|
1833
|
-
}
|
|
1834
|
-
}
|
|
1596
|
+
// No type information available, skip affinity tracking
|
|
1597
|
+
return;
|
|
1835
1598
|
}
|
|
1836
1599
|
if (!entityType)
|
|
1837
1600
|
return; // No type found, skip affinity tracking
|