@soulcraft/brainy 3.41.0 → 3.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,18 +6,14 @@
6
6
  import { MetadataIndexCache } from './metadataIndexCache.js';
7
7
  import { prodLog } from './logger.js';
8
8
  import { getGlobalCache } from './unifiedCache.js';
9
+ import { SparseIndex, ChunkManager, AdaptiveChunkingStrategy } from './metadataIndexChunking.js';
9
10
  export class MetadataIndexManager {
10
11
  constructor(storage, config = {}) {
11
- this.indexCache = new Map();
12
- this.dirtyEntries = new Set();
13
12
  this.isRebuilding = false;
14
13
  this.fieldIndexes = new Map();
15
14
  this.dirtyFields = new Set();
16
15
  this.lastFlushTime = Date.now();
17
16
  this.autoFlushThreshold = 10; // Start with 10 for more frequent non-blocking flushes
18
- // Sorted indices for range queries (only for numeric/date fields)
19
- this.sortedIndices = new Map();
20
- this.numericFields = new Set(); // Track which fields are numeric
21
17
  // Cardinality and field statistics tracking
22
18
  this.fieldStats = new Map();
23
19
  this.cardinalityUpdateInterval = 100; // Update cardinality every N operations
@@ -33,6 +29,10 @@ export class MetadataIndexManager {
33
29
  this.activeLocks = new Map();
34
30
  this.lockPromises = new Map();
35
31
  this.lockTimers = new Map(); // Track timers for cleanup
32
+ // Adaptive Chunked Sparse Indexing (v3.42.0)
33
+ // Reduces file count from 560k → 89 files (~6,300x reduction)
34
+ // ALL fields now use chunking - no more flat files
35
+ this.sparseIndices = new Map(); // field -> sparse index
36
36
  this.storage = storage;
37
37
  this.config = {
38
38
  maxIndexSize: config.maxIndexSize ?? 10000,
@@ -67,6 +67,9 @@ export class MetadataIndexManager {
67
67
  });
68
68
  // Get global unified cache for coordinated memory management
69
69
  this.unifiedCache = getGlobalCache();
70
+ // Initialize chunking system (v3.42.0)
71
+ this.chunkManager = new ChunkManager(storage);
72
+ this.chunkingStrategy = new AdaptiveChunkingStrategy();
70
73
  // Lazy load counts from storage statistics on first access
71
74
  this.lazyLoadCounts();
72
75
  }
@@ -148,220 +151,6 @@ export class MetadataIndexManager {
148
151
  // This maintains zero-configuration principle
149
152
  }
150
153
  }
151
- /**
152
- * Get index key for field and value
153
- */
154
- getIndexKey(field, value) {
155
- const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
156
- return `${field}:${normalizedValue}`;
157
- }
158
- /**
159
- * Ensure sorted index exists for a field (for range queries)
160
- */
161
- async ensureSortedIndex(field) {
162
- if (!this.sortedIndices.has(field)) {
163
- // Try to load from storage first
164
- const loaded = await this.loadSortedIndex(field);
165
- if (loaded) {
166
- this.sortedIndices.set(field, loaded);
167
- }
168
- else {
169
- // Create new sorted index - NOT dirty since we maintain incrementally
170
- this.sortedIndices.set(field, {
171
- values: [],
172
- isDirty: false, // Clean by default with incremental updates
173
- fieldType: 'mixed'
174
- });
175
- }
176
- }
177
- }
178
- /**
179
- * Build sorted index for a field from hash index
180
- */
181
- async buildSortedIndex(field) {
182
- const sortedIndex = this.sortedIndices.get(field);
183
- if (!sortedIndex || !sortedIndex.isDirty)
184
- return;
185
- // Collect all values for this field from hash index
186
- const valueMap = new Map();
187
- for (const [key, entry] of this.indexCache.entries()) {
188
- if (entry.field === field) {
189
- const existing = valueMap.get(entry.value);
190
- if (existing) {
191
- // Merge ID sets
192
- entry.ids.forEach(id => existing.add(id));
193
- }
194
- else {
195
- valueMap.set(entry.value, new Set(entry.ids));
196
- }
197
- }
198
- }
199
- // Convert to sorted array
200
- const sorted = Array.from(valueMap.entries());
201
- // Detect field type and sort accordingly
202
- if (sorted.length > 0) {
203
- const sampleValue = sorted[0][0];
204
- if (typeof sampleValue === 'number') {
205
- sortedIndex.fieldType = 'number';
206
- sorted.sort((a, b) => a[0] - b[0]);
207
- }
208
- else if (sampleValue instanceof Date) {
209
- sortedIndex.fieldType = 'date';
210
- sorted.sort((a, b) => a[0].getTime() - b[0].getTime());
211
- }
212
- else {
213
- sortedIndex.fieldType = 'string';
214
- sorted.sort((a, b) => {
215
- const aVal = String(a[0]);
216
- const bVal = String(b[0]);
217
- return aVal < bVal ? -1 : aVal > bVal ? 1 : 0;
218
- });
219
- }
220
- }
221
- sortedIndex.values = sorted;
222
- sortedIndex.isDirty = false;
223
- }
224
- /**
225
- * Detect field type from value
226
- */
227
- detectFieldType(value) {
228
- if (typeof value === 'number' && !isNaN(value))
229
- return 'number';
230
- if (value instanceof Date)
231
- return 'date';
232
- return 'string';
233
- }
234
- /**
235
- * Compare two values based on field type for sorting
236
- */
237
- compareValues(a, b, fieldType) {
238
- switch (fieldType) {
239
- case 'number':
240
- return a - b;
241
- case 'date':
242
- return a.getTime() - b.getTime();
243
- case 'string':
244
- default:
245
- const aStr = String(a);
246
- const bStr = String(b);
247
- return aStr < bStr ? -1 : aStr > bStr ? 1 : 0;
248
- }
249
- }
250
- /**
251
- * Binary search to find insertion position for a value
252
- * Returns the index where the value should be inserted to maintain sorted order
253
- */
254
- findInsertPosition(sortedArray, value, fieldType) {
255
- if (sortedArray.length === 0)
256
- return 0;
257
- let left = 0;
258
- let right = sortedArray.length - 1;
259
- while (left <= right) {
260
- const mid = Math.floor((left + right) / 2);
261
- const midVal = sortedArray[mid][0];
262
- const comparison = this.compareValues(midVal, value, fieldType);
263
- if (comparison < 0) {
264
- left = mid + 1;
265
- }
266
- else if (comparison > 0) {
267
- right = mid - 1;
268
- }
269
- else {
270
- return mid; // Value already exists at this position
271
- }
272
- }
273
- return left; // Insert position
274
- }
275
- /**
276
- * Incrementally update sorted index when adding an ID
277
- */
278
- updateSortedIndexAdd(field, value, id) {
279
- // Ensure sorted index exists
280
- if (!this.sortedIndices.has(field)) {
281
- this.sortedIndices.set(field, {
282
- values: [],
283
- isDirty: false,
284
- fieldType: this.detectFieldType(value)
285
- });
286
- }
287
- const sortedIndex = this.sortedIndices.get(field);
288
- const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
289
- // Find where this value should be in the sorted array
290
- const insertPos = this.findInsertPosition(sortedIndex.values, normalizedValue, sortedIndex.fieldType);
291
- if (insertPos < sortedIndex.values.length &&
292
- sortedIndex.values[insertPos][0] === normalizedValue) {
293
- // Value already exists, just add the ID to the existing Set
294
- sortedIndex.values[insertPos][1].add(id);
295
- }
296
- else {
297
- // New value, insert at the correct position
298
- sortedIndex.values.splice(insertPos, 0, [normalizedValue, new Set([id])]);
299
- }
300
- // Mark as clean since we're maintaining it incrementally
301
- sortedIndex.isDirty = false;
302
- }
303
- /**
304
- * Incrementally update sorted index when removing an ID
305
- */
306
- updateSortedIndexRemove(field, value, id) {
307
- const sortedIndex = this.sortedIndices.get(field);
308
- if (!sortedIndex || sortedIndex.values.length === 0)
309
- return;
310
- const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
311
- // Binary search to find the value
312
- const pos = this.findInsertPosition(sortedIndex.values, normalizedValue, sortedIndex.fieldType);
313
- if (pos < sortedIndex.values.length &&
314
- sortedIndex.values[pos][0] === normalizedValue) {
315
- // Remove the ID from the Set
316
- sortedIndex.values[pos][1].delete(id);
317
- // If no IDs left for this value, remove the entire entry
318
- if (sortedIndex.values[pos][1].size === 0) {
319
- sortedIndex.values.splice(pos, 1);
320
- }
321
- }
322
- // Keep it clean
323
- sortedIndex.isDirty = false;
324
- }
325
- /**
326
- * Binary search for range start (inclusive or exclusive)
327
- */
328
- binarySearchStart(sorted, target, inclusive) {
329
- let left = 0;
330
- let right = sorted.length - 1;
331
- let result = sorted.length;
332
- while (left <= right) {
333
- const mid = Math.floor((left + right) / 2);
334
- const midVal = sorted[mid][0];
335
- if (inclusive ? midVal >= target : midVal > target) {
336
- result = mid;
337
- right = mid - 1;
338
- }
339
- else {
340
- left = mid + 1;
341
- }
342
- }
343
- return result;
344
- }
345
- /**
346
- * Binary search for range end (inclusive or exclusive)
347
- */
348
- binarySearchEnd(sorted, target, inclusive) {
349
- let left = 0;
350
- let right = sorted.length - 1;
351
- let result = -1;
352
- while (left <= right) {
353
- const mid = Math.floor((left + right) / 2);
354
- const midVal = sorted[mid][0];
355
- if (inclusive ? midVal <= target : midVal < target) {
356
- result = mid;
357
- left = mid + 1;
358
- }
359
- else {
360
- right = mid - 1;
361
- }
362
- }
363
- return result;
364
- }
365
154
  /**
366
155
  * Update cardinality statistics for a field
367
156
  */
@@ -385,18 +174,21 @@ export class MetadataIndexManager {
385
174
  }
386
175
  const stats = this.fieldStats.get(field);
387
176
  const cardinality = stats.cardinality;
388
- // Track unique values
389
- const fieldIndexKey = `${field}:${String(value)}`;
390
- const entry = this.indexCache.get(fieldIndexKey);
177
+ // Track unique values by checking fieldIndex counts (v3.42.0 - removed indexCache)
178
+ const fieldIndex = this.fieldIndexes.get(field);
179
+ const normalizedValue = this.normalizeValue(value, field);
180
+ const currentCount = fieldIndex?.values[normalizedValue] || 0;
391
181
  if (operation === 'add') {
392
- if (!entry || entry.ids.size === 1) {
182
+ // If this is a new value (count is 0), increment unique values
183
+ if (currentCount === 0) {
393
184
  cardinality.uniqueValues++;
394
185
  }
395
186
  cardinality.totalValues++;
396
187
  }
397
188
  else if (operation === 'remove') {
398
- if (entry && entry.ids.size === 0) {
399
- cardinality.uniqueValues--;
189
+ // If count will become 0, decrement unique values
190
+ if (currentCount === 1) {
191
+ cardinality.uniqueValues = Math.max(0, cardinality.uniqueValues - 1);
400
192
  }
401
193
  cardinality.totalValues = Math.max(0, cardinality.totalValues - 1);
402
194
  }
@@ -434,23 +226,17 @@ export class MetadataIndexManager {
434
226
  * Update index strategy based on field statistics
435
227
  */
436
228
  updateIndexStrategy(field, stats) {
437
- const isNumeric = this.numericFields.has(field);
438
229
  const hasHighCardinality = stats.cardinality.uniqueValues > this.HIGH_CARDINALITY_THRESHOLD;
439
- const hasRangeQueries = stats.rangeQueryCount > stats.exactQueryCount * 0.3;
440
- // Determine optimal index type
441
- if (isNumeric && hasRangeQueries) {
442
- stats.indexType = 'both'; // Need both hash and sorted
443
- }
444
- else if (hasRangeQueries) {
445
- stats.indexType = 'sorted';
446
- }
447
- else {
448
- stats.indexType = 'hash';
449
- }
230
+ // All fields use chunked sparse indexing with zone maps (v3.42.0)
231
+ stats.indexType = 'hash';
450
232
  // Determine normalization strategy for high cardinality NON-temporal fields
451
233
  // (Temporal fields are already bucketed in normalizeValue from the start!)
452
234
  if (hasHighCardinality) {
453
- if (isNumeric) {
235
+ // Check if field looks numeric (for float precision reduction)
236
+ const fieldLower = field.toLowerCase();
237
+ const looksNumeric = fieldLower.includes('count') || fieldLower.includes('score') ||
238
+ fieldLower.includes('value') || fieldLower.includes('amount');
239
+ if (looksNumeric) {
454
240
  stats.normalizationStrategy = 'precision'; // Reduce float precision
455
241
  }
456
242
  else {
@@ -461,8 +247,237 @@ export class MetadataIndexManager {
461
247
  stats.normalizationStrategy = 'none';
462
248
  }
463
249
  }
250
+ // ============================================================================
251
+ // Adaptive Chunked Sparse Indexing (v3.42.0)
252
+ // All fields use chunking - simplified implementation
253
+ // ============================================================================
254
+ /**
255
+ * Load sparse index from storage
256
+ */
257
+ async loadSparseIndex(field) {
258
+ const indexPath = `__sparse_index__${field}`;
259
+ const unifiedKey = `metadata:sparse:${field}`;
260
+ return await this.unifiedCache.get(unifiedKey, async () => {
261
+ try {
262
+ const data = await this.storage.getMetadata(indexPath);
263
+ if (data) {
264
+ const sparseIndex = SparseIndex.fromJSON(data);
265
+ // Add to unified cache (sparse indices are expensive to rebuild)
266
+ const size = JSON.stringify(data).length;
267
+ this.unifiedCache.set(unifiedKey, sparseIndex, 'metadata', size, 200);
268
+ return sparseIndex;
269
+ }
270
+ }
271
+ catch (error) {
272
+ prodLog.debug(`Failed to load sparse index for field '${field}':`, error);
273
+ }
274
+ return undefined;
275
+ });
276
+ }
277
+ /**
278
+ * Save sparse index to storage
279
+ */
280
+ async saveSparseIndex(field, sparseIndex) {
281
+ const indexPath = `__sparse_index__${field}`;
282
+ const unifiedKey = `metadata:sparse:${field}`;
283
+ const data = sparseIndex.toJSON();
284
+ await this.storage.saveMetadata(indexPath, data);
285
+ // Update unified cache
286
+ const size = JSON.stringify(data).length;
287
+ this.unifiedCache.set(unifiedKey, sparseIndex, 'metadata', size, 200);
288
+ }
289
+ /**
290
+ * Get IDs for a value using chunked sparse index
291
+ */
292
+ async getIdsFromChunks(field, value) {
293
+ // Load sparse index
294
+ let sparseIndex = this.sparseIndices.get(field);
295
+ if (!sparseIndex) {
296
+ sparseIndex = await this.loadSparseIndex(field);
297
+ if (!sparseIndex) {
298
+ return []; // No chunked index exists yet
299
+ }
300
+ this.sparseIndices.set(field, sparseIndex);
301
+ }
302
+ // Find candidate chunks using zone maps and bloom filters
303
+ const normalizedValue = this.normalizeValue(value, field);
304
+ const candidateChunkIds = sparseIndex.findChunksForValue(normalizedValue);
305
+ if (candidateChunkIds.length === 0) {
306
+ return []; // No chunks contain this value
307
+ }
308
+ // Load chunks and collect IDs
309
+ const allIds = new Set();
310
+ for (const chunkId of candidateChunkIds) {
311
+ const chunk = await this.chunkManager.loadChunk(field, chunkId);
312
+ if (chunk) {
313
+ const ids = chunk.entries.get(normalizedValue);
314
+ if (ids) {
315
+ ids.forEach(id => allIds.add(id));
316
+ }
317
+ }
318
+ }
319
+ return Array.from(allIds);
320
+ }
321
+ /**
322
+ * Get IDs for a range using chunked sparse index with zone maps
323
+ */
324
+ async getIdsFromChunksForRange(field, min, max, includeMin = true, includeMax = true) {
325
+ // Load sparse index
326
+ let sparseIndex = this.sparseIndices.get(field);
327
+ if (!sparseIndex) {
328
+ sparseIndex = await this.loadSparseIndex(field);
329
+ if (!sparseIndex) {
330
+ return []; // No chunked index exists yet
331
+ }
332
+ this.sparseIndices.set(field, sparseIndex);
333
+ }
334
+ // Find candidate chunks using zone maps
335
+ const candidateChunkIds = sparseIndex.findChunksForRange(min, max);
336
+ if (candidateChunkIds.length === 0) {
337
+ return [];
338
+ }
339
+ // Load chunks and filter by range
340
+ const allIds = new Set();
341
+ for (const chunkId of candidateChunkIds) {
342
+ const chunk = await this.chunkManager.loadChunk(field, chunkId);
343
+ if (chunk) {
344
+ for (const [value, ids] of chunk.entries) {
345
+ // Check if value is in range
346
+ let inRange = true;
347
+ if (min !== undefined) {
348
+ inRange = inRange && (includeMin ? value >= min : value > min);
349
+ }
350
+ if (max !== undefined) {
351
+ inRange = inRange && (includeMax ? value <= max : value < max);
352
+ }
353
+ if (inRange) {
354
+ ids.forEach(id => allIds.add(id));
355
+ }
356
+ }
357
+ }
358
+ }
359
+ return Array.from(allIds);
360
+ }
361
+ /**
362
+ * Add value-ID mapping to chunked index
363
+ */
364
+ async addToChunkedIndex(field, value, id) {
365
+ // Load or create sparse index
366
+ let sparseIndex = this.sparseIndices.get(field);
367
+ if (!sparseIndex) {
368
+ sparseIndex = await this.loadSparseIndex(field);
369
+ if (!sparseIndex) {
370
+ // Create new sparse index
371
+ const stats = this.fieldStats.get(field);
372
+ const chunkSize = stats
373
+ ? this.chunkingStrategy.getOptimalChunkSize({
374
+ uniqueValues: stats.cardinality.uniqueValues,
375
+ distribution: stats.cardinality.distribution,
376
+ avgIdsPerValue: stats.cardinality.totalValues / Math.max(1, stats.cardinality.uniqueValues)
377
+ })
378
+ : 50;
379
+ sparseIndex = new SparseIndex(field, chunkSize);
380
+ }
381
+ this.sparseIndices.set(field, sparseIndex);
382
+ }
383
+ const normalizedValue = this.normalizeValue(value, field);
384
+ // Find existing chunk for this value (check zone maps)
385
+ const candidateChunkIds = sparseIndex.findChunksForValue(normalizedValue);
386
+ let targetChunk = null;
387
+ let targetChunkId = null;
388
+ // Try to find an existing chunk with this value
389
+ for (const chunkId of candidateChunkIds) {
390
+ const chunk = await this.chunkManager.loadChunk(field, chunkId);
391
+ if (chunk && chunk.entries.has(normalizedValue)) {
392
+ targetChunk = chunk;
393
+ targetChunkId = chunkId;
394
+ break;
395
+ }
396
+ }
397
+ // If no chunk has this value, find chunk with space or create new one
398
+ if (!targetChunk) {
399
+ // Find a chunk with available space
400
+ for (const chunkId of sparseIndex.getAllChunkIds()) {
401
+ const chunk = await this.chunkManager.loadChunk(field, chunkId);
402
+ const descriptor = sparseIndex.getChunk(chunkId);
403
+ if (chunk && descriptor && chunk.entries.size < descriptor.splitThreshold) {
404
+ targetChunk = chunk;
405
+ targetChunkId = chunkId;
406
+ break;
407
+ }
408
+ }
409
+ }
410
+ // Create new chunk if needed
411
+ if (!targetChunk) {
412
+ targetChunk = await this.chunkManager.createChunk(field);
413
+ targetChunkId = targetChunk.chunkId;
414
+ // Register in sparse index
415
+ const descriptor = {
416
+ chunkId: targetChunk.chunkId,
417
+ field,
418
+ valueCount: 0,
419
+ idCount: 0,
420
+ zoneMap: { min: null, max: null, count: 0, hasNulls: false },
421
+ lastUpdated: Date.now(),
422
+ splitThreshold: 80,
423
+ mergeThreshold: 20
424
+ };
425
+ sparseIndex.registerChunk(descriptor);
426
+ }
427
+ // Add to chunk
428
+ await this.chunkManager.addToChunk(targetChunk, normalizedValue, id);
429
+ await this.chunkManager.saveChunk(targetChunk);
430
+ // Update chunk descriptor in sparse index
431
+ const updatedZoneMap = this.chunkManager.calculateZoneMap(targetChunk);
432
+ const updatedBloomFilter = this.chunkManager.createBloomFilter(targetChunk);
433
+ sparseIndex.updateChunk(targetChunkId, {
434
+ valueCount: targetChunk.entries.size,
435
+ idCount: Array.from(targetChunk.entries.values()).reduce((sum, ids) => sum + ids.size, 0),
436
+ zoneMap: updatedZoneMap,
437
+ lastUpdated: Date.now()
438
+ });
439
+ // Update bloom filter
440
+ const descriptor = sparseIndex.getChunk(targetChunkId);
441
+ if (descriptor) {
442
+ sparseIndex.registerChunk(descriptor, updatedBloomFilter);
443
+ }
444
+ // Check if chunk needs splitting
445
+ if (targetChunk.entries.size > 80) {
446
+ await this.chunkManager.splitChunk(targetChunk, sparseIndex);
447
+ }
448
+ // Save sparse index
449
+ await this.saveSparseIndex(field, sparseIndex);
450
+ }
464
451
  /**
465
- * Get IDs matching a range query
452
+ * Remove ID from chunked index
453
+ */
454
+ async removeFromChunkedIndex(field, value, id) {
455
+ const sparseIndex = this.sparseIndices.get(field) || await this.loadSparseIndex(field);
456
+ if (!sparseIndex) {
457
+ return; // No chunked index exists
458
+ }
459
+ const normalizedValue = this.normalizeValue(value, field);
460
+ const candidateChunkIds = sparseIndex.findChunksForValue(normalizedValue);
461
+ for (const chunkId of candidateChunkIds) {
462
+ const chunk = await this.chunkManager.loadChunk(field, chunkId);
463
+ if (chunk && chunk.entries.has(normalizedValue)) {
464
+ await this.chunkManager.removeFromChunk(chunk, normalizedValue, id);
465
+ await this.chunkManager.saveChunk(chunk);
466
+ // Update sparse index
467
+ const updatedZoneMap = this.chunkManager.calculateZoneMap(chunk);
468
+ sparseIndex.updateChunk(chunkId, {
469
+ valueCount: chunk.entries.size,
470
+ idCount: Array.from(chunk.entries.values()).reduce((sum, ids) => sum + ids.size, 0),
471
+ zoneMap: updatedZoneMap,
472
+ lastUpdated: Date.now()
473
+ });
474
+ await this.saveSparseIndex(field, sparseIndex);
475
+ break;
476
+ }
477
+ }
478
+ }
479
+ /**
480
+ * Get IDs matching a range query using zone maps
466
481
  */
467
482
  async getIdsForRange(field, min, max, includeMin = true, includeMax = true) {
468
483
  // Track range query for field statistics
@@ -470,35 +485,8 @@ export class MetadataIndexManager {
470
485
  const stats = this.fieldStats.get(field);
471
486
  stats.rangeQueryCount++;
472
487
  }
473
- // Ensure sorted index exists
474
- await this.ensureSortedIndex(field);
475
- // With incremental updates, we should rarely need to rebuild
476
- // Only rebuild if it's marked dirty (e.g., after a bulk load or migration)
477
- let sortedIndex = this.sortedIndices.get(field);
478
- if (sortedIndex?.isDirty) {
479
- prodLog.warn(`MetadataIndex: Sorted index for field '${field}' was dirty, rebuilding...`);
480
- await this.buildSortedIndex(field);
481
- sortedIndex = this.sortedIndices.get(field);
482
- }
483
- if (!sortedIndex || sortedIndex.values.length === 0)
484
- return [];
485
- const sorted = sortedIndex.values;
486
- const resultSet = new Set();
487
- // Find range boundaries
488
- let start = 0;
489
- let end = sorted.length - 1;
490
- if (min !== undefined) {
491
- start = this.binarySearchStart(sorted, min, includeMin);
492
- }
493
- if (max !== undefined) {
494
- end = this.binarySearchEnd(sorted, max, includeMax);
495
- }
496
- // Collect all IDs in range
497
- for (let i = start; i <= end && i < sorted.length; i++) {
498
- const [, ids] = sorted[i];
499
- ids.forEach(id => resultSet.add(id));
500
- }
501
- return Array.from(resultSet);
488
+ // All fields use chunked sparse index with zone map optimization (v3.42.0)
489
+ return await this.getIdsFromChunksForRange(field, min, max, includeMin, includeMax);
502
490
  }
503
491
  /**
504
492
  * Generate field index filename for filter discovery
@@ -646,51 +634,22 @@ export class MetadataIndexManager {
646
634
  const updatedFields = new Set();
647
635
  for (let i = 0; i < fields.length; i++) {
648
636
  const { field, value } = fields[i];
649
- const key = this.getIndexKey(field, value);
650
- // Get or create index entry
651
- let entry = this.indexCache.get(key);
652
- if (!entry) {
653
- const loadedEntry = await this.loadIndexEntry(key);
654
- entry = loadedEntry ?? {
655
- field,
656
- value: this.normalizeValue(value, field), // Pass field for bucketing!
657
- ids: new Set(),
658
- lastUpdated: Date.now()
659
- };
660
- this.indexCache.set(key, entry);
661
- }
662
- // Add ID to entry
663
- const wasNew = entry.ids.size === 0;
664
- entry.ids.add(id);
665
- entry.lastUpdated = Date.now();
666
- this.dirtyEntries.add(key);
667
- // Update cardinality statistics
668
- if (wasNew) {
669
- this.updateCardinalityStats(field, value, 'add');
670
- }
671
- // Track type-field affinity for intelligent NLP
672
- this.updateTypeFieldAffinity(id, field, value, 'add');
673
- // Incrementally update sorted index for this field
674
- if (!updatedFields.has(field)) {
675
- this.updateSortedIndexAdd(field, value, id);
676
- updatedFields.add(field);
677
- }
678
- else {
679
- // Multiple values for same field - still update
680
- this.updateSortedIndexAdd(field, value, id);
681
- }
682
- // Update field index
637
+ // All fields use chunked sparse indexing (v3.42.0)
638
+ await this.addToChunkedIndex(field, value, id);
639
+ // Update statistics and tracking
640
+ this.updateCardinalityStats(field, value, 'add');
641
+ this.updateTypeFieldAffinity(id, field, value, 'add', metadata);
683
642
  await this.updateFieldIndex(field, value, 1);
684
643
  // Yield to event loop every 5 fields to prevent blocking
685
644
  if (i % 5 === 4) {
686
645
  await this.yieldToEventLoop();
687
646
  }
688
647
  }
689
- // Adaptive auto-flush based on usage patterns
648
+ // Adaptive auto-flush based on usage patterns (v3.42.0 - flush field indexes only)
690
649
  if (!skipFlush) {
691
650
  const timeSinceLastFlush = Date.now() - this.lastFlushTime;
692
- const shouldAutoFlush = this.dirtyEntries.size >= this.autoFlushThreshold || // Size threshold
693
- (this.dirtyEntries.size > 10 && timeSinceLastFlush > 5000); // Time threshold (5 seconds)
651
+ const shouldAutoFlush = this.dirtyFields.size >= this.autoFlushThreshold || // Size threshold
652
+ (this.dirtyFields.size > 10 && timeSinceLastFlush > 5000); // Time threshold (5 seconds)
694
653
  if (shouldAutoFlush) {
695
654
  const startTime = Date.now();
696
655
  await this.flush();
@@ -743,51 +702,31 @@ export class MetadataIndexManager {
743
702
  // Remove from specific field indexes
744
703
  const fields = this.extractIndexableFields(metadata);
745
704
  for (const { field, value } of fields) {
746
- const key = this.getIndexKey(field, value);
747
- let entry = this.indexCache.get(key);
748
- if (!entry) {
749
- const loadedEntry = await this.loadIndexEntry(key);
750
- entry = loadedEntry ?? undefined;
751
- }
752
- if (entry) {
753
- const hadId = entry.ids.has(id);
754
- entry.ids.delete(id);
755
- entry.lastUpdated = Date.now();
756
- this.dirtyEntries.add(key);
757
- // Update cardinality statistics
758
- if (hadId && entry.ids.size === 0) {
759
- this.updateCardinalityStats(field, value, 'remove');
760
- }
761
- // Track type-field affinity for intelligent NLP
762
- if (hadId) {
763
- this.updateTypeFieldAffinity(id, field, value, 'remove');
764
- }
765
- // Incrementally update sorted index when removing
766
- this.updateSortedIndexRemove(field, value, id);
767
- // Update field index
768
- await this.updateFieldIndex(field, value, -1);
769
- // If no IDs left, mark for cleanup
770
- if (entry.ids.size === 0) {
771
- this.indexCache.delete(key);
772
- await this.deleteIndexEntry(key);
773
- }
774
- }
705
+ // All fields use chunked sparse indexing (v3.42.0)
706
+ await this.removeFromChunkedIndex(field, value, id);
707
+ // Update statistics and tracking
708
+ this.updateCardinalityStats(field, value, 'remove');
709
+ this.updateTypeFieldAffinity(id, field, value, 'remove', metadata);
710
+ await this.updateFieldIndex(field, value, -1);
775
711
  // Invalidate cache
776
712
  this.metadataCache.invalidatePattern(`field_values_${field}`);
777
713
  }
778
714
  }
779
715
  else {
780
- // Remove from all indexes (slower, requires scanning)
781
- for (const [key, entry] of this.indexCache.entries()) {
782
- if (entry.ids.has(id)) {
783
- entry.ids.delete(id);
784
- entry.lastUpdated = Date.now();
785
- this.dirtyEntries.add(key);
786
- // Incrementally update sorted index
787
- this.updateSortedIndexRemove(entry.field, entry.value, id);
788
- if (entry.ids.size === 0) {
789
- this.indexCache.delete(key);
790
- await this.deleteIndexEntry(key);
716
+ // Remove from all indexes (slower, requires scanning all chunks)
717
+ // This should be rare - prefer providing metadata when removing
718
+ prodLog.warn(`Removing ID ${id} without metadata requires scanning all sparse indices (slow)`);
719
+ // Scan all sparse indices
720
+ for (const [field, sparseIndex] of this.sparseIndices.entries()) {
721
+ for (const chunkId of sparseIndex.getAllChunkIds()) {
722
+ const chunk = await this.chunkManager.loadChunk(field, chunkId);
723
+ if (chunk) {
724
+ // Check all values in this chunk
725
+ for (const [value, ids] of chunk.entries) {
726
+ if (ids.has(id)) {
727
+ await this.removeFromChunkedIndex(field, value, id);
728
+ }
729
+ }
791
730
  }
792
731
  }
793
732
  }
@@ -797,14 +736,9 @@ export class MetadataIndexManager {
797
736
  * Get all IDs in the index
798
737
  */
799
738
  async getAllIds() {
800
- // Collect all unique IDs from all index entries
739
+ // Use storage as the source of truth (v3.42.0 - removed redundant indexCache scan)
801
740
  const allIds = new Set();
802
- // First, add all IDs from the in-memory cache
803
- for (const entry of this.indexCache.values()) {
804
- entry.ids.forEach(id => allIds.add(id));
805
- }
806
- // If storage has a method to get all nouns, use it as the source of truth
807
- // This ensures we include items that might not be indexed yet
741
+ // Storage.getNouns() is the definitive source of all entity IDs
808
742
  if (this.storage && typeof this.storage.getNouns === 'function') {
809
743
  try {
810
744
  const result = await this.storage.getNouns({
@@ -818,13 +752,15 @@ export class MetadataIndexManager {
818
752
  }
819
753
  }
820
754
  catch (e) {
821
- // Fall back to using only indexed IDs
755
+ // If storage method fails, return empty array
756
+ prodLog.warn('Failed to get all IDs from storage:', e);
757
+ return [];
822
758
  }
823
759
  }
824
760
  return Array.from(allIds);
825
761
  }
826
762
  /**
827
- * Get IDs for a specific field-value combination with caching
763
+ * Get IDs for a specific field-value combination using chunked sparse index
828
764
  */
829
765
  async getIds(field, value) {
830
766
  // Track exact query for field statistics
@@ -832,27 +768,8 @@ export class MetadataIndexManager {
832
768
  const stats = this.fieldStats.get(field);
833
769
  stats.exactQueryCount++;
834
770
  }
835
- const key = this.getIndexKey(field, value);
836
- // Check metadata cache first
837
- const cacheKey = `ids_${key}`;
838
- const cachedIds = this.metadataCache.get(cacheKey);
839
- if (cachedIds) {
840
- return cachedIds;
841
- }
842
- // Try in-memory cache
843
- let entry = this.indexCache.get(key);
844
- // Load from storage if not cached
845
- if (!entry) {
846
- const loadedEntry = await this.loadIndexEntry(key);
847
- if (loadedEntry) {
848
- entry = loadedEntry;
849
- this.indexCache.set(key, entry);
850
- }
851
- }
852
- const ids = entry ? Array.from(entry.ids) : [];
853
- // Cache the result
854
- this.metadataCache.set(cacheKey, ids);
855
- return ids;
771
+ // All fields use chunked sparse indexing (v3.42.0)
772
+ return await this.getIdsFromChunks(field, value);
856
773
  }
857
774
  /**
858
775
  * Get all available values for a field (for filter discovery)
@@ -1044,11 +961,20 @@ export class MetadataIndexManager {
1044
961
  // Existence operator
1045
962
  case 'exists':
1046
963
  if (operand) {
1047
- // Get all IDs that have this field (any value)
964
+ // Get all IDs that have this field (any value) from chunked sparse index (v3.42.0)
1048
965
  const allIds = new Set();
1049
- for (const [key, entry] of this.indexCache.entries()) {
1050
- if (entry.field === field) {
1051
- entry.ids.forEach(id => allIds.add(id));
966
+ // Load sparse index for this field
967
+ const sparseIndex = this.sparseIndices.get(field) || await this.loadSparseIndex(field);
968
+ if (sparseIndex) {
969
+ // Iterate through all chunks for this field
970
+ for (const chunkId of sparseIndex.getAllChunkIds()) {
971
+ const chunk = await this.chunkManager.loadChunk(field, chunkId);
972
+ if (chunk) {
973
+ // Collect all IDs from all values in this chunk
974
+ for (const ids of chunk.entries.values()) {
975
+ ids.forEach(id => allIds.add(id));
976
+ }
977
+ }
1052
978
  }
1053
979
  }
1054
980
  fieldResults = Array.from(allIds);
@@ -1149,31 +1075,17 @@ export class MetadataIndexManager {
1149
1075
  }
1150
1076
  /**
1151
1077
  * Flush dirty entries to storage (non-blocking version)
1078
+ * NOTE (v3.42.0): Sparse indices are flushed immediately in add/remove operations
1152
1079
  */
1153
1080
  async flush() {
1154
- // Check if we have anything to flush (including sorted indices)
1155
- const hasDirtySortedIndices = Array.from(this.sortedIndices.values()).some(idx => idx.isDirty);
1156
- if (this.dirtyEntries.size === 0 && this.dirtyFields.size === 0 && !hasDirtySortedIndices) {
1081
+ // Check if we have anything to flush
1082
+ if (this.dirtyFields.size === 0) {
1157
1083
  return; // Nothing to flush
1158
1084
  }
1159
1085
  // Process in smaller batches to avoid blocking
1160
1086
  const BATCH_SIZE = 20;
1161
1087
  const allPromises = [];
1162
- // Flush value entries in batches
1163
- const dirtyEntriesArray = Array.from(this.dirtyEntries);
1164
- for (let i = 0; i < dirtyEntriesArray.length; i += BATCH_SIZE) {
1165
- const batch = dirtyEntriesArray.slice(i, i + BATCH_SIZE);
1166
- const batchPromises = batch.map(key => {
1167
- const entry = this.indexCache.get(key);
1168
- return entry ? this.saveIndexEntry(key, entry) : Promise.resolve();
1169
- });
1170
- allPromises.push(...batchPromises);
1171
- // Yield to event loop between batches
1172
- if (i + BATCH_SIZE < dirtyEntriesArray.length) {
1173
- await this.yieldToEventLoop();
1174
- }
1175
- }
1176
- // Flush field indexes in batches
1088
+ // Flush field indexes in batches (v3.42.0 - removed flat file flushing)
1177
1089
  const dirtyFieldsArray = Array.from(this.dirtyFields);
1178
1090
  for (let i = 0; i < dirtyFieldsArray.length; i += BATCH_SIZE) {
1179
1091
  const batch = dirtyFieldsArray.slice(i, i + BATCH_SIZE);
@@ -1187,15 +1099,8 @@ export class MetadataIndexManager {
1187
1099
  await this.yieldToEventLoop();
1188
1100
  }
1189
1101
  }
1190
- // Flush sorted indices (for range queries)
1191
- for (const [field, sortedIndex] of this.sortedIndices.entries()) {
1192
- if (sortedIndex.isDirty) {
1193
- allPromises.push(this.saveSortedIndex(field, sortedIndex));
1194
- }
1195
- }
1196
1102
  // Wait for all operations to complete
1197
1103
  await Promise.all(allPromises);
1198
- this.dirtyEntries.clear();
1199
1104
  this.dirtyFields.clear();
1200
1105
  this.lastFlushTime = Date.now();
1201
1106
  }
@@ -1275,69 +1180,6 @@ export class MetadataIndexManager {
1275
1180
  }
1276
1181
  }
1277
1182
  }
1278
- /**
1279
- * Save sorted index to storage for range queries with file locking
1280
- */
1281
- async saveSortedIndex(field, sortedIndex) {
1282
- const filename = `sorted_${field}`;
1283
- const lockKey = `sorted_index_${field}`;
1284
- const lockAcquired = await this.acquireLock(lockKey, 5000); // 5 second timeout
1285
- if (!lockAcquired) {
1286
- prodLog.warn(`Failed to acquire lock for sorted index '${field}', proceeding without lock`);
1287
- }
1288
- try {
1289
- const indexId = `__metadata_sorted_index__${filename}`;
1290
- const unifiedKey = `metadata:sorted:${field}`;
1291
- // Convert Set to Array for serialization
1292
- const serializable = {
1293
- values: sortedIndex.values.map(([value, ids]) => [value, Array.from(ids)]),
1294
- fieldType: sortedIndex.fieldType,
1295
- lastUpdated: Date.now()
1296
- };
1297
- await this.storage.saveMetadata(indexId, serializable);
1298
- // Mark as clean
1299
- sortedIndex.isDirty = false;
1300
- // Update unified cache (sorted indices are expensive to rebuild)
1301
- const size = JSON.stringify(serializable).length;
1302
- this.unifiedCache.set(unifiedKey, sortedIndex, 'metadata', size, 100); // Higher rebuild cost
1303
- }
1304
- finally {
1305
- if (lockAcquired) {
1306
- await this.releaseLock(lockKey);
1307
- }
1308
- }
1309
- }
1310
- /**
1311
- * Load sorted index from storage
1312
- */
1313
- async loadSortedIndex(field) {
1314
- const filename = `sorted_${field}`;
1315
- const indexId = `__metadata_sorted_index__${filename}`;
1316
- const unifiedKey = `metadata:sorted:${field}`;
1317
- // Check unified cache first
1318
- const cached = await this.unifiedCache.get(unifiedKey, async () => {
1319
- try {
1320
- const data = await this.storage.getMetadata(indexId);
1321
- if (data) {
1322
- // Convert Arrays back to Sets
1323
- const sortedIndex = {
1324
- values: data.values.map(([value, ids]) => [value, new Set(ids)]),
1325
- fieldType: data.fieldType || 'mixed',
1326
- isDirty: false
1327
- };
1328
- // Add to unified cache
1329
- const size = JSON.stringify(data).length;
1330
- this.unifiedCache.set(unifiedKey, sortedIndex, 'metadata', size, 100);
1331
- return sortedIndex;
1332
- }
1333
- }
1334
- catch (error) {
1335
- // Sorted index doesn't exist yet
1336
- }
1337
- return null;
1338
- });
1339
- return cached;
1340
- }
1341
1183
  /**
1342
1184
  * Get count of entities by type - O(1) operation using existing tracking
1343
1185
  * This exposes the production-ready counting that's already maintained
@@ -1362,19 +1204,12 @@ export class MetadataIndexManager {
1362
1204
  return new Map(this.totalEntitiesByType);
1363
1205
  }
1364
1206
  /**
1365
- * Get count of entities matching field-value criteria - O(1) lookup from existing indexes
1207
+ * Get count of entities matching field-value criteria - queries chunked sparse index
1366
1208
  */
1367
1209
  async getCountForCriteria(field, value) {
1368
- const key = this.getIndexKey(field, value);
1369
- let entry = this.indexCache.get(key);
1370
- if (!entry) {
1371
- const loadedEntry = await this.loadIndexEntry(key);
1372
- if (loadedEntry) {
1373
- entry = loadedEntry;
1374
- this.indexCache.set(key, entry);
1375
- }
1376
- }
1377
- return entry ? entry.ids.size : 0;
1210
+ // Use chunked sparse indexing (v3.42.0 - removed indexCache)
1211
+ const ids = await this.getIds(field, value);
1212
+ return ids.length;
1378
1213
  }
1379
1214
  /**
1380
1215
  * Get index statistics with enhanced counting information
@@ -1383,10 +1218,23 @@ export class MetadataIndexManager {
1383
1218
  const fields = new Set();
1384
1219
  let totalEntries = 0;
1385
1220
  let totalIds = 0;
1386
- for (const entry of this.indexCache.values()) {
1387
- fields.add(entry.field);
1388
- totalEntries++;
1389
- totalIds += entry.ids.size;
1221
+ // Collect stats from sparse indices (v3.42.0 - removed indexCache)
1222
+ for (const [field, sparseIndex] of this.sparseIndices.entries()) {
1223
+ fields.add(field);
1224
+ // Count entries and IDs from all chunks
1225
+ for (const chunkId of sparseIndex.getAllChunkIds()) {
1226
+ const chunk = await this.chunkManager.loadChunk(field, chunkId);
1227
+ if (chunk) {
1228
+ totalEntries += chunk.entries.size;
1229
+ for (const ids of chunk.entries.values()) {
1230
+ totalIds += ids.size;
1231
+ }
1232
+ }
1233
+ }
1234
+ }
1235
+ // Also include fields from fieldIndexes that might not have sparse indices yet
1236
+ for (const field of this.fieldIndexes.keys()) {
1237
+ fields.add(field);
1390
1238
  }
1391
1239
  return {
1392
1240
  totalEntries,
@@ -1408,9 +1256,8 @@ export class MetadataIndexManager {
1408
1256
  prodLog.info('🔄 Starting non-blocking metadata index rebuild with batch processing to prevent socket exhaustion...');
1409
1257
  prodLog.info(`📊 Storage adapter: ${this.storage.constructor.name}`);
1410
1258
  prodLog.info(`🔧 Batch processing available: ${!!this.storage.getMetadataBatch}`);
1411
- // Clear existing indexes
1412
- this.indexCache.clear();
1413
- this.dirtyEntries.clear();
1259
+ // Clear existing indexes (v3.42.0 - use sparse indices instead of flat files)
1260
+ this.sparseIndices.clear();
1414
1261
  this.fieldIndexes.clear();
1415
1262
  this.dirtyFields.clear();
1416
1263
  // Rebuild noun metadata indexes using pagination
@@ -1599,89 +1446,6 @@ export class MetadataIndexManager {
1599
1446
  this.isRebuilding = false;
1600
1447
  }
1601
1448
  }
1602
- /**
1603
- * Load index entry from storage using safe filenames
1604
- */
1605
- async loadIndexEntry(key) {
1606
- const unifiedKey = `metadata:entry:${key}`;
1607
- // Use unified cache with loader function
1608
- return await this.unifiedCache.get(unifiedKey, async () => {
1609
- try {
1610
- // Extract field and value from key
1611
- const [field, value] = key.split(':', 2);
1612
- const filename = this.getValueChunkFilename(field, value);
1613
- // Load from metadata indexes directory with safe filename
1614
- const indexId = `__metadata_index__${filename}`;
1615
- const data = await this.storage.getMetadata(indexId);
1616
- if (data) {
1617
- const entry = {
1618
- field: data.field,
1619
- value: data.value,
1620
- ids: new Set(data.ids || []),
1621
- lastUpdated: data.lastUpdated || Date.now()
1622
- };
1623
- // Add to unified cache (metadata entries are cheap to rebuild)
1624
- const size = JSON.stringify(Array.from(entry.ids)).length + 100;
1625
- this.unifiedCache.set(unifiedKey, entry, 'metadata', size, 1);
1626
- return entry;
1627
- }
1628
- }
1629
- catch (error) {
1630
- // Index entry doesn't exist yet
1631
- }
1632
- return null;
1633
- });
1634
- }
1635
- /**
1636
- * Save index entry to storage using safe filenames with file locking
1637
- */
1638
- async saveIndexEntry(key, entry) {
1639
- const lockKey = `index_entry_${key}`;
1640
- const lockAcquired = await this.acquireLock(lockKey, 5000); // 5 second timeout
1641
- if (!lockAcquired) {
1642
- prodLog.warn(`Failed to acquire lock for index entry '${key}', proceeding without lock`);
1643
- }
1644
- try {
1645
- const unifiedKey = `metadata:entry:${key}`;
1646
- const data = {
1647
- field: entry.field,
1648
- value: entry.value,
1649
- ids: Array.from(entry.ids),
1650
- lastUpdated: entry.lastUpdated
1651
- };
1652
- // Extract field and value from key for safe filename generation
1653
- const [field, value] = key.split(':', 2);
1654
- const filename = this.getValueChunkFilename(field, value);
1655
- // Store metadata indexes with safe filename
1656
- const indexId = `__metadata_index__${filename}`;
1657
- await this.storage.saveMetadata(indexId, data);
1658
- // Update unified cache
1659
- const size = JSON.stringify(data.ids).length + 100;
1660
- this.unifiedCache.set(unifiedKey, entry, 'metadata', size, 1);
1661
- }
1662
- finally {
1663
- if (lockAcquired) {
1664
- await this.releaseLock(lockKey);
1665
- }
1666
- }
1667
- }
1668
- /**
1669
- * Delete index entry from storage using safe filenames
1670
- */
1671
- async deleteIndexEntry(key) {
1672
- const unifiedKey = `metadata:entry:${key}`;
1673
- try {
1674
- const [field, value] = key.split(':', 2);
1675
- const filename = this.getValueChunkFilename(field, value);
1676
- const indexId = `__metadata_index__${filename}`;
1677
- await this.storage.saveMetadata(indexId, null);
1678
- // Remove from unified cache
1679
- this.unifiedCache.delete(unifiedKey);
1680
- }
1681
- catch (error) {
1682
- // Entry might not exist
1683
- }
1684
- }
1685
1449
  /**
1686
1450
  * Get field statistics for optimization and discovery
1687
1451
  */
@@ -1814,7 +1578,7 @@ export class MetadataIndexManager {
1814
1578
  * Update type-field affinity tracking for intelligent NLP
1815
1579
  * Tracks which fields commonly appear with which entity types
1816
1580
  */
1817
- updateTypeFieldAffinity(entityId, field, value, operation) {
1581
+ updateTypeFieldAffinity(entityId, field, value, operation, metadata) {
1818
1582
  // Only track affinity for non-system fields (but allow 'noun' for type detection)
1819
1583
  if (this.config.excludeFields.includes(field) && field !== 'noun')
1820
1584
  return;
@@ -1824,14 +1588,13 @@ export class MetadataIndexManager {
1824
1588
  // This is the type definition itself
1825
1589
  entityType = this.normalizeValue(value, field); // Pass field for bucketing!
1826
1590
  }
1591
+ else if (metadata && metadata.noun) {
1592
+ // Extract entity type from metadata (v3.42.0 - removed indexCache scan)
1593
+ entityType = this.normalizeValue(metadata.noun, 'noun');
1594
+ }
1827
1595
  else {
1828
- // Find the noun type for this entity by looking for entries with this entityId
1829
- for (const [key, entry] of this.indexCache.entries()) {
1830
- if (key.startsWith('noun:') && entry.ids.has(entityId)) {
1831
- entityType = key.split(':', 2)[1];
1832
- break;
1833
- }
1834
- }
1596
+ // No type information available, skip affinity tracking
1597
+ return;
1835
1598
  }
1836
1599
  if (!entityType)
1837
1600
  return; // No type found, skip affinity tracking