@soulcraft/brainy 3.40.2 → 3.41.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/dist/utils/metadataIndex.js +45 -23
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,18 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
|
|
4
4
|
|
|
5
|
+
## [3.41.0](https://github.com/soulcraftlabs/brainy/compare/v3.40.3...v3.41.0) (2025-10-13)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
### ✨ Features
|
|
9
|
+
|
|
10
|
+
* automatic temporal bucketing for metadata indexes ([b3edd4b](https://github.com/soulcraftlabs/brainy/commit/b3edd4b60a49d26d1ca776d459aa013736a0db9d))
|
|
11
|
+
|
|
12
|
+
### [3.40.3](https://github.com/soulcraftlabs/brainy/compare/v3.40.2...v3.40.3) (2025-10-13)
|
|
13
|
+
|
|
14
|
+
- fix: prevent metadata index file pollution by excluding high-cardinality fields (0c86c4f)
|
|
15
|
+
|
|
16
|
+
|
|
5
17
|
### [3.40.2](https://github.com/soulcraftlabs/brainy/compare/v3.40.1...v3.40.2) (2025-10-13)
|
|
6
18
|
|
|
7
19
|
|
|
@@ -39,7 +39,25 @@ export class MetadataIndexManager {
|
|
|
39
39
|
rebuildThreshold: config.rebuildThreshold ?? 0.1,
|
|
40
40
|
autoOptimize: config.autoOptimize ?? true,
|
|
41
41
|
indexedFields: config.indexedFields ?? [],
|
|
42
|
-
excludeFields: config.excludeFields ?? [
|
|
42
|
+
excludeFields: config.excludeFields ?? [
|
|
43
|
+
// ONLY exclude truly un-indexable fields (binary data, large content)
|
|
44
|
+
// Timestamps are NOW indexed with automatic bucketing (prevents pollution)
|
|
45
|
+
// Vectors and embeddings (binary data, already have HNSW indexes)
|
|
46
|
+
'embedding',
|
|
47
|
+
'vector',
|
|
48
|
+
'embeddings',
|
|
49
|
+
'vectors',
|
|
50
|
+
// Large content fields (too large for metadata indexing)
|
|
51
|
+
'content',
|
|
52
|
+
'data',
|
|
53
|
+
'originalData',
|
|
54
|
+
'_data',
|
|
55
|
+
// Primary keys (use direct lookups instead)
|
|
56
|
+
'id'
|
|
57
|
+
// NOTE: 'accessed', 'modified', 'createdAt', etc. are NO LONGER excluded!
|
|
58
|
+
// They are now indexed with automatic 1-minute bucketing to prevent file pollution
|
|
59
|
+
// This enables range queries like: modified > yesterday
|
|
60
|
+
]
|
|
43
61
|
};
|
|
44
62
|
// Initialize metadata cache with similar config to search cache
|
|
45
63
|
this.metadataCache = new MetadataIndexCache({
|
|
@@ -134,7 +152,7 @@ export class MetadataIndexManager {
|
|
|
134
152
|
* Get index key for field and value
|
|
135
153
|
*/
|
|
136
154
|
getIndexKey(field, value) {
|
|
137
|
-
const normalizedValue = this.normalizeValue(value);
|
|
155
|
+
const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
138
156
|
return `${field}:${normalizedValue}`;
|
|
139
157
|
}
|
|
140
158
|
/**
|
|
@@ -267,7 +285,7 @@ export class MetadataIndexManager {
|
|
|
267
285
|
});
|
|
268
286
|
}
|
|
269
287
|
const sortedIndex = this.sortedIndices.get(field);
|
|
270
|
-
const normalizedValue = this.normalizeValue(value);
|
|
288
|
+
const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
271
289
|
// Find where this value should be in the sorted array
|
|
272
290
|
const insertPos = this.findInsertPosition(sortedIndex.values, normalizedValue, sortedIndex.fieldType);
|
|
273
291
|
if (insertPos < sortedIndex.values.length &&
|
|
@@ -289,7 +307,7 @@ export class MetadataIndexManager {
|
|
|
289
307
|
const sortedIndex = this.sortedIndices.get(field);
|
|
290
308
|
if (!sortedIndex || sortedIndex.values.length === 0)
|
|
291
309
|
return;
|
|
292
|
-
const normalizedValue = this.normalizeValue(value);
|
|
310
|
+
const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
293
311
|
// Binary search to find the value
|
|
294
312
|
const pos = this.findInsertPosition(sortedIndex.values, normalizedValue, sortedIndex.fieldType);
|
|
295
313
|
if (pos < sortedIndex.values.length &&
|
|
@@ -429,12 +447,10 @@ export class MetadataIndexManager {
|
|
|
429
447
|
else {
|
|
430
448
|
stats.indexType = 'hash';
|
|
431
449
|
}
|
|
432
|
-
// Determine normalization strategy for high cardinality fields
|
|
450
|
+
// Determine normalization strategy for high cardinality NON-temporal fields
|
|
451
|
+
// (Temporal fields are already bucketed in normalizeValue from the start!)
|
|
433
452
|
if (hasHighCardinality) {
|
|
434
|
-
if (
|
|
435
|
-
stats.normalizationStrategy = 'bucket'; // Time bucketing
|
|
436
|
-
}
|
|
437
|
-
else if (isNumeric) {
|
|
453
|
+
if (isNumeric) {
|
|
438
454
|
stats.normalizationStrategy = 'precision'; // Reduce float precision
|
|
439
455
|
}
|
|
440
456
|
else {
|
|
@@ -494,7 +510,7 @@ export class MetadataIndexManager {
|
|
|
494
510
|
* Generate value chunk filename for scalable storage
|
|
495
511
|
*/
|
|
496
512
|
getValueChunkFilename(field, value, chunkIndex = 0) {
|
|
497
|
-
const normalizedValue = this.normalizeValue(value);
|
|
513
|
+
const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
498
514
|
const safeValue = this.makeSafeFilename(normalizedValue);
|
|
499
515
|
return `${field}_${safeValue}_chunk${chunkIndex}`;
|
|
500
516
|
}
|
|
@@ -516,19 +532,25 @@ export class MetadataIndexManager {
|
|
|
516
532
|
return '__NULL__';
|
|
517
533
|
if (typeof value === 'boolean')
|
|
518
534
|
return value ? '__TRUE__' : '__FALSE__';
|
|
519
|
-
//
|
|
535
|
+
// ALWAYS apply bucketing to temporal fields (prevents pollution from the start!)
|
|
536
|
+
// This is the key fix: don't wait for cardinality stats, just bucket immediately
|
|
537
|
+
if (field && typeof value === 'number') {
|
|
538
|
+
const fieldLower = field.toLowerCase();
|
|
539
|
+
const isTemporal = fieldLower.includes('time') || fieldLower.includes('date') ||
|
|
540
|
+
fieldLower.includes('accessed') || fieldLower.includes('modified') ||
|
|
541
|
+
fieldLower.includes('created') || fieldLower.includes('updated');
|
|
542
|
+
if (isTemporal) {
|
|
543
|
+
// Apply time bucketing immediately (no need to wait for stats)
|
|
544
|
+
const bucketSize = this.TIMESTAMP_PRECISION_MS; // 1 minute buckets
|
|
545
|
+
const bucketed = Math.floor(value / bucketSize) * bucketSize;
|
|
546
|
+
return bucketed.toString();
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
// Apply smart normalization based on field statistics (for non-temporal fields)
|
|
520
550
|
if (field && this.fieldStats.has(field)) {
|
|
521
551
|
const stats = this.fieldStats.get(field);
|
|
522
552
|
const strategy = stats.normalizationStrategy;
|
|
523
|
-
if (strategy === 'bucket' && typeof value === 'number') {
|
|
524
|
-
// Time bucketing for timestamps
|
|
525
|
-
if (field.toLowerCase().includes('time') || field.toLowerCase().includes('date')) {
|
|
526
|
-
const bucketSize = this.TIMESTAMP_PRECISION_MS;
|
|
527
|
-
const bucketed = Math.floor(value / bucketSize) * bucketSize;
|
|
528
|
-
return bucketed.toString();
|
|
529
|
-
}
|
|
530
|
-
}
|
|
531
|
-
else if (strategy === 'precision' && typeof value === 'number') {
|
|
553
|
+
if (strategy === 'precision' && typeof value === 'number') {
|
|
532
554
|
// Reduce float precision for high cardinality numeric fields
|
|
533
555
|
const rounded = Math.round(value * Math.pow(10, this.FLOAT_PRECISION)) / Math.pow(10, this.FLOAT_PRECISION);
|
|
534
556
|
return rounded.toString();
|
|
@@ -631,7 +653,7 @@ export class MetadataIndexManager {
|
|
|
631
653
|
const loadedEntry = await this.loadIndexEntry(key);
|
|
632
654
|
entry = loadedEntry ?? {
|
|
633
655
|
field,
|
|
634
|
-
value: this.normalizeValue(value),
|
|
656
|
+
value: this.normalizeValue(value, field), // Pass field for bucketing!
|
|
635
657
|
ids: new Set(),
|
|
636
658
|
lastUpdated: Date.now()
|
|
637
659
|
};
|
|
@@ -704,7 +726,7 @@ export class MetadataIndexManager {
|
|
|
704
726
|
};
|
|
705
727
|
this.fieldIndexes.set(field, fieldIndex);
|
|
706
728
|
}
|
|
707
|
-
const normalizedValue = this.normalizeValue(value);
|
|
729
|
+
const normalizedValue = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
708
730
|
fieldIndex.values[normalizedValue] = (fieldIndex.values[normalizedValue] || 0) + delta;
|
|
709
731
|
// Remove if count drops to 0
|
|
710
732
|
if (fieldIndex.values[normalizedValue] <= 0) {
|
|
@@ -1800,7 +1822,7 @@ export class MetadataIndexManager {
|
|
|
1800
1822
|
let entityType = null;
|
|
1801
1823
|
if (field === 'noun') {
|
|
1802
1824
|
// This is the type definition itself
|
|
1803
|
-
entityType = this.normalizeValue(value);
|
|
1825
|
+
entityType = this.normalizeValue(value, field); // Pass field for bucketing!
|
|
1804
1826
|
}
|
|
1805
1827
|
else {
|
|
1806
1828
|
// Find the noun type for this entity by looking for entries with this entityId
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@soulcraft/brainy",
|
|
3
|
-
"version": "3.40.2",
|
|
3
|
+
"version": "3.41.0",
|
|
4
4
|
"description": "Universal Knowledge Protocol™ - World's first Triple Intelligence database unifying vector, graph, and document search in one API. 31 nouns × 40 verbs for infinite expressiveness.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.js",
|