@soulcraft/brainy 3.34.0 → 3.35.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,12 @@
2
2
 
3
3
  All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
4
4
 
5
+ ### [3.35.0](https://github.com/soulcraftlabs/brainy/compare/v3.34.0...v3.35.0) (2025-10-10)
6
+
7
+ - feat: implement HNSW index rebuild and unified index interface (6a4d1ae)
8
+ - cleaning up (12d78ba)
9
+
10
+
5
11
  ### [3.34.0](https://github.com/soulcraftlabs/brainy/compare/v3.33.0...v3.34.0) (2025-10-09)
6
12
 
7
13
  - test: adjust type-matching tests for real embeddings (v3.33.0) (1c5c77e)
package/dist/brainy.d.ts CHANGED
@@ -1070,6 +1070,21 @@ export declare class Brainy<T = any> implements BrainyInterface<T> {
1070
1070
  /**
1071
1071
  * Rebuild indexes if there's existing data but empty indexes
1072
1072
  */
1073
+ /**
1074
+ * Rebuild indexes from persisted data if needed (v3.35.0+)
1075
+ *
1076
+ * FIXES FOR CRITICAL BUGS:
1077
+ * - Bug #1: GraphAdjacencyIndex rebuild never called ✅ FIXED
1078
+ * - Bug #2: Early return blocks recovery when count=0 ✅ FIXED
1079
+ * - Bug #4: HNSW index has no rebuild mechanism ✅ FIXED
1080
+ *
1081
+ * Production-grade rebuild with:
1082
+ * - Handles millions of entities via pagination
1083
+ * - Smart threshold-based decisions (auto-rebuild < 1000 items)
1084
+ * - Progress reporting for large datasets
1085
+ * - Parallel index rebuilds for performance
1086
+ * - Robust error recovery (continues on partial failures)
1087
+ */
1073
1088
  private rebuildIndexesIfNeeded;
1074
1089
  /**
1075
1090
  * Close and cleanup
package/dist/brainy.js CHANGED
@@ -2385,59 +2385,88 @@ export class Brainy {
2385
2385
  /**
2386
2386
  * Rebuild indexes if there's existing data but empty indexes
2387
2387
  */
2388
+ /**
2389
+ * Rebuild indexes from persisted data if needed (v3.35.0+)
2390
+ *
2391
+ * FIXES FOR CRITICAL BUGS:
2392
+ * - Bug #1: GraphAdjacencyIndex rebuild never called ✅ FIXED
2393
+ * - Bug #2: Early return blocks recovery when count=0 ✅ FIXED
2394
+ * - Bug #4: HNSW index has no rebuild mechanism ✅ FIXED
2395
+ *
2396
+ * Production-grade rebuild with:
2397
+ * - Handles millions of entities via pagination
2398
+ * - Smart threshold-based decisions (auto-rebuild < 1000 items)
2399
+ * - Progress reporting for large datasets
2400
+ * - Parallel index rebuilds for performance
2401
+ * - Robust error recovery (continues on partial failures)
2402
+ */
2388
2403
  async rebuildIndexesIfNeeded() {
2389
2404
  try {
2390
- // Check if storage has data
2405
+ // Check if auto-rebuild is explicitly disabled
2406
+ if (this.config.disableAutoRebuild === true) {
2407
+ if (!this.config.silent) {
2408
+ console.log('⚡ Auto-rebuild explicitly disabled via config');
2409
+ }
2410
+ return;
2411
+ }
2412
+ // BUG #2 FIX: Don't trust counts - check actual storage instead
2413
+ // Counts can be lost/corrupted in container restarts
2391
2414
  const entities = await this.storage.getNouns({ pagination: { limit: 1 } });
2392
2415
  const totalCount = entities.totalCount || 0;
2393
- if (totalCount === 0) {
2394
- // No data in storage, no rebuild needed
2416
+ // If storage is truly empty, no rebuild needed
2417
+ if (totalCount === 0 && entities.items.length === 0) {
2395
2418
  return;
2396
2419
  }
2397
2420
  // Intelligent decision: Auto-rebuild only for small datasets
2398
2421
  // For large datasets, use lazy loading for optimal performance
2399
2422
  const AUTO_REBUILD_THRESHOLD = 1000; // Only auto-rebuild if < 1000 items
2400
- // Check if metadata index is empty
2423
+ // Check if indexes need rebuilding
2401
2424
  const metadataStats = await this.metadataIndex.getStats();
2402
- if (metadataStats.totalEntries === 0 && totalCount > 0) {
2403
- if (totalCount < AUTO_REBUILD_THRESHOLD) {
2404
- // Small dataset - rebuild for convenience
2405
- if (!this.config.silent) {
2406
- console.log(`🔄 Small dataset (${totalCount} items) - rebuilding index for optimal performance...`);
2407
- }
2408
- await this.metadataIndex.rebuild();
2409
- const newStats = await this.metadataIndex.getStats();
2410
- if (!this.config.silent) {
2411
- console.log(`✅ Index rebuilt: ${newStats.totalEntries} entries`);
2412
- }
2413
- }
2414
- else {
2415
- // Large dataset - use lazy loading
2416
- if (!this.config.silent) {
2417
- console.log(`⚡ Large dataset (${totalCount} items) - using lazy loading for optimal startup performance`);
2418
- console.log('💡 Tip: Indexes will build automatically as you use the system');
2419
- }
2420
- }
2425
+ const hnswIndexSize = this.index.size();
2426
+ const graphIndexSize = await this.graphIndex.size();
2427
+ const needsRebuild = metadataStats.totalEntries === 0 ||
2428
+ hnswIndexSize === 0 ||
2429
+ graphIndexSize === 0 ||
2430
+ this.config.disableAutoRebuild === false; // Explicitly enabled
2431
+ if (!needsRebuild) {
2432
+ // All indexes populated, no rebuild needed
2433
+ return;
2421
2434
  }
2422
- // Override with explicit config if provided
2423
- if (this.config.disableAutoRebuild === true) {
2435
+ // Small dataset: Rebuild all indexes for best performance
2436
+ if (totalCount < AUTO_REBUILD_THRESHOLD || this.config.disableAutoRebuild === false) {
2424
2437
  if (!this.config.silent) {
2425
- console.log('⚡ Auto-rebuild explicitly disabled via config');
2438
+ console.log(this.config.disableAutoRebuild === false
2439
+ ? '🔄 Auto-rebuild explicitly enabled - rebuilding all indexes...'
2440
+ : `🔄 Small dataset (${totalCount} items) - rebuilding all indexes...`);
2441
+ }
2442
+ // BUG #1 FIX: Actually call graphIndex.rebuild()
2443
+ // BUG #4 FIX: Actually call HNSW index.rebuild()
2444
+ // Rebuild all 3 indexes in parallel for performance
2445
+ const startTime = Date.now();
2446
+ await Promise.all([
2447
+ metadataStats.totalEntries === 0 ? this.metadataIndex.rebuild() : Promise.resolve(),
2448
+ hnswIndexSize === 0 ? this.index.rebuild() : Promise.resolve(),
2449
+ graphIndexSize === 0 ? this.graphIndex.rebuild() : Promise.resolve()
2450
+ ]);
2451
+ const duration = Date.now() - startTime;
2452
+ if (!this.config.silent) {
2453
+ console.log(`✅ All indexes rebuilt in ${duration}ms:\n` +
2454
+ ` - Metadata: ${await this.metadataIndex.getStats().then(s => s.totalEntries)} entries\n` +
2455
+ ` - HNSW Vector: ${this.index.size()} nodes\n` +
2456
+ ` - Graph Adjacency: ${await this.graphIndex.size()} relationships`);
2426
2457
  }
2427
- return;
2428
2458
  }
2429
- else if (this.config.disableAutoRebuild === false && metadataStats.totalEntries === 0) {
2430
- // Explicitly enabled - rebuild regardless of size
2459
+ else {
2460
+ // Large dataset: Use lazy loading for fast startup
2431
2461
  if (!this.config.silent) {
2432
- console.log('🔄 Auto-rebuild explicitly enabled - rebuilding index...');
2462
+ console.log(`⚡ Large dataset (${totalCount} items) - using lazy loading for optimal startup`);
2463
+ console.log('💡 Indexes will build automatically as you query the system');
2433
2464
  }
2434
- await this.metadataIndex.rebuild();
2435
2465
  }
2436
- // Note: GraphAdjacencyIndex will rebuild itself as relationships are added
2437
- // Vector index should already be populated if storage has data
2438
2466
  }
2439
2467
  catch (error) {
2440
- console.warn('Warning: Could not check or rebuild indexes:', error);
2468
+ console.warn('Warning: Could not rebuild indexes:', error);
2469
+ // Don't throw - allow system to start even if rebuild fails
2441
2470
  }
2442
2471
  }
2443
2472
  /**
@@ -3,6 +3,7 @@
3
3
  * Based on the paper: "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs"
4
4
  */
5
5
  import { DistanceFunction, HNSWConfig, HNSWNoun, Vector, VectorDocument } from '../coreTypes.js';
6
+ import type { BaseStorage } from '../storage/baseStorage.js';
6
7
  export declare class HNSWIndex {
7
8
  private nouns;
8
9
  private entryPointId;
@@ -13,8 +14,10 @@ export declare class HNSWIndex {
13
14
  private distanceFunction;
14
15
  private dimension;
15
16
  private useParallelization;
17
+ private storage;
16
18
  constructor(config?: Partial<HNSWConfig>, distanceFunction?: DistanceFunction, options?: {
17
19
  useParallelization?: boolean;
20
+ storage?: BaseStorage;
18
21
  });
19
22
  /**
20
23
  * Set whether to use parallelization for performance-critical operations
@@ -98,6 +101,27 @@ export declare class HNSWIndex {
98
101
  * This enables O(n) clustering using HNSW's natural hierarchy
99
102
  */
100
103
  getNodesAtLevel(level: number): HNSWNoun[];
104
+ /**
105
+ * Rebuild HNSW index from persisted graph data (v3.35.0+)
106
+ *
107
+ * This is a production-grade O(N) rebuild that restores the pre-computed graph structure
108
+ * from storage. Much faster than re-building which is O(N log N).
109
+ *
110
+ * Designed for millions of entities with:
111
+ * - Cursor-based pagination (no memory overflow)
112
+ * - Batch processing (configurable batch size)
113
+ * - Progress reporting (optional callback)
114
+ * - Error recovery (continues on partial failures)
115
+ * - Lazy mode support (memory-efficient for constrained environments)
116
+ *
117
+ * @param options Rebuild options
118
+ * @returns Promise that resolves when rebuild is complete
119
+ */
120
+ rebuild(options?: {
121
+ lazy?: boolean;
122
+ batchSize?: number;
123
+ onProgress?: (loaded: number, total: number) => void;
124
+ }): Promise<void>;
101
125
  /**
102
126
  * Get level statistics for understanding the hierarchy
103
127
  */
@@ -20,12 +20,14 @@ export class HNSWIndex {
20
20
  this.MAX_TRACKED_LEVELS = 10; // Only track top levels for memory efficiency
21
21
  this.dimension = null;
22
22
  this.useParallelization = true; // Whether to use parallelization for performance-critical operations
23
+ this.storage = null; // Storage adapter for HNSW persistence (v3.35.0+)
23
24
  this.config = { ...DEFAULT_CONFIG, ...config };
24
25
  this.distanceFunction = distanceFunction;
25
26
  this.useParallelization =
26
27
  options.useParallelization !== undefined
27
28
  ? options.useParallelization
28
29
  : true;
30
+ this.storage = options.storage || null;
29
31
  }
30
32
  /**
31
33
  * Set whether to use parallelization for performance-critical operations
@@ -182,6 +184,19 @@ export class HNSWIndex {
182
184
  if (neighbor.connections.get(level).size > this.config.M) {
183
185
  this.pruneConnections(neighbor, level);
184
186
  }
187
+ // Persist updated neighbor HNSW data (v3.35.0+)
188
+ if (this.storage) {
189
+ const neighborConnectionsObj = {};
190
+ for (const [lvl, nounIds] of neighbor.connections.entries()) {
191
+ neighborConnectionsObj[lvl.toString()] = Array.from(nounIds);
192
+ }
193
+ this.storage.saveHNSWData(neighborId, {
194
+ level: neighbor.level,
195
+ connections: neighborConnectionsObj
196
+ }).catch((error) => {
197
+ console.error(`Failed to persist neighbor HNSW data for ${neighborId}:`, error);
198
+ });
199
+ }
185
200
  }
186
201
  // Update entry point for the next level
187
202
  if (nearestNouns.size > 0) {
@@ -213,6 +228,27 @@ export class HNSWIndex {
213
228
  }
214
229
  this.highLevelNodes.get(nounLevel).add(id);
215
230
  }
231
+ // Persist HNSW graph data to storage (v3.35.0+)
232
+ if (this.storage) {
233
+ // Convert connections Map to serializable format
234
+ const connectionsObj = {};
235
+ for (const [level, nounIds] of noun.connections.entries()) {
236
+ connectionsObj[level.toString()] = Array.from(nounIds);
237
+ }
238
+ await this.storage.saveHNSWData(id, {
239
+ level: nounLevel,
240
+ connections: connectionsObj
241
+ }).catch((error) => {
242
+ console.error(`Failed to persist HNSW data for ${id}:`, error);
243
+ });
244
+ // Persist system data (entry point and max level)
245
+ await this.storage.saveHNSWSystem({
246
+ entryPointId: this.entryPointId,
247
+ maxLevel: this.maxLevel
248
+ }).catch((error) => {
249
+ console.error('Failed to persist HNSW system data:', error);
250
+ });
251
+ }
216
252
  return id;
217
253
  }
218
254
  /**
@@ -451,6 +487,107 @@ export class HNSWIndex {
451
487
  }
452
488
  return nodesAtLevel;
453
489
  }
490
+ /**
491
+ * Rebuild HNSW index from persisted graph data (v3.35.0+)
492
+ *
493
+ * This is a production-grade O(N) rebuild that restores the pre-computed graph structure
494
+ * from storage. Much faster than re-building which is O(N log N).
495
+ *
496
+ * Designed for millions of entities with:
497
+ * - Cursor-based pagination (no memory overflow)
498
+ * - Batch processing (configurable batch size)
499
+ * - Progress reporting (optional callback)
500
+ * - Error recovery (continues on partial failures)
501
+ * - Lazy mode support (memory-efficient for constrained environments)
502
+ *
503
+ * @param options Rebuild options
504
+ * @returns Promise that resolves when rebuild is complete
505
+ */
506
+ async rebuild(options = {}) {
507
+ if (!this.storage) {
508
+ console.warn('HNSW rebuild skipped: no storage adapter configured');
509
+ return;
510
+ }
511
+ const batchSize = options.batchSize || 1000;
512
+ const lazy = options.lazy || false;
513
+ try {
514
+ // Step 1: Clear existing in-memory index
515
+ this.clear();
516
+ // Step 2: Load system data (entry point, max level)
517
+ const systemData = await this.storage.getHNSWSystem();
518
+ if (systemData) {
519
+ this.entryPointId = systemData.entryPointId;
520
+ this.maxLevel = systemData.maxLevel;
521
+ }
522
+ // Step 3: Paginate through all nouns and restore HNSW graph structure
523
+ let loadedCount = 0;
524
+ let totalCount = undefined;
525
+ let hasMore = true;
526
+ let cursor = undefined;
527
+ while (hasMore) {
528
+ // Fetch batch of nouns from storage (cast needed as method is not in base interface)
529
+ const result = await this.storage.getNounsWithPagination({
530
+ limit: batchSize,
531
+ cursor
532
+ });
533
+ // Set total count on first batch
534
+ if (totalCount === undefined && result.totalCount !== undefined) {
535
+ totalCount = result.totalCount;
536
+ }
537
+ // Process each noun in the batch
538
+ for (const nounData of result.items) {
539
+ try {
540
+ // Load HNSW graph data for this entity
541
+ const hnswData = await this.storage.getHNSWData(nounData.id);
542
+ if (!hnswData) {
543
+ // No HNSW data - skip (might be entity added before persistence)
544
+ continue;
545
+ }
546
+ // Create noun object with restored connections
547
+ const noun = {
548
+ id: nounData.id,
549
+ vector: lazy ? [] : nounData.vector, // Empty vector in lazy mode
550
+ connections: new Map(),
551
+ level: hnswData.level
552
+ };
553
+ // Restore connections from persisted data
554
+ for (const [levelStr, nounIds] of Object.entries(hnswData.connections)) {
555
+ const level = parseInt(levelStr, 10);
556
+ noun.connections.set(level, new Set(nounIds));
557
+ }
558
+ // Add to in-memory index
559
+ this.nouns.set(nounData.id, noun);
560
+ // Track high-level nodes for O(1) entry point selection
561
+ if (noun.level >= 2 && noun.level <= this.MAX_TRACKED_LEVELS) {
562
+ if (!this.highLevelNodes.has(noun.level)) {
563
+ this.highLevelNodes.set(noun.level, new Set());
564
+ }
565
+ this.highLevelNodes.get(noun.level).add(nounData.id);
566
+ }
567
+ loadedCount++;
568
+ }
569
+ catch (error) {
570
+ // Log error but continue (robust error recovery)
571
+ console.error(`Failed to rebuild HNSW data for ${nounData.id}:`, error);
572
+ }
573
+ }
574
+ // Report progress
575
+ if (options.onProgress && totalCount !== undefined) {
576
+ options.onProgress(loadedCount, totalCount);
577
+ }
578
+ // Check for more data
579
+ hasMore = result.hasMore;
580
+ cursor = result.nextCursor;
581
+ }
582
+ console.log(`HNSW index rebuilt successfully: ${loadedCount} entities, ` +
583
+ `${this.maxLevel + 1} levels, entry point: ${this.entryPointId || 'none'}` +
584
+ (lazy ? ' (lazy mode - vectors loaded on-demand)' : ''));
585
+ }
586
+ catch (error) {
587
+ console.error('HNSW rebuild failed:', error);
588
+ throw new Error(`Failed to rebuild HNSW index: ${error}`);
589
+ }
590
+ }
454
591
  /**
455
592
  * Get level statistics for understanding the hierarchy
456
593
  */
@@ -5,7 +5,7 @@
5
5
  */
6
6
  import { DistanceFunction, HNSWConfig, Vector, VectorDocument } from '../coreTypes.js';
7
7
  import { HNSWIndex } from './hnswIndex.js';
8
- import { StorageAdapter } from '../coreTypes.js';
8
+ import type { BaseStorage } from '../storage/baseStorage.js';
9
9
  export interface HNSWOptimizedConfig extends HNSWConfig {
10
10
  memoryThreshold?: number;
11
11
  productQuantization?: {
@@ -88,7 +88,6 @@ declare class ProductQuantizer {
88
88
  export declare class HNSWIndexOptimized extends HNSWIndex {
89
89
  private optimizedConfig;
90
90
  private productQuantizer;
91
- private storage;
92
91
  private useDiskBasedIndex;
93
92
  private useProductQuantization;
94
93
  private quantizedVectors;
@@ -96,7 +95,7 @@ export declare class HNSWIndexOptimized extends HNSWIndex {
96
95
  private vectorCount;
97
96
  private memoryUpdateLock;
98
97
  private unifiedCache;
99
- constructor(config: Partial<HNSWOptimizedConfig>, distanceFunction: DistanceFunction, storage?: StorageAdapter | null);
98
+ constructor(config: Partial<HNSWOptimizedConfig>, distanceFunction: DistanceFunction, storage?: BaseStorage | null);
100
99
  /**
101
100
  * Thread-safe method to update memory usage
102
101
  * @param memoryDelta Change in memory usage (can be negative)
@@ -145,16 +144,6 @@ export declare class HNSWIndexOptimized extends HNSWIndex {
145
144
  * @returns Estimated memory usage in bytes
146
145
  */
147
146
  getMemoryUsage(): number;
148
- /**
149
- * Set the storage adapter
150
- * @param storage Storage adapter
151
- */
152
- setStorage(storage: StorageAdapter): void;
153
- /**
154
- * Get the storage adapter
155
- * @returns Storage adapter or null if not set
156
- */
157
- getStorage(): StorageAdapter | null;
158
147
  /**
159
148
  * Set whether to use disk-based index
160
149
  * @param useDiskBasedIndex Whether to use disk-based index
@@ -211,10 +211,9 @@ class ProductQuantizer {
211
211
  */
212
212
  export class HNSWIndexOptimized extends HNSWIndex {
213
213
  constructor(config = {}, distanceFunction, storage = null) {
214
- // Initialize base HNSW index with standard config
215
- super(config, distanceFunction);
214
+ // Initialize base HNSW index with standard config and storage
215
+ super(config, distanceFunction, { storage: storage || undefined });
216
216
  this.productQuantizer = null;
217
- this.storage = null;
218
217
  this.useDiskBasedIndex = false;
219
218
  this.useProductQuantization = false;
220
219
  this.quantizedVectors = new Map();
@@ -224,8 +223,6 @@ export class HNSWIndexOptimized extends HNSWIndex {
224
223
  this.memoryUpdateLock = Promise.resolve();
225
224
  // Set optimized config
226
225
  this.optimizedConfig = { ...DEFAULT_OPTIMIZED_CONFIG, ...config };
227
- // Set storage adapter
228
- this.storage = storage;
229
226
  // Initialize product quantizer if enabled
230
227
  if (this.optimizedConfig.productQuantization?.enabled) {
231
228
  this.useProductQuantization = true;
@@ -302,18 +299,9 @@ export class HNSWIndexOptimized extends HNSWIndex {
302
299
  return await super.addItem({ id, vector: reconstructedVector });
303
300
  }
304
301
  // If disk-based index is active and storage is available, store the vector
305
- if (this.useDiskBasedIndex && this.storage) {
306
- // Create a noun object
307
- const noun = {
308
- id,
309
- vector,
310
- connections: new Map(),
311
- level: 0
312
- };
313
- // Store the noun
314
- this.storage.saveNoun(noun).catch((error) => {
315
- console.error(`Failed to save noun ${id} to storage:`, error);
316
- });
302
+ if (this.useDiskBasedIndex) {
303
+ // Storage is handled by the base class now via HNSW persistence
304
+ // No additional storage needed here
317
305
  }
318
306
  // Add the vector to the in-memory index
319
307
  return await super.addItem(item);
@@ -349,12 +337,8 @@ export class HNSWIndexOptimized extends HNSWIndex {
349
337
  if (this.useProductQuantization) {
350
338
  this.quantizedVectors.delete(id);
351
339
  }
352
- // If disk-based index is active and storage is available, remove the vector from storage
353
- if (this.useDiskBasedIndex && this.storage) {
354
- this.storage.deleteNoun(id).catch((error) => {
355
- console.error(`Failed to delete noun ${id} from storage:`, error);
356
- });
357
- }
340
+ // If disk-based index is active, removal is handled by base class
341
+ // No additional removal needed here
358
342
  // Update memory usage estimate (async operation, but don't block removal)
359
343
  this.getMemoryUsageAsync().then((currentMemoryUsage) => {
360
344
  if (currentMemoryUsage.vectorCount > 0) {
@@ -428,20 +412,7 @@ export class HNSWIndexOptimized extends HNSWIndex {
428
412
  getMemoryUsage() {
429
413
  return this.memoryUsage;
430
414
  }
431
- /**
432
- * Set the storage adapter
433
- * @param storage Storage adapter
434
- */
435
- setStorage(storage) {
436
- this.storage = storage;
437
- }
438
- /**
439
- * Get the storage adapter
440
- * @returns Storage adapter or null if not set
441
- */
442
- getStorage() {
443
- return this.storage;
444
- }
415
+ // Storage methods removed - now handled by base class
445
416
  /**
446
417
  * Set whether to use disk-based index
447
418
  * @param useDiskBasedIndex Whether to use disk-based index