superlocalmemory 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. package/ATTRIBUTION.md +140 -0
  2. package/CHANGELOG.md +1749 -0
  3. package/LICENSE +21 -0
  4. package/README.md +600 -0
  5. package/bin/aider-smart +72 -0
  6. package/bin/slm +202 -0
  7. package/bin/slm-npm +73 -0
  8. package/bin/slm.bat +195 -0
  9. package/bin/slm.cmd +10 -0
  10. package/bin/superlocalmemoryv2:list +3 -0
  11. package/bin/superlocalmemoryv2:profile +3 -0
  12. package/bin/superlocalmemoryv2:recall +3 -0
  13. package/bin/superlocalmemoryv2:remember +3 -0
  14. package/bin/superlocalmemoryv2:reset +3 -0
  15. package/bin/superlocalmemoryv2:status +3 -0
  16. package/completions/slm.bash +58 -0
  17. package/completions/slm.zsh +76 -0
  18. package/configs/antigravity-mcp.json +13 -0
  19. package/configs/chatgpt-desktop-mcp.json +7 -0
  20. package/configs/claude-desktop-mcp.json +15 -0
  21. package/configs/codex-mcp.toml +13 -0
  22. package/configs/cody-commands.json +29 -0
  23. package/configs/continue-mcp.yaml +14 -0
  24. package/configs/continue-skills.yaml +26 -0
  25. package/configs/cursor-mcp.json +15 -0
  26. package/configs/gemini-cli-mcp.json +11 -0
  27. package/configs/jetbrains-mcp.json +11 -0
  28. package/configs/opencode-mcp.json +12 -0
  29. package/configs/perplexity-mcp.json +9 -0
  30. package/configs/vscode-copilot-mcp.json +12 -0
  31. package/configs/windsurf-mcp.json +16 -0
  32. package/configs/zed-mcp.json +12 -0
  33. package/docs/ARCHITECTURE.md +877 -0
  34. package/docs/CLI-COMMANDS-REFERENCE.md +425 -0
  35. package/docs/COMPETITIVE-ANALYSIS.md +210 -0
  36. package/docs/COMPRESSION-README.md +390 -0
  37. package/docs/GRAPH-ENGINE.md +503 -0
  38. package/docs/MCP-MANUAL-SETUP.md +720 -0
  39. package/docs/MCP-TROUBLESHOOTING.md +787 -0
  40. package/docs/PATTERN-LEARNING.md +363 -0
  41. package/docs/PROFILES-GUIDE.md +453 -0
  42. package/docs/RESET-GUIDE.md +353 -0
  43. package/docs/SEARCH-ENGINE-V2.2.0.md +748 -0
  44. package/docs/SEARCH-INTEGRATION-GUIDE.md +502 -0
  45. package/docs/UI-SERVER.md +254 -0
  46. package/docs/UNIVERSAL-INTEGRATION.md +432 -0
  47. package/docs/V2.2.0-OPTIONAL-SEARCH.md +666 -0
  48. package/docs/WINDOWS-INSTALL-README.txt +34 -0
  49. package/docs/WINDOWS-POST-INSTALL.txt +45 -0
  50. package/docs/example_graph_usage.py +148 -0
  51. package/hooks/memory-list-skill.js +130 -0
  52. package/hooks/memory-profile-skill.js +284 -0
  53. package/hooks/memory-recall-skill.js +109 -0
  54. package/hooks/memory-remember-skill.js +127 -0
  55. package/hooks/memory-reset-skill.js +274 -0
  56. package/install-skills.sh +436 -0
  57. package/install.ps1 +417 -0
  58. package/install.sh +755 -0
  59. package/mcp_server.py +585 -0
  60. package/package.json +94 -0
  61. package/requirements-core.txt +24 -0
  62. package/requirements.txt +10 -0
  63. package/scripts/postinstall.js +126 -0
  64. package/scripts/preuninstall.js +57 -0
  65. package/skills/slm-build-graph/SKILL.md +423 -0
  66. package/skills/slm-list-recent/SKILL.md +348 -0
  67. package/skills/slm-recall/SKILL.md +325 -0
  68. package/skills/slm-remember/SKILL.md +194 -0
  69. package/skills/slm-status/SKILL.md +363 -0
  70. package/skills/slm-switch-profile/SKILL.md +442 -0
  71. package/src/__pycache__/cache_manager.cpython-312.pyc +0 -0
  72. package/src/__pycache__/embedding_engine.cpython-312.pyc +0 -0
  73. package/src/__pycache__/graph_engine.cpython-312.pyc +0 -0
  74. package/src/__pycache__/hnsw_index.cpython-312.pyc +0 -0
  75. package/src/__pycache__/hybrid_search.cpython-312.pyc +0 -0
  76. package/src/__pycache__/memory-profiles.cpython-312.pyc +0 -0
  77. package/src/__pycache__/memory-reset.cpython-312.pyc +0 -0
  78. package/src/__pycache__/memory_compression.cpython-312.pyc +0 -0
  79. package/src/__pycache__/memory_store_v2.cpython-312.pyc +0 -0
  80. package/src/__pycache__/migrate_v1_to_v2.cpython-312.pyc +0 -0
  81. package/src/__pycache__/pattern_learner.cpython-312.pyc +0 -0
  82. package/src/__pycache__/query_optimizer.cpython-312.pyc +0 -0
  83. package/src/__pycache__/search_engine_v2.cpython-312.pyc +0 -0
  84. package/src/__pycache__/setup_validator.cpython-312.pyc +0 -0
  85. package/src/__pycache__/tree_manager.cpython-312.pyc +0 -0
  86. package/src/cache_manager.py +520 -0
  87. package/src/embedding_engine.py +671 -0
  88. package/src/graph_engine.py +970 -0
  89. package/src/hnsw_index.py +626 -0
  90. package/src/hybrid_search.py +693 -0
  91. package/src/memory-profiles.py +518 -0
  92. package/src/memory-reset.py +485 -0
  93. package/src/memory_compression.py +999 -0
  94. package/src/memory_store_v2.py +1088 -0
  95. package/src/migrate_v1_to_v2.py +638 -0
  96. package/src/pattern_learner.py +898 -0
  97. package/src/query_optimizer.py +513 -0
  98. package/src/search_engine_v2.py +403 -0
  99. package/src/setup_validator.py +479 -0
  100. package/src/tree_manager.py +720 -0
package/src/hnsw_index.py
@@ -0,0 +1,626 @@
+ #!/usr/bin/env python3
+ """
+ HNSWIndex - Fast Approximate Nearest Neighbor Search for SuperLocalMemory V2
+
+ Copyright (c) 2026 Varun Pratap Bhardwaj
+ Licensed under MIT License
+ Repository: https://github.com/varun369/SuperLocalMemoryV2
+
+ Implements HNSW (Hierarchical Navigable Small World) algorithm for:
+ - Sub-10ms vector similarity search for 10K+ memories
+ - Incremental updates without full rebuild
+ - Disk persistence for instant startup
+ - Graceful fallback to linear search if hnswlib unavailable
+
+ All processing is local - no external APIs.
+
+ LIMITS:
+ - MAX_MEMORIES_FOR_HNSW: 100,000 (prevents memory exhaustion)
+ - MAX_DIMENSION: 5000 (typical: 384 for sentence embeddings)
+ - Performance target: <10ms for 10K memories, <50ms for 100K memories
+ """
+
+ # SECURITY: HNSW index limits to prevent resource exhaustion
+ MAX_MEMORIES_FOR_HNSW = 100_000
+ MAX_DIMENSION = 5000
+ DEFAULT_M = 16  # HNSW parameter: number of connections per layer
+ DEFAULT_EF_CONSTRUCTION = 200  # HNSW parameter: size of dynamic candidate list
+ DEFAULT_EF_SEARCH = 50  # HNSW parameter: search-time candidate list size
+
+ import sqlite3
+ import json
+ import time
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+ from typing import List, Dict, Optional, Tuple, Any
+ import numpy as np
+
+ # Core dependencies for fallback
+ try:
+     from sklearn.metrics.pairwise import cosine_similarity
+     SKLEARN_AVAILABLE = True
+ except ImportError:
+     SKLEARN_AVAILABLE = False
+
+ # Optional HNSW dependency
+ HNSW_AVAILABLE = False
+ try:
+     import hnswlib
+     HNSW_AVAILABLE = True
+ except ImportError:
+     HNSW_AVAILABLE = False
+     # Graceful degradation - will use linear search fallback
+
+ MEMORY_DIR = Path.home() / ".claude-memory"
+ HNSW_INDEX_PATH = MEMORY_DIR / "hnsw_index.bin"
+ HNSW_METADATA_PATH = MEMORY_DIR / "hnsw_metadata.json"
+
+ logger = logging.getLogger(__name__)
+
+
+ class HNSWIndex:
+     """
+     Fast approximate nearest neighbor search using HNSW algorithm.
+
+     Features:
+     - Sub-10ms search for 10K memories
+     - Incremental updates (no full rebuild needed)
+     - Disk persistence with automatic loading
+     - Graceful fallback to linear search if hnswlib unavailable
+
+     Performance:
+     - 10K memories: <10ms search time
+     - 100K memories: <50ms search time
+     - Memory overhead: ~200 bytes per vector (configurable)
+     """
+
+     def __init__(
+         self,
+         dimension: int = 384,
+         max_elements: int = MAX_MEMORIES_FOR_HNSW,
+         m: int = DEFAULT_M,
+         ef_construction: int = DEFAULT_EF_CONSTRUCTION,
+         ef_search: int = DEFAULT_EF_SEARCH,
+         index_path: Optional[Path] = None,
+         metadata_path: Optional[Path] = None
+     ):
+         """
+         Initialize HNSW index.
+
+         Args:
+             dimension: Vector dimension (e.g., 384 for all-MiniLM-L6-v2)
+             max_elements: Maximum number of vectors to index
+             m: HNSW M parameter (connections per layer, typical: 16)
+             ef_construction: HNSW ef_construction (candidate list size, typical: 200)
+             ef_search: HNSW ef_search (search candidate list size, typical: 50)
+             index_path: Custom path for index file
+             metadata_path: Custom path for metadata file
+
+         Raises:
+             ValueError: If parameters exceed security limits
+         """
+         # SECURITY: Input validation
+         if dimension > MAX_DIMENSION:
+             raise ValueError(f"Dimension {dimension} exceeds maximum {MAX_DIMENSION}")
+
+         if max_elements > MAX_MEMORIES_FOR_HNSW:
+             raise ValueError(f"Max elements {max_elements} exceeds limit {MAX_MEMORIES_FOR_HNSW}")
+
+         self.dimension = dimension
+         self.max_elements = max_elements
+         self.m = m
+         self.ef_construction = ef_construction
+         self.ef_search = ef_search
+
+         self.index_path = index_path or HNSW_INDEX_PATH
+         self.metadata_path = metadata_path or HNSW_METADATA_PATH
+
+         # Initialize index and metadata
+         self.index = None
+         self.memory_ids = []  # Maps index position to memory ID
+         self.id_to_idx = {}   # Maps memory ID to index position
+         self.use_hnsw = HNSW_AVAILABLE
+
+         # Fallback: store vectors for linear search
+         self.vectors = None
+
+         # Load existing index if available
+         self._load()
+
+     def _load(self):
+         """Load existing index and metadata from disk."""
+         if not self.use_hnsw:
+             logger.info("HNSW unavailable - will use linear search fallback")
+             return
+
+         if not self.index_path.exists() or not self.metadata_path.exists():
+             logger.info("No existing HNSW index found - will create new index")
+             return
+
+         try:
+             # Load metadata
+             with open(self.metadata_path, 'r') as f:
+                 metadata = json.load(f)
+
+             # Validate metadata
+             if metadata.get('dimension') != self.dimension:
+                 logger.warning(
+                     f"Index dimension mismatch: {metadata.get('dimension')} != {self.dimension}. "
+                     "Will rebuild index."
+                 )
+                 return
+
+             # Load HNSW index
+             self.index = hnswlib.Index(space='cosine', dim=self.dimension)
+             self.index.load_index(str(self.index_path))
+             self.index.set_ef(self.ef_search)
+
+             # Load memory ID mapping
+             self.memory_ids = metadata.get('memory_ids', [])
+             self.id_to_idx = {mem_id: idx for idx, mem_id in enumerate(self.memory_ids)}
+
+             logger.info(f"Loaded HNSW index with {len(self.memory_ids)} vectors")
+
+         except Exception as e:
+             logger.error(f"Failed to load HNSW index: {e}. Will rebuild.")
+             self.index = None
+             self.memory_ids = []
+             self.id_to_idx = {}
+
+     def _save(self):
+         """Save index and metadata to disk."""
+         if not self.use_hnsw or self.index is None:
+             return
+
+         try:
+             # Create directory if needed
+             self.index_path.parent.mkdir(parents=True, exist_ok=True)
+
+             # Save HNSW index
+             self.index.save_index(str(self.index_path))
+
+             # Save metadata
+             metadata = {
+                 'dimension': self.dimension,
+                 'max_elements': self.max_elements,
+                 'm': self.m,
+                 'ef_construction': self.ef_construction,
+                 'ef_search': self.ef_search,
+                 'memory_ids': self.memory_ids,
+                 'created_at': datetime.now().isoformat(),
+                 'version': '2.2.0'
+             }
+
+             with open(self.metadata_path, 'w') as f:
+                 json.dump(metadata, f, indent=2)
+
+             logger.info(f"Saved HNSW index with {len(self.memory_ids)} vectors")
+
+         except Exception as e:
+             logger.error(f"Failed to save HNSW index: {e}")
+
+     def build(self, vectors: np.ndarray, memory_ids: List[int]):
+         """
+         Build HNSW index from vectors.
+
+         Args:
+             vectors: Array of shape (n_memories, dimension)
+             memory_ids: List of memory IDs corresponding to vectors
+
+         Raises:
+             ValueError: If input validation fails
+         """
+         # SECURITY: Input validation
+         if len(vectors) > self.max_elements:
+             raise ValueError(
+                 f"Cannot index {len(vectors)} vectors (max: {self.max_elements}). "
+                 "Use incremental updates or increase max_elements."
+             )
+
+         if vectors.shape[1] != self.dimension:
+             raise ValueError(
+                 f"Vector dimension {vectors.shape[1]} does not match index dimension {self.dimension}"
+             )
+
+         if len(vectors) != len(memory_ids):
+             raise ValueError("Number of vectors must match number of memory IDs")
+
+         # Convert to float32 for efficiency
+         vectors = vectors.astype('float32')
+
+         if self.use_hnsw:
+             # Build HNSW index
+             try:
+                 start_time = time.time()
+
+                 self.index = hnswlib.Index(space='cosine', dim=self.dimension)
+                 self.index.init_index(
+                     max_elements=self.max_elements,
+                     M=self.m,
+                     ef_construction=self.ef_construction,
+                     random_seed=42
+                 )
+                 self.index.set_ef(self.ef_search)
+
+                 # Add vectors in batch
+                 self.index.add_items(vectors, list(range(len(vectors))))
+
+                 self.memory_ids = list(memory_ids)
+                 self.id_to_idx = {mem_id: idx for idx, mem_id in enumerate(memory_ids)}
+
+                 # Save to disk
+                 self._save()
+
+                 elapsed = time.time() - start_time
+                 logger.info(f"Built HNSW index with {len(vectors)} vectors in {elapsed:.2f}s")
+
+             except Exception as e:
+                 logger.error(f"HNSW build failed: {e}. Falling back to linear search.")
+                 self.use_hnsw = False
+                 self._build_fallback(vectors, memory_ids)
+         else:
+             # Fallback: store vectors for linear search
+             self._build_fallback(vectors, memory_ids)
+
+     def _build_fallback(self, vectors: np.ndarray, memory_ids: List[int]):
+         """Build fallback index using linear search."""
+         if not SKLEARN_AVAILABLE:
+             logger.warning("sklearn unavailable - search functionality disabled")
+             return
+
+         self.vectors = vectors.astype('float32')
+         self.memory_ids = list(memory_ids)
+         self.id_to_idx = {mem_id: idx for idx, mem_id in enumerate(memory_ids)}
+         logger.info(f"Built fallback index with {len(vectors)} vectors (linear search)")
+
+     def add(self, vector: np.ndarray, memory_id: int):
+         """
+         Add single vector to index (incremental update).
+
+         Args:
+             vector: Vector of shape (dimension,)
+             memory_id: Memory ID for this vector
+
+         Raises:
+             ValueError: If index is full or vector invalid
+         """
+         # SECURITY: Input validation
+         if len(self.memory_ids) >= self.max_elements:
+             raise ValueError(f"Index is full (max: {self.max_elements})")
+
+         if len(vector) != self.dimension:
+             raise ValueError(f"Vector dimension {len(vector)} does not match {self.dimension}")
+
+         vector = vector.astype('float32').reshape(1, -1)
+
+         if self.use_hnsw and self.index is not None:
+             try:
+                 # Add to HNSW index
+                 idx = len(self.memory_ids)
+                 self.index.add_items(vector, [idx])
+                 self.memory_ids.append(memory_id)
+                 self.id_to_idx[memory_id] = idx
+
+                 # Save updated index
+                 self._save()
+
+             except Exception as e:
+                 logger.error(f"Failed to add vector to HNSW: {e}")
+                 # Continue with best effort - don't crash
+         else:
+             # Fallback: append to vectors array
+             if self.vectors is None:
+                 self.vectors = vector
+             else:
+                 self.vectors = np.vstack([self.vectors, vector])
+
+             idx = len(self.memory_ids)
+             self.memory_ids.append(memory_id)
+             self.id_to_idx[memory_id] = idx
+
+     def search(
+         self,
+         query_vector: np.ndarray,
+         k: int = 5,
+         filter_ids: Optional[List[int]] = None
+     ) -> List[Tuple[int, float]]:
+         """
+         Search for k nearest neighbors.
+
+         Args:
+             query_vector: Query vector of shape (dimension,)
+             k: Number of results to return
+             filter_ids: Optional list of memory IDs to restrict search
+
+         Returns:
+             List of (memory_id, similarity) tuples, sorted by similarity
+             (higher = more similar). Cosine distance is converted to
+             similarity as 1 - distance.
+
+         Performance:
+             - HNSW: <10ms for 10K vectors, <50ms for 100K vectors
+             - Fallback: O(n) linear search, ~100ms for 10K vectors
+         """
+         if len(self.memory_ids) == 0:
+             return []
+
+         # SECURITY: Input validation
+         if len(query_vector) != self.dimension:
+             raise ValueError(f"Query dimension {len(query_vector)} does not match {self.dimension}")
+
+         query_vector = query_vector.astype('float32').reshape(1, -1)
+         k = min(k, len(self.memory_ids))  # Don't request more than available
+
+         if self.use_hnsw and self.index is not None:
+             # HNSW search
+             try:
+                 start_time = time.time()
+
+                 # Get more candidates if filtering is needed
+                 search_k = k * 3 if filter_ids else k
+                 search_k = min(search_k, len(self.memory_ids))
+
+                 labels, distances = self.index.knn_query(query_vector, k=search_k)
+
+                 # Convert to results
+                 results = []
+                 for idx, dist in zip(labels[0], distances[0]):
+                     mem_id = self.memory_ids[idx]
+
+                     # Apply filter if provided
+                     if filter_ids is None or mem_id in filter_ids:
+                         # Convert cosine distance to similarity score (1 - distance)
+                         similarity = 1.0 - dist
+                         results.append((mem_id, float(similarity)))
+
+                     if len(results) >= k:
+                         break
+
+                 elapsed = time.time() - start_time
+                 logger.debug(f"HNSW search took {elapsed*1000:.2f}ms for {len(self.memory_ids)} vectors")
+
+                 return results
+
+             except Exception as e:
+                 logger.error(f"HNSW search failed: {e}. Falling back to linear search.")
+                 # Fall through to fallback
+
+         # Fallback: linear search with sklearn
+         if SKLEARN_AVAILABLE and self.vectors is not None:
+             start_time = time.time()
+
+             # Compute similarities
+             similarities = cosine_similarity(query_vector, self.vectors)[0]
+
+             # Get top k indices
+             if filter_ids:
+                 # Filter first, then sort
+                 filtered_indices = [idx for idx, mem_id in enumerate(self.memory_ids) if mem_id in filter_ids]
+                 if not filtered_indices:
+                     return []
+                 filtered_similarities = similarities[filtered_indices]
+                 top_indices = np.argsort(filtered_similarities)[::-1][:k]
+                 results = [(self.memory_ids[filtered_indices[idx]], float(filtered_similarities[idx]))
+                            for idx in top_indices]
+             else:
+                 # Direct sorting
+                 top_indices = np.argsort(similarities)[::-1][:k]
+                 results = [(self.memory_ids[idx], float(similarities[idx])) for idx in top_indices]
+
+             elapsed = time.time() - start_time
+             logger.debug(f"Linear search took {elapsed*1000:.2f}ms for {len(self.memory_ids)} vectors")
+
+             return results
+
+         logger.warning("No search method available (HNSW and sklearn both unavailable)")
+         return []
+
+     def update(self, memory_id: int, vector: np.ndarray):
+         """
+         Update vector for existing memory.
+
+         Note: HNSW doesn't support efficient in-place updates. This method
+         logs a staleness warning and suggests a periodic rebuild; only the
+         linear-search fallback vectors are updated in place.
+
+         Args:
+             memory_id: Memory ID to update
+             vector: New vector of shape (dimension,)
+         """
+         if memory_id not in self.id_to_idx:
+             logger.warning(f"Memory ID {memory_id} not in index - adding as new")
+             self.add(vector, memory_id)
+             return
+
+         # HNSW doesn't support efficient updates - best practice is periodic rebuild
+         logger.warning(
+             f"Updated memory {memory_id} - HNSW index is now stale. "
+             "Consider calling rebuild() periodically for optimal performance."
+         )
+
+         # Update fallback index if available
+         if self.vectors is not None:
+             idx = self.id_to_idx[memory_id]
+             self.vectors[idx] = vector.astype('float32')
+
+     def delete(self, memory_id: int):
+         """
+         Delete memory from index.
+
+         Note: HNSW doesn't support efficient deletion. This removes the
+         ID mapping only (soft delete); the vector stays in the index and
+         can still surface in search results until a rebuild.
+
+         Args:
+             memory_id: Memory ID to delete
+         """
+         if memory_id not in self.id_to_idx:
+             logger.warning(f"Memory ID {memory_id} not in index")
+             return
+
+         # Soft delete: remove from mapping only.
+         # Physical removal requires a full rebuild.
+         del self.id_to_idx[memory_id]
+
+         logger.info(
+             f"Soft-deleted memory {memory_id} from index. "
+             "Call rebuild() to physically remove."
+         )
+
+     def rebuild_from_db(self, db_path: Path, embedding_column: str = 'embedding'):
+         """
+         Rebuild index from database.
+
+         Args:
+             db_path: Path to SQLite database
+             embedding_column: Name of column containing embeddings (JSON array)
+         """
+         conn = sqlite3.connect(db_path)
+         cursor = conn.cursor()
+
+         try:
+             # Check if embedding column exists
+             cursor.execute("PRAGMA table_info(memories)")
+             columns = {row[1] for row in cursor.fetchall()}
+
+             if embedding_column not in columns:
+                 logger.warning(f"Column '{embedding_column}' not found in database")
+                 return
+
+             # Load embeddings
+             cursor.execute(f'SELECT id, {embedding_column} FROM memories WHERE {embedding_column} IS NOT NULL')
+             rows = cursor.fetchall()
+
+             if not rows:
+                 logger.info("No embeddings found in database")
+                 return
+
+             # Parse embeddings
+             memory_ids = []
+             vectors = []
+
+             for mem_id, embedding_json in rows:
+                 try:
+                     embedding = json.loads(embedding_json)
+                     memory_ids.append(mem_id)
+                     vectors.append(embedding)
+                 except json.JSONDecodeError:
+                     logger.warning(f"Invalid embedding JSON for memory {mem_id}")
+
+             if not vectors:
+                 logger.info("No valid embeddings to index")
+                 return
+
+             vectors = np.array(vectors, dtype='float32')
+
+             # Build index
+             self.build(vectors, memory_ids)
+             logger.info(f"Rebuilt HNSW index from database with {len(memory_ids)} vectors")
+
+         finally:
+             # Single close in finally covers every exit path above
+             conn.close()
+
+     def get_stats(self) -> Dict[str, Any]:
+         """
+         Get index statistics.
+
+         Returns:
+             Dictionary with index stats
+         """
+         return {
+             'hnsw_available': HNSW_AVAILABLE,
+             'use_hnsw': self.use_hnsw,
+             'sklearn_available': SKLEARN_AVAILABLE,
+             'dimension': self.dimension,
+             'max_elements': self.max_elements,
+             'indexed_count': len(self.memory_ids),
+             'capacity_used_pct': (len(self.memory_ids) / self.max_elements * 100) if self.max_elements > 0 else 0,
+             'm': self.m,
+             'ef_construction': self.ef_construction,
+             'ef_search': self.ef_search,
+             'index_exists': self.index is not None,
+             'fallback_active': self.vectors is not None
+         }
+
+
+ # CLI interface for testing
+ if __name__ == "__main__":
+     import sys
+
+     # Configure logging
+     logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+
+     if len(sys.argv) < 2:
+         print("HNSWIndex CLI - Fast Approximate Nearest Neighbor Search")
+         print("\nCommands:")
+         print("  python hnsw_index.py stats    # Show index statistics")
+         print("  python hnsw_index.py rebuild  # Rebuild from database")
+         print("  python hnsw_index.py test     # Run performance test")
+         sys.exit(0)
+
+     command = sys.argv[1]
+
+     if command == "stats":
+         index = HNSWIndex()
+         stats = index.get_stats()
+         print(json.dumps(stats, indent=2))
+
+     elif command == "rebuild":
+         db_path = MEMORY_DIR / "memory.db"
+         if not db_path.exists():
+             print(f"Database not found at {db_path}")
+             sys.exit(1)
+
+         print("Rebuilding HNSW index from database...")
+         index = HNSWIndex()
+         index.rebuild_from_db(db_path)
+         print("Rebuild complete!")
+         print(json.dumps(index.get_stats(), indent=2))
+
+     elif command == "test":
+         print("Running HNSW performance test...")
+         # NOTE: build() persists to the default paths, so this test
+         # overwrites any existing index files in ~/.claude-memory.
+
+         # Generate random test data
+         n_vectors = 10000
+         dimension = 384
+
+         print(f"Generating {n_vectors} random {dimension}-dim vectors...")
+         vectors = np.random.randn(n_vectors, dimension).astype('float32')
+         memory_ids = list(range(n_vectors))
+
+         # Build index
+         print("Building index...")
+         index = HNSWIndex(dimension=dimension)
+         start = time.time()
+         index.build(vectors, memory_ids)
+         build_time = time.time() - start
+         print(f"Build time: {build_time:.2f}s ({n_vectors/build_time:.0f} vectors/sec)")
+
+         # Test search performance
+         print("\nTesting search performance...")
+         query = np.random.randn(dimension).astype('float32')
+
+         # Warm-up
+         for _ in range(10):
+             index.search(query, k=5)
+
+         # Benchmark
+         n_queries = 100
+         start = time.time()
+         for _ in range(n_queries):
+             results = index.search(query, k=5)
+         search_time = (time.time() - start) / n_queries
+
+         print(f"Average search time: {search_time*1000:.2f}ms")
+         print(f"Queries per second: {1/search_time:.0f}")
+         print(f"\nSample results: {results[:3]}")
+
+         # Print stats
+         print("\nIndex statistics:")
+         print(json.dumps(index.get_stats(), indent=2))
+
+     else:
+         print(f"Unknown command: {command}")
+         print("Run without arguments to see available commands.")