superlocalmemory 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. package/ATTRIBUTION.md +140 -0
  2. package/CHANGELOG.md +1749 -0
  3. package/LICENSE +21 -0
  4. package/README.md +600 -0
  5. package/bin/aider-smart +72 -0
  6. package/bin/slm +202 -0
  7. package/bin/slm-npm +73 -0
  8. package/bin/slm.bat +195 -0
  9. package/bin/slm.cmd +10 -0
  10. package/bin/superlocalmemoryv2:list +3 -0
  11. package/bin/superlocalmemoryv2:profile +3 -0
  12. package/bin/superlocalmemoryv2:recall +3 -0
  13. package/bin/superlocalmemoryv2:remember +3 -0
  14. package/bin/superlocalmemoryv2:reset +3 -0
  15. package/bin/superlocalmemoryv2:status +3 -0
  16. package/completions/slm.bash +58 -0
  17. package/completions/slm.zsh +76 -0
  18. package/configs/antigravity-mcp.json +13 -0
  19. package/configs/chatgpt-desktop-mcp.json +7 -0
  20. package/configs/claude-desktop-mcp.json +15 -0
  21. package/configs/codex-mcp.toml +13 -0
  22. package/configs/cody-commands.json +29 -0
  23. package/configs/continue-mcp.yaml +14 -0
  24. package/configs/continue-skills.yaml +26 -0
  25. package/configs/cursor-mcp.json +15 -0
  26. package/configs/gemini-cli-mcp.json +11 -0
  27. package/configs/jetbrains-mcp.json +11 -0
  28. package/configs/opencode-mcp.json +12 -0
  29. package/configs/perplexity-mcp.json +9 -0
  30. package/configs/vscode-copilot-mcp.json +12 -0
  31. package/configs/windsurf-mcp.json +16 -0
  32. package/configs/zed-mcp.json +12 -0
  33. package/docs/ARCHITECTURE.md +877 -0
  34. package/docs/CLI-COMMANDS-REFERENCE.md +425 -0
  35. package/docs/COMPETITIVE-ANALYSIS.md +210 -0
  36. package/docs/COMPRESSION-README.md +390 -0
  37. package/docs/GRAPH-ENGINE.md +503 -0
  38. package/docs/MCP-MANUAL-SETUP.md +720 -0
  39. package/docs/MCP-TROUBLESHOOTING.md +787 -0
  40. package/docs/PATTERN-LEARNING.md +363 -0
  41. package/docs/PROFILES-GUIDE.md +453 -0
  42. package/docs/RESET-GUIDE.md +353 -0
  43. package/docs/SEARCH-ENGINE-V2.2.0.md +748 -0
  44. package/docs/SEARCH-INTEGRATION-GUIDE.md +502 -0
  45. package/docs/UI-SERVER.md +254 -0
  46. package/docs/UNIVERSAL-INTEGRATION.md +432 -0
  47. package/docs/V2.2.0-OPTIONAL-SEARCH.md +666 -0
  48. package/docs/WINDOWS-INSTALL-README.txt +34 -0
  49. package/docs/WINDOWS-POST-INSTALL.txt +45 -0
  50. package/docs/example_graph_usage.py +148 -0
  51. package/hooks/memory-list-skill.js +130 -0
  52. package/hooks/memory-profile-skill.js +284 -0
  53. package/hooks/memory-recall-skill.js +109 -0
  54. package/hooks/memory-remember-skill.js +127 -0
  55. package/hooks/memory-reset-skill.js +274 -0
  56. package/install-skills.sh +436 -0
  57. package/install.ps1 +417 -0
  58. package/install.sh +755 -0
  59. package/mcp_server.py +585 -0
  60. package/package.json +94 -0
  61. package/requirements-core.txt +24 -0
  62. package/requirements.txt +10 -0
  63. package/scripts/postinstall.js +126 -0
  64. package/scripts/preuninstall.js +57 -0
  65. package/skills/slm-build-graph/SKILL.md +423 -0
  66. package/skills/slm-list-recent/SKILL.md +348 -0
  67. package/skills/slm-recall/SKILL.md +325 -0
  68. package/skills/slm-remember/SKILL.md +194 -0
  69. package/skills/slm-status/SKILL.md +363 -0
  70. package/skills/slm-switch-profile/SKILL.md +442 -0
  71. package/src/__pycache__/cache_manager.cpython-312.pyc +0 -0
  72. package/src/__pycache__/embedding_engine.cpython-312.pyc +0 -0
  73. package/src/__pycache__/graph_engine.cpython-312.pyc +0 -0
  74. package/src/__pycache__/hnsw_index.cpython-312.pyc +0 -0
  75. package/src/__pycache__/hybrid_search.cpython-312.pyc +0 -0
  76. package/src/__pycache__/memory-profiles.cpython-312.pyc +0 -0
  77. package/src/__pycache__/memory-reset.cpython-312.pyc +0 -0
  78. package/src/__pycache__/memory_compression.cpython-312.pyc +0 -0
  79. package/src/__pycache__/memory_store_v2.cpython-312.pyc +0 -0
  80. package/src/__pycache__/migrate_v1_to_v2.cpython-312.pyc +0 -0
  81. package/src/__pycache__/pattern_learner.cpython-312.pyc +0 -0
  82. package/src/__pycache__/query_optimizer.cpython-312.pyc +0 -0
  83. package/src/__pycache__/search_engine_v2.cpython-312.pyc +0 -0
  84. package/src/__pycache__/setup_validator.cpython-312.pyc +0 -0
  85. package/src/__pycache__/tree_manager.cpython-312.pyc +0 -0
  86. package/src/cache_manager.py +520 -0
  87. package/src/embedding_engine.py +671 -0
  88. package/src/graph_engine.py +970 -0
  89. package/src/hnsw_index.py +626 -0
  90. package/src/hybrid_search.py +693 -0
  91. package/src/memory-profiles.py +518 -0
  92. package/src/memory-reset.py +485 -0
  93. package/src/memory_compression.py +999 -0
  94. package/src/memory_store_v2.py +1088 -0
  95. package/src/migrate_v1_to_v2.py +638 -0
  96. package/src/pattern_learner.py +898 -0
  97. package/src/query_optimizer.py +513 -0
  98. package/src/search_engine_v2.py +403 -0
  99. package/src/setup_validator.py +479 -0
  100. package/src/tree_manager.py +720 -0
package/src/search_engine_v2.py (new file)
@@ -0,0 +1,403 @@
#!/usr/bin/env python3
"""
SuperLocalMemory V2 - BM25 Search Engine

Copyright (c) 2026 Varun Pratap Bhardwaj
Solution Architect & Original Creator

Licensed under MIT License (see LICENSE file)
Repository: https://github.com/varun369/SuperLocalMemoryV2

ATTRIBUTION REQUIRED: This notice must be preserved in all copies.
"""

"""
BM25 Search Engine - Pure Python Implementation

Implements the Okapi BM25 ranking function for relevance scoring without
external dependencies. BM25 (Best Match 25) is a probabilistic retrieval
function that ranks documents by query term frequency with diminishing
returns and document length normalization.

Algorithm:
    score(D,Q) = Σ IDF(qi) × (f(qi,D) × (k1 + 1)) / (f(qi,D) + k1 × (1 - b + b × |D| / avgdl))

Where:
- f(qi,D) = term frequency of query term qi in document D
- |D| = document length (number of tokens)
- avgdl = average document length in the collection
- k1 = term frequency saturation parameter (default: 1.5)
- b = document length normalization parameter (default: 0.75)
- IDF(qi) = log((N - df(qi) + 0.5) / (df(qi) + 0.5) + 1),
  where N = total documents and df(qi) = document frequency of term qi

Performance target: <30ms for 1K memories

Usage:
    engine = BM25SearchEngine()
    engine.index_documents(docs, doc_ids)
    results = engine.search("query string", limit=10)
"""

import math
import re
import time
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Any


class BM25SearchEngine:
    """
    Pure Python BM25 search engine with no external dependencies.

    BM25 is the industry standard for keyword-based retrieval and outperforms
    simple TF-IDF in most scenarios due to better term saturation handling.
    """

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """
        Initialize BM25 search engine.

        Args:
            k1: Term frequency saturation parameter (1.2-2.0 typical).
                Higher values put more weight on term frequency.
                The default of 1.5 works well for most use cases.
            b: Document length normalization (0.0-1.0).
                0 = no normalization, 1 = full normalization.
                The default of 0.75 balances short vs. long documents.
        """
        self.k1 = k1
        self.b = b

        # Index structures
        self.doc_ids: List[Any] = []      # Document IDs in index order
        self.doc_lengths: List[int] = []  # Token count per document
        self.avg_doc_length: float = 0.0
        self.num_docs: int = 0

        # Inverted index: term -> [(doc_idx, term_freq), ...]
        self.inverted_index: Dict[str, List[Tuple[int, int]]] = defaultdict(list)

        # Document frequency: term -> count of documents containing term
        self.doc_freq: Dict[str, int] = defaultdict(int)

        # Performance tracking
        self.index_time: float = 0.0
        self.last_search_time: float = 0.0

    def _tokenize(self, text: str) -> List[str]:
        """
        Tokenize text into normalized terms.

        Applies:
        - Lowercase normalization
        - ASCII alphanumeric extraction (underscore/hyphen preserved;
          non-ASCII characters are dropped by the regex below)
        - Stopword filtering (minimal set for performance)

        Args:
            text: Input text to tokenize

        Returns:
            List of normalized tokens
        """
        # Lowercase and extract alphanumeric tokens (preserve _ and -)
        tokens = re.findall(r'\b[a-z0-9_-]+\b', text.lower())

        # Minimal stopword list (common English words that add no signal).
        # Kept small for performance - full stopword lists slow down search.
        stopwords = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are',
            'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do',
            'does', 'did', 'will', 'would', 'could', 'should', 'this',
            'that', 'these', 'those', 'it', 'its'
        }

        # Filter stopwords and very short tokens
        tokens = [t for t in tokens if len(t) > 1 and t not in stopwords]

        return tokens
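
    # Example (illustrative): _tokenize("The BM25_score of high-level docs")
    # returns ['bm25_score', 'high-level', 'docs'] - 'the' and 'of' are
    # stopwords, and any single-character token would also be dropped.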

    def index_documents(self, documents: List[str], doc_ids: List[Any]) -> None:
        """
        Build BM25 index from documents.

        Time complexity: O(n × m) where n = num_docs, m = avg tokens per doc
        Space complexity: O(v × d) where v = vocabulary size, d = avg postings per term

        Args:
            documents: List of document texts to index
            doc_ids: List of document identifiers (must match documents length)

        Raises:
            ValueError: If documents and doc_ids lengths mismatch
        """
        if len(documents) != len(doc_ids):
            raise ValueError("documents and doc_ids must have same length")

        start_time = time.time()

        # Reset index
        self.doc_ids = doc_ids
        self.doc_lengths = []
        self.inverted_index = defaultdict(list)
        self.doc_freq = defaultdict(int)
        self.num_docs = len(documents)

        # Build inverted index
        for doc_idx, doc_text in enumerate(documents):
            tokens = self._tokenize(doc_text)
            self.doc_lengths.append(len(tokens))

            # Count term frequencies in this document
            term_freqs = Counter(tokens)

            # Update inverted index and document frequency
            for term, freq in term_freqs.items():
                self.inverted_index[term].append((doc_idx, freq))
                self.doc_freq[term] += 1

        # Calculate average document length
        if self.num_docs > 0:
            self.avg_doc_length = sum(self.doc_lengths) / self.num_docs
        else:
            self.avg_doc_length = 0.0

        self.index_time = time.time() - start_time
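
    # Example (illustrative): after index_documents(["python search",
    # "python code"], ["m1", "m2"]), the index holds
    #   inverted_index == {'python': [(0, 1), (1, 1)],
    #                      'search': [(0, 1)], 'code': [(1, 1)]}
    #   doc_freq == {'python': 2, 'search': 1, 'code': 1}
    # and avg_doc_length == 2.0.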

    def _calculate_idf(self, term: str) -> float:
        """
        Calculate Inverse Document Frequency (IDF) for a term.

        IDF formula: log((N - df + 0.5) / (df + 0.5) + 1)

        Intuition:
        - Rare terms (low df) get high IDF scores
        - Common terms (high df) get low IDF scores
        - Prevents over-weighting common words

        Args:
            term: Query term

        Returns:
            IDF score (higher = more discriminative term)
        """
        df = self.doc_freq.get(term, 0)

        # Okapi BM25 IDF formula with smoothing
        idf = math.log(
            (self.num_docs - df + 0.5) / (df + 0.5) + 1.0
        )

        return idf
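
    # Example (illustrative): with num_docs = 6 and df = 2,
    # idf = ln((6 - 2 + 0.5) / (2 + 0.5) + 1) = ln(2.8) ≈ 1.03.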

    def _calculate_bm25_score(self, doc_idx: int, query_term_freqs: Dict[str, int]) -> float:
        """
        Calculate BM25 score for a document given query term frequencies.

        BM25 formula:
            score(D,Q) = Σ IDF(qi) × (f(qi,D) × (k1 + 1)) / (f(qi,D) + k1 × (1 - b + b × |D| / avgdl))

        Args:
            doc_idx: Document index in corpus
            query_term_freqs: Query term frequencies

        Returns:
            BM25 relevance score
        """
        score = 0.0
        doc_len = self.doc_lengths[doc_idx]

        # Document length normalization factor:
        # shorter-than-average docs are penalized less, longer ones more
        norm_factor = 1 - self.b + self.b * (doc_len / self.avg_doc_length)

        for term, query_freq in query_term_freqs.items():
            if term not in self.inverted_index:
                continue

            # Find term frequency in this document (linear scan of postings)
            term_freq = 0
            for idx, freq in self.inverted_index[term]:
                if idx == doc_idx:
                    term_freq = freq
                    break

            if term_freq == 0:
                continue

            # Calculate IDF weight
            idf = self._calculate_idf(term)

            # BM25 term score with saturation:
            # as term_freq increases, the score has diminishing returns
            numerator = term_freq * (self.k1 + 1)
            denominator = term_freq + self.k1 * norm_factor

            score += idf * (numerator / denominator)

        return score
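
    # Example (illustrative): with b = 0.75, a document twice the average
    # length gets norm_factor = 1 - 0.75 + 0.75 × 2 = 1.75, shrinking its
    # per-term scores; a doc at half the average gets 0.625, boosting them.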

    def search(
        self,
        query: str,
        limit: int = 10,
        score_threshold: float = 0.0
    ) -> List[Tuple[Any, float]]:
        """
        Search indexed documents using BM25 ranking.

        Performance: O(q × p) where q = query terms, p = avg postings per term
        Target: <30ms for 1K documents

        Args:
            query: Search query string
            limit: Maximum number of results to return
            score_threshold: Minimum BM25 score threshold (default: 0.0)

        Returns:
            List of (doc_id, score) tuples, sorted by score descending
        """
        start_time = time.time()

        if self.num_docs == 0:
            self.last_search_time = time.time() - start_time
            return []

        # Tokenize and count query terms
        query_tokens = self._tokenize(query)
        if not query_tokens:
            self.last_search_time = time.time() - start_time
            return []

        query_term_freqs = Counter(query_tokens)

        # Find candidate documents (documents containing at least one query term)
        candidate_docs = set()
        for term in query_term_freqs:
            if term in self.inverted_index:
                for doc_idx, _ in self.inverted_index[term]:
                    candidate_docs.add(doc_idx)

        # Calculate BM25 scores for candidates
        scores = []
        for doc_idx in candidate_docs:
            score = self._calculate_bm25_score(doc_idx, query_term_freqs)

            if score >= score_threshold:
                scores.append((self.doc_ids[doc_idx], score))

        # Sort by score descending and limit results
        scores.sort(key=lambda x: x[1], reverse=True)
        results = scores[:limit]

        self.last_search_time = time.time() - start_time

        return results
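
    # Example (illustrative): against the demo corpus at the bottom of this
    # file, engine.search("python web", limit=2) returns something like
    # [('doc_4', 1.4), ('doc_1', 1.2)] - (doc_id, score) pairs, best first,
    # with doc_4 ranked highest because it matches both query terms.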

    def search_with_details(
        self,
        query: str,
        limit: int = 10,
        score_threshold: float = 0.0
    ) -> Dict[str, Any]:
        """
        Search with detailed performance metrics and match information.

        Useful for debugging and performance analysis.

        Args:
            query: Search query string
            limit: Maximum number of results to return
            score_threshold: Minimum score threshold

        Returns:
            Dictionary with results and metadata
        """
        query_tokens = self._tokenize(query)
        results = self.search(query, limit, score_threshold)

        return {
            'results': results,
            'query_terms': query_tokens,
            'num_results': len(results),
            'search_time_ms': self.last_search_time * 1000,
            'index_size': self.num_docs,
            'avg_doc_length': self.avg_doc_length
        }

    def get_stats(self) -> Dict[str, Any]:
        """
        Get search engine statistics.

        Returns:
            Dictionary with index statistics
        """
        return {
            'num_documents': self.num_docs,
            'vocabulary_size': len(self.inverted_index),
            'avg_doc_length': self.avg_doc_length,
            'total_tokens': sum(self.doc_lengths),
            'index_time_ms': self.index_time * 1000,
            'last_search_time_ms': self.last_search_time * 1000,
            'k1': self.k1,
            'b': self.b
        }


# CLI interface for testing
if __name__ == "__main__":
    # Demo usage
    print("BM25 Search Engine - Demo")
    print("=" * 60)

    # Sample documents
    documents = [
        "Python is a high-level programming language with dynamic typing",
        "JavaScript is widely used for web development and frontend applications",
        "Machine learning uses Python libraries like scikit-learn and TensorFlow",
        "React is a JavaScript framework for building user interfaces",
        "Django is a Python web framework that follows MVC architecture",
        "Neural networks are a key component of deep learning systems",
    ]

    doc_ids = [f"doc_{i}" for i in range(len(documents))]

    # Index documents
    engine = BM25SearchEngine()
    print(f"\nIndexing {len(documents)} documents...")
    engine.index_documents(documents, doc_ids)

    stats = engine.get_stats()
    print(f"✓ Indexed in {stats['index_time_ms']:.2f}ms")
    print(f"  Vocabulary: {stats['vocabulary_size']} unique terms")
    print(f"  Avg doc length: {stats['avg_doc_length']:.1f} tokens")

    # Test queries
    test_queries = [
        "Python programming",
        "web development",
        "machine learning",
        "JavaScript framework"
    ]

    print("\n" + "=" * 60)
    print("Search Results:")
    print("=" * 60)

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        details = engine.search_with_details(query, limit=3)

        print(f"  Found: {details['num_results']} results in {details['search_time_ms']:.2f}ms")
        print(f"  Query terms: {details['query_terms']}")

        for doc_id, score in details['results']:
            doc_idx = doc_ids.index(doc_id)
            print(f"    [{score:.3f}] {doc_id}: {documents[doc_idx][:60]}...")

    print("\n" + "=" * 60)
    print("Performance Summary:")
    # Refresh stats so the timing reflects the searches above rather than
    # the stale value captured at index time
    stats = engine.get_stats()
    print(f"  Last search time: {stats['last_search_time_ms']:.2f}ms")
    print("  Target: <30ms for 1K documents ✓")