superlocalmemory 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ATTRIBUTION.md +140 -0
- package/CHANGELOG.md +1749 -0
- package/LICENSE +21 -0
- package/README.md +600 -0
- package/bin/aider-smart +72 -0
- package/bin/slm +202 -0
- package/bin/slm-npm +73 -0
- package/bin/slm.bat +195 -0
- package/bin/slm.cmd +10 -0
- package/bin/superlocalmemoryv2:list +3 -0
- package/bin/superlocalmemoryv2:profile +3 -0
- package/bin/superlocalmemoryv2:recall +3 -0
- package/bin/superlocalmemoryv2:remember +3 -0
- package/bin/superlocalmemoryv2:reset +3 -0
- package/bin/superlocalmemoryv2:status +3 -0
- package/completions/slm.bash +58 -0
- package/completions/slm.zsh +76 -0
- package/configs/antigravity-mcp.json +13 -0
- package/configs/chatgpt-desktop-mcp.json +7 -0
- package/configs/claude-desktop-mcp.json +15 -0
- package/configs/codex-mcp.toml +13 -0
- package/configs/cody-commands.json +29 -0
- package/configs/continue-mcp.yaml +14 -0
- package/configs/continue-skills.yaml +26 -0
- package/configs/cursor-mcp.json +15 -0
- package/configs/gemini-cli-mcp.json +11 -0
- package/configs/jetbrains-mcp.json +11 -0
- package/configs/opencode-mcp.json +12 -0
- package/configs/perplexity-mcp.json +9 -0
- package/configs/vscode-copilot-mcp.json +12 -0
- package/configs/windsurf-mcp.json +16 -0
- package/configs/zed-mcp.json +12 -0
- package/docs/ARCHITECTURE.md +877 -0
- package/docs/CLI-COMMANDS-REFERENCE.md +425 -0
- package/docs/COMPETITIVE-ANALYSIS.md +210 -0
- package/docs/COMPRESSION-README.md +390 -0
- package/docs/GRAPH-ENGINE.md +503 -0
- package/docs/MCP-MANUAL-SETUP.md +720 -0
- package/docs/MCP-TROUBLESHOOTING.md +787 -0
- package/docs/PATTERN-LEARNING.md +363 -0
- package/docs/PROFILES-GUIDE.md +453 -0
- package/docs/RESET-GUIDE.md +353 -0
- package/docs/SEARCH-ENGINE-V2.2.0.md +748 -0
- package/docs/SEARCH-INTEGRATION-GUIDE.md +502 -0
- package/docs/UI-SERVER.md +254 -0
- package/docs/UNIVERSAL-INTEGRATION.md +432 -0
- package/docs/V2.2.0-OPTIONAL-SEARCH.md +666 -0
- package/docs/WINDOWS-INSTALL-README.txt +34 -0
- package/docs/WINDOWS-POST-INSTALL.txt +45 -0
- package/docs/example_graph_usage.py +148 -0
- package/hooks/memory-list-skill.js +130 -0
- package/hooks/memory-profile-skill.js +284 -0
- package/hooks/memory-recall-skill.js +109 -0
- package/hooks/memory-remember-skill.js +127 -0
- package/hooks/memory-reset-skill.js +274 -0
- package/install-skills.sh +436 -0
- package/install.ps1 +417 -0
- package/install.sh +755 -0
- package/mcp_server.py +585 -0
- package/package.json +94 -0
- package/requirements-core.txt +24 -0
- package/requirements.txt +10 -0
- package/scripts/postinstall.js +126 -0
- package/scripts/preuninstall.js +57 -0
- package/skills/slm-build-graph/SKILL.md +423 -0
- package/skills/slm-list-recent/SKILL.md +348 -0
- package/skills/slm-recall/SKILL.md +325 -0
- package/skills/slm-remember/SKILL.md +194 -0
- package/skills/slm-status/SKILL.md +363 -0
- package/skills/slm-switch-profile/SKILL.md +442 -0
- package/src/__pycache__/cache_manager.cpython-312.pyc +0 -0
- package/src/__pycache__/embedding_engine.cpython-312.pyc +0 -0
- package/src/__pycache__/graph_engine.cpython-312.pyc +0 -0
- package/src/__pycache__/hnsw_index.cpython-312.pyc +0 -0
- package/src/__pycache__/hybrid_search.cpython-312.pyc +0 -0
- package/src/__pycache__/memory-profiles.cpython-312.pyc +0 -0
- package/src/__pycache__/memory-reset.cpython-312.pyc +0 -0
- package/src/__pycache__/memory_compression.cpython-312.pyc +0 -0
- package/src/__pycache__/memory_store_v2.cpython-312.pyc +0 -0
- package/src/__pycache__/migrate_v1_to_v2.cpython-312.pyc +0 -0
- package/src/__pycache__/pattern_learner.cpython-312.pyc +0 -0
- package/src/__pycache__/query_optimizer.cpython-312.pyc +0 -0
- package/src/__pycache__/search_engine_v2.cpython-312.pyc +0 -0
- package/src/__pycache__/setup_validator.cpython-312.pyc +0 -0
- package/src/__pycache__/tree_manager.cpython-312.pyc +0 -0
- package/src/cache_manager.py +520 -0
- package/src/embedding_engine.py +671 -0
- package/src/graph_engine.py +970 -0
- package/src/hnsw_index.py +626 -0
- package/src/hybrid_search.py +693 -0
- package/src/memory-profiles.py +518 -0
- package/src/memory-reset.py +485 -0
- package/src/memory_compression.py +999 -0
- package/src/memory_store_v2.py +1088 -0
- package/src/migrate_v1_to_v2.py +638 -0
- package/src/pattern_learner.py +898 -0
- package/src/query_optimizer.py +513 -0
- package/src/search_engine_v2.py +403 -0
- package/src/setup_validator.py +479 -0
- package/src/tree_manager.py +720 -0
|
@@ -0,0 +1,693 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
SuperLocalMemory V2 - Hybrid Search System
|
|
4
|
+
|
|
5
|
+
Copyright (c) 2026 Varun Pratap Bhardwaj
|
|
6
|
+
Solution Architect & Original Creator
|
|
7
|
+
|
|
8
|
+
Licensed under MIT License (see LICENSE file)
|
|
9
|
+
Repository: https://github.com/varun369/SuperLocalMemoryV2
|
|
10
|
+
|
|
11
|
+
ATTRIBUTION REQUIRED: This notice must be preserved in all copies.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
"""
|
|
15
|
+
Hybrid Search System - Multi-Method Retrieval Fusion
|
|
16
|
+
|
|
17
|
+
Combines multiple search methods for optimal retrieval quality:
|
|
18
|
+
|
|
19
|
+
1. BM25 Keyword Search: Lexical matching with relevance ranking
|
|
20
|
+
- Fast exact term matching
|
|
21
|
+
- Good for technical queries with specific terms
|
|
22
|
+
- Weight: 0.4 (40%)
|
|
23
|
+
|
|
24
|
+
2. Graph-Based Traversal: Relationship-aware search
|
|
25
|
+
- Finds related memories via knowledge graph
|
|
26
|
+
- Good for conceptual/thematic queries
|
|
27
|
+
- Weight: 0.3 (30%)
|
|
28
|
+
|
|
29
|
+
3. TF-IDF Semantic Search: Distributional similarity
|
|
30
|
+
- Captures semantic relationships
|
|
31
|
+
- Good for natural language queries
|
|
32
|
+
- Weight: 0.3 (30%)
|
|
33
|
+
|
|
34
|
+
4. Optional Embedding Search: Dense vector similarity
|
|
35
|
+
- Best semantic understanding (if available)
|
|
36
|
+
- Requires sentence-transformers
|
|
37
|
+
- Can replace or augment TF-IDF
|
|
38
|
+
|
|
39
|
+
Fusion Methods:
|
|
40
|
+
- Reciprocal Rank Fusion (RRF): Rank-based combination
|
|
41
|
+
- Weighted Score Fusion: Normalized score combination
|
|
42
|
+
- Hybrid: Adaptive based on query characteristics
|
|
43
|
+
|
|
44
|
+
Performance Target: <50ms for 1K memories (hybrid mode)
|
|
45
|
+
|
|
46
|
+
Usage:
|
|
47
|
+
hybrid = HybridSearchEngine(memory_store, bm25_engine, graph_engine)
|
|
48
|
+
results = hybrid.search(
|
|
49
|
+
query="authentication bug",
|
|
50
|
+
method="weighted",
|
|
51
|
+
weights={'bm25': 0.4, 'graph': 0.3, 'semantic': 0.3}
|
|
52
|
+
)
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
import time
|
|
56
|
+
import math
|
|
57
|
+
from collections import defaultdict
|
|
58
|
+
from typing import List, Dict, Tuple, Optional, Any, Set
|
|
59
|
+
from pathlib import Path
|
|
60
|
+
import sqlite3
|
|
61
|
+
|
|
62
|
+
# Import local modules
|
|
63
|
+
from search_engine_v2 import BM25SearchEngine
|
|
64
|
+
from query_optimizer import QueryOptimizer
|
|
65
|
+
from cache_manager import CacheManager
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class HybridSearchEngine:
    """
    Hybrid search combining BM25, graph traversal, and semantic search.

    Provides flexible retrieval strategies based on query type and
    available resources: BM25 keyword matching is always built from the
    database; TF-IDF semantic search is enabled only when scikit-learn is
    importable; graph traversal is enabled only when the optional
    graph_engine module is importable. Results from multiple methods can
    be fused via weighted score fusion or reciprocal rank fusion.
    """

+
def __init__(
    self,
    db_path: Path,
    bm25_engine: Optional[BM25SearchEngine] = None,
    query_optimizer: Optional[QueryOptimizer] = None,
    cache_manager: Optional[CacheManager] = None,
    enable_cache: bool = True
):
    """
    Set up the hybrid search engine and build its indexes.

    Args:
        db_path: Path to the memory SQLite database.
        bm25_engine: Pre-configured BM25 engine; a fresh one is created
            when omitted.
        query_optimizer: Query optimizer instance; a fresh one is created
            when omitted.
        cache_manager: Cache manager instance. NOTE(review): no cache is
            created when this is None -- caching is active only when an
            instance is supplied AND enable_cache is True.
        enable_cache: When False, caching is disabled even if a
            cache_manager was supplied.
    """
    self.db_path = db_path

    # Core components; fall back to defaults when not injected.
    self.bm25 = bm25_engine or BM25SearchEngine()
    self.optimizer = query_optimizer or QueryOptimizer()
    self.cache = cache_manager if enable_cache else None

    # Graph engine is imported lazily (see _load_graph_engine) to avoid
    # circular dependencies.
    self._graph_engine = None

    # Optional TF-IDF state; populated by _load_index when sklearn exists.
    self._tfidf_vectorizer = None
    self._tfidf_vectors = None
    self._memory_ids = []

    # Duration of the most recent search / fusion step, in seconds.
    self.last_search_time = 0.0
    self.last_fusion_time = 0.0

    # Build BM25 (and, when possible, TF-IDF) indexes from the database.
    self._load_index()
def _load_index(self):
    """
    Load all memories from the database and build the search indexes.

    Builds the BM25 index over content + summary + tags, seeds the query
    optimizer's vocabulary and co-occurrence matrix, and (when sklearn is
    installed) fits a TF-IDF vectorizer for semantic search.
    """
    import json

    # Fetch all memories. The connection is closed in a finally block so
    # it is not leaked when the query raises (the original leaked it).
    conn = sqlite3.connect(self.db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT id, content, summary, tags
            FROM memories
            ORDER BY id
        ''')
        rows = cursor.fetchall()
    finally:
        conn.close()

    if not rows:
        return

    # Build the BM25 corpus: content + optional summary + optional tags.
    doc_ids = [row[0] for row in rows]
    documents = []
    vocabulary = set()

    for row in rows:
        text_parts = [row[1]]  # content

        if row[2]:  # summary
            text_parts.append(row[2])

        if row[3]:  # tags, stored as a JSON array
            try:
                tags = json.loads(row[3])
                text_parts.extend(tags)
            except (ValueError, TypeError):
                # Malformed tags are skipped rather than failing the index
                # (narrowed from the original bare except).
                pass

        doc_text = ' '.join(text_parts)
        documents.append(doc_text)

        # Collect tokens for spell correction.
        vocabulary.update(self.bm25._tokenize(doc_text))

    # Index with BM25.
    self.bm25.index_documents(documents, doc_ids)
    self._memory_ids = doc_ids

    # Seed the optimizer: vocabulary + co-occurrence for query expansion.
    self.optimizer.vocabulary = vocabulary
    tokenized_docs = [self.bm25._tokenize(doc) for doc in documents]
    self.optimizer.build_cooccurrence_matrix(tokenized_docs)

    # Optional TF-IDF semantic index (skipped when sklearn is missing).
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer

        self._tfidf_vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2)
        )
        self._tfidf_vectors = self._tfidf_vectorizer.fit_transform(documents)
    except ImportError:
        # sklearn not available - semantic search stays disabled.
        pass
def _load_graph_engine(self):
    """Return the graph engine, creating it on first use; None when unavailable."""
    if self._graph_engine is not None:
        return self._graph_engine
    try:
        # Imported here (not at module top) to avoid circular imports.
        from graph_engine import GraphEngine
        self._graph_engine = GraphEngine(self.db_path)
    except ImportError:
        # graph_engine is optional; callers treat None as "no graph".
        pass
    return self._graph_engine
def search_bm25(
    self,
    query: str,
    limit: int = 10,
    score_threshold: float = 0.0
) -> List[Tuple[int, float]]:
    """
    Keyword search via BM25 over the indexed memories.

    The query is first run through the optimizer with spell correction
    enabled and expansion disabled (expansion can hurt precision).

    Args:
        query: Raw search query.
        limit: Maximum number of hits.
        score_threshold: Minimum BM25 score to include.

    Returns:
        List of (memory_id, score) tuples.
    """
    cleaned_query = self.optimizer.optimize(
        query,
        enable_spell_correction=True,
        enable_expansion=False  # Expansion can hurt precision
    )
    return self.bm25.search(cleaned_query, limit, score_threshold)
def search_semantic(
    self,
    query: str,
    limit: int = 10,
    score_threshold: float = 0.05
) -> List[Tuple[int, float]]:
    """
    Semantic search via TF-IDF cosine similarity.

    Returns an empty list when the TF-IDF index was never built (sklearn
    missing or no documents) or when scoring fails for any reason --
    semantic search is strictly best-effort.

    Args:
        query: Search query.
        limit: Maximum number of hits.
        score_threshold: Minimum cosine similarity to include.

    Returns:
        List of (memory_id, score) tuples, best first.
    """
    if self._tfidf_vectorizer is None or self._tfidf_vectors is None:
        return []

    try:
        # Local import: sklearn is an optional dependency. (The original
        # also imported numpy here but never used it -- removed.)
        from sklearn.metrics.pairwise import cosine_similarity

        # Score the query against every indexed document.
        query_vec = self._tfidf_vectorizer.transform([query])
        similarities = cosine_similarity(query_vec, self._tfidf_vectors).flatten()

        # Keep hits above the threshold, best first.
        results = [
            (self._memory_ids[idx], float(score))
            for idx, score in enumerate(similarities)
            if score >= score_threshold
        ]
        results.sort(key=lambda item: item[1], reverse=True)
        return results[:limit]

    except Exception:
        # Deliberate best-effort fallback: any scoring failure degrades to
        # "no semantic results" rather than breaking hybrid search.
        return []
def search_graph(
    self,
    query: str,
    limit: int = 10,
    max_depth: int = 2
) -> List[Tuple[int, float]]:
    """
    Search using graph traversal from initial BM25 matches.

    Strategy:
    1. Get seed memories from BM25.
    2. BFS the knowledge graph to find related memories.
    3. Score by edge similarity, decayed by traversal depth.

    Args:
        query: Search query.
        limit: Maximum results.
        max_depth: Maximum graph traversal depth.

    Returns:
        List of (memory_id, score) tuples.
    """
    from collections import deque

    graph = self._load_graph_engine()
    if graph is None:
        return []

    # Seed the traversal with top BM25 hits.
    seed_results = self.search_bm25(query, limit=5)
    if not seed_results:
        return []

    seed_ids = [mem_id for mem_id, _ in seed_results]

    visited = set(seed_ids)
    results = []
    result_ids = set()  # O(1) membership (original rescanned results: O(n^2))

    # BFS queue of (id, score, depth); deque gives O(1) pops
    # (original used list.pop(0), which is O(n)).
    queue = deque((mem_id, 1.0, 0) for mem_id in seed_ids)

    while queue and len(results) < limit:
        current_id, current_score, depth = queue.popleft()

        if depth > max_depth:
            continue

        if current_id not in result_ids:
            result_ids.add(current_id)
            results.append((current_id, current_score))

        # Expand neighbors; a failing graph lookup skips this node only.
        try:
            related = graph.get_related_memories(current_id, limit=5)
        except Exception:
            continue

        for rel_id, similarity in related:
            if rel_id not in visited:
                visited.add(rel_id)
                # Decay score by depth (0.7 per level) times edge similarity.
                new_score = current_score * similarity * (0.7 ** depth)
                queue.append((rel_id, new_score, depth + 1))

    return results[:limit]
def _normalize_scores(
    self,
    results: List[Tuple[int, float]]
) -> List[Tuple[int, float]]:
    """
    Min-max normalize result scores into the [0, 1] range.

    When every score is identical (including single-result input) all
    entries map to 1.0 to avoid division by zero.

    Args:
        results: List of (id, score) tuples.

    Returns:
        Same ids with normalized scores, original order preserved.
    """
    if not results:
        return []

    lowest = min(score for _, score in results)
    highest = max(score for _, score in results)
    spread = highest - lowest

    if spread == 0:
        # Degenerate case: all scores equal -> uniform 1.0.
        return [(mem_id, 1.0) for mem_id, _ in results]

    return [(mem_id, (score - lowest) / spread) for mem_id, score in results]
def _reciprocal_rank_fusion(
    self,
    results_list: List[List[Tuple[int, float]]],
    k: int = 60
) -> List[Tuple[int, float]]:
    """
    Fuse multiple ranked lists with Reciprocal Rank Fusion.

    RRF score of a document: sum over methods of 1 / (k + rank). Because
    only ranks matter, RRF is robust to differing score scales between
    methods.

    Args:
        results_list: One ranked (id, score) list per search method.
        k: RRF smoothing constant (60 is the standard value).

    Returns:
        (memory_id, rrf_score) pairs sorted by descending score.
    """
    rrf_scores = defaultdict(float)

    for ranked in results_list:
        rank = 1  # ranks are 1-based
        for mem_id, _ in ranked:
            rrf_scores[mem_id] += 1.0 / (k + rank)
            rank += 1

    return sorted(rrf_scores.items(), key=lambda pair: pair[1], reverse=True)
def _weighted_fusion(
    self,
    results_dict: Dict[str, List[Tuple[int, float]]],
    weights: Dict[str, float]
) -> List[Tuple[int, float]]:
    """
    Fuse per-method results by weighted, normalized score combination.

    Each method's scores are min-max normalized first. A document's final
    score is its weighted sum divided by the total weight of the methods
    that actually returned it, so documents are not penalized for being
    absent from some methods.

    Args:
        results_dict: Method name -> (id, score) result list.
        weights: Method name -> fusion weight (missing names count as 0).

    Returns:
        (memory_id, fused_score) pairs sorted by descending score.
    """
    weighted_total = defaultdict(float)  # sum of weight * normalized score
    weight_seen = defaultdict(float)     # total weight of methods returning the doc

    for method_name, method_results in results_dict.items():
        method_weight = weights.get(method_name, 0.0)
        for mem_id, norm_score in self._normalize_scores(method_results):
            weighted_total[mem_id] += method_weight * norm_score
            weight_seen[mem_id] += method_weight

    fused = [
        (mem_id, total / weight_seen[mem_id] if weight_seen[mem_id] > 0 else 0)
        for mem_id, total in weighted_total.items()
    ]
    fused.sort(key=lambda pair: pair[1], reverse=True)

    return fused
def search(
    self,
    query: str,
    limit: int = 10,
    method: str = "hybrid",
    weights: Optional[Dict[str, float]] = None,
    use_cache: bool = True
) -> List[Dict[str, Any]]:
    """
    Run a search using one retrieval method or a fusion of several.

    Args:
        query: Search query.
        limit: Maximum number of results.
        method: "bm25", "semantic" or "graph" for a single method; "rrf"
            for reciprocal rank fusion; anything else ("hybrid",
            "weighted", ...) uses weighted score fusion.
        weights: Per-method fusion weights; defaults to bm25=0.4,
            semantic=0.3, graph=0.3. Methods with weight 0 are skipped.
        use_cache: Serve/store results through the cache when enabled.

    Returns:
        Memory dictionaries with full details plus score/match info.
    """
    started = time.time()

    # Serve straight from cache when possible.
    if use_cache and self.cache:
        cached = self.cache.get(query, limit=limit, method=method)
        if cached is not None:
            self.last_search_time = time.time() - started
            return cached

    if weights is None:
        weights = {'bm25': 0.4, 'semantic': 0.3, 'graph': 0.3}

    # Single-method dispatch; unmatched names fall through to fusion.
    single_method = {
        "bm25": self.search_bm25,
        "semantic": self.search_semantic,
        "graph": self.search_graph,
    }.get(method)

    if single_method is not None:
        raw_results = single_method(query, limit)
    else:
        fusion_started = time.time()

        # Collect candidates (2x limit each) from every weighted method.
        results_dict = {}
        if weights.get('bm25', 0) > 0:
            results_dict['bm25'] = self.search_bm25(query, limit=limit * 2)
        if weights.get('semantic', 0) > 0:
            results_dict['semantic'] = self.search_semantic(query, limit=limit * 2)
        if weights.get('graph', 0) > 0:
            results_dict['graph'] = self.search_graph(query, limit=limit * 2)

        if method == "rrf":
            raw_results = self._reciprocal_rank_fusion(list(results_dict.values()))
        else:  # weighted or hybrid
            raw_results = self._weighted_fusion(results_dict, weights)

        self.last_fusion_time = time.time() - fusion_started

    # Truncate, then hydrate ids into full memory records.
    raw_results = raw_results[:limit]
    results = self._fetch_memory_details(raw_results, query)

    if use_cache and self.cache:
        self.cache.put(query, results, limit=limit, method=method)

    self.last_search_time = time.time() - started

    return results
def _fetch_memory_details(
    self,
    raw_results: List[Tuple[int, float]],
    query: str
) -> List[Dict[str, Any]]:
    """
    Hydrate (memory_id, score) pairs into full memory dictionaries.

    Args:
        raw_results: List of (memory_id, score) tuples from search/fusion.
        query: Original query (kept for caller parity; unused here).

    Returns:
        Memory dictionaries (with 'score' and 'match_type') sorted by
        descending score. Ids missing from the database are dropped.
    """
    import json  # hoisted: the original re-imported json per result row

    if not raw_results:
        return []

    memory_ids = [mem_id for mem_id, _ in raw_results]
    id_to_score = dict(raw_results)

    # Close the connection even when the query raises
    # (the original leaked it on error).
    conn = sqlite3.connect(self.db_path)
    try:
        cursor = conn.cursor()
        placeholders = ','.join(['?'] * len(memory_ids))
        cursor.execute(f'''
            SELECT id, content, summary, project_path, project_name, tags,
                   category, parent_id, tree_path, depth, memory_type,
                   importance, created_at, cluster_id, last_accessed, access_count
            FROM memories
            WHERE id IN ({placeholders})
        ''', memory_ids)
        rows = cursor.fetchall()
    finally:
        conn.close()

    results = []
    for row in rows:
        mem_id = row[0]
        results.append({
            'id': mem_id,
            'content': row[1],
            'summary': row[2],
            'project_path': row[3],
            'project_name': row[4],
            'tags': json.loads(row[5]) if row[5] else [],
            'category': row[6],
            'parent_id': row[7],
            'tree_path': row[8],
            'depth': row[9],
            'memory_type': row[10],
            'importance': row[11],
            'created_at': row[12],
            'cluster_id': row[13],
            'last_accessed': row[14],
            'access_count': row[15],
            'score': id_to_score.get(mem_id, 0.0),
            'match_type': 'hybrid'
        })

    # SQL IN gives no ordering guarantee, so restore fused-score order.
    results.sort(key=lambda entry: entry['score'], reverse=True)

    return results
def get_stats(self) -> Dict[str, Any]:
    """
    Collect performance and availability statistics.

    Returns:
        Dict with BM25 and optimizer stats, the last search/fusion
        durations in milliseconds, availability flags for TF-IDF and the
        graph engine, and cache stats when a cache is active.
    """
    stats = {
        'bm25': self.bm25.get_stats(),
        'optimizer': self.optimizer.get_stats(),
        'last_search_time_ms': self.last_search_time * 1000,
        'last_fusion_time_ms': self.last_fusion_time * 1000,
        'tfidf_available': self._tfidf_vectorizer is not None,
        'graph_available': self._graph_engine is not None,
    }

    # Cache stats only when caching is enabled.
    if self.cache:
        stats['cache'] = self.cache.get_stats()

    return stats
|
624
|
+
|
|
625
|
+
# CLI interface for testing: builds the engine against the user's live
# memory database and runs a demo query with bm25 and hybrid methods.
if __name__ == "__main__":
    import sys
    from pathlib import Path

    print("Hybrid Search Engine - Demo")
    print("=" * 60)

    # Use test database or default
    # NOTE(review): hard-coded default location under the user's home dir.
    db_path = Path.home() / ".claude-memory" / "memory.db"

    if not db_path.exists():
        print(f"Error: Database not found at {db_path}")
        print("Please run memory_store_v2.py to create database first.")
        sys.exit(1)

    # Initialize hybrid search
    print(f"\nInitializing hybrid search engine...")
    print(f"Database: {db_path}")

    hybrid = HybridSearchEngine(db_path, enable_cache=True)

    stats = hybrid.get_stats()
    # NOTE(review): the 'ā' glyphs below look like mojibake (likely once
    # check/cross marks); left untouched since they are runtime output.
    print(f"\nā Indexed {stats['bm25']['num_documents']} memories")
    print(f" Vocabulary: {stats['bm25']['vocabulary_size']} terms")
    print(f" TF-IDF: {'Available' if stats['tfidf_available'] else 'Not available'}")
    print(f" Graph: {'Available' if stats['graph_available'] else 'Not available'}")

    # Test search: query comes from argv, with a fallback demo query.
    if len(sys.argv) > 1:
        query = ' '.join(sys.argv[1:])
    else:
        query = "python web development"

    print("\n" + "=" * 60)
    print(f"Search Query: '{query}'")
    print("=" * 60)

    # Test different methods
    methods = ["bm25", "hybrid"]

    for method in methods:
        print(f"\nMethod: {method.upper()}")
        results = hybrid.search(query, limit=5, method=method)

        print(f" Found {len(results)} results in {hybrid.last_search_time*1000:.2f}ms")

        # Show top hits with a truncated content preview.
        for i, mem in enumerate(results, 1):
            print(f"\n [{i}] Score: {mem['score']:.3f} | ID: {mem['id']}")
            if mem.get('category'):
                print(f" Category: {mem['category']}")
            if mem.get('tags'):
                print(f" Tags: {', '.join(mem['tags'][:3])}")
            print(f" Content: {mem['content'][:100]}...")

    # Display final stats
    print("\n" + "=" * 60)
    print("Performance Summary:")
    print("=" * 60)

    final_stats = hybrid.get_stats()
    print(f" Last search time: {final_stats['last_search_time_ms']:.2f}ms")
    print(f" Last fusion time: {final_stats['last_fusion_time_ms']:.2f}ms")
    print(f" Target: <50ms for 1K memories {'ā' if final_stats['last_search_time_ms'] < 50 else 'ā'}")

    # Cache stats only exist when a cache manager was supplied.
    if 'cache' in final_stats:
        cache_stats = final_stats['cache']
        print(f"\n Cache hit rate: {cache_stats['hit_rate']*100:.1f}%")
        print(f" Cache size: {cache_stats['current_size']}/{cache_stats['max_size']}")