superlocalmemory 2.7.5 → 2.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +120 -155
- package/README.md +115 -89
- package/api_server.py +2 -12
- package/docs/PATTERN-LEARNING.md +64 -199
- package/docs/example_graph_usage.py +4 -6
- package/install.ps1 +226 -0
- package/install.sh +59 -0
- package/mcp_server.py +83 -7
- package/package.json +3 -10
- package/scripts/generate-thumbnails.py +3 -5
- package/skills/slm-build-graph/SKILL.md +1 -1
- package/skills/slm-list-recent/SKILL.md +1 -1
- package/skills/slm-recall/SKILL.md +1 -1
- package/skills/slm-remember/SKILL.md +1 -1
- package/skills/slm-show-patterns/SKILL.md +1 -1
- package/skills/slm-status/SKILL.md +1 -1
- package/skills/slm-switch-profile/SKILL.md +1 -1
- package/src/agent_registry.py +7 -18
- package/src/auth_middleware.py +3 -5
- package/src/auto_backup.py +3 -7
- package/src/behavioral/__init__.py +49 -0
- package/src/behavioral/behavioral_listener.py +203 -0
- package/src/behavioral/behavioral_patterns.py +275 -0
- package/src/behavioral/cross_project_transfer.py +206 -0
- package/src/behavioral/outcome_inference.py +194 -0
- package/src/behavioral/outcome_tracker.py +193 -0
- package/src/behavioral/tests/__init__.py +4 -0
- package/src/behavioral/tests/test_behavioral_integration.py +108 -0
- package/src/behavioral/tests/test_behavioral_patterns.py +150 -0
- package/src/behavioral/tests/test_cross_project_transfer.py +142 -0
- package/src/behavioral/tests/test_mcp_behavioral.py +139 -0
- package/src/behavioral/tests/test_mcp_report_outcome.py +117 -0
- package/src/behavioral/tests/test_outcome_inference.py +107 -0
- package/src/behavioral/tests/test_outcome_tracker.py +96 -0
- package/src/cache_manager.py +4 -6
- package/src/compliance/__init__.py +48 -0
- package/src/compliance/abac_engine.py +149 -0
- package/src/compliance/abac_middleware.py +116 -0
- package/src/compliance/audit_db.py +215 -0
- package/src/compliance/audit_logger.py +148 -0
- package/src/compliance/retention_manager.py +289 -0
- package/src/compliance/retention_scheduler.py +186 -0
- package/src/compliance/tests/__init__.py +4 -0
- package/src/compliance/tests/test_abac_enforcement.py +95 -0
- package/src/compliance/tests/test_abac_engine.py +124 -0
- package/src/compliance/tests/test_abac_mcp_integration.py +118 -0
- package/src/compliance/tests/test_audit_db.py +123 -0
- package/src/compliance/tests/test_audit_logger.py +98 -0
- package/src/compliance/tests/test_mcp_audit.py +128 -0
- package/src/compliance/tests/test_mcp_retention_policy.py +125 -0
- package/src/compliance/tests/test_retention_manager.py +131 -0
- package/src/compliance/tests/test_retention_scheduler.py +99 -0
- package/src/db_connection_manager.py +2 -12
- package/src/embedding_engine.py +61 -669
- package/src/embeddings/__init__.py +47 -0
- package/src/embeddings/cache.py +70 -0
- package/src/embeddings/cli.py +113 -0
- package/src/embeddings/constants.py +47 -0
- package/src/embeddings/database.py +91 -0
- package/src/embeddings/engine.py +247 -0
- package/src/embeddings/model_loader.py +145 -0
- package/src/event_bus.py +3 -13
- package/src/graph/__init__.py +36 -0
- package/src/graph/build_helpers.py +74 -0
- package/src/graph/cli.py +87 -0
- package/src/graph/cluster_builder.py +188 -0
- package/src/graph/cluster_summary.py +148 -0
- package/src/graph/constants.py +47 -0
- package/src/graph/edge_builder.py +162 -0
- package/src/graph/entity_extractor.py +95 -0
- package/src/graph/graph_core.py +226 -0
- package/src/graph/graph_search.py +231 -0
- package/src/graph/hierarchical.py +207 -0
- package/src/graph/schema.py +99 -0
- package/src/graph_engine.py +45 -1451
- package/src/hnsw_index.py +3 -7
- package/src/hybrid_search.py +36 -683
- package/src/learning/__init__.py +27 -12
- package/src/learning/adaptive_ranker.py +50 -12
- package/src/learning/cross_project_aggregator.py +2 -12
- package/src/learning/engagement_tracker.py +2 -12
- package/src/learning/feature_extractor.py +175 -43
- package/src/learning/feedback_collector.py +7 -12
- package/src/learning/learning_db.py +180 -12
- package/src/learning/project_context_manager.py +2 -12
- package/src/learning/source_quality_scorer.py +2 -12
- package/src/learning/synthetic_bootstrap.py +2 -12
- package/src/learning/tests/__init__.py +2 -0
- package/src/learning/tests/test_adaptive_ranker.py +2 -6
- package/src/learning/tests/test_adaptive_ranker_v28.py +60 -0
- package/src/learning/tests/test_aggregator.py +2 -6
- package/src/learning/tests/test_auto_retrain_v28.py +35 -0
- package/src/learning/tests/test_e2e_ranking_v28.py +82 -0
- package/src/learning/tests/test_feature_extractor_v28.py +93 -0
- package/src/learning/tests/test_feedback_collector.py +2 -6
- package/src/learning/tests/test_learning_db.py +2 -6
- package/src/learning/tests/test_learning_db_v28.py +110 -0
- package/src/learning/tests/test_learning_init_v28.py +48 -0
- package/src/learning/tests/test_outcome_signals.py +48 -0
- package/src/learning/tests/test_project_context.py +2 -6
- package/src/learning/tests/test_schema_migration.py +319 -0
- package/src/learning/tests/test_signal_inference.py +11 -13
- package/src/learning/tests/test_source_quality.py +2 -6
- package/src/learning/tests/test_synthetic_bootstrap.py +3 -7
- package/src/learning/tests/test_workflow_miner.py +2 -6
- package/src/learning/workflow_pattern_miner.py +2 -12
- package/src/lifecycle/__init__.py +54 -0
- package/src/lifecycle/bounded_growth.py +239 -0
- package/src/lifecycle/compaction_engine.py +226 -0
- package/src/lifecycle/lifecycle_engine.py +302 -0
- package/src/lifecycle/lifecycle_evaluator.py +225 -0
- package/src/lifecycle/lifecycle_scheduler.py +130 -0
- package/src/lifecycle/retention_policy.py +285 -0
- package/src/lifecycle/tests/__init__.py +4 -0
- package/src/lifecycle/tests/test_bounded_growth.py +193 -0
- package/src/lifecycle/tests/test_compaction.py +179 -0
- package/src/lifecycle/tests/test_lifecycle_engine.py +137 -0
- package/src/lifecycle/tests/test_lifecycle_evaluation.py +177 -0
- package/src/lifecycle/tests/test_lifecycle_scheduler.py +127 -0
- package/src/lifecycle/tests/test_lifecycle_search.py +109 -0
- package/src/lifecycle/tests/test_mcp_compact.py +149 -0
- package/src/lifecycle/tests/test_mcp_lifecycle_status.py +114 -0
- package/src/lifecycle/tests/test_retention_policy.py +162 -0
- package/src/mcp_tools_v28.py +280 -0
- package/src/memory-profiles.py +2 -12
- package/src/memory-reset.py +2 -12
- package/src/memory_compression.py +2 -12
- package/src/memory_store_v2.py +76 -20
- package/src/migrate_v1_to_v2.py +2 -12
- package/src/pattern_learner.py +29 -975
- package/src/patterns/__init__.py +24 -0
- package/src/patterns/analyzers.py +247 -0
- package/src/patterns/learner.py +267 -0
- package/src/patterns/scoring.py +167 -0
- package/src/patterns/store.py +223 -0
- package/src/patterns/terminology.py +138 -0
- package/src/provenance_tracker.py +4 -14
- package/src/query_optimizer.py +4 -6
- package/src/rate_limiter.py +2 -6
- package/src/search/__init__.py +20 -0
- package/src/search/cli.py +77 -0
- package/src/search/constants.py +26 -0
- package/src/search/engine.py +239 -0
- package/src/search/fusion.py +122 -0
- package/src/search/index_loader.py +112 -0
- package/src/search/methods.py +162 -0
- package/src/search_engine_v2.py +4 -6
- package/src/setup_validator.py +7 -13
- package/src/subscription_manager.py +2 -12
- package/src/tree/__init__.py +59 -0
- package/src/tree/builder.py +183 -0
- package/src/tree/nodes.py +196 -0
- package/src/tree/queries.py +252 -0
- package/src/tree/schema.py +76 -0
- package/src/tree_manager.py +10 -711
- package/src/trust/__init__.py +45 -0
- package/src/trust/constants.py +66 -0
- package/src/trust/queries.py +157 -0
- package/src/trust/schema.py +95 -0
- package/src/trust/scorer.py +299 -0
- package/src/trust/signals.py +95 -0
- package/src/trust_scorer.py +39 -697
- package/src/webhook_dispatcher.py +2 -12
- package/ui/app.js +1 -1
- package/ui/index.html +3 -0
- package/ui/js/agents.js +1 -1
- package/ui/js/core.js +21 -5
- package/ui/js/profiles.js +29 -7
- package/ui_server.py +2 -14
- package/ATTRIBUTION.md +0 -140
- package/docs/ARCHITECTURE-V2.5.md +0 -190
- package/docs/GRAPH-ENGINE.md +0 -503
- package/docs/architecture-diagram.drawio +0 -405
- package/docs/plans/2026-02-13-benchmark-suite.md +0 -1349
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
# Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
|
|
4
|
+
"""CLI interface for testing the hybrid search engine.
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from search.engine import HybridSearchEngine
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def main():
    """Run the hybrid search CLI demo.

    Usage: cli.py [query terms...]

    Searches the default SuperLocalMemory database with both the BM25-only
    and full hybrid methods, printing results and timing stats.
    Exits with status 1 if the memory database does not exist.
    """
    print("Hybrid Search Engine - Demo")
    print("=" * 60)

    # Use test database or default
    db_path = Path.home() / ".claude-memory" / "memory.db"

    if not db_path.exists():
        print(f"Error: Database not found at {db_path}")
        print("Please run memory_store_v2.py to create database first.")
        sys.exit(1)

    # Initialize hybrid search (f-prefix removed from constant strings)
    print("\nInitializing hybrid search engine...")
    print(f"Database: {db_path}")

    hybrid = HybridSearchEngine(db_path, enable_cache=True)

    stats = hybrid.get_stats()
    print(f"\nIndexed {stats['bm25']['num_documents']} memories")
    print(f" Vocabulary: {stats['bm25']['vocabulary_size']} terms")
    print(f" TF-IDF: {'Available' if stats['tfidf_available'] else 'Not available'}")
    print(f" Graph: {'Available' if stats['graph_available'] else 'Not available'}")

    # Query comes from argv, or a sensible default for the demo.
    query = ' '.join(sys.argv[1:]) if len(sys.argv) > 1 else "python web development"

    print("\n" + "=" * 60)
    print(f"Search Query: '{query}'")
    print("=" * 60)

    # Compare a lexical-only method against the full hybrid fusion.
    methods = ["bm25", "hybrid"]

    for method in methods:
        print(f"\nMethod: {method.upper()}")
        results = hybrid.search(query, limit=5, method=method)

        print(f" Found {len(results)} results in {hybrid.last_search_time*1000:.2f}ms")

        for i, mem in enumerate(results, 1):
            print(f"\n [{i}] Score: {mem['score']:.3f} | ID: {mem['id']}")
            if mem.get('category'):
                print(f" Category: {mem['category']}")
            if mem.get('tags'):
                print(f" Tags: {', '.join(mem['tags'][:3])}")
            print(f" Content: {mem['content'][:100]}...")

    # Display final stats
    print("\n" + "=" * 60)
    print("Performance Summary:")
    print("=" * 60)

    final_stats = hybrid.get_stats()
    print(f" Last search time: {final_stats['last_search_time_ms']:.2f}ms")
    print(f" Last fusion time: {final_stats['last_fusion_time_ms']:.2f}ms")
    print(" Target: <50ms for 1K memories")

    # Cache stats are only present when caching is enabled on the engine.
    if 'cache' in final_stats:
        cache_stats = final_stats['cache']
        print(f"\n Cache hit rate: {cache_stats['hit_rate']*100:.1f}%")
        print(f" Cache size: {cache_stats['current_size']}/{cache_stats['max_size']}")
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
# Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
|
|
4
|
+
"""SuperLocalMemory V2 - Hybrid Search System
|
|
5
|
+
|
|
6
|
+
Solution Architect & Original Creator
|
|
7
|
+
|
|
8
|
+
(see LICENSE file)
|
|
9
|
+
|
|
10
|
+
ATTRIBUTION REQUIRED: This notice must be preserved in all copies.
|
|
11
|
+
"""
|
|
12
|
+
"""
|
|
13
|
+
Shared imports and constants for the hybrid search package.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import time
|
|
17
|
+
import math
|
|
18
|
+
import json
|
|
19
|
+
import sqlite3
|
|
20
|
+
from collections import defaultdict
|
|
21
|
+
from typing import List, Dict, Tuple, Optional, Any, Set
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
from search_engine_v2 import BM25SearchEngine
|
|
25
|
+
from query_optimizer import QueryOptimizer
|
|
26
|
+
from cache_manager import CacheManager
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
# Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
|
|
4
|
+
"""HybridSearchEngine - Main orchestrator for multi-method retrieval fusion.
|
|
5
|
+
"""
|
|
6
|
+
import time
|
|
7
|
+
import json
|
|
8
|
+
import sqlite3
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import List, Dict, Tuple, Optional, Any
|
|
11
|
+
|
|
12
|
+
from search_engine_v2 import BM25SearchEngine
|
|
13
|
+
from query_optimizer import QueryOptimizer
|
|
14
|
+
from cache_manager import CacheManager
|
|
15
|
+
|
|
16
|
+
from search.index_loader import IndexLoaderMixin
|
|
17
|
+
from search.methods import SearchMethodsMixin
|
|
18
|
+
from search.fusion import FusionMixin
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class HybridSearchEngine(IndexLoaderMixin, SearchMethodsMixin, FusionMixin):
    """
    Hybrid search combining BM25, graph traversal, and semantic search.

    Provides flexible retrieval strategies based on query type and
    available resources.
    """

    def __init__(
        self,
        db_path: Path,
        bm25_engine: Optional[BM25SearchEngine] = None,
        query_optimizer: Optional[QueryOptimizer] = None,
        cache_manager: Optional[CacheManager] = None,
        enable_cache: bool = True
    ):
        """
        Initialize hybrid search engine.

        Args:
            db_path: Path to memory database
            bm25_engine: Pre-configured BM25 engine (will create if None)
            query_optimizer: Query optimizer instance (will create if None)
            cache_manager: Cache manager instance (will create if None)
            enable_cache: Enable result caching
        """
        self.db_path = db_path

        # Initialize components (create defaults when not injected)
        self.bm25 = bm25_engine or BM25SearchEngine()
        self.optimizer = query_optimizer or QueryOptimizer()
        # Fix: honor the documented contract "will create if None".
        # Previously `cache_manager if enable_cache else None` left the cache
        # as None when enable_cache=True but no manager was supplied, which
        # silently disabled caching for the common default call.
        self.cache = (cache_manager or CacheManager()) if enable_cache else None

        # Graph engine (lazy load to avoid circular dependencies)
        self._graph_engine = None

        # TF-IDF fallback (from memory_store_v2)
        self._tfidf_vectorizer = None
        self._tfidf_vectors = None
        self._memory_ids = []

        # Performance tracking (seconds; exposed in ms via get_stats)
        self.last_search_time = 0.0
        self.last_fusion_time = 0.0

        # Load index
        self._load_index()

    def search(
        self,
        query: str,
        limit: int = 10,
        method: str = "hybrid",
        weights: Optional[Dict[str, float]] = None,
        use_cache: bool = True
    ) -> List[Dict[str, Any]]:
        """
        Hybrid search with multiple retrieval methods.

        Args:
            query: Search query
            limit: Maximum results
            method: Fusion method ("hybrid", "weighted", "rrf", "bm25", "semantic", "graph")
            weights: Custom weights for weighted fusion (default: balanced)
            use_cache: Use cache for results

        Returns:
            List of memory dictionaries with scores and match details
        """
        start_time = time.time()

        # Check cache (keyed by query + limit + method)
        if use_cache and self.cache:
            cached = self.cache.get(query, limit=limit, method=method)
            if cached is not None:
                self.last_search_time = time.time() - start_time
                return cached

        # Default weights
        if weights is None:
            weights = {
                'bm25': 0.4,
                'semantic': 0.3,
                'graph': 0.3
            }

        # Single method search
        if method == "bm25":
            raw_results = self.search_bm25(query, limit)
        elif method == "semantic":
            raw_results = self.search_semantic(query, limit)
        elif method == "graph":
            raw_results = self.search_graph(query, limit)

        # Multi-method fusion (any other method name falls through here)
        else:
            fusion_start = time.time()

            # Get results from all methods with a non-zero weight.
            # Over-fetch (limit*2) so fusion has candidates to re-rank.
            results_dict = {}

            if weights.get('bm25', 0) > 0:
                results_dict['bm25'] = self.search_bm25(query, limit=limit*2)

            if weights.get('semantic', 0) > 0:
                results_dict['semantic'] = self.search_semantic(query, limit=limit*2)

            if weights.get('graph', 0) > 0:
                results_dict['graph'] = self.search_graph(query, limit=limit*2)

            # Fusion
            if method == "rrf":
                raw_results = self._reciprocal_rank_fusion(list(results_dict.values()))
            else:  # weighted or hybrid
                raw_results = self._weighted_fusion(results_dict, weights)

            self.last_fusion_time = time.time() - fusion_start

        # Limit results
        raw_results = raw_results[:limit]

        # Fetch full memory details
        results = self._fetch_memory_details(raw_results, query)

        # Cache results
        if use_cache and self.cache:
            self.cache.put(query, results, limit=limit, method=method)

        self.last_search_time = time.time() - start_time

        return results

    def _fetch_memory_details(
        self,
        raw_results: List[Tuple[int, float]],
        query: str
    ) -> List[Dict[str, Any]]:
        """
        Fetch full memory details for result IDs.

        Args:
            raw_results: List of (memory_id, score) tuples
            query: Original query (for context)

        Returns:
            List of memory dictionaries with full details, sorted by score
        """
        if not raw_results:
            return []

        memory_ids = [mem_id for mem_id, _ in raw_results]
        id_to_score = dict(raw_results)

        # Fix: close the connection even if the query raises.
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            placeholders = ','.join(['?'] * len(memory_ids))
            cursor.execute(f'''
                SELECT id, content, summary, project_path, project_name, tags,
                       category, parent_id, tree_path, depth, memory_type,
                       importance, created_at, cluster_id, last_accessed, access_count
                FROM memories
                WHERE id IN ({placeholders})
            ''', memory_ids)
            rows = cursor.fetchall()
        finally:
            conn.close()

        # Build result dictionaries
        results = []
        for row in rows:
            mem_id = row[0]
            results.append({
                'id': mem_id,
                'content': row[1],
                'summary': row[2],
                'project_path': row[3],
                'project_name': row[4],
                'tags': json.loads(row[5]) if row[5] else [],
                'category': row[6],
                'parent_id': row[7],
                'tree_path': row[8],
                'depth': row[9],
                'memory_type': row[10],
                'importance': row[11],
                'created_at': row[12],
                'cluster_id': row[13],
                'last_accessed': row[14],
                'access_count': row[15],
                'score': id_to_score.get(mem_id, 0.0),
                'match_type': 'hybrid'
            })

        # Sort by score (SQL IN does not preserve ranking order)
        results.sort(key=lambda x: x['score'], reverse=True)

        return results

    def get_stats(self) -> Dict[str, Any]:
        """
        Get hybrid search statistics.

        Returns:
            Dictionary with performance stats; includes a 'cache' key only
            when caching is enabled.
        """
        stats = {
            'bm25': self.bm25.get_stats(),
            'optimizer': self.optimizer.get_stats(),
            'last_search_time_ms': self.last_search_time * 1000,
            'last_fusion_time_ms': self.last_fusion_time * 1000,
            'tfidf_available': self._tfidf_vectorizer is not None,
            'graph_available': self._graph_engine is not None
        }

        if self.cache:
            stats['cache'] = self.cache.get_stats()

        return stats
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
# Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
|
|
4
|
+
"""Score fusion strategies for combining multi-method search results.
|
|
5
|
+
"""
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from typing import List, Dict, Tuple
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FusionMixin:
    """
    Mixin providing score normalization and fusion strategies.

    No external dependencies -- operates purely on (id, score) tuples.
    """

    def _normalize_scores(
        self,
        results: List[Tuple[int, float]]
    ) -> List[Tuple[int, float]]:
        """
        Normalize scores to [0, 1] range using min-max normalization.

        Args:
            results: List of (id, score) tuples

        Returns:
            Normalized results (same order as input)
        """
        if not results:
            return []

        scores = [score for _, score in results]
        min_score = min(scores)
        max_score = max(scores)

        if max_score == min_score:
            # All scores equal - return uniform scores.
            # (Fix: renamed loop variable so it no longer shadows builtin `id`.)
            return [(mem_id, 1.0) for mem_id, _ in results]

        span = max_score - min_score
        return [(mem_id, (score - min_score) / span) for mem_id, score in results]

    def _reciprocal_rank_fusion(
        self,
        results_list: List[List[Tuple[int, float]]],
        k: int = 60
    ) -> List[Tuple[int, float]]:
        """
        Combine multiple result lists using Reciprocal Rank Fusion.

        RRF formula: score(d) = sum 1 / (k + rank(d))

        RRF is rank-based and doesn't depend on score magnitudes,
        making it robust to different scoring scales.

        Args:
            results_list: List of result lists from different methods
            k: RRF constant (default: 60, standard value)

        Returns:
            Fused results sorted by RRF score (descending)
        """
        rrf_scores = defaultdict(float)

        # Ranks are 1-based within each method's result list.
        for results in results_list:
            for rank, (mem_id, _) in enumerate(results, start=1):
                rrf_scores[mem_id] += 1.0 / (k + rank)

        return sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)

    def _weighted_fusion(
        self,
        results_dict: Dict[str, List[Tuple[int, float]]],
        weights: Dict[str, float]
    ) -> List[Tuple[int, float]]:
        """
        Combine results using weighted score fusion.

        Normalizes scores from each method then combines with weights.

        Args:
            results_dict: Dictionary mapping method name to results
            weights: Dictionary mapping method name to weight

        Returns:
            Fused results sorted by combined score (descending)
        """
        # Normalize scores for each method so weights compare like-for-like.
        normalized = {
            method: self._normalize_scores(results)
            for method, results in results_dict.items()
        }

        # Combine with weights
        combined_scores = defaultdict(float)
        max_weight_sum = defaultdict(float)  # Track possible max score per doc

        for method, results in normalized.items():
            weight = weights.get(method, 0.0)
            for mem_id, score in results:
                combined_scores[mem_id] += weight * score
                max_weight_sum[mem_id] += weight

        # Normalize by actual weights (some docs may not appear in all methods).
        # (Fix: fallback is 0.0, not int 0, for a consistent float score type.)
        fused = []
        for mem_id, score in combined_scores.items():
            denom = max_weight_sum[mem_id]
            fused.append((mem_id, score / denom if denom > 0 else 0.0))

        fused.sort(key=lambda x: x[1], reverse=True)

        return fused
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
# Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
|
|
4
|
+
"""Index loading and graph engine lazy-loading for hybrid search.
|
|
5
|
+
"""
|
|
6
|
+
import json
|
|
7
|
+
import sqlite3
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from search_engine_v2 import BM25SearchEngine
|
|
12
|
+
from query_optimizer import QueryOptimizer
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class IndexLoaderMixin:
    """
    Mixin that provides index loading and graph engine lazy-loading.

    Expects the host class to have:
    - self.db_path: Path
    - self.bm25: BM25SearchEngine
    - self.optimizer: QueryOptimizer
    - self._graph_engine: Optional[GraphEngine]
    - self._tfidf_vectorizer
    - self._tfidf_vectors
    - self._memory_ids: list
    """

    def _load_index(self):
        """
        Load documents from database and build search indexes.

        Builds the BM25 index over content + summary + tags, seeds the query
        optimizer's vocabulary and co-occurrence matrix, and fits an optional
        TF-IDF vectorizer when scikit-learn is installed.
        """
        # Fix: close the connection even if the query raises
        # (e.g. missing `memories` table).
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT id, content, summary, tags
                FROM memories
                ORDER BY id
            ''')
            rows = cursor.fetchall()
        finally:
            conn.close()

        # Empty database: leave all indexes unbuilt.
        if not rows:
            return

        doc_ids = [row[0] for row in rows]
        documents = []
        vocabulary = set()

        for row in rows:
            # Combine content + summary + tags into one indexable document.
            text_parts = [row[1]]  # content

            if row[2]:  # summary
                text_parts.append(row[2])

            if row[3]:  # tags stored as a JSON array
                try:
                    text_parts.extend(json.loads(row[3]))
                except Exception:
                    pass  # malformed tag JSON is skipped, not fatal

            doc_text = ' '.join(text_parts)
            documents.append(doc_text)

            # Vocabulary feeds the optimizer's spell correction.
            vocabulary.update(self.bm25._tokenize(doc_text))

        # Index with BM25
        self.bm25.index_documents(documents, doc_ids)
        self._memory_ids = doc_ids

        # Initialize optimizer with vocabulary
        self.optimizer.vocabulary = vocabulary

        # Build co-occurrence for query expansion
        tokenized_docs = [self.bm25._tokenize(doc) for doc in documents]
        self.optimizer.build_cooccurrence_matrix(tokenized_docs)

        # TF-IDF is optional; the extra imports double as an availability
        # probe for the semantic-search path.
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer
            from sklearn.metrics.pairwise import cosine_similarity  # noqa: F401
            import numpy as np  # noqa: F401

            self._tfidf_vectorizer = TfidfVectorizer(
                max_features=5000,
                stop_words='english',
                ngram_range=(1, 2)
            )
            self._tfidf_vectors = self._tfidf_vectorizer.fit_transform(documents)

        except ImportError:
            # sklearn not available - skip semantic search
            pass

    def _load_graph_engine(self):
        """Lazy load graph engine to avoid circular imports.

        Returns the cached GraphEngine instance, or None when the module
        is unavailable.
        """
        if self._graph_engine is None:
            try:
                from graph_engine import GraphEngine
                self._graph_engine = GraphEngine(self.db_path)
            except ImportError:
                # Graph engine not available
                pass
        return self._graph_engine
|