superlocalmemory 2.7.6 → 2.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +120 -155
- package/README.md +115 -89
- package/api_server.py +2 -12
- package/docs/PATTERN-LEARNING.md +64 -199
- package/docs/example_graph_usage.py +4 -6
- package/install.sh +59 -0
- package/mcp_server.py +83 -7
- package/package.json +1 -8
- package/scripts/generate-thumbnails.py +3 -5
- package/skills/slm-build-graph/SKILL.md +1 -1
- package/skills/slm-list-recent/SKILL.md +1 -1
- package/skills/slm-recall/SKILL.md +1 -1
- package/skills/slm-remember/SKILL.md +1 -1
- package/skills/slm-show-patterns/SKILL.md +1 -1
- package/skills/slm-status/SKILL.md +1 -1
- package/skills/slm-switch-profile/SKILL.md +1 -1
- package/src/agent_registry.py +7 -18
- package/src/auth_middleware.py +3 -5
- package/src/auto_backup.py +3 -7
- package/src/behavioral/__init__.py +49 -0
- package/src/behavioral/behavioral_listener.py +203 -0
- package/src/behavioral/behavioral_patterns.py +275 -0
- package/src/behavioral/cross_project_transfer.py +206 -0
- package/src/behavioral/outcome_inference.py +194 -0
- package/src/behavioral/outcome_tracker.py +193 -0
- package/src/behavioral/tests/__init__.py +4 -0
- package/src/behavioral/tests/test_behavioral_integration.py +108 -0
- package/src/behavioral/tests/test_behavioral_patterns.py +150 -0
- package/src/behavioral/tests/test_cross_project_transfer.py +142 -0
- package/src/behavioral/tests/test_mcp_behavioral.py +139 -0
- package/src/behavioral/tests/test_mcp_report_outcome.py +117 -0
- package/src/behavioral/tests/test_outcome_inference.py +107 -0
- package/src/behavioral/tests/test_outcome_tracker.py +96 -0
- package/src/cache_manager.py +4 -6
- package/src/compliance/__init__.py +48 -0
- package/src/compliance/abac_engine.py +149 -0
- package/src/compliance/abac_middleware.py +116 -0
- package/src/compliance/audit_db.py +215 -0
- package/src/compliance/audit_logger.py +148 -0
- package/src/compliance/retention_manager.py +289 -0
- package/src/compliance/retention_scheduler.py +186 -0
- package/src/compliance/tests/__init__.py +4 -0
- package/src/compliance/tests/test_abac_enforcement.py +95 -0
- package/src/compliance/tests/test_abac_engine.py +124 -0
- package/src/compliance/tests/test_abac_mcp_integration.py +118 -0
- package/src/compliance/tests/test_audit_db.py +123 -0
- package/src/compliance/tests/test_audit_logger.py +98 -0
- package/src/compliance/tests/test_mcp_audit.py +128 -0
- package/src/compliance/tests/test_mcp_retention_policy.py +125 -0
- package/src/compliance/tests/test_retention_manager.py +131 -0
- package/src/compliance/tests/test_retention_scheduler.py +99 -0
- package/src/db_connection_manager.py +2 -12
- package/src/embedding_engine.py +61 -669
- package/src/embeddings/__init__.py +47 -0
- package/src/embeddings/cache.py +70 -0
- package/src/embeddings/cli.py +113 -0
- package/src/embeddings/constants.py +47 -0
- package/src/embeddings/database.py +91 -0
- package/src/embeddings/engine.py +247 -0
- package/src/embeddings/model_loader.py +145 -0
- package/src/event_bus.py +3 -13
- package/src/graph/__init__.py +36 -0
- package/src/graph/build_helpers.py +74 -0
- package/src/graph/cli.py +87 -0
- package/src/graph/cluster_builder.py +188 -0
- package/src/graph/cluster_summary.py +148 -0
- package/src/graph/constants.py +47 -0
- package/src/graph/edge_builder.py +162 -0
- package/src/graph/entity_extractor.py +95 -0
- package/src/graph/graph_core.py +226 -0
- package/src/graph/graph_search.py +231 -0
- package/src/graph/hierarchical.py +207 -0
- package/src/graph/schema.py +99 -0
- package/src/graph_engine.py +45 -1451
- package/src/hnsw_index.py +3 -7
- package/src/hybrid_search.py +36 -683
- package/src/learning/__init__.py +27 -12
- package/src/learning/adaptive_ranker.py +50 -12
- package/src/learning/cross_project_aggregator.py +2 -12
- package/src/learning/engagement_tracker.py +2 -12
- package/src/learning/feature_extractor.py +175 -43
- package/src/learning/feedback_collector.py +7 -12
- package/src/learning/learning_db.py +180 -12
- package/src/learning/project_context_manager.py +2 -12
- package/src/learning/source_quality_scorer.py +2 -12
- package/src/learning/synthetic_bootstrap.py +2 -12
- package/src/learning/tests/__init__.py +2 -0
- package/src/learning/tests/test_adaptive_ranker.py +2 -6
- package/src/learning/tests/test_adaptive_ranker_v28.py +60 -0
- package/src/learning/tests/test_aggregator.py +2 -6
- package/src/learning/tests/test_auto_retrain_v28.py +35 -0
- package/src/learning/tests/test_e2e_ranking_v28.py +82 -0
- package/src/learning/tests/test_feature_extractor_v28.py +93 -0
- package/src/learning/tests/test_feedback_collector.py +2 -6
- package/src/learning/tests/test_learning_db.py +2 -6
- package/src/learning/tests/test_learning_db_v28.py +110 -0
- package/src/learning/tests/test_learning_init_v28.py +48 -0
- package/src/learning/tests/test_outcome_signals.py +48 -0
- package/src/learning/tests/test_project_context.py +2 -6
- package/src/learning/tests/test_schema_migration.py +319 -0
- package/src/learning/tests/test_signal_inference.py +11 -13
- package/src/learning/tests/test_source_quality.py +2 -6
- package/src/learning/tests/test_synthetic_bootstrap.py +3 -7
- package/src/learning/tests/test_workflow_miner.py +2 -6
- package/src/learning/workflow_pattern_miner.py +2 -12
- package/src/lifecycle/__init__.py +54 -0
- package/src/lifecycle/bounded_growth.py +239 -0
- package/src/lifecycle/compaction_engine.py +226 -0
- package/src/lifecycle/lifecycle_engine.py +302 -0
- package/src/lifecycle/lifecycle_evaluator.py +225 -0
- package/src/lifecycle/lifecycle_scheduler.py +130 -0
- package/src/lifecycle/retention_policy.py +285 -0
- package/src/lifecycle/tests/__init__.py +4 -0
- package/src/lifecycle/tests/test_bounded_growth.py +193 -0
- package/src/lifecycle/tests/test_compaction.py +179 -0
- package/src/lifecycle/tests/test_lifecycle_engine.py +137 -0
- package/src/lifecycle/tests/test_lifecycle_evaluation.py +177 -0
- package/src/lifecycle/tests/test_lifecycle_scheduler.py +127 -0
- package/src/lifecycle/tests/test_lifecycle_search.py +109 -0
- package/src/lifecycle/tests/test_mcp_compact.py +149 -0
- package/src/lifecycle/tests/test_mcp_lifecycle_status.py +114 -0
- package/src/lifecycle/tests/test_retention_policy.py +162 -0
- package/src/mcp_tools_v28.py +280 -0
- package/src/memory-profiles.py +2 -12
- package/src/memory-reset.py +2 -12
- package/src/memory_compression.py +2 -12
- package/src/memory_store_v2.py +76 -20
- package/src/migrate_v1_to_v2.py +2 -12
- package/src/pattern_learner.py +29 -975
- package/src/patterns/__init__.py +24 -0
- package/src/patterns/analyzers.py +247 -0
- package/src/patterns/learner.py +267 -0
- package/src/patterns/scoring.py +167 -0
- package/src/patterns/store.py +223 -0
- package/src/patterns/terminology.py +138 -0
- package/src/provenance_tracker.py +4 -14
- package/src/query_optimizer.py +4 -6
- package/src/rate_limiter.py +2 -6
- package/src/search/__init__.py +20 -0
- package/src/search/cli.py +77 -0
- package/src/search/constants.py +26 -0
- package/src/search/engine.py +239 -0
- package/src/search/fusion.py +122 -0
- package/src/search/index_loader.py +112 -0
- package/src/search/methods.py +162 -0
- package/src/search_engine_v2.py +4 -6
- package/src/setup_validator.py +7 -13
- package/src/subscription_manager.py +2 -12
- package/src/tree/__init__.py +59 -0
- package/src/tree/builder.py +183 -0
- package/src/tree/nodes.py +196 -0
- package/src/tree/queries.py +252 -0
- package/src/tree/schema.py +76 -0
- package/src/tree_manager.py +10 -711
- package/src/trust/__init__.py +45 -0
- package/src/trust/constants.py +66 -0
- package/src/trust/queries.py +157 -0
- package/src/trust/schema.py +95 -0
- package/src/trust/scorer.py +299 -0
- package/src/trust/signals.py +95 -0
- package/src/trust_scorer.py +39 -697
- package/src/webhook_dispatcher.py +2 -12
- package/ui/app.js +1 -1
- package/ui/js/agents.js +1 -1
- package/ui_server.py +2 -14
- package/ATTRIBUTION.md +0 -140
- package/docs/ARCHITECTURE-V2.5.md +0 -190
- package/docs/GRAPH-ENGINE.md +0 -503
- package/docs/architecture-diagram.drawio +0 -405
- package/docs/plans/2026-02-13-benchmark-suite.md +0 -1349
package/src/graph_engine.py
CHANGED
|
@@ -1,1458 +1,52 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
# Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
|
|
4
|
+
"""GraphEngine - Knowledge Graph Clustering for SuperLocalMemory V2
|
|
5
|
+
|
|
6
|
+
BACKWARD-COMPATIBILITY SHIM
|
|
7
|
+
----------------------------
|
|
8
|
+
This file re-exports every public symbol from the ``graph`` package so that
|
|
9
|
+
existing code using ``from graph_engine import GraphEngine`` (or any other
|
|
10
|
+
name) continues to work without modification.
|
|
11
|
+
|
|
12
|
+
The actual implementation now lives in:
|
|
13
|
+
src/graph/constants.py - Shared imports, constants, logger
|
|
14
|
+
src/graph/entity_extractor.py - EntityExtractor, ClusterNamer
|
|
15
|
+
src/graph/edge_builder.py - EdgeBuilder
|
|
16
|
+
src/graph/cluster_builder.py - ClusterBuilder
|
|
17
|
+
src/graph/graph_core.py - GraphEngine, main()
|
|
2
18
|
"""
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
- For larger datasets, use incremental updates
|
|
20
|
-
"""
|
|
21
|
-
|
|
22
|
-
# SECURITY: Graph build limits to prevent resource exhaustion
|
|
23
|
-
MAX_MEMORIES_FOR_GRAPH = 10000
|
|
24
|
-
|
|
25
|
-
import sqlite3
|
|
26
|
-
import json
|
|
27
|
-
import time
|
|
28
|
-
import logging
|
|
29
|
-
from datetime import datetime
|
|
30
|
-
from pathlib import Path
|
|
31
|
-
from typing import List, Dict, Optional, Tuple, Set
|
|
32
|
-
from collections import Counter
|
|
33
|
-
|
|
34
|
-
# Core dependencies
|
|
35
|
-
try:
|
|
36
|
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
37
|
-
from sklearn.metrics.pairwise import cosine_similarity
|
|
38
|
-
import numpy as np
|
|
39
|
-
SKLEARN_AVAILABLE = True
|
|
40
|
-
except ImportError:
|
|
41
|
-
SKLEARN_AVAILABLE = False
|
|
42
|
-
raise ImportError("scikit-learn is required. Install: pip install scikit-learn")
|
|
43
|
-
|
|
44
|
-
# Graph dependencies - lazy import to avoid conflicts with compression module
|
|
45
|
-
IGRAPH_AVAILABLE = False
|
|
46
|
-
try:
|
|
47
|
-
# Import only when needed to avoid module conflicts
|
|
48
|
-
import importlib
|
|
49
|
-
ig_module = importlib.import_module('igraph')
|
|
50
|
-
leiden_module = importlib.import_module('leidenalg')
|
|
51
|
-
IGRAPH_AVAILABLE = True
|
|
52
|
-
except ImportError:
|
|
53
|
-
pass # Will raise error when building clusters if not available
|
|
54
|
-
|
|
55
|
-
# Setup logging
|
|
56
|
-
logging.basicConfig(
|
|
57
|
-
level=logging.INFO,
|
|
58
|
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
19
|
+
# Re-export everything from the graph package
|
|
20
|
+
from graph import (
|
|
21
|
+
# Constants
|
|
22
|
+
MAX_MEMORIES_FOR_GRAPH,
|
|
23
|
+
SKLEARN_AVAILABLE,
|
|
24
|
+
IGRAPH_AVAILABLE,
|
|
25
|
+
MEMORY_DIR,
|
|
26
|
+
DB_PATH,
|
|
27
|
+
# Classes
|
|
28
|
+
EntityExtractor,
|
|
29
|
+
ClusterNamer,
|
|
30
|
+
EdgeBuilder,
|
|
31
|
+
ClusterBuilder,
|
|
32
|
+
GraphEngine,
|
|
33
|
+
# Functions
|
|
34
|
+
main,
|
|
59
35
|
)
|
|
60
|
-
logger = logging.getLogger(__name__)
|
|
61
|
-
|
|
62
|
-
MEMORY_DIR = Path.home() / ".claude-memory"
|
|
63
|
-
DB_PATH = MEMORY_DIR / "memory.db"
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
class EntityExtractor:
|
|
67
|
-
"""Extract key entities/concepts from memory content using TF-IDF."""
|
|
68
|
-
|
|
69
|
-
def __init__(self, max_features: int = 20, min_df: int = 1):
|
|
70
|
-
"""
|
|
71
|
-
Initialize entity extractor.
|
|
72
|
-
|
|
73
|
-
Args:
|
|
74
|
-
max_features: Top N keywords to extract per memory
|
|
75
|
-
min_df: Minimum document frequency (ignore very rare terms)
|
|
76
|
-
"""
|
|
77
|
-
self.max_features = max_features
|
|
78
|
-
self.vectorizer = TfidfVectorizer(
|
|
79
|
-
max_features=max_features,
|
|
80
|
-
stop_words='english',
|
|
81
|
-
ngram_range=(1, 2), # Unigrams + bigrams
|
|
82
|
-
min_df=min_df,
|
|
83
|
-
lowercase=True,
|
|
84
|
-
token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z0-9_-]*\b' # Alphanumeric tokens
|
|
85
|
-
)
|
|
86
|
-
|
|
87
|
-
def extract_entities(self, contents: List[str]) -> Tuple[List[List[str]], np.ndarray]:
|
|
88
|
-
"""
|
|
89
|
-
Extract entities from multiple contents.
|
|
90
|
-
|
|
91
|
-
Args:
|
|
92
|
-
contents: List of memory content strings
|
|
93
|
-
|
|
94
|
-
Returns:
|
|
95
|
-
Tuple of (entities_per_content, tfidf_vectors)
|
|
96
|
-
"""
|
|
97
|
-
if not contents:
|
|
98
|
-
return [], np.array([])
|
|
99
|
-
|
|
100
|
-
try:
|
|
101
|
-
# Fit and transform all contents
|
|
102
|
-
vectors = self.vectorizer.fit_transform(contents)
|
|
103
|
-
feature_names = self.vectorizer.get_feature_names_out()
|
|
104
|
-
|
|
105
|
-
# Extract top entities for each content
|
|
106
|
-
all_entities = []
|
|
107
|
-
for idx in range(len(contents)):
|
|
108
|
-
scores = vectors[idx].toarray()[0]
|
|
109
|
-
|
|
110
|
-
# Get indices of top features
|
|
111
|
-
top_indices = np.argsort(scores)[::-1]
|
|
112
|
-
|
|
113
|
-
# Extract entities with score > 0
|
|
114
|
-
entities = [
|
|
115
|
-
feature_names[i]
|
|
116
|
-
for i in top_indices
|
|
117
|
-
if scores[i] > 0.05 # Minimum threshold
|
|
118
|
-
][:self.max_features]
|
|
119
|
-
|
|
120
|
-
all_entities.append(entities)
|
|
121
|
-
|
|
122
|
-
return all_entities, vectors.toarray()
|
|
123
|
-
|
|
124
|
-
except Exception as e:
|
|
125
|
-
logger.error(f"Entity extraction failed: {e}")
|
|
126
|
-
return [[] for _ in contents], np.zeros((len(contents), 1))
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
class EdgeBuilder:
|
|
130
|
-
"""Build similarity edges between memories based on entity overlap."""
|
|
131
|
-
|
|
132
|
-
def __init__(self, db_path: Path, min_similarity: float = 0.3):
|
|
133
|
-
"""
|
|
134
|
-
Initialize edge builder.
|
|
135
|
-
|
|
136
|
-
Args:
|
|
137
|
-
db_path: Path to SQLite database
|
|
138
|
-
min_similarity: Minimum cosine similarity to create edge
|
|
139
|
-
"""
|
|
140
|
-
self.db_path = db_path
|
|
141
|
-
self.min_similarity = min_similarity
|
|
142
|
-
|
|
143
|
-
def build_edges(self, memory_ids: List[int], vectors: np.ndarray,
|
|
144
|
-
entities_list: List[List[str]]) -> int:
|
|
145
|
-
"""
|
|
146
|
-
Build edges between similar memories.
|
|
147
|
-
|
|
148
|
-
Args:
|
|
149
|
-
memory_ids: List of memory IDs
|
|
150
|
-
vectors: TF-IDF vectors (n x features)
|
|
151
|
-
entities_list: List of entity lists per memory
|
|
152
|
-
|
|
153
|
-
Returns:
|
|
154
|
-
Number of edges created
|
|
155
|
-
"""
|
|
156
|
-
if len(memory_ids) < 2:
|
|
157
|
-
logger.warning("Need at least 2 memories to build edges")
|
|
158
|
-
return 0
|
|
159
|
-
|
|
160
|
-
# Try HNSW-accelerated edge building first (O(n log n))
|
|
161
|
-
use_hnsw = False
|
|
162
|
-
try:
|
|
163
|
-
from hnsw_index import HNSWIndex
|
|
164
|
-
if len(memory_ids) >= 50: # HNSW overhead not worth it for small sets
|
|
165
|
-
use_hnsw = True
|
|
166
|
-
except ImportError:
|
|
167
|
-
pass
|
|
168
|
-
|
|
169
|
-
edges_added = 0
|
|
170
|
-
conn = sqlite3.connect(self.db_path)
|
|
171
|
-
cursor = conn.cursor()
|
|
172
|
-
|
|
173
|
-
try:
|
|
174
|
-
if use_hnsw:
|
|
175
|
-
logger.info("Using HNSW-accelerated edge building for %d memories", len(memory_ids))
|
|
176
|
-
try:
|
|
177
|
-
dim = vectors.shape[1]
|
|
178
|
-
hnsw = HNSWIndex(dimension=dim, max_elements=len(memory_ids))
|
|
179
|
-
hnsw.build(vectors, memory_ids)
|
|
180
|
-
|
|
181
|
-
for i in range(len(memory_ids)):
|
|
182
|
-
neighbors = hnsw.search(vectors[i], k=min(20, len(memory_ids) - 1))
|
|
183
|
-
for neighbor_id, similarity in neighbors:
|
|
184
|
-
if neighbor_id == memory_ids[i]:
|
|
185
|
-
continue # Skip self
|
|
186
|
-
# Only process each pair once (lower ID first)
|
|
187
|
-
if memory_ids[i] > neighbor_id:
|
|
188
|
-
continue
|
|
189
|
-
if similarity >= self.min_similarity:
|
|
190
|
-
# Find indices for entity lookup
|
|
191
|
-
j = memory_ids.index(neighbor_id)
|
|
192
|
-
entities_i = set(entities_list[i])
|
|
193
|
-
entities_j = set(entities_list[j])
|
|
194
|
-
shared = list(entities_i & entities_j)
|
|
195
|
-
rel_type = self._classify_relationship(similarity, shared)
|
|
196
|
-
|
|
197
|
-
cursor.execute('''
|
|
198
|
-
INSERT OR REPLACE INTO graph_edges
|
|
199
|
-
(source_memory_id, target_memory_id, relationship_type,
|
|
200
|
-
weight, shared_entities, similarity_score)
|
|
201
|
-
VALUES (?, ?, ?, ?, ?, ?)
|
|
202
|
-
''', (
|
|
203
|
-
memory_ids[i], neighbor_id, rel_type,
|
|
204
|
-
float(similarity), json.dumps(shared), float(similarity)
|
|
205
|
-
))
|
|
206
|
-
edges_added += 1
|
|
207
|
-
|
|
208
|
-
except Exception as e:
|
|
209
|
-
logger.warning("HNSW edge building failed, falling back to O(n²): %s", e)
|
|
210
|
-
use_hnsw = False # Fall through to O(n²) below
|
|
211
|
-
|
|
212
|
-
if not use_hnsw:
|
|
213
|
-
# Fallback: O(n²) pairwise cosine similarity
|
|
214
|
-
similarity_matrix = cosine_similarity(vectors)
|
|
215
|
-
|
|
216
|
-
for i in range(len(memory_ids)):
|
|
217
|
-
for j in range(i + 1, len(memory_ids)):
|
|
218
|
-
sim = similarity_matrix[i, j]
|
|
219
|
-
|
|
220
|
-
if sim >= self.min_similarity:
|
|
221
|
-
entities_i = set(entities_list[i])
|
|
222
|
-
entities_j = set(entities_list[j])
|
|
223
|
-
shared = list(entities_i & entities_j)
|
|
224
|
-
rel_type = self._classify_relationship(sim, shared)
|
|
225
|
-
|
|
226
|
-
cursor.execute('''
|
|
227
|
-
INSERT OR REPLACE INTO graph_edges
|
|
228
|
-
(source_memory_id, target_memory_id, relationship_type,
|
|
229
|
-
weight, shared_entities, similarity_score)
|
|
230
|
-
VALUES (?, ?, ?, ?, ?, ?)
|
|
231
|
-
''', (
|
|
232
|
-
memory_ids[i], memory_ids[j], rel_type,
|
|
233
|
-
float(sim), json.dumps(shared), float(sim)
|
|
234
|
-
))
|
|
235
|
-
edges_added += 1
|
|
236
|
-
|
|
237
|
-
conn.commit()
|
|
238
|
-
logger.info(f"Created {edges_added} edges")
|
|
239
|
-
return edges_added
|
|
240
|
-
|
|
241
|
-
except Exception as e:
|
|
242
|
-
logger.error(f"Edge building failed: {e}")
|
|
243
|
-
conn.rollback()
|
|
244
|
-
return 0
|
|
245
|
-
finally:
|
|
246
|
-
conn.close()
|
|
247
|
-
|
|
248
|
-
def _classify_relationship(self, similarity: float, shared_entities: List[str]) -> str:
|
|
249
|
-
"""
|
|
250
|
-
Classify edge type based on similarity and shared entities.
|
|
251
|
-
|
|
252
|
-
Args:
|
|
253
|
-
similarity: Cosine similarity score
|
|
254
|
-
shared_entities: List of shared entity strings
|
|
255
|
-
|
|
256
|
-
Returns:
|
|
257
|
-
Relationship type: 'similar', 'depends_on', or 'related_to'
|
|
258
|
-
"""
|
|
259
|
-
# Check for dependency keywords
|
|
260
|
-
dependency_keywords = {'dependency', 'require', 'import', 'use', 'need'}
|
|
261
|
-
has_dependency = any(
|
|
262
|
-
any(kw in entity.lower() for kw in dependency_keywords)
|
|
263
|
-
for entity in shared_entities
|
|
264
|
-
)
|
|
265
|
-
|
|
266
|
-
if similarity > 0.7:
|
|
267
|
-
return 'similar'
|
|
268
|
-
elif has_dependency:
|
|
269
|
-
return 'depends_on'
|
|
270
|
-
else:
|
|
271
|
-
return 'related_to'
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
class ClusterBuilder:
|
|
275
|
-
"""Detect memory communities using Leiden algorithm."""
|
|
276
|
-
|
|
277
|
-
def __init__(self, db_path: Path):
|
|
278
|
-
"""Initialize cluster builder."""
|
|
279
|
-
self.db_path = db_path
|
|
280
|
-
|
|
281
|
-
def _get_active_profile(self) -> str:
|
|
282
|
-
"""Get the currently active profile name from config."""
|
|
283
|
-
config_file = MEMORY_DIR / "profiles.json"
|
|
284
|
-
if config_file.exists():
|
|
285
|
-
try:
|
|
286
|
-
with open(config_file, 'r') as f:
|
|
287
|
-
config = json.load(f)
|
|
288
|
-
return config.get('active_profile', 'default')
|
|
289
|
-
except (json.JSONDecodeError, IOError):
|
|
290
|
-
pass
|
|
291
|
-
return 'default'
|
|
292
|
-
|
|
293
|
-
def detect_communities(self) -> int:
|
|
294
|
-
"""
|
|
295
|
-
Run Leiden algorithm to find memory clusters (active profile only).
|
|
296
|
-
|
|
297
|
-
Returns:
|
|
298
|
-
Number of clusters created
|
|
299
|
-
"""
|
|
300
|
-
if not IGRAPH_AVAILABLE:
|
|
301
|
-
logger.warning("igraph/leidenalg not installed. Graph clustering disabled. Install with: pip3 install python-igraph leidenalg")
|
|
302
|
-
return 0
|
|
303
|
-
import igraph as ig
|
|
304
|
-
import leidenalg
|
|
305
|
-
|
|
306
|
-
conn = sqlite3.connect(self.db_path)
|
|
307
|
-
cursor = conn.cursor()
|
|
308
|
-
active_profile = self._get_active_profile()
|
|
309
|
-
|
|
310
|
-
try:
|
|
311
|
-
# Load edges for active profile's memories only
|
|
312
|
-
edges = cursor.execute('''
|
|
313
|
-
SELECT ge.source_memory_id, ge.target_memory_id, ge.weight
|
|
314
|
-
FROM graph_edges ge
|
|
315
|
-
WHERE ge.source_memory_id IN (SELECT id FROM memories WHERE profile = ?)
|
|
316
|
-
AND ge.target_memory_id IN (SELECT id FROM memories WHERE profile = ?)
|
|
317
|
-
''', (active_profile, active_profile)).fetchall()
|
|
318
|
-
|
|
319
|
-
if not edges:
|
|
320
|
-
logger.warning("No edges found - cannot build clusters")
|
|
321
|
-
return 0
|
|
322
|
-
|
|
323
|
-
# Build memory ID mapping
|
|
324
|
-
memory_ids = set()
|
|
325
|
-
for source, target, _ in edges:
|
|
326
|
-
memory_ids.add(source)
|
|
327
|
-
memory_ids.add(target)
|
|
328
|
-
|
|
329
|
-
memory_ids = sorted(list(memory_ids))
|
|
330
|
-
memory_id_to_vertex = {mid: idx for idx, mid in enumerate(memory_ids)}
|
|
331
|
-
vertex_to_memory_id = {idx: mid for mid, idx in memory_id_to_vertex.items()}
|
|
332
|
-
|
|
333
|
-
# Create igraph graph
|
|
334
|
-
g = ig.Graph()
|
|
335
|
-
g.add_vertices(len(memory_ids))
|
|
336
|
-
|
|
337
|
-
# Add edges with weights
|
|
338
|
-
edge_list = []
|
|
339
|
-
edge_weights = []
|
|
340
|
-
|
|
341
|
-
for source, target, weight in edges:
|
|
342
|
-
edge_list.append((
|
|
343
|
-
memory_id_to_vertex[source],
|
|
344
|
-
memory_id_to_vertex[target]
|
|
345
|
-
))
|
|
346
|
-
edge_weights.append(weight)
|
|
347
|
-
|
|
348
|
-
g.add_edges(edge_list)
|
|
349
|
-
|
|
350
|
-
# Run Leiden algorithm
|
|
351
|
-
logger.info(f"Running Leiden on {len(memory_ids)} nodes, {len(edges)} edges")
|
|
352
|
-
partition = leidenalg.find_partition(
|
|
353
|
-
g,
|
|
354
|
-
leidenalg.ModularityVertexPartition,
|
|
355
|
-
weights=edge_weights,
|
|
356
|
-
n_iterations=100,
|
|
357
|
-
seed=42 # Reproducible
|
|
358
|
-
)
|
|
359
|
-
|
|
360
|
-
# Process communities
|
|
361
|
-
clusters_created = 0
|
|
362
|
-
|
|
363
|
-
for cluster_idx, community in enumerate(partition):
|
|
364
|
-
if len(community) < 2: # Skip singleton clusters
|
|
365
|
-
continue
|
|
366
|
-
|
|
367
|
-
# Get memory IDs in this cluster
|
|
368
|
-
cluster_memory_ids = [vertex_to_memory_id[v] for v in community]
|
|
369
|
-
|
|
370
|
-
# Calculate cluster stats
|
|
371
|
-
avg_importance = self._get_avg_importance(cursor, cluster_memory_ids)
|
|
372
|
-
|
|
373
|
-
# Auto-generate cluster name
|
|
374
|
-
cluster_name = self._generate_cluster_name(cursor, cluster_memory_ids)
|
|
375
|
-
|
|
376
|
-
# Insert cluster
|
|
377
|
-
result = cursor.execute('''
|
|
378
|
-
INSERT INTO graph_clusters (name, member_count, avg_importance)
|
|
379
|
-
VALUES (?, ?, ?)
|
|
380
|
-
''', (cluster_name, len(cluster_memory_ids), avg_importance))
|
|
381
|
-
|
|
382
|
-
cluster_id = result.lastrowid
|
|
383
|
-
|
|
384
|
-
# Update memories with cluster_id
|
|
385
|
-
cursor.executemany('''
|
|
386
|
-
UPDATE memories SET cluster_id = ? WHERE id = ?
|
|
387
|
-
''', [(cluster_id, mid) for mid in cluster_memory_ids])
|
|
388
|
-
|
|
389
|
-
clusters_created += 1
|
|
390
|
-
logger.info(f"Cluster {cluster_id}: '{cluster_name}' ({len(cluster_memory_ids)} members)")
|
|
391
|
-
|
|
392
|
-
conn.commit()
|
|
393
|
-
logger.info(f"Created {clusters_created} clusters")
|
|
394
|
-
return clusters_created
|
|
395
|
-
|
|
396
|
-
except Exception as e:
|
|
397
|
-
logger.error(f"Community detection failed: {e}")
|
|
398
|
-
conn.rollback()
|
|
399
|
-
return 0
|
|
400
|
-
finally:
|
|
401
|
-
conn.close()
|
|
402
|
-
|
|
403
|
-
def _get_avg_importance(self, cursor, memory_ids: List[int]) -> float:
|
|
404
|
-
"""Calculate average importance for cluster."""
|
|
405
|
-
placeholders = ','.join('?' * len(memory_ids))
|
|
406
|
-
result = cursor.execute(f'''
|
|
407
|
-
SELECT AVG(importance) FROM memories WHERE id IN ({placeholders})
|
|
408
|
-
''', memory_ids).fetchone()
|
|
409
|
-
|
|
410
|
-
return result[0] if result and result[0] else 5.0
|
|
411
|
-
|
|
412
|
-
def _generate_cluster_name(self, cursor, memory_ids: List[int]) -> str:
|
|
413
|
-
"""Generate cluster name from member entities (TF-IDF approach)."""
|
|
414
|
-
# Get all entities from cluster members
|
|
415
|
-
placeholders = ','.join('?' * len(memory_ids))
|
|
416
|
-
nodes = cursor.execute(f'''
|
|
417
|
-
SELECT entities FROM graph_nodes WHERE memory_id IN ({placeholders})
|
|
418
|
-
''', memory_ids).fetchall()
|
|
419
|
-
|
|
420
|
-
all_entities = []
|
|
421
|
-
for node in nodes:
|
|
422
|
-
if node[0]:
|
|
423
|
-
all_entities.extend(json.loads(node[0]))
|
|
424
|
-
|
|
425
|
-
if not all_entities:
|
|
426
|
-
return f"Cluster (ID auto-assigned)"
|
|
427
|
-
|
|
428
|
-
# Count entity frequencies
|
|
429
|
-
entity_counts = Counter(all_entities)
|
|
430
|
-
|
|
431
|
-
# Top 2-3 most common entities
|
|
432
|
-
top_entities = [e for e, _ in entity_counts.most_common(3)]
|
|
433
|
-
|
|
434
|
-
# Build name
|
|
435
|
-
if len(top_entities) >= 2:
|
|
436
|
-
name = f"{top_entities[0].title()} & {top_entities[1].title()}"
|
|
437
|
-
elif len(top_entities) == 1:
|
|
438
|
-
name = f"{top_entities[0].title()} Contexts"
|
|
439
|
-
else:
|
|
440
|
-
name = "Mixed Contexts"
|
|
441
|
-
|
|
442
|
-
return name[:100] # Limit length
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
def hierarchical_cluster(self, min_subcluster_size: int = 5, max_depth: int = 3) -> Dict[str, any]:
|
|
446
|
-
"""
|
|
447
|
-
Run recursive Leiden clustering — cluster the clusters.
|
|
448
|
-
|
|
449
|
-
Large communities (>= min_subcluster_size * 2) are recursively sub-clustered
|
|
450
|
-
to reveal finer-grained thematic structure. E.g., "Python" → "FastAPI" → "Auth".
|
|
451
|
-
|
|
452
|
-
Args:
|
|
453
|
-
min_subcluster_size: Minimum members to attempt sub-clustering (default 5)
|
|
454
|
-
max_depth: Maximum recursion depth (default 3)
|
|
455
|
-
|
|
456
|
-
Returns:
|
|
457
|
-
Dictionary with hierarchical clustering statistics
|
|
458
|
-
"""
|
|
459
|
-
if not IGRAPH_AVAILABLE:
|
|
460
|
-
logger.warning("igraph/leidenalg not installed. Hierarchical clustering disabled. Install with: pip3 install python-igraph leidenalg")
|
|
461
|
-
return {'subclusters_created': 0, 'depth_reached': 0}
|
|
462
|
-
import igraph as ig
|
|
463
|
-
import leidenalg
|
|
464
|
-
|
|
465
|
-
conn = sqlite3.connect(self.db_path)
|
|
466
|
-
cursor = conn.cursor()
|
|
467
|
-
active_profile = self._get_active_profile()
|
|
468
|
-
|
|
469
|
-
try:
|
|
470
|
-
# Get top-level clusters for this profile that are large enough to sub-cluster
|
|
471
|
-
cursor.execute('''
|
|
472
|
-
SELECT cluster_id, COUNT(*) as cnt
|
|
473
|
-
FROM memories
|
|
474
|
-
WHERE cluster_id IS NOT NULL AND profile = ?
|
|
475
|
-
GROUP BY cluster_id
|
|
476
|
-
HAVING cnt >= ?
|
|
477
|
-
''', (active_profile, min_subcluster_size * 2))
|
|
478
|
-
large_clusters = cursor.fetchall()
|
|
479
|
-
|
|
480
|
-
if not large_clusters:
|
|
481
|
-
logger.info("No clusters large enough for hierarchical decomposition")
|
|
482
|
-
return {'subclusters_created': 0, 'depth_reached': 0}
|
|
483
|
-
|
|
484
|
-
total_subclusters = 0
|
|
485
|
-
max_depth_reached = 0
|
|
486
|
-
|
|
487
|
-
for parent_cid, member_count in large_clusters:
|
|
488
|
-
subs, depth = self._recursive_subcluster(
|
|
489
|
-
conn, cursor, parent_cid, active_profile,
|
|
490
|
-
min_subcluster_size, max_depth, current_depth=1
|
|
491
|
-
)
|
|
492
|
-
total_subclusters += subs
|
|
493
|
-
max_depth_reached = max(max_depth_reached, depth)
|
|
494
|
-
|
|
495
|
-
conn.commit()
|
|
496
|
-
logger.info(f"Hierarchical clustering: {total_subclusters} sub-clusters, depth {max_depth_reached}")
|
|
497
|
-
return {
|
|
498
|
-
'subclusters_created': total_subclusters,
|
|
499
|
-
'depth_reached': max_depth_reached,
|
|
500
|
-
'parent_clusters_processed': len(large_clusters)
|
|
501
|
-
}
|
|
502
|
-
|
|
503
|
-
except Exception as e:
|
|
504
|
-
logger.error(f"Hierarchical clustering failed: {e}")
|
|
505
|
-
conn.rollback()
|
|
506
|
-
return {'subclusters_created': 0, 'error': str(e)}
|
|
507
|
-
finally:
|
|
508
|
-
conn.close()
|
|
509
|
-
|
|
510
|
-
def _recursive_subcluster(self, conn, cursor, parent_cluster_id: int,
|
|
511
|
-
profile: str, min_size: int, max_depth: int,
|
|
512
|
-
current_depth: int) -> Tuple[int, int]:
|
|
513
|
-
"""Recursively sub-cluster a community using Leiden."""
|
|
514
|
-
if not IGRAPH_AVAILABLE:
|
|
515
|
-
return 0, current_depth - 1
|
|
516
|
-
import igraph as ig
|
|
517
|
-
import leidenalg
|
|
518
|
-
|
|
519
|
-
if current_depth > max_depth:
|
|
520
|
-
return 0, current_depth - 1
|
|
521
|
-
|
|
522
|
-
# Get memory IDs in this cluster
|
|
523
|
-
cursor.execute('''
|
|
524
|
-
SELECT id FROM memories
|
|
525
|
-
WHERE cluster_id = ? AND profile = ?
|
|
526
|
-
''', (parent_cluster_id, profile))
|
|
527
|
-
member_ids = [row[0] for row in cursor.fetchall()]
|
|
528
|
-
|
|
529
|
-
if len(member_ids) < min_size * 2:
|
|
530
|
-
return 0, current_depth - 1
|
|
531
|
-
|
|
532
|
-
# Get edges between members of this cluster
|
|
533
|
-
placeholders = ','.join('?' * len(member_ids))
|
|
534
|
-
edges = cursor.execute(f'''
|
|
535
|
-
SELECT source_memory_id, target_memory_id, weight
|
|
536
|
-
FROM graph_edges
|
|
537
|
-
WHERE source_memory_id IN ({placeholders})
|
|
538
|
-
AND target_memory_id IN ({placeholders})
|
|
539
|
-
''', member_ids + member_ids).fetchall()
|
|
540
|
-
|
|
541
|
-
if len(edges) < 2:
|
|
542
|
-
return 0, current_depth - 1
|
|
543
|
-
|
|
544
|
-
# Build sub-graph
|
|
545
|
-
id_to_vertex = {mid: idx for idx, mid in enumerate(member_ids)}
|
|
546
|
-
vertex_to_id = {idx: mid for mid, idx in id_to_vertex.items()}
|
|
547
|
-
|
|
548
|
-
g = ig.Graph()
|
|
549
|
-
g.add_vertices(len(member_ids))
|
|
550
|
-
edge_list, edge_weights = [], []
|
|
551
|
-
for src, tgt, w in edges:
|
|
552
|
-
if src in id_to_vertex and tgt in id_to_vertex:
|
|
553
|
-
edge_list.append((id_to_vertex[src], id_to_vertex[tgt]))
|
|
554
|
-
edge_weights.append(w)
|
|
555
|
-
|
|
556
|
-
if not edge_list:
|
|
557
|
-
return 0, current_depth - 1
|
|
558
|
-
|
|
559
|
-
g.add_edges(edge_list)
|
|
560
|
-
|
|
561
|
-
# Run Leiden with higher resolution for finer communities
|
|
562
|
-
partition = leidenalg.find_partition(
|
|
563
|
-
g, leidenalg.ModularityVertexPartition,
|
|
564
|
-
weights=edge_weights, n_iterations=100, seed=42
|
|
565
|
-
)
|
|
566
|
-
|
|
567
|
-
# Only proceed if Leiden found > 1 community (actual split)
|
|
568
|
-
non_singleton = [c for c in partition if len(c) >= 2]
|
|
569
|
-
if len(non_singleton) <= 1:
|
|
570
|
-
return 0, current_depth - 1
|
|
571
|
-
|
|
572
|
-
subclusters_created = 0
|
|
573
|
-
deepest = current_depth
|
|
574
|
-
|
|
575
|
-
# Get parent depth
|
|
576
|
-
cursor.execute('SELECT depth FROM graph_clusters WHERE id = ?', (parent_cluster_id,))
|
|
577
|
-
parent_row = cursor.fetchone()
|
|
578
|
-
parent_depth = parent_row[0] if parent_row else 0
|
|
579
|
-
|
|
580
|
-
for community in non_singleton:
|
|
581
|
-
sub_member_ids = [vertex_to_id[v] for v in community]
|
|
582
|
-
|
|
583
|
-
if len(sub_member_ids) < 2:
|
|
584
|
-
continue
|
|
585
|
-
|
|
586
|
-
avg_imp = self._get_avg_importance(cursor, sub_member_ids)
|
|
587
|
-
cluster_name = self._generate_cluster_name(cursor, sub_member_ids)
|
|
588
|
-
|
|
589
|
-
result = cursor.execute('''
|
|
590
|
-
INSERT INTO graph_clusters (name, member_count, avg_importance, parent_cluster_id, depth)
|
|
591
|
-
VALUES (?, ?, ?, ?, ?)
|
|
592
|
-
''', (cluster_name, len(sub_member_ids), avg_imp, parent_cluster_id, parent_depth + 1))
|
|
593
|
-
|
|
594
|
-
sub_cluster_id = result.lastrowid
|
|
595
|
-
|
|
596
|
-
# Update memories to point to sub-cluster
|
|
597
|
-
cursor.executemany('''
|
|
598
|
-
UPDATE memories SET cluster_id = ? WHERE id = ?
|
|
599
|
-
''', [(sub_cluster_id, mid) for mid in sub_member_ids])
|
|
600
|
-
|
|
601
|
-
subclusters_created += 1
|
|
602
|
-
logger.info(f"Sub-cluster {sub_cluster_id} under {parent_cluster_id}: "
|
|
603
|
-
f"'{cluster_name}' ({len(sub_member_ids)} members, depth {parent_depth + 1})")
|
|
604
|
-
|
|
605
|
-
# Recurse into this sub-cluster if large enough
|
|
606
|
-
child_subs, child_depth = self._recursive_subcluster(
|
|
607
|
-
conn, cursor, sub_cluster_id, profile,
|
|
608
|
-
min_size, max_depth, current_depth + 1
|
|
609
|
-
)
|
|
610
|
-
subclusters_created += child_subs
|
|
611
|
-
deepest = max(deepest, child_depth)
|
|
612
|
-
|
|
613
|
-
return subclusters_created, deepest
|
|
614
|
-
|
|
615
|
-
def generate_cluster_summaries(self) -> int:
    """Produce TF-IDF structured summaries for all clusters of the active profile.

    Looks up every cluster that currently holds memories belonging to the
    active profile, builds a structured summary for each, and persists it
    on the ``graph_clusters`` row (also bumping ``updated_at``).

    Returns:
        Number of clusters whose summary was generated and stored
        (0 on failure or when there are no clusters).
    """
    db = sqlite3.connect(self.db_path)
    cur = db.cursor()
    profile = self._get_active_profile()

    try:
        # Restrict to clusters that actually have members in this profile.
        cur.execute('''
            SELECT DISTINCT gc.id, gc.name, gc.member_count
            FROM graph_clusters gc
            JOIN memories m ON m.cluster_id = gc.id
            WHERE m.profile = ?
        ''', (profile,))
        rows = cur.fetchall()

        if not rows:
            return 0

        updated = 0

        for cid, cname, _member_count in rows:
            text = self._build_cluster_summary(cur, cid, profile)
            if not text:
                continue
            cur.execute('''
                UPDATE graph_clusters SET summary = ?, updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
            ''', (text, cid))
            updated += 1
            logger.info(f"Summary for cluster {cid} ({cname}): {text[:80]}...")

        db.commit()
        logger.info(f"Generated {updated} cluster summaries")
        return updated

    except Exception as e:
        logger.error(f"Summary generation failed: {e}")
        db.rollback()
        return 0
    finally:
        db.close()
|
|
665
|
-
def _build_cluster_summary(self, cursor, cluster_id: int, profile: str) -> str:
|
|
666
|
-
"""Build a TF-IDF structured summary for a single cluster."""
|
|
667
|
-
# Get member content
|
|
668
|
-
cursor.execute('''
|
|
669
|
-
SELECT m.content, m.summary, m.tags, m.category, m.project_name
|
|
670
|
-
FROM memories m
|
|
671
|
-
WHERE m.cluster_id = ? AND m.profile = ?
|
|
672
|
-
''', (cluster_id, profile))
|
|
673
|
-
members = cursor.fetchall()
|
|
674
|
-
|
|
675
|
-
if not members:
|
|
676
|
-
return ""
|
|
677
|
-
|
|
678
|
-
# Collect entities from graph nodes
|
|
679
|
-
cursor.execute('''
|
|
680
|
-
SELECT gn.entities
|
|
681
|
-
FROM graph_nodes gn
|
|
682
|
-
JOIN memories m ON gn.memory_id = m.id
|
|
683
|
-
WHERE m.cluster_id = ? AND m.profile = ?
|
|
684
|
-
''', (cluster_id, profile))
|
|
685
|
-
all_entities = []
|
|
686
|
-
for row in cursor.fetchall():
|
|
687
|
-
if row[0]:
|
|
688
|
-
try:
|
|
689
|
-
all_entities.extend(json.loads(row[0]))
|
|
690
|
-
except (json.JSONDecodeError, TypeError):
|
|
691
|
-
pass
|
|
692
|
-
|
|
693
|
-
# Top entities by frequency (TF-IDF already extracted these)
|
|
694
|
-
entity_counts = Counter(all_entities)
|
|
695
|
-
top_entities = [e for e, _ in entity_counts.most_common(5)]
|
|
696
|
-
|
|
697
|
-
# Collect unique projects and categories
|
|
698
|
-
projects = set()
|
|
699
|
-
categories = set()
|
|
700
|
-
for m in members:
|
|
701
|
-
if m[3]: # category
|
|
702
|
-
categories.add(m[3])
|
|
703
|
-
if m[4]: # project_name
|
|
704
|
-
projects.add(m[4])
|
|
705
|
-
|
|
706
|
-
# Build structured summary
|
|
707
|
-
parts = []
|
|
708
|
-
|
|
709
|
-
# Theme from top entities
|
|
710
|
-
if top_entities:
|
|
711
|
-
parts.append(f"Key topics: {', '.join(top_entities[:5])}")
|
|
712
|
-
|
|
713
|
-
# Scope
|
|
714
|
-
if projects:
|
|
715
|
-
parts.append(f"Projects: {', '.join(sorted(projects)[:3])}")
|
|
716
|
-
if categories:
|
|
717
|
-
parts.append(f"Categories: {', '.join(sorted(categories)[:3])}")
|
|
718
|
-
|
|
719
|
-
# Size context
|
|
720
|
-
parts.append(f"{len(members)} memories")
|
|
721
|
-
|
|
722
|
-
# Check for hierarchical context
|
|
723
|
-
cursor.execute('SELECT parent_cluster_id FROM graph_clusters WHERE id = ?', (cluster_id,))
|
|
724
|
-
parent_row = cursor.fetchone()
|
|
725
|
-
if parent_row and parent_row[0]:
|
|
726
|
-
cursor.execute('SELECT name FROM graph_clusters WHERE id = ?', (parent_row[0],))
|
|
727
|
-
parent_name_row = cursor.fetchone()
|
|
728
|
-
if parent_name_row:
|
|
729
|
-
parts.append(f"Sub-cluster of: {parent_name_row[0]}")
|
|
730
|
-
|
|
731
|
-
return " | ".join(parts)
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
class ClusterNamer:
    """Enhanced cluster naming with optional LLM support (future)."""

    @staticmethod
    def generate_name_tfidf(entities: List[str]) -> str:
        """Generate name from entity list (TF-IDF fallback).

        Picks the two most frequent entities: two names are joined with
        " & "; a single name becomes "<Name> Contexts"; an empty list
        yields "Unnamed Cluster".
        """
        if not entities:
            return "Unnamed Cluster"

        ranked = [item for item, _ in Counter(entities).most_common(2)]

        if len(ranked) < 2:
            return f"{ranked[0].title()} Contexts"
        return f"{ranked[0].title()} & {ranked[1].title()}"
|
|
751
|
-
|
|
752
|
-
class GraphEngine:
    """Main graph engine coordinating all graph operations.

    Composes the three pipeline stages — entity extraction
    (EntityExtractor), edge construction (EdgeBuilder), and community
    detection (ClusterBuilder) — over a single SQLite database, and
    ensures the graph schema exists on construction.
    """

    def __init__(self, db_path: Path = DB_PATH):
        """Initialize graph engine and create/migrate graph tables."""
        self.db_path = db_path
        self.entity_extractor = EntityExtractor(max_features=20)
        self.edge_builder = EdgeBuilder(db_path)
        self.cluster_builder = ClusterBuilder(db_path)
        self._ensure_graph_tables()

    def _get_active_profile(self) -> str:
        """Get the currently active profile name from config.

        Reads MEMORY_DIR/profiles.json; falls back to 'default' when the
        file is missing, unreadable, or lacks the key.
        """
        config_file = MEMORY_DIR / "profiles.json"
        if config_file.exists():
            try:
                with open(config_file, 'r') as f:
                    config = json.load(f)
                return config.get('active_profile', 'default')
            except (json.JSONDecodeError, IOError):
                pass
        return 'default'

    def _ensure_graph_tables(self):
        """Create graph tables if they don't exist, or recreate if schema is incomplete."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        # Check if existing tables have correct schema (not just id column)
        for table_name, required_cols in [
            ('graph_nodes', {'memory_id', 'entities'}),
            ('graph_edges', {'source_memory_id', 'target_memory_id', 'weight'}),
            ('graph_clusters', {'name', 'member_count'}),
        ]:
            cursor.execute(f"PRAGMA table_info({table_name})")
            existing_cols = {row[1] for row in cursor.fetchall()}
            if existing_cols and not required_cols.issubset(existing_cols):
                # Table exists but has incomplete schema — drop and recreate.
                # NOTE(review): dropping loses any data in the old table; the
                # graph is rebuildable from memories, so this is best-effort.
                logger.warning(f"Dropping incomplete {table_name} table (missing: {required_cols - existing_cols})")
                cursor.execute(f'DROP TABLE IF EXISTS {table_name}')

        # Graph nodes table: one row per memory, with JSON entity list and
        # JSON-serialized TF-IDF vector.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS graph_nodes (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                memory_id INTEGER UNIQUE NOT NULL,
                entities TEXT,
                embedding_vector TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (memory_id) REFERENCES memories(id) ON DELETE CASCADE
            )
        ''')

        # Graph edges table: undirected-in-spirit pairs stored once,
        # deduplicated by the (source, target) UNIQUE constraint.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS graph_edges (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                source_memory_id INTEGER NOT NULL,
                target_memory_id INTEGER NOT NULL,
                relationship_type TEXT,
                weight REAL DEFAULT 1.0,
                shared_entities TEXT,
                similarity_score REAL,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (source_memory_id) REFERENCES memories(id) ON DELETE CASCADE,
                FOREIGN KEY (target_memory_id) REFERENCES memories(id) ON DELETE CASCADE,
                UNIQUE(source_memory_id, target_memory_id)
            )
        ''')

        # Graph clusters table: supports hierarchy via parent_cluster_id/depth.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS graph_clusters (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                description TEXT,
                summary TEXT,
                member_count INTEGER DEFAULT 0,
                avg_importance REAL,
                parent_cluster_id INTEGER,
                depth INTEGER DEFAULT 0,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (parent_cluster_id) REFERENCES graph_clusters(id) ON DELETE SET NULL
            )
        ''')

        # Safe column additions for existing databases (ALTER fails harmlessly
        # with OperationalError when the column is already present).
        for col, col_type in [('summary', 'TEXT'), ('parent_cluster_id', 'INTEGER'), ('depth', 'INTEGER DEFAULT 0')]:
            try:
                cursor.execute(f'ALTER TABLE graph_clusters ADD COLUMN {col} {col_type}')
            except sqlite3.OperationalError:
                pass

        # Add cluster_id to memories if not exists
        try:
            cursor.execute('ALTER TABLE memories ADD COLUMN cluster_id INTEGER')
        except sqlite3.OperationalError:
            pass  # Column already exists

        # Create indexes
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_graph_source ON graph_edges(source_memory_id)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_graph_target ON graph_edges(target_memory_id)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_cluster_members ON memories(cluster_id)')

        conn.commit()
        conn.close()
        logger.info("Graph tables initialized")

    def build_graph(self, min_similarity: float = 0.3) -> Dict[str, any]:
        """
        Build complete knowledge graph from all memories of the active profile.

        Pipeline: validate schema → load (and, above the cap, sample)
        memories → clear that profile's existing graph data → extract
        entities/vectors → store nodes → build edges → detect communities →
        hierarchical sub-clustering → cluster summaries.

        Args:
            min_similarity: Minimum cosine similarity for edge creation

        Returns:
            Dictionary with build statistics on success, or a dictionary
            with 'success': False and an 'error'/'message'/'fix' triple on
            recoverable failures (missing tables, no/insufficient memories).

        Note:
            Datasets larger than MAX_MEMORIES_FOR_GRAPH are not rejected;
            they are intelligently sampled down to the cap (see below).
        """
        start_time = time.time()
        logger.info("Starting full graph build...")

        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        try:
            # First check if required tables exist
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            existing_tables = {row[0] for row in cursor.fetchall()}

            required_tables = {'memories', 'graph_edges', 'graph_nodes', 'graph_clusters'}
            missing_tables = required_tables - existing_tables

            if missing_tables:
                logger.error(f"Missing required tables: {missing_tables}")
                return {
                    'success': False,
                    'error': 'database_not_initialized',
                    'message': f"Database not initialized. Missing tables: {', '.join(missing_tables)}",
                    'fix': "Run 'superlocalmemoryv2:status' first to initialize the database, or add some memories."
                }

            # Load memories for active profile only
            active_profile = self._get_active_profile()
            logger.info(f"Building graph for profile: {active_profile}")
            memories = cursor.execute('''
                SELECT id, content, summary FROM memories
                WHERE profile = ?
                ORDER BY id
            ''', (active_profile,)).fetchall()

            if len(memories) == 0:
                logger.warning("No memories found")
                return {
                    'success': False,
                    'error': 'no_memories',
                    'message': 'No memories found in database.',
                    'fix': "Add some memories first: superlocalmemoryv2:remember 'Your content here'"
                }

            if len(memories) < 2:
                logger.warning("Need at least 2 memories to build graph")
                return {
                    'success': False,
                    'error': 'insufficient_memories',
                    'message': 'Need at least 2 memories to build knowledge graph.',
                    'memories': len(memories),
                    'fix': "Add more memories: superlocalmemoryv2:remember 'Your content here'"
                }

            # SCALABILITY: Intelligent sampling for large datasets (v2.6)
            if len(memories) > MAX_MEMORIES_FOR_GRAPH:
                logger.warning(
                    "Memory count (%d) exceeds graph cap (%d). Using intelligent sampling.",
                    len(memories), MAX_MEMORIES_FOR_GRAPH
                )
                # Sample: 60% most recent + 40% highest importance (with overlap dedup)
                recent_count = int(MAX_MEMORIES_FOR_GRAPH * 0.6)
                important_count = int(MAX_MEMORIES_FOR_GRAPH * 0.4)

                recent_memories = cursor.execute('''
                    SELECT id, content, summary FROM memories
                    WHERE profile = ?
                    ORDER BY created_at DESC
                    LIMIT ?
                ''', (active_profile, recent_count)).fetchall()

                important_memories = cursor.execute('''
                    SELECT id, content, summary FROM memories
                    WHERE profile = ?
                    ORDER BY importance DESC, access_count DESC
                    LIMIT ?
                ''', (active_profile, important_count)).fetchall()

                # Deduplicate by ID, preserving order (recent first, so an
                # overlapping memory counts toward the "recent" share).
                seen_ids = set()
                memories = []
                for m in recent_memories + important_memories:
                    if m[0] not in seen_ids:
                        seen_ids.add(m[0])
                        memories.append(m)
                memories = memories[:MAX_MEMORIES_FOR_GRAPH]
                logger.info("Sampled %d memories for graph build", len(memories))

            elif len(memories) > MAX_MEMORIES_FOR_GRAPH * 0.8:
                logger.warning(
                    "Approaching graph cap: %d/%d memories (%.0f%%). "
                    "Consider running memory compression.",
                    len(memories), MAX_MEMORIES_FOR_GRAPH,
                    len(memories) / MAX_MEMORIES_FOR_GRAPH * 100
                )

            # Clear existing graph data for this profile's memories
            profile_memory_ids = [m[0] for m in memories]
            if profile_memory_ids:
                placeholders = ','.join('?' * len(profile_memory_ids))
                cursor.execute(f'''
                    DELETE FROM graph_edges
                    WHERE source_memory_id IN ({placeholders})
                    OR target_memory_id IN ({placeholders})
                ''', profile_memory_ids + profile_memory_ids)
                cursor.execute(f'''
                    DELETE FROM graph_nodes
                    WHERE memory_id IN ({placeholders})
                ''', profile_memory_ids)
                # Remove orphaned clusters (no remaining members).
                # NOTE(review): this runs BEFORE cluster_id is nulled below, so
                # clusters whose only members belong to this profile survive
                # until the next build — confirm whether that is intentional.
                cursor.execute('''
                    DELETE FROM graph_clusters
                    WHERE id NOT IN (
                        SELECT DISTINCT cluster_id FROM memories
                        WHERE cluster_id IS NOT NULL
                    )
                ''')
                cursor.execute('UPDATE memories SET cluster_id = NULL WHERE profile = ?',
                             (active_profile,))
                conn.commit()

            logger.info(f"Processing {len(memories)} memories")

            # Extract entities and vectors
            memory_ids = [m[0] for m in memories]
            contents = [f"{m[1]} {m[2] or ''}" for m in memories]  # Combine content + summary

            entities_list, vectors = self.entity_extractor.extract_entities(contents)

            # Store nodes
            for memory_id, entities, vector in zip(memory_ids, entities_list, vectors):
                cursor.execute('''
                    INSERT INTO graph_nodes (memory_id, entities, embedding_vector)
                    VALUES (?, ?, ?)
                ''', (
                    memory_id,
                    json.dumps(entities),
                    json.dumps(vector.tolist())
                ))

            conn.commit()
            logger.info(f"Stored {len(memory_ids)} graph nodes")

            # Build edges
            edges_count = self.edge_builder.build_edges(
                memory_ids, vectors, entities_list
            )

            # Detect communities (flat Leiden)
            clusters_count = self.cluster_builder.detect_communities()

            # Hierarchical sub-clustering on large communities
            hierarchical_stats = self.cluster_builder.hierarchical_cluster()
            subclusters = hierarchical_stats.get('subclusters_created', 0)

            # Generate TF-IDF structured summaries for all clusters
            summaries = self.cluster_builder.generate_cluster_summaries()

            elapsed = time.time() - start_time

            stats = {
                'success': True,
                'memories': len(memories),
                'nodes': len(memory_ids),
                'edges': edges_count,
                'clusters': clusters_count,
                'subclusters': subclusters,
                'max_depth': hierarchical_stats.get('depth_reached', 0),
                'summaries_generated': summaries,
                'time_seconds': round(elapsed, 2)
            }
            if not IGRAPH_AVAILABLE:
                stats['warning'] = 'igraph/leidenalg not installed — graph built without clustering. Install with: pip3 install python-igraph leidenalg'


            logger.info(f"Graph build complete: {stats}")
            return stats

        except Exception as e:
            logger.error(f"Graph build failed: {e}")
            conn.rollback()
            return {
                'success': False,
                'error': str(e)
            }
        finally:
            conn.close()

    def extract_entities(self, memory_id: int) -> List[str]:
        """
        Extract entities for a single memory.

        Runs the TF-IDF extractor over the memory's combined
        content + summary text.

        Args:
            memory_id: Memory ID

        Returns:
            List of entity strings (empty if the memory does not exist)
        """
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        try:
            # Get memory content
            memory = cursor.execute('''
                SELECT content, summary FROM memories WHERE id = ?
            ''', (memory_id,)).fetchone()

            if not memory:
                return []

            content = f"{memory[0]} {memory[1] or ''}"
            entities_list, _ = self.entity_extractor.extract_entities([content])

            return entities_list[0] if entities_list else []

        finally:
            conn.close()

    def get_related(self, memory_id: int, max_hops: int = 2) -> List[Dict]:
        """
        Get memories connected to this memory via graph edges (active profile only).

        Args:
            memory_id: Source memory ID
            max_hops: Maximum traversal depth (1 or 2)

        Returns:
            List of related memory dictionaries, sorted so that 2-hop
            results come first within the list ordering used below.
        """
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        active_profile = self._get_active_profile()

        try:
            # Get 1-hop neighbors (filtered to active profile); UNION of both
            # edge directions since edges are stored once per pair.
            edges = cursor.execute('''
                SELECT ge.target_memory_id, ge.relationship_type, ge.weight, ge.shared_entities
                FROM graph_edges ge
                JOIN memories m ON ge.target_memory_id = m.id
                WHERE ge.source_memory_id = ? AND m.profile = ?
                UNION
                SELECT ge.source_memory_id, ge.relationship_type, ge.weight, ge.shared_entities
                FROM graph_edges ge
                JOIN memories m ON ge.source_memory_id = m.id
                WHERE ge.target_memory_id = ? AND m.profile = ?
            ''', (memory_id, active_profile, memory_id, active_profile)).fetchall()

            results = []
            seen_ids = {memory_id}

            for target_id, rel_type, weight, shared_entities in edges:
                if target_id in seen_ids:
                    continue

                seen_ids.add(target_id)

                # Get memory details
                memory = cursor.execute('''
                    SELECT id, summary, importance, tags
                    FROM memories WHERE id = ?
                ''', (target_id,)).fetchone()

                if memory:
                    results.append({
                        'id': memory[0],
                        'summary': memory[1],
                        'importance': memory[2],
                        'tags': json.loads(memory[3]) if memory[3] else [],
                        'relationship': rel_type,
                        'weight': weight,
                        'shared_entities': json.loads(shared_entities) if shared_entities else [],
                        'hops': 1
                    })

            # If max_hops == 2, get 2-hop neighbors
            # NOTE(review): unlike the 1-hop query, this one does not filter
            # by profile — 2-hop results may cross profiles; confirm intent.
            if max_hops >= 2:
                for result in results[:]:  # Copy to avoid modification during iteration
                    second_hop = cursor.execute('''
                        SELECT target_memory_id, relationship_type, weight
                        FROM graph_edges
                        WHERE source_memory_id = ?
                        UNION
                        SELECT source_memory_id, relationship_type, weight
                        FROM graph_edges
                        WHERE target_memory_id = ?
                    ''', (result['id'], result['id'])).fetchall()

                    for target_id, rel_type, weight in second_hop:
                        if target_id in seen_ids:
                            continue

                        seen_ids.add(target_id)

                        memory = cursor.execute('''
                            SELECT id, summary, importance, tags
                            FROM memories WHERE id = ?
                        ''', (target_id,)).fetchone()

                        if memory:
                            results.append({
                                'id': memory[0],
                                'summary': memory[1],
                                'importance': memory[2],
                                'tags': json.loads(memory[3]) if memory[3] else [],
                                'relationship': rel_type,
                                'weight': weight,
                                'shared_entities': [],
                                'hops': 2
                            })

            # Sort by weight (strongest connections first)
            # NOTE(review): the key sorts by -hops first, which places 2-hop
            # results before 1-hop ones — verify against the comment above.
            results.sort(key=lambda x: (-x['hops'], -x['weight']))

            return results

        finally:
            conn.close()

    def get_cluster_members(self, cluster_id: int) -> List[Dict]:
        """
        Get all memories in a cluster (filtered by active profile).

        Args:
            cluster_id: Cluster ID

        Returns:
            List of memory dictionaries, ordered by importance descending
        """
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        active_profile = self._get_active_profile()

        try:
            memories = cursor.execute('''
                SELECT id, summary, importance, tags, created_at
                FROM memories
                WHERE cluster_id = ? AND profile = ?
                ORDER BY importance DESC
            ''', (cluster_id, active_profile)).fetchall()

            return [
                {
                    'id': m[0],
                    'summary': m[1],
                    'importance': m[2],
                    'tags': json.loads(m[3]) if m[3] else [],
                    'created_at': m[4]
                }
                for m in memories
            ]

        finally:
            conn.close()

    def add_memory_incremental(self, memory_id: int) -> bool:
        """
        Add single memory to existing graph (incremental update).

        Extracts entities for the new memory, stores its node, and links it
        to every existing same-profile node whose cosine similarity clears
        the edge builder's threshold. Does NOT re-run clustering.

        Args:
            memory_id: New memory ID to add

        Returns:
            Success status
        """
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        try:
            # Get new memory content
            memory = cursor.execute('''
                SELECT content, summary FROM memories WHERE id = ?
            ''', (memory_id,)).fetchone()

            if not memory:
                return False

            # Extract entities for new memory
            content = f"{memory[0]} {memory[1] or ''}"
            entities_list, vector = self.entity_extractor.extract_entities([content])

            if not entities_list:
                return False

            new_entities = entities_list[0]
            new_vector = vector[0]

            # Store node
            cursor.execute('''
                INSERT OR REPLACE INTO graph_nodes (memory_id, entities, embedding_vector)
                VALUES (?, ?, ?)
            ''', (memory_id, json.dumps(new_entities), json.dumps(new_vector.tolist())))

            # Compare to existing memories in the same profile
            active_profile = self._get_active_profile()
            existing = cursor.execute('''
                SELECT gn.memory_id, gn.embedding_vector, gn.entities
                FROM graph_nodes gn
                JOIN memories m ON gn.memory_id = m.id
                WHERE gn.memory_id != ? AND m.profile = ?
            ''', (memory_id, active_profile)).fetchall()

            edges_added = 0

            for existing_id, existing_vector_json, existing_entities_json in existing:
                existing_vector = np.array(json.loads(existing_vector_json))

                # Compute similarity
                # NOTE(review): assumes the stored vector dimensionality matches
                # the freshly extracted one — confirm the extractor's vocabulary
                # is stable between full builds and incremental adds.
                sim = cosine_similarity([new_vector], [existing_vector])[0][0]

                if sim >= self.edge_builder.min_similarity:
                    # Find shared entities
                    existing_entities = json.loads(existing_entities_json)
                    shared = list(set(new_entities) & set(existing_entities))

                    # Classify relationship
                    rel_type = self.edge_builder._classify_relationship(sim, shared)

                    # Insert edge
                    cursor.execute('''
                        INSERT OR REPLACE INTO graph_edges
                        (source_memory_id, target_memory_id, relationship_type,
                         weight, shared_entities, similarity_score)
                        VALUES (?, ?, ?, ?, ?, ?)
                    ''', (
                        memory_id,
                        existing_id,
                        rel_type,
                        float(sim),
                        json.dumps(shared),
                        float(sim)
                    ))

                    edges_added += 1

            conn.commit()
            logger.info(f"Added memory {memory_id} to graph with {edges_added} edges")

            # Optionally re-cluster if significant change
            if edges_added > 5:
                logger.info("Significant graph change - consider re-clustering")

            return True

        except Exception as e:
            logger.error(f"Incremental add failed: {e}")
            conn.rollback()
            return False
        finally:
            conn.close()

    def get_stats(self) -> Dict[str, any]:
        """Get graph statistics for the active profile.

        Returns a dictionary with node/edge/cluster counts, the maximum
        hierarchy depth seen among the top clusters, and up to 20 cluster
        breakdown entries.
        """
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        active_profile = self._get_active_profile()

        try:
            # Count nodes for active profile's memories
            nodes = cursor.execute('''
                SELECT COUNT(*) FROM graph_nodes
                WHERE memory_id IN (SELECT id FROM memories WHERE profile = ?)
            ''', (active_profile,)).fetchone()[0]

            # Count edges where at least one end is in active profile
            # NOTE(review): only the source side is checked here; edges whose
            # source is in another profile but target is in this one are missed.
            edges = cursor.execute('''
                SELECT COUNT(*) FROM graph_edges
                WHERE source_memory_id IN (SELECT id FROM memories WHERE profile = ?)
            ''', (active_profile,)).fetchone()[0]

            # Clusters that have members in active profile
            clusters = cursor.execute('''
                SELECT COUNT(DISTINCT cluster_id) FROM memories
                WHERE cluster_id IS NOT NULL AND profile = ?
            ''', (active_profile,)).fetchone()[0]

            # Cluster breakdown for active profile (including hierarchy)
            cluster_info = cursor.execute('''
                SELECT gc.name, gc.member_count, gc.avg_importance,
                       gc.summary, gc.parent_cluster_id, gc.depth
                FROM graph_clusters gc
                WHERE gc.id IN (
                    SELECT DISTINCT cluster_id FROM memories
                    WHERE cluster_id IS NOT NULL AND profile = ?
                )
                ORDER BY gc.depth ASC, gc.member_count DESC
                LIMIT 20
            ''', (active_profile,)).fetchall()

            # Count hierarchical depth (over the LIMITed breakdown only)
            max_depth = max((c[5] or 0 for c in cluster_info), default=0) if cluster_info else 0

            return {
                'profile': active_profile,
                'nodes': nodes,
                'edges': edges,
                'clusters': clusters,
                'max_depth': max_depth,
                'top_clusters': [
                    {
                        'name': c[0],
                        'members': c[1],
                        'avg_importance': round(c[2], 1) if c[2] else 5.0,
                        'summary': c[3],
                        'parent_cluster_id': c[4],
                        'depth': c[5] or 0
                    }
                    for c in cluster_info
                ]
            }

        finally:
            conn.close()
|
|
1384
|
-
|
|
1385
|
-
def main():
|
|
1386
|
-
"""CLI interface for manual graph operations."""
|
|
1387
|
-
import argparse
|
|
1388
|
-
|
|
1389
|
-
parser = argparse.ArgumentParser(description='GraphEngine - Knowledge Graph Management')
|
|
1390
|
-
parser.add_argument('command', choices=['build', 'stats', 'related', 'cluster', 'hierarchical', 'summaries'],
|
|
1391
|
-
help='Command to execute')
|
|
1392
|
-
parser.add_argument('--memory-id', type=int, help='Memory ID for related/add commands')
|
|
1393
|
-
parser.add_argument('--cluster-id', type=int, help='Cluster ID for cluster command')
|
|
1394
|
-
parser.add_argument('--min-similarity', type=float, default=0.3,
|
|
1395
|
-
help='Minimum similarity for edges (default: 0.3)')
|
|
1396
|
-
parser.add_argument('--hops', type=int, default=2, help='Max hops for related (default: 2)')
|
|
1397
|
-
|
|
1398
|
-
args = parser.parse_args()
|
|
1399
|
-
|
|
1400
|
-
engine = GraphEngine()
|
|
1401
|
-
|
|
1402
|
-
if args.command == 'build':
|
|
1403
|
-
print("Building knowledge graph...")
|
|
1404
|
-
stats = engine.build_graph(min_similarity=args.min_similarity)
|
|
1405
|
-
print(json.dumps(stats, indent=2))
|
|
1406
|
-
|
|
1407
|
-
elif args.command == 'stats':
|
|
1408
|
-
print("Graph Statistics:")
|
|
1409
|
-
stats = engine.get_stats()
|
|
1410
|
-
print(json.dumps(stats, indent=2))
|
|
1411
|
-
|
|
1412
|
-
elif args.command == 'related':
|
|
1413
|
-
if not args.memory_id:
|
|
1414
|
-
print("Error: --memory-id required for 'related' command")
|
|
1415
|
-
return
|
|
1416
|
-
|
|
1417
|
-
print(f"Finding memories related to #{args.memory_id}...")
|
|
1418
|
-
related = engine.get_related(args.memory_id, max_hops=args.hops)
|
|
1419
|
-
|
|
1420
|
-
if not related:
|
|
1421
|
-
print("No related memories found")
|
|
1422
|
-
else:
|
|
1423
|
-
for idx, mem in enumerate(related, 1):
|
|
1424
|
-
print(f"\n{idx}. Memory #{mem['id']} ({mem['hops']}-hop, weight={mem['weight']:.3f})")
|
|
1425
|
-
print(f" Relationship: {mem['relationship']}")
|
|
1426
|
-
summary = mem['summary'] or '[No summary]'
|
|
1427
|
-
print(f" Summary: {summary[:100]}...")
|
|
1428
|
-
if mem['shared_entities']:
|
|
1429
|
-
print(f" Shared: {', '.join(mem['shared_entities'][:5])}")
|
|
1430
|
-
|
|
1431
|
-
elif args.command == 'cluster':
|
|
1432
|
-
if not args.cluster_id:
|
|
1433
|
-
print("Error: --cluster-id required for 'cluster' command")
|
|
1434
|
-
return
|
|
1435
|
-
|
|
1436
|
-
print(f"Cluster #{args.cluster_id} members:")
|
|
1437
|
-
members = engine.get_cluster_members(args.cluster_id)
|
|
1438
|
-
|
|
1439
|
-
for idx, mem in enumerate(members, 1):
|
|
1440
|
-
print(f"\n{idx}. Memory #{mem['id']} (importance={mem['importance']})")
|
|
1441
|
-
summary = mem['summary'] or '[No summary]'
|
|
1442
|
-
print(f" {summary[:100]}...")
|
|
1443
|
-
|
|
1444
|
-
elif args.command == 'hierarchical':
|
|
1445
|
-
print("Running hierarchical sub-clustering...")
|
|
1446
|
-
cluster_builder = ClusterBuilder(engine.db_path)
|
|
1447
|
-
stats = cluster_builder.hierarchical_cluster()
|
|
1448
|
-
print(json.dumps(stats, indent=2))
|
|
1449
|
-
|
|
1450
|
-
elif args.command == 'summaries':
|
|
1451
|
-
print("Generating cluster summaries...")
|
|
1452
|
-
cluster_builder = ClusterBuilder(engine.db_path)
|
|
1453
|
-
count = cluster_builder.generate_cluster_summaries()
|
|
1454
|
-
print(f"Generated summaries for {count} clusters")
|
|
1455
36
|
|
|
37
|
+
__all__ = [
|
|
38
|
+
"MAX_MEMORIES_FOR_GRAPH",
|
|
39
|
+
"SKLEARN_AVAILABLE",
|
|
40
|
+
"IGRAPH_AVAILABLE",
|
|
41
|
+
"MEMORY_DIR",
|
|
42
|
+
"DB_PATH",
|
|
43
|
+
"EntityExtractor",
|
|
44
|
+
"ClusterNamer",
|
|
45
|
+
"EdgeBuilder",
|
|
46
|
+
"ClusterBuilder",
|
|
47
|
+
"GraphEngine",
|
|
48
|
+
"main",
|
|
49
|
+
]
|
|
1456
50
|
|
|
1457
51
|
if __name__ == '__main__':
|
|
1458
52
|
main()
|