superlocalmemory 2.7.6 → 2.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170) hide show
  1. package/CHANGELOG.md +120 -155
  2. package/README.md +115 -89
  3. package/api_server.py +2 -12
  4. package/docs/PATTERN-LEARNING.md +64 -199
  5. package/docs/example_graph_usage.py +4 -6
  6. package/install.sh +59 -0
  7. package/mcp_server.py +83 -7
  8. package/package.json +1 -8
  9. package/scripts/generate-thumbnails.py +3 -5
  10. package/skills/slm-build-graph/SKILL.md +1 -1
  11. package/skills/slm-list-recent/SKILL.md +1 -1
  12. package/skills/slm-recall/SKILL.md +1 -1
  13. package/skills/slm-remember/SKILL.md +1 -1
  14. package/skills/slm-show-patterns/SKILL.md +1 -1
  15. package/skills/slm-status/SKILL.md +1 -1
  16. package/skills/slm-switch-profile/SKILL.md +1 -1
  17. package/src/agent_registry.py +7 -18
  18. package/src/auth_middleware.py +3 -5
  19. package/src/auto_backup.py +3 -7
  20. package/src/behavioral/__init__.py +49 -0
  21. package/src/behavioral/behavioral_listener.py +203 -0
  22. package/src/behavioral/behavioral_patterns.py +275 -0
  23. package/src/behavioral/cross_project_transfer.py +206 -0
  24. package/src/behavioral/outcome_inference.py +194 -0
  25. package/src/behavioral/outcome_tracker.py +193 -0
  26. package/src/behavioral/tests/__init__.py +4 -0
  27. package/src/behavioral/tests/test_behavioral_integration.py +108 -0
  28. package/src/behavioral/tests/test_behavioral_patterns.py +150 -0
  29. package/src/behavioral/tests/test_cross_project_transfer.py +142 -0
  30. package/src/behavioral/tests/test_mcp_behavioral.py +139 -0
  31. package/src/behavioral/tests/test_mcp_report_outcome.py +117 -0
  32. package/src/behavioral/tests/test_outcome_inference.py +107 -0
  33. package/src/behavioral/tests/test_outcome_tracker.py +96 -0
  34. package/src/cache_manager.py +4 -6
  35. package/src/compliance/__init__.py +48 -0
  36. package/src/compliance/abac_engine.py +149 -0
  37. package/src/compliance/abac_middleware.py +116 -0
  38. package/src/compliance/audit_db.py +215 -0
  39. package/src/compliance/audit_logger.py +148 -0
  40. package/src/compliance/retention_manager.py +289 -0
  41. package/src/compliance/retention_scheduler.py +186 -0
  42. package/src/compliance/tests/__init__.py +4 -0
  43. package/src/compliance/tests/test_abac_enforcement.py +95 -0
  44. package/src/compliance/tests/test_abac_engine.py +124 -0
  45. package/src/compliance/tests/test_abac_mcp_integration.py +118 -0
  46. package/src/compliance/tests/test_audit_db.py +123 -0
  47. package/src/compliance/tests/test_audit_logger.py +98 -0
  48. package/src/compliance/tests/test_mcp_audit.py +128 -0
  49. package/src/compliance/tests/test_mcp_retention_policy.py +125 -0
  50. package/src/compliance/tests/test_retention_manager.py +131 -0
  51. package/src/compliance/tests/test_retention_scheduler.py +99 -0
  52. package/src/db_connection_manager.py +2 -12
  53. package/src/embedding_engine.py +61 -669
  54. package/src/embeddings/__init__.py +47 -0
  55. package/src/embeddings/cache.py +70 -0
  56. package/src/embeddings/cli.py +113 -0
  57. package/src/embeddings/constants.py +47 -0
  58. package/src/embeddings/database.py +91 -0
  59. package/src/embeddings/engine.py +247 -0
  60. package/src/embeddings/model_loader.py +145 -0
  61. package/src/event_bus.py +3 -13
  62. package/src/graph/__init__.py +36 -0
  63. package/src/graph/build_helpers.py +74 -0
  64. package/src/graph/cli.py +87 -0
  65. package/src/graph/cluster_builder.py +188 -0
  66. package/src/graph/cluster_summary.py +148 -0
  67. package/src/graph/constants.py +47 -0
  68. package/src/graph/edge_builder.py +162 -0
  69. package/src/graph/entity_extractor.py +95 -0
  70. package/src/graph/graph_core.py +226 -0
  71. package/src/graph/graph_search.py +231 -0
  72. package/src/graph/hierarchical.py +207 -0
  73. package/src/graph/schema.py +99 -0
  74. package/src/graph_engine.py +45 -1451
  75. package/src/hnsw_index.py +3 -7
  76. package/src/hybrid_search.py +36 -683
  77. package/src/learning/__init__.py +27 -12
  78. package/src/learning/adaptive_ranker.py +50 -12
  79. package/src/learning/cross_project_aggregator.py +2 -12
  80. package/src/learning/engagement_tracker.py +2 -12
  81. package/src/learning/feature_extractor.py +175 -43
  82. package/src/learning/feedback_collector.py +7 -12
  83. package/src/learning/learning_db.py +180 -12
  84. package/src/learning/project_context_manager.py +2 -12
  85. package/src/learning/source_quality_scorer.py +2 -12
  86. package/src/learning/synthetic_bootstrap.py +2 -12
  87. package/src/learning/tests/__init__.py +2 -0
  88. package/src/learning/tests/test_adaptive_ranker.py +2 -6
  89. package/src/learning/tests/test_adaptive_ranker_v28.py +60 -0
  90. package/src/learning/tests/test_aggregator.py +2 -6
  91. package/src/learning/tests/test_auto_retrain_v28.py +35 -0
  92. package/src/learning/tests/test_e2e_ranking_v28.py +82 -0
  93. package/src/learning/tests/test_feature_extractor_v28.py +93 -0
  94. package/src/learning/tests/test_feedback_collector.py +2 -6
  95. package/src/learning/tests/test_learning_db.py +2 -6
  96. package/src/learning/tests/test_learning_db_v28.py +110 -0
  97. package/src/learning/tests/test_learning_init_v28.py +48 -0
  98. package/src/learning/tests/test_outcome_signals.py +48 -0
  99. package/src/learning/tests/test_project_context.py +2 -6
  100. package/src/learning/tests/test_schema_migration.py +319 -0
  101. package/src/learning/tests/test_signal_inference.py +11 -13
  102. package/src/learning/tests/test_source_quality.py +2 -6
  103. package/src/learning/tests/test_synthetic_bootstrap.py +3 -7
  104. package/src/learning/tests/test_workflow_miner.py +2 -6
  105. package/src/learning/workflow_pattern_miner.py +2 -12
  106. package/src/lifecycle/__init__.py +54 -0
  107. package/src/lifecycle/bounded_growth.py +239 -0
  108. package/src/lifecycle/compaction_engine.py +226 -0
  109. package/src/lifecycle/lifecycle_engine.py +302 -0
  110. package/src/lifecycle/lifecycle_evaluator.py +225 -0
  111. package/src/lifecycle/lifecycle_scheduler.py +130 -0
  112. package/src/lifecycle/retention_policy.py +285 -0
  113. package/src/lifecycle/tests/__init__.py +4 -0
  114. package/src/lifecycle/tests/test_bounded_growth.py +193 -0
  115. package/src/lifecycle/tests/test_compaction.py +179 -0
  116. package/src/lifecycle/tests/test_lifecycle_engine.py +137 -0
  117. package/src/lifecycle/tests/test_lifecycle_evaluation.py +177 -0
  118. package/src/lifecycle/tests/test_lifecycle_scheduler.py +127 -0
  119. package/src/lifecycle/tests/test_lifecycle_search.py +109 -0
  120. package/src/lifecycle/tests/test_mcp_compact.py +149 -0
  121. package/src/lifecycle/tests/test_mcp_lifecycle_status.py +114 -0
  122. package/src/lifecycle/tests/test_retention_policy.py +162 -0
  123. package/src/mcp_tools_v28.py +280 -0
  124. package/src/memory-profiles.py +2 -12
  125. package/src/memory-reset.py +2 -12
  126. package/src/memory_compression.py +2 -12
  127. package/src/memory_store_v2.py +76 -20
  128. package/src/migrate_v1_to_v2.py +2 -12
  129. package/src/pattern_learner.py +29 -975
  130. package/src/patterns/__init__.py +24 -0
  131. package/src/patterns/analyzers.py +247 -0
  132. package/src/patterns/learner.py +267 -0
  133. package/src/patterns/scoring.py +167 -0
  134. package/src/patterns/store.py +223 -0
  135. package/src/patterns/terminology.py +138 -0
  136. package/src/provenance_tracker.py +4 -14
  137. package/src/query_optimizer.py +4 -6
  138. package/src/rate_limiter.py +2 -6
  139. package/src/search/__init__.py +20 -0
  140. package/src/search/cli.py +77 -0
  141. package/src/search/constants.py +26 -0
  142. package/src/search/engine.py +239 -0
  143. package/src/search/fusion.py +122 -0
  144. package/src/search/index_loader.py +112 -0
  145. package/src/search/methods.py +162 -0
  146. package/src/search_engine_v2.py +4 -6
  147. package/src/setup_validator.py +7 -13
  148. package/src/subscription_manager.py +2 -12
  149. package/src/tree/__init__.py +59 -0
  150. package/src/tree/builder.py +183 -0
  151. package/src/tree/nodes.py +196 -0
  152. package/src/tree/queries.py +252 -0
  153. package/src/tree/schema.py +76 -0
  154. package/src/tree_manager.py +10 -711
  155. package/src/trust/__init__.py +45 -0
  156. package/src/trust/constants.py +66 -0
  157. package/src/trust/queries.py +157 -0
  158. package/src/trust/schema.py +95 -0
  159. package/src/trust/scorer.py +299 -0
  160. package/src/trust/signals.py +95 -0
  161. package/src/trust_scorer.py +39 -697
  162. package/src/webhook_dispatcher.py +2 -12
  163. package/ui/app.js +1 -1
  164. package/ui/js/agents.js +1 -1
  165. package/ui_server.py +2 -14
  166. package/ATTRIBUTION.md +0 -140
  167. package/docs/ARCHITECTURE-V2.5.md +0 -190
  168. package/docs/GRAPH-ENGINE.md +0 -503
  169. package/docs/architecture-diagram.drawio +0 -405
  170. package/docs/plans/2026-02-13-benchmark-suite.md +0 -1349
@@ -1,1458 +1,52 @@
1
1
  #!/usr/bin/env python3
2
+ # SPDX-License-Identifier: MIT
3
+ # Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
4
+ """GraphEngine - Knowledge Graph Clustering for SuperLocalMemory V2
5
+
6
+ BACKWARD-COMPATIBILITY SHIM
7
+ ----------------------------
8
+ This file re-exports every public symbol from the ``graph`` package so that
9
+ existing code using ``from graph_engine import GraphEngine`` (or any other
10
+ name) continues to work without modification.
11
+
12
+ The actual implementation now lives in:
13
+ src/graph/constants.py - Shared imports, constants, logger
14
+ src/graph/entity_extractor.py - EntityExtractor, ClusterNamer
15
+ src/graph/edge_builder.py - EdgeBuilder
16
+ src/graph/cluster_builder.py - ClusterBuilder
17
+ src/graph/graph_core.py - GraphEngine, main()
2
18
  """
3
- GraphEngine - Knowledge Graph Clustering for SuperLocalMemory V2
4
-
5
- Copyright (c) 2026 Varun Pratap Bhardwaj
6
- Licensed under MIT License
7
- Repository: https://github.com/varun369/SuperLocalMemoryV2
8
-
9
- Implements GraphRAG with Leiden community detection to:
10
- - Extract entities from memories (TF-IDF keyword extraction)
11
- - Build similarity-based edges between memories
12
- - Detect thematic clusters using Leiden algorithm
13
- - Enable graph traversal for related memory discovery
14
-
15
- All processing is local - no external APIs.
16
-
17
- LIMITS:
18
- - MAX_MEMORIES_FOR_GRAPH: 10000 (prevents O(n²) explosion)
19
- - For larger datasets, use incremental updates
20
- """
21
-
22
- # SECURITY: Graph build limits to prevent resource exhaustion
23
- MAX_MEMORIES_FOR_GRAPH = 10000
24
-
25
- import sqlite3
26
- import json
27
- import time
28
- import logging
29
- from datetime import datetime
30
- from pathlib import Path
31
- from typing import List, Dict, Optional, Tuple, Set
32
- from collections import Counter
33
-
34
- # Core dependencies
35
- try:
36
- from sklearn.feature_extraction.text import TfidfVectorizer
37
- from sklearn.metrics.pairwise import cosine_similarity
38
- import numpy as np
39
- SKLEARN_AVAILABLE = True
40
- except ImportError:
41
- SKLEARN_AVAILABLE = False
42
- raise ImportError("scikit-learn is required. Install: pip install scikit-learn")
43
-
44
- # Graph dependencies - lazy import to avoid conflicts with compression module
45
- IGRAPH_AVAILABLE = False
46
- try:
47
- # Import only when needed to avoid module conflicts
48
- import importlib
49
- ig_module = importlib.import_module('igraph')
50
- leiden_module = importlib.import_module('leidenalg')
51
- IGRAPH_AVAILABLE = True
52
- except ImportError:
53
- pass # Will raise error when building clusters if not available
54
-
55
- # Setup logging
56
- logging.basicConfig(
57
- level=logging.INFO,
58
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
19
+ # Re-export everything from the graph package
20
+ from graph import (
21
+ # Constants
22
+ MAX_MEMORIES_FOR_GRAPH,
23
+ SKLEARN_AVAILABLE,
24
+ IGRAPH_AVAILABLE,
25
+ MEMORY_DIR,
26
+ DB_PATH,
27
+ # Classes
28
+ EntityExtractor,
29
+ ClusterNamer,
30
+ EdgeBuilder,
31
+ ClusterBuilder,
32
+ GraphEngine,
33
+ # Functions
34
+ main,
59
35
  )
60
- logger = logging.getLogger(__name__)
61
-
62
- MEMORY_DIR = Path.home() / ".claude-memory"
63
- DB_PATH = MEMORY_DIR / "memory.db"
64
-
65
-
66
- class EntityExtractor:
67
- """Extract key entities/concepts from memory content using TF-IDF."""
68
-
69
- def __init__(self, max_features: int = 20, min_df: int = 1):
70
- """
71
- Initialize entity extractor.
72
-
73
- Args:
74
- max_features: Top N keywords to extract per memory
75
- min_df: Minimum document frequency (ignore very rare terms)
76
- """
77
- self.max_features = max_features
78
- self.vectorizer = TfidfVectorizer(
79
- max_features=max_features,
80
- stop_words='english',
81
- ngram_range=(1, 2), # Unigrams + bigrams
82
- min_df=min_df,
83
- lowercase=True,
84
- token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z0-9_-]*\b' # Alphanumeric tokens
85
- )
86
-
87
- def extract_entities(self, contents: List[str]) -> Tuple[List[List[str]], np.ndarray]:
88
- """
89
- Extract entities from multiple contents.
90
-
91
- Args:
92
- contents: List of memory content strings
93
-
94
- Returns:
95
- Tuple of (entities_per_content, tfidf_vectors)
96
- """
97
- if not contents:
98
- return [], np.array([])
99
-
100
- try:
101
- # Fit and transform all contents
102
- vectors = self.vectorizer.fit_transform(contents)
103
- feature_names = self.vectorizer.get_feature_names_out()
104
-
105
- # Extract top entities for each content
106
- all_entities = []
107
- for idx in range(len(contents)):
108
- scores = vectors[idx].toarray()[0]
109
-
110
- # Get indices of top features
111
- top_indices = np.argsort(scores)[::-1]
112
-
113
- # Extract entities with score > 0
114
- entities = [
115
- feature_names[i]
116
- for i in top_indices
117
- if scores[i] > 0.05 # Minimum threshold
118
- ][:self.max_features]
119
-
120
- all_entities.append(entities)
121
-
122
- return all_entities, vectors.toarray()
123
-
124
- except Exception as e:
125
- logger.error(f"Entity extraction failed: {e}")
126
- return [[] for _ in contents], np.zeros((len(contents), 1))
127
-
128
-
129
- class EdgeBuilder:
130
- """Build similarity edges between memories based on entity overlap."""
131
-
132
- def __init__(self, db_path: Path, min_similarity: float = 0.3):
133
- """
134
- Initialize edge builder.
135
-
136
- Args:
137
- db_path: Path to SQLite database
138
- min_similarity: Minimum cosine similarity to create edge
139
- """
140
- self.db_path = db_path
141
- self.min_similarity = min_similarity
142
-
143
- def build_edges(self, memory_ids: List[int], vectors: np.ndarray,
144
- entities_list: List[List[str]]) -> int:
145
- """
146
- Build edges between similar memories.
147
-
148
- Args:
149
- memory_ids: List of memory IDs
150
- vectors: TF-IDF vectors (n x features)
151
- entities_list: List of entity lists per memory
152
-
153
- Returns:
154
- Number of edges created
155
- """
156
- if len(memory_ids) < 2:
157
- logger.warning("Need at least 2 memories to build edges")
158
- return 0
159
-
160
- # Try HNSW-accelerated edge building first (O(n log n))
161
- use_hnsw = False
162
- try:
163
- from hnsw_index import HNSWIndex
164
- if len(memory_ids) >= 50: # HNSW overhead not worth it for small sets
165
- use_hnsw = True
166
- except ImportError:
167
- pass
168
-
169
- edges_added = 0
170
- conn = sqlite3.connect(self.db_path)
171
- cursor = conn.cursor()
172
-
173
- try:
174
- if use_hnsw:
175
- logger.info("Using HNSW-accelerated edge building for %d memories", len(memory_ids))
176
- try:
177
- dim = vectors.shape[1]
178
- hnsw = HNSWIndex(dimension=dim, max_elements=len(memory_ids))
179
- hnsw.build(vectors, memory_ids)
180
-
181
- for i in range(len(memory_ids)):
182
- neighbors = hnsw.search(vectors[i], k=min(20, len(memory_ids) - 1))
183
- for neighbor_id, similarity in neighbors:
184
- if neighbor_id == memory_ids[i]:
185
- continue # Skip self
186
- # Only process each pair once (lower ID first)
187
- if memory_ids[i] > neighbor_id:
188
- continue
189
- if similarity >= self.min_similarity:
190
- # Find indices for entity lookup
191
- j = memory_ids.index(neighbor_id)
192
- entities_i = set(entities_list[i])
193
- entities_j = set(entities_list[j])
194
- shared = list(entities_i & entities_j)
195
- rel_type = self._classify_relationship(similarity, shared)
196
-
197
- cursor.execute('''
198
- INSERT OR REPLACE INTO graph_edges
199
- (source_memory_id, target_memory_id, relationship_type,
200
- weight, shared_entities, similarity_score)
201
- VALUES (?, ?, ?, ?, ?, ?)
202
- ''', (
203
- memory_ids[i], neighbor_id, rel_type,
204
- float(similarity), json.dumps(shared), float(similarity)
205
- ))
206
- edges_added += 1
207
-
208
- except Exception as e:
209
- logger.warning("HNSW edge building failed, falling back to O(n²): %s", e)
210
- use_hnsw = False # Fall through to O(n²) below
211
-
212
- if not use_hnsw:
213
- # Fallback: O(n²) pairwise cosine similarity
214
- similarity_matrix = cosine_similarity(vectors)
215
-
216
- for i in range(len(memory_ids)):
217
- for j in range(i + 1, len(memory_ids)):
218
- sim = similarity_matrix[i, j]
219
-
220
- if sim >= self.min_similarity:
221
- entities_i = set(entities_list[i])
222
- entities_j = set(entities_list[j])
223
- shared = list(entities_i & entities_j)
224
- rel_type = self._classify_relationship(sim, shared)
225
-
226
- cursor.execute('''
227
- INSERT OR REPLACE INTO graph_edges
228
- (source_memory_id, target_memory_id, relationship_type,
229
- weight, shared_entities, similarity_score)
230
- VALUES (?, ?, ?, ?, ?, ?)
231
- ''', (
232
- memory_ids[i], memory_ids[j], rel_type,
233
- float(sim), json.dumps(shared), float(sim)
234
- ))
235
- edges_added += 1
236
-
237
- conn.commit()
238
- logger.info(f"Created {edges_added} edges")
239
- return edges_added
240
-
241
- except Exception as e:
242
- logger.error(f"Edge building failed: {e}")
243
- conn.rollback()
244
- return 0
245
- finally:
246
- conn.close()
247
-
248
- def _classify_relationship(self, similarity: float, shared_entities: List[str]) -> str:
249
- """
250
- Classify edge type based on similarity and shared entities.
251
-
252
- Args:
253
- similarity: Cosine similarity score
254
- shared_entities: List of shared entity strings
255
-
256
- Returns:
257
- Relationship type: 'similar', 'depends_on', or 'related_to'
258
- """
259
- # Check for dependency keywords
260
- dependency_keywords = {'dependency', 'require', 'import', 'use', 'need'}
261
- has_dependency = any(
262
- any(kw in entity.lower() for kw in dependency_keywords)
263
- for entity in shared_entities
264
- )
265
-
266
- if similarity > 0.7:
267
- return 'similar'
268
- elif has_dependency:
269
- return 'depends_on'
270
- else:
271
- return 'related_to'
272
-
273
-
274
- class ClusterBuilder:
275
- """Detect memory communities using Leiden algorithm."""
276
-
277
- def __init__(self, db_path: Path):
278
- """Initialize cluster builder."""
279
- self.db_path = db_path
280
-
281
- def _get_active_profile(self) -> str:
282
- """Get the currently active profile name from config."""
283
- config_file = MEMORY_DIR / "profiles.json"
284
- if config_file.exists():
285
- try:
286
- with open(config_file, 'r') as f:
287
- config = json.load(f)
288
- return config.get('active_profile', 'default')
289
- except (json.JSONDecodeError, IOError):
290
- pass
291
- return 'default'
292
-
293
- def detect_communities(self) -> int:
294
- """
295
- Run Leiden algorithm to find memory clusters (active profile only).
296
-
297
- Returns:
298
- Number of clusters created
299
- """
300
- if not IGRAPH_AVAILABLE:
301
- logger.warning("igraph/leidenalg not installed. Graph clustering disabled. Install with: pip3 install python-igraph leidenalg")
302
- return 0
303
- import igraph as ig
304
- import leidenalg
305
-
306
- conn = sqlite3.connect(self.db_path)
307
- cursor = conn.cursor()
308
- active_profile = self._get_active_profile()
309
-
310
- try:
311
- # Load edges for active profile's memories only
312
- edges = cursor.execute('''
313
- SELECT ge.source_memory_id, ge.target_memory_id, ge.weight
314
- FROM graph_edges ge
315
- WHERE ge.source_memory_id IN (SELECT id FROM memories WHERE profile = ?)
316
- AND ge.target_memory_id IN (SELECT id FROM memories WHERE profile = ?)
317
- ''', (active_profile, active_profile)).fetchall()
318
-
319
- if not edges:
320
- logger.warning("No edges found - cannot build clusters")
321
- return 0
322
-
323
- # Build memory ID mapping
324
- memory_ids = set()
325
- for source, target, _ in edges:
326
- memory_ids.add(source)
327
- memory_ids.add(target)
328
-
329
- memory_ids = sorted(list(memory_ids))
330
- memory_id_to_vertex = {mid: idx for idx, mid in enumerate(memory_ids)}
331
- vertex_to_memory_id = {idx: mid for mid, idx in memory_id_to_vertex.items()}
332
-
333
- # Create igraph graph
334
- g = ig.Graph()
335
- g.add_vertices(len(memory_ids))
336
-
337
- # Add edges with weights
338
- edge_list = []
339
- edge_weights = []
340
-
341
- for source, target, weight in edges:
342
- edge_list.append((
343
- memory_id_to_vertex[source],
344
- memory_id_to_vertex[target]
345
- ))
346
- edge_weights.append(weight)
347
-
348
- g.add_edges(edge_list)
349
-
350
- # Run Leiden algorithm
351
- logger.info(f"Running Leiden on {len(memory_ids)} nodes, {len(edges)} edges")
352
- partition = leidenalg.find_partition(
353
- g,
354
- leidenalg.ModularityVertexPartition,
355
- weights=edge_weights,
356
- n_iterations=100,
357
- seed=42 # Reproducible
358
- )
359
-
360
- # Process communities
361
- clusters_created = 0
362
-
363
- for cluster_idx, community in enumerate(partition):
364
- if len(community) < 2: # Skip singleton clusters
365
- continue
366
-
367
- # Get memory IDs in this cluster
368
- cluster_memory_ids = [vertex_to_memory_id[v] for v in community]
369
-
370
- # Calculate cluster stats
371
- avg_importance = self._get_avg_importance(cursor, cluster_memory_ids)
372
-
373
- # Auto-generate cluster name
374
- cluster_name = self._generate_cluster_name(cursor, cluster_memory_ids)
375
-
376
- # Insert cluster
377
- result = cursor.execute('''
378
- INSERT INTO graph_clusters (name, member_count, avg_importance)
379
- VALUES (?, ?, ?)
380
- ''', (cluster_name, len(cluster_memory_ids), avg_importance))
381
-
382
- cluster_id = result.lastrowid
383
-
384
- # Update memories with cluster_id
385
- cursor.executemany('''
386
- UPDATE memories SET cluster_id = ? WHERE id = ?
387
- ''', [(cluster_id, mid) for mid in cluster_memory_ids])
388
-
389
- clusters_created += 1
390
- logger.info(f"Cluster {cluster_id}: '{cluster_name}' ({len(cluster_memory_ids)} members)")
391
-
392
- conn.commit()
393
- logger.info(f"Created {clusters_created} clusters")
394
- return clusters_created
395
-
396
- except Exception as e:
397
- logger.error(f"Community detection failed: {e}")
398
- conn.rollback()
399
- return 0
400
- finally:
401
- conn.close()
402
-
403
- def _get_avg_importance(self, cursor, memory_ids: List[int]) -> float:
404
- """Calculate average importance for cluster."""
405
- placeholders = ','.join('?' * len(memory_ids))
406
- result = cursor.execute(f'''
407
- SELECT AVG(importance) FROM memories WHERE id IN ({placeholders})
408
- ''', memory_ids).fetchone()
409
-
410
- return result[0] if result and result[0] else 5.0
411
-
412
- def _generate_cluster_name(self, cursor, memory_ids: List[int]) -> str:
413
- """Generate cluster name from member entities (TF-IDF approach)."""
414
- # Get all entities from cluster members
415
- placeholders = ','.join('?' * len(memory_ids))
416
- nodes = cursor.execute(f'''
417
- SELECT entities FROM graph_nodes WHERE memory_id IN ({placeholders})
418
- ''', memory_ids).fetchall()
419
-
420
- all_entities = []
421
- for node in nodes:
422
- if node[0]:
423
- all_entities.extend(json.loads(node[0]))
424
-
425
- if not all_entities:
426
- return f"Cluster (ID auto-assigned)"
427
-
428
- # Count entity frequencies
429
- entity_counts = Counter(all_entities)
430
-
431
- # Top 2-3 most common entities
432
- top_entities = [e for e, _ in entity_counts.most_common(3)]
433
-
434
- # Build name
435
- if len(top_entities) >= 2:
436
- name = f"{top_entities[0].title()} & {top_entities[1].title()}"
437
- elif len(top_entities) == 1:
438
- name = f"{top_entities[0].title()} Contexts"
439
- else:
440
- name = "Mixed Contexts"
441
-
442
- return name[:100] # Limit length
443
-
444
-
445
- def hierarchical_cluster(self, min_subcluster_size: int = 5, max_depth: int = 3) -> Dict[str, any]:
446
- """
447
- Run recursive Leiden clustering — cluster the clusters.
448
-
449
- Large communities (>= min_subcluster_size * 2) are recursively sub-clustered
450
- to reveal finer-grained thematic structure. E.g., "Python" → "FastAPI" → "Auth".
451
-
452
- Args:
453
- min_subcluster_size: Minimum members to attempt sub-clustering (default 5)
454
- max_depth: Maximum recursion depth (default 3)
455
-
456
- Returns:
457
- Dictionary with hierarchical clustering statistics
458
- """
459
- if not IGRAPH_AVAILABLE:
460
- logger.warning("igraph/leidenalg not installed. Hierarchical clustering disabled. Install with: pip3 install python-igraph leidenalg")
461
- return {'subclusters_created': 0, 'depth_reached': 0}
462
- import igraph as ig
463
- import leidenalg
464
-
465
- conn = sqlite3.connect(self.db_path)
466
- cursor = conn.cursor()
467
- active_profile = self._get_active_profile()
468
-
469
- try:
470
- # Get top-level clusters for this profile that are large enough to sub-cluster
471
- cursor.execute('''
472
- SELECT cluster_id, COUNT(*) as cnt
473
- FROM memories
474
- WHERE cluster_id IS NOT NULL AND profile = ?
475
- GROUP BY cluster_id
476
- HAVING cnt >= ?
477
- ''', (active_profile, min_subcluster_size * 2))
478
- large_clusters = cursor.fetchall()
479
-
480
- if not large_clusters:
481
- logger.info("No clusters large enough for hierarchical decomposition")
482
- return {'subclusters_created': 0, 'depth_reached': 0}
483
-
484
- total_subclusters = 0
485
- max_depth_reached = 0
486
-
487
- for parent_cid, member_count in large_clusters:
488
- subs, depth = self._recursive_subcluster(
489
- conn, cursor, parent_cid, active_profile,
490
- min_subcluster_size, max_depth, current_depth=1
491
- )
492
- total_subclusters += subs
493
- max_depth_reached = max(max_depth_reached, depth)
494
-
495
- conn.commit()
496
- logger.info(f"Hierarchical clustering: {total_subclusters} sub-clusters, depth {max_depth_reached}")
497
- return {
498
- 'subclusters_created': total_subclusters,
499
- 'depth_reached': max_depth_reached,
500
- 'parent_clusters_processed': len(large_clusters)
501
- }
502
-
503
- except Exception as e:
504
- logger.error(f"Hierarchical clustering failed: {e}")
505
- conn.rollback()
506
- return {'subclusters_created': 0, 'error': str(e)}
507
- finally:
508
- conn.close()
509
-
510
- def _recursive_subcluster(self, conn, cursor, parent_cluster_id: int,
511
- profile: str, min_size: int, max_depth: int,
512
- current_depth: int) -> Tuple[int, int]:
513
- """Recursively sub-cluster a community using Leiden."""
514
- if not IGRAPH_AVAILABLE:
515
- return 0, current_depth - 1
516
- import igraph as ig
517
- import leidenalg
518
-
519
- if current_depth > max_depth:
520
- return 0, current_depth - 1
521
-
522
- # Get memory IDs in this cluster
523
- cursor.execute('''
524
- SELECT id FROM memories
525
- WHERE cluster_id = ? AND profile = ?
526
- ''', (parent_cluster_id, profile))
527
- member_ids = [row[0] for row in cursor.fetchall()]
528
-
529
- if len(member_ids) < min_size * 2:
530
- return 0, current_depth - 1
531
-
532
- # Get edges between members of this cluster
533
- placeholders = ','.join('?' * len(member_ids))
534
- edges = cursor.execute(f'''
535
- SELECT source_memory_id, target_memory_id, weight
536
- FROM graph_edges
537
- WHERE source_memory_id IN ({placeholders})
538
- AND target_memory_id IN ({placeholders})
539
- ''', member_ids + member_ids).fetchall()
540
-
541
- if len(edges) < 2:
542
- return 0, current_depth - 1
543
-
544
- # Build sub-graph
545
- id_to_vertex = {mid: idx for idx, mid in enumerate(member_ids)}
546
- vertex_to_id = {idx: mid for mid, idx in id_to_vertex.items()}
547
-
548
- g = ig.Graph()
549
- g.add_vertices(len(member_ids))
550
- edge_list, edge_weights = [], []
551
- for src, tgt, w in edges:
552
- if src in id_to_vertex and tgt in id_to_vertex:
553
- edge_list.append((id_to_vertex[src], id_to_vertex[tgt]))
554
- edge_weights.append(w)
555
-
556
- if not edge_list:
557
- return 0, current_depth - 1
558
-
559
- g.add_edges(edge_list)
560
-
561
- # Run Leiden with higher resolution for finer communities
562
- partition = leidenalg.find_partition(
563
- g, leidenalg.ModularityVertexPartition,
564
- weights=edge_weights, n_iterations=100, seed=42
565
- )
566
-
567
- # Only proceed if Leiden found > 1 community (actual split)
568
- non_singleton = [c for c in partition if len(c) >= 2]
569
- if len(non_singleton) <= 1:
570
- return 0, current_depth - 1
571
-
572
- subclusters_created = 0
573
- deepest = current_depth
574
-
575
- # Get parent depth
576
- cursor.execute('SELECT depth FROM graph_clusters WHERE id = ?', (parent_cluster_id,))
577
- parent_row = cursor.fetchone()
578
- parent_depth = parent_row[0] if parent_row else 0
579
-
580
- for community in non_singleton:
581
- sub_member_ids = [vertex_to_id[v] for v in community]
582
-
583
- if len(sub_member_ids) < 2:
584
- continue
585
-
586
- avg_imp = self._get_avg_importance(cursor, sub_member_ids)
587
- cluster_name = self._generate_cluster_name(cursor, sub_member_ids)
588
-
589
- result = cursor.execute('''
590
- INSERT INTO graph_clusters (name, member_count, avg_importance, parent_cluster_id, depth)
591
- VALUES (?, ?, ?, ?, ?)
592
- ''', (cluster_name, len(sub_member_ids), avg_imp, parent_cluster_id, parent_depth + 1))
593
-
594
- sub_cluster_id = result.lastrowid
595
-
596
- # Update memories to point to sub-cluster
597
- cursor.executemany('''
598
- UPDATE memories SET cluster_id = ? WHERE id = ?
599
- ''', [(sub_cluster_id, mid) for mid in sub_member_ids])
600
-
601
- subclusters_created += 1
602
- logger.info(f"Sub-cluster {sub_cluster_id} under {parent_cluster_id}: "
603
- f"'{cluster_name}' ({len(sub_member_ids)} members, depth {parent_depth + 1})")
604
-
605
- # Recurse into this sub-cluster if large enough
606
- child_subs, child_depth = self._recursive_subcluster(
607
- conn, cursor, sub_cluster_id, profile,
608
- min_size, max_depth, current_depth + 1
609
- )
610
- subclusters_created += child_subs
611
- deepest = max(deepest, child_depth)
612
-
613
- return subclusters_created, deepest
614
-
615
def generate_cluster_summaries(self) -> int:
    """
    Write a structured summary onto every cluster of the active profile.

    Selects the clusters that have at least one member memory in the active
    profile, asks ``_build_cluster_summary`` for each one's text, and stores
    every non-empty result in ``graph_clusters.summary`` (also refreshing
    ``updated_at``).

    Returns:
        Number of clusters whose summary was written. 0 is returned when no
        cluster matches, and also when an exception caused a rollback.
    """
    connection = sqlite3.connect(self.db_path)
    db = connection.cursor()
    profile = self._get_active_profile()

    try:
        # A cluster qualifies as soon as one memory of this profile points at it.
        cluster_rows = db.execute('''
            SELECT DISTINCT gc.id, gc.name, gc.member_count
            FROM graph_clusters gc
            JOIN memories m ON m.cluster_id = gc.id
            WHERE m.profile = ?
        ''', (profile,)).fetchall()

        if not cluster_rows:
            return 0

        written = 0
        for cid, cname, _member_count in cluster_rows:
            text = self._build_cluster_summary(db, cid, profile)
            if not text:
                # An empty summary means the cluster had no members to describe.
                continue

            db.execute('''
                UPDATE graph_clusters SET summary = ?, updated_at = CURRENT_TIMESTAMP
                WHERE id = ?
            ''', (text, cid))
            written += 1
            logger.info(f"Summary for cluster {cid} ({cname}): {text[:80]}...")

        connection.commit()
        logger.info(f"Generated {written} cluster summaries")
        return written

    except Exception as exc:
        # Any failure discards the partial updates and is reported as "0 written".
        logger.error(f"Summary generation failed: {exc}")
        connection.rollback()
        return 0
    finally:
        connection.close()
664
-
665
- def _build_cluster_summary(self, cursor, cluster_id: int, profile: str) -> str:
666
- """Build a TF-IDF structured summary for a single cluster."""
667
- # Get member content
668
- cursor.execute('''
669
- SELECT m.content, m.summary, m.tags, m.category, m.project_name
670
- FROM memories m
671
- WHERE m.cluster_id = ? AND m.profile = ?
672
- ''', (cluster_id, profile))
673
- members = cursor.fetchall()
674
-
675
- if not members:
676
- return ""
677
-
678
- # Collect entities from graph nodes
679
- cursor.execute('''
680
- SELECT gn.entities
681
- FROM graph_nodes gn
682
- JOIN memories m ON gn.memory_id = m.id
683
- WHERE m.cluster_id = ? AND m.profile = ?
684
- ''', (cluster_id, profile))
685
- all_entities = []
686
- for row in cursor.fetchall():
687
- if row[0]:
688
- try:
689
- all_entities.extend(json.loads(row[0]))
690
- except (json.JSONDecodeError, TypeError):
691
- pass
692
-
693
- # Top entities by frequency (TF-IDF already extracted these)
694
- entity_counts = Counter(all_entities)
695
- top_entities = [e for e, _ in entity_counts.most_common(5)]
696
-
697
- # Collect unique projects and categories
698
- projects = set()
699
- categories = set()
700
- for m in members:
701
- if m[3]: # category
702
- categories.add(m[3])
703
- if m[4]: # project_name
704
- projects.add(m[4])
705
-
706
- # Build structured summary
707
- parts = []
708
-
709
- # Theme from top entities
710
- if top_entities:
711
- parts.append(f"Key topics: {', '.join(top_entities[:5])}")
712
-
713
- # Scope
714
- if projects:
715
- parts.append(f"Projects: {', '.join(sorted(projects)[:3])}")
716
- if categories:
717
- parts.append(f"Categories: {', '.join(sorted(categories)[:3])}")
718
-
719
- # Size context
720
- parts.append(f"{len(members)} memories")
721
-
722
- # Check for hierarchical context
723
- cursor.execute('SELECT parent_cluster_id FROM graph_clusters WHERE id = ?', (cluster_id,))
724
- parent_row = cursor.fetchone()
725
- if parent_row and parent_row[0]:
726
- cursor.execute('SELECT name FROM graph_clusters WHERE id = ?', (parent_row[0],))
727
- parent_name_row = cursor.fetchone()
728
- if parent_name_row:
729
- parts.append(f"Sub-cluster of: {parent_name_row[0]}")
730
-
731
- return " | ".join(parts)
732
-
733
-
734
class ClusterNamer:
    """Cluster naming helpers; LLM-backed naming is a possible future addition."""

    @staticmethod
    def generate_name_tfidf(entities: List[str]) -> str:
        """
        Derive a display name from a cluster's entity list.

        The two most frequent entities are title-cased and joined with ``&``.
        With a single distinct entity the name is ``"<Entity> Contexts"``, and
        an empty list yields ``"Unnamed Cluster"``.
        """
        if not entities:
            return "Unnamed Cluster"

        ranked = Counter(entities).most_common(2)
        leader = ranked[0][0].title()

        if len(ranked) < 2:
            return f"{leader} Contexts"

        runner_up = ranked[1][0].title()
        return f"{leader} & {runner_up}"
750
-
751
-
752
class GraphEngine:
    """Main graph engine coordinating all graph operations.

    Wires together an ``EntityExtractor``, an ``EdgeBuilder`` and a
    ``ClusterBuilder`` over the SQLite database at ``db_path`` and exposes
    full builds, incremental additions and read-only graph queries.
    """

    # Maximum number of ids bound into a single ``IN (...)`` list. The edge
    # cleanup binds every id twice, so 400 keeps each statement below SQLite's
    # historical default limit of 999 host parameters.
    _ID_BATCH_SIZE = 400

    def __init__(self, db_path: Path = DB_PATH):
        """Initialize graph engine and make sure the graph tables exist."""
        self.db_path = db_path
        self.entity_extractor = EntityExtractor(max_features=20)
        self.edge_builder = EdgeBuilder(db_path)
        self.cluster_builder = ClusterBuilder(db_path)
        self._ensure_graph_tables()

    def _get_active_profile(self) -> str:
        """Get the currently active profile name from config.

        Reads ``active_profile`` from ``profiles.json`` in ``MEMORY_DIR`` and
        falls back to ``'default'`` when the file is missing, unreadable or
        not valid JSON.
        """
        config_file = MEMORY_DIR / "profiles.json"
        if config_file.exists():
            try:
                with open(config_file, 'r') as f:
                    config = json.load(f)
                return config.get('active_profile', 'default')
            except (json.JSONDecodeError, IOError):
                pass
        return 'default'

    def _ensure_graph_tables(self):
        """Create graph tables if they don't exist, or recreate if schema is incomplete.

        A graph table that exists but lacks one of its required columns is
        dropped and recreated. Columns added in later schema versions are
        appended with ``ALTER TABLE``; the "column already exists" error is
        ignored on purpose.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            # Check if existing tables have correct schema (not just id column)
            for table_name, required_cols in [
                ('graph_nodes', {'memory_id', 'entities'}),
                ('graph_edges', {'source_memory_id', 'target_memory_id', 'weight'}),
                ('graph_clusters', {'name', 'member_count'}),
            ]:
                # Table names come from the literal list above, never from user input.
                cursor.execute(f"PRAGMA table_info({table_name})")
                existing_cols = {row[1] for row in cursor.fetchall()}
                if existing_cols and not required_cols.issubset(existing_cols):
                    # Table exists but has incomplete schema — drop and recreate
                    logger.warning(f"Dropping incomplete {table_name} table (missing: {required_cols - existing_cols})")
                    cursor.execute(f'DROP TABLE IF EXISTS {table_name}')

            # Graph nodes table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS graph_nodes (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    memory_id INTEGER UNIQUE NOT NULL,
                    entities TEXT,
                    embedding_vector TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (memory_id) REFERENCES memories(id) ON DELETE CASCADE
                )
            ''')

            # Graph edges table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS graph_edges (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    source_memory_id INTEGER NOT NULL,
                    target_memory_id INTEGER NOT NULL,
                    relationship_type TEXT,
                    weight REAL DEFAULT 1.0,
                    shared_entities TEXT,
                    similarity_score REAL,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (source_memory_id) REFERENCES memories(id) ON DELETE CASCADE,
                    FOREIGN KEY (target_memory_id) REFERENCES memories(id) ON DELETE CASCADE,
                    UNIQUE(source_memory_id, target_memory_id)
                )
            ''')

            # Graph clusters table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS graph_clusters (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL,
                    description TEXT,
                    summary TEXT,
                    member_count INTEGER DEFAULT 0,
                    avg_importance REAL,
                    parent_cluster_id INTEGER,
                    depth INTEGER DEFAULT 0,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (parent_cluster_id) REFERENCES graph_clusters(id) ON DELETE SET NULL
                )
            ''')

            # Safe column additions for existing databases
            for col, col_type in [('summary', 'TEXT'), ('parent_cluster_id', 'INTEGER'), ('depth', 'INTEGER DEFAULT 0')]:
                try:
                    cursor.execute(f'ALTER TABLE graph_clusters ADD COLUMN {col} {col_type}')
                except sqlite3.OperationalError:
                    pass  # Column already exists

            # Add cluster_id to memories if not exists
            try:
                cursor.execute('ALTER TABLE memories ADD COLUMN cluster_id INTEGER')
            except sqlite3.OperationalError:
                pass  # Column already exists

            # Create indexes
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_graph_source ON graph_edges(source_memory_id)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_graph_target ON graph_edges(target_memory_id)')
            cursor.execute('CREATE INDEX IF NOT EXISTS idx_cluster_members ON memories(cluster_id)')

            conn.commit()
        finally:
            # Release the connection even when a schema statement fails.
            conn.close()

        logger.info("Graph tables initialized")

    @staticmethod
    def _sample_memories(cursor, profile: str) -> list:
        """Pick at most MAX_MEMORIES_FOR_GRAPH rows: 60% most recent plus 40% most important, de-duplicated by id."""
        recent_count = int(MAX_MEMORIES_FOR_GRAPH * 0.6)
        important_count = int(MAX_MEMORIES_FOR_GRAPH * 0.4)

        recent_memories = cursor.execute('''
            SELECT id, content, summary FROM memories
            WHERE profile = ?
            ORDER BY created_at DESC
            LIMIT ?
        ''', (profile, recent_count)).fetchall()

        important_memories = cursor.execute('''
            SELECT id, content, summary FROM memories
            WHERE profile = ?
            ORDER BY importance DESC, access_count DESC
            LIMIT ?
        ''', (profile, important_count)).fetchall()

        # Deduplicate by ID, preserving order (recent rows win over important ones)
        seen_ids = set()
        sampled = []
        for row in recent_memories + important_memories:
            if row[0] not in seen_ids:
                seen_ids.add(row[0])
                sampled.append(row)
        return sampled[:MAX_MEMORIES_FOR_GRAPH]

    def _clear_graph_data(self, cursor, profile: str, memory_ids: list) -> None:
        """Delete stored nodes/edges for ``memory_ids`` and drop the profile's old cluster assignments."""
        # Batched so the number of bound parameters per statement stays bounded.
        for start in range(0, len(memory_ids), self._ID_BATCH_SIZE):
            batch = memory_ids[start:start + self._ID_BATCH_SIZE]
            placeholders = ','.join('?' * len(batch))
            cursor.execute(f'''
                DELETE FROM graph_edges
                WHERE source_memory_id IN ({placeholders})
                   OR target_memory_id IN ({placeholders})
            ''', batch + batch)
            cursor.execute(f'''
                DELETE FROM graph_nodes
                WHERE memory_id IN ({placeholders})
            ''', batch)

        # Detach the profile's memories first so that its previous clusters
        # count as orphaned in the cleanup below.
        cursor.execute('UPDATE memories SET cluster_id = NULL WHERE profile = ?',
                       (profile,))

        # Remove orphaned clusters (no remaining members)
        cursor.execute('''
            DELETE FROM graph_clusters
            WHERE id NOT IN (
                SELECT DISTINCT cluster_id FROM memories
                WHERE cluster_id IS NOT NULL
            )
        ''')

    def build_graph(self, min_similarity: float = 0.3) -> Dict[str, any]:
        """
        Build complete knowledge graph from the active profile's memories.

        When the profile holds more than ``MAX_MEMORIES_FOR_GRAPH`` memories,
        the build runs on a sample (60% most recent, 40% most important)
        instead of failing.

        Args:
            min_similarity: Minimum cosine similarity for edge creation. It is
                set on the edge builder for the duration of this build and the
                previous threshold is restored afterwards.

        Returns:
            Dictionary with build statistics on success. On failure a
            dictionary with ``'success': False`` and an ``'error'`` entry is
            returned; exceptions raised while building are caught, logged and
            reported through that dictionary.
        """
        start_time = time.time()
        logger.info("Starting full graph build...")

        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        try:
            # First check if required tables exist
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            existing_tables = {row[0] for row in cursor.fetchall()}

            required_tables = {'memories', 'graph_edges', 'graph_nodes', 'graph_clusters'}
            missing_tables = required_tables - existing_tables

            if missing_tables:
                logger.error(f"Missing required tables: {missing_tables}")
                return {
                    'success': False,
                    'error': 'database_not_initialized',
                    'message': f"Database not initialized. Missing tables: {', '.join(missing_tables)}",
                    'fix': "Run 'superlocalmemoryv2:status' first to initialize the database, or add some memories."
                }

            # Load memories for active profile only
            active_profile = self._get_active_profile()
            logger.info(f"Building graph for profile: {active_profile}")
            memories = cursor.execute('''
                SELECT id, content, summary FROM memories
                WHERE profile = ?
                ORDER BY id
            ''', (active_profile,)).fetchall()

            if not memories:
                logger.warning("No memories found")
                return {
                    'success': False,
                    'error': 'no_memories',
                    'message': 'No memories found in database.',
                    'fix': "Add some memories first: superlocalmemoryv2:remember 'Your content here'"
                }

            if len(memories) < 2:
                logger.warning("Need at least 2 memories to build graph")
                return {
                    'success': False,
                    'error': 'insufficient_memories',
                    'message': 'Need at least 2 memories to build knowledge graph.',
                    'memories': len(memories),
                    'fix': "Add more memories: superlocalmemoryv2:remember 'Your content here'"
                }

            # SCALABILITY: Intelligent sampling for large datasets (v2.6)
            if len(memories) > MAX_MEMORIES_FOR_GRAPH:
                logger.warning(
                    "Memory count (%d) exceeds graph cap (%d). Using intelligent sampling.",
                    len(memories), MAX_MEMORIES_FOR_GRAPH
                )
                memories = self._sample_memories(cursor, active_profile)
                logger.info("Sampled %d memories for graph build", len(memories))

            elif len(memories) > MAX_MEMORIES_FOR_GRAPH * 0.8:
                logger.warning(
                    "Approaching graph cap: %d/%d memories (%.0f%%). "
                    "Consider running memory compression.",
                    len(memories), MAX_MEMORIES_FOR_GRAPH,
                    len(memories) / MAX_MEMORIES_FOR_GRAPH * 100
                )

            memory_ids = [m[0] for m in memories]

            # Clear existing graph data for this profile's memories
            self._clear_graph_data(cursor, active_profile, memory_ids)
            conn.commit()

            logger.info(f"Processing {len(memories)} memories")

            # Extract entities and vectors from content + summary
            contents = [f"{m[1]} {m[2] or ''}" for m in memories]
            entities_list, vectors = self.entity_extractor.extract_entities(contents)

            # Store nodes
            cursor.executemany('''
                INSERT INTO graph_nodes (memory_id, entities, embedding_vector)
                VALUES (?, ?, ?)
            ''', [
                (memory_id, json.dumps(entities), json.dumps(vector.tolist()))
                for memory_id, entities, vector in zip(memory_ids, entities_list, vectors)
            ])

            # Committed before the builders run so the stored nodes are
            # visible to any connection they open themselves.
            conn.commit()
            logger.info(f"Stored {len(memory_ids)} graph nodes")

            # Build edges with the threshold requested by the caller.
            # NOTE(review): assumes EdgeBuilder.build_edges reads
            # self.min_similarity (the attribute used by add_memory_incremental)
            # — confirm against EdgeBuilder.
            previous_threshold = self.edge_builder.min_similarity
            self.edge_builder.min_similarity = min_similarity
            try:
                edges_count = self.edge_builder.build_edges(
                    memory_ids, vectors, entities_list
                )
            finally:
                self.edge_builder.min_similarity = previous_threshold

            # Detect communities (flat Leiden)
            clusters_count = self.cluster_builder.detect_communities()

            # Hierarchical sub-clustering on large communities
            hierarchical_stats = self.cluster_builder.hierarchical_cluster()
            subclusters = hierarchical_stats.get('subclusters_created', 0)

            # Generate TF-IDF structured summaries for all clusters
            summaries = self.cluster_builder.generate_cluster_summaries()

            elapsed = time.time() - start_time

            stats = {
                'success': True,
                'memories': len(memories),
                'nodes': len(memory_ids),
                'edges': edges_count,
                'clusters': clusters_count,
                'subclusters': subclusters,
                'max_depth': hierarchical_stats.get('depth_reached', 0),
                'summaries_generated': summaries,
                'time_seconds': round(elapsed, 2)
            }
            if not IGRAPH_AVAILABLE:
                stats['warning'] = 'igraph/leidenalg not installed — graph built without clustering. Install with: pip3 install python-igraph leidenalg'

            logger.info(f"Graph build complete: {stats}")
            return stats

        except Exception as e:
            logger.error(f"Graph build failed: {e}")
            conn.rollback()
            return {
                'success': False,
                'error': str(e)
            }
        finally:
            conn.close()

    def extract_entities(self, memory_id: int) -> List[str]:
        """
        Extract entities for a single memory.

        Args:
            memory_id: Memory ID

        Returns:
            List of entity strings; empty when the memory does not exist or
            the extractor returns nothing.
        """
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        try:
            # Get memory content
            memory = cursor.execute('''
                SELECT content, summary FROM memories WHERE id = ?
            ''', (memory_id,)).fetchone()

            if not memory:
                return []

            content = f"{memory[0]} {memory[1] or ''}"
            entities_list, _ = self.entity_extractor.extract_entities([content])

            return entities_list[0] if entities_list else []

        finally:
            conn.close()

    @staticmethod
    def _profile_neighbors(cursor, memory_id: int, profile: str) -> list:
        """Return (neighbor_id, relationship_type, weight, shared_entities) rows for edges touching ``memory_id`` whose other end belongs to ``profile``."""
        return cursor.execute('''
            SELECT ge.target_memory_id, ge.relationship_type, ge.weight, ge.shared_entities
            FROM graph_edges ge
            JOIN memories m ON ge.target_memory_id = m.id
            WHERE ge.source_memory_id = ? AND m.profile = ?
            UNION
            SELECT ge.source_memory_id, ge.relationship_type, ge.weight, ge.shared_entities
            FROM graph_edges ge
            JOIN memories m ON ge.source_memory_id = m.id
            WHERE ge.target_memory_id = ? AND m.profile = ?
        ''', (memory_id, profile, memory_id, profile)).fetchall()

    @staticmethod
    def _related_entry(cursor, target_id: int, rel_type, weight, shared_entities: list, hops: int):
        """Build the result dictionary for one neighbor, or return None when its memory row is missing."""
        memory = cursor.execute('''
            SELECT id, summary, importance, tags
            FROM memories WHERE id = ?
        ''', (target_id,)).fetchone()

        if not memory:
            return None

        return {
            'id': memory[0],
            'summary': memory[1],
            'importance': memory[2],
            'tags': json.loads(memory[3]) if memory[3] else [],
            'relationship': rel_type,
            'weight': weight,
            'shared_entities': shared_entities,
            'hops': hops
        }

    def get_related(self, memory_id: int, max_hops: int = 2) -> List[Dict]:
        """
        Get memories connected to this memory via graph edges (active profile only).

        Args:
            memory_id: Source memory ID
            max_hops: Maximum traversal depth (1 or 2); any value >= 2 also
                returns second-hop neighbors.

        Returns:
            List of related memory dictionaries, direct (1-hop) neighbors
            first and, within the same hop count, heaviest edge first.
            ``shared_entities`` is only populated for 1-hop results.
        """
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        active_profile = self._get_active_profile()

        try:
            results = []
            seen_ids = {memory_id}

            # 1-hop neighbors
            for target_id, rel_type, weight, shared in self._profile_neighbors(
                    cursor, memory_id, active_profile):
                if target_id in seen_ids:
                    continue
                seen_ids.add(target_id)

                entry = self._related_entry(
                    cursor, target_id, rel_type, weight,
                    json.loads(shared) if shared else [], hops=1
                )
                if entry:
                    results.append(entry)

            # 2-hop neighbors: expand every direct neighbor once, using the
            # same profile filter as the first hop.
            if max_hops >= 2:
                for first_hop in list(results):  # Copy: results grows inside the loop
                    for target_id, rel_type, weight, _shared in self._profile_neighbors(
                            cursor, first_hop['id'], active_profile):
                        if target_id in seen_ids:
                            continue
                        seen_ids.add(target_id)

                        entry = self._related_entry(
                            cursor, target_id, rel_type, weight, [], hops=2
                        )
                        if entry:
                            results.append(entry)

            # Strongest connections first: fewer hops, then higher weight.
            # A NULL weight is ranked as 0 instead of raising.
            results.sort(key=lambda r: (r['hops'], -(r['weight'] or 0.0)))

            return results

        finally:
            conn.close()

    def get_cluster_members(self, cluster_id: int) -> List[Dict]:
        """
        Get all memories in a cluster (filtered by active profile).

        Args:
            cluster_id: Cluster ID

        Returns:
            List of memory dictionaries, highest importance first
        """
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        active_profile = self._get_active_profile()

        try:
            memories = cursor.execute('''
                SELECT id, summary, importance, tags, created_at
                FROM memories
                WHERE cluster_id = ? AND profile = ?
                ORDER BY importance DESC
            ''', (cluster_id, active_profile)).fetchall()

            return [
                {
                    'id': m[0],
                    'summary': m[1],
                    'importance': m[2],
                    'tags': json.loads(m[3]) if m[3] else [],
                    'created_at': m[4]
                }
                for m in memories
            ]

        finally:
            conn.close()

    def add_memory_incremental(self, memory_id: int) -> bool:
        """
        Add single memory to existing graph (incremental update).

        Stores (or replaces) the memory's graph node, then compares its vector
        against every other stored node of the active profile and inserts an
        edge for each pair whose similarity reaches the edge builder's
        ``min_similarity``. Stored nodes without a vector are skipped.

        Args:
            memory_id: New memory ID to add

        Returns:
            True on success; False when the memory does not exist, no entities
            could be extracted, or an error occurred (the transaction is
            rolled back in that case).
        """
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()

        try:
            # Get new memory content
            memory = cursor.execute('''
                SELECT content, summary FROM memories WHERE id = ?
            ''', (memory_id,)).fetchone()

            if not memory:
                return False

            # Extract entities for new memory
            content = f"{memory[0]} {memory[1] or ''}"
            entities_list, vector = self.entity_extractor.extract_entities([content])

            if not entities_list:
                return False

            new_entities = entities_list[0]
            new_vector = vector[0]

            # Store node
            cursor.execute('''
                INSERT OR REPLACE INTO graph_nodes (memory_id, entities, embedding_vector)
                VALUES (?, ?, ?)
            ''', (memory_id, json.dumps(new_entities), json.dumps(new_vector.tolist())))

            # Compare to existing memories in the same profile
            active_profile = self._get_active_profile()
            existing = cursor.execute('''
                SELECT gn.memory_id, gn.embedding_vector, gn.entities
                FROM graph_nodes gn
                JOIN memories m ON gn.memory_id = m.id
                WHERE gn.memory_id != ? AND m.profile = ?
            ''', (memory_id, active_profile)).fetchall()

            edges_added = 0

            for existing_id, existing_vector_json, existing_entities_json in existing:
                if not existing_vector_json:
                    # Both columns are nullable; a node without a vector
                    # cannot be compared, so it is skipped.
                    continue

                existing_vector = np.array(json.loads(existing_vector_json))

                # Compute similarity
                sim = cosine_similarity([new_vector], [existing_vector])[0][0]

                if sim >= self.edge_builder.min_similarity:
                    # Find shared entities (a NULL entity list counts as empty)
                    existing_entities = (
                        json.loads(existing_entities_json) if existing_entities_json else []
                    )
                    shared = list(set(new_entities) & set(existing_entities))

                    # Classify relationship
                    rel_type = self.edge_builder._classify_relationship(sim, shared)

                    # Insert edge; the similarity doubles as the edge weight
                    cursor.execute('''
                        INSERT OR REPLACE INTO graph_edges
                        (source_memory_id, target_memory_id, relationship_type,
                         weight, shared_entities, similarity_score)
                        VALUES (?, ?, ?, ?, ?, ?)
                    ''', (
                        memory_id,
                        existing_id,
                        rel_type,
                        float(sim),
                        json.dumps(shared),
                        float(sim)
                    ))

                    edges_added += 1

            conn.commit()
            logger.info(f"Added memory {memory_id} to graph with {edges_added} edges")

            # Optionally re-cluster if significant change
            if edges_added > 5:
                logger.info("Significant graph change - consider re-clustering")

            return True

        except Exception as e:
            logger.error(f"Incremental add failed: {e}")
            conn.rollback()
            return False
        finally:
            conn.close()

    def get_stats(self) -> Dict[str, any]:
        """Get graph statistics for the active profile.

        Returns:
            Dictionary with the profile name, node/edge/cluster counts, the
            deepest hierarchy level among the listed clusters, and up to 20
            clusters ordered by depth and then by member count.
        """
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        active_profile = self._get_active_profile()

        try:
            # Count nodes for active profile's memories
            nodes = cursor.execute('''
                SELECT COUNT(*) FROM graph_nodes
                WHERE memory_id IN (SELECT id FROM memories WHERE profile = ?)
            ''', (active_profile,)).fetchone()[0]

            # Count edges where at least one end is in active profile
            edges = cursor.execute('''
                SELECT COUNT(*) FROM graph_edges
                WHERE source_memory_id IN (SELECT id FROM memories WHERE profile = ?)
                   OR target_memory_id IN (SELECT id FROM memories WHERE profile = ?)
            ''', (active_profile, active_profile)).fetchone()[0]

            # Clusters that have members in active profile
            clusters = cursor.execute('''
                SELECT COUNT(DISTINCT cluster_id) FROM memories
                WHERE cluster_id IS NOT NULL AND profile = ?
            ''', (active_profile,)).fetchone()[0]

            # Cluster breakdown for active profile (including hierarchy)
            cluster_info = cursor.execute('''
                SELECT gc.name, gc.member_count, gc.avg_importance,
                       gc.summary, gc.parent_cluster_id, gc.depth
                FROM graph_clusters gc
                WHERE gc.id IN (
                    SELECT DISTINCT cluster_id FROM memories
                    WHERE cluster_id IS NOT NULL AND profile = ?
                )
                ORDER BY gc.depth ASC, gc.member_count DESC
                LIMIT 20
            ''', (active_profile,)).fetchall()

            # Deepest level among the listed clusters (NULL depth counts as 0)
            max_depth = max((c[5] or 0 for c in cluster_info), default=0)

            return {
                'profile': active_profile,
                'nodes': nodes,
                'edges': edges,
                'clusters': clusters,
                'max_depth': max_depth,
                'top_clusters': [
                    {
                        'name': c[0],
                        'members': c[1],
                        # A missing (or zero) average is reported as 5.0
                        'avg_importance': round(c[2], 1) if c[2] else 5.0,
                        'summary': c[3],
                        'parent_cluster_id': c[4],
                        'depth': c[5] or 0
                    }
                    for c in cluster_info
                ]
            }

        finally:
            conn.close()
1383
-
1384
-
1385
def main():
    """Command-line entry point for running graph operations by hand.

    Parses one positional command (``build``, ``stats``, ``related``,
    ``cluster``, ``hierarchical`` or ``summaries``) plus its options, runs it
    against a default ``GraphEngine`` and prints the outcome to stdout.
    """
    import argparse

    cli = argparse.ArgumentParser(description='GraphEngine - Knowledge Graph Management')
    cli.add_argument(
        'command',
        choices=['build', 'stats', 'related', 'cluster', 'hierarchical', 'summaries'],
        help='Command to execute',
    )
    cli.add_argument('--memory-id', type=int, help='Memory ID for related/add commands')
    cli.add_argument('--cluster-id', type=int, help='Cluster ID for cluster command')
    cli.add_argument('--min-similarity', type=float, default=0.3,
                     help='Minimum similarity for edges (default: 0.3)')
    cli.add_argument('--hops', type=int, default=2, help='Max hops for related (default: 2)')

    opts = cli.parse_args()
    graph = GraphEngine()
    action = opts.command

    if action == 'build':
        print("Building knowledge graph...")
        build_report = graph.build_graph(min_similarity=opts.min_similarity)
        print(json.dumps(build_report, indent=2))
        return

    if action == 'stats':
        print("Graph Statistics:")
        print(json.dumps(graph.get_stats(), indent=2))
        return

    if action == 'related':
        if not opts.memory_id:
            print("Error: --memory-id required for 'related' command")
            return

        print(f"Finding memories related to #{opts.memory_id}...")
        neighbours = graph.get_related(opts.memory_id, max_hops=opts.hops)

        if not neighbours:
            print("No related memories found")
            return

        for position, item in enumerate(neighbours, start=1):
            print(f"\n{position}. Memory #{item['id']} ({item['hops']}-hop, weight={item['weight']:.3f})")
            print(f" Relationship: {item['relationship']}")
            text = item['summary'] or '[No summary]'
            print(f" Summary: {text[:100]}...")
            if item['shared_entities']:
                print(f" Shared: {', '.join(item['shared_entities'][:5])}")
        return

    if action == 'cluster':
        if not opts.cluster_id:
            print("Error: --cluster-id required for 'cluster' command")
            return

        print(f"Cluster #{opts.cluster_id} members:")
        for position, item in enumerate(graph.get_cluster_members(opts.cluster_id), start=1):
            print(f"\n{position}. Memory #{item['id']} (importance={item['importance']})")
            text = item['summary'] or '[No summary]'
            print(f" {text[:100]}...")
        return

    if action == 'hierarchical':
        print("Running hierarchical sub-clustering...")
        # Uses a fresh ClusterBuilder on the engine's database path.
        hierarchy_report = ClusterBuilder(graph.db_path).hierarchical_cluster()
        print(json.dumps(hierarchy_report, indent=2))
        return

    if action == 'summaries':
        print("Generating cluster summaries...")
        total = ClusterBuilder(graph.db_path).generate_cluster_summaries()
        print(f"Generated summaries for {total} clusters")
1455
36
 
37
# Public API of this module: the names exported by a star-import.
__all__ = [
    "MAX_MEMORIES_FOR_GRAPH",
    "SKLEARN_AVAILABLE",
    "IGRAPH_AVAILABLE",
    "MEMORY_DIR",
    "DB_PATH",
    "EntityExtractor",
    "ClusterNamer",
    "EdgeBuilder",
    "ClusterBuilder",
    "GraphEngine",
    "main",
]


# Run the command-line interface when the module is executed as a script.
if __name__ == '__main__':
    main()