superlocalmemory 2.7.6 → 2.8.0

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (170)
  1. package/CHANGELOG.md +120 -155
  2. package/README.md +115 -89
  3. package/api_server.py +2 -12
  4. package/docs/PATTERN-LEARNING.md +64 -199
  5. package/docs/example_graph_usage.py +4 -6
  6. package/install.sh +59 -0
  7. package/mcp_server.py +83 -7
  8. package/package.json +1 -8
  9. package/scripts/generate-thumbnails.py +3 -5
  10. package/skills/slm-build-graph/SKILL.md +1 -1
  11. package/skills/slm-list-recent/SKILL.md +1 -1
  12. package/skills/slm-recall/SKILL.md +1 -1
  13. package/skills/slm-remember/SKILL.md +1 -1
  14. package/skills/slm-show-patterns/SKILL.md +1 -1
  15. package/skills/slm-status/SKILL.md +1 -1
  16. package/skills/slm-switch-profile/SKILL.md +1 -1
  17. package/src/agent_registry.py +7 -18
  18. package/src/auth_middleware.py +3 -5
  19. package/src/auto_backup.py +3 -7
  20. package/src/behavioral/__init__.py +49 -0
  21. package/src/behavioral/behavioral_listener.py +203 -0
  22. package/src/behavioral/behavioral_patterns.py +275 -0
  23. package/src/behavioral/cross_project_transfer.py +206 -0
  24. package/src/behavioral/outcome_inference.py +194 -0
  25. package/src/behavioral/outcome_tracker.py +193 -0
  26. package/src/behavioral/tests/__init__.py +4 -0
  27. package/src/behavioral/tests/test_behavioral_integration.py +108 -0
  28. package/src/behavioral/tests/test_behavioral_patterns.py +150 -0
  29. package/src/behavioral/tests/test_cross_project_transfer.py +142 -0
  30. package/src/behavioral/tests/test_mcp_behavioral.py +139 -0
  31. package/src/behavioral/tests/test_mcp_report_outcome.py +117 -0
  32. package/src/behavioral/tests/test_outcome_inference.py +107 -0
  33. package/src/behavioral/tests/test_outcome_tracker.py +96 -0
  34. package/src/cache_manager.py +4 -6
  35. package/src/compliance/__init__.py +48 -0
  36. package/src/compliance/abac_engine.py +149 -0
  37. package/src/compliance/abac_middleware.py +116 -0
  38. package/src/compliance/audit_db.py +215 -0
  39. package/src/compliance/audit_logger.py +148 -0
  40. package/src/compliance/retention_manager.py +289 -0
  41. package/src/compliance/retention_scheduler.py +186 -0
  42. package/src/compliance/tests/__init__.py +4 -0
  43. package/src/compliance/tests/test_abac_enforcement.py +95 -0
  44. package/src/compliance/tests/test_abac_engine.py +124 -0
  45. package/src/compliance/tests/test_abac_mcp_integration.py +118 -0
  46. package/src/compliance/tests/test_audit_db.py +123 -0
  47. package/src/compliance/tests/test_audit_logger.py +98 -0
  48. package/src/compliance/tests/test_mcp_audit.py +128 -0
  49. package/src/compliance/tests/test_mcp_retention_policy.py +125 -0
  50. package/src/compliance/tests/test_retention_manager.py +131 -0
  51. package/src/compliance/tests/test_retention_scheduler.py +99 -0
  52. package/src/db_connection_manager.py +2 -12
  53. package/src/embedding_engine.py +61 -669
  54. package/src/embeddings/__init__.py +47 -0
  55. package/src/embeddings/cache.py +70 -0
  56. package/src/embeddings/cli.py +113 -0
  57. package/src/embeddings/constants.py +47 -0
  58. package/src/embeddings/database.py +91 -0
  59. package/src/embeddings/engine.py +247 -0
  60. package/src/embeddings/model_loader.py +145 -0
  61. package/src/event_bus.py +3 -13
  62. package/src/graph/__init__.py +36 -0
  63. package/src/graph/build_helpers.py +74 -0
  64. package/src/graph/cli.py +87 -0
  65. package/src/graph/cluster_builder.py +188 -0
  66. package/src/graph/cluster_summary.py +148 -0
  67. package/src/graph/constants.py +47 -0
  68. package/src/graph/edge_builder.py +162 -0
  69. package/src/graph/entity_extractor.py +95 -0
  70. package/src/graph/graph_core.py +226 -0
  71. package/src/graph/graph_search.py +231 -0
  72. package/src/graph/hierarchical.py +207 -0
  73. package/src/graph/schema.py +99 -0
  74. package/src/graph_engine.py +45 -1451
  75. package/src/hnsw_index.py +3 -7
  76. package/src/hybrid_search.py +36 -683
  77. package/src/learning/__init__.py +27 -12
  78. package/src/learning/adaptive_ranker.py +50 -12
  79. package/src/learning/cross_project_aggregator.py +2 -12
  80. package/src/learning/engagement_tracker.py +2 -12
  81. package/src/learning/feature_extractor.py +175 -43
  82. package/src/learning/feedback_collector.py +7 -12
  83. package/src/learning/learning_db.py +180 -12
  84. package/src/learning/project_context_manager.py +2 -12
  85. package/src/learning/source_quality_scorer.py +2 -12
  86. package/src/learning/synthetic_bootstrap.py +2 -12
  87. package/src/learning/tests/__init__.py +2 -0
  88. package/src/learning/tests/test_adaptive_ranker.py +2 -6
  89. package/src/learning/tests/test_adaptive_ranker_v28.py +60 -0
  90. package/src/learning/tests/test_aggregator.py +2 -6
  91. package/src/learning/tests/test_auto_retrain_v28.py +35 -0
  92. package/src/learning/tests/test_e2e_ranking_v28.py +82 -0
  93. package/src/learning/tests/test_feature_extractor_v28.py +93 -0
  94. package/src/learning/tests/test_feedback_collector.py +2 -6
  95. package/src/learning/tests/test_learning_db.py +2 -6
  96. package/src/learning/tests/test_learning_db_v28.py +110 -0
  97. package/src/learning/tests/test_learning_init_v28.py +48 -0
  98. package/src/learning/tests/test_outcome_signals.py +48 -0
  99. package/src/learning/tests/test_project_context.py +2 -6
  100. package/src/learning/tests/test_schema_migration.py +319 -0
  101. package/src/learning/tests/test_signal_inference.py +11 -13
  102. package/src/learning/tests/test_source_quality.py +2 -6
  103. package/src/learning/tests/test_synthetic_bootstrap.py +3 -7
  104. package/src/learning/tests/test_workflow_miner.py +2 -6
  105. package/src/learning/workflow_pattern_miner.py +2 -12
  106. package/src/lifecycle/__init__.py +54 -0
  107. package/src/lifecycle/bounded_growth.py +239 -0
  108. package/src/lifecycle/compaction_engine.py +226 -0
  109. package/src/lifecycle/lifecycle_engine.py +302 -0
  110. package/src/lifecycle/lifecycle_evaluator.py +225 -0
  111. package/src/lifecycle/lifecycle_scheduler.py +130 -0
  112. package/src/lifecycle/retention_policy.py +285 -0
  113. package/src/lifecycle/tests/__init__.py +4 -0
  114. package/src/lifecycle/tests/test_bounded_growth.py +193 -0
  115. package/src/lifecycle/tests/test_compaction.py +179 -0
  116. package/src/lifecycle/tests/test_lifecycle_engine.py +137 -0
  117. package/src/lifecycle/tests/test_lifecycle_evaluation.py +177 -0
  118. package/src/lifecycle/tests/test_lifecycle_scheduler.py +127 -0
  119. package/src/lifecycle/tests/test_lifecycle_search.py +109 -0
  120. package/src/lifecycle/tests/test_mcp_compact.py +149 -0
  121. package/src/lifecycle/tests/test_mcp_lifecycle_status.py +114 -0
  122. package/src/lifecycle/tests/test_retention_policy.py +162 -0
  123. package/src/mcp_tools_v28.py +280 -0
  124. package/src/memory-profiles.py +2 -12
  125. package/src/memory-reset.py +2 -12
  126. package/src/memory_compression.py +2 -12
  127. package/src/memory_store_v2.py +76 -20
  128. package/src/migrate_v1_to_v2.py +2 -12
  129. package/src/pattern_learner.py +29 -975
  130. package/src/patterns/__init__.py +24 -0
  131. package/src/patterns/analyzers.py +247 -0
  132. package/src/patterns/learner.py +267 -0
  133. package/src/patterns/scoring.py +167 -0
  134. package/src/patterns/store.py +223 -0
  135. package/src/patterns/terminology.py +138 -0
  136. package/src/provenance_tracker.py +4 -14
  137. package/src/query_optimizer.py +4 -6
  138. package/src/rate_limiter.py +2 -6
  139. package/src/search/__init__.py +20 -0
  140. package/src/search/cli.py +77 -0
  141. package/src/search/constants.py +26 -0
  142. package/src/search/engine.py +239 -0
  143. package/src/search/fusion.py +122 -0
  144. package/src/search/index_loader.py +112 -0
  145. package/src/search/methods.py +162 -0
  146. package/src/search_engine_v2.py +4 -6
  147. package/src/setup_validator.py +7 -13
  148. package/src/subscription_manager.py +2 -12
  149. package/src/tree/__init__.py +59 -0
  150. package/src/tree/builder.py +183 -0
  151. package/src/tree/nodes.py +196 -0
  152. package/src/tree/queries.py +252 -0
  153. package/src/tree/schema.py +76 -0
  154. package/src/tree_manager.py +10 -711
  155. package/src/trust/__init__.py +45 -0
  156. package/src/trust/constants.py +66 -0
  157. package/src/trust/queries.py +157 -0
  158. package/src/trust/schema.py +95 -0
  159. package/src/trust/scorer.py +299 -0
  160. package/src/trust/signals.py +95 -0
  161. package/src/trust_scorer.py +39 -697
  162. package/src/webhook_dispatcher.py +2 -12
  163. package/ui/app.js +1 -1
  164. package/ui/js/agents.js +1 -1
  165. package/ui_server.py +2 -14
  166. package/ATTRIBUTION.md +0 -140
  167. package/docs/ARCHITECTURE-V2.5.md +0 -190
  168. package/docs/GRAPH-ENGINE.md +0 -503
  169. package/docs/architecture-diagram.drawio +0 -405
  170. package/docs/plans/2026-02-13-benchmark-suite.md +0 -1349
--- /dev/null
+++ b/package/src/graph/cluster_summary.py
@@ -0,0 +1,148 @@
+ #!/usr/bin/env python3
+ # SPDX-License-Identifier: MIT
+ # Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
+ """Cluster summary generation for the graph engine.
+
+ Generates TF-IDF structured summaries for graph clusters,
+ analyzing member content to produce human-readable descriptions
+ of each cluster's theme, key topics, and scope.
+ """
+ import sqlite3
+ import json
+ from pathlib import Path
+ from collections import Counter
+
+ from graph.constants import logger, MEMORY_DIR
+
+
+ def _get_active_profile() -> str:
+     """Get the currently active profile name from config."""
+     config_file = MEMORY_DIR / "profiles.json"
+     if config_file.exists():
+         try:
+             with open(config_file, 'r') as f:
+                 config = json.load(f)
+                 return config.get('active_profile', 'default')
+         except (json.JSONDecodeError, IOError):
+             pass
+     return 'default'
+
+
+ def generate_cluster_summaries(db_path: Path) -> int:
+     """
+     Generate TF-IDF structured summaries for all clusters.
+
+     For each cluster, analyzes member content to produce a human-readable
+     summary describing the cluster's theme, key topics, and scope.
+
+     Returns:
+         Number of clusters with summaries generated
+     """
+     conn = sqlite3.connect(db_path)
+     cursor = conn.cursor()
+     active_profile = _get_active_profile()
+
+     try:
+         # Get all clusters for this profile
+         cursor.execute('''
+             SELECT DISTINCT gc.id, gc.name, gc.member_count
+             FROM graph_clusters gc
+             JOIN memories m ON m.cluster_id = gc.id
+             WHERE m.profile = ?
+         ''', (active_profile,))
+         clusters = cursor.fetchall()
+
+         if not clusters:
+             return 0
+
+         summaries_generated = 0
+
+         for cluster_id, cluster_name, member_count in clusters:
+             summary = _build_cluster_summary(cursor, cluster_id, active_profile)
+             if summary:
+                 cursor.execute('''
+                     UPDATE graph_clusters SET summary = ?, updated_at = CURRENT_TIMESTAMP
+                     WHERE id = ?
+                 ''', (summary, cluster_id))
+                 summaries_generated += 1
+                 logger.info(f"Summary for cluster {cluster_id} ({cluster_name}): {summary[:80]}...")
+
+         conn.commit()
+         logger.info(f"Generated {summaries_generated} cluster summaries")
+         return summaries_generated
+
+     except Exception as e:
+         logger.error(f"Summary generation failed: {e}")
+         conn.rollback()
+         return 0
+     finally:
+         conn.close()
+
+
+ def _build_cluster_summary(cursor, cluster_id: int, profile: str) -> str:
+     """Build a TF-IDF structured summary for a single cluster."""
+     # Get member content
+     cursor.execute('''
+         SELECT m.content, m.summary, m.tags, m.category, m.project_name
+         FROM memories m
+         WHERE m.cluster_id = ? AND m.profile = ?
+     ''', (cluster_id, profile))
+     members = cursor.fetchall()
+
+     if not members:
+         return ""
+
+     # Collect entities from graph nodes
+     cursor.execute('''
+         SELECT gn.entities
+         FROM graph_nodes gn
+         JOIN memories m ON gn.memory_id = m.id
+         WHERE m.cluster_id = ? AND m.profile = ?
+     ''', (cluster_id, profile))
+     all_entities = []
+     for row in cursor.fetchall():
+         if row[0]:
+             try:
+                 all_entities.extend(json.loads(row[0]))
+             except (json.JSONDecodeError, TypeError):
+                 pass
+
+     # Top entities by frequency (TF-IDF already extracted these)
+     entity_counts = Counter(all_entities)
+     top_entities = [e for e, _ in entity_counts.most_common(5)]
+
+     # Collect unique projects and categories
+     projects = set()
+     categories = set()
+     for m in members:
+         if m[3]:  # category
+             categories.add(m[3])
+         if m[4]:  # project_name
+             projects.add(m[4])
+
+     # Build structured summary
+     parts = []
+
+     # Theme from top entities
+     if top_entities:
+         parts.append(f"Key topics: {', '.join(top_entities[:5])}")
+
+     # Scope
+     if projects:
+         parts.append(f"Projects: {', '.join(sorted(projects)[:3])}")
+     if categories:
+         parts.append(f"Categories: {', '.join(sorted(categories)[:3])}")
+
+     # Size context
+     parts.append(f"{len(members)} memories")
+
+     # Check for hierarchical context
+     cursor.execute('SELECT parent_cluster_id FROM graph_clusters WHERE id = ?', (cluster_id,))
+     parent_row = cursor.fetchone()
+     if parent_row and parent_row[0]:
+         cursor.execute('SELECT name FROM graph_clusters WHERE id = ?', (parent_row[0],))
+         parent_name_row = cursor.fetchone()
+         if parent_name_row:
+             parts.append(f"Sub-cluster of: {parent_name_row[0]}")
+
+     return " | ".join(parts)
--- /dev/null
+++ b/package/src/graph/constants.py
@@ -0,0 +1,47 @@
+ #!/usr/bin/env python3
+ # SPDX-License-Identifier: MIT
+ # Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
+ """Shared constants, imports, and configuration for the graph engine modules.
+ """
+ # SECURITY: Graph build limits to prevent resource exhaustion
+ MAX_MEMORIES_FOR_GRAPH = 10000
+
+ import sqlite3
+ import json
+ import time
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+ from typing import List, Dict, Optional, Tuple, Set
+ from collections import Counter
+
+ # Core dependencies
+ try:
+     from sklearn.feature_extraction.text import TfidfVectorizer
+     from sklearn.metrics.pairwise import cosine_similarity
+     import numpy as np
+     SKLEARN_AVAILABLE = True
+ except ImportError:
+     SKLEARN_AVAILABLE = False
+     raise ImportError("scikit-learn is required. Install: pip install scikit-learn")
+
+ # Graph dependencies - lazy import to avoid conflicts with compression module
+ IGRAPH_AVAILABLE = False
+ try:
+     # Import only when needed to avoid module conflicts
+     import importlib
+     ig_module = importlib.import_module('igraph')
+     leiden_module = importlib.import_module('leidenalg')
+     IGRAPH_AVAILABLE = True
+ except ImportError:
+     pass  # Will raise error when building clusters if not available
+
+ # Setup logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger('graph_engine')
+
+ MEMORY_DIR = Path.home() / ".claude-memory"
+ DB_PATH = MEMORY_DIR / "memory.db"
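
The availability flags let downstream modules degrade gracefully instead of importing igraph at module load. A minimal sketch of the gating pattern (hypothetical caller, not part of the package):

    from graph.constants import IGRAPH_AVAILABLE, logger

    # Community detection is optional: build_graph() still produces nodes and
    # edges without igraph, and reports a warning in its stats dict instead.
    if not IGRAPH_AVAILABLE:
        logger.warning("igraph/leidenalg not installed; clustering will be skipped. "
                       "Install with: pip3 install python-igraph leidenalg")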
--- /dev/null
+++ b/package/src/graph/edge_builder.py
@@ -0,0 +1,162 @@
+ #!/usr/bin/env python3
+ # SPDX-License-Identifier: MIT
+ # Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
+ """Edge building for the graph engine.
+
+ Builds similarity edges between memories based on entity overlap
+ and TF-IDF vector cosine similarity. Supports HNSW-accelerated
+ edge building for large datasets.
+ """
+ import sqlite3
+ import json
+ from pathlib import Path
+ from typing import List
+
+ import numpy as np
+
+ from graph.constants import logger, cosine_similarity
+
+
+ class EdgeBuilder:
+     """Build similarity edges between memories based on entity overlap."""
+
+     def __init__(self, db_path: Path, min_similarity: float = 0.3):
+         """
+         Initialize edge builder.
+
+         Args:
+             db_path: Path to SQLite database
+             min_similarity: Minimum cosine similarity to create edge
+         """
+         self.db_path = db_path
+         self.min_similarity = min_similarity
+
+     def build_edges(self, memory_ids: List[int], vectors: np.ndarray,
+                     entities_list: List[List[str]]) -> int:
+         """
+         Build edges between similar memories.
+
+         Args:
+             memory_ids: List of memory IDs
+             vectors: TF-IDF vectors (n x features)
+             entities_list: List of entity lists per memory
+
+         Returns:
+             Number of edges created
+         """
+         if len(memory_ids) < 2:
+             logger.warning("Need at least 2 memories to build edges")
+             return 0
+
+         # Try HNSW-accelerated edge building first (O(n log n))
+         use_hnsw = False
+         try:
+             from hnsw_index import HNSWIndex
+             if len(memory_ids) >= 50:  # HNSW overhead not worth it for small sets
+                 use_hnsw = True
+         except ImportError:
+             pass
+
+         edges_added = 0
+         conn = sqlite3.connect(self.db_path)
+         cursor = conn.cursor()
+
+         try:
+             if use_hnsw:
+                 logger.info("Using HNSW-accelerated edge building for %d memories", len(memory_ids))
+                 try:
+                     dim = vectors.shape[1]
+                     hnsw = HNSWIndex(dimension=dim, max_elements=len(memory_ids))
+                     hnsw.build(vectors, memory_ids)
+
+                     for i in range(len(memory_ids)):
+                         neighbors = hnsw.search(vectors[i], k=min(20, len(memory_ids) - 1))
+                         for neighbor_id, similarity in neighbors:
+                             if neighbor_id == memory_ids[i]:
+                                 continue  # Skip self
+                             # Only process each pair once (lower ID first)
+                             if memory_ids[i] > neighbor_id:
+                                 continue
+                             if similarity >= self.min_similarity:
+                                 # Find indices for entity lookup
+                                 j = memory_ids.index(neighbor_id)
+                                 entities_i = set(entities_list[i])
+                                 entities_j = set(entities_list[j])
+                                 shared = list(entities_i & entities_j)
+                                 rel_type = self._classify_relationship(similarity, shared)
+
+                                 cursor.execute('''
+                                     INSERT OR REPLACE INTO graph_edges
+                                     (source_memory_id, target_memory_id, relationship_type,
+                                      weight, shared_entities, similarity_score)
+                                     VALUES (?, ?, ?, ?, ?, ?)
+                                 ''', (
+                                     memory_ids[i], neighbor_id, rel_type,
+                                     float(similarity), json.dumps(shared), float(similarity)
+                                 ))
+                                 edges_added += 1
+
+                 except Exception as e:
+                     logger.warning("HNSW edge building failed, falling back to O(n²): %s", e)
+                     use_hnsw = False  # Fall through to O(n²) below
+
+             if not use_hnsw:
+                 # Fallback: O(n²) pairwise cosine similarity
+                 similarity_matrix = cosine_similarity(vectors)
+
+                 for i in range(len(memory_ids)):
+                     for j in range(i + 1, len(memory_ids)):
+                         sim = similarity_matrix[i, j]
+
+                         if sim >= self.min_similarity:
+                             entities_i = set(entities_list[i])
+                             entities_j = set(entities_list[j])
+                             shared = list(entities_i & entities_j)
+                             rel_type = self._classify_relationship(sim, shared)
+
+                             cursor.execute('''
+                                 INSERT OR REPLACE INTO graph_edges
+                                 (source_memory_id, target_memory_id, relationship_type,
+                                  weight, shared_entities, similarity_score)
+                                 VALUES (?, ?, ?, ?, ?, ?)
+                             ''', (
+                                 memory_ids[i], memory_ids[j], rel_type,
+                                 float(sim), json.dumps(shared), float(sim)
+                             ))
+                             edges_added += 1
+
+             conn.commit()
+             logger.info(f"Created {edges_added} edges")
+             return edges_added
+
+         except Exception as e:
+             logger.error(f"Edge building failed: {e}")
+             conn.rollback()
+             return 0
+         finally:
+             conn.close()
+
+     def _classify_relationship(self, similarity: float, shared_entities: List[str]) -> str:
+         """
+         Classify edge type based on similarity and shared entities.
+
+         Args:
+             similarity: Cosine similarity score
+             shared_entities: List of shared entity strings
+
+         Returns:
+             Relationship type: 'similar', 'depends_on', or 'related_to'
+         """
+         # Check for dependency keywords
+         dependency_keywords = {'dependency', 'require', 'import', 'use', 'need'}
+         has_dependency = any(
+             any(kw in entity.lower() for kw in dependency_keywords)
+             for entity in shared_entities
+         )
+
+         if similarity > 0.7:
+             return 'similar'
+         elif has_dependency:
+             return 'depends_on'
+         else:
+             return 'related_to'
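
An illustrative run with made-up memory IDs and toy 2-D vectors standing in for real TF-IDF output; it assumes the graph_edges table already exists (created via graph/schema.py). With three memories the O(n²) fallback is used, since the HNSW path only engages at 50 or more:

    import numpy as np
    from pathlib import Path
    from graph.edge_builder import EdgeBuilder

    # Memories 1 and 2 are near-duplicates; memory 3 points elsewhere
    memory_ids = [1, 2, 3]
    vectors = np.array([[0.90, 0.10],
                        [0.85, 0.15],
                        [0.10, 0.90]])
    entities_list = [["sqlite", "cache"], ["sqlite"], ["ui"]]

    builder = EdgeBuilder(Path.home() / ".claude-memory" / "memory.db")
    created = builder.build_edges(memory_ids, vectors, entities_list)
    # Expect one 'similar' edge between memories 1 and 2 (cosine ~0.99);
    # both pairs involving memory 3 fall below the 0.3 threshold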
--- /dev/null
+++ b/package/src/graph/entity_extractor.py
@@ -0,0 +1,95 @@
+ #!/usr/bin/env python3
+ # SPDX-License-Identifier: MIT
+ # Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
+ """Entity extraction and cluster naming for the graph engine.
+
+ Provides TF-IDF based entity extraction from memory content
+ and cluster naming utilities.
+ """
+ from typing import List, Tuple
+ from collections import Counter
+
+ import numpy as np
+
+ from graph.constants import logger, TfidfVectorizer
+
+
+ class EntityExtractor:
+     """Extract key entities/concepts from memory content using TF-IDF."""
+
+     def __init__(self, max_features: int = 20, min_df: int = 1):
+         """
+         Initialize entity extractor.
+
+         Args:
+             max_features: Top N keywords to extract per memory
+             min_df: Minimum document frequency (ignore very rare terms)
+         """
+         self.max_features = max_features
+         self.vectorizer = TfidfVectorizer(
+             max_features=max_features,
+             stop_words='english',
+             ngram_range=(1, 2),  # Unigrams + bigrams
+             min_df=min_df,
+             lowercase=True,
+             token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z0-9_-]*\b'  # Alphanumeric tokens
+         )
+
+     def extract_entities(self, contents: List[str]) -> Tuple[List[List[str]], np.ndarray]:
+         """
+         Extract entities from multiple contents.
+
+         Args:
+             contents: List of memory content strings
+
+         Returns:
+             Tuple of (entities_per_content, tfidf_vectors)
+         """
+         if not contents:
+             return [], np.array([])
+
+         try:
+             # Fit and transform all contents
+             vectors = self.vectorizer.fit_transform(contents)
+             feature_names = self.vectorizer.get_feature_names_out()
+
+             # Extract top entities for each content
+             all_entities = []
+             for idx in range(len(contents)):
+                 scores = vectors[idx].toarray()[0]
+
+                 # Get indices of top features
+                 top_indices = np.argsort(scores)[::-1]
+
+                 # Extract entities with score > 0
+                 entities = [
+                     feature_names[i]
+                     for i in top_indices
+                     if scores[i] > 0.05  # Minimum threshold
+                 ][:self.max_features]
+
+                 all_entities.append(entities)
+
+             return all_entities, vectors.toarray()
+
+         except Exception as e:
+             logger.error(f"Entity extraction failed: {e}")
+             return [[] for _ in contents], np.zeros((len(contents), 1))
+
+
+ class ClusterNamer:
+     """Enhanced cluster naming with optional LLM support (future)."""
+
+     @staticmethod
+     def generate_name_tfidf(entities: List[str]) -> str:
+         """Generate name from entity list (TF-IDF fallback)."""
+         if not entities:
+             return "Unnamed Cluster"
+
+         entity_counts = Counter(entities)
+         top_entities = [e for e, _ in entity_counts.most_common(2)]
+
+         if len(top_entities) >= 2:
+             return f"{top_entities[0].title()} & {top_entities[1].title()}"
+         else:
+             return f"{top_entities[0].title()} Contexts"
--- /dev/null
+++ b/package/src/graph/graph_core.py
@@ -0,0 +1,226 @@
+ #!/usr/bin/env python3
+ # SPDX-License-Identifier: MIT
+ # Copyright (c) 2026 SuperLocalMemory (superlocalmemory.com)
+ """GraphEngine - Main orchestrator for the knowledge graph.
+
+ Coordinates entity extraction, edge building, community detection,
+ and graph traversal operations. All processing is local.
+ """
+ import sqlite3
+ import json
+ import time
+ from pathlib import Path
+ from typing import List, Dict
+
+ import numpy as np
+
+ from graph.constants import (
+     logger, MEMORY_DIR, DB_PATH, IGRAPH_AVAILABLE, cosine_similarity
+ )
+ from graph.entity_extractor import EntityExtractor
+ from graph.edge_builder import EdgeBuilder
+ from graph.cluster_builder import ClusterBuilder
+ from graph.schema import ensure_graph_tables
+ from graph.build_helpers import apply_sampling, clear_profile_graph_data
+ from graph.graph_search import (
+     get_related as _get_related,
+     get_cluster_members as _get_cluster_members,
+     get_stats as _get_stats,
+ )
+
+
+ class GraphEngine:
+     """Main graph engine coordinating all graph operations."""
+
+     def __init__(self, db_path: Path = DB_PATH):
+         """Initialize graph engine."""
+         self.db_path = db_path
+         self.entity_extractor = EntityExtractor(max_features=20)
+         self.edge_builder = EdgeBuilder(db_path)
+         self.cluster_builder = ClusterBuilder(db_path)
+         self._ensure_graph_tables()
+
+     def _get_active_profile(self) -> str:
+         """Get the currently active profile name from config."""
+         config_file = MEMORY_DIR / "profiles.json"
+         if config_file.exists():
+             try:
+                 with open(config_file, 'r') as f:
+                     config = json.load(f)
+                     return config.get('active_profile', 'default')
+             except (json.JSONDecodeError, IOError):
+                 pass
+         return 'default'
+
+     def _ensure_graph_tables(self):
+         """Create graph tables if they don't exist, or recreate if schema is incomplete."""
+         ensure_graph_tables(self.db_path)
+
+     def build_graph(self, min_similarity: float = 0.3) -> Dict[str, any]:
+         """
+         Build complete knowledge graph from all memories.
+
+         Args:
+             min_similarity: Minimum cosine similarity for edge creation
+
+         Returns:
+             Dictionary with build statistics
+         """
+         start_time = time.time()
+         logger.info("Starting full graph build...")
+         conn = sqlite3.connect(self.db_path)
+         cursor = conn.cursor()
+
+         try:
+             # Check required tables
+             cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
+             existing_tables = {row[0] for row in cursor.fetchall()}
+             missing = {'memories', 'graph_edges', 'graph_nodes', 'graph_clusters'} - existing_tables
+             if missing:
+                 logger.error(f"Missing required tables: {missing}")
+                 return {'success': False, 'error': 'database_not_initialized',
+                         'message': f"Database not initialized. Missing tables: {', '.join(missing)}",
+                         'fix': "Run 'superlocalmemoryv2:status' first to initialize the database, or add some memories."}
+
+             active_profile = self._get_active_profile()
+             logger.info(f"Building graph for profile: {active_profile}")
+             memories = cursor.execute(
+                 'SELECT id, content, summary FROM memories WHERE profile = ? ORDER BY id',
+                 (active_profile,)).fetchall()
+
+             if len(memories) == 0:
+                 return {'success': False, 'error': 'no_memories',
+                         'message': 'No memories found in database.',
+                         'fix': "Add some memories first: superlocalmemoryv2:remember 'Your content here'"}
+             if len(memories) < 2:
+                 return {'success': False, 'error': 'insufficient_memories',
+                         'message': 'Need at least 2 memories to build knowledge graph.',
+                         'memories': len(memories),
+                         'fix': "Add more memories: superlocalmemoryv2:remember 'Your content here'"}
+
+             memories = apply_sampling(cursor, memories, active_profile)
+             clear_profile_graph_data(cursor, conn, memories, active_profile)
+
+             logger.info(f"Processing {len(memories)} memories")
+             memory_ids = [m[0] for m in memories]
+             contents = [f"{m[1]} {m[2] or ''}" for m in memories]
+             entities_list, vectors = self.entity_extractor.extract_entities(contents)
+
+             for memory_id, entities, vector in zip(memory_ids, entities_list, vectors):
+                 cursor.execute('''
+                     INSERT INTO graph_nodes (memory_id, entities, embedding_vector)
+                     VALUES (?, ?, ?)
+                 ''', (memory_id, json.dumps(entities), json.dumps(vector.tolist())))
+             conn.commit()
+             logger.info(f"Stored {len(memory_ids)} graph nodes")
+
+             edges_count = self.edge_builder.build_edges(memory_ids, vectors, entities_list)
+             clusters_count = self.cluster_builder.detect_communities()
+             hierarchical_stats = self.cluster_builder.hierarchical_cluster()
+             subclusters = hierarchical_stats.get('subclusters_created', 0)
+             summaries = self.cluster_builder.generate_cluster_summaries()
+             elapsed = time.time() - start_time
+
+             stats = {
+                 'success': True, 'memories': len(memories), 'nodes': len(memory_ids),
+                 'edges': edges_count, 'clusters': clusters_count, 'subclusters': subclusters,
+                 'max_depth': hierarchical_stats.get('depth_reached', 0),
+                 'summaries_generated': summaries, 'time_seconds': round(elapsed, 2)
+             }
+             if not IGRAPH_AVAILABLE:
+                 stats['warning'] = 'igraph/leidenalg not installed — graph built without clustering. Install with: pip3 install python-igraph leidenalg'
+             logger.info(f"Graph build complete: {stats}")
+             return stats
+
+         except Exception as e:
+             logger.error(f"Graph build failed: {e}")
+             conn.rollback()
+             return {'success': False, 'error': str(e)}
+         finally:
+             conn.close()
+
+     def extract_entities(self, memory_id: int) -> List[str]:
+         """Extract entities for a single memory."""
+         conn = sqlite3.connect(self.db_path)
+         cursor = conn.cursor()
+         try:
+             memory = cursor.execute(
+                 'SELECT content, summary FROM memories WHERE id = ?', (memory_id,)
+             ).fetchone()
+             if not memory:
+                 return []
+             content = f"{memory[0]} {memory[1] or ''}"
+             entities_list, _ = self.entity_extractor.extract_entities([content])
+             return entities_list[0] if entities_list else []
+         finally:
+             conn.close()
+
+     def get_related(self, memory_id: int, max_hops: int = 2) -> List[Dict]:
+         """Get memories connected to this memory via graph edges (active profile only)."""
+         return _get_related(self.db_path, memory_id, max_hops)
+
+     def get_cluster_members(self, cluster_id: int) -> List[Dict]:
+         """Get all memories in a cluster (filtered by active profile)."""
+         return _get_cluster_members(self.db_path, cluster_id)
+
+     def add_memory_incremental(self, memory_id: int) -> bool:
+         """Add single memory to existing graph (incremental update)."""
+         conn = sqlite3.connect(self.db_path)
+         cursor = conn.cursor()
+         try:
+             memory = cursor.execute(
+                 'SELECT content, summary FROM memories WHERE id = ?', (memory_id,)
+             ).fetchone()
+             if not memory:
+                 return False
+
+             content = f"{memory[0]} {memory[1] or ''}"
+             entities_list, vector = self.entity_extractor.extract_entities([content])
+             if not entities_list:
+                 return False
+
+             new_entities, new_vector = entities_list[0], vector[0]
+             cursor.execute('''
+                 INSERT OR REPLACE INTO graph_nodes (memory_id, entities, embedding_vector)
+                 VALUES (?, ?, ?)
+             ''', (memory_id, json.dumps(new_entities), json.dumps(new_vector.tolist())))
+
+             active_profile = self._get_active_profile()
+             existing = cursor.execute('''
+                 SELECT gn.memory_id, gn.embedding_vector, gn.entities
+                 FROM graph_nodes gn JOIN memories m ON gn.memory_id = m.id
+                 WHERE gn.memory_id != ? AND m.profile = ?
+             ''', (memory_id, active_profile)).fetchall()
+
+             edges_added = 0
+             for existing_id, ev_json, ee_json in existing:
+                 ev = np.array(json.loads(ev_json))
+                 sim = cosine_similarity([new_vector], [ev])[0][0]
+                 if sim >= self.edge_builder.min_similarity:
+                     ee = json.loads(ee_json)
+                     shared = list(set(new_entities) & set(ee))
+                     rel_type = self.edge_builder._classify_relationship(sim, shared)
+                     cursor.execute('''
+                         INSERT OR REPLACE INTO graph_edges
+                         (source_memory_id, target_memory_id, relationship_type,
+                          weight, shared_entities, similarity_score)
+                         VALUES (?, ?, ?, ?, ?, ?)
+                     ''', (memory_id, existing_id, rel_type,
+                           float(sim), json.dumps(shared), float(sim)))
+                     edges_added += 1
+
+             conn.commit()
+             logger.info(f"Added memory {memory_id} to graph with {edges_added} edges")
+             if edges_added > 5:
+                 logger.info("Significant graph change - consider re-clustering")
+             return True
+         except Exception as e:
+             logger.error(f"Incremental add failed: {e}")
+             conn.rollback()
+             return False
+         finally:
+             conn.close()
+
+     def get_stats(self) -> Dict[str, any]:
+         """Get graph statistics for the active profile."""
+         return _get_stats(self.db_path)
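
Putting the pieces together, a minimal end-to-end sketch (assumes an initialized database containing at least two memories in the active profile):

    from graph.graph_core import GraphEngine

    engine = GraphEngine()  # defaults to ~/.claude-memory/memory.db
    stats = engine.build_graph(min_similarity=0.3)
    if stats.get('success'):
        print(f"{stats['edges']} edges, {stats['clusters']} clusters "
              f"in {stats['time_seconds']}s")
        # Walk the neighborhood of memory 1, up to two hops out
        for related in engine.get_related(memory_id=1, max_hops=2):
            print(related)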