roampal-0.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- roampal/__init__.py +29 -0
- roampal/__main__.py +6 -0
- roampal/backend/__init__.py +1 -0
- roampal/backend/modules/__init__.py +1 -0
- roampal/backend/modules/memory/__init__.py +43 -0
- roampal/backend/modules/memory/chromadb_adapter.py +623 -0
- roampal/backend/modules/memory/config.py +102 -0
- roampal/backend/modules/memory/content_graph.py +543 -0
- roampal/backend/modules/memory/context_service.py +455 -0
- roampal/backend/modules/memory/embedding_service.py +96 -0
- roampal/backend/modules/memory/knowledge_graph_service.py +1052 -0
- roampal/backend/modules/memory/memory_bank_service.py +433 -0
- roampal/backend/modules/memory/memory_types.py +296 -0
- roampal/backend/modules/memory/outcome_service.py +400 -0
- roampal/backend/modules/memory/promotion_service.py +473 -0
- roampal/backend/modules/memory/routing_service.py +444 -0
- roampal/backend/modules/memory/scoring_service.py +324 -0
- roampal/backend/modules/memory/search_service.py +646 -0
- roampal/backend/modules/memory/tests/__init__.py +1 -0
- roampal/backend/modules/memory/tests/conftest.py +12 -0
- roampal/backend/modules/memory/tests/unit/__init__.py +1 -0
- roampal/backend/modules/memory/tests/unit/conftest.py +7 -0
- roampal/backend/modules/memory/tests/unit/test_knowledge_graph_service.py +517 -0
- roampal/backend/modules/memory/tests/unit/test_memory_bank_service.py +504 -0
- roampal/backend/modules/memory/tests/unit/test_outcome_service.py +485 -0
- roampal/backend/modules/memory/tests/unit/test_scoring_service.py +255 -0
- roampal/backend/modules/memory/tests/unit/test_search_service.py +413 -0
- roampal/backend/modules/memory/tests/unit/test_unified_memory_system.py +418 -0
- roampal/backend/modules/memory/unified_memory_system.py +1277 -0
- roampal/cli.py +638 -0
- roampal/hooks/__init__.py +16 -0
- roampal/hooks/session_manager.py +587 -0
- roampal/hooks/stop_hook.py +176 -0
- roampal/hooks/user_prompt_submit_hook.py +103 -0
- roampal/mcp/__init__.py +7 -0
- roampal/mcp/server.py +611 -0
- roampal/server/__init__.py +7 -0
- roampal/server/main.py +744 -0
- roampal-0.1.4.dist-info/METADATA +179 -0
- roampal-0.1.4.dist-info/RECORD +44 -0
- roampal-0.1.4.dist-info/WHEEL +5 -0
- roampal-0.1.4.dist-info/entry_points.txt +2 -0
- roampal-0.1.4.dist-info/licenses/LICENSE +190 -0
- roampal-0.1.4.dist-info/top_level.txt +1 -0
roampal/backend/modules/memory/config.py
@@ -0,0 +1,102 @@
+"""
+Memory System Configuration
+
+Centralizes all magic numbers and configuration constants for the memory system.
+Extracted from UnifiedMemorySystem lines 202-206, 709, 993, 398, 4221, and scattered
+hardcoded values throughout the codebase.
+
+This replaces scattered class constants with a single, configurable dataclass.
+"""
+
+from dataclasses import dataclass, field
+from typing import Optional
+
+
+@dataclass
+class MemoryConfig:
+    """
+    Configuration for the unified memory system.
+
+    All values have production defaults matching the original codebase.
+    Can be overridden for testing or different deployment scenarios.
+    """
+
+    # Threshold constants (from lines 202-206)
+    high_value_threshold: float = 0.9
+    """Memories above this score are preserved beyond retention period"""
+
+    promotion_score_threshold: float = 0.7
+    """Minimum score for working->history or history->patterns promotion"""
+
+    demotion_score_threshold: float = 0.4
+    """Below this, patterns demote to history"""
+
+    deletion_score_threshold: float = 0.2
+    """Below this, history items are deleted"""
+
+    new_item_deletion_threshold: float = 0.1
+    """More lenient deletion threshold for items < 7 days old"""
+
+    # Search and scoring (from lines 709, 1580-1581)
+    cross_encoder_blend_ratio: float = 0.6
+    """Weight given to cross-encoder in final ranking (0.4 original, 0.6 cross-encoder)"""
+
+    embedding_weight_proven: float = 0.2
+    """Embedding weight for proven memories (high use count)"""
+
+    learned_weight_proven: float = 0.8
+    """Learned/outcome weight for proven memories"""
+
+    embedding_weight_new: float = 0.8
+    """Embedding weight for new memories (low use count)"""
+
+    learned_weight_new: float = 0.2
+    """Learned/outcome weight for new memories"""
+
+    # Multipliers (from lines 1359, 1398, 1413, 1421)
+    search_multiplier: int = 3
+    """
+    Fetch limit * search_multiplier results for better ranking.
+    Currently hardcoded as `limit * 3` in 4 locations.
+    """
+
+    # Storage limits (from line 4221)
+    max_memory_bank_items: int = 1000
+    """Maximum items in memory_bank collection"""
+
+    # Timing (from line 398)
+    kg_debounce_seconds: int = 5
+    """Debounce window for knowledge graph saves"""
+
+    # Default values (from line 993)
+    default_importance: float = 0.7
+    """Default importance for new memories"""
+
+    default_confidence: float = 0.7
+    """Default confidence for new memories"""
+
+    # Promotion timing
+    promotion_use_threshold: int = 3
+    """Minimum uses before considering for promotion"""
+
+    promotion_age_days: int = 7
+    """Minimum age in days before considering for promotion"""
+
+    # Retention periods
+    working_memory_retention_hours: int = 24
+    """Working memory cleanup threshold"""
+
+    history_retention_days: int = 30
+    """History cleanup threshold for low-value items"""
+
+    # Cross-encoder settings
+    cross_encoder_top_k: int = 30
+    """Number of candidates to rerank with cross-encoder"""
+
+    # Wilson score
+    wilson_confidence: float = 0.95
+    """Confidence level for Wilson score calculation"""
+
+
+# Default configuration instance
+DEFAULT_CONFIG = MemoryConfig()
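For orientation, a minimal sketch of how this dataclass can be consumed, based only on the hunk above (defaults via DEFAULT_CONFIG, per-field keyword overrides as the docstring describes for tests or other deployments). The override values below are illustrative, not taken from the package, and wiring the instance into UnifiedMemorySystem is assumed rather than shown here.

from roampal.backend.modules.memory.config import MemoryConfig, DEFAULT_CONFIG

# Production path: use the shared default instance.
config = DEFAULT_CONFIG
assert config.promotion_score_threshold == 0.7

# Test/deployment path: override only the fields you need (hypothetical values).
test_config = MemoryConfig(
    max_memory_bank_items=50,          # keep test fixtures small
    kg_debounce_seconds=0,             # no debounce delay in unit tests
    working_memory_retention_hours=1,  # fast cleanup cycle
)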
roampal/backend/modules/memory/content_graph.py
@@ -0,0 +1,543 @@
+"""
+Content Knowledge Graph - Indexes entity relationships from memory_bank content.
+
+Part of dual KG system:
+- Routing KG: Query patterns → collection routing decisions
+- Content KG: Memory content → entity relationships (THIS FILE)
+
+Author: Roampal AI
+Date: 2025-11-06
+Version: v0.2.0
+"""
+
+import json
+import logging
+from typing import Dict, List, Set, Tuple, Any, Optional, Callable
+from collections import defaultdict, deque
+from datetime import datetime
+
+logger = logging.getLogger(__name__)
+
+
+class ContentGraph:
+    """
+    Content-based knowledge graph that indexes entity relationships from memory_bank content.
+
+    Key differences from routing KG:
+    - Data source: memory_bank text content (not queries)
+    - Purpose: Entity relationship mapping (not search routing)
+    - Storage: content_graph.json (separate from knowledge_graph.json)
+
+    Graph structure:
+    {
+        "entities": {
+            "logan": {
+                "mentions": 12,
+                "collections": {"memory_bank": 12},
+                "documents": ["doc_id_1", "doc_id_2", ...],
+                "first_seen": "2025-11-06T10:30:00",
+                "last_seen": "2025-11-06T14:45:00"
+            }
+        },
+        "relationships": {
+            "logan__everbright": {
+                "entities": ["logan", "everbright"],
+                "strength": 8.0,
+                "co_occurrences": 8,
+                "documents": ["doc_id_1", "doc_id_2", ...],
+                "first_seen": "2025-11-06T10:30:00",
+                "last_seen": "2025-11-06T14:45:00"
+            }
+        },
+        "metadata": {
+            "version": "0.2.0",
+            "created": "2025-11-06T10:00:00",
+            "last_updated": "2025-11-06T14:45:00",
+            "total_documents": 29,
+            "total_entities": 47,
+            "total_relationships": 123
+        }
+    }
+    """
+
+    def __init__(self):
+        """Initialize empty content graph."""
+        self.entities: Dict[str, Dict[str, Any]] = {}
+        self.relationships: Dict[str, Dict[str, Any]] = {}
+        self.entity_metadata: Dict[str, Dict[str, Any]] = {}
+        self.metadata = {
+            "version": "0.2.0",
+            "created": datetime.now().isoformat(),
+            "last_updated": datetime.now().isoformat(),
+            "total_documents": 0,
+            "total_entities": 0,
+            "total_relationships": 0
+        }
+
+        # Document tracking for relationship building
+        self._doc_entities: Dict[str, Set[str]] = defaultdict(set)
+
+        logger.info("ContentGraph initialized (empty)")
+
+    def add_entities_from_text(
+        self,
+        text: str,
+        doc_id: str,
+        collection: str,
+        extract_concepts_fn: Callable[[str], List[str]],
+        quality_score: Optional[float] = None
+    ) -> List[str]:
+        """
+        Extract entities from text and index them in the content graph.
+
+        Args:
+            text: Raw text content to analyze
+            doc_id: Unique document identifier
+            collection: Collection name (e.g., "memory_bank")
+            extract_concepts_fn: Function to extract concepts (reuses _extract_concepts from UnifiedMemorySystem)
+            quality_score: Optional quality score (importance × confidence) for this document's entities
+
+        Returns:
+            List of extracted entity names
+
+        Example:
+            >>> cg = ContentGraph()
+            >>> entities = cg.add_entities_from_text(
+            ...     text="User prefers Docker Compose for local development environments",
+            ...     doc_id="mem_001",
+            ...     collection="memory_bank",
+            ...     extract_concepts_fn=memory_system._extract_concepts
+            ... )
+            >>> entities
+            ['docker', 'compose', 'local', 'development', 'environments']
+        """
+        # Extract concepts using same logic as routing KG (consistency)
+        concepts = extract_concepts_fn(text)
+
+        # Filter to meaningful entities (exclude very short/generic terms)
+        entities = [c for c in concepts if len(c) >= 3 and not c.isdigit()]
+
+        if not entities:
+            logger.debug(f"No entities extracted from doc {doc_id}")
+            return []
+
+        now = datetime.now().isoformat()
+
+        # Index each entity
+        for entity in entities:
+            if entity not in self.entities:
+                self.entities[entity] = {
+                    "mentions": 0,
+                    "collections": defaultdict(int),
+                    "documents": [],
+                    "first_seen": now,
+                    "last_seen": now,
+                    "total_quality": 0.0,
+                    "avg_quality": 0.0
+                }
+
+            # Update entity metadata
+            self.entities[entity]["mentions"] += 1
+            self.entities[entity]["collections"][collection] += 1
+            self.entities[entity]["last_seen"] = now
+
+            if doc_id not in self.entities[entity]["documents"]:
+                self.entities[entity]["documents"].append(doc_id)
+
+            # Update quality scores if provided
+            if quality_score is not None:
+                self.entities[entity]["total_quality"] += quality_score
+                self.entities[entity]["avg_quality"] = (
+                    self.entities[entity]["total_quality"] / self.entities[entity]["mentions"]
+                )
+
+        # Track entities in this document for relationship building
+        self._doc_entities[doc_id].update(entities)
+
+        # Build relationships (co-occurrence within same document)
+        self._build_relationships(entities, doc_id, now)
+
+        # Update metadata
+        self.metadata["last_updated"] = now
+        self.metadata["total_entities"] = len(self.entities)
+        self.metadata["total_relationships"] = len(self.relationships)
+
+        logger.info(f"Indexed {len(entities)} entities from doc {doc_id}: {entities[:5]}{'...' if len(entities) > 5 else ''}")
+
+        return entities
+
+    def _build_relationships(self, entities: List[str], doc_id: str, timestamp: str):
+        """
+        Build entity relationships based on co-occurrence in the same document.
+
+        Args:
+            entities: List of entities found in document
+            doc_id: Document identifier
+            timestamp: ISO timestamp for tracking
+
+        Example:
+            If doc contains ["logan", "everbright", "operations_manager"],
+            creates relationships:
+            - logan <-> everbright
+            - logan <-> operations_manager
+            - everbright <-> operations_manager
+        """
+        # Create pairwise relationships (undirected)
+        unique_entities = list(set(entities))
+
+        for i in range(len(unique_entities)):
+            for j in range(i + 1, len(unique_entities)):
+                entity_a = unique_entities[i]
+                entity_b = unique_entities[j]
+
+                # Create canonical relationship ID (sorted to ensure consistency)
+                rel_id = "__".join(sorted([entity_a, entity_b]))
+
+                if rel_id not in self.relationships:
+                    self.relationships[rel_id] = {
+                        "entities": sorted([entity_a, entity_b]),
+                        "strength": 0.0,
+                        "co_occurrences": 0,
+                        "documents": [],
+                        "first_seen": timestamp,
+                        "last_seen": timestamp
+                    }
+
+                # Update relationship metadata
+                self.relationships[rel_id]["co_occurrences"] += 1
+                self.relationships[rel_id]["last_seen"] = timestamp
+
+                if doc_id not in self.relationships[rel_id]["documents"]:
+                    self.relationships[rel_id]["documents"].append(doc_id)
+
+                # Calculate relationship strength (co-occurrence count with decay)
+                # Simple formula: log2(co_occurrences + 1) for diminishing returns
+                co_occur = self.relationships[rel_id]["co_occurrences"]
+                self.relationships[rel_id]["strength"] = round(
+                    (co_occur ** 0.5) * 2.0,  # sqrt for diminishing returns, * 2 for scaling
+                    2
+                )
+
+    def remove_entity_mention(self, doc_id: str):
+        """
+        Remove a document's entity mentions (when memory archived/deleted).
+
+        Args:
+            doc_id: Document identifier to remove
+
+        Notes:
+            - Decrements mention counts
+            - Removes doc from entity.documents
+            - Removes relationships if no more co-occurrences
+            - Does NOT delete entities (preserves historical context)
+        """
+        if doc_id not in self._doc_entities:
+            logger.debug(f"Doc {doc_id} not tracked in content graph")
+            return
+
+        entities_in_doc = self._doc_entities[doc_id]
+        now = datetime.now().isoformat()
+
+        # Decrement entity mentions
+        for entity in entities_in_doc:
+            if entity in self.entities:
+                self.entities[entity]["mentions"] -= 1
+
+                if doc_id in self.entities[entity]["documents"]:
+                    self.entities[entity]["documents"].remove(doc_id)
+
+                # Keep entity metadata even if mentions reach 0 (historical context)
+
+        # Remove relationships involving these entities
+        rels_to_remove = []
+        for rel_id, rel_data in self.relationships.items():
+            if doc_id in rel_data["documents"]:
+                rel_data["documents"].remove(doc_id)
+                rel_data["co_occurrences"] -= 1
+
+                # Recalculate strength
+                if rel_data["co_occurrences"] > 0:
+                    co_occur = rel_data["co_occurrences"]
+                    rel_data["strength"] = round((co_occur ** 0.5) * 2.0, 2)
+                else:
+                    # Mark for deletion if no more co-occurrences
+                    rels_to_remove.append(rel_id)
+
+        # Clean up relationships with 0 co-occurrences
+        for rel_id in rels_to_remove:
+            del self.relationships[rel_id]
+
+        # Remove from tracking
+        del self._doc_entities[doc_id]
+
+        # Update metadata
+        self.metadata["last_updated"] = now
+        self.metadata["total_relationships"] = len(self.relationships)
+
+        logger.info(f"Removed doc {doc_id} from content graph ({len(entities_in_doc)} entities, {len(rels_to_remove)} relationships)")
+
+    def get_entity_relationships(self, entity: str, min_strength: float = 0.0) -> List[Dict[str, Any]]:
+        """
+        Get all relationships for a specific entity.
+
+        Args:
+            entity: Entity name to query
+            min_strength: Minimum relationship strength threshold
+
+        Returns:
+            List of relationships with connected entities and metadata
+
+        Example:
+            >>> cg.get_entity_relationships("logan", min_strength=2.0)
+            [
+                {
+                    "related_entity": "everbright",
+                    "strength": 5.66,
+                    "co_occurrences": 8,
+                    "documents": ["mem_001", "mem_003", ...],
+                    "first_seen": "2025-11-06T10:30:00",
+                    "last_seen": "2025-11-06T14:45:00"
+                },
+                ...
+            ]
+        """
+        if entity not in self.entities:
+            logger.debug(f"Entity '{entity}' not found in content graph")
+            return []
+
+        relationships = []
+
+        for rel_id, rel_data in self.relationships.items():
+            if entity in rel_data["entities"] and rel_data["strength"] >= min_strength:
+                # Find the other entity in the relationship
+                related_entity = [e for e in rel_data["entities"] if e != entity][0]
+
+                relationships.append({
+                    "related_entity": related_entity,
+                    "strength": rel_data["strength"],
+                    "co_occurrences": rel_data["co_occurrences"],
+                    "documents": rel_data["documents"],
+                    "first_seen": rel_data["first_seen"],
+                    "last_seen": rel_data["last_seen"]
+                })
+
+        # Sort by strength (descending)
+        relationships.sort(key=lambda x: x["strength"], reverse=True)
+
+        logger.debug(f"Found {len(relationships)} relationships for entity '{entity}'")
+        return relationships
+
+    def find_path(
+        self,
+        from_entity: str,
+        to_entity: str,
+        max_depth: int = 3
+    ) -> Optional[List[str]]:
+        """
+        Find shortest path between two entities using BFS.
+
+        Args:
+            from_entity: Starting entity
+            to_entity: Target entity
+            max_depth: Maximum path length to search
+
+        Returns:
+            List of entities forming path, or None if no path found
+
+        Example:
+            >>> cg.find_path("logan", "everbright", max_depth=3)
+            ["logan", "everbright"]  # Direct connection
+
+            >>> cg.find_path("logan", "solar_energy", max_depth=3)
+            ["logan", "everbright", "solar_energy"]  # 2-hop connection
+        """
+        if from_entity not in self.entities or to_entity not in self.entities:
+            logger.debug(f"Entity not found: from='{from_entity}' to='{to_entity}'")
+            return None
+
+        if from_entity == to_entity:
+            return [from_entity]
+
+        # BFS to find shortest path
+        queue = deque([(from_entity, [from_entity])])
+        visited = {from_entity}
+
+        while queue:
+            current, path = queue.popleft()
+
+            if len(path) > max_depth:
+                continue
+
+            # Get neighbors
+            for rel_id, rel_data in self.relationships.items():
+                if current in rel_data["entities"]:
+                    # Find connected entity
+                    neighbor = [e for e in rel_data["entities"] if e != current][0]
+
+                    if neighbor == to_entity:
+                        # Found target
+                        result_path = path + [neighbor]
+                        logger.info(f"Found path: {' -> '.join(result_path)}")
+                        return result_path
+
+                    if neighbor not in visited:
+                        visited.add(neighbor)
+                        queue.append((neighbor, path + [neighbor]))
+
+        logger.debug(f"No path found between '{from_entity}' and '{to_entity}' (max_depth={max_depth})")
+        return None
+
+    def get_all_entities(self, min_mentions: int = 1) -> List[Dict[str, Any]]:
+        """
+        Get all entities with metadata.
+
+        Args:
+            min_mentions: Minimum mention count threshold
+
+        Returns:
+            List of entities with full metadata
+        """
+        entities = [
+            {
+                "entity": name,  # Changed from "name" to "entity" for consistency with routing KG
+                **data
+            }
+            for name, data in self.entities.items()
+            if data["mentions"] >= min_mentions
+        ]
+
+        # Sort by avg_quality (descending), fallback to mentions for backward compatibility
+        entities.sort(key=lambda x: x.get("avg_quality", 0.0), reverse=True)
+
+        return entities
+
+    def get_stats(self) -> Dict[str, Any]:
+        """
+        Get content graph statistics.
+
+        Returns:
+            Dictionary with graph metrics
+        """
+        return {
+            "total_entities": len(self.entities),
+            "total_relationships": len(self.relationships),
+            "total_documents": len(self._doc_entities),
+            "avg_mentions_per_entity": round(
+                sum(e["mentions"] for e in self.entities.values()) / len(self.entities), 2
+            ) if self.entities else 0.0,
+            "avg_relationships_per_entity": round(
+                (len(self.relationships) * 2) / len(self.entities), 2
+            ) if self.entities else 0.0,
+            "strongest_relationship": max(
+                (r for r in self.relationships.values()),
+                key=lambda x: x["strength"],
+                default=None
+            ),
+            "most_mentioned_entity": max(
+                self.entities.items(),
+                key=lambda x: x[1]["mentions"],
+                default=(None, None)
+            )[0],
+            "metadata": self.metadata
+        }
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize content graph to dictionary.
+
+        Returns:
+            Complete graph data as dictionary
+        """
+        # Convert defaultdicts to regular dicts for JSON serialization
+        serializable_entities = {}
+        for name, data in self.entities.items():
+            entity_copy = data.copy()
+            if isinstance(entity_copy.get("collections"), defaultdict):
+                entity_copy["collections"] = dict(entity_copy["collections"])
+            serializable_entities[name] = entity_copy
+
+        return {
+            "entities": serializable_entities,
+            "relationships": self.relationships,
+            "metadata": self.metadata
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ContentGraph":
+        """
+        Deserialize content graph from dictionary.
+
+        Args:
+            data: Graph data dictionary
+
+        Returns:
+            ContentGraph instance
+        """
+        graph = cls()
+
+        # Load entities
+        graph.entities = {}
+        for name, entity_data in data.get("entities", {}).items():
+            entity_copy = entity_data.copy()
+            # Convert collections dict back to defaultdict
+            if "collections" in entity_copy:
+                entity_copy["collections"] = defaultdict(int, entity_copy["collections"])
+            graph.entities[name] = entity_copy
+
+        # Load relationships
+        graph.relationships = data.get("relationships", {})
+
+        # Load metadata
+        graph.metadata = data.get("metadata", graph.metadata)
+
+        # Rebuild doc_entities tracking
+        for entity_name, entity_data in graph.entities.items():
+            for doc_id in entity_data.get("documents", []):
+                graph._doc_entities[doc_id].add(entity_name)
+
+        logger.info(
+            f"ContentGraph loaded: {len(graph.entities)} entities, "
+            f"{len(graph.relationships)} relationships, "
+            f"{len(graph._doc_entities)} documents"
+        )
+
+        return graph
+
+    def save_to_file(self, filepath: str):
+        """
+        Save content graph to JSON file.
+
+        Args:
+            filepath: Path to save file
+        """
+        try:
+            with open(filepath, "w", encoding="utf-8") as f:
+                json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
+            logger.info(f"ContentGraph saved to {filepath}")
+        except Exception as e:
+            logger.error(f"Failed to save ContentGraph to {filepath}: {e}")
+            raise
+
+    @classmethod
+    def load_from_file(cls, filepath: str) -> "ContentGraph":
+        """
+        Load content graph from JSON file.
+
+        Args:
+            filepath: Path to load file
+
+        Returns:
+            ContentGraph instance
+        """
+        try:
+            with open(filepath, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            graph = cls.from_dict(data)
+            logger.info(f"ContentGraph loaded from {filepath}")
+            return graph
+        except FileNotFoundError:
+            logger.warning(f"ContentGraph file not found: {filepath}, creating new graph")
+            return cls()
+        except Exception as e:
+            logger.error(f"Failed to load ContentGraph from {filepath}: {e}")
+            raise
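As a quick orientation to the ContentGraph API above, here is a minimal usage sketch. It is not taken from the package: the sample sentence, doc_id, and the regex tokenizer below are illustrative stand-ins; in the real system the caller passes UnifiedMemorySystem._extract_concepts as extract_concepts_fn.

import re
from typing import List

from roampal.backend.modules.memory.content_graph import ContentGraph

# Stand-in tokenizer (assumption): lowercased word tokens in place of _extract_concepts.
def extract_concepts(text: str) -> List[str]:
    return [w.lower() for w in re.findall(r"[A-Za-z_]+", text)]

cg = ContentGraph()
cg.add_entities_from_text(
    text="Logan works at Everbright as operations manager",
    doc_id="mem_001",
    collection="memory_bank",
    extract_concepts_fn=extract_concepts,
    quality_score=0.8,
)

# Entities co-occurring in the same document are now linked by strength-weighted edges.
print(cg.get_entity_relationships("logan", min_strength=1.0))
print(cg.find_path("logan", "operations"))   # BFS over co-occurrence edges
cg.save_to_file("content_graph.json")        # round-trips via to_dict() / ContentGraph.load_from_file()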