odin-engine 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63):
  1. benchmarks/__init__.py +17 -17
  2. benchmarks/datasets.py +284 -284
  3. benchmarks/metrics.py +275 -275
  4. benchmarks/run_ablation.py +279 -279
  5. benchmarks/run_npll_benchmark.py +270 -270
  6. npll/__init__.py +10 -10
  7. npll/bootstrap.py +474 -474
  8. npll/core/__init__.py +33 -33
  9. npll/core/knowledge_graph.py +308 -308
  10. npll/core/logical_rules.py +496 -496
  11. npll/core/mln.py +474 -474
  12. npll/inference/__init__.py +40 -40
  13. npll/inference/e_step.py +419 -419
  14. npll/inference/elbo.py +434 -434
  15. npll/inference/m_step.py +576 -576
  16. npll/npll_model.py +631 -631
  17. npll/scoring/__init__.py +42 -42
  18. npll/scoring/embeddings.py +441 -441
  19. npll/scoring/probability.py +402 -402
  20. npll/scoring/scoring_module.py +369 -369
  21. npll/training/__init__.py +24 -24
  22. npll/training/evaluation.py +496 -496
  23. npll/training/npll_trainer.py +520 -520
  24. npll/utils/__init__.py +47 -47
  25. npll/utils/batch_utils.py +492 -492
  26. npll/utils/config.py +144 -144
  27. npll/utils/math_utils.py +338 -338
  28. odin/__init__.py +21 -20
  29. odin/engine.py +264 -264
  30. odin/schema.py +210 -0
  31. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/METADATA +503 -456
  32. odin_engine-0.2.0.dist-info/RECORD +63 -0
  33. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/licenses/LICENSE +21 -21
  34. retrieval/__init__.py +50 -50
  35. retrieval/adapters.py +140 -140
  36. retrieval/adapters_arango.py +1418 -1418
  37. retrieval/aggregators.py +707 -707
  38. retrieval/beam.py +127 -127
  39. retrieval/budget.py +60 -60
  40. retrieval/cache.py +159 -159
  41. retrieval/confidence.py +88 -88
  42. retrieval/eval.py +49 -49
  43. retrieval/linker.py +87 -87
  44. retrieval/metrics.py +105 -105
  45. retrieval/metrics_motifs.py +36 -36
  46. retrieval/orchestrator.py +571 -571
  47. retrieval/ppr/__init__.py +12 -12
  48. retrieval/ppr/anchors.py +41 -41
  49. retrieval/ppr/bippr.py +61 -61
  50. retrieval/ppr/engines.py +257 -257
  51. retrieval/ppr/global_pr.py +76 -76
  52. retrieval/ppr/indexes.py +78 -78
  53. retrieval/ppr.py +156 -156
  54. retrieval/ppr_cache.py +25 -25
  55. retrieval/scoring.py +294 -294
  56. retrieval/utils/pii_redaction.py +36 -36
  57. retrieval/writers/__init__.py +9 -9
  58. retrieval/writers/arango_writer.py +28 -28
  59. retrieval/writers/base.py +21 -21
  60. retrieval/writers/janus_writer.py +36 -36
  61. odin_engine-0.1.0.dist-info/RECORD +0 -62
  62. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/WHEEL +0 -0
  63. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/top_level.txt +0 -0
odin/__init__.py CHANGED
@@ -1,20 +1,21 @@
1
- """
2
- Odin Knowledge Graph Intelligence Engine
3
-
4
- A library for intelligent knowledge graph exploration using:
5
- - Personalized PageRank (PPR) for structural importance
6
- - Beam Search for efficient path finding
7
- - NPLL (Neural Probabilistic Logic Learning) for semantic plausibility
8
-
9
- Usage:
10
- from odin import OdinEngine
11
-
12
- engine = OdinEngine(db=my_arango_db)
13
- results = engine.retrieve(seeds=["Patient_123"])
14
- score = engine.score_edge("Patient_A", "treated_by", "Dr_Smith")
15
- """
16
-
17
- from .engine import OdinEngine
18
-
19
- __all__ = ["OdinEngine"]
20
- __version__ = "1.0.0"
1
+ """
2
+ Odin Knowledge Graph Intelligence Engine
3
+
4
+ A library for intelligent knowledge graph exploration using:
5
+ - Personalized PageRank (PPR) for structural importance
6
+ - Beam Search for efficient path finding
7
+ - NPLL (Neural Probabilistic Logic Learning) for semantic plausibility
8
+
9
+ Usage:
10
+ from odin import OdinEngine
11
+
12
+ engine = OdinEngine(db=my_arango_db)
13
+ results = engine.retrieve(seeds=["Patient_123"])
14
+ score = engine.score_edge("Patient_A", "treated_by", "Dr_Smith")
15
+ """
16
+
17
+ from .engine import OdinEngine
18
+ from .schema import SchemaInspector, inspect_arango_schema
19
+
20
+ __all__ = ["OdinEngine", "SchemaInspector", "inspect_arango_schema"]
21
+ __version__ = "0.2.0"
odin/engine.py CHANGED
@@ -1,264 +1,264 @@
1
- """
2
- OdinEngine: The main entry point for the Odin KG Intelligence Library.
3
-
4
- This class orchestrates all components:
5
- - Graph access (with caching)
6
- - NPLL model management (auto-train if needed)
7
- - Retrieval (PPR + Beam Search + Scoring)
8
- """
9
-
10
- import os
11
- import sys
12
- import logging
13
- from typing import List, Dict, Any, Optional
14
-
15
- from arango.database import StandardDatabase
16
-
17
- # Add parent path for imports
18
- sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19
-
20
- from npll.bootstrap import KnowledgeBootstrapper
21
- from npll.npll_model import NPLLModel
22
- from retrieval.orchestrator import RetrievalOrchestrator, OrchestratorParams
23
- from retrieval.adapters_arango import ArangoCommunityAccessor, GlobalGraphAccessor
24
- from retrieval.cache import CachedGraphAccessor
25
- from retrieval.confidence import NPLLConfidence, ConstantConfidence
26
- from retrieval.ppr.anchors import APPRAnchors, APPRAnchorParams
27
-
28
- logger = logging.getLogger("odin")
29
-
30
-
31
- class OdinEngine:
32
- """
33
- Main entry point for the Odin Knowledge Graph Intelligence Library.
34
-
35
- Handles:
36
- - Graph access (with caching)
37
- - NPLL model loading (auto-trains if needed)
38
- - Retrieval orchestration (PPR + Beam Search + NPLL Scoring)
39
-
40
- Example:
41
- from odin import OdinEngine
42
- from arango import ArangoClient
43
-
44
- client = ArangoClient(hosts="http://localhost:8529")
45
- db = client.db("KG-test", username="root", password="")
46
-
47
- engine = OdinEngine(db)
48
- results = engine.retrieve(seeds=["Patient_123"])
49
- """
50
-
51
- def __init__(
52
- self,
53
- db: StandardDatabase,
54
- community_id: str = "global",
55
- cache_size: int = 5000,
56
- auto_train: bool = True,
57
- community_mode: str = "none", # "none" = global, "mapping" = scoped
58
- ):
59
- """
60
- Initialize the Odin Engine.
61
-
62
- Args:
63
- db: Connected ArangoDB database instance
64
- community_id: Community to scope queries to (default: "global")
65
- cache_size: Size of the graph accessor cache (default: 5000)
66
- auto_train: If True, automatically train NPLL if no model exists (default: True)
67
- community_mode: "none" for global exploration, "mapping" for community-scoped
68
- """
69
- self.db = db
70
- self.community_id = community_id
71
-
72
- logger.info(f"Initializing OdinEngine for community '{community_id}' (mode: {community_mode})...")
73
-
74
- # 1. Setup Graph Accessor (with caching)
75
- base_accessor = ArangoCommunityAccessor(
76
- db=db,
77
- community_id=community_id,
78
- community_mode=community_mode,
79
- )
80
- self.accessor = CachedGraphAccessor(base_accessor, cache_size=cache_size)
81
-
82
- # Global accessor for cross-community queries
83
- self.global_accessor = GlobalGraphAccessor(db=db, algorithm="gnn")
84
-
85
- # 2. Load/Train NPLL Model
86
- self.npll_model: Optional[NPLLModel] = None
87
- self.confidence = self._initialize_intelligence(auto_train)
88
-
89
- # 3. Setup Orchestrator
90
- self.orchestrator = RetrievalOrchestrator(
91
- accessor=self.accessor,
92
- edge_confidence=self.confidence,
93
- )
94
-
95
- # 4. Setup PPR Anchor Engine
96
- self.anchor_engine = APPRAnchors(self.accessor)
97
-
98
- mode = "NPLL" if self.npll_model else "Fallback"
99
- logger.info(f"✓ OdinEngine initialized (Intelligence: {mode})")
100
-
101
- def _initialize_intelligence(self, auto_train: bool):
102
- """Load or train NPLL model."""
103
- if not auto_train:
104
- logger.info("Auto-train disabled. Using constant confidence.")
105
- return ConstantConfidence(0.8)
106
-
107
- try:
108
- bootstrapper = KnowledgeBootstrapper(db=self.db)
109
- self.npll_model = bootstrapper.ensure_model_ready()
110
-
111
- if self.npll_model:
112
- return NPLLConfidence(self.npll_model, cache_size=10000)
113
- else:
114
- logger.warning("NPLL training failed. Using constant confidence.")
115
- return ConstantConfidence(0.8)
116
-
117
- except Exception as e:
118
- logger.error(f"Failed to initialize NPLL: {e}")
119
- return ConstantConfidence(0.8)
120
-
121
- def retrieve(
122
- self,
123
- seeds: List[str],
124
- max_paths: int = 50,
125
- hop_limit: int = 3,
126
- beam_width: int = 64,
127
- ) -> Dict[str, Any]:
128
- """
129
- Retrieve relevant paths from seed nodes.
130
-
131
- Uses PPR + Beam Search + NPLL Scoring to find the most relevant
132
- paths in the knowledge graph starting from the given seeds.
133
-
134
- Args:
135
- seeds: List of starting node IDs (e.g., ["Patient_123", "Claim_456"])
136
- max_paths: Maximum number of paths to return (default: 50)
137
- hop_limit: Maximum path length (default: 3)
138
- beam_width: Beam search width (default: 64)
139
-
140
- Returns:
141
- Dict containing:
142
- - topk_ppr: Top nodes by PageRank importance
143
- - paths: Discovered paths with scores
144
- - insight_score: Overall quality score
145
- - aggregates: Motifs, relations, anchors
146
- """
147
- params = OrchestratorParams(
148
- community_id=self.community_id,
149
- max_paths=max_paths,
150
- hop_limit=hop_limit,
151
- beam_width=beam_width,
152
- )
153
- return self.orchestrator.retrieve(seeds=seeds, params=params)
154
-
155
- def score_edge(self, src: str, rel: str, dst: str) -> float:
156
- """
157
- Score how plausible an edge is (0.0 to 1.0).
158
-
159
- Uses the trained NPLL model to estimate the probability
160
- that the given edge (src --rel--> dst) is valid.
161
-
162
- Args:
163
- src: Source node ID
164
- rel: Relationship type
165
- dst: Destination node ID
166
-
167
- Returns:
168
- Probability score between 0.0 and 1.0
169
- """
170
- return self.confidence.confidence(src, rel, dst)
171
-
172
- def find_anchors(self, seeds: List[str], topn: int = 20) -> List[tuple]:
173
- """
174
- Use PPR (PageRank) to find the most important nodes relative to seeds.
175
-
176
- Args:
177
- seeds: Starting node IDs
178
- topn: Number of top nodes to return (default: 20)
179
-
180
- Returns:
181
- List of (node_id, ppr_score) tuples sorted by importance
182
- """
183
- params = APPRAnchorParams(topn=topn)
184
- return self.anchor_engine.build_for_community(
185
- community_id=self.community_id,
186
- seed_set=seeds,
187
- params=params,
188
- )
189
-
190
- def get_neighbors(self, node_id: str) -> Dict[str, Any]:
191
- """
192
- Get all neighbors of a node with relationship types.
193
-
194
- Args:
195
- node_id: The node to inspect
196
-
197
- Returns:
198
- Dict with node info and list of neighbors
199
- """
200
- node = self.accessor.get_node(node_id)
201
-
202
- neighbors = []
203
- for neighbor_id, relation, weight in self.accessor.iter_out(node_id):
204
- neighbors.append({
205
- "id": neighbor_id,
206
- "rel": relation,
207
- "weight": weight,
208
- "direction": "out"
209
- })
210
-
211
- for neighbor_id, relation, weight in self.accessor.iter_in(node_id):
212
- neighbors.append({
213
- "id": neighbor_id,
214
- "rel": relation,
215
- "weight": weight,
216
- "direction": "in"
217
- })
218
-
219
- return {
220
- "node": node,
221
- "neighbors": neighbors,
222
- "degree": len(neighbors),
223
- }
224
-
225
- def retrain_model(self) -> bool:
226
- """
227
- Force retrain the NPLL model.
228
-
229
- Useful after significant data changes.
230
-
231
- Returns:
232
- True if training succeeded, False otherwise
233
- """
234
- try:
235
- bootstrapper = KnowledgeBootstrapper(db=self.db)
236
- self.npll_model = bootstrapper.ensure_model_ready(force_retrain=True)
237
-
238
- if self.npll_model:
239
- self.confidence = NPLLConfidence(self.npll_model, cache_size=10000)
240
- self.orchestrator = RetrievalOrchestrator(
241
- accessor=self.accessor,
242
- edge_confidence=self.confidence,
243
- )
244
- logger.info("✓ Model retrained successfully")
245
- return True
246
- return False
247
-
248
- except Exception as e:
249
- logger.error(f"Retraining failed: {e}")
250
- return False
251
-
252
- @property
253
- def has_npll(self) -> bool:
254
- """Check if NPLL model is loaded."""
255
- return self.npll_model is not None
256
-
257
- def get_status(self) -> Dict[str, Any]:
258
- """Get engine status information."""
259
- return {
260
- "community_id": self.community_id,
261
- "npll_loaded": self.has_npll,
262
- "intelligence_mode": "NPLL" if self.has_npll else "Constant",
263
- "cache_size": getattr(self.accessor, 'cache_size', 'unknown'),
264
- }
1
+ """
2
+ OdinEngine: The main entry point for the Odin KG Intelligence Library.
3
+
4
+ This class orchestrates all components:
5
+ - Graph access (with caching)
6
+ - NPLL model management (auto-train if needed)
7
+ - Retrieval (PPR + Beam Search + Scoring)
8
+ """
9
+
10
+ import os
11
+ import sys
12
+ import logging
13
+ from typing import List, Dict, Any, Optional
14
+
15
+ from arango.database import StandardDatabase
16
+
17
+ # Add parent path for imports
18
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19
+
20
+ from npll.bootstrap import KnowledgeBootstrapper
21
+ from npll.npll_model import NPLLModel
22
+ from retrieval.orchestrator import RetrievalOrchestrator, OrchestratorParams
23
+ from retrieval.adapters_arango import ArangoCommunityAccessor, GlobalGraphAccessor
24
+ from retrieval.cache import CachedGraphAccessor
25
+ from retrieval.confidence import NPLLConfidence, ConstantConfidence
26
+ from retrieval.ppr.anchors import APPRAnchors, APPRAnchorParams
27
+
28
+ logger = logging.getLogger("odin")
29
+
30
+
31
+ class OdinEngine:
32
+ """
33
+ Main entry point for the Odin Knowledge Graph Intelligence Library.
34
+
35
+ Handles:
36
+ - Graph access (with caching)
37
+ - NPLL model loading (auto-trains if needed)
38
+ - Retrieval orchestration (PPR + Beam Search + NPLL Scoring)
39
+
40
+ Example:
41
+ from odin import OdinEngine
42
+ from arango import ArangoClient
43
+
44
+ client = ArangoClient(hosts="http://localhost:8529")
45
+ db = client.db("KG-test", username="root", password="")
46
+
47
+ engine = OdinEngine(db)
48
+ results = engine.retrieve(seeds=["Patient_123"])
49
+ """
50
+
51
+ def __init__(
52
+ self,
53
+ db: StandardDatabase,
54
+ community_id: str = "global",
55
+ cache_size: int = 5000,
56
+ auto_train: bool = True,
57
+ community_mode: str = "none", # "none" = global, "mapping" = scoped
58
+ ):
59
+ """
60
+ Initialize the Odin Engine.
61
+
62
+ Args:
63
+ db: Connected ArangoDB database instance
64
+ community_id: Community to scope queries to (default: "global")
65
+ cache_size: Size of the graph accessor cache (default: 5000)
66
+ auto_train: If True, automatically train NPLL if no model exists (default: True)
67
+ community_mode: "none" for global exploration, "mapping" for community-scoped
68
+ """
69
+ self.db = db
70
+ self.community_id = community_id
71
+
72
+ logger.info(f"Initializing OdinEngine for community '{community_id}' (mode: {community_mode})...")
73
+
74
+ # 1. Setup Graph Accessor (with caching)
75
+ base_accessor = ArangoCommunityAccessor(
76
+ db=db,
77
+ community_id=community_id,
78
+ community_mode=community_mode,
79
+ )
80
+ self.accessor = CachedGraphAccessor(base_accessor, cache_size=cache_size)
81
+
82
+ # Global accessor for cross-community queries
83
+ self.global_accessor = GlobalGraphAccessor(db=db, algorithm="gnn")
84
+
85
+ # 2. Load/Train NPLL Model
86
+ self.npll_model: Optional[NPLLModel] = None
87
+ self.confidence = self._initialize_intelligence(auto_train)
88
+
89
+ # 3. Setup Orchestrator
90
+ self.orchestrator = RetrievalOrchestrator(
91
+ accessor=self.accessor,
92
+ edge_confidence=self.confidence,
93
+ )
94
+
95
+ # 4. Setup PPR Anchor Engine
96
+ self.anchor_engine = APPRAnchors(self.accessor)
97
+
98
+ mode = "NPLL" if self.npll_model else "Fallback"
99
+ logger.info(f"✓ OdinEngine initialized (Intelligence: {mode})")
100
+
101
+ def _initialize_intelligence(self, auto_train: bool):
102
+ """Load or train NPLL model."""
103
+ if not auto_train:
104
+ logger.info("Auto-train disabled. Using constant confidence.")
105
+ return ConstantConfidence(0.8)
106
+
107
+ try:
108
+ bootstrapper = KnowledgeBootstrapper(db=self.db)
109
+ self.npll_model = bootstrapper.ensure_model_ready()
110
+
111
+ if self.npll_model:
112
+ return NPLLConfidence(self.npll_model, cache_size=10000)
113
+ else:
114
+ logger.warning("NPLL training failed. Using constant confidence.")
115
+ return ConstantConfidence(0.8)
116
+
117
+ except Exception as e:
118
+ logger.error(f"Failed to initialize NPLL: {e}")
119
+ return ConstantConfidence(0.8)
120
+
121
+ def retrieve(
122
+ self,
123
+ seeds: List[str],
124
+ max_paths: int = 50,
125
+ hop_limit: int = 3,
126
+ beam_width: int = 64,
127
+ ) -> Dict[str, Any]:
128
+ """
129
+ Retrieve relevant paths from seed nodes.
130
+
131
+ Uses PPR + Beam Search + NPLL Scoring to find the most relevant
132
+ paths in the knowledge graph starting from the given seeds.
133
+
134
+ Args:
135
+ seeds: List of starting node IDs (e.g., ["Patient_123", "Claim_456"])
136
+ max_paths: Maximum number of paths to return (default: 50)
137
+ hop_limit: Maximum path length (default: 3)
138
+ beam_width: Beam search width (default: 64)
139
+
140
+ Returns:
141
+ Dict containing:
142
+ - topk_ppr: Top nodes by PageRank importance
143
+ - paths: Discovered paths with scores
144
+ - insight_score: Overall quality score
145
+ - aggregates: Motifs, relations, anchors
146
+ """
147
+ params = OrchestratorParams(
148
+ community_id=self.community_id,
149
+ max_paths=max_paths,
150
+ hop_limit=hop_limit,
151
+ beam_width=beam_width,
152
+ )
153
+ return self.orchestrator.retrieve(seeds=seeds, params=params)
154
+
155
+ def score_edge(self, src: str, rel: str, dst: str) -> float:
156
+ """
157
+ Score how plausible an edge is (0.0 to 1.0).
158
+
159
+ Uses the trained NPLL model to estimate the probability
160
+ that the given edge (src --rel--> dst) is valid.
161
+
162
+ Args:
163
+ src: Source node ID
164
+ rel: Relationship type
165
+ dst: Destination node ID
166
+
167
+ Returns:
168
+ Probability score between 0.0 and 1.0
169
+ """
170
+ return self.confidence.confidence(src, rel, dst)
171
+
172
+ def find_anchors(self, seeds: List[str], topn: int = 20) -> List[tuple]:
173
+ """
174
+ Use PPR (PageRank) to find the most important nodes relative to seeds.
175
+
176
+ Args:
177
+ seeds: Starting node IDs
178
+ topn: Number of top nodes to return (default: 20)
179
+
180
+ Returns:
181
+ List of (node_id, ppr_score) tuples sorted by importance
182
+ """
183
+ params = APPRAnchorParams(topn=topn)
184
+ return self.anchor_engine.build_for_community(
185
+ community_id=self.community_id,
186
+ seed_set=seeds,
187
+ params=params,
188
+ )
189
+
190
+ def get_neighbors(self, node_id: str) -> Dict[str, Any]:
191
+ """
192
+ Get all neighbors of a node with relationship types.
193
+
194
+ Args:
195
+ node_id: The node to inspect
196
+
197
+ Returns:
198
+ Dict with node info and list of neighbors
199
+ """
200
+ node = self.accessor.get_node(node_id)
201
+
202
+ neighbors = []
203
+ for neighbor_id, relation, weight in self.accessor.iter_out(node_id):
204
+ neighbors.append({
205
+ "id": neighbor_id,
206
+ "rel": relation,
207
+ "weight": weight,
208
+ "direction": "out"
209
+ })
210
+
211
+ for neighbor_id, relation, weight in self.accessor.iter_in(node_id):
212
+ neighbors.append({
213
+ "id": neighbor_id,
214
+ "rel": relation,
215
+ "weight": weight,
216
+ "direction": "in"
217
+ })
218
+
219
+ return {
220
+ "node": node,
221
+ "neighbors": neighbors,
222
+ "degree": len(neighbors),
223
+ }
224
+
225
+ def retrain_model(self) -> bool:
226
+ """
227
+ Force retrain the NPLL model.
228
+
229
+ Useful after significant data changes.
230
+
231
+ Returns:
232
+ True if training succeeded, False otherwise
233
+ """
234
+ try:
235
+ bootstrapper = KnowledgeBootstrapper(db=self.db)
236
+ self.npll_model = bootstrapper.ensure_model_ready(force_retrain=True)
237
+
238
+ if self.npll_model:
239
+ self.confidence = NPLLConfidence(self.npll_model, cache_size=10000)
240
+ self.orchestrator = RetrievalOrchestrator(
241
+ accessor=self.accessor,
242
+ edge_confidence=self.confidence,
243
+ )
244
+ logger.info("✓ Model retrained successfully")
245
+ return True
246
+ return False
247
+
248
+ except Exception as e:
249
+ logger.error(f"Retraining failed: {e}")
250
+ return False
251
+
252
+ @property
253
+ def has_npll(self) -> bool:
254
+ """Check if NPLL model is loaded."""
255
+ return self.npll_model is not None
256
+
257
+ def get_status(self) -> Dict[str, Any]:
258
+ """Get engine status information."""
259
+ return {
260
+ "community_id": self.community_id,
261
+ "npll_loaded": self.has_npll,
262
+ "intelligence_mode": "NPLL" if self.has_npll else "Constant",
263
+ "cache_size": getattr(self.accessor, 'cache_size', 'unknown'),
264
+ }