odin-engine 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. benchmarks/__init__.py +17 -17
  2. benchmarks/datasets.py +284 -284
  3. benchmarks/metrics.py +275 -275
  4. benchmarks/run_ablation.py +279 -279
  5. benchmarks/run_npll_benchmark.py +270 -270
  6. npll/__init__.py +10 -10
  7. npll/bootstrap.py +474 -474
  8. npll/core/__init__.py +33 -33
  9. npll/core/knowledge_graph.py +308 -308
  10. npll/core/logical_rules.py +496 -496
  11. npll/core/mln.py +474 -474
  12. npll/inference/__init__.py +40 -40
  13. npll/inference/e_step.py +419 -419
  14. npll/inference/elbo.py +434 -434
  15. npll/inference/m_step.py +576 -576
  16. npll/npll_model.py +631 -631
  17. npll/scoring/__init__.py +42 -42
  18. npll/scoring/embeddings.py +441 -441
  19. npll/scoring/probability.py +402 -402
  20. npll/scoring/scoring_module.py +369 -369
  21. npll/training/__init__.py +24 -24
  22. npll/training/evaluation.py +496 -496
  23. npll/training/npll_trainer.py +520 -520
  24. npll/utils/__init__.py +47 -47
  25. npll/utils/batch_utils.py +492 -492
  26. npll/utils/config.py +144 -144
  27. npll/utils/math_utils.py +338 -338
  28. odin/__init__.py +21 -20
  29. odin/engine.py +264 -264
  30. odin/schema.py +210 -0
  31. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/METADATA +503 -456
  32. odin_engine-0.2.0.dist-info/RECORD +63 -0
  33. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/licenses/LICENSE +21 -21
  34. retrieval/__init__.py +50 -50
  35. retrieval/adapters.py +140 -140
  36. retrieval/adapters_arango.py +1418 -1418
  37. retrieval/aggregators.py +707 -707
  38. retrieval/beam.py +127 -127
  39. retrieval/budget.py +60 -60
  40. retrieval/cache.py +159 -159
  41. retrieval/confidence.py +88 -88
  42. retrieval/eval.py +49 -49
  43. retrieval/linker.py +87 -87
  44. retrieval/metrics.py +105 -105
  45. retrieval/metrics_motifs.py +36 -36
  46. retrieval/orchestrator.py +571 -571
  47. retrieval/ppr/__init__.py +12 -12
  48. retrieval/ppr/anchors.py +41 -41
  49. retrieval/ppr/bippr.py +61 -61
  50. retrieval/ppr/engines.py +257 -257
  51. retrieval/ppr/global_pr.py +76 -76
  52. retrieval/ppr/indexes.py +78 -78
  53. retrieval/ppr.py +156 -156
  54. retrieval/ppr_cache.py +25 -25
  55. retrieval/scoring.py +294 -294
  56. retrieval/utils/pii_redaction.py +36 -36
  57. retrieval/writers/__init__.py +9 -9
  58. retrieval/writers/arango_writer.py +28 -28
  59. retrieval/writers/base.py +21 -21
  60. retrieval/writers/janus_writer.py +36 -36
  61. odin_engine-0.1.0.dist-info/RECORD +0 -62
  62. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/WHEEL +0 -0
  63. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/top_level.txt +0 -0
retrieval/cache.py CHANGED
@@ -1,159 +1,159 @@
- """
- Caching layer for GraphAccessor to prevent network hammering during PPR.
-
- PRODUCTION FIX: PPR's push algorithm repeatedly calls accessor.iter_out(u) for
- the same nodes, causing excessive network traffic. CachedGraphAccessor wraps any
- accessor and caches neighbor lookups.
- """
-
- from __future__ import annotations
- from typing import Iterable, Optional, List, Tuple
- from collections import OrderedDict
-
- from .adapters import GraphAccessor, NodeId, RelId
-
-
- class CachedGraphAccessor:
-     """
-     Wraps a GraphAccessor with LRU caching for neighbor queries.
-
-     Critical for production: Prevents "network hammer" issue where PPR
-     makes repeated calls to iter_out() for the same nodes, each hitting
-     the database/network.
-
-     Usage:
-         base_accessor = ArangoCommunityAccessor(db, community_id="insurance")
-         cached_accessor = CachedGraphAccessor(base_accessor, cache_size=5000)
-
-         # Now PPR won't hammer the network
-         orchestrator = RetrievalOrchestrator(accessor=cached_accessor, ...)
-     """
-
-     def __init__(self, base: GraphAccessor, cache_size: int = 5000):
-         """
-         Args:
-             base: The underlying GraphAccessor to wrap
-             cache_size: Maximum number of nodes to cache (default: 5000)
-         """
-         self.base = base
-         self.cache_size = cache_size
-
-         # LRU caches for outbound and inbound neighbors
-         self._out_cache: OrderedDict[NodeId, List[Tuple[NodeId, RelId, float]]] = OrderedDict()
-         self._in_cache: OrderedDict[NodeId, List[Tuple[NodeId, RelId, float]]] = OrderedDict()
-
-         # Stats for monitoring
-         self._hits = 0
-         self._misses = 0
-
-     def iter_out(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
-         """Get outbound neighbors with caching."""
-         if node in self._out_cache:
-             # Cache hit - move to end (LRU)
-             self._out_cache.move_to_end(node)
-             self._hits += 1
-             return iter(self._out_cache[node])
-
-         # Cache miss - fetch from base accessor
-         self._misses += 1
-         neighbors = list(self.base.iter_out(node))
-
-         # Store in cache with LRU eviction
-         if len(self._out_cache) >= self.cache_size:
-             self._out_cache.popitem(last=False)  # Remove oldest
-         self._out_cache[node] = neighbors
-
-         return iter(neighbors)
-
-     def iter_in(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
-         """Get inbound neighbors with caching."""
-         if node in self._in_cache:
-             # Cache hit - move to end (LRU)
-             self._in_cache.move_to_end(node)
-             self._hits += 1
-             return iter(self._in_cache[node])
-
-         # Cache miss - fetch from base accessor
-         self._misses += 1
-         neighbors = list(self.base.iter_in(node))
-
-         # Store in cache with LRU eviction
-         if len(self._in_cache) >= self.cache_size:
-             self._in_cache.popitem(last=False)  # Remove oldest
-         self._in_cache[node] = neighbors
-
-         return iter(neighbors)
-
-     def nodes(self, community_id: Optional[str] = None) -> Iterable[NodeId]:
-         """Pass through to base accessor (no caching)."""
-         return self.base.nodes(community_id)
-
-     def get_node(self, node_id: NodeId, fields: Optional[List[str]] = None) -> dict:
-         """Pass through to base accessor (node lookups are typically one-off)."""
-         return self.base.get_node(node_id, fields)
-
-     def degree(self, node: NodeId) -> int:
-         """Pass through to base accessor."""
-         return self.base.degree(node)
-
-     def community_seed_norm(self, community_id: str, seeds: List[str]) -> List[str]:
-         """Pass through to base accessor."""
-         return self.base.community_seed_norm(community_id, seeds)
-
-     def clear_cache(self):
-         """Clear all caches. Useful for memory management or testing."""
-         self._out_cache.clear()
-         self._in_cache.clear()
-         self._hits = 0
-         self._misses = 0
-
-     def cache_stats(self) -> dict:
-         """
-         Return cache statistics for monitoring.
-
-         Returns:
-             Dict with hit rate, sizes, and utilization metrics
-         """
-         total = self._hits + self._misses
-         hit_rate = self._hits / total if total > 0 else 0
-
-         return {
-             "hits": self._hits,
-             "misses": self._misses,
-             "total_requests": total,
-             "hit_rate": hit_rate,
-             "out_cache_size": len(self._out_cache),
-             "in_cache_size": len(self._in_cache),
-             "max_cache_size": self.cache_size,
-             "out_cache_utilization": len(self._out_cache) / self.cache_size,
-             "in_cache_utilization": len(self._in_cache) / self.cache_size,
-         }
-
-     def warm_cache(self, nodes: List[NodeId], direction: str = "out"):
-         """
-         Pre-populate cache for a list of nodes.
-         Useful for batch operations where you know which nodes will be accessed.
-
-         Args:
-             nodes: List of node IDs to pre-fetch
-             direction: "out" or "in" for outbound/inbound neighbors
-         """
-         if direction == "out":
-             for node in nodes:
-                 if node not in self._out_cache:
-                     neighbors = list(self.base.iter_out(node))
-                     if len(self._out_cache) >= self.cache_size:
-                         self._out_cache.popitem(last=False)
-                     self._out_cache[node] = neighbors
-         elif direction == "in":
-             for node in nodes:
-                 if node not in self._in_cache:
-                     neighbors = list(self.base.iter_in(node))
-                     if len(self._in_cache) >= self.cache_size:
-                         self._in_cache.popitem(last=False)
-                     self._in_cache[node] = neighbors
-
-     # Delegate all other methods to base accessor
-     def __getattr__(self, name):
-         """Delegate unknown methods to the base accessor."""
-         return getattr(self.base, name)
+ """
+ Caching layer for GraphAccessor to prevent network hammering during PPR.
+
+ PRODUCTION FIX: PPR's push algorithm repeatedly calls accessor.iter_out(u) for
+ the same nodes, causing excessive network traffic. CachedGraphAccessor wraps any
+ accessor and caches neighbor lookups.
+ """
+
+ from __future__ import annotations
+ from typing import Iterable, Optional, List, Tuple
+ from collections import OrderedDict
+
+ from .adapters import GraphAccessor, NodeId, RelId
+
+
+ class CachedGraphAccessor:
+     """
+     Wraps a GraphAccessor with LRU caching for neighbor queries.
+
+     Critical for production: Prevents "network hammer" issue where PPR
+     makes repeated calls to iter_out() for the same nodes, each hitting
+     the database/network.
+
+     Usage:
+         base_accessor = ArangoCommunityAccessor(db, community_id="insurance")
+         cached_accessor = CachedGraphAccessor(base_accessor, cache_size=5000)
+
+         # Now PPR won't hammer the network
+         orchestrator = RetrievalOrchestrator(accessor=cached_accessor, ...)
+     """
+
+     def __init__(self, base: GraphAccessor, cache_size: int = 5000):
+         """
+         Args:
+             base: The underlying GraphAccessor to wrap
+             cache_size: Maximum number of nodes to cache (default: 5000)
+         """
+         self.base = base
+         self.cache_size = cache_size
+
+         # LRU caches for outbound and inbound neighbors
+         self._out_cache: OrderedDict[NodeId, List[Tuple[NodeId, RelId, float]]] = OrderedDict()
+         self._in_cache: OrderedDict[NodeId, List[Tuple[NodeId, RelId, float]]] = OrderedDict()
+
+         # Stats for monitoring
+         self._hits = 0
+         self._misses = 0
+
+     def iter_out(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
+         """Get outbound neighbors with caching."""
+         if node in self._out_cache:
+             # Cache hit - move to end (LRU)
+             self._out_cache.move_to_end(node)
+             self._hits += 1
+             return iter(self._out_cache[node])
+
+         # Cache miss - fetch from base accessor
+         self._misses += 1
+         neighbors = list(self.base.iter_out(node))
+
+         # Store in cache with LRU eviction
+         if len(self._out_cache) >= self.cache_size:
+             self._out_cache.popitem(last=False)  # Remove oldest
+         self._out_cache[node] = neighbors
+
+         return iter(neighbors)
+
+     def iter_in(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
+         """Get inbound neighbors with caching."""
+         if node in self._in_cache:
+             # Cache hit - move to end (LRU)
+             self._in_cache.move_to_end(node)
+             self._hits += 1
+             return iter(self._in_cache[node])
+
+         # Cache miss - fetch from base accessor
+         self._misses += 1
+         neighbors = list(self.base.iter_in(node))
+
+         # Store in cache with LRU eviction
+         if len(self._in_cache) >= self.cache_size:
+             self._in_cache.popitem(last=False)  # Remove oldest
+         self._in_cache[node] = neighbors
+
+         return iter(neighbors)
+
+     def nodes(self, community_id: Optional[str] = None) -> Iterable[NodeId]:
+         """Pass through to base accessor (no caching)."""
+         return self.base.nodes(community_id)
+
+     def get_node(self, node_id: NodeId, fields: Optional[List[str]] = None) -> dict:
+         """Pass through to base accessor (node lookups are typically one-off)."""
+         return self.base.get_node(node_id, fields)
+
+     def degree(self, node: NodeId) -> int:
+         """Pass through to base accessor."""
+         return self.base.degree(node)
+
+     def community_seed_norm(self, community_id: str, seeds: List[str]) -> List[str]:
+         """Pass through to base accessor."""
+         return self.base.community_seed_norm(community_id, seeds)
+
+     def clear_cache(self):
+         """Clear all caches. Useful for memory management or testing."""
+         self._out_cache.clear()
+         self._in_cache.clear()
+         self._hits = 0
+         self._misses = 0
+
+     def cache_stats(self) -> dict:
+         """
+         Return cache statistics for monitoring.
+
+         Returns:
+             Dict with hit rate, sizes, and utilization metrics
+         """
+         total = self._hits + self._misses
+         hit_rate = self._hits / total if total > 0 else 0
+
+         return {
+             "hits": self._hits,
+             "misses": self._misses,
+             "total_requests": total,
+             "hit_rate": hit_rate,
+             "out_cache_size": len(self._out_cache),
+             "in_cache_size": len(self._in_cache),
+             "max_cache_size": self.cache_size,
+             "out_cache_utilization": len(self._out_cache) / self.cache_size,
+             "in_cache_utilization": len(self._in_cache) / self.cache_size,
+         }
+
+     def warm_cache(self, nodes: List[NodeId], direction: str = "out"):
+         """
+         Pre-populate cache for a list of nodes.
+         Useful for batch operations where you know which nodes will be accessed.
+
+         Args:
+             nodes: List of node IDs to pre-fetch
+             direction: "out" or "in" for outbound/inbound neighbors
+         """
+         if direction == "out":
+             for node in nodes:
+                 if node not in self._out_cache:
+                     neighbors = list(self.base.iter_out(node))
+                     if len(self._out_cache) >= self.cache_size:
+                         self._out_cache.popitem(last=False)
+                     self._out_cache[node] = neighbors
+         elif direction == "in":
+             for node in nodes:
+                 if node not in self._in_cache:
+                     neighbors = list(self.base.iter_in(node))
+                     if len(self._in_cache) >= self.cache_size:
+                         self._in_cache.popitem(last=False)
+                     self._in_cache[node] = neighbors
+
+     # Delegate all other methods to base accessor
+     def __getattr__(self, name):
+         """Delegate unknown methods to the base accessor."""
+         return getattr(self.base, name)
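
A minimal usage sketch of the CachedGraphAccessor shown above, assuming a toy in-memory accessor; ToyAccessor and its edge data are invented for illustration and stand in for a real GraphAccessor such as the Arango-backed adapters.

# Sketch only: ToyAccessor is a hypothetical stand-in, not part of the package.
from retrieval.cache import CachedGraphAccessor

class ToyAccessor:
    def __init__(self, edges):
        self.edges = edges      # {node: [(neighbor, rel, weight), ...]}
        self.calls = 0          # backend hits, to show the cache working

    def iter_out(self, node):
        self.calls += 1
        return iter(self.edges.get(node, []))

    def iter_in(self, node):
        return iter(())

toy = ToyAccessor({"a": [("b", "rel_x", 1.0), ("c", "rel_y", 0.5)]})
cached = CachedGraphAccessor(toy, cache_size=100)

for _ in range(10):             # repeated PPR-style lookups of the same node
    list(cached.iter_out("a"))

print(toy.calls)                           # 1 backend call; the rest were cache hits
print(cached.cache_stats()["hit_rate"])    # 0.9 for this access pattern
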
retrieval/confidence.py CHANGED
@@ -1,88 +1,88 @@
- from __future__ import annotations
- from typing import Protocol, Tuple, Optional, Callable, List
- from collections import OrderedDict
- from math import isfinite
- import torch
-
- NodeId = str
- RelId = str
-
-
- class EdgeConfidenceProvider(Protocol):
-     def confidence(self, u: NodeId, rel: RelId, v: NodeId) -> float: ...
-     def confidence_batch(self, edges: List[Tuple[NodeId, RelId, NodeId]]) -> List[float]: ...
-
-
- class ConstantConfidence:
-     def __init__(self, value: float = 0.8):
-         self.value = max(1e-6, min(1.0, value))
-
-     def confidence(self, u, rel, v) -> float:
-         return self.value
-
-     def confidence_batch(self, edges: List[Tuple[NodeId, RelId, NodeId]]) -> List[float]:
-         return [self.value] * len(edges)
-
-
- class NPLLConfidence:
-     """
-     Wraps an NPLL model for retrieval-time scoring with LRU caching.
-     Uses the model's scoring module for batched probabilities when available.
-
-     PRODUCTION FIX: Uses bounded LRU cache to prevent memory leaks in long-running processes.
-     """
-
-     def __init__(self, npll_model, cache_size: int = 10000):
-         """
-         Args:
-             npll_model: Trained NPLL model for scoring triples
-             cache_size: Maximum number of cached confidence scores (default: 10K)
-         """
-         self.model = npll_model
-         self.cache_size = cache_size
-         self._cache: OrderedDict[Tuple[str, str, str], float] = OrderedDict()
-
-     def confidence(self, u: NodeId, rel: RelId, v: NodeId) -> float:
-         return self.confidence_batch([(u, rel, v)])[0]
-
-     def confidence_batch(self, edges: List[Tuple[NodeId, RelId, NodeId]]) -> List[float]:
-         todo = [(u, r, v) for (u, r, v) in edges if (u, r, v) not in self._cache]
-         if todo:
-             heads, rels, tails = zip(*todo)
-             self.model.eval()
-             with torch.no_grad():
-                 scores = self.model.scoring_module.forward_with_names(list(heads), list(rels), list(tails))
-                 # Don't apply per-group temperature scaling (requires group_ids we don't have)
-                 probs = self.model.probability_transform(scores, apply_temperature=False)
-             for (u, r, v), p in zip(todo, probs.tolist()):
-                 confidence = max(1e-6, float(p)) if isfinite(p) else 1e-6
-
-                 # LRU eviction: remove oldest if at capacity
-                 if len(self._cache) >= self.cache_size:
-                     self._cache.popitem(last=False)  # Remove oldest (FIFO)
-
-                 self._cache[(u, r, v)] = confidence
-
-         # Move accessed items to end (LRU behavior)
-         result = []
-         for edge in edges:
-             conf = self._cache[edge]
-             # Move to end to mark as recently used
-             self._cache.move_to_end(edge)
-             result.append(conf)
-
-         return result
-
-     def clear_cache(self):
-         """Clear the confidence cache. Useful for testing or memory management."""
-         self._cache.clear()
-
-     def cache_stats(self) -> dict:
-         """Return cache statistics for monitoring."""
-         return {
-             "size": len(self._cache),
-             "max_size": self.cache_size,
-             "utilization": len(self._cache) / self.cache_size if self.cache_size > 0 else 0,
-         }
-
-
+ from __future__ import annotations
+ from typing import Protocol, Tuple, Optional, Callable, List
+ from collections import OrderedDict
+ from math import isfinite
+ import torch
+
+ NodeId = str
+ RelId = str
+
+
+ class EdgeConfidenceProvider(Protocol):
+     def confidence(self, u: NodeId, rel: RelId, v: NodeId) -> float: ...
+     def confidence_batch(self, edges: List[Tuple[NodeId, RelId, NodeId]]) -> List[float]: ...
+
+
+ class ConstantConfidence:
+     def __init__(self, value: float = 0.8):
+         self.value = max(1e-6, min(1.0, value))
+
+     def confidence(self, u, rel, v) -> float:
+         return self.value
+
+     def confidence_batch(self, edges: List[Tuple[NodeId, RelId, NodeId]]) -> List[float]:
+         return [self.value] * len(edges)
+
+
+ class NPLLConfidence:
+     """
+     Wraps an NPLL model for retrieval-time scoring with LRU caching.
+     Uses the model's scoring module for batched probabilities when available.
+
+     PRODUCTION FIX: Uses bounded LRU cache to prevent memory leaks in long-running processes.
+     """
+
+     def __init__(self, npll_model, cache_size: int = 10000):
+         """
+         Args:
+             npll_model: Trained NPLL model for scoring triples
+             cache_size: Maximum number of cached confidence scores (default: 10K)
+         """
+         self.model = npll_model
+         self.cache_size = cache_size
+         self._cache: OrderedDict[Tuple[str, str, str], float] = OrderedDict()
+
+     def confidence(self, u: NodeId, rel: RelId, v: NodeId) -> float:
+         return self.confidence_batch([(u, rel, v)])[0]
+
+     def confidence_batch(self, edges: List[Tuple[NodeId, RelId, NodeId]]) -> List[float]:
+         todo = [(u, r, v) for (u, r, v) in edges if (u, r, v) not in self._cache]
+         if todo:
+             heads, rels, tails = zip(*todo)
+             self.model.eval()
+             with torch.no_grad():
+                 scores = self.model.scoring_module.forward_with_names(list(heads), list(rels), list(tails))
+                 # Don't apply per-group temperature scaling (requires group_ids we don't have)
+                 probs = self.model.probability_transform(scores, apply_temperature=False)
+             for (u, r, v), p in zip(todo, probs.tolist()):
+                 confidence = max(1e-6, float(p)) if isfinite(p) else 1e-6
+
+                 # LRU eviction: remove oldest if at capacity
+                 if len(self._cache) >= self.cache_size:
+                     self._cache.popitem(last=False)  # Remove oldest (FIFO)
+
+                 self._cache[(u, r, v)] = confidence
+
+         # Move accessed items to end (LRU behavior)
+         result = []
+         for edge in edges:
+             conf = self._cache[edge]
+             # Move to end to mark as recently used
+             self._cache.move_to_end(edge)
+             result.append(conf)
+
+         return result
+
+     def clear_cache(self):
+         """Clear the confidence cache. Useful for testing or memory management."""
+         self._cache.clear()
+
+     def cache_stats(self) -> dict:
+         """Return cache statistics for monitoring."""
+         return {
+             "size": len(self._cache),
+             "max_size": self.cache_size,
+             "utilization": len(self._cache) / self.cache_size if self.cache_size > 0 else 0,
+         }
+
+
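
A minimal sketch of the EdgeConfidenceProvider protocol above using ConstantConfidence; the edge triples are invented for illustration, and a trained model would swap in NPLLConfidence for calibrated scores.

# Sketch only: edge IDs below are hypothetical.
from retrieval.confidence import ConstantConfidence

provider = ConstantConfidence(value=0.8)
edges = [("acct_1", "owns", "policy_9"), ("policy_9", "covers", "vehicle_3")]

print(provider.confidence(*edges[0]))      # 0.8
print(provider.confidence_batch(edges))    # [0.8, 0.8]

# With a trained NPLL model, one would instead construct
# NPLLConfidence(npll_model, cache_size=10000) and call the same
# confidence()/confidence_batch() methods; results are served through
# its bounded LRU cache.
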
retrieval/eval.py CHANGED
@@ -1,49 +1,49 @@
- from __future__ import annotations
- from typing import List, Tuple, Set
- from sklearn.isotonic import IsotonicRegression
- import numpy as np
- import numpy as np
-
-
- def recall_at_k(predicted: List[Tuple[str, float]], relevant: Set[str], k: int = 10) -> float:
-     if not predicted or not relevant:
-         return 0.0
-     top = [n for n, _ in predicted[:k]]
-     hits = sum(1 for n in top if n in relevant)
-     denom = min(k, len(relevant)) or 1
-     return hits / denom
-
-
- def expected_calibration_error(probs: np.ndarray, labels: np.ndarray, n_bins: int = 10) -> float:
-     bins = np.linspace(0.0, 1.0, n_bins + 1)
-     ece = 0.0
-     N = len(probs)
-     for i in range(n_bins):
-         lo, hi = bins[i], bins[i + 1]
-         mask = (probs > lo) & (probs <= hi)
-         if not np.any(mask):
-             continue
-         acc = labels[mask].mean()
-         conf = probs[mask].mean()
-         ece += np.abs(conf - acc) * (mask.sum() / max(N, 1))
-     return float(ece)
-
-
- class SimpleLLMCalibrator:
-     """
-     Wraps isotonic regression to calibrate LLM self-reported confidences.
-     """
-     def __init__(self):
-         self.iso = IsotonicRegression(out_of_bounds='clip')
-         self.is_fitted = False
-
-     def fit(self, raw_conf: np.ndarray, labels: np.ndarray):
-         self.iso.fit(raw_conf, labels)
-         self.is_fitted = True
-
-     def transform(self, raw_conf: np.ndarray) -> np.ndarray:
-         if not self.is_fitted:
-             return raw_conf
-         return self.iso.transform(raw_conf)
-
-
+ from __future__ import annotations
+ from typing import List, Tuple, Set
+ from sklearn.isotonic import IsotonicRegression
+ import numpy as np
+ import numpy as np
+
+
+ def recall_at_k(predicted: List[Tuple[str, float]], relevant: Set[str], k: int = 10) -> float:
+     if not predicted or not relevant:
+         return 0.0
+     top = [n for n, _ in predicted[:k]]
+     hits = sum(1 for n in top if n in relevant)
+     denom = min(k, len(relevant)) or 1
+     return hits / denom
+
+
+ def expected_calibration_error(probs: np.ndarray, labels: np.ndarray, n_bins: int = 10) -> float:
+     bins = np.linspace(0.0, 1.0, n_bins + 1)
+     ece = 0.0
+     N = len(probs)
+     for i in range(n_bins):
+         lo, hi = bins[i], bins[i + 1]
+         mask = (probs > lo) & (probs <= hi)
+         if not np.any(mask):
+             continue
+         acc = labels[mask].mean()
+         conf = probs[mask].mean()
+         ece += np.abs(conf - acc) * (mask.sum() / max(N, 1))
+     return float(ece)
+
+
+ class SimpleLLMCalibrator:
+     """
+     Wraps isotonic regression to calibrate LLM self-reported confidences.
+     """
+     def __init__(self):
+         self.iso = IsotonicRegression(out_of_bounds='clip')
+         self.is_fitted = False
+
+     def fit(self, raw_conf: np.ndarray, labels: np.ndarray):
+         self.iso.fit(raw_conf, labels)
+         self.is_fitted = True
+
+     def transform(self, raw_conf: np.ndarray) -> np.ndarray:
+         if not self.is_fitted:
+             return raw_conf
+         return self.iso.transform(raw_conf)
+
+
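
A minimal sketch of the eval helpers above on synthetic arrays; all numbers and document IDs are invented for illustration.

# Sketch only: synthetic confidences, labels, and rankings.
import numpy as np
from retrieval.eval import expected_calibration_error, SimpleLLMCalibrator, recall_at_k

probs = np.array([0.9, 0.8, 0.7, 0.4, 0.2, 0.1])
labels = np.array([1, 1, 0, 0, 0, 1])

# Binned gap between confidence and observed accuracy.
print(expected_calibration_error(probs, labels, n_bins=5))

# Fit isotonic calibration on raw LLM confidences vs. correctness,
# then reuse the fitted mapping on new scores.
cal = SimpleLLMCalibrator()
cal.fit(probs, labels)
print(cal.transform(np.array([0.85, 0.15])))

# Recall@k over a ranked node list against a relevant set.
ranked = [("doc_a", 0.9), ("doc_b", 0.7), ("doc_c", 0.4)]
print(recall_at_k(ranked, {"doc_a", "doc_c"}, k=2))
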