odin-engine 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. benchmarks/__init__.py +17 -17
  2. benchmarks/datasets.py +284 -284
  3. benchmarks/metrics.py +275 -275
  4. benchmarks/run_ablation.py +279 -279
  5. benchmarks/run_npll_benchmark.py +270 -270
  6. npll/__init__.py +10 -10
  7. npll/bootstrap.py +474 -474
  8. npll/core/__init__.py +33 -33
  9. npll/core/knowledge_graph.py +308 -308
  10. npll/core/logical_rules.py +496 -496
  11. npll/core/mln.py +474 -474
  12. npll/inference/__init__.py +40 -40
  13. npll/inference/e_step.py +419 -419
  14. npll/inference/elbo.py +434 -434
  15. npll/inference/m_step.py +576 -576
  16. npll/npll_model.py +631 -631
  17. npll/scoring/__init__.py +42 -42
  18. npll/scoring/embeddings.py +441 -441
  19. npll/scoring/probability.py +402 -402
  20. npll/scoring/scoring_module.py +369 -369
  21. npll/training/__init__.py +24 -24
  22. npll/training/evaluation.py +496 -496
  23. npll/training/npll_trainer.py +520 -520
  24. npll/utils/__init__.py +47 -47
  25. npll/utils/batch_utils.py +492 -492
  26. npll/utils/config.py +144 -144
  27. npll/utils/math_utils.py +338 -338
  28. odin/__init__.py +21 -20
  29. odin/engine.py +264 -264
  30. odin/schema.py +210 -0
  31. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/METADATA +503 -456
  32. odin_engine-0.2.0.dist-info/RECORD +63 -0
  33. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/licenses/LICENSE +21 -21
  34. retrieval/__init__.py +50 -50
  35. retrieval/adapters.py +140 -140
  36. retrieval/adapters_arango.py +1418 -1418
  37. retrieval/aggregators.py +707 -707
  38. retrieval/beam.py +127 -127
  39. retrieval/budget.py +60 -60
  40. retrieval/cache.py +159 -159
  41. retrieval/confidence.py +88 -88
  42. retrieval/eval.py +49 -49
  43. retrieval/linker.py +87 -87
  44. retrieval/metrics.py +105 -105
  45. retrieval/metrics_motifs.py +36 -36
  46. retrieval/orchestrator.py +571 -571
  47. retrieval/ppr/__init__.py +12 -12
  48. retrieval/ppr/anchors.py +41 -41
  49. retrieval/ppr/bippr.py +61 -61
  50. retrieval/ppr/engines.py +257 -257
  51. retrieval/ppr/global_pr.py +76 -76
  52. retrieval/ppr/indexes.py +78 -78
  53. retrieval/ppr.py +156 -156
  54. retrieval/ppr_cache.py +25 -25
  55. retrieval/scoring.py +294 -294
  56. retrieval/utils/pii_redaction.py +36 -36
  57. retrieval/writers/__init__.py +9 -9
  58. retrieval/writers/arango_writer.py +28 -28
  59. retrieval/writers/base.py +21 -21
  60. retrieval/writers/janus_writer.py +36 -36
  61. odin_engine-0.1.0.dist-info/RECORD +0 -62
  62. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/WHEEL +0 -0
  63. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/top_level.txt +0 -0
retrieval/linker.py CHANGED
@@ -1,87 +1,87 @@
1
- from __future__ import annotations
2
- from dataclasses import dataclass
3
- from typing import List, Dict, Optional, Tuple
4
-
5
- from .adapters import NodeId
6
-
7
-
8
- @dataclass
9
- class LinkerConfig:
10
- candidates_per_mention: int = 10
11
- coherence_iterations: int = 1
12
- persist_threshold: float = 0.8
13
- w_candidate: float = 0.6
14
- w_prior: float = 0.3
15
- w_coherence: float = 0.1
16
-
17
-
18
- @dataclass
19
- class Mention:
20
- mention_id: str
21
- surface: str
22
- normalized: Optional[str]
23
- span: Tuple[int, int]
24
- context: Optional[str]
25
- llm_confidence: float
26
- candidates: List[Tuple[NodeId, float]] # (entity_id, candidate_score)
27
-
28
-
29
- class CoherenceLinker:
30
- """
31
- Skeleton linker that accepts LLM mentions with candidates and returns linked entities.
32
- Coherence/ranking by graph priors (to be plugged-in): use PPR/anchors in orchestrator.
33
- """
34
-
35
- def __init__(self, cfg: LinkerConfig):
36
- self.cfg = cfg
37
-
38
- def link(
39
- self,
40
- mentions: List[Mention],
41
- entity_prior: Optional[Dict[NodeId, float]] = None,
42
- coherence_fn: Optional[callable] = None,
43
- ) -> Dict[str, Dict[str, object]]:
44
- pri = entity_prior or {}
45
- # Initialize by local best per mention
46
- assignment: Dict[str, Tuple[NodeId, float]] = {}
47
- for m in mentions:
48
- cs = sorted(m.candidates, key=lambda x: x[1], reverse=True)[: self.cfg.candidates_per_mention]
49
- if not cs:
50
- continue
51
- ent, score = cs[0]
52
- assignment[m.mention_id] = (ent, float(score))
53
-
54
- # Iterative coherence re-weighting (greedy)
55
- for _ in range(max(1, self.cfg.coherence_iterations)):
56
- linked_entities = [e for (_, (e, _)) in assignment.items()]
57
- for m in mentions:
58
- cs = sorted(m.candidates, key=lambda x: x[1], reverse=True)[: self.cfg.candidates_per_mention]
59
- best_ent, best_val = None, -1e9
60
- for ent, cand_score in cs:
61
- prior = pri.get(ent, 0.0)
62
- coh = 0.0
63
- if coherence_fn and linked_entities:
64
- coh = sum(coherence_fn(ent, le) for le in linked_entities) / max(1, len(linked_entities))
65
- val = (
66
- self.cfg.w_candidate * cand_score
67
- + self.cfg.w_prior * prior
68
- + self.cfg.w_coherence * coh
69
- )
70
- if val > best_val:
71
- best_val = val
72
- best_ent = ent
73
- if best_ent is not None:
74
- assignment[m.mention_id] = (best_ent, float(best_val))
75
-
76
- # Produce results with normalized confidence in [0,1]
77
- # Here we map the composite score through min-max over chosen candidates for a rough normalization
78
- vals = [v for (_, v) in assignment.values()]
79
- vmin, vmax = (min(vals), max(vals)) if vals else (0.0, 1.0)
80
- rng = max(vmax - vmin, 1e-9)
81
- results: Dict[str, Dict[str, object]] = {}
82
- for mid, (ent, val) in assignment.items():
83
- norm = (val - vmin) / rng
84
- results[mid] = {"entity_id": ent, "link_confidence": float(norm)}
85
- return results
86
-
87
-
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import List, Dict, Optional, Tuple
4
+
5
+ from .adapters import NodeId
6
+
7
+
8
@dataclass
class LinkerConfig:
    """Tunable limits and score weights for CoherenceLinker."""

    # Max candidate entities considered per mention (top-scored first).
    candidates_per_mention: int = 10
    # Number of greedy coherence re-weighting passes over all mentions.
    coherence_iterations: int = 1
    # Confidence cutoff for persisting a link — NOTE(review): not read by
    # CoherenceLinker itself; presumably consumed by a caller. Confirm.
    persist_threshold: float = 0.8
    # Composite-score weights: local candidate score, graph prior,
    # and pairwise coherence with other linked entities.
    w_candidate: float = 0.6
    w_prior: float = 0.3
    w_coherence: float = 0.1
16
+
17
+
18
@dataclass
class Mention:
    """A single entity mention proposed by an LLM, with linking candidates."""

    mention_id: str  # unique identifier for this mention
    surface: str  # raw surface form as it appeared in text
    normalized: Optional[str]  # normalized form, if one was produced
    span: Tuple[int, int]  # character offsets of the mention — assumed (start, end); TODO confirm
    context: Optional[str]  # surrounding text snippet, if captured
    llm_confidence: float  # extractor's own confidence in this mention
    candidates: List[Tuple[NodeId, float]]  # (entity_id, candidate_score)
27
+
28
+
29
class CoherenceLinker:
    """
    Skeleton linker: takes LLM mentions with candidate entities and produces
    one linked entity per mention. Ranking mixes the candidate's own score,
    an optional entity prior, and an optional pairwise coherence function
    (graph priors / PPR are expected to be plugged in by the orchestrator).
    """

    def __init__(self, cfg: LinkerConfig):
        self.cfg = cfg

    def _ranked_candidates(self, mention: Mention) -> List[Tuple[NodeId, float]]:
        # Best-scored candidates first, truncated to the configured budget.
        ordered = sorted(mention.candidates, key=lambda pair: pair[1], reverse=True)
        return ordered[: self.cfg.candidates_per_mention]

    def link(
        self,
        mentions: List[Mention],
        entity_prior: Optional[Dict[NodeId, float]] = None,
        coherence_fn: Optional[callable] = None,
    ) -> Dict[str, Dict[str, object]]:
        priors = entity_prior if entity_prior is not None else {}

        # Seed: each mention starts at its locally best candidate.
        assignment: Dict[str, Tuple[NodeId, float]] = {}
        for mention in mentions:
            ranked = self._ranked_candidates(mention)
            if ranked:
                top_entity, top_score = ranked[0]
                assignment[mention.mention_id] = (top_entity, float(top_score))

        # Greedy re-weighting: re-pick each mention's entity against the
        # current assignment, blending score, prior and average coherence.
        for _ in range(max(1, self.cfg.coherence_iterations)):
            current_entities = [entity for entity, _ in assignment.values()]
            for mention in mentions:
                best: Optional[Tuple[NodeId, float]] = None
                for entity, cand_score in self._ranked_candidates(mention):
                    coh = 0.0
                    if coherence_fn and current_entities:
                        pairwise = sum(coherence_fn(entity, other) for other in current_entities)
                        coh = pairwise / max(1, len(current_entities))
                    composite = (
                        self.cfg.w_candidate * cand_score
                        + self.cfg.w_prior * priors.get(entity, 0.0)
                        + self.cfg.w_coherence * coh
                    )
                    if best is None or composite > best[1]:
                        best = (entity, composite)
                if best is not None:
                    assignment[mention.mention_id] = (best[0], float(best[1]))

        # Map composite scores to [0, 1] via min-max over the chosen
        # candidates — a rough, relative confidence normalization.
        scores = [score for _, score in assignment.values()]
        lo, hi = (min(scores), max(scores)) if scores else (0.0, 1.0)
        spread = max(hi - lo, 1e-9)
        results: Dict[str, Dict[str, object]] = {}
        for mention_id, (entity, score) in assignment.items():
            results[mention_id] = {
                "entity_id": entity,
                "link_confidence": float((score - lo) / spread),
            }
        return results
86
+
87
+
retrieval/metrics.py CHANGED
@@ -1,105 +1,105 @@
1
- from __future__ import annotations
2
- from dataclasses import dataclass, asdict
3
- from typing import Optional, Dict, Any
4
- import time, json, os, threading
5
- from .utils.pii_redaction import redact_dict
6
-
7
-
8
- class Timer:
9
- def __init__(self):
10
- self.t0 = time.perf_counter()
11
- self.marks: Dict[str, float] = {}
12
-
13
- def mark(self, name: str):
14
- self.marks[name] = (time.perf_counter() - self.t0) * 1000.0
15
-
16
- def elapsed_ms(self) -> int:
17
- return int((time.perf_counter() - self.t0) * 1000)
18
-
19
-
20
- class JSONLSink:
21
- def __init__(self, path: str):
22
- self.path = path
23
- os.makedirs(os.path.dirname(path), exist_ok=True)
24
- self._lock = threading.Lock()
25
-
26
- def write(self, event: Dict[str, Any]):
27
- line = json.dumps(event, ensure_ascii=False)
28
- with self._lock:
29
- with open(self.path, 'a', encoding='utf-8') as f:
30
- f.write(line + '\n')
31
-
32
-
33
- @dataclass
34
- class RetrievalMetrics:
35
- query_id: Optional[str]
36
- community_id: Optional[str]
37
- seeds_count: int
38
- ppr_mass: float
39
- topk: int
40
- used_budget: Dict[str, Any]
41
- latency_ms: int
42
- early_stop_reason: Optional[str]
43
- engine: str
44
- notes: Optional[Dict[str, Any]] = None
45
-
46
- def to_event(self) -> Dict[str, Any]:
47
- return asdict(self)
48
-
49
-
50
- class MetricsLogger:
51
- def __init__(self, sink: Optional[JSONLSink] = None, redact_pii: bool = True):
52
- self.sink = sink
53
- self.redact_pii = redact_pii
54
-
55
- def log(self, metrics: RetrievalMetrics):
56
- if self.sink:
57
- event = metrics.to_event()
58
- if self.redact_pii:
59
- event = redact_dict(event)
60
- self.sink.write(event)
61
-
62
-
63
- def aggregate_latency_and_budget(jsonl_path: str) -> Dict[str, Any]:
64
- import numpy as np
65
- latencies, budget_hits = [], 0
66
- total = 0
67
- with open(jsonl_path, 'r', encoding='utf-8') as f:
68
- for line in f:
69
- total += 1
70
- try:
71
- ev = json.loads(line)
72
- except Exception:
73
- continue
74
- if 'latency_ms' in ev:
75
- latencies.append(ev['latency_ms'])
76
- used = ev.get('used_budget', {})
77
- bud = used if isinstance(used, dict) else {}
78
- # Budget hit if any dimension equals its cap
79
- max_nodes = bud.get('max_nodes')
80
- max_edges = bud.get('max_edges')
81
- max_ms = bud.get('max_ms')
82
- max_paths = bud.get('max_paths')
83
- u_nodes = bud.get('nodes', -1)
84
- u_edges = bud.get('edges', -1)
85
- u_ms = bud.get('ms', -1)
86
- u_paths = bud.get('paths', -1)
87
- hit = (
88
- (max_nodes is not None and u_nodes >= max_nodes)
89
- or (max_edges is not None and u_edges >= max_edges)
90
- or (max_ms is not None and u_ms >= max_ms)
91
- or (max_paths is not None and u_paths >= max_paths)
92
- )
93
- if hit:
94
- budget_hits += 1
95
- if not latencies:
96
- return {"count": total, "p50_ms": None, "p95_ms": None, "budget_hit_rate": None}
97
- arr = np.array(latencies)
98
- return {
99
- "count": total,
100
- "p50_ms": float(np.percentile(arr, 50)),
101
- "p95_ms": float(np.percentile(arr, 95)),
102
- "budget_hit_rate": (budget_hits / max(total, 1)),
103
- }
104
-
105
-
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass, asdict
3
+ from typing import Optional, Dict, Any
4
+ import time, json, os, threading
5
+ from .utils.pii_redaction import redact_dict
6
+
7
+
8
class Timer:
    """Wall-clock timer recording named checkpoints in milliseconds."""

    def __init__(self):
        self.t0 = time.perf_counter()
        self.marks: Dict[str, float] = {}

    def mark(self, name: str):
        # Record milliseconds elapsed since construction under `name`.
        elapsed = time.perf_counter() - self.t0
        self.marks[name] = elapsed * 1000.0

    def elapsed_ms(self) -> int:
        # Truncated (not rounded) total milliseconds since construction.
        return int((time.perf_counter() - self.t0) * 1000)
18
+
19
+
20
class JSONLSink:
    """Thread-safe append-only JSON Lines event sink.

    Each event is serialized as one JSON object per line; writes are
    serialized with a lock so concurrent loggers don't interleave lines.
    """

    def __init__(self, path: str):
        self.path = path
        # Fix: os.makedirs('') raises FileNotFoundError, so only create
        # parent directories when the path actually has a directory part.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        self._lock = threading.Lock()

    def write(self, event: Dict[str, Any]):
        """Append `event` as a single JSON line (non-ASCII kept as-is)."""
        line = json.dumps(event, ensure_ascii=False)
        with self._lock:
            with open(self.path, 'a', encoding='utf-8') as f:
                f.write(line + '\n')
31
+
32
+
33
@dataclass
class RetrievalMetrics:
    """One retrieval-run measurement, serializable to a dict via to_event()."""

    query_id: Optional[str]  # caller-supplied query identifier, if any
    community_id: Optional[str]  # graph community targeted, if any
    seeds_count: int  # number of seed nodes used
    ppr_mass: float  # personalized-PageRank mass — semantics set by caller; TODO confirm
    topk: int  # requested result count
    used_budget: Dict[str, Any]  # budget caps and usage (consumed by aggregate_latency_and_budget)
    latency_ms: int  # end-to-end latency in milliseconds
    early_stop_reason: Optional[str]  # why retrieval stopped early, if it did
    engine: str  # retrieval engine identifier
    notes: Optional[Dict[str, Any]] = None  # free-form extra metadata

    def to_event(self) -> Dict[str, Any]:
        """Return a plain-dict snapshot of all fields for JSON logging."""
        return asdict(self)
48
+
49
+
50
class MetricsLogger:
    """Forwards RetrievalMetrics events to an optional sink, redacting PII by default."""

    def __init__(self, sink: Optional[JSONLSink] = None, redact_pii: bool = True):
        self.sink = sink
        self.redact_pii = redact_pii

    def log(self, metrics: RetrievalMetrics):
        # Without a configured sink, logging is a deliberate no-op.
        if not self.sink:
            return
        event = metrics.to_event()
        if self.redact_pii:
            event = redact_dict(event)
        self.sink.write(event)
61
+
62
+
63
def aggregate_latency_and_budget(jsonl_path: str) -> Dict[str, Any]:
    """Summarize a metrics JSONL file: latency percentiles and budget-hit rate.

    An event is a budget hit when any recorded usage dimension in its
    `used_budget` dict reached the matching cap. Unparseable lines still
    count toward `count` but contribute nothing else.
    """
    import numpy as np

    latencies = []
    budget_hits = 0
    total = 0
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for raw in f:
            total += 1
            try:
                event = json.loads(raw)
            except Exception:
                continue
            if 'latency_ms' in event:
                latencies.append(event['latency_ms'])
            budget = event.get('used_budget', {})
            if not isinstance(budget, dict):
                budget = {}
            # A dimension "hits" when usage reached its configured cap;
            # missing usage defaults to -1 so it can never trigger a hit.
            caps_and_usage = (
                (budget.get('max_nodes'), budget.get('nodes', -1)),
                (budget.get('max_edges'), budget.get('edges', -1)),
                (budget.get('max_ms'), budget.get('ms', -1)),
                (budget.get('max_paths'), budget.get('paths', -1)),
            )
            if any(cap is not None and used >= cap for cap, used in caps_and_usage):
                budget_hits += 1
    if not latencies:
        return {"count": total, "p50_ms": None, "p95_ms": None, "budget_hit_rate": None}
    arr = np.array(latencies)
    return {
        "count": total,
        "p50_ms": float(np.percentile(arr, 50)),
        "p95_ms": float(np.percentile(arr, 95)),
        "budget_hit_rate": (budget_hits / max(total, 1)),
    }
104
+
105
+
@@ -1,36 +1,36 @@
1
- from __future__ import annotations
2
- from typing import Iterable, Tuple, Dict, Set
3
-
4
- from .adapters import GraphAccessor, NodeId, RelId
5
-
6
-
7
- def wedge_and_triad_closures(
8
- accessor: GraphAccessor,
9
- community_id: str,
10
- nodes: Iterable[NodeId],
11
- relation_filter: Set[RelId] | None = None,
12
- hop_cap: int = 3,
13
- ) -> Dict[str, float]:
14
- """
15
- Estimate tiny-link yield via wedge/triad closures within a hop cap.
16
- Returns fraction of wedges that close (triangles) and count estimates.
17
- """
18
- nodes = list(nodes)
19
- if not nodes:
20
- return {"wedges": 0, "triads": 0, "closure_rate": 0.0}
21
- wedges = 0
22
- triads = 0
23
- for u in nodes:
24
- nbrs1 = [v for v, r, _ in accessor.iter_out(u) if (not relation_filter or r in relation_filter)]
25
- for v in nbrs1:
26
- nbrs2 = [w for w, r, _ in accessor.iter_out(v) if w != u and (not relation_filter or r in relation_filter)]
27
- for w in nbrs2:
28
- wedges += 1
29
- # Closure if an edge from u to w exists (any relation in filter)
30
- closed = any((x == w and (not relation_filter or r in relation_filter)) for x, r, _ in accessor.iter_out(u))
31
- if closed:
32
- triads += 1
33
- rate = (triads / wedges) if wedges else 0.0
34
- return {"wedges": wedges, "triads": triads, "closure_rate": rate}
35
-
36
-
1
+ from __future__ import annotations
2
+ from typing import Iterable, Tuple, Dict, Set
3
+
4
+ from .adapters import GraphAccessor, NodeId, RelId
5
+
6
+
7
+ def wedge_and_triad_closures(
8
+ accessor: GraphAccessor,
9
+ community_id: str,
10
+ nodes: Iterable[NodeId],
11
+ relation_filter: Set[RelId] | None = None,
12
+ hop_cap: int = 3,
13
+ ) -> Dict[str, float]:
14
+ """
15
+ Estimate tiny-link yield via wedge/triad closures within a hop cap.
16
+ Returns fraction of wedges that close (triangles) and count estimates.
17
+ """
18
+ nodes = list(nodes)
19
+ if not nodes:
20
+ return {"wedges": 0, "triads": 0, "closure_rate": 0.0}
21
+ wedges = 0
22
+ triads = 0
23
+ for u in nodes:
24
+ nbrs1 = [v for v, r, _ in accessor.iter_out(u) if (not relation_filter or r in relation_filter)]
25
+ for v in nbrs1:
26
+ nbrs2 = [w for w, r, _ in accessor.iter_out(v) if w != u and (not relation_filter or r in relation_filter)]
27
+ for w in nbrs2:
28
+ wedges += 1
29
+ # Closure if an edge from u to w exists (any relation in filter)
30
+ closed = any((x == w and (not relation_filter or r in relation_filter)) for x, r, _ in accessor.iter_out(u))
31
+ if closed:
32
+ triads += 1
33
+ rate = (triads / wedges) if wedges else 0.0
34
+ return {"wedges": wedges, "triads": triads, "closure_rate": rate}
35
+
36
+