odin-engine 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. benchmarks/__init__.py +17 -17
  2. benchmarks/datasets.py +284 -284
  3. benchmarks/metrics.py +275 -275
  4. benchmarks/run_ablation.py +279 -279
  5. benchmarks/run_npll_benchmark.py +270 -270
  6. npll/__init__.py +10 -10
  7. npll/bootstrap.py +474 -474
  8. npll/core/__init__.py +33 -33
  9. npll/core/knowledge_graph.py +308 -308
  10. npll/core/logical_rules.py +496 -496
  11. npll/core/mln.py +474 -474
  12. npll/inference/__init__.py +40 -40
  13. npll/inference/e_step.py +419 -419
  14. npll/inference/elbo.py +434 -434
  15. npll/inference/m_step.py +576 -576
  16. npll/npll_model.py +631 -631
  17. npll/scoring/__init__.py +42 -42
  18. npll/scoring/embeddings.py +441 -441
  19. npll/scoring/probability.py +402 -402
  20. npll/scoring/scoring_module.py +369 -369
  21. npll/training/__init__.py +24 -24
  22. npll/training/evaluation.py +496 -496
  23. npll/training/npll_trainer.py +520 -520
  24. npll/utils/__init__.py +47 -47
  25. npll/utils/batch_utils.py +492 -492
  26. npll/utils/config.py +144 -144
  27. npll/utils/math_utils.py +338 -338
  28. odin/__init__.py +21 -20
  29. odin/engine.py +264 -264
  30. odin/schema.py +210 -0
  31. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/METADATA +503 -456
  32. odin_engine-0.2.0.dist-info/RECORD +63 -0
  33. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/licenses/LICENSE +21 -21
  34. retrieval/__init__.py +50 -50
  35. retrieval/adapters.py +140 -140
  36. retrieval/adapters_arango.py +1418 -1418
  37. retrieval/aggregators.py +707 -707
  38. retrieval/beam.py +127 -127
  39. retrieval/budget.py +60 -60
  40. retrieval/cache.py +159 -159
  41. retrieval/confidence.py +88 -88
  42. retrieval/eval.py +49 -49
  43. retrieval/linker.py +87 -87
  44. retrieval/metrics.py +105 -105
  45. retrieval/metrics_motifs.py +36 -36
  46. retrieval/orchestrator.py +571 -571
  47. retrieval/ppr/__init__.py +12 -12
  48. retrieval/ppr/anchors.py +41 -41
  49. retrieval/ppr/bippr.py +61 -61
  50. retrieval/ppr/engines.py +257 -257
  51. retrieval/ppr/global_pr.py +76 -76
  52. retrieval/ppr/indexes.py +78 -78
  53. retrieval/ppr.py +156 -156
  54. retrieval/ppr_cache.py +25 -25
  55. retrieval/scoring.py +294 -294
  56. retrieval/utils/pii_redaction.py +36 -36
  57. retrieval/writers/__init__.py +9 -9
  58. retrieval/writers/arango_writer.py +28 -28
  59. retrieval/writers/base.py +21 -21
  60. retrieval/writers/janus_writer.py +36 -36
  61. odin_engine-0.1.0.dist-info/RECORD +0 -62
  62. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/WHEEL +0 -0
  63. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/top_level.txt +0 -0
retrieval/linker.py CHANGED
@@ -1,87 +1,87 @@
1
- from __future__ import annotations
2
- from dataclasses import dataclass
3
- from typing import List, Dict, Optional, Tuple
4
-
5
- from .adapters import NodeId
6
-
7
-
8
- @dataclass
9
- class LinkerConfig:
10
- candidates_per_mention: int = 10
11
- coherence_iterations: int = 1
12
- persist_threshold: float = 0.8
13
- w_candidate: float = 0.6
14
- w_prior: float = 0.3
15
- w_coherence: float = 0.1
16
-
17
-
18
- @dataclass
19
- class Mention:
20
- mention_id: str
21
- surface: str
22
- normalized: Optional[str]
23
- span: Tuple[int, int]
24
- context: Optional[str]
25
- llm_confidence: float
26
- candidates: List[Tuple[NodeId, float]] # (entity_id, candidate_score)
27
-
28
-
29
- class CoherenceLinker:
30
- """
31
- Skeleton linker that accepts LLM mentions with candidates and returns linked entities.
32
- Coherence/ranking by graph priors (to be plugged-in): use PPR/anchors in orchestrator.
33
- """
34
-
35
- def __init__(self, cfg: LinkerConfig):
36
- self.cfg = cfg
37
-
38
- def link(
39
- self,
40
- mentions: List[Mention],
41
- entity_prior: Optional[Dict[NodeId, float]] = None,
42
- coherence_fn: Optional[callable] = None,
43
- ) -> Dict[str, Dict[str, object]]:
44
- pri = entity_prior or {}
45
- # Initialize by local best per mention
46
- assignment: Dict[str, Tuple[NodeId, float]] = {}
47
- for m in mentions:
48
- cs = sorted(m.candidates, key=lambda x: x[1], reverse=True)[: self.cfg.candidates_per_mention]
49
- if not cs:
50
- continue
51
- ent, score = cs[0]
52
- assignment[m.mention_id] = (ent, float(score))
53
-
54
- # Iterative coherence re-weighting (greedy)
55
- for _ in range(max(1, self.cfg.coherence_iterations)):
56
- linked_entities = [e for (_, (e, _)) in assignment.items()]
57
- for m in mentions:
58
- cs = sorted(m.candidates, key=lambda x: x[1], reverse=True)[: self.cfg.candidates_per_mention]
59
- best_ent, best_val = None, -1e9
60
- for ent, cand_score in cs:
61
- prior = pri.get(ent, 0.0)
62
- coh = 0.0
63
- if coherence_fn and linked_entities:
64
- coh = sum(coherence_fn(ent, le) for le in linked_entities) / max(1, len(linked_entities))
65
- val = (
66
- self.cfg.w_candidate * cand_score
67
- + self.cfg.w_prior * prior
68
- + self.cfg.w_coherence * coh
69
- )
70
- if val > best_val:
71
- best_val = val
72
- best_ent = ent
73
- if best_ent is not None:
74
- assignment[m.mention_id] = (best_ent, float(best_val))
75
-
76
- # Produce results with normalized confidence in [0,1]
77
- # Here we map the composite score through min-max over chosen candidates for a rough normalization
78
- vals = [v for (_, v) in assignment.values()]
79
- vmin, vmax = (min(vals), max(vals)) if vals else (0.0, 1.0)
80
- rng = max(vmax - vmin, 1e-9)
81
- results: Dict[str, Dict[str, object]] = {}
82
- for mid, (ent, val) in assignment.items():
83
- norm = (val - vmin) / rng
84
- results[mid] = {"entity_id": ent, "link_confidence": float(norm)}
85
- return results
86
-
87
-
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import List, Dict, Optional, Tuple
4
+
5
+ from .adapters import NodeId
6
+
7
+
8
@dataclass
class LinkerConfig:
    """Tunable limits and score weights for CoherenceLinker."""

    # Max candidate entities considered per mention (top-scored first).
    candidates_per_mention: int = 10
    # Number of greedy coherence re-weighting passes over all mentions.
    coherence_iterations: int = 1
    # Confidence cutoff for persisting a link — NOTE(review): not read by
    # CoherenceLinker itself; presumably consumed by a caller. Confirm.
    persist_threshold: float = 0.8
    # Composite-score weights: local candidate score, graph prior,
    # and pairwise coherence with other linked entities.
    w_candidate: float = 0.6
    w_prior: float = 0.3
    w_coherence: float = 0.1
16
+
17
+
18
@dataclass
class Mention:
    """A single entity mention proposed by an LLM, with linking candidates."""

    mention_id: str  # unique identifier for this mention
    surface: str  # raw surface form as it appeared in text
    normalized: Optional[str]  # normalized form, if one was produced
    span: Tuple[int, int]  # character offsets of the mention — assumed (start, end); TODO confirm
    context: Optional[str]  # surrounding text snippet, if captured
    llm_confidence: float  # extractor's own confidence in this mention
    candidates: List[Tuple[NodeId, float]]  # (entity_id, candidate_score)
27
+
28
+
29
class CoherenceLinker:
    """
    Skeleton linker: takes LLM mentions with candidate entities and produces
    one linked entity per mention. Ranking mixes the candidate's own score,
    an optional entity prior, and an optional pairwise coherence function
    (graph priors / PPR are expected to be plugged in by the orchestrator).
    """

    def __init__(self, cfg: LinkerConfig):
        self.cfg = cfg

    def _ranked_candidates(self, mention: Mention) -> List[Tuple[NodeId, float]]:
        # Best-scored candidates first, truncated to the configured budget.
        ordered = sorted(mention.candidates, key=lambda pair: pair[1], reverse=True)
        return ordered[: self.cfg.candidates_per_mention]

    def link(
        self,
        mentions: List[Mention],
        entity_prior: Optional[Dict[NodeId, float]] = None,
        coherence_fn: Optional[callable] = None,
    ) -> Dict[str, Dict[str, object]]:
        priors = entity_prior if entity_prior is not None else {}

        # Seed: each mention starts at its locally best candidate.
        assignment: Dict[str, Tuple[NodeId, float]] = {}
        for mention in mentions:
            ranked = self._ranked_candidates(mention)
            if ranked:
                top_entity, top_score = ranked[0]
                assignment[mention.mention_id] = (top_entity, float(top_score))

        # Greedy re-weighting: re-pick each mention's entity against the
        # current assignment, blending score, prior and average coherence.
        for _ in range(max(1, self.cfg.coherence_iterations)):
            current_entities = [entity for entity, _ in assignment.values()]
            for mention in mentions:
                best: Optional[Tuple[NodeId, float]] = None
                for entity, cand_score in self._ranked_candidates(mention):
                    coh = 0.0
                    if coherence_fn and current_entities:
                        pairwise = sum(coherence_fn(entity, other) for other in current_entities)
                        coh = pairwise / max(1, len(current_entities))
                    composite = (
                        self.cfg.w_candidate * cand_score
                        + self.cfg.w_prior * priors.get(entity, 0.0)
                        + self.cfg.w_coherence * coh
                    )
                    if best is None or composite > best[1]:
                        best = (entity, composite)
                if best is not None:
                    assignment[mention.mention_id] = (best[0], float(best[1]))

        # Map composite scores to [0, 1] via min-max over the chosen
        # candidates — a rough, relative confidence normalization.
        scores = [score for _, score in assignment.values()]
        lo, hi = (min(scores), max(scores)) if scores else (0.0, 1.0)
        spread = max(hi - lo, 1e-9)
        results: Dict[str, Dict[str, object]] = {}
        for mention_id, (entity, score) in assignment.items():
            results[mention_id] = {
                "entity_id": entity,
                "link_confidence": float((score - lo) / spread),
            }
        return results
86
+
87
+
retrieval/metrics.py CHANGED
@@ -1,105 +1,105 @@
1
- from __future__ import annotations
2
- from dataclasses import dataclass, asdict
3
- from typing import Optional, Dict, Any
4
- import time, json, os, threading
5
- from .utils.pii_redaction import redact_dict
6
-
7
-
8
- class Timer:
9
- def __init__(self):
10
- self.t0 = time.perf_counter()
11
- self.marks: Dict[str, float] = {}
12
-
13
- def mark(self, name: str):
14
- self.marks[name] = (time.perf_counter() - self.t0) * 1000.0
15
-
16
- def elapsed_ms(self) -> int:
17
- return int((time.perf_counter() - self.t0) * 1000)
18
-
19
-
20
- class JSONLSink:
21
- def __init__(self, path: str):
22
- self.path = path
23
- os.makedirs(os.path.dirname(path), exist_ok=True)
24
- self._lock = threading.Lock()
25
-
26
- def write(self, event: Dict[str, Any]):
27
- line = json.dumps(event, ensure_ascii=False)
28
- with self._lock:
29
- with open(self.path, 'a', encoding='utf-8') as f:
30
- f.write(line + '\n')
31
-
32
-
33
- @dataclass
34
- class RetrievalMetrics:
35
- query_id: Optional[str]
36
- community_id: Optional[str]
37
- seeds_count: int
38
- ppr_mass: float
39
- topk: int
40
- used_budget: Dict[str, Any]
41
- latency_ms: int
42
- early_stop_reason: Optional[str]
43
- engine: str
44
- notes: Optional[Dict[str, Any]] = None
45
-
46
- def to_event(self) -> Dict[str, Any]:
47
- return asdict(self)
48
-
49
-
50
- class MetricsLogger:
51
- def __init__(self, sink: Optional[JSONLSink] = None, redact_pii: bool = True):
52
- self.sink = sink
53
- self.redact_pii = redact_pii
54
-
55
- def log(self, metrics: RetrievalMetrics):
56
- if self.sink:
57
- event = metrics.to_event()
58
- if self.redact_pii:
59
- event = redact_dict(event)
60
- self.sink.write(event)
61
-
62
-
63
- def aggregate_latency_and_budget(jsonl_path: str) -> Dict[str, Any]:
64
- import numpy as np
65
- latencies, budget_hits = [], 0
66
- total = 0
67
- with open(jsonl_path, 'r', encoding='utf-8') as f:
68
- for line in f:
69
- total += 1
70
- try:
71
- ev = json.loads(line)
72
- except Exception:
73
- continue
74
- if 'latency_ms' in ev:
75
- latencies.append(ev['latency_ms'])
76
- used = ev.get('used_budget', {})
77
- bud = used if isinstance(used, dict) else {}
78
- # Budget hit if any dimension equals its cap
79
- max_nodes = bud.get('max_nodes')
80
- max_edges = bud.get('max_edges')
81
- max_ms = bud.get('max_ms')
82
- max_paths = bud.get('max_paths')
83
- u_nodes = bud.get('nodes', -1)
84
- u_edges = bud.get('edges', -1)
85
- u_ms = bud.get('ms', -1)
86
- u_paths = bud.get('paths', -1)
87
- hit = (
88
- (max_nodes is not None and u_nodes >= max_nodes)
89
- or (max_edges is not None and u_edges >= max_edges)
90
- or (max_ms is not None and u_ms >= max_ms)
91
- or (max_paths is not None and u_paths >= max_paths)
92
- )
93
- if hit:
94
- budget_hits += 1
95
- if not latencies:
96
- return {"count": total, "p50_ms": None, "p95_ms": None, "budget_hit_rate": None}
97
- arr = np.array(latencies)
98
- return {
99
- "count": total,
100
- "p50_ms": float(np.percentile(arr, 50)),
101
- "p95_ms": float(np.percentile(arr, 95)),
102
- "budget_hit_rate": (budget_hits / max(total, 1)),
103
- }
104
-
105
-
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass, asdict
3
+ from typing import Optional, Dict, Any
4
+ import time, json, os, threading
5
+ from .utils.pii_redaction import redact_dict
6
+
7
+
8
class Timer:
    """Wall-clock timer recording named checkpoints in milliseconds."""

    def __init__(self):
        self.t0 = time.perf_counter()
        self.marks: Dict[str, float] = {}

    def mark(self, name: str):
        # Record milliseconds elapsed since construction under `name`.
        elapsed = time.perf_counter() - self.t0
        self.marks[name] = elapsed * 1000.0

    def elapsed_ms(self) -> int:
        # Truncated (not rounded) total milliseconds since construction.
        return int((time.perf_counter() - self.t0) * 1000)
18
+
19
+
20
class JSONLSink:
    """Thread-safe append-only JSON Lines event sink.

    Each event is serialized as one JSON object per line; writes are
    serialized with a lock so concurrent loggers don't interleave lines.
    """

    def __init__(self, path: str):
        self.path = path
        # Fix: os.makedirs('') raises FileNotFoundError, so only create
        # parent directories when the path actually has a directory part.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        self._lock = threading.Lock()

    def write(self, event: Dict[str, Any]):
        """Append `event` as a single JSON line (non-ASCII kept as-is)."""
        line = json.dumps(event, ensure_ascii=False)
        with self._lock:
            with open(self.path, 'a', encoding='utf-8') as f:
                f.write(line + '\n')
31
+
32
+
33
@dataclass
class RetrievalMetrics:
    """One retrieval-run measurement, serializable to a dict via to_event()."""

    query_id: Optional[str]  # caller-supplied query identifier, if any
    community_id: Optional[str]  # graph community targeted, if any
    seeds_count: int  # number of seed nodes used
    ppr_mass: float  # personalized-PageRank mass — semantics set by caller; TODO confirm
    topk: int  # requested result count
    used_budget: Dict[str, Any]  # budget caps and usage (consumed by aggregate_latency_and_budget)
    latency_ms: int  # end-to-end latency in milliseconds
    early_stop_reason: Optional[str]  # why retrieval stopped early, if it did
    engine: str  # retrieval engine identifier
    notes: Optional[Dict[str, Any]] = None  # free-form extra metadata

    def to_event(self) -> Dict[str, Any]:
        """Return a plain-dict snapshot of all fields for JSON logging."""
        return asdict(self)
48
+
49
+
50
class MetricsLogger:
    """Forwards RetrievalMetrics events to an optional sink, redacting PII by default."""

    def __init__(self, sink: Optional[JSONLSink] = None, redact_pii: bool = True):
        self.sink = sink
        self.redact_pii = redact_pii

    def log(self, metrics: RetrievalMetrics):
        # Without a configured sink, logging is a deliberate no-op.
        if not self.sink:
            return
        event = metrics.to_event()
        if self.redact_pii:
            event = redact_dict(event)
        self.sink.write(event)
61
+
62
+
63
def aggregate_latency_and_budget(jsonl_path: str) -> Dict[str, Any]:
    """Summarize a metrics JSONL file: latency percentiles and budget-hit rate.

    An event is a budget hit when any recorded usage dimension in its
    `used_budget` dict reached the matching cap. Unparseable lines still
    count toward `count` but contribute nothing else.
    """
    import numpy as np

    latencies = []
    budget_hits = 0
    total = 0
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for raw in f:
            total += 1
            try:
                event = json.loads(raw)
            except Exception:
                continue
            if 'latency_ms' in event:
                latencies.append(event['latency_ms'])
            budget = event.get('used_budget', {})
            if not isinstance(budget, dict):
                budget = {}
            # A dimension "hits" when usage reached its configured cap;
            # missing usage defaults to -1 so it can never trigger a hit.
            caps_and_usage = (
                (budget.get('max_nodes'), budget.get('nodes', -1)),
                (budget.get('max_edges'), budget.get('edges', -1)),
                (budget.get('max_ms'), budget.get('ms', -1)),
                (budget.get('max_paths'), budget.get('paths', -1)),
            )
            if any(cap is not None and used >= cap for cap, used in caps_and_usage):
                budget_hits += 1
    if not latencies:
        return {"count": total, "p50_ms": None, "p95_ms": None, "budget_hit_rate": None}
    arr = np.array(latencies)
    return {
        "count": total,
        "p50_ms": float(np.percentile(arr, 50)),
        "p95_ms": float(np.percentile(arr, 95)),
        "budget_hit_rate": (budget_hits / max(total, 1)),
    }
104
+
105
+
@@ -1,36 +1,36 @@
1
- from __future__ import annotations
2
- from typing import Iterable, Tuple, Dict, Set
3
-
4
- from .adapters import GraphAccessor, NodeId, RelId
5
-
6
-
7
- def wedge_and_triad_closures(
8
- accessor: GraphAccessor,
9
- community_id: str,
10
- nodes: Iterable[NodeId],
11
- relation_filter: Set[RelId] | None = None,
12
- hop_cap: int = 3,
13
- ) -> Dict[str, float]:
14
- """
15
- Estimate tiny-link yield via wedge/triad closures within a hop cap.
16
- Returns fraction of wedges that close (triangles) and count estimates.
17
- """
18
- nodes = list(nodes)
19
- if not nodes:
20
- return {"wedges": 0, "triads": 0, "closure_rate": 0.0}
21
- wedges = 0
22
- triads = 0
23
- for u in nodes:
24
- nbrs1 = [v for v, r, _ in accessor.iter_out(u) if (not relation_filter or r in relation_filter)]
25
- for v in nbrs1:
26
- nbrs2 = [w for w, r, _ in accessor.iter_out(v) if w != u and (not relation_filter or r in relation_filter)]
27
- for w in nbrs2:
28
- wedges += 1
29
- # Closure if an edge from u to w exists (any relation in filter)
30
- closed = any((x == w and (not relation_filter or r in relation_filter)) for x, r, _ in accessor.iter_out(u))
31
- if closed:
32
- triads += 1
33
- rate = (triads / wedges) if wedges else 0.0
34
- return {"wedges": wedges, "triads": triads, "closure_rate": rate}
35
-
36
-
1
+ from __future__ import annotations
2
+ from typing import Iterable, Tuple, Dict, Set
3
+
4
+ from .adapters import GraphAccessor, NodeId, RelId
5
+
6
+
7
+ def wedge_and_triad_closures(
8
+ accessor: GraphAccessor,
9
+ community_id: str,
10
+ nodes: Iterable[NodeId],
11
+ relation_filter: Set[RelId] | None = None,
12
+ hop_cap: int = 3,
13
+ ) -> Dict[str, float]:
14
+ """
15
+ Estimate tiny-link yield via wedge/triad closures within a hop cap.
16
+ Returns fraction of wedges that close (triangles) and count estimates.
17
+ """
18
+ nodes = list(nodes)
19
+ if not nodes:
20
+ return {"wedges": 0, "triads": 0, "closure_rate": 0.0}
21
+ wedges = 0
22
+ triads = 0
23
+ for u in nodes:
24
+ nbrs1 = [v for v, r, _ in accessor.iter_out(u) if (not relation_filter or r in relation_filter)]
25
+ for v in nbrs1:
26
+ nbrs2 = [w for w, r, _ in accessor.iter_out(v) if w != u and (not relation_filter or r in relation_filter)]
27
+ for w in nbrs2:
28
+ wedges += 1
29
+ # Closure if an edge from u to w exists (any relation in filter)
30
+ closed = any((x == w and (not relation_filter or r in relation_filter)) for x, r, _ in accessor.iter_out(u))
31
+ if closed:
32
+ triads += 1
33
+ rate = (triads / wedges) if wedges else 0.0
34
+ return {"wedges": wedges, "triads": triads, "closure_rate": rate}
35
+
36
+