odin-engine 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- benchmarks/__init__.py +17 -17
- benchmarks/datasets.py +284 -284
- benchmarks/metrics.py +275 -275
- benchmarks/run_ablation.py +279 -279
- benchmarks/run_npll_benchmark.py +270 -270
- npll/__init__.py +10 -10
- npll/bootstrap.py +474 -474
- npll/core/__init__.py +33 -33
- npll/core/knowledge_graph.py +308 -308
- npll/core/logical_rules.py +496 -496
- npll/core/mln.py +474 -474
- npll/inference/__init__.py +40 -40
- npll/inference/e_step.py +419 -419
- npll/inference/elbo.py +434 -434
- npll/inference/m_step.py +576 -576
- npll/npll_model.py +631 -631
- npll/scoring/__init__.py +42 -42
- npll/scoring/embeddings.py +441 -441
- npll/scoring/probability.py +402 -402
- npll/scoring/scoring_module.py +369 -369
- npll/training/__init__.py +24 -24
- npll/training/evaluation.py +496 -496
- npll/training/npll_trainer.py +520 -520
- npll/utils/__init__.py +47 -47
- npll/utils/batch_utils.py +492 -492
- npll/utils/config.py +144 -144
- npll/utils/math_utils.py +338 -338
- odin/__init__.py +21 -20
- odin/engine.py +264 -264
- odin/schema.py +210 -0
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/METADATA +503 -456
- odin_engine-0.2.0.dist-info/RECORD +63 -0
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/licenses/LICENSE +21 -21
- retrieval/__init__.py +50 -50
- retrieval/adapters.py +140 -140
- retrieval/adapters_arango.py +1418 -1418
- retrieval/aggregators.py +707 -707
- retrieval/beam.py +127 -127
- retrieval/budget.py +60 -60
- retrieval/cache.py +159 -159
- retrieval/confidence.py +88 -88
- retrieval/eval.py +49 -49
- retrieval/linker.py +87 -87
- retrieval/metrics.py +105 -105
- retrieval/metrics_motifs.py +36 -36
- retrieval/orchestrator.py +571 -571
- retrieval/ppr/__init__.py +12 -12
- retrieval/ppr/anchors.py +41 -41
- retrieval/ppr/bippr.py +61 -61
- retrieval/ppr/engines.py +257 -257
- retrieval/ppr/global_pr.py +76 -76
- retrieval/ppr/indexes.py +78 -78
- retrieval/ppr.py +156 -156
- retrieval/ppr_cache.py +25 -25
- retrieval/scoring.py +294 -294
- retrieval/utils/pii_redaction.py +36 -36
- retrieval/writers/__init__.py +9 -9
- retrieval/writers/arango_writer.py +28 -28
- retrieval/writers/base.py +21 -21
- retrieval/writers/janus_writer.py +36 -36
- odin_engine-0.1.0.dist-info/RECORD +0 -62
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/WHEEL +0 -0
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/top_level.txt +0 -0
retrieval/linker.py
CHANGED
@@ -1,87 +1,87 @@
(All 87 lines are removed and re-added with identical content, likely a packaging-level rewrite with no source change; the file body is shown once below.)

from __future__ import annotations
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple

from .adapters import NodeId


@dataclass
class LinkerConfig:
    candidates_per_mention: int = 10
    coherence_iterations: int = 1
    persist_threshold: float = 0.8
    w_candidate: float = 0.6
    w_prior: float = 0.3
    w_coherence: float = 0.1


@dataclass
class Mention:
    mention_id: str
    surface: str
    normalized: Optional[str]
    span: Tuple[int, int]
    context: Optional[str]
    llm_confidence: float
    candidates: List[Tuple[NodeId, float]]  # (entity_id, candidate_score)


class CoherenceLinker:
    """
    Skeleton linker that accepts LLM mentions with candidates and returns linked entities.
    Coherence/ranking by graph priors (to be plugged-in): use PPR/anchors in orchestrator.
    """

    def __init__(self, cfg: LinkerConfig):
        self.cfg = cfg

    def link(
        self,
        mentions: List[Mention],
        entity_prior: Optional[Dict[NodeId, float]] = None,
        coherence_fn: Optional[callable] = None,
    ) -> Dict[str, Dict[str, object]]:
        pri = entity_prior or {}
        # Initialize by local best per mention
        assignment: Dict[str, Tuple[NodeId, float]] = {}
        for m in mentions:
            cs = sorted(m.candidates, key=lambda x: x[1], reverse=True)[: self.cfg.candidates_per_mention]
            if not cs:
                continue
            ent, score = cs[0]
            assignment[m.mention_id] = (ent, float(score))

        # Iterative coherence re-weighting (greedy)
        for _ in range(max(1, self.cfg.coherence_iterations)):
            linked_entities = [e for (_, (e, _)) in assignment.items()]
            for m in mentions:
                cs = sorted(m.candidates, key=lambda x: x[1], reverse=True)[: self.cfg.candidates_per_mention]
                best_ent, best_val = None, -1e9
                for ent, cand_score in cs:
                    prior = pri.get(ent, 0.0)
                    coh = 0.0
                    if coherence_fn and linked_entities:
                        coh = sum(coherence_fn(ent, le) for le in linked_entities) / max(1, len(linked_entities))
                    val = (
                        self.cfg.w_candidate * cand_score
                        + self.cfg.w_prior * prior
                        + self.cfg.w_coherence * coh
                    )
                    if val > best_val:
                        best_val = val
                        best_ent = ent
                if best_ent is not None:
                    assignment[m.mention_id] = (best_ent, float(best_val))

        # Produce results with normalized confidence in [0,1]
        # Here we map the composite score through min-max over chosen candidates for a rough normalization
        vals = [v for (_, v) in assignment.values()]
        vmin, vmax = (min(vals), max(vals)) if vals else (0.0, 1.0)
        rng = max(vmax - vmin, 1e-9)
        results: Dict[str, Dict[str, object]] = {}
        for mid, (ent, val) in assignment.items():
            norm = (val - vmin) / rng
            results[mid] = {"entity_id": ent, "link_confidence": float(norm)}
        return results
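A minimal usage sketch for the linker follows. Everything in it is illustrative: the mention data, the entity IDs (NodeId is treated as a plain string), the prior values, and the toy coherence_fn are assumptions, not part of the package; only LinkerConfig, Mention, and CoherenceLinker come from the module above.

# Hypothetical usage sketch; toy data, not from the package.
from retrieval.linker import CoherenceLinker, LinkerConfig, Mention

mentions = [
    Mention(
        mention_id="m1",
        surface="Paris",
        normalized="paris",
        span=(0, 5),
        context="Paris hosted the summit.",
        llm_confidence=0.9,
        candidates=[("city:paris", 0.8), ("person:paris", 0.4)],
    ),
    Mention(
        mention_id="m2",
        surface="France",
        normalized="france",
        span=(20, 26),
        context=None,
        llm_confidence=0.95,
        candidates=[("country:france", 0.9)],
    ),
]

def coherence_fn(a: str, b: str) -> float:
    # Toy pairwise coherence (assumption): geographic entities cohere, others do not.
    geo = {"city:paris", "country:france"}
    return 1.0 if a in geo and b in geo else 0.0

linker = CoherenceLinker(LinkerConfig(coherence_iterations=2))
results = linker.link(
    mentions,
    entity_prior={"city:paris": 0.7},  # e.g. PPR mass per entity
    coherence_fn=coherence_fn,
)
print(results)  # {"m1": {"entity_id": "city:paris", "link_confidence": ...}, ...}

One property of the output worth knowing: link_confidence is min-max normalized over the chosen assignments, so the lowest-scoring mention in a batch always comes out at 0.0; if persist_threshold is applied to this value downstream, single outliers can be suppressed even when their raw composite score is high.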
retrieval/metrics.py
CHANGED
@@ -1,105 +1,105 @@
(All 105 lines are removed and re-added with identical content, likely a packaging-level rewrite with no source change; the file body is shown once below.)

from __future__ import annotations
from dataclasses import dataclass, asdict
from typing import Optional, Dict, Any
import time, json, os, threading
from .utils.pii_redaction import redact_dict


class Timer:
    def __init__(self):
        self.t0 = time.perf_counter()
        self.marks: Dict[str, float] = {}

    def mark(self, name: str):
        self.marks[name] = (time.perf_counter() - self.t0) * 1000.0

    def elapsed_ms(self) -> int:
        return int((time.perf_counter() - self.t0) * 1000)


class JSONLSink:
    def __init__(self, path: str):
        self.path = path
        os.makedirs(os.path.dirname(path), exist_ok=True)
        self._lock = threading.Lock()

    def write(self, event: Dict[str, Any]):
        line = json.dumps(event, ensure_ascii=False)
        with self._lock:
            with open(self.path, 'a', encoding='utf-8') as f:
                f.write(line + '\n')


@dataclass
class RetrievalMetrics:
    query_id: Optional[str]
    community_id: Optional[str]
    seeds_count: int
    ppr_mass: float
    topk: int
    used_budget: Dict[str, Any]
    latency_ms: int
    early_stop_reason: Optional[str]
    engine: str
    notes: Optional[Dict[str, Any]] = None

    def to_event(self) -> Dict[str, Any]:
        return asdict(self)


class MetricsLogger:
    def __init__(self, sink: Optional[JSONLSink] = None, redact_pii: bool = True):
        self.sink = sink
        self.redact_pii = redact_pii

    def log(self, metrics: RetrievalMetrics):
        if self.sink:
            event = metrics.to_event()
            if self.redact_pii:
                event = redact_dict(event)
            self.sink.write(event)


def aggregate_latency_and_budget(jsonl_path: str) -> Dict[str, Any]:
    import numpy as np
    latencies, budget_hits = [], 0
    total = 0
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            total += 1
            try:
                ev = json.loads(line)
            except Exception:
                continue
            if 'latency_ms' in ev:
                latencies.append(ev['latency_ms'])
            used = ev.get('used_budget', {})
            bud = used if isinstance(used, dict) else {}
            # Budget hit if any dimension equals its cap
            max_nodes = bud.get('max_nodes')
            max_edges = bud.get('max_edges')
            max_ms = bud.get('max_ms')
            max_paths = bud.get('max_paths')
            u_nodes = bud.get('nodes', -1)
            u_edges = bud.get('edges', -1)
            u_ms = bud.get('ms', -1)
            u_paths = bud.get('paths', -1)
            hit = (
                (max_nodes is not None and u_nodes >= max_nodes)
                or (max_edges is not None and u_edges >= max_edges)
                or (max_ms is not None and u_ms >= max_ms)
                or (max_paths is not None and u_paths >= max_paths)
            )
            if hit:
                budget_hits += 1
    if not latencies:
        return {"count": total, "p50_ms": None, "p95_ms": None, "budget_hit_rate": None}
    arr = np.array(latencies)
    return {
        "count": total,
        "p50_ms": float(np.percentile(arr, 50)),
        "p95_ms": float(np.percentile(arr, 95)),
        "budget_hit_rate": (budget_hits / max(total, 1)),
    }
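A sketch of the logging path, under stated assumptions: the log path, engine name, and budget numbers are invented for illustration, and only Timer, JSONLSink, RetrievalMetrics, MetricsLogger, and aggregate_latency_and_budget come from the module (numpy is required by the aggregator).

# Hypothetical end-to-end sketch; field values and paths are assumptions.
from retrieval.metrics import (
    Timer,
    JSONLSink,
    RetrievalMetrics,
    MetricsLogger,
    aggregate_latency_and_budget,
)

timer = Timer()
# ... run retrieval here ...
timer.mark("ppr_done")  # records elapsed milliseconds under a name

metrics = RetrievalMetrics(
    query_id="q-001",
    community_id=None,
    seeds_count=3,
    ppr_mass=0.92,
    topk=20,
    # used_budget mixes caps (max_*) and usage; the aggregator counts a
    # "budget hit" when usage >= cap on any dimension, as with nodes here.
    used_budget={"max_nodes": 1000, "nodes": 1000, "max_ms": 50, "ms": 17},
    latency_ms=timer.elapsed_ms(),
    early_stop_reason=None,
    engine="bippr",
)

logger = MetricsLogger(sink=JSONLSink("logs/retrieval_metrics.jsonl"))
logger.log(metrics)  # events are PII-redacted by default before writing

print(aggregate_latency_and_budget("logs/retrieval_metrics.jsonl"))
# -> {"count": 1, "p50_ms": ..., "p95_ms": ..., "budget_hit_rate": 1.0}

One quirk worth noting: JSONLSink calls os.makedirs(os.path.dirname(path), ...), so a bare filename with no directory component would raise; give the sink a path with at least one directory, as above.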
retrieval/metrics_motifs.py
CHANGED
@@ -1,36 +1,36 @@
(All 36 lines are removed and re-added with identical content, likely a packaging-level rewrite with no source change; the file body is shown once below.)

from __future__ import annotations
from typing import Iterable, Tuple, Dict, Set

from .adapters import GraphAccessor, NodeId, RelId


def wedge_and_triad_closures(
    accessor: GraphAccessor,
    community_id: str,
    nodes: Iterable[NodeId],
    relation_filter: Set[RelId] | None = None,
    hop_cap: int = 3,
) -> Dict[str, float]:
    """
    Estimate tiny-link yield via wedge/triad closures within a hop cap.
    Returns fraction of wedges that close (triangles) and count estimates.
    """
    nodes = list(nodes)
    if not nodes:
        return {"wedges": 0, "triads": 0, "closure_rate": 0.0}
    wedges = 0
    triads = 0
    for u in nodes:
        nbrs1 = [v for v, r, _ in accessor.iter_out(u) if (not relation_filter or r in relation_filter)]
        for v in nbrs1:
            nbrs2 = [w for w, r, _ in accessor.iter_out(v) if w != u and (not relation_filter or r in relation_filter)]
            for w in nbrs2:
                wedges += 1
                # Closure if an edge from u to w exists (any relation in filter)
                closed = any((x == w and (not relation_filter or r in relation_filter)) for x, r, _ in accessor.iter_out(u))
                if closed:
                    triads += 1
    rate = (triads / wedges) if wedges else 0.0
    return {"wedges": wedges, "triads": triads, "closure_rate": rate}
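To show the counting logic concretely, here is a sketch with a hypothetical in-memory stub standing in for a real GraphAccessor (the database-backed implementations live in retrieval/adapters_arango.py). The function only calls accessor.iter_out(u), which is assumed to yield triples whose first two items are (neighbor, relation); the third item is ignored here, so duck typing suffices.

# Hypothetical stub; only iter_out() is exercised by wedge_and_triad_closures.
from retrieval.metrics_motifs import wedge_and_triad_closures

class DictGraph:
    def __init__(self, edges):
        self.edges = edges  # {u: [(v, relation, weight), ...]}

    def iter_out(self, u):
        return iter(self.edges.get(u, []))

# Triangle a->b->c with a->c closing it, plus an open wedge a->b->d.
g = DictGraph({
    "a": [("b", "rel", 1.0), ("c", "rel", 1.0)],
    "b": [("c", "rel", 1.0), ("d", "rel", 1.0)],
    "c": [],
    "d": [],
})

stats = wedge_and_triad_closures(g, community_id="c0", nodes=["a", "b", "c", "d"])
print(stats)  # {"wedges": 2, "triads": 1, "closure_rate": 0.5}

As written, community_id and hop_cap are accepted but never read, and the closure check rescans iter_out(u) for every wedge rooted at u, so cost grows with out-degree squared; a production adapter would likely cache each node's out-neighborhood.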