odin-engine 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (63)
  1. benchmarks/__init__.py +17 -17
  2. benchmarks/datasets.py +284 -284
  3. benchmarks/metrics.py +275 -275
  4. benchmarks/run_ablation.py +279 -279
  5. benchmarks/run_npll_benchmark.py +270 -270
  6. npll/__init__.py +10 -10
  7. npll/bootstrap.py +474 -474
  8. npll/core/__init__.py +33 -33
  9. npll/core/knowledge_graph.py +308 -308
  10. npll/core/logical_rules.py +496 -496
  11. npll/core/mln.py +474 -474
  12. npll/inference/__init__.py +40 -40
  13. npll/inference/e_step.py +419 -419
  14. npll/inference/elbo.py +434 -434
  15. npll/inference/m_step.py +576 -576
  16. npll/npll_model.py +631 -631
  17. npll/scoring/__init__.py +42 -42
  18. npll/scoring/embeddings.py +441 -441
  19. npll/scoring/probability.py +402 -402
  20. npll/scoring/scoring_module.py +369 -369
  21. npll/training/__init__.py +24 -24
  22. npll/training/evaluation.py +496 -496
  23. npll/training/npll_trainer.py +520 -520
  24. npll/utils/__init__.py +47 -47
  25. npll/utils/batch_utils.py +492 -492
  26. npll/utils/config.py +144 -144
  27. npll/utils/math_utils.py +338 -338
  28. odin/__init__.py +21 -20
  29. odin/engine.py +264 -264
  30. odin/schema.py +210 -0
  31. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/METADATA +503 -456
  32. odin_engine-0.2.0.dist-info/RECORD +63 -0
  33. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/licenses/LICENSE +21 -21
  34. retrieval/__init__.py +50 -50
  35. retrieval/adapters.py +140 -140
  36. retrieval/adapters_arango.py +1418 -1418
  37. retrieval/aggregators.py +707 -707
  38. retrieval/beam.py +127 -127
  39. retrieval/budget.py +60 -60
  40. retrieval/cache.py +159 -159
  41. retrieval/confidence.py +88 -88
  42. retrieval/eval.py +49 -49
  43. retrieval/linker.py +87 -87
  44. retrieval/metrics.py +105 -105
  45. retrieval/metrics_motifs.py +36 -36
  46. retrieval/orchestrator.py +571 -571
  47. retrieval/ppr/__init__.py +12 -12
  48. retrieval/ppr/anchors.py +41 -41
  49. retrieval/ppr/bippr.py +61 -61
  50. retrieval/ppr/engines.py +257 -257
  51. retrieval/ppr/global_pr.py +76 -76
  52. retrieval/ppr/indexes.py +78 -78
  53. retrieval/ppr.py +156 -156
  54. retrieval/ppr_cache.py +25 -25
  55. retrieval/scoring.py +294 -294
  56. retrieval/utils/pii_redaction.py +36 -36
  57. retrieval/writers/__init__.py +9 -9
  58. retrieval/writers/arango_writer.py +28 -28
  59. retrieval/writers/base.py +21 -21
  60. retrieval/writers/janus_writer.py +36 -36
  61. odin_engine-0.1.0.dist-info/RECORD +0 -62
  62. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/WHEEL +0 -0
  63. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/top_level.txt +0 -0
retrieval/scoring.py CHANGED
@@ -1,294 +1,294 @@ (the removed and re-added lines are identical; content shown once below)
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional, Iterable
import math

from .adapters import NodeId, RelId, GraphAccessor
from .confidence import EdgeConfidenceProvider, ConstantConfidence


@dataclass
class PathScoreConfig:
    relation_type_prior: Optional[Dict[RelId, float]] = None
    recency_tau_days: float = 30.0
    recency_clamp: Tuple[float, float] = (0.5, 1.0)
    power_mean_rho: float = 0.5
    # New signals for GNN/Bridge integration
    bridge_boost: float = 1.5  # Multiplier for bridge nodes
    affinity_boost: float = 1.5  # Multiplier for high-affinity cross-community edges
    use_bridge_scoring: bool = True


def _logit(p: float) -> float:
    p = min(max(p, 1e-12), 1.0 - 1e-12)
    return math.log(p / (1.0 - p))


def _sigmoid(x: float) -> float:
    if x >= 0:
        z = math.exp(-x)
        return 1.0 / (1.0 + z)
    else:
        z = math.exp(x)
        return z / (1.0 + z)


def combine_edge_confidence(raw: float, npll: Optional[float] = None, calibrated: Optional[float] = None,
                            weights: Tuple[float, float, float] = (0.5, 0.4, 0.1)) -> float:
    w_raw, w_npll, w_cal = weights
    terms: List[Tuple[float, float]] = [(w_raw, raw)]
    if npll is not None:
        terms.append((w_npll, npll))
    if calibrated is not None:
        terms.append((w_cal, calibrated))
    # Normalize the weights of the terms actually present
    total_w = sum(w for w, _ in terms) or 1.0
    norm = [(w / total_w, p) for w, p in terms]
    s = sum(w * _logit(p) for w, p in norm)
    return _sigmoid(s)


def power_mean(values: Iterable[float], rho: float) -> float:
    values = list(values)
    if not values:
        return 0.0
    if rho == 0.0:
        # geometric mean
        return math.exp(sum(math.log(max(v, 1e-12)) for v in values) / len(values))
    acc = sum((max(v, 1e-12)) ** rho for v in values) / len(values)
    return max(acc, 1e-12) ** (1.0 / rho)


def recency_factor(edge_timestamps: List[Optional[float]], now_ts: Optional[float], tau_days: float,
                   clamp: Tuple[float, float]) -> float:
    if not edge_timestamps:
        return 1.0
    lo, hi = clamp
    total = 1.0
    for ts in edge_timestamps:
        if ts is None or now_ts is None:
            continue
        dt_days = max(0.0, (now_ts - ts) / (60 * 60 * 24))
        total *= math.exp(-dt_days / max(tau_days, 1e-6))
    return min(max(total, lo), hi)


def path_score(
    path_edges: List[Tuple[NodeId, RelId, NodeId]],
    node_ppr: Dict[NodeId, float],
    conf_provider: EdgeConfidenceProvider,
    now_ts: Optional[float],
    edge_timestamp_lookup,
    cfg: PathScoreConfig = PathScoreConfig(),
    # New params
    node_bridge_scores: Optional[Dict[NodeId, float]] = None,
    edge_affinity_scores: Optional[Dict[Tuple[NodeId, NodeId], float]] = None,
) -> float:
    # Edge confidence term (product of effective confidences)
    edge_confs: List[float] = []
    type_priors: List[float] = []
    ts_list: List[Optional[float]] = []
    affinity_mults: List[float] = []  # New

    priors = cfg.relation_type_prior or {}
    for u, rel, v in path_edges:
        c = conf_provider.confidence(u, rel, v)
        edge_confs.append(max(min(c, 1.0), 1e-12))
        type_priors.append(max(priors.get(rel, 1.0), 1e-6))
        ts_list.append(edge_timestamp_lookup(u, rel, v) if edge_timestamp_lookup else None)

        # Affinity term: check (u, v) or (v, u)
        if cfg.use_bridge_scoring and edge_affinity_scores:
            aff = edge_affinity_scores.get((u, v), edge_affinity_scores.get((v, u), 0.0))
            if aff > 0:
                affinity_mults.append(1.0 + (aff * cfg.affinity_boost))
            else:
                affinity_mults.append(1.0)
        else:
            affinity_mults.append(1.0)

    edge_term = 1.0
    for c in edge_confs:
        edge_term *= c

    # PPR term: power mean across unique nodes on the path
    path_nodes: List[NodeId] = [path_edges[0][0]] + [v for (_, _, v) in path_edges] if path_edges else []
    ppr_vals = [node_ppr.get(n, 1e-12) for n in path_nodes]
    ppr_term = power_mean(ppr_vals, cfg.power_mean_rho)

    # Bridge term: boost nodes that are bridges
    bridge_term = 1.0
    if cfg.use_bridge_scoring and node_bridge_scores:
        # Aggregate the bridge scores via the power mean;
        # node_bridge_scores should be pre-processed factors (e.g. 1.0 to 2.0)
        b_vals = [node_bridge_scores.get(n, 1.0) for n in path_nodes]
        bridge_term = power_mean(b_vals, cfg.power_mean_rho)

    # Affinity term aggregation
    affinity_term = 1.0
    for a in affinity_mults:
        affinity_term *= a

    # Type priors multiply
    prior_term = 1.0
    for t in type_priors:
        prior_term *= t

    # Recency multiplicative factor (clamped)
    rec_term = recency_factor(ts_list, now_ts, cfg.recency_tau_days, cfg.recency_clamp)

    return float(edge_term * ppr_term * bridge_term * affinity_term * prior_term * rec_term)


def aggregate_evidence_strength(path_scores: List[float], top_k: int = 5) -> float:
    if not path_scores:
        return 0.0
    top = sorted(path_scores, reverse=True)[:top_k]
    prod = 1.0
    for s in top:
        prod *= max(0.0, 1.0 - s)
    return 1.0 - prod


@dataclass
class InsightScoreConfig:
    alpha: float = 0.5
    beta: float = 0.2
    gamma: float = 0.2
    delta: float = 0.1


def insight_score(
    evidence_strength: float,
    community_relevance: float,
    explanation_quality: float,
    business_impact_proxy: float,
    cfg: InsightScoreConfig = InsightScoreConfig(),
) -> float:
    # Normalize inputs into [0, 1]
    e = min(max(evidence_strength, 0.0), 1.0)
    c = min(max(community_relevance, 0.0), 1.0)
    x = min(max(explanation_quality, 0.0), 1.0)
    b = min(max(business_impact_proxy, 0.0), 1.0)
    w_sum = max(cfg.alpha + cfg.beta + cfg.gamma + cfg.delta, 1e-9)
    a = cfg.alpha / w_sum
    bb = cfg.beta / w_sum
    g = cfg.gamma / w_sum
    d = cfg.delta / w_sum
    return float(a * e + bb * c + g * x + d * b)


def compute_community_relevance(nodes_in_insight: Iterable[NodeId], node_ppr: Dict[NodeId, float]) -> float:
    return float(sum(node_ppr.get(n, 0.0) for n in set(nodes_in_insight)))


def score_paths_and_insight(
    accessor: GraphAccessor,
    community_id: str,
    seeds: List[NodeId],
    node_ppr_scores: List[Tuple[NodeId, float]],
    candidate_paths: List[List[Tuple[NodeId, RelId, NodeId]]],
    conf_provider: EdgeConfidenceProvider = ConstantConfidence(0.8),
    now_ts: Optional[float] = None,
    edge_timestamp_lookup=None,
    path_cfg: PathScoreConfig = PathScoreConfig(),
    insight_cfg: InsightScoreConfig = InsightScoreConfig(),
    top_k_paths: int = 5,
) -> Dict[str, object]:
    node_ppr = {n: p for n, p in node_ppr_scores}

    # Pre-fetch bridge & affinity data if the accessor supports it
    node_bridge_scores: Dict[NodeId, float] = {}
    edge_affinity_scores: Dict[Tuple[NodeId, NodeId], float] = {}

    if path_cfg.use_bridge_scoring:
        # Duck-typing check for GlobalGraphAccessor capabilities
        has_bridge = hasattr(accessor, "is_bridge")
        has_affinity = hasattr(accessor, "get_affinity") and hasattr(accessor, "get_entity_community")

        if has_bridge or has_affinity:
            unique_nodes = set()
            for p in candidate_paths:
                unique_nodes.add(p[0][0] if p else "")
                for _, _, v in p:
                    unique_nodes.add(v)
            if "" in unique_nodes:
                unique_nodes.remove("")

            node_communities = {}
            for n in unique_nodes:
                if has_bridge:
                    b_data = accessor.is_bridge(n)
                    if b_data:
                        # Normalize strength: 1 + log(1 + strength) * scaling
                        strength = b_data.get("bridge_strength", 0)
                        if strength > 0:
                            # e.g., strength 10 -> log(11) ~ 2.4 -> 1.24 multiplier (if boost = 1.0)
                            node_bridge_scores[n] = 1.0 + (math.log(1 + strength) * 0.1 * path_cfg.bridge_boost)

                if has_affinity:
                    comm = accessor.get_entity_community(n)
                    if comm:
                        node_communities[n] = comm

            if has_affinity:
                for p in candidate_paths:
                    for u, _, v in p:
                        pair = (u, v)
                        if pair not in edge_affinity_scores and (v, u) not in edge_affinity_scores:
                            c_u = node_communities.get(u)
                            c_v = node_communities.get(v)
                            if c_u and c_v and c_u != c_v:
                                aff = accessor.get_affinity(c_u, c_v)
                                edge_affinity_scores[pair] = aff

    path_scores: List[float] = []
    scored_paths: List[Dict[str, object]] = []
    for edges in candidate_paths:
        ps = path_score(
            edges, node_ppr, conf_provider, now_ts, edge_timestamp_lookup, path_cfg,
            node_bridge_scores=node_bridge_scores,
            edge_affinity_scores=edge_affinity_scores,
        )
        path_scores.append(ps)
        nodes = [edges[0][0]] + [e[2] for e in edges] if edges else []
        # Decomposition terms for transparency
        ppr_vals = [node_ppr.get(n, 1e-12) for n in nodes]
        priors = path_cfg.relation_type_prior or {}
        pri_list = [max(priors.get(r, 1.0), 1e-6) for (_, r, _) in edges]
        confs = [conf_provider.confidence(u, r, v) for (u, r, v) in edges]
        recs = [edge_timestamp_lookup(u, r, v) if edge_timestamp_lookup else None for (u, r, v) in edges]

        # New decomposition terms
        bridge_vals = [node_bridge_scores.get(n, 1.0) for n in nodes]
        aff_vals = []
        for u, r, v in edges:
            aff = edge_affinity_scores.get((u, v), edge_affinity_scores.get((v, u), 0.0))
            aff_vals.append(aff)

        scored_paths.append({
            "score": ps,
            "nodes": nodes,
            "edges": [{"u": u, "rel": r, "v": v} for (u, r, v) in edges],
            "decomp": {
                "ppr_values": ppr_vals,
                "type_priors": pri_list,
                "edge_confidences": confs,
                "edge_timestamps": recs,
                "bridge_scores": bridge_vals,
                "affinity_scores": aff_vals,
            },
        })
    es = aggregate_evidence_strength(path_scores, top_k=top_k_paths)
    nodes_in_insight = set(n for p in scored_paths for n in p["nodes"])
    comm_rel = compute_community_relevance(nodes_in_insight, node_ppr)
    ins = insight_score(es, comm_rel, explanation_quality=0.5, business_impact_proxy=0.5, cfg=insight_cfg)
    return {
        "paths": sorted(scored_paths, key=lambda x: x["score"], reverse=True),
        "evidence_strength": es,
        "community_relevance": comm_rel,
        "insight_score": ins,
    }
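As a worked illustration of the two scoring primitives above, an editor's sketch (the numbers are invented, not from the package):

    from retrieval.scoring import combine_edge_confidence, aggregate_evidence_strength

    # Logit-space blend: with npll given and calibrated omitted, the weights
    # (0.5, 0.4) renormalize to (5/9, 4/9), so the blend is
    # 5/9 * logit(0.8) + 4/9 * logit(0.6) ~= 0.95, and sigmoid(0.95) ~= 0.72.
    print(round(combine_edge_confidence(0.8, npll=0.6), 3))       # -> 0.721

    # Noisy-OR over the top-k path scores: 0.5, 0.4, 0.1 give
    # 1 - (0.5 * 0.6 * 0.9) = 0.73.
    print(round(aggregate_evidence_strength([0.5, 0.4, 0.1]), 2))  # -> 0.73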
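And a minimal end-to-end sketch of score_paths_and_insight (node IDs and PPR mass are hypothetical; an accessor without is_bridge/get_affinity simply leaves the bridge and affinity terms at their neutral 1.0):

    from retrieval.scoring import score_paths_and_insight, PathScoreConfig

    class PlainAccessor:
        # No bridge/affinity methods, so the duck-typing checks fall through.
        pass

    candidate_paths = [
        [("acme", "supplies", "widgetco"), ("widgetco", "ships_to", "emea_hub")],
        [("acme", "partners_with", "emea_hub")],
    ]
    node_ppr_scores = [("acme", 0.12), ("widgetco", 0.05), ("emea_hub", 0.08)]

    result = score_paths_and_insight(
        accessor=PlainAccessor(),
        community_id="c42",
        seeds=["acme"],
        node_ppr_scores=node_ppr_scores,
        candidate_paths=candidate_paths,
        path_cfg=PathScoreConfig(recency_tau_days=14.0),
    )
    print(result["evidence_strength"], result["insight_score"])

With no timestamp lookup the recency factor stays at 1.0, and the default ConstantConfidence(0.8) supplies every edge confidence.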
retrieval/utils/pii_redaction.py CHANGED
@@ -1,36 +1,36 @@ (the removed and re-added lines are identical; content shown once below)
import re
from typing import Any, Dict

# Simple regex patterns for common PII
PII_PATTERNS = {
    'EMAIL': re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'),
    'PHONE': re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'),
}

def redact_text(text: str) -> str:
    """Redacts PII from a single string."""
    if not isinstance(text, str):
        return text
    for pii_type, pattern in PII_PATTERNS.items():
        text = pattern.sub(f'[{pii_type}_REDACTED]', text)
    return text

def redact_dict(data: Dict[str, Any]) -> Dict[str, Any]:
    """Recursively redacts PII from string values in a dictionary."""
    if not isinstance(data, dict):
        return data

    clean_dict = {}
    for key, value in data.items():
        if isinstance(value, str):
            clean_dict[key] = redact_text(value)
        elif isinstance(value, dict):
            clean_dict[key] = redact_dict(value)
        elif isinstance(value, list):
            clean_dict[key] = [
                redact_dict(item) if isinstance(item, dict) else (redact_text(item) if isinstance(item, str) else item)
                for item in value
            ]
        else:
            clean_dict[key] = value
    return clean_dict
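A quick usage sketch of the redaction helpers (sample values invented):

    from retrieval.utils.pii_redaction import redact_dict

    record = {
        "owner": "Ada",
        "contact": {"email": "ada@example.com", "phone": "(555) 123-4567"},
        "notes": ["callback 555-123-4567", 42],
    }
    print(redact_dict(record))
    # output (wrapped for readability):
    # {'owner': 'Ada',
    #  'contact': {'email': '[EMAIL_REDACTED]', 'phone': '[PHONE_REDACTED]'},
    #  'notes': ['callback [PHONE_REDACTED]', 42]}

Nested dicts recurse, strings inside lists are redacted, and non-string leaves (like the 42) pass through unchanged.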