odin_engine-0.1.0-py3-none-any.whl → odin_engine-0.2.0-py3-none-any.whl
This diff shows the content of publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- benchmarks/__init__.py +17 -17
- benchmarks/datasets.py +284 -284
- benchmarks/metrics.py +275 -275
- benchmarks/run_ablation.py +279 -279
- benchmarks/run_npll_benchmark.py +270 -270
- npll/__init__.py +10 -10
- npll/bootstrap.py +474 -474
- npll/core/__init__.py +33 -33
- npll/core/knowledge_graph.py +308 -308
- npll/core/logical_rules.py +496 -496
- npll/core/mln.py +474 -474
- npll/inference/__init__.py +40 -40
- npll/inference/e_step.py +419 -419
- npll/inference/elbo.py +434 -434
- npll/inference/m_step.py +576 -576
- npll/npll_model.py +631 -631
- npll/scoring/__init__.py +42 -42
- npll/scoring/embeddings.py +441 -441
- npll/scoring/probability.py +402 -402
- npll/scoring/scoring_module.py +369 -369
- npll/training/__init__.py +24 -24
- npll/training/evaluation.py +496 -496
- npll/training/npll_trainer.py +520 -520
- npll/utils/__init__.py +47 -47
- npll/utils/batch_utils.py +492 -492
- npll/utils/config.py +144 -144
- npll/utils/math_utils.py +338 -338
- odin/__init__.py +21 -20
- odin/engine.py +264 -264
- odin/schema.py +210 -0
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/METADATA +503 -456
- odin_engine-0.2.0.dist-info/RECORD +63 -0
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/licenses/LICENSE +21 -21
- retrieval/__init__.py +50 -50
- retrieval/adapters.py +140 -140
- retrieval/adapters_arango.py +1418 -1418
- retrieval/aggregators.py +707 -707
- retrieval/beam.py +127 -127
- retrieval/budget.py +60 -60
- retrieval/cache.py +159 -159
- retrieval/confidence.py +88 -88
- retrieval/eval.py +49 -49
- retrieval/linker.py +87 -87
- retrieval/metrics.py +105 -105
- retrieval/metrics_motifs.py +36 -36
- retrieval/orchestrator.py +571 -571
- retrieval/ppr/__init__.py +12 -12
- retrieval/ppr/anchors.py +41 -41
- retrieval/ppr/bippr.py +61 -61
- retrieval/ppr/engines.py +257 -257
- retrieval/ppr/global_pr.py +76 -76
- retrieval/ppr/indexes.py +78 -78
- retrieval/ppr.py +156 -156
- retrieval/ppr_cache.py +25 -25
- retrieval/scoring.py +294 -294
- retrieval/utils/pii_redaction.py +36 -36
- retrieval/writers/__init__.py +9 -9
- retrieval/writers/arango_writer.py +28 -28
- retrieval/writers/base.py +21 -21
- retrieval/writers/janus_writer.py +36 -36
- odin_engine-0.1.0.dist-info/RECORD +0 -62
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/WHEEL +0 -0
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/top_level.txt +0 -0
retrieval/scoring.py
CHANGED
@@ -1,294 +1,294 @@

Every one of the file's 294 lines is marked as removed and re-added, but the removed and added content is identical line for line, so there is no textual source change (likely a rebuild or line-ending artifact). The file as shipped in both versions:

```python
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional, Iterable
import math

from .adapters import NodeId, RelId, GraphAccessor
from .confidence import EdgeConfidenceProvider, ConstantConfidence


@dataclass
class PathScoreConfig:
    relation_type_prior: Optional[Dict[RelId, float]] = None
    recency_tau_days: float = 30.0
    recency_clamp: Tuple[float, float] = (0.5, 1.0)
    power_mean_rho: float = 0.5
    # New signals for GNN/Bridge integration
    bridge_boost: float = 1.5  # Multiplier for bridge nodes
    affinity_boost: float = 1.5  # Multiplier for high-affinity cross-community edges
    use_bridge_scoring: bool = True


def _logit(p: float) -> float:
    p = min(max(p, 1e-12), 1.0 - 1e-12)
    return math.log(p / (1.0 - p))


def _sigmoid(x: float) -> float:
    if x >= 0:
        z = math.exp(-x)
        return 1.0 / (1.0 + z)
    else:
        z = math.exp(x)
        return z / (1.0 + z)


def combine_edge_confidence(raw: float, npll: Optional[float] = None, calibrated: Optional[float] = None,
                            weights: Tuple[float, float, float] = (0.5, 0.4, 0.1)) -> float:
    w_raw, w_npll, w_cal = weights
    terms: List[Tuple[float, float]] = [(w_raw, raw)]
    if npll is not None:
        terms.append((w_npll, npll))
    if calibrated is not None:
        terms.append((w_cal, calibrated))
    # Normalize weights present
    total_w = sum(w for w, _ in terms) or 1.0
    norm = [(w / total_w, p) for w, p in terms]
    s = sum(w * _logit(p) for w, p in norm)
    return _sigmoid(s)


def power_mean(values: Iterable[float], rho: float) -> float:
    values = list(values)
    if not values:
        return 0.0
    if rho == 0.0:
        # geometric mean
        return math.exp(sum(math.log(max(v, 1e-12)) for v in values) / len(values))
    acc = sum((max(v, 1e-12)) ** rho for v in values) / len(values)
    return max(acc, 1e-12) ** (1.0 / rho)


def recency_factor(edge_timestamps: List[Optional[float]], now_ts: Optional[float], tau_days: float,
                   clamp: Tuple[float, float]) -> float:
    if not edge_timestamps:
        return 1.0
    lo, hi = clamp
    total = 1.0
    for ts in edge_timestamps:
        if ts is None or now_ts is None:
            continue
        dt_days = max(0.0, (now_ts - ts) / (60 * 60 * 24))
        total *= math.exp(-dt_days / max(tau_days, 1e-6))
    return min(max(total, lo), hi)


def path_score(
    path_edges: List[Tuple[NodeId, RelId, NodeId]],
    node_ppr: Dict[NodeId, float],
    conf_provider: EdgeConfidenceProvider,
    now_ts: Optional[float],
    edge_timestamp_lookup,
    cfg: PathScoreConfig = PathScoreConfig(),
    # New params
    node_bridge_scores: Optional[Dict[NodeId, float]] = None,
    edge_affinity_scores: Optional[Dict[Tuple[NodeId, NodeId], float]] = None,
) -> float:
    # Edge confidence term (product of effective confidences)
    edge_confs: List[float] = []
    type_priors: List[float] = []
    ts_list: List[Optional[float]] = []
    affinity_mults: List[float] = []  # New

    priors = cfg.relation_type_prior or {}
    for u, rel, v in path_edges:
        c = conf_provider.confidence(u, rel, v)
        edge_confs.append(max(min(c, 1.0), 1e-12))
        type_priors.append(max(priors.get(rel, 1.0), 1e-6))
        ts_list.append(edge_timestamp_lookup(u, rel, v) if edge_timestamp_lookup else None)

        # Affinity term
        if cfg.use_bridge_scoring and edge_affinity_scores:
            # Check (u,v) or (v,u)
            aff = edge_affinity_scores.get((u, v), edge_affinity_scores.get((v, u), 0.0))
            if aff > 0:
                affinity_mults.append(1.0 + (aff * cfg.affinity_boost))
            else:
                affinity_mults.append(1.0)
        else:
            affinity_mults.append(1.0)

    edge_term = 1.0
    for c in edge_confs:
        edge_term *= c

    # PPR term: power mean across unique nodes on the path
    path_nodes: List[NodeId] = [path_edges[0][0]] + [v for (_, _, v) in path_edges] if path_edges else []
    ppr_vals = [node_ppr.get(n, 1e-12) for n in path_nodes]
    ppr_term = power_mean(ppr_vals, cfg.power_mean_rho)

    # Bridge term: boost nodes that are bridges
    bridge_term = 1.0
    if cfg.use_bridge_scoring and node_bridge_scores:
        # We can average the bridge scores or take max
        # node_bridge_scores should be pre-processed factors (e.g. 1.0 to 2.0)
        b_vals = [node_bridge_scores.get(n, 1.0) for n in path_nodes]
        bridge_term = power_mean(b_vals, cfg.power_mean_rho)

    # Affinity term aggregation
    affinity_term = 1.0
    for a in affinity_mults:
        affinity_term *= a

    # Type priors multiply
    prior_term = 1.0
    for t in type_priors:
        prior_term *= t

    # Recency multiplicative factor (clamped)
    rec_term = recency_factor(ts_list, now_ts, cfg.recency_tau_days, cfg.recency_clamp)

    return float(edge_term * ppr_term * bridge_term * affinity_term * prior_term * rec_term)


def aggregate_evidence_strength(path_scores: List[float], top_k: int = 5) -> float:
    if not path_scores:
        return 0.0
    top = sorted(path_scores, reverse=True)[:top_k]
    prod = 1.0
    for s in top:
        prod *= max(0.0, 1.0 - s)
    return 1.0 - prod


@dataclass
class InsightScoreConfig:
    alpha: float = 0.5
    beta: float = 0.2
    gamma: float = 0.2
    delta: float = 0.1


def insight_score(
    evidence_strength: float,
    community_relevance: float,
    explanation_quality: float,
    business_impact_proxy: float,
    cfg: InsightScoreConfig = InsightScoreConfig(),
) -> float:
    # Normalize inputs into [0,1]
    e = min(max(evidence_strength, 0.0), 1.0)
    c = min(max(community_relevance, 0.0), 1.0)
    x = min(max(explanation_quality, 0.0), 1.0)
    b = min(max(business_impact_proxy, 0.0), 1.0)
    w_sum = max(cfg.alpha + cfg.beta + cfg.gamma + cfg.delta, 1e-9)
    a = cfg.alpha / w_sum
    bb = cfg.beta / w_sum
    g = cfg.gamma / w_sum
    d = cfg.delta / w_sum
    return float(a * e + bb * c + g * x + d * b)


def compute_community_relevance(nodes_in_insight: Iterable[NodeId], node_ppr: Dict[NodeId, float]) -> float:
    return float(sum(node_ppr.get(n, 0.0) for n in set(nodes_in_insight)))


def score_paths_and_insight(
    accessor: GraphAccessor,
    community_id: str,
    seeds: List[NodeId],
    node_ppr_scores: List[Tuple[NodeId, float]],
    candidate_paths: List[List[Tuple[NodeId, RelId, NodeId]]],
    conf_provider: EdgeConfidenceProvider = ConstantConfidence(0.8),
    now_ts: Optional[float] = None,
    edge_timestamp_lookup=None,
    path_cfg: PathScoreConfig = PathScoreConfig(),
    insight_cfg: InsightScoreConfig = InsightScoreConfig(),
    top_k_paths: int = 5,
) -> Dict[str, object]:
    node_ppr = {n: p for n, p in node_ppr_scores}

    # Pre-fetch bridge & affinity data if accessor supports it
    node_bridge_scores = {}
    edge_affinity_scores = {}

    if path_cfg.use_bridge_scoring:
        # Duck typing check for GlobalGraphAccessor capabilities
        has_bridge = hasattr(accessor, "is_bridge")
        has_affinity = hasattr(accessor, "get_affinity") and hasattr(accessor, "get_entity_community")

        if has_bridge or has_affinity:
            unique_nodes = set()
            for p in candidate_paths:
                unique_nodes.add(p[0][0] if p else "")
                for _, _, v in p:
                    unique_nodes.add(v)
            if "" in unique_nodes:
                unique_nodes.remove("")

            node_communities = {}
            for n in unique_nodes:
                if has_bridge:
                    b_data = accessor.is_bridge(n)
                    if b_data:
                        # Normalize strength: 1 + log(1+strength) * scaling
                        strength = b_data.get("bridge_strength", 0)
                        if strength > 0:
                            # e.g., strength 10 -> log(11)~2.4 -> 1.24 multiplier boost (if boost=1.0)
                            node_bridge_scores[n] = 1.0 + (math.log(1 + strength) * 0.1 * path_cfg.bridge_boost)

                if has_affinity:
                    comm = accessor.get_entity_community(n)
                    if comm:
                        node_communities[n] = comm

            if has_affinity:
                for p in candidate_paths:
                    for u, _, v in p:
                        pair = (u, v)
                        if pair not in edge_affinity_scores and (v, u) not in edge_affinity_scores:
                            c_u = node_communities.get(u)
                            c_v = node_communities.get(v)
                            if c_u and c_v and c_u != c_v:
                                aff = accessor.get_affinity(c_u, c_v)
                                edge_affinity_scores[pair] = aff

    path_scores: List[float] = []
    scored_paths: List[Dict[str, object]] = []
    for edges in candidate_paths:
        ps = path_score(
            edges, node_ppr, conf_provider, now_ts, edge_timestamp_lookup, path_cfg,
            node_bridge_scores=node_bridge_scores,
            edge_affinity_scores=edge_affinity_scores
        )
        path_scores.append(ps)
        nodes = [edges[0][0]] + [e[2] for e in edges] if edges else []
        # Decomposition terms for transparency
        ppr_vals = [node_ppr.get(n, 1e-12) for n in nodes]
        priors = path_cfg.relation_type_prior or {}
        pri_list = [max(priors.get(r, 1.0), 1e-6) for (_, r, _) in edges]
        confs = [conf_provider.confidence(u, r, v) for (u, r, v) in edges]
        recs = [edge_timestamp_lookup(u, r, v) if edge_timestamp_lookup else None for (u, r, v) in edges]

        # New decomposition terms
        bridge_vals = [node_bridge_scores.get(n, 1.0) for n in nodes]
        aff_vals = []
        for u, r, v in edges:
            aff = edge_affinity_scores.get((u, v), edge_affinity_scores.get((v, u), 0.0))
            aff_vals.append(aff)

        scored_paths.append({
            "score": ps,
            "nodes": nodes,
            "edges": [{"u": u, "rel": r, "v": v} for (u, r, v) in edges],
            "decomp": {
                "ppr_values": ppr_vals,
                "type_priors": pri_list,
                "edge_confidences": confs,
                "edge_timestamps": recs,
                "bridge_scores": bridge_vals,
                "affinity_scores": aff_vals,
            },
        })
    es = aggregate_evidence_strength(path_scores, top_k=top_k_paths)
    nodes_in_insight = set(n for p in scored_paths for n in p["nodes"])
    comm_rel = compute_community_relevance(nodes_in_insight, node_ppr)
    ins = insight_score(es, comm_rel, explanation_quality=0.5, business_impact_proxy=0.5, cfg=insight_cfg)
    return {
        "paths": sorted(scored_paths, key=lambda x: x["score"], reverse=True),
        "evidence_strength": es,
        "community_relevance": comm_rel,
        "insight_score": ins,
    }
```
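Since both versions ship the same implementation, a short usage sketch may help orient readers. Everything below except the imported names is made-up illustration data (node IDs, PPR masses, the relation prior, and confidence figures), and it assumes the wheel's `retrieval` package is installed:

```python
# Hypothetical data throughout; only the imported names come from the package.
from retrieval.scoring import (
    PathScoreConfig, combine_edge_confidence, path_score,
    aggregate_evidence_strength,
)
from retrieval.confidence import ConstantConfidence

# Logit-pool a raw extractor confidence with an NPLL estimate; the
# weights renormalize over whichever signals are actually present.
c = combine_edge_confidence(raw=0.9, npll=0.7)

# Score a two-hop path. With no timestamp lookup, every entry in ts_list
# is None, so the recency factor stays at its clamped maximum of 1.0.
edges = [("acct:1", "SENT_PAYMENT", "acct:2"),
         ("acct:2", "SHARES_DEVICE", "acct:3")]
node_ppr = {"acct:1": 0.12, "acct:2": 0.05, "acct:3": 0.08}
cfg = PathScoreConfig(relation_type_prior={"SHARES_DEVICE": 1.2})

s = path_score(edges, node_ppr, ConstantConfidence(0.9),
               now_ts=None, edge_timestamp_lookup=None, cfg=cfg)

# Noisy-OR aggregation over the top-k path scores: 1 - prod(1 - s_i).
print(c, s, aggregate_evidence_strength([s], top_k=5))
```

The `node_bridge_scores` and `edge_affinity_scores` keyword arguments default to `None`, so when those signals are unavailable the bridge and affinity terms fall back to neutral multipliers of 1.0.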
retrieval/utils/pii_redaction.py
CHANGED
@@ -1,36 +1,36 @@

As with retrieval/scoring.py, all 36 lines are removed and re-added with identical content. The file in both versions:

```python
import re
from typing import Any, Dict

# Simple regex patterns for common PII
PII_PATTERNS = {
    'EMAIL': re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'),
    'PHONE': re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'),
}

def redact_text(text: str) -> str:
    """Redacts PII from a single string."""
    if not isinstance(text, str):
        return text
    for pii_type, pattern in PII_PATTERNS.items():
        text = pattern.sub(f'[{pii_type}_REDACTED]', text)
    return text

def redact_dict(data: Dict[str, Any]) -> Dict[str, Any]:
    """Recursively redacts PII from string values in a dictionary."""
    if not isinstance(data, dict):
        return data

    clean_dict = {}
    for key, value in data.items():
        if isinstance(value, str):
            clean_dict[key] = redact_text(value)
        elif isinstance(value, dict):
            clean_dict[key] = redact_dict(value)
        elif isinstance(value, list):
            clean_dict[key] = [
                redact_dict(item) if isinstance(item, dict) else (redact_text(item) if isinstance(item, str) else item)
                for item in value
            ]
        else:
            clean_dict[key] = value
    return clean_dict
```
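For reference, a minimal sketch of the two redaction helpers on fabricated inputs (the email address and phone number below are invented for illustration):

```python
from retrieval.utils.pii_redaction import redact_text, redact_dict

print(redact_text("Contact jane.doe@example.com or (555) 123-4567"))
# -> Contact [EMAIL_REDACTED] or [PHONE_REDACTED]

record = {"name": "Jane", "contacts": [{"email": "jane.doe@example.com"}]}
print(redact_dict(record))
# -> {'name': 'Jane', 'contacts': [{'email': '[EMAIL_REDACTED]'}]}
```

Note the patterns are deliberately simple: the phone regex targets ten-digit North American numbers, and values that are not strings, dicts, or lists pass through unchanged.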