odin-engine 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/__init__.py +17 -17
- benchmarks/datasets.py +284 -284
- benchmarks/metrics.py +275 -275
- benchmarks/run_ablation.py +279 -279
- benchmarks/run_npll_benchmark.py +270 -270
- npll/__init__.py +10 -10
- npll/bootstrap.py +474 -474
- npll/core/__init__.py +33 -33
- npll/core/knowledge_graph.py +308 -308
- npll/core/logical_rules.py +496 -496
- npll/core/mln.py +474 -474
- npll/inference/__init__.py +40 -40
- npll/inference/e_step.py +419 -419
- npll/inference/elbo.py +434 -434
- npll/inference/m_step.py +576 -576
- npll/npll_model.py +631 -631
- npll/scoring/__init__.py +42 -42
- npll/scoring/embeddings.py +441 -441
- npll/scoring/probability.py +402 -402
- npll/scoring/scoring_module.py +369 -369
- npll/training/__init__.py +24 -24
- npll/training/evaluation.py +496 -496
- npll/training/npll_trainer.py +520 -520
- npll/utils/__init__.py +47 -47
- npll/utils/batch_utils.py +492 -492
- npll/utils/config.py +144 -144
- npll/utils/math_utils.py +338 -338
- odin/__init__.py +21 -20
- odin/engine.py +264 -264
- odin/schema.py +210 -0
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/METADATA +503 -456
- odin_engine-0.2.0.dist-info/RECORD +63 -0
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/licenses/LICENSE +21 -21
- retrieval/__init__.py +50 -50
- retrieval/adapters.py +140 -140
- retrieval/adapters_arango.py +1418 -1418
- retrieval/aggregators.py +707 -707
- retrieval/beam.py +127 -127
- retrieval/budget.py +60 -60
- retrieval/cache.py +159 -159
- retrieval/confidence.py +88 -88
- retrieval/eval.py +49 -49
- retrieval/linker.py +87 -87
- retrieval/metrics.py +105 -105
- retrieval/metrics_motifs.py +36 -36
- retrieval/orchestrator.py +571 -571
- retrieval/ppr/__init__.py +12 -12
- retrieval/ppr/anchors.py +41 -41
- retrieval/ppr/bippr.py +61 -61
- retrieval/ppr/engines.py +257 -257
- retrieval/ppr/global_pr.py +76 -76
- retrieval/ppr/indexes.py +78 -78
- retrieval/ppr.py +156 -156
- retrieval/ppr_cache.py +25 -25
- retrieval/scoring.py +294 -294
- retrieval/utils/pii_redaction.py +36 -36
- retrieval/writers/__init__.py +9 -9
- retrieval/writers/arango_writer.py +28 -28
- retrieval/writers/base.py +21 -21
- retrieval/writers/janus_writer.py +36 -36
- odin_engine-0.1.0.dist-info/RECORD +0 -62
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/WHEEL +0 -0
- {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/top_level.txt +0 -0
retrieval/adapters_arango.py
CHANGED
|
@@ -1,1418 +1,1418 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
from typing import Iterable, Tuple, Optional, List, Dict, Any, NamedTuple
|
|
3
|
-
|
|
4
|
-
from .adapters import GraphAccessor, NodeId, RelId
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class EdgeView(NamedTuple):
    """Immutable, read-only view of one relationship edge as seen from a
    traversal step, enriched with temporal, confidence, and provenance data."""
    neighbor_id: NodeId            # node on the far side of the edge
    relation: RelId                # relation label (e.g. value of the `relationship` field)
    weight: float  # structural effective weight
    edge_id: str                   # Arango `_id` of the edge document
    valid_from: Optional[str]      # ISO timestamp; None when the edge has no validity window
    valid_to: Optional[str]        # ISO timestamp; None when open-ended
    status: Optional[str]          # edge status field, if the schema defines one
    raw_confidence: Optional[float]    # extractor-reported confidence, if present
    npll_posterior: Optional[float]    # NPLL posterior probability, if computed
    calibration: Optional[float]       # calibration multiplier, if present
    sources: List[str]  # doc/text ids from inline fields & EXTRACTED_FROM
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class ArangoCommunityAccessor(GraphAccessor):
|
|
22
|
-
"""
|
|
23
|
-
Arango-backed GraphAccessor for a single community.
|
|
24
|
-
|
|
25
|
-
Defaults match your schema:
|
|
26
|
-
- nodes: ExtractedEntities
|
|
27
|
-
- edges: ExtractedRelationships (field: relationship, created_at)
|
|
28
|
-
- community via mapping: EntityCommunities(entity_id, community_id)
|
|
29
|
-
- provenance: inline fields + EXTRACTED_FROM (entity -> Documents/TextBlocks)
|
|
30
|
-
|
|
31
|
-
Structural weight only by default:
|
|
32
|
-
w_struct = base_weight * type_prior(relation) * recency_decay
|
|
33
|
-
(Set fuse_edge_confidence=True to multiply raw_confidence * npll_posterior * calibration in-adapter.)
|
|
34
|
-
"""
|
|
35
|
-
|
|
36
|
-
    def __init__(
        self,
        db,
        community_id: str,
        # Collections
        nodes_collection: str = "ExtractedEntities",
        edges_collection: str = "ExtractedRelationships",
        # Core field names
        relation_property: str = "relationship",
        weight_property: str = "weight",
        node_type_property: str = "type",
        # Time fields
        edge_timestamp_property: str = "created_at",
        edge_valid_from_property: Optional[str] = "valid_from",
        edge_valid_to_property: Optional[str] = "valid_to",
        edge_status_property: Optional[str] = "status",
        # Community scoping (mapping mode by default)
        community_mode: str = "mapping",  # "mapping" | "property"
        community_property: str = "community_id",  # only used if community_mode == "property"
        membership_collection: str = "EntityCommunities",
        membership_entity_field: str = "entity_id",
        membership_community_field: str = "community_id",
        # Dynamic constraints
        allowed_relations: Optional[List[str]] = None,
        disallowed_relations: Optional[List[str]] = None,
        allowed_neighbor_types: Optional[List[str]] = None,
        # Time filters
        time_window: Optional[Tuple[str, str]] = None,  # (start_iso, end_iso)
        as_of: Optional[str] = None,  # ISO timestamp for "as of"
        current_only: bool = False,  # respect valid_from/valid_to around as_of
        recency_half_life_days: Optional[float] = 90.0,  # None disables recency decay
        # Priors
        type_priors: Optional[Dict[str, float]] = None,  # e.g., {"assessor": 1.1}
        # Provenance
        edge_provenance_fields: Optional[List[str]] = None,  # defaults: ["source_document_id","source_text_id"]
        provenance_edge_collection: Optional[str] = "EXTRACTED_FROM",
        provenance_target_collections: Optional[List[str]] = None,  # defaults: ["Documents","TextBlocks"]
        # Confidence fusion (usually False; you do NPLL in engine)
        fuse_edge_confidence: bool = False,
        missing_confidence_prior: float = 1.0,
        edge_raw_confidence_property: Optional[str] = "raw_confidence",
        edge_npll_posterior_property: Optional[str] = "npll_posterior",
        edge_calibration_property: Optional[str] = "calibration",
        # Performance
        aql_batch_size: int = 1000,
        aql_stream: bool = True,
        outbound_index_hint: Optional[str] = None,  # e.g. "edges_from_rel_ts"
        inbound_index_hint: Optional[str] = None,  # e.g. "edges_to_rel_ts"
        # Bridge / GNN integration
        bridge_collection: str = "BridgeEntities",
        affinity_collection: str = "CommunityAffinity",
        algorithm: str = "gnn",  # Default to GNN as per pipeline
    ):
        """Capture configuration onto instance attributes.

        No database calls happen here; every keyword maps a schema name or
        tuning knob onto an attribute that is read later when AQL is built.
        `db` is expected to expose `db.aql.execute(...)` (python-arango style).
        """
        self.db = db
        self._cid = community_id
        self.bridge_col = bridge_collection
        self.affinity_col = affinity_collection
        self.algorithm = algorithm
        # Per-instance caches: bridge lookups (value may be None = "known
        # non-bridge") and pairwise community-affinity scores.
        self._bridge_cache: Dict[str, Optional[dict]] = {}
        self._affinity_cache: Dict[str, float] = {}

        self.nodes_col = nodes_collection
        self.edges_col = edges_collection

        self.rel_prop = relation_property
        self.w_prop = weight_property
        self.node_type_prop = node_type_property

        self.ts_prop = edge_timestamp_property
        self.edge_valid_from_prop = edge_valid_from_property
        self.edge_valid_to_prop = edge_valid_to_property
        self.edge_status_prop = edge_status_property

        self.community_mode = community_mode
        self.community_prop = community_property
        self.membership_col = membership_collection
        self.memb_ent_field = membership_entity_field
        self.memb_com_field = membership_community_field

        self.allowed_relations = allowed_relations
        self.disallowed_relations = disallowed_relations
        self.allowed_neighbor_types = allowed_neighbor_types

        self.time_window = time_window
        self.as_of = as_of
        self.current_only = current_only
        self.recency_half_life_days = recency_half_life_days

        self.type_priors = type_priors or {}

        # `or` fallbacks supply the documented defaults when None is passed.
        self.edge_prov_fields = edge_provenance_fields or ["source_document_id", "source_text_id"]
        self.prov_edges_col = provenance_edge_collection
        self.prov_target_cols = provenance_target_collections or ["Documents", "TextBlocks"]

        self.fuse_edge_confidence = fuse_edge_confidence
        self.missing_confidence_prior = missing_confidence_prior
        self.edge_raw_conf_prop = edge_raw_confidence_property
        self.edge_npll_post_prop = edge_npll_posterior_property
        self.edge_calibration_prop = edge_calibration_property

        self.aql_batch_size = aql_batch_size
        self.aql_stream = aql_stream
        self.outbound_index_hint = outbound_index_hint
        self.inbound_index_hint = inbound_index_hint
|
|
140
|
-
|
|
141
|
-
# --------------------------
|
|
142
|
-
# Back-compatible core API
|
|
143
|
-
# --------------------------
|
|
144
|
-
def iter_out(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
|
|
145
|
-
for ev in self._iter_neighbors(node, direction="OUTBOUND", rich=True):
|
|
146
|
-
yield ev.neighbor_id, ev.relation, ev.weight
|
|
147
|
-
|
|
148
|
-
def iter_in(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
|
|
149
|
-
for ev in self._iter_neighbors(node, direction="INBOUND", rich=True):
|
|
150
|
-
yield ev.neighbor_id, ev.relation, ev.weight
|
|
151
|
-
|
|
152
|
-
    def nodes(self, community_id: Optional[str] = None) -> Iterable[NodeId]:
        """
        Return all node IDs in this community.
        - mapping mode: EntityCommunities -> entity_id
        - property mode: filter ExtractedEntities by community_id field (if you add it)
        - none mode: return all nodes
        """
        # Default to the community this accessor was constructed for.
        cid = community_id or self._cid
        if self.community_mode == "property":
            # Collection and field name are interpolated; assumes they are
            # trusted identifiers from configuration, never user input.
            aql = f"""
            FOR v IN {self.nodes_col}
              FILTER v.{self.community_prop} == @cid
              RETURN v._id
            """
            cursor = self.db.aql.execute(
                aql, bind_vars={"cid": cid}, batch_size=self.aql_batch_size, stream=self.aql_stream
            )
        elif self.community_mode == "mapping":
            # @@mcol is an Arango collection bind parameter; m_ent / m_com
            # are dynamic field names resolved with bracket access.
            aql = f"""
            FOR m IN @@mcol
              FILTER m[@m_com] == @cid
              RETURN m[@m_ent]
            """
            cursor = self.db.aql.execute(
                aql,
                bind_vars={
                    "cid": cid,
                    "@mcol": self.membership_col,
                    "m_ent": self.memb_ent_field,
                    "m_com": self.memb_com_field,
                },
                batch_size=self.aql_batch_size,
                stream=self.aql_stream,
            )
        else:  # community_mode == "none"
            aql = f"""
            FOR v IN {self.nodes_col}
              RETURN v._id
            """
            cursor = self.db.aql.execute(
                aql, batch_size=self.aql_batch_size, stream=self.aql_stream
            )
        # Stream results straight through so large communities are not
        # materialized in memory.
        for vid in cursor:
            yield vid
|
|
196
|
-
|
|
197
|
-
    def degree(self, node: NodeId) -> int:
        """Out-degree (fast).

        Counts edges whose `_from` equals *node*; optionally forces a
        persistent index via `outbound_index_hint` to avoid a full scan.
        """
        hint_clause = (
            "OPTIONS { indexHint: @idx, forceIndexHint: true }" if self.outbound_index_hint else ""
        )
        aql = f"""
        RETURN LENGTH(
          FOR e IN {self.edges_col}
            {hint_clause}
            FILTER e._from == @node
            RETURN 1
        )
        """
        bind = {"node": node}
        if self.outbound_index_hint:
            bind["idx"] = self.outbound_index_hint
        cur = self.db.aql.execute(aql, bind_vars=bind)
        # The query yields a single scalar; `or 0` guards a null result.
        return int(list(cur)[0] or 0)
|
|
215
|
-
|
|
216
|
-
# --------------------------
|
|
217
|
-
# Rich neighbor variants
|
|
218
|
-
# --------------------------
|
|
219
|
-
def iter_out_rich(self, node: NodeId) -> Iterable[EdgeView]:
|
|
220
|
-
yield from self._iter_neighbors(node, direction="OUTBOUND", rich=True)
|
|
221
|
-
|
|
222
|
-
def iter_in_rich(self, node: NodeId) -> Iterable[EdgeView]:
|
|
223
|
-
yield from self._iter_neighbors(node, direction="INBOUND", rich=True)
|
|
224
|
-
|
|
225
|
-
# --------------------------
|
|
226
|
-
# Provenance helpers
|
|
227
|
-
# --------------------------
|
|
228
|
-
    def get_edge_provenance(self, edge_id: str) -> List[str]:
        """
        Return provenance targets for a relationship edge:
        - inline fields (source_document_id, source_text_id)
        - EXTRACTED_FROM edges for either endpoint entity
        """
        # Build the provenance edges clause safely (avoid nested f-strings)
        prov_edges_clause = (
            f"""
            FOR p IN {self.prov_edges_col}
              FILTER p._from IN [e._from, e._to]
              RETURN p._to
            """
            if self.prov_edges_col else "[]"
        )

        # inline_candidates interpolates configured field names into the
        # query text; assumes they are simple, trusted identifiers.
        aql = f"""
        LET e = DOCUMENT(@eid)
        LET inline_candidates = [{", ".join([f"e['{f}']" for f in self.edge_prov_fields])}]
        LET inline = (
          FOR x IN inline_candidates
            FILTER x != null
            RETURN x
        )
        LET via_edges = (
          {prov_edges_clause}
        )
        RETURN UNIQUE(APPEND(inline, via_edges))
        """
        cur = self.db.aql.execute(aql, bind_vars={"eid": edge_id}, batch_size=self.aql_batch_size, stream=self.aql_stream)
        out = list(cur)
        # The query returns a single array; empty cursor -> no provenance.
        return out[0] if out else []
|
|
260
|
-
|
|
261
|
-
    def get_node(self, node_id: NodeId, fields: Optional[List[str]] = None) -> Dict[str, Any]:
        """Fetch a node document by `_id`.

        When *fields* is given, only those fields (plus `_id`) are projected;
        field names are interpolated into the AQL text, so they must be
        trusted identifiers. Returns ``{}`` when the document does not exist.
        """
        if fields:
            proj = ", ".join([f"{f}: d.{f}" for f in fields])
            aql = f"LET d = DOCUMENT(@id) RETURN {{ _id: d._id, {proj} }}"
        else:
            aql = "RETURN DOCUMENT(@id)"
        cur = self.db.aql.execute(aql, bind_vars={"id": node_id})
        res = list(cur)
        return res[0] if res else {}
|
|
270
|
-
|
|
271
|
-
# --------------------------
|
|
272
|
-
# Stats / quick analytics
|
|
273
|
-
# --------------------------
|
|
274
|
-
@staticmethod
|
|
275
|
-
def get_top_n_entities_by_degree(
|
|
276
|
-
db,
|
|
277
|
-
edges_collection: str = "ExtractedRelationships",
|
|
278
|
-
limit: Optional[int] = None,
|
|
279
|
-
time_window: Optional[Tuple[str, str]] = None,
|
|
280
|
-
time_property: str = "created_at",
|
|
281
|
-
) -> List[dict]:
|
|
282
|
-
bind: Dict[str, Any] = {}
|
|
283
|
-
where = ""
|
|
284
|
-
if time_window:
|
|
285
|
-
where = "FILTER HAS(e, @ts) AND e[@ts] >= @start_ts AND e[@ts] <= @end_ts"
|
|
286
|
-
bind.update({"ts": time_property, "start_ts": time_window[0], "end_ts": time_window[1]})
|
|
287
|
-
limit_clause = "LIMIT @lim" if limit else ""
|
|
288
|
-
if limit:
|
|
289
|
-
bind["lim"] = limit
|
|
290
|
-
aql = f"""
|
|
291
|
-
FOR e IN {edges_collection}
|
|
292
|
-
{where}
|
|
293
|
-
COLLECT entity = e._from WITH COUNT INTO degree
|
|
294
|
-
SORT degree DESC
|
|
295
|
-
{limit_clause}
|
|
296
|
-
RETURN {{ "entity": entity, "degree": degree }}
|
|
297
|
-
"""
|
|
298
|
-
return list(db.aql.execute(aql, bind_vars=bind))
|
|
299
|
-
|
|
300
|
-
    @staticmethod
    def get_entity_type_counts(
        db,
        nodes_collection: str = "ExtractedEntities",
        type_property: str = "type"
    ) -> List[dict]:
        """Count entities grouped by their type field.

        Returns a list of ``{"type": str, "count": int}`` sorted by count
        descending. Collection and field names are interpolated into the AQL
        text and must be trusted identifiers.
        """
        aql = f"""
        FOR doc IN {nodes_collection}
          COLLECT t = doc.{type_property} WITH COUNT INTO c
          SORT c DESC
          RETURN {{ "type": t, "count": c }}
        """
        return list(db.aql.execute(aql))
|
|
313
|
-
|
|
314
|
-
    @staticmethod
    def get_relationship_type_counts(
        db,
        edges_collection: str = "ExtractedRelationships",
        relation_property: str = "relationship",
        time_window: Optional[Tuple[str, str]] = None,
        time_property: str = "created_at",
    ) -> List[dict]:
        """Count edges grouped by relation type, optionally time-filtered.

        Edges missing *relation_property* are excluded. Returns a list of
        ``{"type": str, "count": int}`` sorted by count descending.
        """
        bind: Dict[str, Any] = {"rel_prop": relation_property}
        # Only edges that actually carry the relation field are counted.
        where = "FILTER HAS(rel, @rel_prop)"
        if time_window:
            where += " AND HAS(rel, @ts) AND rel[@ts] >= @start_ts AND rel[@ts] <= @end_ts"
            bind.update({"ts": time_property, "start_ts": time_window[0], "end_ts": time_window[1]})
        aql = f"""
        FOR rel IN {edges_collection}
          {where}
          COLLECT t = rel[@rel_prop] WITH COUNT INTO c
          SORT c DESC
          RETURN {{ "type": t, "count": c }}
        """
        return list(db.aql.execute(aql, bind_vars=bind))
|
|
335
|
-
|
|
336
|
-
    @staticmethod
    def get_community_summaries(
        db,
        communities_collection: str = "Communities",
        limit: Optional[int] = None,
        skip: int = 0,
        require_summary: bool = True
    ) -> List[dict]:
        """List community summary records, paginated by (skip, limit).

        NOTE(review): when ``require_summary=False`` this returns ONLY
        communities with a missing/empty summary, not all communities —
        confirm this inversion is intended by callers.
        """
        filter_clause = "FILTER c.summary != null AND c.summary != ''" if require_summary else "FILTER c.summary == null OR c.summary == ''"
        # Pagination is only applied when a limit is supplied; `skip` alone
        # has no effect.
        limit_clause = "LIMIT @skip, @limit" if limit is not None else ""
        bind: Dict[str, Any] = {}
        if limit is not None:
            bind.update({"skip": skip, "limit": limit})
        aql = f"""
        FOR c IN {communities_collection}
          {filter_clause}
          SORT c.community_id ASC
          {limit_clause}
          RETURN {{ id: c.community_id, summary: c.summary, size: c.size, level: c.level }}
        """
        return list(db.aql.execute(aql, bind_vars=bind))
|
|
357
|
-
|
|
358
|
-
    @staticmethod
    def get_unique_table_headers(
        db,
        tables_collection: str = "Tables",
        headers_property: str = "headers"
    ) -> List[List[str]]:
        """Return the distinct header rows found across all table documents.

        COLLECT de-duplicates by the full header array, so each unique
        header list appears once.
        """
        aql = f"""
        FOR t IN {tables_collection}
          FILTER HAS(t, @hp)
          COLLECT h = t[@hp]
          RETURN h
        """
        return list(db.aql.execute(aql, bind_vars={"hp": headers_property}))
|
|
371
|
-
|
|
372
|
-
# --------------------------
|
|
373
|
-
# Bridge / GNN Integration Methods (Mirrored from GlobalGraphAccessor)
|
|
374
|
-
# --------------------------
|
|
375
|
-
|
|
376
|
-
    def is_bridge(self, entity_key: str) -> Optional[dict]:
        """
        Check if an entity is a bridge and return its bridge data.
        Uses caching for performance.

        Returns the bridge document, or None when the entity is not a
        bridge (or the bridge collection is unavailable).
        """
        # Strip collection if present to get key
        if "/" in entity_key:
            entity_key = entity_key.split("/")[-1]

        # Cache hit may legitimately be None (known non-bridge).
        if entity_key in self._bridge_cache:
            return self._bridge_cache[entity_key]

        aql = """
        FOR b IN @@bridge_col
          FILTER b.entity_key == @entity_key
          FILTER b.algorithm == @algorithm
          RETURN b
        """
        try:
            result = list(self.db.aql.execute(
                aql,
                bind_vars={
                    "@bridge_col": self.bridge_col,
                    "entity_key": entity_key,
                    "algorithm": self.algorithm,
                }
            ))
            bridge_data = result[0] if result else None
        except Exception:
            # Fallback if collection doesn't exist yet
            bridge_data = None

        # Negative results are cached too, avoiding repeated lookups.
        self._bridge_cache[entity_key] = bridge_data
        return bridge_data
|
|
410
|
-
|
|
411
|
-
    def get_entity_community(self, entity_id: str) -> Optional[str]:
        """Get the community ID for an entity."""
        # For ArangoCommunityAccessor, we might know the community if mode is 'mapping'
        # But we should check the mapping collection to be sure (or if it's a bridge to another community)

        # If we are in 'mapping' mode, we can query membership collection
        if self.community_mode == "mapping":
            aql = f"""
            FOR m IN {self.membership_col}
              FILTER m.{self.memb_ent_field} == @entity_id
              // We don't filter by algorithm here usually, but if needed we can
              RETURN m.{self.memb_com_field}
            """
            try:
                result = list(self.db.aql.execute(aql, bind_vars={"entity_id": entity_id}))
                return result[0] if result else None
            except Exception:
                return None
        # Non-mapping modes have no membership table to consult.
        return None
|
|
430
|
-
|
|
431
|
-
    def get_affinity(self, community_a: str, community_b: str) -> float:
        """
        Get the affinity score between two communities.
        Returns 0.0 if no affinity data exists.
        """
        if not community_a or not community_b:
            return 0.0

        # Order-insensitive cache key: (A,B) and (B,A) share one entry,
        # matching the symmetric OR-filter in the query below.
        cache_key = f"{min(community_a, community_b)}_{max(community_a, community_b)}"

        if cache_key in self._affinity_cache:
            return self._affinity_cache[cache_key]

        aql = """
        FOR a IN @@affinity_col
          FILTER a.algorithm == @algorithm
          FILTER (a.community_a == @comm_a AND a.community_b == @comm_b)
              OR (a.community_a == @comm_b AND a.community_b == @comm_a)
          RETURN a.affinity_score
        """
        try:
            result = list(self.db.aql.execute(
                aql,
                bind_vars={
                    "@affinity_col": self.affinity_col,
                    "algorithm": self.algorithm,
                    "comm_a": community_a,
                    "comm_b": community_b,
                }
            ))
            affinity = result[0] if result else 0.0
        except Exception:
            # Best-effort: missing collection or query error degrades to 0.0.
            affinity = 0.0

        self._affinity_cache[cache_key] = affinity
        return affinity
|
|
467
|
-
|
|
468
|
-
def clear_bridge_cache(self):
|
|
469
|
-
"""Clear bridge/affinity caches."""
|
|
470
|
-
self._bridge_cache.clear()
|
|
471
|
-
self._affinity_cache.clear()
|
|
472
|
-
|
|
473
|
-
# ════════════════════════════════════════════════════════════════
|
|
474
|
-
# DISCOVERY ENTRY POINTS (for autonomous insight discovery)
|
|
475
|
-
# ════════════════════════════════════════════════════════════════
|
|
476
|
-
|
|
477
|
-
    @staticmethod
    def get_top_entities_in_community(
        db,
        community_id: str,
        membership_collection: str = "EntityCommunities",
        membership_entity_field: str = "entity_id",
        membership_community_field: str = "community_id",
        edges_collection: str = "ExtractedRelationships",
        limit: int = 20,
    ) -> List[dict]:
        """
        Get top entities by degree WITHIN a specific community.
        Essential for autonomous discovery - provides high-value seed nodes.

        Returns:
            List of {entity: str, degree: int}
        """
        # Degree here is out-degree only (edges where the entity is `_from`).
        aql = """
        LET community_entities = (
          FOR m IN @@membership
            FILTER m[@m_com] == @cid
            RETURN m[@m_ent]
        )
        FOR e IN @@edges
          FILTER e._from IN community_entities
          COLLECT entity = e._from WITH COUNT INTO degree
          SORT degree DESC
          LIMIT @limit
          RETURN { entity: entity, degree: degree }
        """
        return list(db.aql.execute(aql, bind_vars={
            "@membership": membership_collection,
            "@edges": edges_collection,
            "m_ent": membership_entity_field,
            "m_com": membership_community_field,
            "cid": community_id,
            "limit": limit,
        }))
|
|
515
|
-
|
|
516
|
-
    @staticmethod
    def get_recent_entities(
        db,
        since: str,  # ISO timestamp
        community_id: Optional[str] = None,
        nodes_collection: str = "ExtractedEntities",
        membership_collection: str = "EntityCommunities",
        membership_entity_field: str = "entity_id",
        membership_community_field: str = "community_id",
        created_at_property: str = "created_at",
        updated_at_property: str = "updated_at",
        limit: int = 100,
    ) -> List[dict]:
        """
        Get entities created or updated since a timestamp.
        Critical for daily discovery - "what's new since yesterday?"

        Args:
            since: ISO timestamp (e.g., "2026-01-11T00:00:00Z")
            community_id: Optional community filter

        Returns:
            List of {entity: str, created_at: str, type: str}
        """
        bind: Dict[str, Any] = {
            "since": since,
            "limit": limit,
            "created_prop": created_at_property,
            "updated_prop": updated_at_property,
        }

        community_filter = ""
        if community_id:
            # NOTE(review): this assumes membership rows store full `_id`
            # values in the entity field (compared against e._id) — verify.
            community_filter = """
            LET community_entities = (
              FOR m IN @@membership
                FILTER m[@m_com] == @cid
                RETURN m[@m_ent]
            )
            FILTER e._id IN community_entities
            """
            bind["@membership"] = membership_collection
            bind["m_ent"] = membership_entity_field
            bind["m_com"] = membership_community_field
            bind["cid"] = community_id

        # Sort prefers created_at when present, falling back to updated_at.
        aql = f"""
        FOR e IN {nodes_collection}
          FILTER (HAS(e, @created_prop) AND e[@created_prop] >= @since)
              OR (HAS(e, @updated_prop) AND e[@updated_prop] >= @since)
          {community_filter}
          SORT HAS(e, @created_prop) ? e[@created_prop] : e[@updated_prop] DESC
          LIMIT @limit
          RETURN {{
            entity: e._id,
            created_at: HAS(e, @created_prop) ? e[@created_prop] : null,
            updated_at: HAS(e, @updated_prop) ? e[@updated_prop] : null,
            type: e.type,
            name: e.name
          }}
        """
        return list(db.aql.execute(aql, bind_vars=bind))
|
|
578
|
-
|
|
579
|
-
@staticmethod
|
|
580
|
-
def search_entities(
|
|
581
|
-
db,
|
|
582
|
-
query: str,
|
|
583
|
-
community_id: Optional[str] = None,
|
|
584
|
-
nodes_collection: str = "ExtractedEntities",
|
|
585
|
-
membership_collection: str = "EntityCommunities",
|
|
586
|
-
membership_entity_field: str = "entity_id",
|
|
587
|
-
membership_community_field: str = "community_id",
|
|
588
|
-
search_fields: List[str] = None,
|
|
589
|
-
limit: int = 20,
|
|
590
|
-
) -> List[dict]:
|
|
591
|
-
"""
|
|
592
|
-
Text search for entities matching query.
|
|
593
|
-
Uses LIKE for simple text matching (can be upgraded to ArangoSearch).
|
|
594
|
-
|
|
595
|
-
Args:
|
|
596
|
-
query: Search string
|
|
597
|
-
search_fields: Fields to search in (default: ["name", "description"])
|
|
598
|
-
|
|
599
|
-
Returns:
|
|
600
|
-
List of {entity: str, name: str, type: str, matched_field: str}
|
|
601
|
-
"""
|
|
602
|
-
if search_fields is None:
|
|
603
|
-
search_fields = ["name", "description"]
|
|
604
|
-
|
|
605
|
-
bind: Dict[str, Any] = {
|
|
606
|
-
"query": f"%{query.lower()}%",
|
|
607
|
-
"limit": limit,
|
|
608
|
-
}
|
|
609
|
-
|
|
610
|
-
# Build search conditions
|
|
611
|
-
search_conditions = []
|
|
612
|
-
for field in search_fields:
|
|
613
|
-
search_conditions.append(f"LOWER(e.{field}) LIKE @query")
|
|
614
|
-
search_clause = " OR ".join(search_conditions)
|
|
615
|
-
|
|
616
|
-
community_filter = ""
|
|
617
|
-
if community_id:
|
|
618
|
-
community_filter = """
|
|
619
|
-
LET community_entities = (
|
|
620
|
-
FOR m IN @@membership
|
|
621
|
-
FILTER m[@m_com] == @cid
|
|
622
|
-
RETURN m[@m_ent]
|
|
623
|
-
)
|
|
624
|
-
FILTER e._id IN community_entities
|
|
625
|
-
"""
|
|
626
|
-
bind["@membership"] = membership_collection
|
|
627
|
-
bind["m_ent"] = membership_entity_field
|
|
628
|
-
bind["m_com"] = membership_community_field
|
|
629
|
-
bind["cid"] = community_id
|
|
630
|
-
|
|
631
|
-
aql = f"""
|
|
632
|
-
FOR e IN {nodes_collection}
|
|
633
|
-
FILTER {search_clause}
|
|
634
|
-
{community_filter}
|
|
635
|
-
LIMIT @limit
|
|
636
|
-
RETURN {{
|
|
637
|
-
entity: e._id,
|
|
638
|
-
name: e.name,
|
|
639
|
-
type: e.type,
|
|
640
|
-
description: e.description
|
|
641
|
-
}}
|
|
642
|
-
"""
|
|
643
|
-
return list(db.aql.execute(aql, bind_vars=bind))
|
|
644
|
-
|
|
645
|
-
# ════════════════════════════════════════════════════════════════
|
|
646
|
-
# CONTENT HYDRATION (for agent reasoning)
|
|
647
|
-
# ════════════════════════════════════════════════════════════════
|
|
648
|
-
|
|
649
|
-
    @staticmethod
    def get_document_content(
        db,
        doc_id: str,
        text_collection: str = "TextBlocks",
        table_collection: str = "Tables",
        image_collection: str = "Images",
        document_collection: str = "Documents",
    ) -> Optional[dict]:
        """
        Fetch content from any document collection by ID.
        Essential for agent reasoning - converts graph IDs to actual content.

        Args:
            doc_id: Document ID in format "CollectionName/key"

        Returns:
            Dict with type-specific content, or None if not found
            (also None for malformed IDs or unrecognized collections).
        """
        try:
            # "Collection/key" — ValueError means the ID has no slash.
            collection, key = doc_id.split("/", 1)
        except ValueError:
            return None

        # Dispatch on the collection prefix; each branch projects a
        # type-tagged subset of fields appropriate to that content kind.
        if collection == text_collection:
            aql = f"""
            FOR tb IN {text_collection}
              FILTER tb._id == @doc_id
              RETURN {{
                type: "text",
                text: tb.text,
                document_id: tb.document_id,
                page: tb.page,
                char_span: tb.char_span,
                metadata: tb.metadata
              }}
            """
        elif collection == table_collection:
            aql = f"""
            FOR t IN {table_collection}
              FILTER t._id == @doc_id
              RETURN {{
                type: "table",
                headers: t.headers,
                rows: t.rows,
                caption: t.caption,
                document_id: t.document_id,
                page: t.page,
                metadata: t.metadata
              }}
            """
        elif collection == image_collection:
            aql = f"""
            FOR img IN {image_collection}
              FILTER img._id == @doc_id
              RETURN {{
                type: "image",
                caption: img.caption,
                ocr_text: img.ocr_text,
                url: img.storage_url,
                document_id: img.document_id,
                page: img.page,
                metadata: img.metadata
              }}
            """
        elif collection == document_collection:
            aql = f"""
            FOR d IN {document_collection}
              FILTER d._id == @doc_id
              RETURN {{
                type: "document",
                filename: d.filename,
                content: d.content,
                metadata: d.metadata
              }}
            """
        else:
            return None

        result = list(db.aql.execute(aql, bind_vars={"doc_id": doc_id}))
        return result[0] if result else None
|
|
730
|
-
|
|
731
|
-
@staticmethod
def get_entity_sources(
    db,
    entity_id: str,
    extracted_from_collection: str = "EXTRACTED_FROM",
    max_sources: int = 10,
) -> List[dict]:
    """
    Get all source documents/blocks for an entity via EXTRACTED_FROM edges.

    Critical for evidence gathering - shows WHERE an entity was mentioned.

    Args:
        db: python-arango database handle (exposes ``db.aql.execute``).
        entity_id: Entity ID (e.g., "ExtractedEntities/ent_123")
        extracted_from_collection: Edge collection linking entities to their sources.
        max_sources: Limit number of sources returned

    Returns:
        List of {source_id, source_type, content, char_span, confidence, metadata}
    """
    # Fixes vs the previous revision:
    # - AQL has no SQL-style CASE/WHEN/END construct; the old query failed to
    #   parse server-side. Chained ternaries (right-associative) are the AQL
    #   equivalent.
    # - The edge collection is now supplied as a @@collection bind parameter
    #   instead of f-string interpolation, consistent with the other query
    #   helpers in this module (e.g. is_bridge, get_entity_community).
    aql = """
    FOR edge IN @@col
        FILTER edge._from == @entity_id
        LIMIT @max_sources
        LET source = DOCUMENT(edge._to)
        LET collection = PARSE_IDENTIFIER(edge._to).collection
        RETURN {
            source_id: edge._to,
            source_type: collection,
            char_span: edge.char_span,
            extraction_confidence: edge.extraction_confidence,
            content: (
                collection == "TextBlocks" ? source.text :
                collection == "Tables" ? { headers: source.headers, rows: source.rows } :
                collection == "Images" ? { caption: source.caption, ocr_text: source.ocr_text } :
                collection == "Documents" ? SUBSTRING(source.content, 0, 500) :
                null
            ),
            metadata: {
                page: source.page,
                document_id: source.document_id,
                filename: source.filename
            }
        }
    """
    return list(db.aql.execute(aql, bind_vars={
        "@col": extracted_from_collection,
        "entity_id": entity_id,
        "max_sources": max_sources,
    }))
|
|
778
|
-
|
|
779
|
-
@staticmethod
def search_content(
    db,
    query: str,
    community_id: Optional[str] = None,
    content_types: List[str] = None,
    text_collection: str = "TextBlocks",
    table_collection: str = "Tables",
    image_collection: str = "Images",
    membership_collection: str = "EntityCommunities",
    extracted_from_collection: str = "EXTRACTED_FROM",
    limit: int = 10,
) -> List[dict]:
    """
    Semantic/text search across content collections.
    Uses simple LIKE matching (can be upgraded to ArangoSearch/vectors).

    Args:
        query: Search string
        content_types: Collections to search (default: ["TextBlocks", "Tables", "Images"])
        community_id: Optional filter to content linked to community entities

    Returns:
        List of {source_id, source_type, content, score, metadata}
    """
    if content_types is None:
        content_types = [text_collection, table_collection, image_collection]

    bind_vars: Dict[str, Any] = {"query": f"%{query.lower()}%", "limit": limit}

    # Assemble one AQL statement per requested collection (fixed order:
    # text, then tables, then images), then run them all in a single loop.
    statements: List[str] = []

    if text_collection in content_types:
        statements.append(f"""
        FOR tb IN {text_collection}
            FILTER LOWER(tb.text) LIKE @query
            LIMIT @limit
            RETURN {{
                source_id: tb._id,
                source_type: "TextBlocks",
                content: tb.text,
                score: 1.0,
                metadata: {{
                    document_id: tb.document_id,
                    page: tb.page
                }}
            }}
        """)

    if table_collection in content_types:
        statements.append(f"""
        FOR t IN {table_collection}
            FILTER LOWER(t.caption) LIKE @query
            LIMIT @limit
            RETURN {{
                source_id: t._id,
                source_type: "Tables",
                content: {{ headers: t.headers, rows: t.rows, caption: t.caption }},
                score: 1.0,
                metadata: {{
                    document_id: t.document_id,
                    page: t.page
                }}
            }}
        """)

    if image_collection in content_types:
        statements.append(f"""
        FOR img IN {image_collection}
            FILTER LOWER(img.ocr_text) LIKE @query OR LOWER(img.caption) LIKE @query
            LIMIT @limit
            RETURN {{
                source_id: img._id,
                source_type: "Images",
                content: {{ caption: img.caption, ocr_text: img.ocr_text }},
                score: 1.0,
                metadata: {{
                    document_id: img.document_id,
                    page: img.page
                }}
            }}
        """)

    hits: List[dict] = []
    for aql in statements:
        hits.extend(db.aql.execute(aql, bind_vars=bind_vars))

    # Each sub-query is individually LIMITed; cap the merged result as well.
    return hits[:limit]
|
|
872
|
-
|
|
873
|
-
# --------------------------
|
|
874
|
-
# Internal neighbor routine
|
|
875
|
-
# --------------------------
|
|
876
|
-
def _iter_neighbors(self, node: NodeId, *, direction: str, rich: bool) -> Iterable[EdgeView]:
    """
    Yield 1-hop neighbors of `node` via a single dynamically-built AQL traversal.

    The query is assembled from the accessor's configuration: community
    scoping (property or mapping mode), relation/neighbor-type filters,
    edge time-window and as-of validity filters, recency decay, optional
    confidence fusion, and provenance collection from inline fields plus
    the EXTRACTED_FROM edge collection.

    Args:
        node: Source vertex `_id`.
        direction: "OUTBOUND" or "INBOUND" (asserted below).
        rich: If True, yield EdgeView tuples; otherwise yield
            (neighbor_id, relation, effective_weight) triples.
    """
    assert direction in ("OUTBOUND", "INBOUND")

    bind: Dict[str, Any] = {
        "node": node,
        "rel_prop": self.rel_prop,
        "w_prop": self.w_prop,
        "priors_map": self.type_priors,
    }

    # Only add community ID if we're filtering by it
    if self.community_mode != "none":
        bind["cid"] = self._cid

    # Bind parameters are added only when referenced to avoid AQL 1552 errors
    # (ArangoDB rejects queries that declare unused bind parameters).

    hint = ""
    if direction == "OUTBOUND" and self.outbound_index_hint:
        hint = "OPTIONS { indexHint: @idx, forceIndexHint: true }"
        bind["idx"] = self.outbound_index_hint
    elif direction == "INBOUND" and self.inbound_index_hint:
        hint = "OPTIONS { indexHint: @idx, forceIndexHint: true }"
        bind["idx"] = self.inbound_index_hint

    filters: List[str] = []

    # Community filter
    if self.community_mode == "property":
        # Membership stored directly on the vertex document.
        filters.append(f"v.{self.community_prop} == @cid")
    elif self.community_mode == "mapping":
        # Membership resolved through a separate mapping collection; FIRST(...)
        # yields 1 when a matching row exists and null otherwise, which the
        # FILTER treats as truthy/falsy.
        bind.update({"@mcol": self.membership_col, "m_ent": self.memb_ent_field, "m_com": self.memb_com_field})
        filters.append("""
            FIRST(
                FOR m IN @@mcol
                    FILTER m[@m_com] == @cid AND m[@m_ent] == v._id
                    LIMIT 1
                    RETURN 1
            )
        """)
    # else community_mode == "none" - no filtering

    # Relation / neighbor type filters
    if self.allowed_relations:
        bind["allowed_relations"] = self.allowed_relations
        filters.append("e[@rel_prop] IN @allowed_relations")
    if self.disallowed_relations:
        bind["disallowed_relations"] = self.disallowed_relations
        filters.append("!(e[@rel_prop] IN @disallowed_relations)")
    if self.allowed_neighbor_types:
        bind["allowed_neighbor_types"] = self.allowed_neighbor_types
        filters.append(f"v.{self.node_type_prop} IN @allowed_neighbor_types")

    # Time window filter on edge timestamp
    if self.time_window and self.ts_prop:
        bind["start_ts"], bind["end_ts"] = self.time_window
        bind["ts_prop"] = self.ts_prop
        filters.append("HAS(e, @ts_prop) AND e[@ts_prop] >= @start_ts AND e[@ts_prop] <= @end_ts")

    # Current-only validity wrt as_of
    # NOTE: the valid_from/valid_to property names are interpolated as quoted
    # literals (not bind vars) because they appear inside string accessors.
    if self.current_only and self.as_of:
        bind["as_of"] = self.as_of
        vf_prop = self.edge_valid_from_prop or "valid_from"
        vt_prop = self.edge_valid_to_prop or "valid_to"
        filters.append(
            f"( (HAS(e, '{vf_prop}') ? e['{vf_prop}'] <= @as_of : true) "
            f"AND (HAS(e, '{vt_prop}') ? (e['{vt_prop}'] == null OR e['{vt_prop}'] >= @as_of) : true) )"
        )

    # Optional status guard
    status_guard = ""
    if self.edge_status_prop:
        status_guard = "LET _status = e[@status_prop]"
        bind["status_prop"] = self.edge_status_prop

    # Recency decay: 2^(- age_days / half_life)
    recency_clause = "1.0"
    if self.recency_half_life_days is not None and self.as_of and self.ts_prop:
        bind["half_life"] = float(self.recency_half_life_days)
        bind["as_of"] = self.as_of
        bind["ts_prop"] = self.ts_prop
        recency_clause = "POW(2, -1 * DATE_DIFF(@as_of, e[@ts_prop], 'days') / @half_life)"

    # Base weight
    weight_clause = "(HAS(e, @w_prop) && IS_NUMBER(e[@w_prop]) ? e[@w_prop] : 1.0)"

    # Confidence fusion (usually disabled; you do it in engine)
    conf_clause = "1.0"
    if self.fuse_edge_confidence:
        bind.update({
            "raw_c": self.edge_raw_conf_prop,
            "npll": self.edge_npll_post_prop,
            "calib": self.edge_calibration_prop,
            "miss_prior": float(self.missing_confidence_prior),
        })
        conf_clause = (
            "( (HAS(e, @raw_c) && IS_NUMBER(e[@raw_c]) ? e[@raw_c] : @miss_prior) * "
            " (HAS(e, @npll) && IS_NUMBER(e[@npll]) ? e[@npll] : @miss_prior) * "
            " (HAS(e, @calib) && IS_NUMBER(e[@calib]) ? e[@calib] : @miss_prior) )"
        )

    filters_str = " && ".join(filters) if filters else "true"

    # Build the src edges clause safely
    src_edges_clause = (
        f"""
        FOR p IN {self.prov_edges_col}
            FILTER p._from IN [e._from, e._to]
            RETURN p._to
        """
        if self.prov_edges_col else "[]"
    )

    aql = f"""
    LET priors = @priors_map
    FOR v, e IN 1..1 {direction} @node {self.edges_col}
        {hint}
        FILTER {filters_str}
        {status_guard}
        LET _rel = e[@rel_prop]
        LET _prior = TO_NUMBER(NOT_NULL(priors[_rel], 1.0))
        LET _base_w = {weight_clause}
        LET _rec = {recency_clause}
        LET _conf = {conf_clause}
        LET _w_eff = TO_NUMBER(_base_w) * TO_NUMBER(_prior) * TO_NUMBER(_rec) * TO_NUMBER(_conf)

        LET _vf = {f"e['{self.edge_valid_from_prop}']" if self.edge_valid_from_prop else 'null'}
        LET _vt = {f"e['{self.edge_valid_to_prop}']" if self.edge_valid_to_prop else 'null'}
        LET _status2 = {f"e['{self.edge_status_prop}']" if self.edge_status_prop else 'null'}

        // Provenance: inline fields + EXTRACTED_FROM for both endpoints
        LET _src_inline_candidates = [{", ".join([f"e['{f}']" for f in self.edge_prov_fields])}]
        LET _src_inline = (
            FOR x IN _src_inline_candidates
                FILTER x != null
                RETURN x
        )
        LET _src_edges = (
            {src_edges_clause}
        )
        LET _sources = UNIQUE(APPEND(_src_inline, _src_edges))

        RETURN {{
            v_id: v._id,
            rel: _rel,
            weight: _w_eff,
            edge_id: e._id,
            valid_from: _vf,
            valid_to: _vt,
            status: _status2,
            raw_confidence: {f"e['{self.edge_raw_conf_prop}']" if self.edge_raw_conf_prop else 'null'},
            npll_posterior: {f"e['{self.edge_npll_post_prop}']" if self.edge_npll_post_prop else 'null'},
            calibration: {f"e['{self.edge_calibration_prop}']" if self.edge_calibration_prop else 'null'},
            sources: _sources
        }}
    """

    cursor = self.db.aql.execute(
        aql,
        bind_vars=bind,
        batch_size=self.aql_batch_size or 1000,
        stream=self.aql_stream if self.aql_stream is not None else True,
        ttl=120,  # 2 minute timeout for long queries
        optimizer_rules=["+use-indexes"]  # Force index usage
    )
    for d in cursor:
        if rich:
            yield EdgeView(
                neighbor_id=d["v_id"],
                relation=d["rel"],
                weight=float(d["weight"]),
                edge_id=d["edge_id"],
                valid_from=d.get("valid_from"),
                valid_to=d.get("valid_to"),
                status=d.get("status"),
                raw_confidence=d.get("raw_confidence"),
                npll_posterior=d.get("npll_posterior"),
                calibration=d.get("calibration"),
                sources=d.get("sources") or [],
            )
        else:
            yield d["v_id"], d["rel"], float(d["weight"])
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
class GlobalGraphAccessor(GraphAccessor):
    """
    Cross-community graph accessor using pre-computed bridge entities.

    This accessor enables intelligent traversal across community boundaries
    by leveraging the BridgeEntities and CommunityAffinity collections
    created during community detection.

    Key features:
    - Uses bridge entities to efficiently cross community boundaries
    - Scores cross-community paths using affinity scores
    - Mission-aware: can weight community crossings based on context
    - Maintains all ArangoCommunityAccessor features
    """

    def __init__(
        self,
        db,
        algorithm: str = "leiden",
        # Base accessor settings
        nodes_collection: str = "ExtractedEntities",
        edges_collection: str = "ExtractedRelationships",
        relation_property: str = "relationship",
        weight_property: str = "weight",
        # Bridge collections
        bridge_collection: str = "BridgeEntities",
        affinity_collection: str = "CommunityAffinity",
        membership_collection: str = "EntityCommunities",
        # Cross-community scoring
        cross_community_bonus: float = 1.5,  # Boost for cross-community edges (often valuable)
        min_affinity_threshold: float = 0.0,  # Minimum affinity to allow crossing
        # Performance
        aql_batch_size: int = 1000,
        aql_stream: bool = True,
    ):
        self.db = db
        self.algorithm = algorithm

        self.nodes_col = nodes_collection
        self.edges_col = edges_collection
        self.rel_prop = relation_property
        self.w_prop = weight_property

        self.bridge_col = bridge_collection
        self.affinity_col = affinity_collection
        self.membership_col = membership_collection

        self.cross_community_bonus = cross_community_bonus
        self.min_affinity_threshold = min_affinity_threshold

        self.aql_batch_size = aql_batch_size
        self.aql_stream = aql_stream

        # Caches for bridge status, pairwise affinities, and entity->community
        # membership. The membership cache avoids an N+1 AQL round-trip per
        # neighbor during _iter_neighbors_global.
        self._bridge_cache: Dict[str, Optional[dict]] = {}
        self._affinity_cache: Dict[str, float] = {}
        self._community_cache: Dict[str, Optional[str]] = {}

    # --------------------------
    # Core traversal API
    # --------------------------

    def iter_out(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
        """Iterate outbound edges, scoring cross-community edges appropriately."""
        for ev in self._iter_neighbors_global(node, direction="OUTBOUND"):
            yield ev.neighbor_id, ev.relation, ev.weight

    def iter_in(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
        """Iterate inbound edges, scoring cross-community edges appropriately."""
        for ev in self._iter_neighbors_global(node, direction="INBOUND"):
            yield ev.neighbor_id, ev.relation, ev.weight

    def iter_out_rich(self, node: NodeId) -> Iterable[EdgeView]:
        """Rich outbound edges with cross-community metadata."""
        yield from self._iter_neighbors_global(node, direction="OUTBOUND")

    def iter_in_rich(self, node: NodeId) -> Iterable[EdgeView]:
        """Rich inbound edges with cross-community metadata."""
        yield from self._iter_neighbors_global(node, direction="INBOUND")

    def nodes(self) -> Iterable[NodeId]:
        """Return all nodes (no community restriction)."""
        aql = f"FOR v IN {self.nodes_col} RETURN v._id"
        cursor = self.db.aql.execute(aql, batch_size=self.aql_batch_size, stream=self.aql_stream)
        for vid in cursor:
            yield vid

    def degree(self, node: NodeId) -> int:
        """Out-degree of a node."""
        aql = f"""
        RETURN LENGTH(
            FOR e IN {self.edges_col}
                FILTER e._from == @node
                RETURN 1
        )
        """
        cur = self.db.aql.execute(aql, bind_vars={"node": node})
        return int(list(cur)[0] or 0)

    # --------------------------
    # Bridge-aware methods
    # --------------------------

    def is_bridge(self, entity_key: str) -> Optional[dict]:
        """
        Check if an entity is a bridge and return its bridge data.
        Uses caching for performance.
        """
        if entity_key in self._bridge_cache:
            return self._bridge_cache[entity_key]

        aql = """
        FOR b IN @@bridge_col
            FILTER b.entity_key == @entity_key
            FILTER b.algorithm == @algorithm
            RETURN b
        """
        result = list(self.db.aql.execute(
            aql,
            bind_vars={
                "@bridge_col": self.bridge_col,
                "entity_key": entity_key,
                "algorithm": self.algorithm,
            }
        ))

        bridge_data = result[0] if result else None
        self._bridge_cache[entity_key] = bridge_data
        return bridge_data

    def get_entity_community(self, entity_id: str) -> Optional[str]:
        """
        Get the community ID for an entity.

        Cached: _iter_neighbors_global calls this once per traversed neighbor,
        so without a cache every hop costs one extra AQL query per edge.
        """
        if entity_id in self._community_cache:
            return self._community_cache[entity_id]

        aql = """
        FOR m IN @@membership_col
            FILTER m.entity_id == @entity_id
            FILTER m.algorithm == @algorithm
            RETURN m.community_id
        """
        result = list(self.db.aql.execute(
            aql,
            bind_vars={
                "@membership_col": self.membership_col,
                "entity_id": entity_id,
                "algorithm": self.algorithm,
            }
        ))
        community = result[0] if result else None
        self._community_cache[entity_id] = community
        return community

    def get_affinity(self, community_a: str, community_b: str) -> float:
        """
        Get the affinity score between two communities.
        Returns 0.0 if no affinity data exists.
        """
        # Order-insensitive cache key: affinity(a, b) == affinity(b, a).
        cache_key = f"{min(community_a, community_b)}_{max(community_a, community_b)}"

        if cache_key in self._affinity_cache:
            return self._affinity_cache[cache_key]

        aql = """
        FOR a IN @@affinity_col
            FILTER a.algorithm == @algorithm
            FILTER (a.community_a == @comm_a AND a.community_b == @comm_b)
                OR (a.community_a == @comm_b AND a.community_b == @comm_a)
            RETURN a.affinity_score
        """
        result = list(self.db.aql.execute(
            aql,
            bind_vars={
                "@affinity_col": self.affinity_col,
                "algorithm": self.algorithm,
                "comm_a": community_a,
                "comm_b": community_b,
            }
        ))

        # Guard against documents with a null/missing affinity_score: the old
        # code could cache and return None, which later crashed the
        # `affinity >= threshold` comparison in _iter_neighbors_global.
        affinity = float(result[0]) if result and result[0] is not None else 0.0
        self._affinity_cache[cache_key] = affinity
        return affinity

    def get_bridges_from_community(self, community_id: str, min_strength: int = 1) -> List[dict]:
        """Get all bridge entities from a specific community."""
        aql = """
        FOR b IN @@bridge_col
            FILTER b.algorithm == @algorithm
            FILTER b.home_community == @community_id
            FILTER b.bridge_strength >= @min_strength
            SORT b.bridge_strength DESC
            RETURN b
        """
        return list(self.db.aql.execute(
            aql,
            bind_vars={
                "@bridge_col": self.bridge_col,
                "algorithm": self.algorithm,
                "community_id": community_id,
                "min_strength": min_strength,
            }
        ))

    def get_top_bridges(self, limit: int = 20) -> List[dict]:
        """Get the top bridge entities by bridge strength."""
        aql = """
        FOR b IN @@bridge_col
            FILTER b.algorithm == @algorithm
            SORT b.bridge_strength DESC
            LIMIT @limit
            RETURN b
        """
        return list(self.db.aql.execute(
            aql,
            bind_vars={
                "@bridge_col": self.bridge_col,
                "algorithm": self.algorithm,
                "limit": limit,
            }
        ))

    def get_strongest_affinities(self, limit: int = 20) -> List[dict]:
        """Get the strongest inter-community affinities."""
        aql = """
        FOR a IN @@affinity_col
            FILTER a.algorithm == @algorithm
            SORT a.affinity_score DESC
            LIMIT @limit
            RETURN a
        """
        return list(self.db.aql.execute(
            aql,
            bind_vars={
                "@affinity_col": self.affinity_col,
                "algorithm": self.algorithm,
                "limit": limit,
            }
        ))

    # --------------------------
    # Cross-community traversal
    # --------------------------

    def _iter_neighbors_global(self, node: NodeId, direction: str) -> Iterable[EdgeView]:
        """
        Iterate neighbors with cross-community awareness.

        - Gets all neighbors (no community restriction)
        - Detects cross-community edges
        - Applies bonus/penalty based on affinity
        """
        assert direction in ("OUTBOUND", "INBOUND")

        # Get source node's community
        source_community = self.get_entity_community(node)

        # Get all neighbors
        aql = f"""
        FOR v, e IN 1..1 {direction} @node {self.edges_col}
            LET rel = e[@rel_prop]
            LET base_weight = HAS(e, @w_prop) && IS_NUMBER(e[@w_prop]) ? e[@w_prop] : 1.0
            RETURN {{
                v_id: v._id,
                v_key: v._key,
                rel: rel,
                base_weight: base_weight,
                edge_id: e._id,
                sources: []
            }}
        """

        cursor = self.db.aql.execute(
            aql,
            bind_vars={
                "node": node,
                "rel_prop": self.rel_prop,
                "w_prop": self.w_prop,
            },
            batch_size=self.aql_batch_size,
            stream=self.aql_stream,
        )

        for d in cursor:
            neighbor_id = d["v_id"]
            base_weight = float(d["base_weight"])

            # Check if this is a cross-community edge (membership lookups are
            # served from _community_cache after the first hit per entity).
            neighbor_community = self.get_entity_community(neighbor_id)

            weight = base_weight
            is_cross_community = False

            if source_community and neighbor_community and source_community != neighbor_community:
                is_cross_community = True

                # Get affinity between communities
                affinity = self.get_affinity(source_community, neighbor_community)

                # Apply cross-community scoring
                if affinity >= self.min_affinity_threshold:
                    # Bonus for crossing to well-connected communities
                    weight = base_weight * self.cross_community_bonus * (1 + affinity)
                else:
                    # Penalty for crossing to poorly-connected communities
                    weight = base_weight * 0.5

            yield EdgeView(
                neighbor_id=neighbor_id,
                relation=d["rel"],
                weight=weight,
                edge_id=d["edge_id"],
                valid_from=None,
                valid_to=None,
                status="cross_community" if is_cross_community else "same_community",
                raw_confidence=None,
                npll_posterior=None,
                calibration=None,
                sources=d.get("sources") or [],
            )

    # --------------------------
    # Mission-aware scoring
    # --------------------------

    def score_community_crossing(
        self,
        from_community: str,
        to_community: str,
        mission: Optional[str] = None
    ) -> float:
        """
        Score a community crossing based on mission context.

        Args:
            from_community: Source community
            to_community: Target community
            mission: Optional mission context (e.g., "fraud_detection", "patient_care")

        Returns:
            Score multiplier for the crossing (>1 = valuable, <1 = not valuable)
        """
        base_affinity = self.get_affinity(from_community, to_community)

        if not mission:
            return 1.0 + base_affinity

        # Mission-specific scoring (customize based on your domain)
        mission_lower = mission.lower()

        # Example: fraud detection values Claims -> Clinical crossings
        if "fraud" in mission_lower:
            # This would need actual community type detection
            # For now, just boost high-affinity crossings
            return (1.0 + base_affinity) * 1.5

        # Example: patient care values Clinical -> Lab crossings
        if "patient" in mission_lower or "clinical" in mission_lower:
            return (1.0 + base_affinity) * 1.3

        return 1.0 + base_affinity

    def clear_cache(self):
        """Clear internal caches (useful after data updates)."""
        self._bridge_cache.clear()
        self._affinity_cache.clear()
        self._community_cache.clear()
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Iterable, Tuple, Optional, List, Dict, Any, NamedTuple
|
|
3
|
+
|
|
4
|
+
from .adapters import GraphAccessor, NodeId, RelId
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class EdgeView(NamedTuple):
    """A single traversed edge as yielded by the rich neighbor iterators."""

    neighbor_id: NodeId
    relation: RelId
    weight: float  # structural effective weight
    edge_id: str
    # Bitemporal validity window (ISO strings) when the edge carries one.
    valid_from: Optional[str]
    valid_to: Optional[str]
    status: Optional[str]
    # Confidence components; None when the edge lacks the property.
    raw_confidence: Optional[float]
    npll_posterior: Optional[float]
    calibration: Optional[float]
    sources: List[str]  # doc/text ids from inline fields & EXTRACTED_FROM
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ArangoCommunityAccessor(GraphAccessor):
|
|
22
|
+
"""
|
|
23
|
+
Arango-backed GraphAccessor for a single community.
|
|
24
|
+
|
|
25
|
+
Defaults match your schema:
|
|
26
|
+
- nodes: ExtractedEntities
|
|
27
|
+
- edges: ExtractedRelationships (field: relationship, created_at)
|
|
28
|
+
- community via mapping: EntityCommunities(entity_id, community_id)
|
|
29
|
+
- provenance: inline fields + EXTRACTED_FROM (entity -> Documents/TextBlocks)
|
|
30
|
+
|
|
31
|
+
Structural weight only by default:
|
|
32
|
+
w_struct = base_weight * type_prior(relation) * recency_decay
|
|
33
|
+
(Set fuse_edge_confidence=True to multiply raw_confidence * npll_posterior * calibration in-adapter.)
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
def __init__(
    self,
    db,
    community_id: str,
    # Collections
    nodes_collection: str = "ExtractedEntities",
    edges_collection: str = "ExtractedRelationships",
    # Core field names
    relation_property: str = "relationship",
    weight_property: str = "weight",
    node_type_property: str = "type",
    # Time fields
    edge_timestamp_property: str = "created_at",
    edge_valid_from_property: Optional[str] = "valid_from",
    edge_valid_to_property: Optional[str] = "valid_to",
    edge_status_property: Optional[str] = "status",
    # Community scoping (mapping mode by default)
    community_mode: str = "mapping",  # "mapping" | "property"
    community_property: str = "community_id",  # only used if community_mode == "property"
    membership_collection: str = "EntityCommunities",
    membership_entity_field: str = "entity_id",
    membership_community_field: str = "community_id",
    # Dynamic constraints
    allowed_relations: Optional[List[str]] = None,
    disallowed_relations: Optional[List[str]] = None,
    allowed_neighbor_types: Optional[List[str]] = None,
    # Time filters
    time_window: Optional[Tuple[str, str]] = None,  # (start_iso, end_iso)
    as_of: Optional[str] = None,  # ISO timestamp for "as of"
    current_only: bool = False,  # respect valid_from/valid_to around as_of
    recency_half_life_days: Optional[float] = 90.0,  # None disables recency decay
    # Priors
    type_priors: Optional[Dict[str, float]] = None,  # e.g., {"assessor": 1.1}
    # Provenance
    edge_provenance_fields: Optional[List[str]] = None,  # defaults: ["source_document_id","source_text_id"]
    provenance_edge_collection: Optional[str] = "EXTRACTED_FROM",
    provenance_target_collections: Optional[List[str]] = None,  # defaults: ["Documents","TextBlocks"]
    # Confidence fusion (usually False; you do NPLL in engine)
    fuse_edge_confidence: bool = False,
    missing_confidence_prior: float = 1.0,
    edge_raw_confidence_property: Optional[str] = "raw_confidence",
    edge_npll_posterior_property: Optional[str] = "npll_posterior",
    edge_calibration_property: Optional[str] = "calibration",
    # Performance
    aql_batch_size: int = 1000,
    aql_stream: bool = True,
    outbound_index_hint: Optional[str] = None,  # e.g. "edges_from_rel_ts"
    inbound_index_hint: Optional[str] = None,  # e.g. "edges_to_rel_ts"
    # Bridge / GNN integration
    bridge_collection: str = "BridgeEntities",
    affinity_collection: str = "CommunityAffinity",
    algorithm: str = "gnn",  # Default to GNN as per pipeline
):
    """
    Configure a community-scoped Arango accessor.

    Only stores configuration - no database calls are made here. The long
    parameter list mirrors the schema knobs documented on the class: which
    collections/fields to read, how to scope membership (mapping vs
    property mode), which relation/type/time filters to apply at traversal
    time, recency/prior/confidence weighting, provenance sources, and
    AQL cursor/index performance hints.
    """
    self.db = db
    self._cid = community_id
    self.bridge_col = bridge_collection
    self.affinity_col = affinity_collection
    self.algorithm = algorithm
    # Lazy caches for bridge lookups and community affinities.
    self._bridge_cache: Dict[str, Optional[dict]] = {}
    self._affinity_cache: Dict[str, float] = {}

    self.nodes_col = nodes_collection
    self.edges_col = edges_collection

    self.rel_prop = relation_property
    self.w_prop = weight_property
    self.node_type_prop = node_type_property

    self.ts_prop = edge_timestamp_property
    self.edge_valid_from_prop = edge_valid_from_property
    self.edge_valid_to_prop = edge_valid_to_property
    self.edge_status_prop = edge_status_property

    self.community_mode = community_mode
    self.community_prop = community_property
    self.membership_col = membership_collection
    self.memb_ent_field = membership_entity_field
    self.memb_com_field = membership_community_field

    self.allowed_relations = allowed_relations
    self.disallowed_relations = disallowed_relations
    self.allowed_neighbor_types = allowed_neighbor_types

    self.time_window = time_window
    self.as_of = as_of
    self.current_only = current_only
    self.recency_half_life_days = recency_half_life_days

    self.type_priors = type_priors or {}

    self.edge_prov_fields = edge_provenance_fields or ["source_document_id", "source_text_id"]
    self.prov_edges_col = provenance_edge_collection
    self.prov_target_cols = provenance_target_collections or ["Documents", "TextBlocks"]

    self.fuse_edge_confidence = fuse_edge_confidence
    self.missing_confidence_prior = missing_confidence_prior
    self.edge_raw_conf_prop = edge_raw_confidence_property
    self.edge_npll_post_prop = edge_npll_posterior_property
    self.edge_calibration_prop = edge_calibration_property

    self.aql_batch_size = aql_batch_size
    self.aql_stream = aql_stream
    self.outbound_index_hint = outbound_index_hint
    self.inbound_index_hint = inbound_index_hint
|
|
140
|
+
|
|
141
|
+
# --------------------------
|
|
142
|
+
# Back-compatible core API
|
|
143
|
+
# --------------------------
|
|
144
|
+
def iter_out(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
    """Yield ``(neighbor_id, relation, weight)`` for every outbound edge of *node*.

    Back-compatible thin wrapper: all filtering/weighting is done by
    ``_iter_neighbors``; this just projects the rich view down to a triple.
    """
    yield from (
        (view.neighbor_id, view.relation, view.weight)
        for view in self._iter_neighbors(node, direction="OUTBOUND", rich=True)
    )
|
|
147
|
+
|
|
148
|
+
def iter_in(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
    """Yield ``(neighbor_id, relation, weight)`` for every inbound edge of *node*.

    Back-compatible thin wrapper over ``_iter_neighbors`` (INBOUND direction).
    """
    yield from (
        (view.neighbor_id, view.relation, view.weight)
        for view in self._iter_neighbors(node, direction="INBOUND", rich=True)
    )
|
|
151
|
+
|
|
152
|
+
def nodes(self, community_id: Optional[str] = None) -> Iterable[NodeId]:
    """
    Return all node IDs in this community.
    - mapping mode: EntityCommunities -> entity_id
    - property mode: filter ExtractedEntities by community_id field (if you add it)
    - none mode: return all nodes

    Args:
        community_id: Community to enumerate; falls back to the accessor's
            bound community (``self._cid``) when omitted or falsy.

    Yields:
        Node document ``_id`` strings (property/none modes), or whatever the
        membership entity field holds (mapping mode).
    """
    # NOTE(review): `or` means an explicit empty-string community_id also
    # falls back to self._cid — confirm that is intended.
    cid = community_id or self._cid
    if self.community_mode == "property":
        # Community tag stored directly on each node document; collection and
        # property names are interpolated (assumed trusted config, not user input).
        aql = f"""
        FOR v IN {self.nodes_col}
            FILTER v.{self.community_prop} == @cid
            RETURN v._id
        """
        cursor = self.db.aql.execute(
            aql, bind_vars={"cid": cid}, batch_size=self.aql_batch_size, stream=self.aql_stream
        )
    elif self.community_mode == "mapping":
        # Membership lives in a separate collection; here everything is passed
        # as bind parameters (@@mcol is a collection bind).
        aql = f"""
        FOR m IN @@mcol
            FILTER m[@m_com] == @cid
            RETURN m[@m_ent]
        """
        cursor = self.db.aql.execute(
            aql,
            bind_vars={
                "cid": cid,
                "@mcol": self.membership_col,
                "m_ent": self.memb_ent_field,
                "m_com": self.memb_com_field,
            },
            batch_size=self.aql_batch_size,
            stream=self.aql_stream,
        )
    else:  # community_mode == "none"
        # No community scoping: stream every node id in the collection.
        aql = f"""
        FOR v IN {self.nodes_col}
            RETURN v._id
        """
        cursor = self.db.aql.execute(
            aql, batch_size=self.aql_batch_size, stream=self.aql_stream
        )
    # Stream results lazily so callers can stop early on huge graphs.
    for vid in cursor:
        yield vid
|
|
196
|
+
|
|
197
|
+
def degree(self, node: NodeId) -> int:
    """Return the out-degree of *node* (count of edges with ``_from == node``).

    Applies the configured outbound index hint when one is set. Returns 0
    when the count comes back null.
    """
    bind_vars = {"node": node}
    if self.outbound_index_hint:
        options = "OPTIONS { indexHint: @idx, forceIndexHint: true }"
        bind_vars["idx"] = self.outbound_index_hint
    else:
        options = ""
    query = f"""
    RETURN LENGTH(
        FOR e IN {self.edges_col}
        {options}
        FILTER e._from == @node
        RETURN 1
    )
    """
    rows = list(self.db.aql.execute(query, bind_vars=bind_vars))
    return int(rows[0] or 0)
|
|
215
|
+
|
|
216
|
+
# --------------------------
|
|
217
|
+
# Rich neighbor variants
|
|
218
|
+
# --------------------------
|
|
219
|
+
def iter_out_rich(self, node: NodeId) -> Iterable[EdgeView]:
    """Yield full :class:`EdgeView` records for each outbound neighbor of *node*."""
    for edge_view in self._iter_neighbors(node, direction="OUTBOUND", rich=True):
        yield edge_view
|
|
221
|
+
|
|
222
|
+
def iter_in_rich(self, node: NodeId) -> Iterable[EdgeView]:
    """Yield full :class:`EdgeView` records for each inbound neighbor of *node*."""
    for edge_view in self._iter_neighbors(node, direction="INBOUND", rich=True):
        yield edge_view
|
|
224
|
+
|
|
225
|
+
# --------------------------
|
|
226
|
+
# Provenance helpers
|
|
227
|
+
# --------------------------
|
|
228
|
+
def get_edge_provenance(self, edge_id: str) -> List[str]:
    """
    Return provenance targets for a relationship edge:
    - inline fields (source_document_id, source_text_id)
    - EXTRACTED_FROM edges for either endpoint entity

    Args:
        edge_id: Full edge document id (``"Collection/key"``), looked up via
            ``DOCUMENT()``.

    Returns:
        A deduplicated list of provenance target ids (empty when the query
        yields nothing).
    """
    # Build the provenance edges clause safely (avoid nested f-strings)
    # When no provenance edge collection is configured, the subquery is just
    # an empty array literal.
    prov_edges_clause = (
        f"""
        FOR p IN {self.prov_edges_col}
            FILTER p._from IN [e._from, e._to]
            RETURN p._to
        """
        if self.prov_edges_col else "[]"
    )

    # inline_candidates expands to e['field'] for each configured inline
    # provenance field; nulls are filtered out in the AQL below.
    aql = f"""
    LET e = DOCUMENT(@eid)
    LET inline_candidates = [{", ".join([f"e['{f}']" for f in self.edge_prov_fields])}]
    LET inline = (
        FOR x IN inline_candidates
            FILTER x != null
            RETURN x
    )
    LET via_edges = (
        {prov_edges_clause}
    )
    RETURN UNIQUE(APPEND(inline, via_edges))
    """
    cur = self.db.aql.execute(aql, bind_vars={"eid": edge_id}, batch_size=self.aql_batch_size, stream=self.aql_stream)
    out = list(cur)
    # The query RETURNs a single array; unwrap it (or default to []).
    return out[0] if out else []
|
|
260
|
+
|
|
261
|
+
def get_node(self, node_id: NodeId, fields: Optional[List[str]] = None) -> Dict[str, Any]:
    """Fetch one node document by id, optionally projecting a subset of fields.

    Always includes ``_id`` in a projected result. Returns ``{}`` when the
    document does not exist.
    """
    if fields:
        projection = ", ".join(f"{name}: d.{name}" for name in fields)
        query = f"LET d = DOCUMENT(@id) RETURN {{ _id: d._id, {projection} }}"
    else:
        query = "RETURN DOCUMENT(@id)"
    rows = list(self.db.aql.execute(query, bind_vars={"id": node_id}))
    return rows[0] if rows else {}
|
|
270
|
+
|
|
271
|
+
# --------------------------
|
|
272
|
+
# Stats / quick analytics
|
|
273
|
+
# --------------------------
|
|
274
|
+
@staticmethod
def get_top_n_entities_by_degree(
    db,
    edges_collection: str = "ExtractedRelationships",
    limit: Optional[int] = None,
    time_window: Optional[Tuple[str, str]] = None,
    time_property: str = "created_at",
) -> List[dict]:
    """Rank entities by outbound edge count, optionally within a time window.

    Args:
        db: python-arango database handle.
        edges_collection: Edge collection to aggregate over.
        limit: Cap on rows returned (falsy = no cap).
        time_window: Inclusive ``(start, end)`` bounds on *time_property*.
        time_property: Edge field holding the timestamp to window on.

    Returns:
        ``[{"entity": <_from id>, "degree": <count>}, ...]``, highest first.
    """
    bind: Dict[str, Any] = {}
    where = ""
    if time_window:
        where = "FILTER HAS(e, @ts) AND e[@ts] >= @start_ts AND e[@ts] <= @end_ts"
        bind["ts"] = time_property
        bind["start_ts"], bind["end_ts"] = time_window
    if limit:
        limit_clause = "LIMIT @lim"
        bind["lim"] = limit
    else:
        limit_clause = ""
    aql = f"""
    FOR e IN {edges_collection}
        {where}
        COLLECT entity = e._from WITH COUNT INTO degree
        SORT degree DESC
        {limit_clause}
        RETURN {{ "entity": entity, "degree": degree }}
    """
    return list(db.aql.execute(aql, bind_vars=bind))
|
|
299
|
+
|
|
300
|
+
@staticmethod
def get_entity_type_counts(
    db,
    nodes_collection: str = "ExtractedEntities",
    type_property: str = "type"
) -> List[dict]:
    """Count node documents per entity type, most common first.

    Returns:
        ``[{"type": <value>, "count": <int>}, ...]`` sorted by count descending.
    """
    query = f"""
    FOR doc IN {nodes_collection}
        COLLECT t = doc.{type_property} WITH COUNT INTO c
        SORT c DESC
        RETURN {{ "type": t, "count": c }}
    """
    cursor = db.aql.execute(query)
    return list(cursor)
|
|
313
|
+
|
|
314
|
+
@staticmethod
def get_relationship_type_counts(
    db,
    edges_collection: str = "ExtractedRelationships",
    relation_property: str = "relationship",
    time_window: Optional[Tuple[str, str]] = None,
    time_property: str = "created_at",
) -> List[dict]:
    """Count edges per relationship type, optionally within a time window.

    Edges missing *relation_property* are skipped entirely.

    Returns:
        ``[{"type": <value>, "count": <int>}, ...]`` sorted by count descending.
    """
    bind: Dict[str, Any] = {"rel_prop": relation_property}
    conditions = "FILTER HAS(rel, @rel_prop)"
    if time_window:
        conditions += " AND HAS(rel, @ts) AND rel[@ts] >= @start_ts AND rel[@ts] <= @end_ts"
        bind["ts"] = time_property
        bind["start_ts"], bind["end_ts"] = time_window
    aql = f"""
    FOR rel IN {edges_collection}
        {conditions}
        COLLECT t = rel[@rel_prop] WITH COUNT INTO c
        SORT c DESC
        RETURN {{ "type": t, "count": c }}
    """
    return list(db.aql.execute(aql, bind_vars=bind))
|
|
335
|
+
|
|
336
|
+
@staticmethod
def get_community_summaries(
    db,
    communities_collection: str = "Communities",
    limit: Optional[int] = None,
    skip: int = 0,
    require_summary: bool = True
) -> List[dict]:
    """List community summary rows, ordered by community_id.

    With ``require_summary=True`` only communities that already have a
    non-empty summary are returned; with ``False`` only those still missing
    one (useful for finding summarization work to do). *skip* is only
    applied when *limit* is given.
    """
    if require_summary:
        filter_clause = "FILTER c.summary != null AND c.summary != ''"
    else:
        filter_clause = "FILTER c.summary == null OR c.summary == ''"
    bind: Dict[str, Any] = {}
    if limit is not None:
        limit_clause = "LIMIT @skip, @limit"
        bind["skip"] = skip
        bind["limit"] = limit
    else:
        limit_clause = ""
    aql = f"""
    FOR c IN {communities_collection}
        {filter_clause}
        SORT c.community_id ASC
        {limit_clause}
        RETURN {{ id: c.community_id, summary: c.summary, size: c.size, level: c.level }}
    """
    return list(db.aql.execute(aql, bind_vars=bind))
|
|
357
|
+
|
|
358
|
+
@staticmethod
def get_unique_table_headers(
    db,
    tables_collection: str = "Tables",
    headers_property: str = "headers"
) -> List[List[str]]:
    """Return the distinct header rows found across all table documents.

    Tables lacking *headers_property* are ignored; COLLECT deduplicates the
    remaining header arrays.
    """
    query = f"""
    FOR t IN {tables_collection}
        FILTER HAS(t, @hp)
        COLLECT h = t[@hp]
        RETURN h
    """
    cursor = db.aql.execute(query, bind_vars={"hp": headers_property})
    return list(cursor)
|
|
371
|
+
|
|
372
|
+
# --------------------------
|
|
373
|
+
# Bridge / GNN Integration Methods (Mirrored from GlobalGraphAccessor)
|
|
374
|
+
# --------------------------
|
|
375
|
+
|
|
376
|
+
def is_bridge(self, entity_key: str) -> Optional[dict]:
    """Look up bridge metadata for an entity, memoizing the result.

    Accepts either a bare key or a full ``"<collection>/<key>"`` id. Returns
    the bridge document for the configured algorithm, or None when the
    entity is not a bridge (or the bridge collection is unavailable).
    Negative results are cached too.
    """
    # Normalize "<collection>/<key>" ids down to the key alone.
    key = entity_key.rsplit("/", 1)[-1]

    if key in self._bridge_cache:
        return self._bridge_cache[key]

    aql = """
    FOR b IN @@bridge_col
        FILTER b.entity_key == @entity_key
        FILTER b.algorithm == @algorithm
        RETURN b
    """
    bind = {
        "@bridge_col": self.bridge_col,
        "entity_key": key,
        "algorithm": self.algorithm,
    }
    try:
        rows = list(self.db.aql.execute(aql, bind_vars=bind))
        found = rows[0] if rows else None
    except Exception:
        # Fallback if collection doesn't exist yet
        found = None

    self._bridge_cache[key] = found
    return found
|
|
410
|
+
|
|
411
|
+
def get_entity_community(self, entity_id: str) -> Optional[str]:
    """Resolve the community an entity belongs to, or None when unknown.

    Only implemented for 'mapping' mode (membership rows in a dedicated
    collection); any other community mode — and any query failure — yields
    None.
    """
    if self.community_mode != "mapping":
        return None

    aql = f"""
    FOR m IN {self.membership_col}
        FILTER m.{self.memb_ent_field} == @entity_id
        // We don't filter by algorithm here usually, but if needed we can
        RETURN m.{self.memb_com_field}
    """
    try:
        rows = list(self.db.aql.execute(aql, bind_vars={"entity_id": entity_id}))
    except Exception:
        return None
    return rows[0] if rows else None
|
|
430
|
+
|
|
431
|
+
def get_affinity(self, community_a: str, community_b: str) -> float:
    """
    Get the affinity score between two communities.
    Returns 0.0 if no affinity data exists.

    The score is symmetric, so results are memoized under an
    order-independent key; a failed query also yields (and caches) 0.0.
    """
    if not community_a or not community_b:
        return 0.0

    lo, hi = sorted((community_a, community_b))
    cache_key = f"{lo}_{hi}"
    if cache_key in self._affinity_cache:
        return self._affinity_cache[cache_key]

    aql = """
    FOR a IN @@affinity_col
        FILTER a.algorithm == @algorithm
        FILTER (a.community_a == @comm_a AND a.community_b == @comm_b)
            OR (a.community_a == @comm_b AND a.community_b == @comm_a)
        RETURN a.affinity_score
    """
    bind = {
        "@affinity_col": self.affinity_col,
        "algorithm": self.algorithm,
        "comm_a": community_a,
        "comm_b": community_b,
    }
    try:
        rows = list(self.db.aql.execute(aql, bind_vars=bind))
        score = rows[0] if rows else 0.0
    except Exception:
        # Affinity collection may not exist yet.
        score = 0.0

    self._affinity_cache[cache_key] = score
    return score
|
|
467
|
+
|
|
468
|
+
def clear_bridge_cache(self):
    """Drop all memoized bridge lookups and community-affinity scores."""
    for cache in (self._bridge_cache, self._affinity_cache):
        cache.clear()
|
|
472
|
+
|
|
473
|
+
# ════════════════════════════════════════════════════════════════
|
|
474
|
+
# DISCOVERY ENTRY POINTS (for autonomous insight discovery)
|
|
475
|
+
# ════════════════════════════════════════════════════════════════
|
|
476
|
+
|
|
477
|
+
@staticmethod
def get_top_entities_in_community(
    db,
    community_id: str,
    membership_collection: str = "EntityCommunities",
    membership_entity_field: str = "entity_id",
    membership_community_field: str = "community_id",
    edges_collection: str = "ExtractedRelationships",
    limit: int = 20,
) -> List[dict]:
    """
    Get top entities by degree WITHIN a specific community.
    Essential for autonomous discovery - provides high-value seed nodes.

    Returns:
        List of {entity: str, degree: int}
    """
    aql = """
    LET community_entities = (
        FOR m IN @@membership
            FILTER m[@m_com] == @cid
            RETURN m[@m_ent]
    )
    FOR e IN @@edges
        FILTER e._from IN community_entities
        COLLECT entity = e._from WITH COUNT INTO degree
        SORT degree DESC
        LIMIT @limit
        RETURN { entity: entity, degree: degree }
    """
    bind = {
        "@membership": membership_collection,
        "@edges": edges_collection,
        "m_ent": membership_entity_field,
        "m_com": membership_community_field,
        "cid": community_id,
        "limit": limit,
    }
    return list(db.aql.execute(aql, bind_vars=bind))
|
|
515
|
+
|
|
516
|
+
@staticmethod
def get_recent_entities(
    db,
    since: str,  # ISO timestamp
    community_id: Optional[str] = None,
    nodes_collection: str = "ExtractedEntities",
    membership_collection: str = "EntityCommunities",
    membership_entity_field: str = "entity_id",
    membership_community_field: str = "community_id",
    created_at_property: str = "created_at",
    updated_at_property: str = "updated_at",
    limit: int = 100,
) -> List[dict]:
    """
    Get entities created or updated since a timestamp.
    Critical for daily discovery - "what's new since yesterday?"

    Args:
        since: ISO timestamp (e.g., "2026-01-11T00:00:00Z")
        community_id: Optional community filter

    Returns:
        List of {entity: str, created_at: str, type: str}
    """
    bind: Dict[str, Any] = {
        "since": since,
        "limit": limit,
        "created_prop": created_at_property,
        "updated_prop": updated_at_property,
    }

    # When a community is given, restrict hits to entities whose _id appears
    # in the membership collection for that community. Membership bind vars
    # are only added when the clause is actually present, so the query never
    # carries unused bind parameters.
    community_filter = ""
    if community_id:
        community_filter = """
        LET community_entities = (
            FOR m IN @@membership
                FILTER m[@m_com] == @cid
                RETURN m[@m_ent]
        )
        FILTER e._id IN community_entities
        """
        bind["@membership"] = membership_collection
        bind["m_ent"] = membership_entity_field
        bind["m_com"] = membership_community_field
        bind["cid"] = community_id

    # Matches on EITHER timestamp; sorts by created_at when present, falling
    # back to updated_at. Doubled braces {{ }} escape literal braces in the
    # f-string so the RETURN object survives interpolation.
    aql = f"""
    FOR e IN {nodes_collection}
        FILTER (HAS(e, @created_prop) AND e[@created_prop] >= @since)
            OR (HAS(e, @updated_prop) AND e[@updated_prop] >= @since)
        {community_filter}
        SORT HAS(e, @created_prop) ? e[@created_prop] : e[@updated_prop] DESC
        LIMIT @limit
        RETURN {{
            entity: e._id,
            created_at: HAS(e, @created_prop) ? e[@created_prop] : null,
            updated_at: HAS(e, @updated_prop) ? e[@updated_prop] : null,
            type: e.type,
            name: e.name
        }}
    """
    return list(db.aql.execute(aql, bind_vars=bind))
|
|
578
|
+
|
|
579
|
+
@staticmethod
def search_entities(
    db,
    query: str,
    community_id: Optional[str] = None,
    nodes_collection: str = "ExtractedEntities",
    membership_collection: str = "EntityCommunities",
    membership_entity_field: str = "entity_id",
    membership_community_field: str = "community_id",
    search_fields: Optional[List[str]] = None,
    limit: int = 20,
) -> List[dict]:
    """
    Text search for entities matching query.
    Uses LIKE for simple text matching (can be upgraded to ArangoSearch).

    Args:
        query: Search string. NOTE: AQL LIKE wildcards (``%``, ``_``) in the
            query are not escaped, so they act as wildcards.
        community_id: Optional filter to entities in a given community.
        search_fields: Fields to search in (default: ["name", "description"])

    Returns:
        List of {entity: str, name: str, type: str, description: str}
    """
    if search_fields is None:
        search_fields = ["name", "description"]

    bind: Dict[str, Any] = {
        "query": f"%{query.lower()}%",
        "limit": limit,
    }

    # Case-insensitive substring match over each requested field. Field names
    # are interpolated into the AQL (assumed trusted config, not user input).
    search_conditions = []
    for field in search_fields:
        search_conditions.append(f"LOWER(e.{field}) LIKE @query")
    search_clause = " OR ".join(search_conditions)

    community_filter = ""
    if community_id:
        # Restrict hits to entities belonging to the given community; the
        # membership bind vars are only added when the clause is present.
        community_filter = """
        LET community_entities = (
            FOR m IN @@membership
                FILTER m[@m_com] == @cid
                RETURN m[@m_ent]
        )
        FILTER e._id IN community_entities
        """
        bind["@membership"] = membership_collection
        bind["m_ent"] = membership_entity_field
        bind["m_com"] = membership_community_field
        bind["cid"] = community_id

    aql = f"""
    FOR e IN {nodes_collection}
        FILTER {search_clause}
        {community_filter}
        LIMIT @limit
        RETURN {{
            entity: e._id,
            name: e.name,
            type: e.type,
            description: e.description
        }}
    """
    return list(db.aql.execute(aql, bind_vars=bind))
|
|
644
|
+
|
|
645
|
+
# ════════════════════════════════════════════════════════════════
|
|
646
|
+
# CONTENT HYDRATION (for agent reasoning)
|
|
647
|
+
# ════════════════════════════════════════════════════════════════
|
|
648
|
+
|
|
649
|
+
@staticmethod
def get_document_content(
    db,
    doc_id: str,
    text_collection: str = "TextBlocks",
    table_collection: str = "Tables",
    image_collection: str = "Images",
    document_collection: str = "Documents",
) -> Optional[dict]:
    """
    Fetch content from any document collection by ID.
    Essential for agent reasoning - converts graph IDs to actual content.

    Args:
        doc_id: Document ID in format "CollectionName/key"

    Returns:
        Dict with type-specific content, or None if not found — also None
        when the id has no "/" or names an unrecognized collection.
    """
    # Parse the collection prefix out of the full id; a bare key (no "/")
    # cannot be routed and yields None.
    try:
        collection, key = doc_id.split("/", 1)
    except ValueError:
        return None

    # Route to a collection-specific projection. All four branches filter by
    # the full _id, so @doc_id is bound once below regardless of branch.
    if collection == text_collection:
        aql = f"""
        FOR tb IN {text_collection}
            FILTER tb._id == @doc_id
            RETURN {{
                type: "text",
                text: tb.text,
                document_id: tb.document_id,
                page: tb.page,
                char_span: tb.char_span,
                metadata: tb.metadata
            }}
        """
    elif collection == table_collection:
        aql = f"""
        FOR t IN {table_collection}
            FILTER t._id == @doc_id
            RETURN {{
                type: "table",
                headers: t.headers,
                rows: t.rows,
                caption: t.caption,
                document_id: t.document_id,
                page: t.page,
                metadata: t.metadata
            }}
        """
    elif collection == image_collection:
        aql = f"""
        FOR img IN {image_collection}
            FILTER img._id == @doc_id
            RETURN {{
                type: "image",
                caption: img.caption,
                ocr_text: img.ocr_text,
                url: img.storage_url,
                document_id: img.document_id,
                page: img.page,
                metadata: img.metadata
            }}
        """
    elif collection == document_collection:
        aql = f"""
        FOR d IN {document_collection}
            FILTER d._id == @doc_id
            RETURN {{
                type: "document",
                filename: d.filename,
                content: d.content,
                metadata: d.metadata
            }}
        """
    else:
        # Unknown collection prefix: nothing we know how to hydrate.
        return None

    result = list(db.aql.execute(aql, bind_vars={"doc_id": doc_id}))
    return result[0] if result else None
|
|
730
|
+
|
|
731
|
+
@staticmethod
def get_entity_sources(
    db,
    entity_id: str,
    extracted_from_collection: str = "EXTRACTED_FROM",
    max_sources: int = 10,
) -> List[dict]:
    """
    Get all source documents/blocks for an entity via EXTRACTED_FROM edges.
    Critical for evidence gathering - shows WHERE an entity was mentioned.

    Args:
        entity_id: Entity ID (e.g., "ExtractedEntities/ent_123")
        max_sources: Limit number of sources returned

    Returns:
        List of {source_id, source_type, content, char_span, confidence, metadata}
    """
    aql = f"""
    FOR edge IN {extracted_from_collection}
        FILTER edge._from == @entity_id
        LIMIT @max_sources
        LET source = DOCUMENT(edge._to)
        LET collection = PARSE_IDENTIFIER(edge._to).collection
        RETURN {{
            source_id: edge._to,
            source_type: collection,
            char_span: edge.char_span,
            extraction_confidence: edge.extraction_confidence,
            content: CASE
                WHEN collection == "TextBlocks" THEN source.text
                WHEN collection == "Tables" THEN {{ headers: source.headers, rows: source.rows }}
                WHEN collection == "Images" THEN {{ caption: source.caption, ocr_text: source.ocr_text }}
                WHEN collection == "Documents" THEN SUBSTRING(source.content, 0, 500)
                ELSE null
            END,
            metadata: {{
                page: source.page,
                document_id: source.document_id,
                filename: source.filename
            }}
        }}
    """
    bind = {"entity_id": entity_id, "max_sources": max_sources}
    cursor = db.aql.execute(aql, bind_vars=bind)
    return list(cursor)
|
|
778
|
+
|
|
779
|
+
@staticmethod
def search_content(
    db,
    query: str,
    community_id: Optional[str] = None,
    content_types: Optional[List[str]] = None,
    text_collection: str = "TextBlocks",
    table_collection: str = "Tables",
    image_collection: str = "Images",
    membership_collection: str = "EntityCommunities",
    extracted_from_collection: str = "EXTRACTED_FROM",
    limit: int = 10,
) -> List[dict]:
    """
    Semantic/text search across content collections.
    Uses simple LIKE matching (can be upgraded to ArangoSearch/vectors).

    Args:
        query: Search string
        content_types: Collections to search (default: ["TextBlocks", "Tables", "Images"])
        community_id: Optional filter to content linked to community entities

    Returns:
        List of {source_id, source_type, content, score, metadata}

    NOTE(review): `community_id`, `membership_collection`, and
    `extracted_from_collection` are accepted but never referenced in this
    body — the community filter described above is not applied. Confirm
    whether filtering was intended here.
    """
    if content_types is None:
        content_types = [text_collection, table_collection, image_collection]

    # Case-insensitive substring pattern shared by all three searches; @limit
    # is applied per collection, and again on the merged result below.
    bind: Dict[str, Any] = {
        "query": f"%{query.lower()}%",
        "limit": limit,
    }

    results = []

    # Search TextBlocks
    if text_collection in content_types:
        aql_text = f"""
        FOR tb IN {text_collection}
            FILTER LOWER(tb.text) LIKE @query
            LIMIT @limit
            RETURN {{
                source_id: tb._id,
                source_type: "TextBlocks",
                content: tb.text,
                score: 1.0,
                metadata: {{
                    document_id: tb.document_id,
                    page: tb.page
                }}
            }}
        """
        results.extend(list(db.aql.execute(aql_text, bind_vars=bind)))

    # Search Tables (caption)
    if table_collection in content_types:
        aql_table = f"""
        FOR t IN {table_collection}
            FILTER LOWER(t.caption) LIKE @query
            LIMIT @limit
            RETURN {{
                source_id: t._id,
                source_type: "Tables",
                content: {{ headers: t.headers, rows: t.rows, caption: t.caption }},
                score: 1.0,
                metadata: {{
                    document_id: t.document_id,
                    page: t.page
                }}
            }}
        """
        results.extend(list(db.aql.execute(aql_table, bind_vars=bind)))

    # Search Images (OCR text)
    if image_collection in content_types:
        aql_image = f"""
        FOR img IN {image_collection}
            FILTER LOWER(img.ocr_text) LIKE @query OR LOWER(img.caption) LIKE @query
            LIMIT @limit
            RETURN {{
                source_id: img._id,
                source_type: "Images",
                content: {{ caption: img.caption, ocr_text: img.ocr_text }},
                score: 1.0,
                metadata: {{
                    document_id: img.document_id,
                    page: img.page
                }}
            }}
        """
        results.extend(list(db.aql.execute(aql_image, bind_vars=bind)))

    # Merged list is truncated to *limit*; text hits take precedence simply
    # because they are appended first (score is a constant 1.0 placeholder).
    return results[:limit]
|
|
872
|
+
|
|
873
|
+
# --------------------------
|
|
874
|
+
# Internal neighbor routine
|
|
875
|
+
# --------------------------
|
|
876
|
+
def _iter_neighbors(self, node: NodeId, *, direction: str, rich: bool) -> Iterable[EdgeView]:
|
|
877
|
+
assert direction in ("OUTBOUND", "INBOUND")
|
|
878
|
+
|
|
879
|
+
bind: Dict[str, Any] = {
|
|
880
|
+
"node": node,
|
|
881
|
+
"rel_prop": self.rel_prop,
|
|
882
|
+
"w_prop": self.w_prop,
|
|
883
|
+
"priors_map": self.type_priors,
|
|
884
|
+
}
|
|
885
|
+
|
|
886
|
+
# Only add community ID if we're filtering by it
|
|
887
|
+
if self.community_mode != "none":
|
|
888
|
+
bind["cid"] = self._cid
|
|
889
|
+
|
|
890
|
+
# Bind parameters are added only when referenced to avoid AQL 1552 errors
|
|
891
|
+
|
|
892
|
+
hint = ""
|
|
893
|
+
if direction == "OUTBOUND" and self.outbound_index_hint:
|
|
894
|
+
hint = "OPTIONS { indexHint: @idx, forceIndexHint: true }"
|
|
895
|
+
bind["idx"] = self.outbound_index_hint
|
|
896
|
+
elif direction == "INBOUND" and self.inbound_index_hint:
|
|
897
|
+
hint = "OPTIONS { indexHint: @idx, forceIndexHint: true }"
|
|
898
|
+
bind["idx"] = self.inbound_index_hint
|
|
899
|
+
|
|
900
|
+
filters: List[str] = []
|
|
901
|
+
|
|
902
|
+
# Community filter
|
|
903
|
+
if self.community_mode == "property":
|
|
904
|
+
filters.append(f"v.{self.community_prop} == @cid")
|
|
905
|
+
elif self.community_mode == "mapping":
|
|
906
|
+
bind.update({"@mcol": self.membership_col, "m_ent": self.memb_ent_field, "m_com": self.memb_com_field})
|
|
907
|
+
filters.append("""
|
|
908
|
+
FIRST(
|
|
909
|
+
FOR m IN @@mcol
|
|
910
|
+
FILTER m[@m_com] == @cid AND m[@m_ent] == v._id
|
|
911
|
+
LIMIT 1
|
|
912
|
+
RETURN 1
|
|
913
|
+
)
|
|
914
|
+
""")
|
|
915
|
+
# else community_mode == "none" - no filtering
|
|
916
|
+
|
|
917
|
+
# Relation / neighbor type filters
|
|
918
|
+
if self.allowed_relations:
|
|
919
|
+
bind["allowed_relations"] = self.allowed_relations
|
|
920
|
+
filters.append("e[@rel_prop] IN @allowed_relations")
|
|
921
|
+
if self.disallowed_relations:
|
|
922
|
+
bind["disallowed_relations"] = self.disallowed_relations
|
|
923
|
+
filters.append("!(e[@rel_prop] IN @disallowed_relations)")
|
|
924
|
+
if self.allowed_neighbor_types:
|
|
925
|
+
bind["allowed_neighbor_types"] = self.allowed_neighbor_types
|
|
926
|
+
filters.append(f"v.{self.node_type_prop} IN @allowed_neighbor_types")
|
|
927
|
+
|
|
928
|
+
# Time window filter on edge timestamp
|
|
929
|
+
if self.time_window and self.ts_prop:
|
|
930
|
+
bind["start_ts"], bind["end_ts"] = self.time_window
|
|
931
|
+
bind["ts_prop"] = self.ts_prop
|
|
932
|
+
filters.append("HAS(e, @ts_prop) AND e[@ts_prop] >= @start_ts AND e[@ts_prop] <= @end_ts")
|
|
933
|
+
|
|
934
|
+
# Current-only validity wrt as_of
|
|
935
|
+
if self.current_only and self.as_of:
|
|
936
|
+
bind["as_of"] = self.as_of
|
|
937
|
+
vf_prop = self.edge_valid_from_prop or "valid_from"
|
|
938
|
+
vt_prop = self.edge_valid_to_prop or "valid_to"
|
|
939
|
+
filters.append(
|
|
940
|
+
f"( (HAS(e, '{vf_prop}') ? e['{vf_prop}'] <= @as_of : true) "
|
|
941
|
+
f"AND (HAS(e, '{vt_prop}') ? (e['{vt_prop}'] == null OR e['{vt_prop}'] >= @as_of) : true) )"
|
|
942
|
+
)
|
|
943
|
+
|
|
944
|
+
# Optional status guard
|
|
945
|
+
status_guard = ""
|
|
946
|
+
if self.edge_status_prop:
|
|
947
|
+
status_guard = "LET _status = e[@status_prop]"
|
|
948
|
+
bind["status_prop"] = self.edge_status_prop
|
|
949
|
+
|
|
950
|
+
# Recency decay: 2^(- age_days / half_life)
|
|
951
|
+
recency_clause = "1.0"
|
|
952
|
+
if self.recency_half_life_days is not None and self.as_of and self.ts_prop:
|
|
953
|
+
bind["half_life"] = float(self.recency_half_life_days)
|
|
954
|
+
bind["as_of"] = self.as_of
|
|
955
|
+
bind["ts_prop"] = self.ts_prop
|
|
956
|
+
recency_clause = "POW(2, -1 * DATE_DIFF(@as_of, e[@ts_prop], 'days') / @half_life)"
|
|
957
|
+
|
|
958
|
+
# Base weight
|
|
959
|
+
weight_clause = "(HAS(e, @w_prop) && IS_NUMBER(e[@w_prop]) ? e[@w_prop] : 1.0)"
|
|
960
|
+
|
|
961
|
+
# Confidence fusion (usually disabled; you do it in engine)
|
|
962
|
+
conf_clause = "1.0"
|
|
963
|
+
if self.fuse_edge_confidence:
|
|
964
|
+
bind.update({
|
|
965
|
+
"raw_c": self.edge_raw_conf_prop,
|
|
966
|
+
"npll": self.edge_npll_post_prop,
|
|
967
|
+
"calib": self.edge_calibration_prop,
|
|
968
|
+
"miss_prior": float(self.missing_confidence_prior),
|
|
969
|
+
})
|
|
970
|
+
conf_clause = (
|
|
971
|
+
"( (HAS(e, @raw_c) && IS_NUMBER(e[@raw_c]) ? e[@raw_c] : @miss_prior) * "
|
|
972
|
+
" (HAS(e, @npll) && IS_NUMBER(e[@npll]) ? e[@npll] : @miss_prior) * "
|
|
973
|
+
" (HAS(e, @calib) && IS_NUMBER(e[@calib]) ? e[@calib] : @miss_prior) )"
|
|
974
|
+
)
|
|
975
|
+
|
|
976
|
+
filters_str = " && ".join(filters) if filters else "true"
|
|
977
|
+
|
|
978
|
+
# Build the src edges clause safely
|
|
979
|
+
src_edges_clause = (
|
|
980
|
+
f"""
|
|
981
|
+
FOR p IN {self.prov_edges_col}
|
|
982
|
+
FILTER p._from IN [e._from, e._to]
|
|
983
|
+
RETURN p._to
|
|
984
|
+
"""
|
|
985
|
+
if self.prov_edges_col else "[]"
|
|
986
|
+
)
|
|
987
|
+
|
|
988
|
+
aql = f"""
|
|
989
|
+
LET priors = @priors_map
|
|
990
|
+
FOR v, e IN 1..1 {direction} @node {self.edges_col}
|
|
991
|
+
{hint}
|
|
992
|
+
FILTER {filters_str}
|
|
993
|
+
{status_guard}
|
|
994
|
+
LET _rel = e[@rel_prop]
|
|
995
|
+
LET _prior = TO_NUMBER(NOT_NULL(priors[_rel], 1.0))
|
|
996
|
+
LET _base_w = {weight_clause}
|
|
997
|
+
LET _rec = {recency_clause}
|
|
998
|
+
LET _conf = {conf_clause}
|
|
999
|
+
LET _w_eff = TO_NUMBER(_base_w) * TO_NUMBER(_prior) * TO_NUMBER(_rec) * TO_NUMBER(_conf)
|
|
1000
|
+
|
|
1001
|
+
LET _vf = {f"e['{self.edge_valid_from_prop}']" if self.edge_valid_from_prop else 'null'}
|
|
1002
|
+
LET _vt = {f"e['{self.edge_valid_to_prop}']" if self.edge_valid_to_prop else 'null'}
|
|
1003
|
+
LET _status2 = {f"e['{self.edge_status_prop}']" if self.edge_status_prop else 'null'}
|
|
1004
|
+
|
|
1005
|
+
// Provenance: inline fields + EXTRACTED_FROM for both endpoints
|
|
1006
|
+
LET _src_inline_candidates = [{", ".join([f"e['{f}']" for f in self.edge_prov_fields])}]
|
|
1007
|
+
LET _src_inline = (
|
|
1008
|
+
FOR x IN _src_inline_candidates
|
|
1009
|
+
FILTER x != null
|
|
1010
|
+
RETURN x
|
|
1011
|
+
)
|
|
1012
|
+
LET _src_edges = (
|
|
1013
|
+
{src_edges_clause}
|
|
1014
|
+
)
|
|
1015
|
+
LET _sources = UNIQUE(APPEND(_src_inline, _src_edges))
|
|
1016
|
+
|
|
1017
|
+
RETURN {{
|
|
1018
|
+
v_id: v._id,
|
|
1019
|
+
rel: _rel,
|
|
1020
|
+
weight: _w_eff,
|
|
1021
|
+
edge_id: e._id,
|
|
1022
|
+
valid_from: _vf,
|
|
1023
|
+
valid_to: _vt,
|
|
1024
|
+
status: _status2,
|
|
1025
|
+
raw_confidence: {f"e['{self.edge_raw_conf_prop}']" if self.edge_raw_conf_prop else 'null'},
|
|
1026
|
+
npll_posterior: {f"e['{self.edge_npll_post_prop}']" if self.edge_npll_post_prop else 'null'},
|
|
1027
|
+
calibration: {f"e['{self.edge_calibration_prop}']" if self.edge_calibration_prop else 'null'},
|
|
1028
|
+
sources: _sources
|
|
1029
|
+
}}
|
|
1030
|
+
"""
|
|
1031
|
+
|
|
1032
|
+
cursor = self.db.aql.execute(
|
|
1033
|
+
aql,
|
|
1034
|
+
bind_vars=bind,
|
|
1035
|
+
batch_size=self.aql_batch_size or 1000,
|
|
1036
|
+
stream=self.aql_stream if self.aql_stream is not None else True,
|
|
1037
|
+
ttl=120, # 2 minute timeout for long queries
|
|
1038
|
+
optimizer_rules=["+use-indexes"] # Force index usage
|
|
1039
|
+
)
|
|
1040
|
+
for d in cursor:
|
|
1041
|
+
if rich:
|
|
1042
|
+
yield EdgeView(
|
|
1043
|
+
neighbor_id=d["v_id"],
|
|
1044
|
+
relation=d["rel"],
|
|
1045
|
+
weight=float(d["weight"]),
|
|
1046
|
+
edge_id=d["edge_id"],
|
|
1047
|
+
valid_from=d.get("valid_from"),
|
|
1048
|
+
valid_to=d.get("valid_to"),
|
|
1049
|
+
status=d.get("status"),
|
|
1050
|
+
raw_confidence=d.get("raw_confidence"),
|
|
1051
|
+
npll_posterior=d.get("npll_posterior"),
|
|
1052
|
+
calibration=d.get("calibration"),
|
|
1053
|
+
sources=d.get("sources") or [],
|
|
1054
|
+
)
|
|
1055
|
+
else:
|
|
1056
|
+
yield d["v_id"], d["rel"], float(d["weight"])
|
|
1057
|
+
|
|
1058
|
+
|
|
1059
|
+
class GlobalGraphAccessor(GraphAccessor):
    """
    Cross-community graph accessor using pre-computed bridge entities.

    This accessor enables intelligent traversal across community boundaries
    by leveraging the BridgeEntities and CommunityAffinity collections
    created during community detection.

    Key features:
    - Uses bridge entities to efficiently cross community boundaries
    - Scores cross-community paths using affinity scores
    - Mission-aware: can weight community crossings based on context
    - Maintains all ArangoCommunityAccessor features

    Notes:
    - Bridge lookups and affinity scores are cached per instance; call
      ``clear_cache()`` after the underlying collections are rebuilt.
    - All queries are scoped to a single community-detection run via the
      ``algorithm`` field on the bridge/affinity/membership documents.
    """
|
|
1073
|
+
|
|
1074
|
+
def __init__(
|
|
1075
|
+
self,
|
|
1076
|
+
db,
|
|
1077
|
+
algorithm: str = "leiden",
|
|
1078
|
+
# Base accessor settings
|
|
1079
|
+
nodes_collection: str = "ExtractedEntities",
|
|
1080
|
+
edges_collection: str = "ExtractedRelationships",
|
|
1081
|
+
relation_property: str = "relationship",
|
|
1082
|
+
weight_property: str = "weight",
|
|
1083
|
+
# Bridge collections
|
|
1084
|
+
bridge_collection: str = "BridgeEntities",
|
|
1085
|
+
affinity_collection: str = "CommunityAffinity",
|
|
1086
|
+
membership_collection: str = "EntityCommunities",
|
|
1087
|
+
# Cross-community scoring
|
|
1088
|
+
cross_community_bonus: float = 1.5, # Boost for cross-community edges (often valuable)
|
|
1089
|
+
min_affinity_threshold: float = 0.0, # Minimum affinity to allow crossing
|
|
1090
|
+
# Performance
|
|
1091
|
+
aql_batch_size: int = 1000,
|
|
1092
|
+
aql_stream: bool = True,
|
|
1093
|
+
):
|
|
1094
|
+
self.db = db
|
|
1095
|
+
self.algorithm = algorithm
|
|
1096
|
+
|
|
1097
|
+
self.nodes_col = nodes_collection
|
|
1098
|
+
self.edges_col = edges_collection
|
|
1099
|
+
self.rel_prop = relation_property
|
|
1100
|
+
self.w_prop = weight_property
|
|
1101
|
+
|
|
1102
|
+
self.bridge_col = bridge_collection
|
|
1103
|
+
self.affinity_col = affinity_collection
|
|
1104
|
+
self.membership_col = membership_collection
|
|
1105
|
+
|
|
1106
|
+
self.cross_community_bonus = cross_community_bonus
|
|
1107
|
+
self.min_affinity_threshold = min_affinity_threshold
|
|
1108
|
+
|
|
1109
|
+
self.aql_batch_size = aql_batch_size
|
|
1110
|
+
self.aql_stream = aql_stream
|
|
1111
|
+
|
|
1112
|
+
# Cache for bridge status and affinities
|
|
1113
|
+
self._bridge_cache: Dict[str, Optional[dict]] = {}
|
|
1114
|
+
self._affinity_cache: Dict[str, float] = {}
|
|
1115
|
+
|
|
1116
|
+
# --------------------------
|
|
1117
|
+
# Core traversal API
|
|
1118
|
+
# --------------------------
|
|
1119
|
+
|
|
1120
|
+
def iter_out(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
|
|
1121
|
+
"""Iterate outbound edges, scoring cross-community edges appropriately."""
|
|
1122
|
+
for ev in self._iter_neighbors_global(node, direction="OUTBOUND"):
|
|
1123
|
+
yield ev.neighbor_id, ev.relation, ev.weight
|
|
1124
|
+
|
|
1125
|
+
def iter_in(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
|
|
1126
|
+
"""Iterate inbound edges, scoring cross-community edges appropriately."""
|
|
1127
|
+
for ev in self._iter_neighbors_global(node, direction="INBOUND"):
|
|
1128
|
+
yield ev.neighbor_id, ev.relation, ev.weight
|
|
1129
|
+
|
|
1130
|
+
def iter_out_rich(self, node: NodeId) -> Iterable[EdgeView]:
|
|
1131
|
+
"""Rich outbound edges with cross-community metadata."""
|
|
1132
|
+
yield from self._iter_neighbors_global(node, direction="OUTBOUND")
|
|
1133
|
+
|
|
1134
|
+
def iter_in_rich(self, node: NodeId) -> Iterable[EdgeView]:
|
|
1135
|
+
"""Rich inbound edges with cross-community metadata."""
|
|
1136
|
+
yield from self._iter_neighbors_global(node, direction="INBOUND")
|
|
1137
|
+
|
|
1138
|
+
def nodes(self) -> Iterable[NodeId]:
|
|
1139
|
+
"""Return all nodes (no community restriction)."""
|
|
1140
|
+
aql = f"FOR v IN {self.nodes_col} RETURN v._id"
|
|
1141
|
+
cursor = self.db.aql.execute(aql, batch_size=self.aql_batch_size, stream=self.aql_stream)
|
|
1142
|
+
for vid in cursor:
|
|
1143
|
+
yield vid
|
|
1144
|
+
|
|
1145
|
+
def degree(self, node: NodeId) -> int:
|
|
1146
|
+
"""Out-degree of a node."""
|
|
1147
|
+
aql = f"""
|
|
1148
|
+
RETURN LENGTH(
|
|
1149
|
+
FOR e IN {self.edges_col}
|
|
1150
|
+
FILTER e._from == @node
|
|
1151
|
+
RETURN 1
|
|
1152
|
+
)
|
|
1153
|
+
"""
|
|
1154
|
+
cur = self.db.aql.execute(aql, bind_vars={"node": node})
|
|
1155
|
+
return int(list(cur)[0] or 0)
|
|
1156
|
+
|
|
1157
|
+
# --------------------------
|
|
1158
|
+
# Bridge-aware methods
|
|
1159
|
+
# --------------------------
|
|
1160
|
+
|
|
1161
|
+
def is_bridge(self, entity_key: str) -> Optional[dict]:
|
|
1162
|
+
"""
|
|
1163
|
+
Check if an entity is a bridge and return its bridge data.
|
|
1164
|
+
Uses caching for performance.
|
|
1165
|
+
"""
|
|
1166
|
+
if entity_key in self._bridge_cache:
|
|
1167
|
+
return self._bridge_cache[entity_key]
|
|
1168
|
+
|
|
1169
|
+
aql = """
|
|
1170
|
+
FOR b IN @@bridge_col
|
|
1171
|
+
FILTER b.entity_key == @entity_key
|
|
1172
|
+
FILTER b.algorithm == @algorithm
|
|
1173
|
+
RETURN b
|
|
1174
|
+
"""
|
|
1175
|
+
result = list(self.db.aql.execute(
|
|
1176
|
+
aql,
|
|
1177
|
+
bind_vars={
|
|
1178
|
+
"@bridge_col": self.bridge_col,
|
|
1179
|
+
"entity_key": entity_key,
|
|
1180
|
+
"algorithm": self.algorithm,
|
|
1181
|
+
}
|
|
1182
|
+
))
|
|
1183
|
+
|
|
1184
|
+
bridge_data = result[0] if result else None
|
|
1185
|
+
self._bridge_cache[entity_key] = bridge_data
|
|
1186
|
+
return bridge_data
|
|
1187
|
+
|
|
1188
|
+
def get_entity_community(self, entity_id: str) -> Optional[str]:
|
|
1189
|
+
"""Get the community ID for an entity."""
|
|
1190
|
+
aql = """
|
|
1191
|
+
FOR m IN @@membership_col
|
|
1192
|
+
FILTER m.entity_id == @entity_id
|
|
1193
|
+
FILTER m.algorithm == @algorithm
|
|
1194
|
+
RETURN m.community_id
|
|
1195
|
+
"""
|
|
1196
|
+
result = list(self.db.aql.execute(
|
|
1197
|
+
aql,
|
|
1198
|
+
bind_vars={
|
|
1199
|
+
"@membership_col": self.membership_col,
|
|
1200
|
+
"entity_id": entity_id,
|
|
1201
|
+
"algorithm": self.algorithm,
|
|
1202
|
+
}
|
|
1203
|
+
))
|
|
1204
|
+
return result[0] if result else None
|
|
1205
|
+
|
|
1206
|
+
def get_affinity(self, community_a: str, community_b: str) -> float:
|
|
1207
|
+
"""
|
|
1208
|
+
Get the affinity score between two communities.
|
|
1209
|
+
Returns 0.0 if no affinity data exists.
|
|
1210
|
+
"""
|
|
1211
|
+
cache_key = f"{min(community_a, community_b)}_{max(community_a, community_b)}"
|
|
1212
|
+
|
|
1213
|
+
if cache_key in self._affinity_cache:
|
|
1214
|
+
return self._affinity_cache[cache_key]
|
|
1215
|
+
|
|
1216
|
+
aql = """
|
|
1217
|
+
FOR a IN @@affinity_col
|
|
1218
|
+
FILTER a.algorithm == @algorithm
|
|
1219
|
+
FILTER (a.community_a == @comm_a AND a.community_b == @comm_b)
|
|
1220
|
+
OR (a.community_a == @comm_b AND a.community_b == @comm_a)
|
|
1221
|
+
RETURN a.affinity_score
|
|
1222
|
+
"""
|
|
1223
|
+
result = list(self.db.aql.execute(
|
|
1224
|
+
aql,
|
|
1225
|
+
bind_vars={
|
|
1226
|
+
"@affinity_col": self.affinity_col,
|
|
1227
|
+
"algorithm": self.algorithm,
|
|
1228
|
+
"comm_a": community_a,
|
|
1229
|
+
"comm_b": community_b,
|
|
1230
|
+
}
|
|
1231
|
+
))
|
|
1232
|
+
|
|
1233
|
+
affinity = result[0] if result else 0.0
|
|
1234
|
+
self._affinity_cache[cache_key] = affinity
|
|
1235
|
+
return affinity
|
|
1236
|
+
|
|
1237
|
+
def get_bridges_from_community(self, community_id: str, min_strength: int = 1) -> List[dict]:
|
|
1238
|
+
"""Get all bridge entities from a specific community."""
|
|
1239
|
+
aql = """
|
|
1240
|
+
FOR b IN @@bridge_col
|
|
1241
|
+
FILTER b.algorithm == @algorithm
|
|
1242
|
+
FILTER b.home_community == @community_id
|
|
1243
|
+
FILTER b.bridge_strength >= @min_strength
|
|
1244
|
+
SORT b.bridge_strength DESC
|
|
1245
|
+
RETURN b
|
|
1246
|
+
"""
|
|
1247
|
+
return list(self.db.aql.execute(
|
|
1248
|
+
aql,
|
|
1249
|
+
bind_vars={
|
|
1250
|
+
"@bridge_col": self.bridge_col,
|
|
1251
|
+
"algorithm": self.algorithm,
|
|
1252
|
+
"community_id": community_id,
|
|
1253
|
+
"min_strength": min_strength,
|
|
1254
|
+
}
|
|
1255
|
+
))
|
|
1256
|
+
|
|
1257
|
+
def get_top_bridges(self, limit: int = 20) -> List[dict]:
|
|
1258
|
+
"""Get the top bridge entities by bridge strength."""
|
|
1259
|
+
aql = """
|
|
1260
|
+
FOR b IN @@bridge_col
|
|
1261
|
+
FILTER b.algorithm == @algorithm
|
|
1262
|
+
SORT b.bridge_strength DESC
|
|
1263
|
+
LIMIT @limit
|
|
1264
|
+
RETURN b
|
|
1265
|
+
"""
|
|
1266
|
+
return list(self.db.aql.execute(
|
|
1267
|
+
aql,
|
|
1268
|
+
bind_vars={
|
|
1269
|
+
"@bridge_col": self.bridge_col,
|
|
1270
|
+
"algorithm": self.algorithm,
|
|
1271
|
+
"limit": limit,
|
|
1272
|
+
}
|
|
1273
|
+
))
|
|
1274
|
+
|
|
1275
|
+
def get_strongest_affinities(self, limit: int = 20) -> List[dict]:
|
|
1276
|
+
"""Get the strongest inter-community affinities."""
|
|
1277
|
+
aql = """
|
|
1278
|
+
FOR a IN @@affinity_col
|
|
1279
|
+
FILTER a.algorithm == @algorithm
|
|
1280
|
+
SORT a.affinity_score DESC
|
|
1281
|
+
LIMIT @limit
|
|
1282
|
+
RETURN a
|
|
1283
|
+
"""
|
|
1284
|
+
return list(self.db.aql.execute(
|
|
1285
|
+
aql,
|
|
1286
|
+
bind_vars={
|
|
1287
|
+
"@affinity_col": self.affinity_col,
|
|
1288
|
+
"algorithm": self.algorithm,
|
|
1289
|
+
"limit": limit,
|
|
1290
|
+
}
|
|
1291
|
+
))
|
|
1292
|
+
|
|
1293
|
+
# --------------------------
|
|
1294
|
+
# Cross-community traversal
|
|
1295
|
+
# --------------------------
|
|
1296
|
+
|
|
1297
|
+
def _iter_neighbors_global(self, node: NodeId, direction: str) -> Iterable[EdgeView]:
|
|
1298
|
+
"""
|
|
1299
|
+
Iterate neighbors with cross-community awareness.
|
|
1300
|
+
|
|
1301
|
+
- Gets all neighbors (no community restriction)
|
|
1302
|
+
- Detects cross-community edges
|
|
1303
|
+
- Applies bonus/penalty based on affinity
|
|
1304
|
+
"""
|
|
1305
|
+
assert direction in ("OUTBOUND", "INBOUND")
|
|
1306
|
+
|
|
1307
|
+
# Get source node's community
|
|
1308
|
+
source_community = self.get_entity_community(node)
|
|
1309
|
+
|
|
1310
|
+
# Get all neighbors
|
|
1311
|
+
aql = f"""
|
|
1312
|
+
FOR v, e IN 1..1 {direction} @node {self.edges_col}
|
|
1313
|
+
LET rel = e[@rel_prop]
|
|
1314
|
+
LET base_weight = HAS(e, @w_prop) && IS_NUMBER(e[@w_prop]) ? e[@w_prop] : 1.0
|
|
1315
|
+
RETURN {{
|
|
1316
|
+
v_id: v._id,
|
|
1317
|
+
v_key: v._key,
|
|
1318
|
+
rel: rel,
|
|
1319
|
+
base_weight: base_weight,
|
|
1320
|
+
edge_id: e._id,
|
|
1321
|
+
sources: []
|
|
1322
|
+
}}
|
|
1323
|
+
"""
|
|
1324
|
+
|
|
1325
|
+
cursor = self.db.aql.execute(
|
|
1326
|
+
aql,
|
|
1327
|
+
bind_vars={
|
|
1328
|
+
"node": node,
|
|
1329
|
+
"rel_prop": self.rel_prop,
|
|
1330
|
+
"w_prop": self.w_prop,
|
|
1331
|
+
},
|
|
1332
|
+
batch_size=self.aql_batch_size,
|
|
1333
|
+
stream=self.aql_stream,
|
|
1334
|
+
)
|
|
1335
|
+
|
|
1336
|
+
for d in cursor:
|
|
1337
|
+
neighbor_id = d["v_id"]
|
|
1338
|
+
base_weight = float(d["base_weight"])
|
|
1339
|
+
|
|
1340
|
+
# Check if this is a cross-community edge
|
|
1341
|
+
neighbor_community = self.get_entity_community(neighbor_id)
|
|
1342
|
+
|
|
1343
|
+
weight = base_weight
|
|
1344
|
+
is_cross_community = False
|
|
1345
|
+
|
|
1346
|
+
if source_community and neighbor_community and source_community != neighbor_community:
|
|
1347
|
+
is_cross_community = True
|
|
1348
|
+
|
|
1349
|
+
# Get affinity between communities
|
|
1350
|
+
affinity = self.get_affinity(source_community, neighbor_community)
|
|
1351
|
+
|
|
1352
|
+
# Apply cross-community scoring
|
|
1353
|
+
if affinity >= self.min_affinity_threshold:
|
|
1354
|
+
# Bonus for crossing to well-connected communities
|
|
1355
|
+
weight = base_weight * self.cross_community_bonus * (1 + affinity)
|
|
1356
|
+
else:
|
|
1357
|
+
# Penalty for crossing to poorly-connected communities
|
|
1358
|
+
weight = base_weight * 0.5
|
|
1359
|
+
|
|
1360
|
+
yield EdgeView(
|
|
1361
|
+
neighbor_id=neighbor_id,
|
|
1362
|
+
relation=d["rel"],
|
|
1363
|
+
weight=weight,
|
|
1364
|
+
edge_id=d["edge_id"],
|
|
1365
|
+
valid_from=None,
|
|
1366
|
+
valid_to=None,
|
|
1367
|
+
status="cross_community" if is_cross_community else "same_community",
|
|
1368
|
+
raw_confidence=None,
|
|
1369
|
+
npll_posterior=None,
|
|
1370
|
+
calibration=None,
|
|
1371
|
+
sources=d.get("sources") or [],
|
|
1372
|
+
)
|
|
1373
|
+
|
|
1374
|
+
# --------------------------
|
|
1375
|
+
# Mission-aware scoring
|
|
1376
|
+
# --------------------------
|
|
1377
|
+
|
|
1378
|
+
def score_community_crossing(
|
|
1379
|
+
self,
|
|
1380
|
+
from_community: str,
|
|
1381
|
+
to_community: str,
|
|
1382
|
+
mission: Optional[str] = None
|
|
1383
|
+
) -> float:
|
|
1384
|
+
"""
|
|
1385
|
+
Score a community crossing based on mission context.
|
|
1386
|
+
|
|
1387
|
+
Args:
|
|
1388
|
+
from_community: Source community
|
|
1389
|
+
to_community: Target community
|
|
1390
|
+
mission: Optional mission context (e.g., "fraud_detection", "patient_care")
|
|
1391
|
+
|
|
1392
|
+
Returns:
|
|
1393
|
+
Score multiplier for the crossing (>1 = valuable, <1 = not valuable)
|
|
1394
|
+
"""
|
|
1395
|
+
base_affinity = self.get_affinity(from_community, to_community)
|
|
1396
|
+
|
|
1397
|
+
if not mission:
|
|
1398
|
+
return 1.0 + base_affinity
|
|
1399
|
+
|
|
1400
|
+
# Mission-specific scoring (customize based on your domain)
|
|
1401
|
+
mission_lower = mission.lower()
|
|
1402
|
+
|
|
1403
|
+
# Example: fraud detection values Claims -> Clinical crossings
|
|
1404
|
+
if "fraud" in mission_lower:
|
|
1405
|
+
# This would need actual community type detection
|
|
1406
|
+
# For now, just boost high-affinity crossings
|
|
1407
|
+
return (1.0 + base_affinity) * 1.5
|
|
1408
|
+
|
|
1409
|
+
# Example: patient care values Clinical -> Lab crossings
|
|
1410
|
+
if "patient" in mission_lower or "clinical" in mission_lower:
|
|
1411
|
+
return (1.0 + base_affinity) * 1.3
|
|
1412
|
+
|
|
1413
|
+
return 1.0 + base_affinity
|
|
1414
|
+
|
|
1415
|
+
def clear_cache(self):
|
|
1416
|
+
"""Clear internal caches (useful after data updates)."""
|
|
1417
|
+
self._bridge_cache.clear()
|
|
1418
|
+
self._affinity_cache.clear()
|