odin-engine 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (63)
  1. benchmarks/__init__.py +17 -17
  2. benchmarks/datasets.py +284 -284
  3. benchmarks/metrics.py +275 -275
  4. benchmarks/run_ablation.py +279 -279
  5. benchmarks/run_npll_benchmark.py +270 -270
  6. npll/__init__.py +10 -10
  7. npll/bootstrap.py +474 -474
  8. npll/core/__init__.py +33 -33
  9. npll/core/knowledge_graph.py +308 -308
  10. npll/core/logical_rules.py +496 -496
  11. npll/core/mln.py +474 -474
  12. npll/inference/__init__.py +40 -40
  13. npll/inference/e_step.py +419 -419
  14. npll/inference/elbo.py +434 -434
  15. npll/inference/m_step.py +576 -576
  16. npll/npll_model.py +631 -631
  17. npll/scoring/__init__.py +42 -42
  18. npll/scoring/embeddings.py +441 -441
  19. npll/scoring/probability.py +402 -402
  20. npll/scoring/scoring_module.py +369 -369
  21. npll/training/__init__.py +24 -24
  22. npll/training/evaluation.py +496 -496
  23. npll/training/npll_trainer.py +520 -520
  24. npll/utils/__init__.py +47 -47
  25. npll/utils/batch_utils.py +492 -492
  26. npll/utils/config.py +144 -144
  27. npll/utils/math_utils.py +338 -338
  28. odin/__init__.py +21 -20
  29. odin/engine.py +264 -264
  30. odin/schema.py +210 -0
  31. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/METADATA +503 -456
  32. odin_engine-0.2.0.dist-info/RECORD +63 -0
  33. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/licenses/LICENSE +21 -21
  34. retrieval/__init__.py +50 -50
  35. retrieval/adapters.py +140 -140
  36. retrieval/adapters_arango.py +1418 -1418
  37. retrieval/aggregators.py +707 -707
  38. retrieval/beam.py +127 -127
  39. retrieval/budget.py +60 -60
  40. retrieval/cache.py +159 -159
  41. retrieval/confidence.py +88 -88
  42. retrieval/eval.py +49 -49
  43. retrieval/linker.py +87 -87
  44. retrieval/metrics.py +105 -105
  45. retrieval/metrics_motifs.py +36 -36
  46. retrieval/orchestrator.py +571 -571
  47. retrieval/ppr/__init__.py +12 -12
  48. retrieval/ppr/anchors.py +41 -41
  49. retrieval/ppr/bippr.py +61 -61
  50. retrieval/ppr/engines.py +257 -257
  51. retrieval/ppr/global_pr.py +76 -76
  52. retrieval/ppr/indexes.py +78 -78
  53. retrieval/ppr.py +156 -156
  54. retrieval/ppr_cache.py +25 -25
  55. retrieval/scoring.py +294 -294
  56. retrieval/utils/pii_redaction.py +36 -36
  57. retrieval/writers/__init__.py +9 -9
  58. retrieval/writers/arango_writer.py +28 -28
  59. retrieval/writers/base.py +21 -21
  60. retrieval/writers/janus_writer.py +36 -36
  61. odin_engine-0.1.0.dist-info/RECORD +0 -62
  62. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/WHEEL +0 -0
  63. {odin_engine-0.1.0.dist-info → odin_engine-0.2.0.dist-info}/top_level.txt +0 -0
retrieval/adapters_arango.py
@@ -1,1418 +1,1418 @@
1
- from __future__ import annotations
2
- from typing import Iterable, Tuple, Optional, List, Dict, Any, NamedTuple
3
-
4
- from .adapters import GraphAccessor, NodeId, RelId
5
-
6
-
7
- class EdgeView(NamedTuple):
8
- neighbor_id: NodeId
9
- relation: RelId
10
- weight: float # structural effective weight
11
- edge_id: str
12
- valid_from: Optional[str]
13
- valid_to: Optional[str]
14
- status: Optional[str]
15
- raw_confidence: Optional[float]
16
- npll_posterior: Optional[float]
17
- calibration: Optional[float]
18
- sources: List[str] # doc/text ids from inline fields & EXTRACTED_FROM
19
-
20
-
21
- class ArangoCommunityAccessor(GraphAccessor):
22
- """
23
- Arango-backed GraphAccessor for a single community.
24
-
25
- Defaults match your schema:
26
- - nodes: ExtractedEntities
27
- - edges: ExtractedRelationships (field: relationship, created_at)
28
- - community via mapping: EntityCommunities(entity_id, community_id)
29
- - provenance: inline fields + EXTRACTED_FROM (entity -> Documents/TextBlocks)
30
-
31
- Structural weight only by default:
32
- w_struct = base_weight * type_prior(relation) * recency_decay
33
- (Set fuse_edge_confidence=True to multiply raw_confidence * npll_posterior * calibration in-adapter.)
34
- """
35
-
36
- def __init__(
37
- self,
38
- db,
39
- community_id: str,
40
- # Collections
41
- nodes_collection: str = "ExtractedEntities",
42
- edges_collection: str = "ExtractedRelationships",
43
- # Core field names
44
- relation_property: str = "relationship",
45
- weight_property: str = "weight",
46
- node_type_property: str = "type",
47
- # Time fields
48
- edge_timestamp_property: str = "created_at",
49
- edge_valid_from_property: Optional[str] = "valid_from",
50
- edge_valid_to_property: Optional[str] = "valid_to",
51
- edge_status_property: Optional[str] = "status",
52
- # Community scoping (mapping mode by default)
53
- community_mode: str = "mapping", # "mapping" | "property"
54
- community_property: str = "community_id", # only used if community_mode == "property"
55
- membership_collection: str = "EntityCommunities",
56
- membership_entity_field: str = "entity_id",
57
- membership_community_field: str = "community_id",
58
- # Dynamic constraints
59
- allowed_relations: Optional[List[str]] = None,
60
- disallowed_relations: Optional[List[str]] = None,
61
- allowed_neighbor_types: Optional[List[str]] = None,
62
- # Time filters
63
- time_window: Optional[Tuple[str, str]] = None, # (start_iso, end_iso)
64
- as_of: Optional[str] = None, # ISO timestamp for "as of"
65
- current_only: bool = False, # respect valid_from/valid_to around as_of
66
- recency_half_life_days: Optional[float] = 90.0, # None disables recency decay
67
- # Priors
68
- type_priors: Optional[Dict[str, float]] = None, # e.g., {"assessor": 1.1}
69
- # Provenance
70
- edge_provenance_fields: Optional[List[str]] = None, # defaults: ["source_document_id","source_text_id"]
71
- provenance_edge_collection: Optional[str] = "EXTRACTED_FROM",
72
- provenance_target_collections: Optional[List[str]] = None, # defaults: ["Documents","TextBlocks"]
73
- # Confidence fusion (usually False; you do NPLL in engine)
74
- fuse_edge_confidence: bool = False,
75
- missing_confidence_prior: float = 1.0,
76
- edge_raw_confidence_property: Optional[str] = "raw_confidence",
77
- edge_npll_posterior_property: Optional[str] = "npll_posterior",
78
- edge_calibration_property: Optional[str] = "calibration",
79
- # Performance
80
- aql_batch_size: int = 1000,
81
- aql_stream: bool = True,
82
- outbound_index_hint: Optional[str] = None, # e.g. "edges_from_rel_ts"
83
- inbound_index_hint: Optional[str] = None, # e.g. "edges_to_rel_ts"
84
- # Bridge / GNN integration
85
- bridge_collection: str = "BridgeEntities",
86
- affinity_collection: str = "CommunityAffinity",
87
- algorithm: str = "gnn", # Default to GNN as per pipeline
88
- ):
89
- self.db = db
90
- self._cid = community_id
91
- self.bridge_col = bridge_collection
92
- self.affinity_col = affinity_collection
93
- self.algorithm = algorithm
94
- self._bridge_cache: Dict[str, Optional[dict]] = {}
95
- self._affinity_cache: Dict[str, float] = {}
96
-
97
- self.nodes_col = nodes_collection
98
- self.edges_col = edges_collection
99
-
100
- self.rel_prop = relation_property
101
- self.w_prop = weight_property
102
- self.node_type_prop = node_type_property
103
-
104
- self.ts_prop = edge_timestamp_property
105
- self.edge_valid_from_prop = edge_valid_from_property
106
- self.edge_valid_to_prop = edge_valid_to_property
107
- self.edge_status_prop = edge_status_property
108
-
109
- self.community_mode = community_mode
110
- self.community_prop = community_property
111
- self.membership_col = membership_collection
112
- self.memb_ent_field = membership_entity_field
113
- self.memb_com_field = membership_community_field
114
-
115
- self.allowed_relations = allowed_relations
116
- self.disallowed_relations = disallowed_relations
117
- self.allowed_neighbor_types = allowed_neighbor_types
118
-
119
- self.time_window = time_window
120
- self.as_of = as_of
121
- self.current_only = current_only
122
- self.recency_half_life_days = recency_half_life_days
123
-
124
- self.type_priors = type_priors or {}
125
-
126
- self.edge_prov_fields = edge_provenance_fields or ["source_document_id", "source_text_id"]
127
- self.prov_edges_col = provenance_edge_collection
128
- self.prov_target_cols = provenance_target_collections or ["Documents", "TextBlocks"]
129
-
130
- self.fuse_edge_confidence = fuse_edge_confidence
131
- self.missing_confidence_prior = missing_confidence_prior
132
- self.edge_raw_conf_prop = edge_raw_confidence_property
133
- self.edge_npll_post_prop = edge_npll_posterior_property
134
- self.edge_calibration_prop = edge_calibration_property
135
-
136
- self.aql_batch_size = aql_batch_size
137
- self.aql_stream = aql_stream
138
- self.outbound_index_hint = outbound_index_hint
139
- self.inbound_index_hint = inbound_index_hint
140
-
141
- # --------------------------
142
- # Back-compatible core API
143
- # --------------------------
144
- def iter_out(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
145
- for ev in self._iter_neighbors(node, direction="OUTBOUND", rich=True):
146
- yield ev.neighbor_id, ev.relation, ev.weight
147
-
148
- def iter_in(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
149
- for ev in self._iter_neighbors(node, direction="INBOUND", rich=True):
150
- yield ev.neighbor_id, ev.relation, ev.weight
151
-
152
- def nodes(self, community_id: Optional[str] = None) -> Iterable[NodeId]:
153
- """
154
- Return all node IDs in this community.
155
- - mapping mode: EntityCommunities -> entity_id
156
- - property mode: filter ExtractedEntities by community_id field (if you add it)
157
- - none mode: return all nodes
158
- """
159
- cid = community_id or self._cid
160
- if self.community_mode == "property":
161
- aql = f"""
162
- FOR v IN {self.nodes_col}
163
- FILTER v.{self.community_prop} == @cid
164
- RETURN v._id
165
- """
166
- cursor = self.db.aql.execute(
167
- aql, bind_vars={"cid": cid}, batch_size=self.aql_batch_size, stream=self.aql_stream
168
- )
169
- elif self.community_mode == "mapping":
170
- aql = f"""
171
- FOR m IN @@mcol
172
- FILTER m[@m_com] == @cid
173
- RETURN m[@m_ent]
174
- """
175
- cursor = self.db.aql.execute(
176
- aql,
177
- bind_vars={
178
- "cid": cid,
179
- "@mcol": self.membership_col,
180
- "m_ent": self.memb_ent_field,
181
- "m_com": self.memb_com_field,
182
- },
183
- batch_size=self.aql_batch_size,
184
- stream=self.aql_stream,
185
- )
186
- else: # community_mode == "none"
187
- aql = f"""
188
- FOR v IN {self.nodes_col}
189
- RETURN v._id
190
- """
191
- cursor = self.db.aql.execute(
192
- aql, batch_size=self.aql_batch_size, stream=self.aql_stream
193
- )
194
- for vid in cursor:
195
- yield vid
196
-
197
- def degree(self, node: NodeId) -> int:
198
- """Out-degree (fast)."""
199
- hint_clause = (
200
- "OPTIONS { indexHint: @idx, forceIndexHint: true }" if self.outbound_index_hint else ""
201
- )
202
- aql = f"""
203
- RETURN LENGTH(
204
- FOR e IN {self.edges_col}
205
- {hint_clause}
206
- FILTER e._from == @node
207
- RETURN 1
208
- )
209
- """
210
- bind = {"node": node}
211
- if self.outbound_index_hint:
212
- bind["idx"] = self.outbound_index_hint
213
- cur = self.db.aql.execute(aql, bind_vars=bind)
214
- return int(list(cur)[0] or 0)
215
-
216
- # --------------------------
217
- # Rich neighbor variants
218
- # --------------------------
219
- def iter_out_rich(self, node: NodeId) -> Iterable[EdgeView]:
220
- yield from self._iter_neighbors(node, direction="OUTBOUND", rich=True)
221
-
222
- def iter_in_rich(self, node: NodeId) -> Iterable[EdgeView]:
223
- yield from self._iter_neighbors(node, direction="INBOUND", rich=True)
224
-
225
- # --------------------------
226
- # Provenance helpers
227
- # --------------------------
228
- def get_edge_provenance(self, edge_id: str) -> List[str]:
229
- """
230
- Return provenance targets for a relationship edge:
231
- - inline fields (source_document_id, source_text_id)
232
- - EXTRACTED_FROM edges for either endpoint entity
233
- """
234
- # Build the provenance edges clause safely (avoid nested f-strings)
235
- prov_edges_clause = (
236
- f"""
237
- FOR p IN {self.prov_edges_col}
238
- FILTER p._from IN [e._from, e._to]
239
- RETURN p._to
240
- """
241
- if self.prov_edges_col else "[]"
242
- )
243
-
244
- aql = f"""
245
- LET e = DOCUMENT(@eid)
246
- LET inline_candidates = [{", ".join([f"e['{f}']" for f in self.edge_prov_fields])}]
247
- LET inline = (
248
- FOR x IN inline_candidates
249
- FILTER x != null
250
- RETURN x
251
- )
252
- LET via_edges = (
253
- {prov_edges_clause}
254
- )
255
- RETURN UNIQUE(APPEND(inline, via_edges))
256
- """
257
- cur = self.db.aql.execute(aql, bind_vars={"eid": edge_id}, batch_size=self.aql_batch_size, stream=self.aql_stream)
258
- out = list(cur)
259
- return out[0] if out else []
260
-
261
- def get_node(self, node_id: NodeId, fields: Optional[List[str]] = None) -> Dict[str, Any]:
262
- if fields:
263
- proj = ", ".join([f"{f}: d.{f}" for f in fields])
264
- aql = f"LET d = DOCUMENT(@id) RETURN {{ _id: d._id, {proj} }}"
265
- else:
266
- aql = "RETURN DOCUMENT(@id)"
267
- cur = self.db.aql.execute(aql, bind_vars={"id": node_id})
268
- res = list(cur)
269
- return res[0] if res else {}
270
-
271
- # --------------------------
272
- # Stats / quick analytics
273
- # --------------------------
274
- @staticmethod
275
- def get_top_n_entities_by_degree(
276
- db,
277
- edges_collection: str = "ExtractedRelationships",
278
- limit: Optional[int] = None,
279
- time_window: Optional[Tuple[str, str]] = None,
280
- time_property: str = "created_at",
281
- ) -> List[dict]:
282
- bind: Dict[str, Any] = {}
283
- where = ""
284
- if time_window:
285
- where = "FILTER HAS(e, @ts) AND e[@ts] >= @start_ts AND e[@ts] <= @end_ts"
286
- bind.update({"ts": time_property, "start_ts": time_window[0], "end_ts": time_window[1]})
287
- limit_clause = "LIMIT @lim" if limit else ""
288
- if limit:
289
- bind["lim"] = limit
290
- aql = f"""
291
- FOR e IN {edges_collection}
292
- {where}
293
- COLLECT entity = e._from WITH COUNT INTO degree
294
- SORT degree DESC
295
- {limit_clause}
296
- RETURN {{ "entity": entity, "degree": degree }}
297
- """
298
- return list(db.aql.execute(aql, bind_vars=bind))
299
-
300
- @staticmethod
301
- def get_entity_type_counts(
302
- db,
303
- nodes_collection: str = "ExtractedEntities",
304
- type_property: str = "type"
305
- ) -> List[dict]:
306
- aql = f"""
307
- FOR doc IN {nodes_collection}
308
- COLLECT t = doc.{type_property} WITH COUNT INTO c
309
- SORT c DESC
310
- RETURN {{ "type": t, "count": c }}
311
- """
312
- return list(db.aql.execute(aql))
313
-
314
- @staticmethod
315
- def get_relationship_type_counts(
316
- db,
317
- edges_collection: str = "ExtractedRelationships",
318
- relation_property: str = "relationship",
319
- time_window: Optional[Tuple[str, str]] = None,
320
- time_property: str = "created_at",
321
- ) -> List[dict]:
322
- bind: Dict[str, Any] = {"rel_prop": relation_property}
323
- where = "FILTER HAS(rel, @rel_prop)"
324
- if time_window:
325
- where += " AND HAS(rel, @ts) AND rel[@ts] >= @start_ts AND rel[@ts] <= @end_ts"
326
- bind.update({"ts": time_property, "start_ts": time_window[0], "end_ts": time_window[1]})
327
- aql = f"""
328
- FOR rel IN {edges_collection}
329
- {where}
330
- COLLECT t = rel[@rel_prop] WITH COUNT INTO c
331
- SORT c DESC
332
- RETURN {{ "type": t, "count": c }}
333
- """
334
- return list(db.aql.execute(aql, bind_vars=bind))
335
-
336
- @staticmethod
337
- def get_community_summaries(
338
- db,
339
- communities_collection: str = "Communities",
340
- limit: Optional[int] = None,
341
- skip: int = 0,
342
- require_summary: bool = True
343
- ) -> List[dict]:
344
- filter_clause = "FILTER c.summary != null AND c.summary != ''" if require_summary else "FILTER c.summary == null OR c.summary == ''"
345
- limit_clause = "LIMIT @skip, @limit" if limit is not None else ""
346
- bind: Dict[str, Any] = {}
347
- if limit is not None:
348
- bind.update({"skip": skip, "limit": limit})
349
- aql = f"""
350
- FOR c IN {communities_collection}
351
- {filter_clause}
352
- SORT c.community_id ASC
353
- {limit_clause}
354
- RETURN {{ id: c.community_id, summary: c.summary, size: c.size, level: c.level }}
355
- """
356
- return list(db.aql.execute(aql, bind_vars=bind))
357
-
358
- @staticmethod
359
- def get_unique_table_headers(
360
- db,
361
- tables_collection: str = "Tables",
362
- headers_property: str = "headers"
363
- ) -> List[List[str]]:
364
- aql = f"""
365
- FOR t IN {tables_collection}
366
- FILTER HAS(t, @hp)
367
- COLLECT h = t[@hp]
368
- RETURN h
369
- """
370
- return list(db.aql.execute(aql, bind_vars={"hp": headers_property}))
371
-
372
- # --------------------------
373
- # Bridge / GNN Integration Methods (Mirrored from GlobalGraphAccessor)
374
- # --------------------------
375
-
376
- def is_bridge(self, entity_key: str) -> Optional[dict]:
377
- """
378
- Check if an entity is a bridge and return its bridge data.
379
- Uses caching for performance.
380
- """
381
- # Strip collection if present to get key
382
- if "/" in entity_key:
383
- entity_key = entity_key.split("/")[-1]
384
-
385
- if entity_key in self._bridge_cache:
386
- return self._bridge_cache[entity_key]
387
-
388
- aql = """
389
- FOR b IN @@bridge_col
390
- FILTER b.entity_key == @entity_key
391
- FILTER b.algorithm == @algorithm
392
- RETURN b
393
- """
394
- try:
395
- result = list(self.db.aql.execute(
396
- aql,
397
- bind_vars={
398
- "@bridge_col": self.bridge_col,
399
- "entity_key": entity_key,
400
- "algorithm": self.algorithm,
401
- }
402
- ))
403
- bridge_data = result[0] if result else None
404
- except Exception:
405
- # Fallback if collection doesn't exist yet
406
- bridge_data = None
407
-
408
- self._bridge_cache[entity_key] = bridge_data
409
- return bridge_data
410
-
411
- def get_entity_community(self, entity_id: str) -> Optional[str]:
412
- """Get the community ID for an entity."""
413
- # For ArangoCommunityAccessor, we might know the community if mode is 'mapping'
414
- # But we should check the mapping collection to be sure (or if it's a bridge to another community)
415
-
416
- # If we are in 'mapping' mode, we can query membership collection
417
- if self.community_mode == "mapping":
418
- aql = f"""
419
- FOR m IN {self.membership_col}
420
- FILTER m.{self.memb_ent_field} == @entity_id
421
- // We don't filter by algorithm here usually, but if needed we can
422
- RETURN m.{self.memb_com_field}
423
- """
424
- try:
425
- result = list(self.db.aql.execute(aql, bind_vars={"entity_id": entity_id}))
426
- return result[0] if result else None
427
- except Exception:
428
- return None
429
- return None
430
-
431
- def get_affinity(self, community_a: str, community_b: str) -> float:
432
- """
433
- Get the affinity score between two communities.
434
- Returns 0.0 if no affinity data exists.
435
- """
436
- if not community_a or not community_b:
437
- return 0.0
438
-
439
- cache_key = f"{min(community_a, community_b)}_{max(community_a, community_b)}"
440
-
441
- if cache_key in self._affinity_cache:
442
- return self._affinity_cache[cache_key]
443
-
444
- aql = """
445
- FOR a IN @@affinity_col
446
- FILTER a.algorithm == @algorithm
447
- FILTER (a.community_a == @comm_a AND a.community_b == @comm_b)
448
- OR (a.community_a == @comm_b AND a.community_b == @comm_a)
449
- RETURN a.affinity_score
450
- """
451
- try:
452
- result = list(self.db.aql.execute(
453
- aql,
454
- bind_vars={
455
- "@affinity_col": self.affinity_col,
456
- "algorithm": self.algorithm,
457
- "comm_a": community_a,
458
- "comm_b": community_b,
459
- }
460
- ))
461
- affinity = result[0] if result else 0.0
462
- except Exception:
463
- affinity = 0.0
464
-
465
- self._affinity_cache[cache_key] = affinity
466
- return affinity
467
-
468
- def clear_bridge_cache(self):
469
- """Clear bridge/affinity caches."""
470
- self._bridge_cache.clear()
471
- self._affinity_cache.clear()
472
-
473
- # ════════════════════════════════════════════════════════════════
474
- # DISCOVERY ENTRY POINTS (for autonomous insight discovery)
475
- # ════════════════════════════════════════════════════════════════
476
-
477
- @staticmethod
478
- def get_top_entities_in_community(
479
- db,
480
- community_id: str,
481
- membership_collection: str = "EntityCommunities",
482
- membership_entity_field: str = "entity_id",
483
- membership_community_field: str = "community_id",
484
- edges_collection: str = "ExtractedRelationships",
485
- limit: int = 20,
486
- ) -> List[dict]:
487
- """
488
- Get top entities by degree WITHIN a specific community.
489
- Essential for autonomous discovery - provides high-value seed nodes.
490
-
491
- Returns:
492
- List of {entity: str, degree: int}
493
- """
494
- aql = """
495
- LET community_entities = (
496
- FOR m IN @@membership
497
- FILTER m[@m_com] == @cid
498
- RETURN m[@m_ent]
499
- )
500
- FOR e IN @@edges
501
- FILTER e._from IN community_entities
502
- COLLECT entity = e._from WITH COUNT INTO degree
503
- SORT degree DESC
504
- LIMIT @limit
505
- RETURN { entity: entity, degree: degree }
506
- """
507
- return list(db.aql.execute(aql, bind_vars={
508
- "@membership": membership_collection,
509
- "@edges": edges_collection,
510
- "m_ent": membership_entity_field,
511
- "m_com": membership_community_field,
512
- "cid": community_id,
513
- "limit": limit,
514
- }))
515
-
516
- @staticmethod
517
- def get_recent_entities(
518
- db,
519
- since: str, # ISO timestamp
520
- community_id: Optional[str] = None,
521
- nodes_collection: str = "ExtractedEntities",
522
- membership_collection: str = "EntityCommunities",
523
- membership_entity_field: str = "entity_id",
524
- membership_community_field: str = "community_id",
525
- created_at_property: str = "created_at",
526
- updated_at_property: str = "updated_at",
527
- limit: int = 100,
528
- ) -> List[dict]:
529
- """
530
- Get entities created or updated since a timestamp.
531
- Critical for daily discovery - "what's new since yesterday?"
532
-
533
- Args:
534
- since: ISO timestamp (e.g., "2026-01-11T00:00:00Z")
535
- community_id: Optional community filter
536
-
537
- Returns:
538
- List of {entity: str, created_at: str, type: str}
539
- """
540
- bind: Dict[str, Any] = {
541
- "since": since,
542
- "limit": limit,
543
- "created_prop": created_at_property,
544
- "updated_prop": updated_at_property,
545
- }
546
-
547
- community_filter = ""
548
- if community_id:
549
- community_filter = """
550
- LET community_entities = (
551
- FOR m IN @@membership
552
- FILTER m[@m_com] == @cid
553
- RETURN m[@m_ent]
554
- )
555
- FILTER e._id IN community_entities
556
- """
557
- bind["@membership"] = membership_collection
558
- bind["m_ent"] = membership_entity_field
559
- bind["m_com"] = membership_community_field
560
- bind["cid"] = community_id
561
-
562
- aql = f"""
563
- FOR e IN {nodes_collection}
564
- FILTER (HAS(e, @created_prop) AND e[@created_prop] >= @since)
565
- OR (HAS(e, @updated_prop) AND e[@updated_prop] >= @since)
566
- {community_filter}
567
- SORT HAS(e, @created_prop) ? e[@created_prop] : e[@updated_prop] DESC
568
- LIMIT @limit
569
- RETURN {{
570
- entity: e._id,
571
- created_at: HAS(e, @created_prop) ? e[@created_prop] : null,
572
- updated_at: HAS(e, @updated_prop) ? e[@updated_prop] : null,
573
- type: e.type,
574
- name: e.name
575
- }}
576
- """
577
- return list(db.aql.execute(aql, bind_vars=bind))
578
-
579
- @staticmethod
580
- def search_entities(
581
- db,
582
- query: str,
583
- community_id: Optional[str] = None,
584
- nodes_collection: str = "ExtractedEntities",
585
- membership_collection: str = "EntityCommunities",
586
- membership_entity_field: str = "entity_id",
587
- membership_community_field: str = "community_id",
588
- search_fields: List[str] = None,
589
- limit: int = 20,
590
- ) -> List[dict]:
591
- """
592
- Text search for entities matching query.
593
- Uses LIKE for simple text matching (can be upgraded to ArangoSearch).
594
-
595
- Args:
596
- query: Search string
597
- search_fields: Fields to search in (default: ["name", "description"])
598
-
599
- Returns:
600
- List of {entity: str, name: str, type: str, matched_field: str}
601
- """
602
- if search_fields is None:
603
- search_fields = ["name", "description"]
604
-
605
- bind: Dict[str, Any] = {
606
- "query": f"%{query.lower()}%",
607
- "limit": limit,
608
- }
609
-
610
- # Build search conditions
611
- search_conditions = []
612
- for field in search_fields:
613
- search_conditions.append(f"LOWER(e.{field}) LIKE @query")
614
- search_clause = " OR ".join(search_conditions)
615
-
616
- community_filter = ""
617
- if community_id:
618
- community_filter = """
619
- LET community_entities = (
620
- FOR m IN @@membership
621
- FILTER m[@m_com] == @cid
622
- RETURN m[@m_ent]
623
- )
624
- FILTER e._id IN community_entities
625
- """
626
- bind["@membership"] = membership_collection
627
- bind["m_ent"] = membership_entity_field
628
- bind["m_com"] = membership_community_field
629
- bind["cid"] = community_id
630
-
631
- aql = f"""
632
- FOR e IN {nodes_collection}
633
- FILTER {search_clause}
634
- {community_filter}
635
- LIMIT @limit
636
- RETURN {{
637
- entity: e._id,
638
- name: e.name,
639
- type: e.type,
640
- description: e.description
641
- }}
642
- """
643
- return list(db.aql.execute(aql, bind_vars=bind))
644
-
645
- # ════════════════════════════════════════════════════════════════
646
- # CONTENT HYDRATION (for agent reasoning)
647
- # ════════════════════════════════════════════════════════════════
648
-
649
- @staticmethod
650
- def get_document_content(
651
- db,
652
- doc_id: str,
653
- text_collection: str = "TextBlocks",
654
- table_collection: str = "Tables",
655
- image_collection: str = "Images",
656
- document_collection: str = "Documents",
657
- ) -> Optional[dict]:
658
- """
659
- Fetch content from any document collection by ID.
660
- Essential for agent reasoning - converts graph IDs to actual content.
661
-
662
- Args:
663
- doc_id: Document ID in format "CollectionName/key"
664
-
665
- Returns:
666
- Dict with type-specific content, or None if not found
667
- """
668
- try:
669
- collection, key = doc_id.split("/", 1)
670
- except ValueError:
671
- return None
672
-
673
- if collection == text_collection:
674
- aql = f"""
675
- FOR tb IN {text_collection}
676
- FILTER tb._id == @doc_id
677
- RETURN {{
678
- type: "text",
679
- text: tb.text,
680
- document_id: tb.document_id,
681
- page: tb.page,
682
- char_span: tb.char_span,
683
- metadata: tb.metadata
684
- }}
685
- """
686
- elif collection == table_collection:
687
- aql = f"""
688
- FOR t IN {table_collection}
689
- FILTER t._id == @doc_id
690
- RETURN {{
691
- type: "table",
692
- headers: t.headers,
693
- rows: t.rows,
694
- caption: t.caption,
695
- document_id: t.document_id,
696
- page: t.page,
697
- metadata: t.metadata
698
- }}
699
- """
700
- elif collection == image_collection:
701
- aql = f"""
702
- FOR img IN {image_collection}
703
- FILTER img._id == @doc_id
704
- RETURN {{
705
- type: "image",
706
- caption: img.caption,
707
- ocr_text: img.ocr_text,
708
- url: img.storage_url,
709
- document_id: img.document_id,
710
- page: img.page,
711
- metadata: img.metadata
712
- }}
713
- """
714
- elif collection == document_collection:
715
- aql = f"""
716
- FOR d IN {document_collection}
717
- FILTER d._id == @doc_id
718
- RETURN {{
719
- type: "document",
720
- filename: d.filename,
721
- content: d.content,
722
- metadata: d.metadata
723
- }}
724
- """
725
- else:
726
- return None
727
-
728
- result = list(db.aql.execute(aql, bind_vars={"doc_id": doc_id}))
729
- return result[0] if result else None
730
-
731
- @staticmethod
732
- def get_entity_sources(
733
- db,
734
- entity_id: str,
735
- extracted_from_collection: str = "EXTRACTED_FROM",
736
- max_sources: int = 10,
737
- ) -> List[dict]:
738
- """
739
- Get all source documents/blocks for an entity via EXTRACTED_FROM edges.
740
- Critical for evidence gathering - shows WHERE an entity was mentioned.
741
-
742
- Args:
743
- entity_id: Entity ID (e.g., "ExtractedEntities/ent_123")
744
- max_sources: Limit number of sources returned
745
-
746
- Returns:
747
- List of {source_id, source_type, content, char_span, confidence, metadata}
748
- """
749
- aql = f"""
750
- FOR edge IN {extracted_from_collection}
751
- FILTER edge._from == @entity_id
752
- LIMIT @max_sources
753
- LET source = DOCUMENT(edge._to)
754
- LET collection = PARSE_IDENTIFIER(edge._to).collection
755
- RETURN {{
756
- source_id: edge._to,
757
- source_type: collection,
758
- char_span: edge.char_span,
759
- extraction_confidence: edge.extraction_confidence,
760
- content: CASE
761
- WHEN collection == "TextBlocks" THEN source.text
762
- WHEN collection == "Tables" THEN {{ headers: source.headers, rows: source.rows }}
763
- WHEN collection == "Images" THEN {{ caption: source.caption, ocr_text: source.ocr_text }}
764
- WHEN collection == "Documents" THEN SUBSTRING(source.content, 0, 500)
765
- ELSE null
766
- END,
767
- metadata: {{
768
- page: source.page,
769
- document_id: source.document_id,
770
- filename: source.filename
771
- }}
772
- }}
773
- """
774
- return list(db.aql.execute(aql, bind_vars={
775
- "entity_id": entity_id,
776
- "max_sources": max_sources,
777
- }))
778
-
779
- @staticmethod
780
- def search_content(
781
- db,
782
- query: str,
783
- community_id: Optional[str] = None,
784
- content_types: List[str] = None,
785
- text_collection: str = "TextBlocks",
786
- table_collection: str = "Tables",
787
- image_collection: str = "Images",
788
- membership_collection: str = "EntityCommunities",
789
- extracted_from_collection: str = "EXTRACTED_FROM",
790
- limit: int = 10,
791
- ) -> List[dict]:
792
- """
793
- Semantic/text search across content collections.
794
- Uses simple LIKE matching (can be upgraded to ArangoSearch/vectors).
795
-
796
- Args:
797
- query: Search string
798
- content_types: Collections to search (default: ["TextBlocks", "Tables", "Images"])
799
- community_id: Optional filter to content linked to community entities
800
-
801
- Returns:
802
- List of {source_id, source_type, content, score, metadata}
803
- """
804
- if content_types is None:
805
- content_types = [text_collection, table_collection, image_collection]
806
-
807
- bind: Dict[str, Any] = {
808
- "query": f"%{query.lower()}%",
809
- "limit": limit,
810
- }
811
-
812
- results = []
813
-
814
- # Search TextBlocks
815
- if text_collection in content_types:
816
- aql_text = f"""
817
- FOR tb IN {text_collection}
818
- FILTER LOWER(tb.text) LIKE @query
819
- LIMIT @limit
820
- RETURN {{
821
- source_id: tb._id,
822
- source_type: "TextBlocks",
823
- content: tb.text,
824
- score: 1.0,
825
- metadata: {{
826
- document_id: tb.document_id,
827
- page: tb.page
828
- }}
829
- }}
830
- """
831
- results.extend(list(db.aql.execute(aql_text, bind_vars=bind)))
832
-
833
- # Search Tables (caption)
834
- if table_collection in content_types:
835
- aql_table = f"""
836
- FOR t IN {table_collection}
837
- FILTER LOWER(t.caption) LIKE @query
838
- LIMIT @limit
839
- RETURN {{
840
- source_id: t._id,
841
- source_type: "Tables",
842
- content: {{ headers: t.headers, rows: t.rows, caption: t.caption }},
843
- score: 1.0,
844
- metadata: {{
845
- document_id: t.document_id,
846
- page: t.page
847
- }}
848
- }}
849
- """
850
- results.extend(list(db.aql.execute(aql_table, bind_vars=bind)))
851
-
852
- # Search Images (OCR text)
853
- if image_collection in content_types:
854
- aql_image = f"""
855
- FOR img IN {image_collection}
856
- FILTER LOWER(img.ocr_text) LIKE @query OR LOWER(img.caption) LIKE @query
857
- LIMIT @limit
858
- RETURN {{
859
- source_id: img._id,
860
- source_type: "Images",
861
- content: {{ caption: img.caption, ocr_text: img.ocr_text }},
862
- score: 1.0,
863
- metadata: {{
864
- document_id: img.document_id,
865
- page: img.page
866
- }}
867
- }}
868
- """
869
- results.extend(list(db.aql.execute(aql_image, bind_vars=bind)))
870
-
871
- return results[:limit]
872
-
873
- # --------------------------
874
- # Internal neighbor routine
875
- # --------------------------
876
- def _iter_neighbors(self, node: NodeId, *, direction: str, rich: bool) -> Iterable[EdgeView]:
877
- assert direction in ("OUTBOUND", "INBOUND")
878
-
879
- bind: Dict[str, Any] = {
880
- "node": node,
881
- "rel_prop": self.rel_prop,
882
- "w_prop": self.w_prop,
883
- "priors_map": self.type_priors,
884
- }
885
-
886
- # Only add community ID if we're filtering by it
887
- if self.community_mode != "none":
888
- bind["cid"] = self._cid
889
-
890
- # Bind parameters are added only when referenced to avoid AQL 1552 errors
891
-
892
- hint = ""
893
- if direction == "OUTBOUND" and self.outbound_index_hint:
894
- hint = "OPTIONS { indexHint: @idx, forceIndexHint: true }"
895
- bind["idx"] = self.outbound_index_hint
896
- elif direction == "INBOUND" and self.inbound_index_hint:
897
- hint = "OPTIONS { indexHint: @idx, forceIndexHint: true }"
898
- bind["idx"] = self.inbound_index_hint
899
-
900
- filters: List[str] = []
901
-
902
- # Community filter
903
- if self.community_mode == "property":
904
- filters.append(f"v.{self.community_prop} == @cid")
905
- elif self.community_mode == "mapping":
906
- bind.update({"@mcol": self.membership_col, "m_ent": self.memb_ent_field, "m_com": self.memb_com_field})
907
- filters.append("""
908
- FIRST(
909
- FOR m IN @@mcol
910
- FILTER m[@m_com] == @cid AND m[@m_ent] == v._id
911
- LIMIT 1
912
- RETURN 1
913
- )
914
- """)
915
- # else community_mode == "none" - no filtering
916
-
917
- # Relation / neighbor type filters
918
- if self.allowed_relations:
919
- bind["allowed_relations"] = self.allowed_relations
920
- filters.append("e[@rel_prop] IN @allowed_relations")
921
- if self.disallowed_relations:
922
- bind["disallowed_relations"] = self.disallowed_relations
923
- filters.append("!(e[@rel_prop] IN @disallowed_relations)")
924
- if self.allowed_neighbor_types:
925
- bind["allowed_neighbor_types"] = self.allowed_neighbor_types
926
- filters.append(f"v.{self.node_type_prop} IN @allowed_neighbor_types")
927
-
928
- # Time window filter on edge timestamp
929
- if self.time_window and self.ts_prop:
930
- bind["start_ts"], bind["end_ts"] = self.time_window
931
- bind["ts_prop"] = self.ts_prop
932
- filters.append("HAS(e, @ts_prop) AND e[@ts_prop] >= @start_ts AND e[@ts_prop] <= @end_ts")
933
-
934
- # Current-only validity wrt as_of
935
- if self.current_only and self.as_of:
936
- bind["as_of"] = self.as_of
937
- vf_prop = self.edge_valid_from_prop or "valid_from"
938
- vt_prop = self.edge_valid_to_prop or "valid_to"
939
- filters.append(
940
- f"( (HAS(e, '{vf_prop}') ? e['{vf_prop}'] <= @as_of : true) "
941
- f"AND (HAS(e, '{vt_prop}') ? (e['{vt_prop}'] == null OR e['{vt_prop}'] >= @as_of) : true) )"
942
- )
943
-
944
- # Optional status guard
945
- status_guard = ""
946
- if self.edge_status_prop:
947
- status_guard = "LET _status = e[@status_prop]"
948
- bind["status_prop"] = self.edge_status_prop
949
-
950
- # Recency decay: 2^(- age_days / half_life)
951
- recency_clause = "1.0"
952
- if self.recency_half_life_days is not None and self.as_of and self.ts_prop:
953
- bind["half_life"] = float(self.recency_half_life_days)
954
- bind["as_of"] = self.as_of
955
- bind["ts_prop"] = self.ts_prop
956
- recency_clause = "POW(2, -1 * DATE_DIFF(@as_of, e[@ts_prop], 'days') / @half_life)"
957
-
958
- # Base weight
959
- weight_clause = "(HAS(e, @w_prop) && IS_NUMBER(e[@w_prop]) ? e[@w_prop] : 1.0)"
960
-
961
- # Confidence fusion (usually disabled; you do it in engine)
962
- conf_clause = "1.0"
963
- if self.fuse_edge_confidence:
964
- bind.update({
965
- "raw_c": self.edge_raw_conf_prop,
966
- "npll": self.edge_npll_post_prop,
967
- "calib": self.edge_calibration_prop,
968
- "miss_prior": float(self.missing_confidence_prior),
969
- })
970
- conf_clause = (
971
- "( (HAS(e, @raw_c) && IS_NUMBER(e[@raw_c]) ? e[@raw_c] : @miss_prior) * "
972
- " (HAS(e, @npll) && IS_NUMBER(e[@npll]) ? e[@npll] : @miss_prior) * "
973
- " (HAS(e, @calib) && IS_NUMBER(e[@calib]) ? e[@calib] : @miss_prior) )"
974
- )
975
-
976
- filters_str = " && ".join(filters) if filters else "true"
977
-
978
- # Build the src edges clause safely
979
- src_edges_clause = (
980
- f"""
981
- FOR p IN {self.prov_edges_col}
982
- FILTER p._from IN [e._from, e._to]
983
- RETURN p._to
984
- """
985
- if self.prov_edges_col else "[]"
986
- )
987
-
988
- aql = f"""
989
- LET priors = @priors_map
990
- FOR v, e IN 1..1 {direction} @node {self.edges_col}
991
- {hint}
992
- FILTER {filters_str}
993
- {status_guard}
994
- LET _rel = e[@rel_prop]
995
- LET _prior = TO_NUMBER(NOT_NULL(priors[_rel], 1.0))
996
- LET _base_w = {weight_clause}
997
- LET _rec = {recency_clause}
998
- LET _conf = {conf_clause}
999
- LET _w_eff = TO_NUMBER(_base_w) * TO_NUMBER(_prior) * TO_NUMBER(_rec) * TO_NUMBER(_conf)
1000
-
1001
- LET _vf = {f"e['{self.edge_valid_from_prop}']" if self.edge_valid_from_prop else 'null'}
1002
- LET _vt = {f"e['{self.edge_valid_to_prop}']" if self.edge_valid_to_prop else 'null'}
1003
- LET _status2 = {f"e['{self.edge_status_prop}']" if self.edge_status_prop else 'null'}
1004
-
1005
- // Provenance: inline fields + EXTRACTED_FROM for both endpoints
1006
- LET _src_inline_candidates = [{", ".join([f"e['{f}']" for f in self.edge_prov_fields])}]
1007
- LET _src_inline = (
1008
- FOR x IN _src_inline_candidates
1009
- FILTER x != null
1010
- RETURN x
1011
- )
1012
- LET _src_edges = (
1013
- {src_edges_clause}
1014
- )
1015
- LET _sources = UNIQUE(APPEND(_src_inline, _src_edges))
1016
-
1017
- RETURN {{
1018
- v_id: v._id,
1019
- rel: _rel,
1020
- weight: _w_eff,
1021
- edge_id: e._id,
1022
- valid_from: _vf,
1023
- valid_to: _vt,
1024
- status: _status2,
1025
- raw_confidence: {f"e['{self.edge_raw_conf_prop}']" if self.edge_raw_conf_prop else 'null'},
1026
- npll_posterior: {f"e['{self.edge_npll_post_prop}']" if self.edge_npll_post_prop else 'null'},
1027
- calibration: {f"e['{self.edge_calibration_prop}']" if self.edge_calibration_prop else 'null'},
1028
- sources: _sources
1029
- }}
1030
- """
1031
-
1032
- cursor = self.db.aql.execute(
1033
- aql,
1034
- bind_vars=bind,
1035
- batch_size=self.aql_batch_size or 1000,
1036
- stream=self.aql_stream if self.aql_stream is not None else True,
1037
- ttl=120, # 2 minute timeout for long queries
1038
- optimizer_rules=["+use-indexes"] # Force index usage
1039
- )
1040
- for d in cursor:
1041
- if rich:
1042
- yield EdgeView(
1043
- neighbor_id=d["v_id"],
1044
- relation=d["rel"],
1045
- weight=float(d["weight"]),
1046
- edge_id=d["edge_id"],
1047
- valid_from=d.get("valid_from"),
1048
- valid_to=d.get("valid_to"),
1049
- status=d.get("status"),
1050
- raw_confidence=d.get("raw_confidence"),
1051
- npll_posterior=d.get("npll_posterior"),
1052
- calibration=d.get("calibration"),
1053
- sources=d.get("sources") or [],
1054
- )
1055
- else:
1056
- yield d["v_id"], d["rel"], float(d["weight"])
1057
-
1058
-
1059
- class GlobalGraphAccessor(GraphAccessor):
1060
- """
1061
- Cross-community graph accessor using pre-computed bridge entities.
1062
-
1063
- This accessor enables intelligent traversal across community boundaries
1064
- by leveraging the BridgeEntities and CommunityAffinity collections
1065
- created during community detection.
1066
-
1067
- Key features:
1068
- - Uses bridge entities to efficiently cross community boundaries
1069
- - Scores cross-community paths using affinity scores
1070
- - Mission-aware: can weight community crossings based on context
1071
- - Maintains all ArangoCommunityAccessor features
1072
- """
1073
-
1074
- def __init__(
1075
- self,
1076
- db,
1077
- algorithm: str = "leiden",
1078
- # Base accessor settings
1079
- nodes_collection: str = "ExtractedEntities",
1080
- edges_collection: str = "ExtractedRelationships",
1081
- relation_property: str = "relationship",
1082
- weight_property: str = "weight",
1083
- # Bridge collections
1084
- bridge_collection: str = "BridgeEntities",
1085
- affinity_collection: str = "CommunityAffinity",
1086
- membership_collection: str = "EntityCommunities",
1087
- # Cross-community scoring
1088
- cross_community_bonus: float = 1.5, # Boost for cross-community edges (often valuable)
1089
- min_affinity_threshold: float = 0.0, # Minimum affinity to allow crossing
1090
- # Performance
1091
- aql_batch_size: int = 1000,
1092
- aql_stream: bool = True,
1093
- ):
1094
- self.db = db
1095
- self.algorithm = algorithm
1096
-
1097
- self.nodes_col = nodes_collection
1098
- self.edges_col = edges_collection
1099
- self.rel_prop = relation_property
1100
- self.w_prop = weight_property
1101
-
1102
- self.bridge_col = bridge_collection
1103
- self.affinity_col = affinity_collection
1104
- self.membership_col = membership_collection
1105
-
1106
- self.cross_community_bonus = cross_community_bonus
1107
- self.min_affinity_threshold = min_affinity_threshold
1108
-
1109
- self.aql_batch_size = aql_batch_size
1110
- self.aql_stream = aql_stream
1111
-
1112
- # Cache for bridge status and affinities
1113
- self._bridge_cache: Dict[str, Optional[dict]] = {}
1114
- self._affinity_cache: Dict[str, float] = {}
1115
-
1116
- # --------------------------
1117
- # Core traversal API
1118
- # --------------------------
1119
-
1120
- def iter_out(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
1121
- """Iterate outbound edges, scoring cross-community edges appropriately."""
1122
- for ev in self._iter_neighbors_global(node, direction="OUTBOUND"):
1123
- yield ev.neighbor_id, ev.relation, ev.weight
1124
-
1125
- def iter_in(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
1126
- """Iterate inbound edges, scoring cross-community edges appropriately."""
1127
- for ev in self._iter_neighbors_global(node, direction="INBOUND"):
1128
- yield ev.neighbor_id, ev.relation, ev.weight
1129
-
1130
- def iter_out_rich(self, node: NodeId) -> Iterable[EdgeView]:
1131
- """Rich outbound edges with cross-community metadata."""
1132
- yield from self._iter_neighbors_global(node, direction="OUTBOUND")
1133
-
1134
- def iter_in_rich(self, node: NodeId) -> Iterable[EdgeView]:
1135
- """Rich inbound edges with cross-community metadata."""
1136
- yield from self._iter_neighbors_global(node, direction="INBOUND")
1137
-
1138
- def nodes(self) -> Iterable[NodeId]:
1139
- """Return all nodes (no community restriction)."""
1140
- aql = f"FOR v IN {self.nodes_col} RETURN v._id"
1141
- cursor = self.db.aql.execute(aql, batch_size=self.aql_batch_size, stream=self.aql_stream)
1142
- for vid in cursor:
1143
- yield vid
1144
-
1145
- def degree(self, node: NodeId) -> int:
1146
- """Out-degree of a node."""
1147
- aql = f"""
1148
- RETURN LENGTH(
1149
- FOR e IN {self.edges_col}
1150
- FILTER e._from == @node
1151
- RETURN 1
1152
- )
1153
- """
1154
- cur = self.db.aql.execute(aql, bind_vars={"node": node})
1155
- return int(list(cur)[0] or 0)
1156
-
1157
- # --------------------------
1158
- # Bridge-aware methods
1159
- # --------------------------
1160
-
1161
- def is_bridge(self, entity_key: str) -> Optional[dict]:
1162
- """
1163
- Check if an entity is a bridge and return its bridge data.
1164
- Uses caching for performance.
1165
- """
1166
- if entity_key in self._bridge_cache:
1167
- return self._bridge_cache[entity_key]
1168
-
1169
- aql = """
1170
- FOR b IN @@bridge_col
1171
- FILTER b.entity_key == @entity_key
1172
- FILTER b.algorithm == @algorithm
1173
- RETURN b
1174
- """
1175
- result = list(self.db.aql.execute(
1176
- aql,
1177
- bind_vars={
1178
- "@bridge_col": self.bridge_col,
1179
- "entity_key": entity_key,
1180
- "algorithm": self.algorithm,
1181
- }
1182
- ))
1183
-
1184
- bridge_data = result[0] if result else None
1185
- self._bridge_cache[entity_key] = bridge_data
1186
- return bridge_data
1187
-
1188
- def get_entity_community(self, entity_id: str) -> Optional[str]:
1189
- """Get the community ID for an entity."""
1190
- aql = """
1191
- FOR m IN @@membership_col
1192
- FILTER m.entity_id == @entity_id
1193
- FILTER m.algorithm == @algorithm
1194
- RETURN m.community_id
1195
- """
1196
- result = list(self.db.aql.execute(
1197
- aql,
1198
- bind_vars={
1199
- "@membership_col": self.membership_col,
1200
- "entity_id": entity_id,
1201
- "algorithm": self.algorithm,
1202
- }
1203
- ))
1204
- return result[0] if result else None
1205
-
1206
- def get_affinity(self, community_a: str, community_b: str) -> float:
1207
- """
1208
- Get the affinity score between two communities.
1209
- Returns 0.0 if no affinity data exists.
1210
- """
1211
- cache_key = f"{min(community_a, community_b)}_{max(community_a, community_b)}"
1212
-
1213
- if cache_key in self._affinity_cache:
1214
- return self._affinity_cache[cache_key]
1215
-
1216
- aql = """
1217
- FOR a IN @@affinity_col
1218
- FILTER a.algorithm == @algorithm
1219
- FILTER (a.community_a == @comm_a AND a.community_b == @comm_b)
1220
- OR (a.community_a == @comm_b AND a.community_b == @comm_a)
1221
- RETURN a.affinity_score
1222
- """
1223
- result = list(self.db.aql.execute(
1224
- aql,
1225
- bind_vars={
1226
- "@affinity_col": self.affinity_col,
1227
- "algorithm": self.algorithm,
1228
- "comm_a": community_a,
1229
- "comm_b": community_b,
1230
- }
1231
- ))
1232
-
1233
- affinity = result[0] if result else 0.0
1234
- self._affinity_cache[cache_key] = affinity
1235
- return affinity
1236
-
1237
- def get_bridges_from_community(self, community_id: str, min_strength: int = 1) -> List[dict]:
1238
- """Get all bridge entities from a specific community."""
1239
- aql = """
1240
- FOR b IN @@bridge_col
1241
- FILTER b.algorithm == @algorithm
1242
- FILTER b.home_community == @community_id
1243
- FILTER b.bridge_strength >= @min_strength
1244
- SORT b.bridge_strength DESC
1245
- RETURN b
1246
- """
1247
- return list(self.db.aql.execute(
1248
- aql,
1249
- bind_vars={
1250
- "@bridge_col": self.bridge_col,
1251
- "algorithm": self.algorithm,
1252
- "community_id": community_id,
1253
- "min_strength": min_strength,
1254
- }
1255
- ))
1256
-
1257
- def get_top_bridges(self, limit: int = 20) -> List[dict]:
1258
- """Get the top bridge entities by bridge strength."""
1259
- aql = """
1260
- FOR b IN @@bridge_col
1261
- FILTER b.algorithm == @algorithm
1262
- SORT b.bridge_strength DESC
1263
- LIMIT @limit
1264
- RETURN b
1265
- """
1266
- return list(self.db.aql.execute(
1267
- aql,
1268
- bind_vars={
1269
- "@bridge_col": self.bridge_col,
1270
- "algorithm": self.algorithm,
1271
- "limit": limit,
1272
- }
1273
- ))
1274
-
1275
- def get_strongest_affinities(self, limit: int = 20) -> List[dict]:
1276
- """Get the strongest inter-community affinities."""
1277
- aql = """
1278
- FOR a IN @@affinity_col
1279
- FILTER a.algorithm == @algorithm
1280
- SORT a.affinity_score DESC
1281
- LIMIT @limit
1282
- RETURN a
1283
- """
1284
- return list(self.db.aql.execute(
1285
- aql,
1286
- bind_vars={
1287
- "@affinity_col": self.affinity_col,
1288
- "algorithm": self.algorithm,
1289
- "limit": limit,
1290
- }
1291
- ))
1292
-
1293
- # --------------------------
1294
- # Cross-community traversal
1295
- # --------------------------
1296
-
1297
- def _iter_neighbors_global(self, node: NodeId, direction: str) -> Iterable[EdgeView]:
1298
- """
1299
- Iterate neighbors with cross-community awareness.
1300
-
1301
- - Gets all neighbors (no community restriction)
1302
- - Detects cross-community edges
1303
- - Applies bonus/penalty based on affinity
1304
- """
1305
- assert direction in ("OUTBOUND", "INBOUND")
1306
-
1307
- # Get source node's community
1308
- source_community = self.get_entity_community(node)
1309
-
1310
- # Get all neighbors
1311
- aql = f"""
1312
- FOR v, e IN 1..1 {direction} @node {self.edges_col}
1313
- LET rel = e[@rel_prop]
1314
- LET base_weight = HAS(e, @w_prop) && IS_NUMBER(e[@w_prop]) ? e[@w_prop] : 1.0
1315
- RETURN {{
1316
- v_id: v._id,
1317
- v_key: v._key,
1318
- rel: rel,
1319
- base_weight: base_weight,
1320
- edge_id: e._id,
1321
- sources: []
1322
- }}
1323
- """
1324
-
1325
- cursor = self.db.aql.execute(
1326
- aql,
1327
- bind_vars={
1328
- "node": node,
1329
- "rel_prop": self.rel_prop,
1330
- "w_prop": self.w_prop,
1331
- },
1332
- batch_size=self.aql_batch_size,
1333
- stream=self.aql_stream,
1334
- )
1335
-
1336
- for d in cursor:
1337
- neighbor_id = d["v_id"]
1338
- base_weight = float(d["base_weight"])
1339
-
1340
- # Check if this is a cross-community edge
1341
- neighbor_community = self.get_entity_community(neighbor_id)
1342
-
1343
- weight = base_weight
1344
- is_cross_community = False
1345
-
1346
- if source_community and neighbor_community and source_community != neighbor_community:
1347
- is_cross_community = True
1348
-
1349
- # Get affinity between communities
1350
- affinity = self.get_affinity(source_community, neighbor_community)
1351
-
1352
- # Apply cross-community scoring
1353
- if affinity >= self.min_affinity_threshold:
1354
- # Bonus for crossing to well-connected communities
1355
- weight = base_weight * self.cross_community_bonus * (1 + affinity)
1356
- else:
1357
- # Penalty for crossing to poorly-connected communities
1358
- weight = base_weight * 0.5
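A standalone restatement of this branch, using the defaults from __init__ (cross_community_bonus=1.5, min_affinity_threshold=0.0); the numbers in the comment are illustrative only:

    def cross_weight(base: float, affinity: float, bonus: float = 1.5, threshold: float = 0.0) -> float:
        # e.g. base=1.0, affinity=0.4 -> 1.5 * (1 + 0.4) = 2.1; below threshold the edge weight is halved
        return base * bonus * (1.0 + affinity) if affinity >= threshold else base * 0.5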
1359
-
1360
- yield EdgeView(
1361
- neighbor_id=neighbor_id,
1362
- relation=d["rel"],
1363
- weight=weight,
1364
- edge_id=d["edge_id"],
1365
- valid_from=None,
1366
- valid_to=None,
1367
- status="cross_community" if is_cross_community else "same_community",
1368
- raw_confidence=None,
1369
- npll_posterior=None,
1370
- calibration=None,
1371
- sources=d.get("sources") or [],
1372
- )
1373
-
1374
- # --------------------------
1375
- # Mission-aware scoring
1376
- # --------------------------
1377
-
1378
- def score_community_crossing(
1379
- self,
1380
- from_community: str,
1381
- to_community: str,
1382
- mission: Optional[str] = None
1383
- ) -> float:
1384
- """
1385
- Score a community crossing based on mission context.
1386
-
1387
- Args:
1388
- from_community: Source community
1389
- to_community: Target community
1390
- mission: Optional mission context (e.g., "fraud_detection", "patient_care")
1391
-
1392
- Returns:
1393
- Score multiplier for the crossing (>1 = valuable, <1 = not valuable)
1394
- """
1395
- base_affinity = self.get_affinity(from_community, to_community)
1396
-
1397
- if not mission:
1398
- return 1.0 + base_affinity
1399
-
1400
- # Mission-specific scoring (customize based on your domain)
1401
- mission_lower = mission.lower()
1402
-
1403
- # Example: fraud detection values Claims -> Clinical crossings
1404
- if "fraud" in mission_lower:
1405
- # This would need actual community type detection
1406
- # For now, just boost high-affinity crossings
1407
- return (1.0 + base_affinity) * 1.5
1408
-
1409
- # Example: patient care values Clinical -> Lab crossings
1410
- if "patient" in mission_lower or "clinical" in mission_lower:
1411
- return (1.0 + base_affinity) * 1.3
1412
-
1413
- return 1.0 + base_affinity
1414
-
1415
- def clear_cache(self):
1416
- """Clear internal caches (useful after data updates)."""
1417
- self._bridge_cache.clear()
1418
- self._affinity_cache.clear()
1
+ from __future__ import annotations
2
+ from typing import Iterable, Tuple, Optional, List, Dict, Any, NamedTuple
3
+
4
+ from .adapters import GraphAccessor, NodeId, RelId
5
+
6
+
7
+ class EdgeView(NamedTuple):
8
+ neighbor_id: NodeId
9
+ relation: RelId
10
+ weight: float # structural effective weight
11
+ edge_id: str
12
+ valid_from: Optional[str]
13
+ valid_to: Optional[str]
14
+ status: Optional[str]
15
+ raw_confidence: Optional[float]
16
+ npll_posterior: Optional[float]
17
+ calibration: Optional[float]
18
+ sources: List[str] # doc/text ids from inline fields & EXTRACTED_FROM
19
+
20
+
21
+ class ArangoCommunityAccessor(GraphAccessor):
22
+ """
23
+ Arango-backed GraphAccessor for a single community.
24
+
25
+ Defaults match your schema:
26
+ - nodes: ExtractedEntities
27
+ - edges: ExtractedRelationships (field: relationship, created_at)
28
+ - community via mapping: EntityCommunities(entity_id, community_id)
29
+ - provenance: inline fields + EXTRACTED_FROM (entity -> Documents/TextBlocks)
30
+
31
+ Structural weight only by default:
32
+ w_struct = base_weight * type_prior(relation) * recency_decay
33
+ (Set fuse_edge_confidence=True to multiply raw_confidence * npll_posterior * calibration in-adapter.)
34
+ """
35
+
36
+ def __init__(
37
+ self,
38
+ db,
39
+ community_id: str,
40
+ # Collections
41
+ nodes_collection: str = "ExtractedEntities",
42
+ edges_collection: str = "ExtractedRelationships",
43
+ # Core field names
44
+ relation_property: str = "relationship",
45
+ weight_property: str = "weight",
46
+ node_type_property: str = "type",
47
+ # Time fields
48
+ edge_timestamp_property: str = "created_at",
49
+ edge_valid_from_property: Optional[str] = "valid_from",
50
+ edge_valid_to_property: Optional[str] = "valid_to",
51
+ edge_status_property: Optional[str] = "status",
52
+ # Community scoping (mapping mode by default)
53
+ community_mode: str = "mapping", # "mapping" | "property"
54
+ community_property: str = "community_id", # only used if community_mode == "property"
55
+ membership_collection: str = "EntityCommunities",
56
+ membership_entity_field: str = "entity_id",
57
+ membership_community_field: str = "community_id",
58
+ # Dynamic constraints
59
+ allowed_relations: Optional[List[str]] = None,
60
+ disallowed_relations: Optional[List[str]] = None,
61
+ allowed_neighbor_types: Optional[List[str]] = None,
62
+ # Time filters
63
+ time_window: Optional[Tuple[str, str]] = None, # (start_iso, end_iso)
64
+ as_of: Optional[str] = None, # ISO timestamp for "as of"
65
+ current_only: bool = False, # respect valid_from/valid_to around as_of
66
+ recency_half_life_days: Optional[float] = 90.0, # None disables recency decay
67
+ # Priors
68
+ type_priors: Optional[Dict[str, float]] = None, # e.g., {"assessor": 1.1}
69
+ # Provenance
70
+ edge_provenance_fields: Optional[List[str]] = None, # defaults: ["source_document_id","source_text_id"]
71
+ provenance_edge_collection: Optional[str] = "EXTRACTED_FROM",
72
+ provenance_target_collections: Optional[List[str]] = None, # defaults: ["Documents","TextBlocks"]
73
+ # Confidence fusion (usually False; you do NPLL in engine)
74
+ fuse_edge_confidence: bool = False,
75
+ missing_confidence_prior: float = 1.0,
76
+ edge_raw_confidence_property: Optional[str] = "raw_confidence",
77
+ edge_npll_posterior_property: Optional[str] = "npll_posterior",
78
+ edge_calibration_property: Optional[str] = "calibration",
79
+ # Performance
80
+ aql_batch_size: int = 1000,
81
+ aql_stream: bool = True,
82
+ outbound_index_hint: Optional[str] = None, # e.g. "edges_from_rel_ts"
83
+ inbound_index_hint: Optional[str] = None, # e.g. "edges_to_rel_ts"
84
+ # Bridge / GNN integration
85
+ bridge_collection: str = "BridgeEntities",
86
+ affinity_collection: str = "CommunityAffinity",
87
+ algorithm: str = "gnn", # Default to GNN as per pipeline
88
+ ):
89
+ self.db = db
90
+ self._cid = community_id
91
+ self.bridge_col = bridge_collection
92
+ self.affinity_col = affinity_collection
93
+ self.algorithm = algorithm
94
+ self._bridge_cache: Dict[str, Optional[dict]] = {}
95
+ self._affinity_cache: Dict[str, float] = {}
96
+
97
+ self.nodes_col = nodes_collection
98
+ self.edges_col = edges_collection
99
+
100
+ self.rel_prop = relation_property
101
+ self.w_prop = weight_property
102
+ self.node_type_prop = node_type_property
103
+
104
+ self.ts_prop = edge_timestamp_property
105
+ self.edge_valid_from_prop = edge_valid_from_property
106
+ self.edge_valid_to_prop = edge_valid_to_property
107
+ self.edge_status_prop = edge_status_property
108
+
109
+ self.community_mode = community_mode
110
+ self.community_prop = community_property
111
+ self.membership_col = membership_collection
112
+ self.memb_ent_field = membership_entity_field
113
+ self.memb_com_field = membership_community_field
114
+
115
+ self.allowed_relations = allowed_relations
116
+ self.disallowed_relations = disallowed_relations
117
+ self.allowed_neighbor_types = allowed_neighbor_types
118
+
119
+ self.time_window = time_window
120
+ self.as_of = as_of
121
+ self.current_only = current_only
122
+ self.recency_half_life_days = recency_half_life_days
123
+
124
+ self.type_priors = type_priors or {}
125
+
126
+ self.edge_prov_fields = edge_provenance_fields or ["source_document_id", "source_text_id"]
127
+ self.prov_edges_col = provenance_edge_collection
128
+ self.prov_target_cols = provenance_target_collections or ["Documents", "TextBlocks"]
129
+
130
+ self.fuse_edge_confidence = fuse_edge_confidence
131
+ self.missing_confidence_prior = missing_confidence_prior
132
+ self.edge_raw_conf_prop = edge_raw_confidence_property
133
+ self.edge_npll_post_prop = edge_npll_posterior_property
134
+ self.edge_calibration_prop = edge_calibration_property
135
+
136
+ self.aql_batch_size = aql_batch_size
137
+ self.aql_stream = aql_stream
138
+ self.outbound_index_hint = outbound_index_hint
139
+ self.inbound_index_hint = inbound_index_hint
140
+
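For context while reading the diff, a minimal instantiation sketch of the accessor configured above; the python-arango client call, credentials, and the community/relation names are illustrative assumptions, not part of the package:

from arango import ArangoClient  # assumed python-arango client

db = ArangoClient(hosts="http://localhost:8529").db("kg", username="root", password="")
acc = ArangoCommunityAccessor(
    db=db,
    community_id="c_42",                          # hypothetical community id
    community_mode="mapping",                     # resolve membership via EntityCommunities
    allowed_relations=["TREATS", "PRESCRIBES"],   # hypothetical relation whitelist
    as_of="2026-01-11T00:00:00Z",
    current_only=True,                            # respect valid_from/valid_to around as_of
)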
141
+ # --------------------------
142
+ # Back-compatible core API
143
+ # --------------------------
144
+ def iter_out(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
145
+ for ev in self._iter_neighbors(node, direction="OUTBOUND", rich=True):
146
+ yield ev.neighbor_id, ev.relation, ev.weight
147
+
148
+ def iter_in(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
149
+ for ev in self._iter_neighbors(node, direction="INBOUND", rich=True):
150
+ yield ev.neighbor_id, ev.relation, ev.weight
151
+
152
+ def nodes(self, community_id: Optional[str] = None) -> Iterable[NodeId]:
153
+ """
154
+ Return all node IDs in this community.
155
+ - mapping mode: EntityCommunities -> entity_id
156
+ - property mode: filter ExtractedEntities by community_id field (if you add it)
157
+ - none mode: return all nodes
158
+ """
159
+ cid = community_id or self._cid
160
+ if self.community_mode == "property":
161
+ aql = f"""
162
+ FOR v IN {self.nodes_col}
163
+ FILTER v.{self.community_prop} == @cid
164
+ RETURN v._id
165
+ """
166
+ cursor = self.db.aql.execute(
167
+ aql, bind_vars={"cid": cid}, batch_size=self.aql_batch_size, stream=self.aql_stream
168
+ )
169
+ elif self.community_mode == "mapping":
170
+ aql = f"""
171
+ FOR m IN @@mcol
172
+ FILTER m[@m_com] == @cid
173
+ RETURN m[@m_ent]
174
+ """
175
+ cursor = self.db.aql.execute(
176
+ aql,
177
+ bind_vars={
178
+ "cid": cid,
179
+ "@mcol": self.membership_col,
180
+ "m_ent": self.memb_ent_field,
181
+ "m_com": self.memb_com_field,
182
+ },
183
+ batch_size=self.aql_batch_size,
184
+ stream=self.aql_stream,
185
+ )
186
+ else: # community_mode == "none"
187
+ aql = f"""
188
+ FOR v IN {self.nodes_col}
189
+ RETURN v._id
190
+ """
191
+ cursor = self.db.aql.execute(
192
+ aql, batch_size=self.aql_batch_size, stream=self.aql_stream
193
+ )
194
+ for vid in cursor:
195
+ yield vid
196
+
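Continuing that sketch, the back-compatible API walks the community's members and their weighted edges:

# Enumerate community entities and their outgoing (neighbor, relation, weight) triples.
for node_id in acc.nodes():
    for neighbor_id, relation, weight in acc.iter_out(node_id):
        print(node_id, f"--{relation} ({weight:.2f})-->", neighbor_id)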
197
+ def degree(self, node: NodeId) -> int:
198
+ """Out-degree (fast)."""
199
+ hint_clause = (
200
+ "OPTIONS { indexHint: @idx, forceIndexHint: true }" if self.outbound_index_hint else ""
201
+ )
202
+ aql = f"""
203
+ RETURN LENGTH(
204
+ FOR e IN {self.edges_col}
205
+ {hint_clause}
206
+ FILTER e._from == @node
207
+ RETURN 1
208
+ )
209
+ """
210
+ bind = {"node": node}
211
+ if self.outbound_index_hint:
212
+ bind["idx"] = self.outbound_index_hint
213
+ cur = self.db.aql.execute(aql, bind_vars=bind)
214
+ return int(list(cur)[0] or 0)
215
+
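The index hints above only pay off if a matching persistent index exists; a sketch of creating one with python-arango (the index name and field order are assumptions chosen to match the hint examples in the comments):

# Hypothetical index backing an outbound hint such as "edges_from_rel_ts".
db.collection("ExtractedRelationships").add_persistent_index(
    fields=["_from", "relationship", "created_at"],
    name="edges_from_rel_ts",
)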
216
+ # --------------------------
217
+ # Rich neighbor variants
218
+ # --------------------------
219
+ def iter_out_rich(self, node: NodeId) -> Iterable[EdgeView]:
220
+ yield from self._iter_neighbors(node, direction="OUTBOUND", rich=True)
221
+
222
+ def iter_in_rich(self, node: NodeId) -> Iterable[EdgeView]:
223
+ yield from self._iter_neighbors(node, direction="INBOUND", rich=True)
224
+
225
+ # --------------------------
226
+ # Provenance helpers
227
+ # --------------------------
228
+ def get_edge_provenance(self, edge_id: str) -> List[str]:
229
+ """
230
+ Return provenance targets for a relationship edge:
231
+ - inline fields (source_document_id, source_text_id)
232
+ - EXTRACTED_FROM edges for either endpoint entity
233
+ """
234
+ # Build the provenance edges clause safely (avoid nested f-strings)
235
+ prov_edges_clause = (
236
+ f"""
237
+ FOR p IN {self.prov_edges_col}
238
+ FILTER p._from IN [e._from, e._to]
239
+ RETURN p._to
240
+ """
241
+ if self.prov_edges_col else "[]"
242
+ )
243
+
244
+ aql = f"""
245
+ LET e = DOCUMENT(@eid)
246
+ LET inline_candidates = [{", ".join([f"e['{f}']" for f in self.edge_prov_fields])}]
247
+ LET inline = (
248
+ FOR x IN inline_candidates
249
+ FILTER x != null
250
+ RETURN x
251
+ )
252
+ LET via_edges = (
253
+ {prov_edges_clause}
254
+ )
255
+ RETURN UNIQUE(APPEND(inline, via_edges))
256
+ """
257
+ cur = self.db.aql.execute(aql, bind_vars={"eid": edge_id}, batch_size=self.aql_batch_size, stream=self.aql_stream)
258
+ out = list(cur)
259
+ return out[0] if out else []
260
+
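A quick sketch of pulling provenance for one relationship edge (the edge key is hypothetical):

# Inline source ids plus EXTRACTED_FROM targets for either endpoint entity.
for src in acc.get_edge_provenance("ExtractedRelationships/rel_123"):
    print(src)  # e.g. "TextBlocks/tb_7" or "Documents/doc_2"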
261
+ def get_node(self, node_id: NodeId, fields: Optional[List[str]] = None) -> Dict[str, Any]:
262
+ if fields:
263
+ proj = ", ".join([f"{f}: d.{f}" for f in fields])
264
+ aql = f"LET d = DOCUMENT(@id) RETURN {{ _id: d._id, {proj} }}"
265
+ else:
266
+ aql = "RETURN DOCUMENT(@id)"
267
+ cur = self.db.aql.execute(aql, bind_vars={"id": node_id})
268
+ res = list(cur)
269
+ return res[0] if res else {}
270
+
271
+ # --------------------------
272
+ # Stats / quick analytics
273
+ # --------------------------
274
+ @staticmethod
275
+ def get_top_n_entities_by_degree(
276
+ db,
277
+ edges_collection: str = "ExtractedRelationships",
278
+ limit: Optional[int] = None,
279
+ time_window: Optional[Tuple[str, str]] = None,
280
+ time_property: str = "created_at",
281
+ ) -> List[dict]:
282
+ bind: Dict[str, Any] = {}
283
+ where = ""
284
+ if time_window:
285
+ where = "FILTER HAS(e, @ts) AND e[@ts] >= @start_ts AND e[@ts] <= @end_ts"
286
+ bind.update({"ts": time_property, "start_ts": time_window[0], "end_ts": time_window[1]})
287
+ limit_clause = "LIMIT @lim" if limit else ""
288
+ if limit:
289
+ bind["lim"] = limit
290
+ aql = f"""
291
+ FOR e IN {edges_collection}
292
+ {where}
293
+ COLLECT entity = e._from WITH COUNT INTO degree
294
+ SORT degree DESC
295
+ {limit_clause}
296
+ RETURN {{ "entity": entity, "degree": degree }}
297
+ """
298
+ return list(db.aql.execute(aql, bind_vars=bind))
299
+
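Usage sketch for the degree statistic (the time window is a placeholder):

top = ArangoCommunityAccessor.get_top_n_entities_by_degree(
    db,
    limit=10,
    time_window=("2026-01-01T00:00:00Z", "2026-01-31T23:59:59Z"),
)
# -> [{"entity": "ExtractedEntities/...", "degree": 42}, ...]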
300
+ @staticmethod
301
+ def get_entity_type_counts(
302
+ db,
303
+ nodes_collection: str = "ExtractedEntities",
304
+ type_property: str = "type"
305
+ ) -> List[dict]:
306
+ aql = f"""
307
+ FOR doc IN {nodes_collection}
308
+ COLLECT t = doc.{type_property} WITH COUNT INTO c
309
+ SORT c DESC
310
+ RETURN {{ "type": t, "count": c }}
311
+ """
312
+ return list(db.aql.execute(aql))
313
+
314
+ @staticmethod
315
+ def get_relationship_type_counts(
316
+ db,
317
+ edges_collection: str = "ExtractedRelationships",
318
+ relation_property: str = "relationship",
319
+ time_window: Optional[Tuple[str, str]] = None,
320
+ time_property: str = "created_at",
321
+ ) -> List[dict]:
322
+ bind: Dict[str, Any] = {"rel_prop": relation_property}
323
+ where = "FILTER HAS(rel, @rel_prop)"
324
+ if time_window:
325
+ where += " AND HAS(rel, @ts) AND rel[@ts] >= @start_ts AND rel[@ts] <= @end_ts"
326
+ bind.update({"ts": time_property, "start_ts": time_window[0], "end_ts": time_window[1]})
327
+ aql = f"""
328
+ FOR rel IN {edges_collection}
329
+ {where}
330
+ COLLECT t = rel[@rel_prop] WITH COUNT INTO c
331
+ SORT c DESC
332
+ RETURN {{ "type": t, "count": c }}
333
+ """
334
+ return list(db.aql.execute(aql, bind_vars=bind))
335
+
336
+ @staticmethod
337
+ def get_community_summaries(
338
+ db,
339
+ communities_collection: str = "Communities",
340
+ limit: Optional[int] = None,
341
+ skip: int = 0,
342
+ require_summary: bool = True
343
+ ) -> List[dict]:
344
+ filter_clause = "FILTER c.summary != null AND c.summary != ''" if require_summary else "FILTER c.summary == null OR c.summary == ''"
345
+ limit_clause = "LIMIT @skip, @limit" if limit is not None else ""
346
+ bind: Dict[str, Any] = {}
347
+ if limit is not None:
348
+ bind.update({"skip": skip, "limit": limit})
349
+ aql = f"""
350
+ FOR c IN {communities_collection}
351
+ {filter_clause}
352
+ SORT c.community_id ASC
353
+ {limit_clause}
354
+ RETURN {{ id: c.community_id, summary: c.summary, size: c.size, level: c.level }}
355
+ """
356
+ return list(db.aql.execute(aql, bind_vars=bind))
357
+
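A paging sketch for community summaries (page size is arbitrary):

# First page of 50 communities that already have summaries.
for c in ArangoCommunityAccessor.get_community_summaries(db, limit=50, skip=0):
    print(c["id"], c["size"], c["summary"][:80])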
358
+ @staticmethod
359
+ def get_unique_table_headers(
360
+ db,
361
+ tables_collection: str = "Tables",
362
+ headers_property: str = "headers"
363
+ ) -> List[List[str]]:
364
+ aql = f"""
365
+ FOR t IN {tables_collection}
366
+ FILTER HAS(t, @hp)
367
+ COLLECT h = t[@hp]
368
+ RETURN h
369
+ """
370
+ return list(db.aql.execute(aql, bind_vars={"hp": headers_property}))
371
+
372
+ # --------------------------
373
+ # Bridge / GNN Integration Methods (Mirrored from GlobalGraphAccessor)
374
+ # --------------------------
375
+
376
+ def is_bridge(self, entity_key: str) -> Optional[dict]:
377
+ """
378
+ Check if an entity is a bridge and return its bridge data.
379
+ Uses caching for performance.
380
+ """
381
+ # Strip collection if present to get key
382
+ if "/" in entity_key:
383
+ entity_key = entity_key.split("/")[-1]
384
+
385
+ if entity_key in self._bridge_cache:
386
+ return self._bridge_cache[entity_key]
387
+
388
+ aql = """
389
+ FOR b IN @@bridge_col
390
+ FILTER b.entity_key == @entity_key
391
+ FILTER b.algorithm == @algorithm
392
+ RETURN b
393
+ """
394
+ try:
395
+ result = list(self.db.aql.execute(
396
+ aql,
397
+ bind_vars={
398
+ "@bridge_col": self.bridge_col,
399
+ "entity_key": entity_key,
400
+ "algorithm": self.algorithm,
401
+ }
402
+ ))
403
+ bridge_data = result[0] if result else None
404
+ except Exception:
405
+ # Fallback if collection doesn't exist yet
406
+ bridge_data = None
407
+
408
+ self._bridge_cache[entity_key] = bridge_data
409
+ return bridge_data
410
+
411
+ def get_entity_community(self, entity_id: str) -> Optional[str]:
412
+ """Get the community ID for an entity."""
413
+ # In 'mapping' mode the membership collection is the source of truth for an
414
+ # entity's community, so query it directly (an entity may also bridge into another community).
415
+
416
+ # If we are in 'mapping' mode, we can query membership collection
417
+ if self.community_mode == "mapping":
418
+ aql = f"""
419
+ FOR m IN {self.membership_col}
420
+ FILTER m.{self.memb_ent_field} == @entity_id
421
+ // No algorithm filter here by default; add one if several algorithms share this collection
422
+ RETURN m.{self.memb_com_field}
423
+ """
424
+ try:
425
+ result = list(self.db.aql.execute(aql, bind_vars={"entity_id": entity_id}))
426
+ return result[0] if result else None
427
+ except Exception:
428
+ return None
429
+ return None
430
+
431
+ def get_affinity(self, community_a: str, community_b: str) -> float:
432
+ """
433
+ Get the affinity score between two communities.
434
+ Returns 0.0 if no affinity data exists.
435
+ """
436
+ if not community_a or not community_b:
437
+ return 0.0
438
+
439
+ cache_key = f"{min(community_a, community_b)}_{max(community_a, community_b)}"
440
+
441
+ if cache_key in self._affinity_cache:
442
+ return self._affinity_cache[cache_key]
443
+
444
+ aql = """
445
+ FOR a IN @@affinity_col
446
+ FILTER a.algorithm == @algorithm
447
+ FILTER (a.community_a == @comm_a AND a.community_b == @comm_b)
448
+ OR (a.community_a == @comm_b AND a.community_b == @comm_a)
449
+ RETURN a.affinity_score
450
+ """
451
+ try:
452
+ result = list(self.db.aql.execute(
453
+ aql,
454
+ bind_vars={
455
+ "@affinity_col": self.affinity_col,
456
+ "algorithm": self.algorithm,
457
+ "comm_a": community_a,
458
+ "comm_b": community_b,
459
+ }
460
+ ))
461
+ affinity = result[0] if result else 0.0
462
+ except Exception:
463
+ affinity = 0.0
464
+
465
+ self._affinity_cache[cache_key] = affinity
466
+ return affinity
467
+
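A sketch combining the bridge and affinity helpers (entity and community ids are hypothetical):

bridge = acc.is_bridge("ExtractedEntities/ent_123")   # a full _id or a bare key both work
if bridge:
    home = bridge.get("home_community")
    # Affinity is symmetric; argument order does not matter and results are cached.
    print(home, "<->", "c_77", acc.get_affinity(home, "c_77"))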
468
+ def clear_bridge_cache(self):
469
+ """Clear bridge/affinity caches."""
470
+ self._bridge_cache.clear()
471
+ self._affinity_cache.clear()
472
+
473
+ # ════════════════════════════════════════════════════════════════
474
+ # DISCOVERY ENTRY POINTS (for autonomous insight discovery)
475
+ # ════════════════════════════════════════════════════════════════
476
+
477
+ @staticmethod
478
+ def get_top_entities_in_community(
479
+ db,
480
+ community_id: str,
481
+ membership_collection: str = "EntityCommunities",
482
+ membership_entity_field: str = "entity_id",
483
+ membership_community_field: str = "community_id",
484
+ edges_collection: str = "ExtractedRelationships",
485
+ limit: int = 20,
486
+ ) -> List[dict]:
487
+ """
488
+ Get top entities by degree WITHIN a specific community.
489
+ Essential for autonomous discovery - provides high-value seed nodes.
490
+
491
+ Returns:
492
+ List of {entity: str, degree: int}
493
+ """
494
+ aql = """
495
+ LET community_entities = (
496
+ FOR m IN @@membership
497
+ FILTER m[@m_com] == @cid
498
+ RETURN m[@m_ent]
499
+ )
500
+ FOR e IN @@edges
501
+ FILTER e._from IN community_entities
502
+ COLLECT entity = e._from WITH COUNT INTO degree
503
+ SORT degree DESC
504
+ LIMIT @limit
505
+ RETURN { entity: entity, degree: degree }
506
+ """
507
+ return list(db.aql.execute(aql, bind_vars={
508
+ "@membership": membership_collection,
509
+ "@edges": edges_collection,
510
+ "m_ent": membership_entity_field,
511
+ "m_com": membership_community_field,
512
+ "cid": community_id,
513
+ "limit": limit,
514
+ }))
515
+
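Seed selection for discovery, as a one-line sketch (the community id is hypothetical):

seeds = ArangoCommunityAccessor.get_top_entities_in_community(db, community_id="c_42", limit=5)
# -> [{"entity": "ExtractedEntities/...", "degree": 17}, ...]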
516
+ @staticmethod
517
+ def get_recent_entities(
518
+ db,
519
+ since: str, # ISO timestamp
520
+ community_id: Optional[str] = None,
521
+ nodes_collection: str = "ExtractedEntities",
522
+ membership_collection: str = "EntityCommunities",
523
+ membership_entity_field: str = "entity_id",
524
+ membership_community_field: str = "community_id",
525
+ created_at_property: str = "created_at",
526
+ updated_at_property: str = "updated_at",
527
+ limit: int = 100,
528
+ ) -> List[dict]:
529
+ """
530
+ Get entities created or updated since a timestamp.
531
+ Critical for daily discovery - "what's new since yesterday?"
532
+
533
+ Args:
534
+ since: ISO timestamp (e.g., "2026-01-11T00:00:00Z")
535
+ community_id: Optional community filter
536
+
537
+ Returns:
538
+ List of {entity: str, created_at: str, type: str}
539
+ """
540
+ bind: Dict[str, Any] = {
541
+ "since": since,
542
+ "limit": limit,
543
+ "created_prop": created_at_property,
544
+ "updated_prop": updated_at_property,
545
+ }
546
+
547
+ community_filter = ""
548
+ if community_id:
549
+ community_filter = """
550
+ LET community_entities = (
551
+ FOR m IN @@membership
552
+ FILTER m[@m_com] == @cid
553
+ RETURN m[@m_ent]
554
+ )
555
+ FILTER e._id IN community_entities
556
+ """
557
+ bind["@membership"] = membership_collection
558
+ bind["m_ent"] = membership_entity_field
559
+ bind["m_com"] = membership_community_field
560
+ bind["cid"] = community_id
561
+
562
+ aql = f"""
563
+ FOR e IN {nodes_collection}
564
+ FILTER (HAS(e, @created_prop) AND e[@created_prop] >= @since)
565
+ OR (HAS(e, @updated_prop) AND e[@updated_prop] >= @since)
566
+ {community_filter}
567
+ SORT HAS(e, @created_prop) ? e[@created_prop] : e[@updated_prop] DESC
568
+ LIMIT @limit
569
+ RETURN {{
570
+ entity: e._id,
571
+ created_at: HAS(e, @created_prop) ? e[@created_prop] : null,
572
+ updated_at: HAS(e, @updated_prop) ? e[@updated_prop] : null,
573
+ type: e.type,
574
+ name: e.name
575
+ }}
576
+ """
577
+ return list(db.aql.execute(aql, bind_vars=bind))
578
+
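A "what changed since yesterday" sketch (timestamp and community id are placeholders):

fresh = ArangoCommunityAccessor.get_recent_entities(
    db,
    since="2026-01-11T00:00:00Z",
    community_id="c_42",
    limit=25,
)
for e in fresh:
    print(e["entity"], e["type"], e["created_at"] or e["updated_at"])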
579
+ @staticmethod
580
+ def search_entities(
581
+ db,
582
+ query: str,
583
+ community_id: Optional[str] = None,
584
+ nodes_collection: str = "ExtractedEntities",
585
+ membership_collection: str = "EntityCommunities",
586
+ membership_entity_field: str = "entity_id",
587
+ membership_community_field: str = "community_id",
588
+ search_fields: Optional[List[str]] = None,
589
+ limit: int = 20,
590
+ ) -> List[dict]:
591
+ """
592
+ Text search for entities matching query.
593
+ Uses LIKE for simple text matching (can be upgraded to ArangoSearch).
594
+
595
+ Args:
596
+ query: Search string
597
+ search_fields: Fields to search in (default: ["name", "description"])
598
+
599
+ Returns:
600
+ List of {entity: str, name: str, type: str, matched_field: str}
601
+ """
602
+ if search_fields is None:
603
+ search_fields = ["name", "description"]
604
+
605
+ bind: Dict[str, Any] = {
606
+ "query": f"%{query.lower()}%",
607
+ "limit": limit,
608
+ }
609
+
610
+ # Build search conditions
611
+ search_conditions = []
612
+ for field in search_fields:
613
+ search_conditions.append(f"LOWER(e.{field}) LIKE @query")
614
+ search_clause = " OR ".join(search_conditions)
615
+
616
+ community_filter = ""
617
+ if community_id:
618
+ community_filter = """
619
+ LET community_entities = (
620
+ FOR m IN @@membership
621
+ FILTER m[@m_com] == @cid
622
+ RETURN m[@m_ent]
623
+ )
624
+ FILTER e._id IN community_entities
625
+ """
626
+ bind["@membership"] = membership_collection
627
+ bind["m_ent"] = membership_entity_field
628
+ bind["m_com"] = membership_community_field
629
+ bind["cid"] = community_id
630
+
631
+ aql = f"""
632
+ FOR e IN {nodes_collection}
633
+ FILTER {search_clause}
634
+ {community_filter}
635
+ LIMIT @limit
636
+ RETURN {{
637
+ entity: e._id,
638
+ name: e.name,
639
+ type: e.type,
640
+ description: e.description
641
+ }}
642
+ """
643
+ return list(db.aql.execute(aql, bind_vars=bind))
644
+
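Entity search sketch (the query string is arbitrary; fields default to name and description):

for h in ArangoCommunityAccessor.search_entities(db, query="metformin", community_id="c_42"):
    print(h["entity"], h["name"], h["type"])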
645
+ # ════════════════════════════════════════════════════════════════
646
+ # CONTENT HYDRATION (for agent reasoning)
647
+ # ════════════════════════════════════════════════════════════════
648
+
649
+ @staticmethod
650
+ def get_document_content(
651
+ db,
652
+ doc_id: str,
653
+ text_collection: str = "TextBlocks",
654
+ table_collection: str = "Tables",
655
+ image_collection: str = "Images",
656
+ document_collection: str = "Documents",
657
+ ) -> Optional[dict]:
658
+ """
659
+ Fetch content from any document collection by ID.
660
+ Essential for agent reasoning - converts graph IDs to actual content.
661
+
662
+ Args:
663
+ doc_id: Document ID in format "CollectionName/key"
664
+
665
+ Returns:
666
+ Dict with type-specific content, or None if not found
667
+ """
668
+ try:
669
+ collection, key = doc_id.split("/", 1)
670
+ except ValueError:
671
+ return None
672
+
673
+ if collection == text_collection:
674
+ aql = f"""
675
+ FOR tb IN {text_collection}
676
+ FILTER tb._id == @doc_id
677
+ RETURN {{
678
+ type: "text",
679
+ text: tb.text,
680
+ document_id: tb.document_id,
681
+ page: tb.page,
682
+ char_span: tb.char_span,
683
+ metadata: tb.metadata
684
+ }}
685
+ """
686
+ elif collection == table_collection:
687
+ aql = f"""
688
+ FOR t IN {table_collection}
689
+ FILTER t._id == @doc_id
690
+ RETURN {{
691
+ type: "table",
692
+ headers: t.headers,
693
+ rows: t.rows,
694
+ caption: t.caption,
695
+ document_id: t.document_id,
696
+ page: t.page,
697
+ metadata: t.metadata
698
+ }}
699
+ """
700
+ elif collection == image_collection:
701
+ aql = f"""
702
+ FOR img IN {image_collection}
703
+ FILTER img._id == @doc_id
704
+ RETURN {{
705
+ type: "image",
706
+ caption: img.caption,
707
+ ocr_text: img.ocr_text,
708
+ url: img.storage_url,
709
+ document_id: img.document_id,
710
+ page: img.page,
711
+ metadata: img.metadata
712
+ }}
713
+ """
714
+ elif collection == document_collection:
715
+ aql = f"""
716
+ FOR d IN {document_collection}
717
+ FILTER d._id == @doc_id
718
+ RETURN {{
719
+ type: "document",
720
+ filename: d.filename,
721
+ content: d.content,
722
+ metadata: d.metadata
723
+ }}
724
+ """
725
+ else:
726
+ return None
727
+
728
+ result = list(db.aql.execute(aql, bind_vars={"doc_id": doc_id}))
729
+ return result[0] if result else None
730
+
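Hydration sketch: turning a graph id back into content (the document key is hypothetical; the returned shape depends on the source collection):

content = ArangoCommunityAccessor.get_document_content(db, "TextBlocks/tb_7")
if content and content["type"] == "text":
    print(content["text"][:200])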
731
+ @staticmethod
732
+ def get_entity_sources(
733
+ db,
734
+ entity_id: str,
735
+ extracted_from_collection: str = "EXTRACTED_FROM",
736
+ max_sources: int = 10,
737
+ ) -> List[dict]:
738
+ """
739
+ Get all source documents/blocks for an entity via EXTRACTED_FROM edges.
740
+ Critical for evidence gathering - shows WHERE an entity was mentioned.
741
+
742
+ Args:
743
+ entity_id: Entity ID (e.g., "ExtractedEntities/ent_123")
744
+ max_sources: Limit number of sources returned
745
+
746
+ Returns:
747
+ List of {source_id, source_type, content, char_span, confidence, metadata}
748
+ """
749
+ aql = f"""
750
+ FOR edge IN {extracted_from_collection}
751
+ FILTER edge._from == @entity_id
752
+ LIMIT @max_sources
753
+ LET source = DOCUMENT(edge._to)
754
+ LET collection = PARSE_IDENTIFIER(edge._to).collection
755
+ RETURN {{
756
+ source_id: edge._to,
757
+ source_type: collection,
758
+ char_span: edge.char_span,
759
+ extraction_confidence: edge.extraction_confidence,
760
+ content: (
761
+ collection == "TextBlocks" ? source.text :
762
+ collection == "Tables" ? {{ headers: source.headers, rows: source.rows }} :
763
+ collection == "Images" ? {{ caption: source.caption, ocr_text: source.ocr_text }} :
764
+ collection == "Documents" ? SUBSTRING(source.content, 0, 500) :
765
+ null
766
+ ),
767
+ metadata: {{
768
+ page: source.page,
769
+ document_id: source.document_id,
770
+ filename: source.filename
771
+ }}
772
+ }}
773
+ """
774
+ return list(db.aql.execute(aql, bind_vars={
775
+ "entity_id": entity_id,
776
+ "max_sources": max_sources,
777
+ }))
778
+
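Evidence-gathering sketch for a single entity (the entity key is hypothetical):

for ev in ArangoCommunityAccessor.get_entity_sources(db, "ExtractedEntities/ent_123", max_sources=5):
    print(ev["source_type"], ev["source_id"], ev["extraction_confidence"])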
779
+ @staticmethod
780
+ def search_content(
781
+ db,
782
+ query: str,
783
+ community_id: Optional[str] = None,
784
+ content_types: Optional[List[str]] = None,
785
+ text_collection: str = "TextBlocks",
786
+ table_collection: str = "Tables",
787
+ image_collection: str = "Images",
788
+ membership_collection: str = "EntityCommunities",
789
+ extracted_from_collection: str = "EXTRACTED_FROM",
790
+ limit: int = 10,
791
+ ) -> List[dict]:
792
+ """
793
+ Semantic/text search across content collections.
794
+ Uses simple LIKE matching (can be upgraded to ArangoSearch/vectors).
795
+
796
+ Args:
797
+ query: Search string
798
+ content_types: Collections to search (default: ["TextBlocks", "Tables", "Images"])
799
+ community_id: Optional filter to content linked to community entities
800
+
801
+ Returns:
802
+ List of {source_id, source_type, content, score, metadata}
803
+ """
804
+ if content_types is None:
805
+ content_types = [text_collection, table_collection, image_collection]
806
+
807
+ bind: Dict[str, Any] = {
808
+ "query": f"%{query.lower()}%",
809
+ "limit": limit,
810
+ }
811
+
812
+ results = []
813
+
814
+ # Search TextBlocks
815
+ if text_collection in content_types:
816
+ aql_text = f"""
817
+ FOR tb IN {text_collection}
818
+ FILTER LOWER(tb.text) LIKE @query
819
+ LIMIT @limit
820
+ RETURN {{
821
+ source_id: tb._id,
822
+ source_type: "TextBlocks",
823
+ content: tb.text,
824
+ score: 1.0,
825
+ metadata: {{
826
+ document_id: tb.document_id,
827
+ page: tb.page
828
+ }}
829
+ }}
830
+ """
831
+ results.extend(list(db.aql.execute(aql_text, bind_vars=bind)))
832
+
833
+ # Search Tables (caption)
834
+ if table_collection in content_types:
835
+ aql_table = f"""
836
+ FOR t IN {table_collection}
837
+ FILTER LOWER(t.caption) LIKE @query
838
+ LIMIT @limit
839
+ RETURN {{
840
+ source_id: t._id,
841
+ source_type: "Tables",
842
+ content: {{ headers: t.headers, rows: t.rows, caption: t.caption }},
843
+ score: 1.0,
844
+ metadata: {{
845
+ document_id: t.document_id,
846
+ page: t.page
847
+ }}
848
+ }}
849
+ """
850
+ results.extend(list(db.aql.execute(aql_table, bind_vars=bind)))
851
+
852
+ # Search Images (OCR text)
853
+ if image_collection in content_types:
854
+ aql_image = f"""
855
+ FOR img IN {image_collection}
856
+ FILTER LOWER(img.ocr_text) LIKE @query OR LOWER(img.caption) LIKE @query
857
+ LIMIT @limit
858
+ RETURN {{
859
+ source_id: img._id,
860
+ source_type: "Images",
861
+ content: {{ caption: img.caption, ocr_text: img.ocr_text }},
862
+ score: 1.0,
863
+ metadata: {{
864
+ document_id: img.document_id,
865
+ page: img.page
866
+ }}
867
+ }}
868
+ """
869
+ results.extend(list(db.aql.execute(aql_image, bind_vars=bind)))
870
+
871
+ return results[:limit]
872
+
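Content search sketch across text blocks, table captions, and image OCR/captions (the query is arbitrary):

for m in ArangoCommunityAccessor.search_content(db, query="prior authorization", limit=5):
    print(m["source_type"], m["source_id"])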
873
+ # --------------------------
874
+ # Internal neighbor routine
875
+ # --------------------------
876
+ def _iter_neighbors(self, node: NodeId, *, direction: str, rich: bool) -> Iterable[EdgeView]:
877
+ assert direction in ("OUTBOUND", "INBOUND")
878
+
879
+ bind: Dict[str, Any] = {
880
+ "node": node,
881
+ "rel_prop": self.rel_prop,
882
+ "w_prop": self.w_prop,
883
+ "priors_map": self.type_priors,
884
+ }
885
+
886
+ # Only add community ID if we're filtering by it
887
+ if self.community_mode != "none":
888
+ bind["cid"] = self._cid
889
+
890
+ # Bind parameters are added only when referenced to avoid AQL 1552 errors
891
+
892
+ hint = ""
893
+ if direction == "OUTBOUND" and self.outbound_index_hint:
894
+ hint = "OPTIONS { indexHint: @idx, forceIndexHint: true }"
895
+ bind["idx"] = self.outbound_index_hint
896
+ elif direction == "INBOUND" and self.inbound_index_hint:
897
+ hint = "OPTIONS { indexHint: @idx, forceIndexHint: true }"
898
+ bind["idx"] = self.inbound_index_hint
899
+
900
+ filters: List[str] = []
901
+
902
+ # Community filter
903
+ if self.community_mode == "property":
904
+ filters.append(f"v.{self.community_prop} == @cid")
905
+ elif self.community_mode == "mapping":
906
+ bind.update({"@mcol": self.membership_col, "m_ent": self.memb_ent_field, "m_com": self.memb_com_field})
907
+ filters.append("""
908
+ FIRST(
909
+ FOR m IN @@mcol
910
+ FILTER m[@m_com] == @cid AND m[@m_ent] == v._id
911
+ LIMIT 1
912
+ RETURN 1
913
+ )
914
+ """)
915
+ # else community_mode == "none" - no filtering
916
+
917
+ # Relation / neighbor type filters
918
+ if self.allowed_relations:
919
+ bind["allowed_relations"] = self.allowed_relations
920
+ filters.append("e[@rel_prop] IN @allowed_relations")
921
+ if self.disallowed_relations:
922
+ bind["disallowed_relations"] = self.disallowed_relations
923
+ filters.append("!(e[@rel_prop] IN @disallowed_relations)")
924
+ if self.allowed_neighbor_types:
925
+ bind["allowed_neighbor_types"] = self.allowed_neighbor_types
926
+ filters.append(f"v.{self.node_type_prop} IN @allowed_neighbor_types")
927
+
928
+ # Time window filter on edge timestamp
929
+ if self.time_window and self.ts_prop:
930
+ bind["start_ts"], bind["end_ts"] = self.time_window
931
+ bind["ts_prop"] = self.ts_prop
932
+ filters.append("HAS(e, @ts_prop) AND e[@ts_prop] >= @start_ts AND e[@ts_prop] <= @end_ts")
933
+
934
+ # Current-only validity wrt as_of
935
+ if self.current_only and self.as_of:
936
+ bind["as_of"] = self.as_of
937
+ vf_prop = self.edge_valid_from_prop or "valid_from"
938
+ vt_prop = self.edge_valid_to_prop or "valid_to"
939
+ filters.append(
940
+ f"( (HAS(e, '{vf_prop}') ? e['{vf_prop}'] <= @as_of : true) "
941
+ f"AND (HAS(e, '{vt_prop}') ? (e['{vt_prop}'] == null OR e['{vt_prop}'] >= @as_of) : true) )"
942
+ )
943
+
944
+ # Optional status guard
945
+ status_guard = ""
946
+ if self.edge_status_prop:
947
+ status_guard = "LET _status = e[@status_prop]"
948
+ bind["status_prop"] = self.edge_status_prop
949
+
950
+ # Recency decay: 2^(- age_days / half_life)
951
+ recency_clause = "1.0"
952
+ if self.recency_half_life_days is not None and self.as_of and self.ts_prop:
953
+ bind["half_life"] = float(self.recency_half_life_days)
954
+ bind["as_of"] = self.as_of
955
+ bind["ts_prop"] = self.ts_prop
956
+ recency_clause = "POW(2, -1 * DATE_DIFF(e[@ts_prop], @as_of, 'days') / @half_life)"  # age_days = as_of - edge timestamp
957
+
958
+ # Base weight
959
+ weight_clause = "(HAS(e, @w_prop) && IS_NUMBER(e[@w_prop]) ? e[@w_prop] : 1.0)"
960
+
961
+ # Confidence fusion (usually disabled; you do it in engine)
962
+ conf_clause = "1.0"
963
+ if self.fuse_edge_confidence:
964
+ bind.update({
965
+ "raw_c": self.edge_raw_conf_prop,
966
+ "npll": self.edge_npll_post_prop,
967
+ "calib": self.edge_calibration_prop,
968
+ "miss_prior": float(self.missing_confidence_prior),
969
+ })
970
+ conf_clause = (
971
+ "( (HAS(e, @raw_c) && IS_NUMBER(e[@raw_c]) ? e[@raw_c] : @miss_prior) * "
972
+ " (HAS(e, @npll) && IS_NUMBER(e[@npll]) ? e[@npll] : @miss_prior) * "
973
+ " (HAS(e, @calib) && IS_NUMBER(e[@calib]) ? e[@calib] : @miss_prior) )"
974
+ )
975
+
976
+ filters_str = " && ".join(filters) if filters else "true"
977
+
978
+ # Build the src edges clause safely
979
+ src_edges_clause = (
980
+ f"""
981
+ FOR p IN {self.prov_edges_col}
982
+ FILTER p._from IN [e._from, e._to]
983
+ RETURN p._to
984
+ """
985
+ if self.prov_edges_col else "[]"
986
+ )
987
+
988
+ aql = f"""
989
+ LET priors = @priors_map
990
+ FOR v, e IN 1..1 {direction} @node {self.edges_col}
991
+ {hint}
992
+ FILTER {filters_str}
993
+ {status_guard}
994
+ LET _rel = e[@rel_prop]
995
+ LET _prior = TO_NUMBER(NOT_NULL(priors[_rel], 1.0))
996
+ LET _base_w = {weight_clause}
997
+ LET _rec = {recency_clause}
998
+ LET _conf = {conf_clause}
999
+ LET _w_eff = TO_NUMBER(_base_w) * TO_NUMBER(_prior) * TO_NUMBER(_rec) * TO_NUMBER(_conf)
1000
+
1001
+ LET _vf = {f"e['{self.edge_valid_from_prop}']" if self.edge_valid_from_prop else 'null'}
1002
+ LET _vt = {f"e['{self.edge_valid_to_prop}']" if self.edge_valid_to_prop else 'null'}
1003
+ LET _status2 = {f"e['{self.edge_status_prop}']" if self.edge_status_prop else 'null'}
1004
+
1005
+ // Provenance: inline fields + EXTRACTED_FROM for both endpoints
1006
+ LET _src_inline_candidates = [{", ".join([f"e['{f}']" for f in self.edge_prov_fields])}]
1007
+ LET _src_inline = (
1008
+ FOR x IN _src_inline_candidates
1009
+ FILTER x != null
1010
+ RETURN x
1011
+ )
1012
+ LET _src_edges = (
1013
+ {src_edges_clause}
1014
+ )
1015
+ LET _sources = UNIQUE(APPEND(_src_inline, _src_edges))
1016
+
1017
+ RETURN {{
1018
+ v_id: v._id,
1019
+ rel: _rel,
1020
+ weight: _w_eff,
1021
+ edge_id: e._id,
1022
+ valid_from: _vf,
1023
+ valid_to: _vt,
1024
+ status: _status2,
1025
+ raw_confidence: {f"e['{self.edge_raw_conf_prop}']" if self.edge_raw_conf_prop else 'null'},
1026
+ npll_posterior: {f"e['{self.edge_npll_post_prop}']" if self.edge_npll_post_prop else 'null'},
1027
+ calibration: {f"e['{self.edge_calibration_prop}']" if self.edge_calibration_prop else 'null'},
1028
+ sources: _sources
1029
+ }}
1030
+ """
1031
+
1032
+ cursor = self.db.aql.execute(
1033
+ aql,
1034
+ bind_vars=bind,
1035
+ batch_size=self.aql_batch_size or 1000,
1036
+ stream=self.aql_stream if self.aql_stream is not None else True,
1037
+ ttl=120, # keep the server-side cursor alive between batch fetches (seconds)
1038
+ optimizer_rules=["+use-indexes"] # prefer index-based lookups
1039
+ )
1040
+ for d in cursor:
1041
+ if rich:
1042
+ yield EdgeView(
1043
+ neighbor_id=d["v_id"],
1044
+ relation=d["rel"],
1045
+ weight=float(d["weight"]),
1046
+ edge_id=d["edge_id"],
1047
+ valid_from=d.get("valid_from"),
1048
+ valid_to=d.get("valid_to"),
1049
+ status=d.get("status"),
1050
+ raw_confidence=d.get("raw_confidence"),
1051
+ npll_posterior=d.get("npll_posterior"),
1052
+ calibration=d.get("calibration"),
1053
+ sources=d.get("sources") or [],
1054
+ )
1055
+ else:
1056
+ yield d["v_id"], d["rel"], float(d["weight"])
1057
+
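To make the effective-weight formula above concrete, a worked example under assumed values (base weight 2.0, relation prior 1.1, 90-day half-life, an edge exactly one half-life old, confidence fusion disabled):

base_w, prior, half_life, age_days, conf = 2.0, 1.1, 90.0, 90.0, 1.0
recency = 2 ** (-age_days / half_life)     # 0.5 after one half-life
w_eff = base_w * prior * recency * conf    # 2.0 * 1.1 * 0.5 * 1.0 = 1.1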
1058
+
1059
+ class GlobalGraphAccessor(GraphAccessor):
1060
+ """
1061
+ Cross-community graph accessor using pre-computed bridge entities.
1062
+
1063
+ This accessor enables intelligent traversal across community boundaries
1064
+ by leveraging the BridgeEntities and CommunityAffinity collections
1065
+ created during community detection.
1066
+
1067
+ Key features:
1068
+ - Uses bridge entities to efficiently cross community boundaries
1069
+ - Scores cross-community paths using affinity scores
1070
+ - Mission-aware: can weight community crossings based on context
1071
+ - Maintains all ArangoCommunityAccessor features
1072
+ """
1073
+
1074
+ def __init__(
1075
+ self,
1076
+ db,
1077
+ algorithm: str = "leiden",
1078
+ # Base accessor settings
1079
+ nodes_collection: str = "ExtractedEntities",
1080
+ edges_collection: str = "ExtractedRelationships",
1081
+ relation_property: str = "relationship",
1082
+ weight_property: str = "weight",
1083
+ # Bridge collections
1084
+ bridge_collection: str = "BridgeEntities",
1085
+ affinity_collection: str = "CommunityAffinity",
1086
+ membership_collection: str = "EntityCommunities",
1087
+ # Cross-community scoring
1088
+ cross_community_bonus: float = 1.5, # Boost for cross-community edges (often valuable)
1089
+ min_affinity_threshold: float = 0.0, # Minimum affinity to allow crossing
1090
+ # Performance
1091
+ aql_batch_size: int = 1000,
1092
+ aql_stream: bool = True,
1093
+ ):
1094
+ self.db = db
1095
+ self.algorithm = algorithm
1096
+
1097
+ self.nodes_col = nodes_collection
1098
+ self.edges_col = edges_collection
1099
+ self.rel_prop = relation_property
1100
+ self.w_prop = weight_property
1101
+
1102
+ self.bridge_col = bridge_collection
1103
+ self.affinity_col = affinity_collection
1104
+ self.membership_col = membership_collection
1105
+
1106
+ self.cross_community_bonus = cross_community_bonus
1107
+ self.min_affinity_threshold = min_affinity_threshold
1108
+
1109
+ self.aql_batch_size = aql_batch_size
1110
+ self.aql_stream = aql_stream
1111
+
1112
+ # Cache for bridge status and affinities
1113
+ self._bridge_cache: Dict[str, Optional[dict]] = {}
1114
+ self._affinity_cache: Dict[str, float] = {}
1115
+
1116
+ # --------------------------
1117
+ # Core traversal API
1118
+ # --------------------------
1119
+
1120
+ def iter_out(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
1121
+ """Iterate outbound edges, scoring cross-community edges appropriately."""
1122
+ for ev in self._iter_neighbors_global(node, direction="OUTBOUND"):
1123
+ yield ev.neighbor_id, ev.relation, ev.weight
1124
+
1125
+ def iter_in(self, node: NodeId) -> Iterable[Tuple[NodeId, RelId, float]]:
1126
+ """Iterate inbound edges, scoring cross-community edges appropriately."""
1127
+ for ev in self._iter_neighbors_global(node, direction="INBOUND"):
1128
+ yield ev.neighbor_id, ev.relation, ev.weight
1129
+
1130
+ def iter_out_rich(self, node: NodeId) -> Iterable[EdgeView]:
1131
+ """Rich outbound edges with cross-community metadata."""
1132
+ yield from self._iter_neighbors_global(node, direction="OUTBOUND")
1133
+
1134
+ def iter_in_rich(self, node: NodeId) -> Iterable[EdgeView]:
1135
+ """Rich inbound edges with cross-community metadata."""
1136
+ yield from self._iter_neighbors_global(node, direction="INBOUND")
1137
+
1138
+ def nodes(self) -> Iterable[NodeId]:
1139
+ """Return all nodes (no community restriction)."""
1140
+ aql = f"FOR v IN {self.nodes_col} RETURN v._id"
1141
+ cursor = self.db.aql.execute(aql, batch_size=self.aql_batch_size, stream=self.aql_stream)
1142
+ for vid in cursor:
1143
+ yield vid
1144
+
1145
+ def degree(self, node: NodeId) -> int:
1146
+ """Out-degree of a node."""
1147
+ aql = f"""
1148
+ RETURN LENGTH(
1149
+ FOR e IN {self.edges_col}
1150
+ FILTER e._from == @node
1151
+ RETURN 1
1152
+ )
1153
+ """
1154
+ cur = self.db.aql.execute(aql, bind_vars={"node": node})
1155
+ return int(list(cur)[0] or 0)
1156
+
1157
+ # --------------------------
1158
+ # Bridge-aware methods
1159
+ # --------------------------
1160
+
1161
+ def is_bridge(self, entity_key: str) -> Optional[dict]:
1162
+ """
1163
+ Check if an entity is a bridge and return its bridge data.
1164
+ Uses caching for performance.
1165
+ """
1166
+ if entity_key in self._bridge_cache:
1167
+ return self._bridge_cache[entity_key]
1168
+
1169
+ aql = """
1170
+ FOR b IN @@bridge_col
1171
+ FILTER b.entity_key == @entity_key
1172
+ FILTER b.algorithm == @algorithm
1173
+ RETURN b
1174
+ """
1175
+ result = list(self.db.aql.execute(
1176
+ aql,
1177
+ bind_vars={
1178
+ "@bridge_col": self.bridge_col,
1179
+ "entity_key": entity_key,
1180
+ "algorithm": self.algorithm,
1181
+ }
1182
+ ))
1183
+
1184
+ bridge_data = result[0] if result else None
1185
+ self._bridge_cache[entity_key] = bridge_data
1186
+ return bridge_data
1187
+
1188
+ def get_entity_community(self, entity_id: str) -> Optional[str]:
1189
+ """Get the community ID for an entity."""
1190
+ aql = """
1191
+ FOR m IN @@membership_col
1192
+ FILTER m.entity_id == @entity_id
1193
+ FILTER m.algorithm == @algorithm
1194
+ RETURN m.community_id
1195
+ """
1196
+ result = list(self.db.aql.execute(
1197
+ aql,
1198
+ bind_vars={
1199
+ "@membership_col": self.membership_col,
1200
+ "entity_id": entity_id,
1201
+ "algorithm": self.algorithm,
1202
+ }
1203
+ ))
1204
+ return result[0] if result else None
1205
+
1206
+ def get_affinity(self, community_a: str, community_b: str) -> float:
1207
+ """
1208
+ Get the affinity score between two communities.
1209
+ Returns 0.0 if no affinity data exists.
1210
+ """
1211
+ cache_key = f"{min(community_a, community_b)}_{max(community_a, community_b)}"
1212
+
1213
+ if cache_key in self._affinity_cache:
1214
+ return self._affinity_cache[cache_key]
1215
+
1216
+ aql = """
1217
+ FOR a IN @@affinity_col
1218
+ FILTER a.algorithm == @algorithm
1219
+ FILTER (a.community_a == @comm_a AND a.community_b == @comm_b)
1220
+ OR (a.community_a == @comm_b AND a.community_b == @comm_a)
1221
+ RETURN a.affinity_score
1222
+ """
1223
+ result = list(self.db.aql.execute(
1224
+ aql,
1225
+ bind_vars={
1226
+ "@affinity_col": self.affinity_col,
1227
+ "algorithm": self.algorithm,
1228
+ "comm_a": community_a,
1229
+ "comm_b": community_b,
1230
+ }
1231
+ ))
1232
+
1233
+ affinity = result[0] if result else 0.0
1234
+ self._affinity_cache[cache_key] = affinity
1235
+ return affinity
1236
+
1237
+ def get_bridges_from_community(self, community_id: str, min_strength: int = 1) -> List[dict]:
1238
+ """Get all bridge entities from a specific community."""
1239
+ aql = """
1240
+ FOR b IN @@bridge_col
1241
+ FILTER b.algorithm == @algorithm
1242
+ FILTER b.home_community == @community_id
1243
+ FILTER b.bridge_strength >= @min_strength
1244
+ SORT b.bridge_strength DESC
1245
+ RETURN b
1246
+ """
1247
+ return list(self.db.aql.execute(
1248
+ aql,
1249
+ bind_vars={
1250
+ "@bridge_col": self.bridge_col,
1251
+ "algorithm": self.algorithm,
1252
+ "community_id": community_id,
1253
+ "min_strength": min_strength,
1254
+ }
1255
+ ))
1256
+
1257
+ def get_top_bridges(self, limit: int = 20) -> List[dict]:
1258
+ """Get the top bridge entities by bridge strength."""
1259
+ aql = """
1260
+ FOR b IN @@bridge_col
1261
+ FILTER b.algorithm == @algorithm
1262
+ SORT b.bridge_strength DESC
1263
+ LIMIT @limit
1264
+ RETURN b
1265
+ """
1266
+ return list(self.db.aql.execute(
1267
+ aql,
1268
+ bind_vars={
1269
+ "@bridge_col": self.bridge_col,
1270
+ "algorithm": self.algorithm,
1271
+ "limit": limit,
1272
+ }
1273
+ ))
1274
+
1275
+ def get_strongest_affinities(self, limit: int = 20) -> List[dict]:
1276
+ """Get the strongest inter-community affinities."""
1277
+ aql = """
1278
+ FOR a IN @@affinity_col
1279
+ FILTER a.algorithm == @algorithm
1280
+ SORT a.affinity_score DESC
1281
+ LIMIT @limit
1282
+ RETURN a
1283
+ """
1284
+ return list(self.db.aql.execute(
1285
+ aql,
1286
+ bind_vars={
1287
+ "@affinity_col": self.affinity_col,
1288
+ "algorithm": self.algorithm,
1289
+ "limit": limit,
1290
+ }
1291
+ ))
1292
+
1293
+ # --------------------------
1294
+ # Cross-community traversal
1295
+ # --------------------------
1296
+
1297
+ def _iter_neighbors_global(self, node: NodeId, direction: str) -> Iterable[EdgeView]:
1298
+ """
1299
+ Iterate neighbors with cross-community awareness.
1300
+
1301
+ - Gets all neighbors (no community restriction)
1302
+ - Detects cross-community edges
1303
+ - Applies bonus/penalty based on affinity
1304
+ """
1305
+ assert direction in ("OUTBOUND", "INBOUND")
1306
+
1307
+ # Get source node's community
1308
+ source_community = self.get_entity_community(node)
1309
+
1310
+ # Get all neighbors
1311
+ aql = f"""
1312
+ FOR v, e IN 1..1 {direction} @node {self.edges_col}
1313
+ LET rel = e[@rel_prop]
1314
+ LET base_weight = HAS(e, @w_prop) && IS_NUMBER(e[@w_prop]) ? e[@w_prop] : 1.0
1315
+ RETURN {{
1316
+ v_id: v._id,
1317
+ v_key: v._key,
1318
+ rel: rel,
1319
+ base_weight: base_weight,
1320
+ edge_id: e._id,
1321
+ sources: []
1322
+ }}
1323
+ """
1324
+
1325
+ cursor = self.db.aql.execute(
1326
+ aql,
1327
+ bind_vars={
1328
+ "node": node,
1329
+ "rel_prop": self.rel_prop,
1330
+ "w_prop": self.w_prop,
1331
+ },
1332
+ batch_size=self.aql_batch_size,
1333
+ stream=self.aql_stream,
1334
+ )
1335
+
1336
+ for d in cursor:
1337
+ neighbor_id = d["v_id"]
1338
+ base_weight = float(d["base_weight"])
1339
+
1340
+ # Check if this is a cross-community edge
1341
+ neighbor_community = self.get_entity_community(neighbor_id)
1342
+
1343
+ weight = base_weight
1344
+ is_cross_community = False
1345
+
1346
+ if source_community and neighbor_community and source_community != neighbor_community:
1347
+ is_cross_community = True
1348
+
1349
+ # Get affinity between communities
1350
+ affinity = self.get_affinity(source_community, neighbor_community)
1351
+
1352
+ # Apply cross-community scoring
1353
+ if affinity >= self.min_affinity_threshold:
1354
+ # Bonus for crossing to well-connected communities
1355
+ weight = base_weight * self.cross_community_bonus * (1 + affinity)
1356
+ else:
1357
+ # Penalty for crossing to poorly-connected communities
1358
+ weight = base_weight * 0.5
1359
+
1360
+ yield EdgeView(
1361
+ neighbor_id=neighbor_id,
1362
+ relation=d["rel"],
1363
+ weight=weight,
1364
+ edge_id=d["edge_id"],
1365
+ valid_from=None,
1366
+ valid_to=None,
1367
+ status="cross_community" if is_cross_community else "same_community",
1368
+ raw_confidence=None,
1369
+ npll_posterior=None,
1370
+ calibration=None,
1371
+ sources=d.get("sources") or [],
1372
+ )
1373
+
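The cross-community adjustment above, as a worked example with assumed numbers (base weight 1.0, default bonus 1.5, affinity 0.4 above the threshold):

base_weight, bonus, affinity = 1.0, 1.5, 0.4
boosted = base_weight * bonus * (1 + affinity)   # 1.0 * 1.5 * 1.4 = 2.1
penalized = base_weight * 0.5                    # below-threshold crossings are halved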
1374
+ # --------------------------
1375
+ # Mission-aware scoring
1376
+ # --------------------------
1377
+
1378
+ def score_community_crossing(
1379
+ self,
1380
+ from_community: str,
1381
+ to_community: str,
1382
+ mission: Optional[str] = None
1383
+ ) -> float:
1384
+ """
1385
+ Score a community crossing based on mission context.
1386
+
1387
+ Args:
1388
+ from_community: Source community
1389
+ to_community: Target community
1390
+ mission: Optional mission context (e.g., "fraud_detection", "patient_care")
1391
+
1392
+ Returns:
1393
+ Score multiplier for the crossing (>1 = valuable, <1 = not valuable)
1394
+ """
1395
+ base_affinity = self.get_affinity(from_community, to_community)
1396
+
1397
+ if not mission:
1398
+ return 1.0 + base_affinity
1399
+
1400
+ # Mission-specific scoring (customize based on your domain)
1401
+ mission_lower = mission.lower()
1402
+
1403
+ # Example: fraud detection values Claims -> Clinical crossings
1404
+ if "fraud" in mission_lower:
1405
+ # This would need actual community type detection
1406
+ # For now, just boost high-affinity crossings
1407
+ return (1.0 + base_affinity) * 1.5
1408
+
1409
+ # Example: patient care values Clinical -> Lab crossings
1410
+ if "patient" in mission_lower or "clinical" in mission_lower:
1411
+ return (1.0 + base_affinity) * 1.3
1412
+
1413
+ return 1.0 + base_affinity
1414
+
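A mission-aware scoring sketch (community ids and the mission label are hypothetical):

g = GlobalGraphAccessor(db, algorithm="leiden")
mult = g.score_community_crossing("c_claims", "c_clinical", mission="fraud_detection")
# -> (1 + affinity) * 1.5, since the mission mentions fraud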
1415
+ def clear_cache(self):
1416
+ """Clear internal caches (useful after data updates)."""
1417
+ self._bridge_cache.clear()
1418
+ self._affinity_cache.clear()