nexo-brain 7.32.0 → 7.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/ppr.py ADDED
@@ -0,0 +1,473 @@
1
+ """Personalized PageRank (PPR) over the NEXO Knowledge Graph — Ola 2.
2
+
3
+ "Connect the dots at answer time" (HippoRAG2 style). The answer-path already
4
+ fans out 1-hop from query entities/files via ``kg_neighbors``; this module
5
+ generalises that to a multi-hop, *ranked* spreading-activation that pulls in
6
+ nodes 2-3 hops away that a 1-hop fan-out never reaches.
7
+
8
+ Design decisions (approved):
9
+
10
+ * **Pure-Python forward-push** (Andersen-Chung-Lang local push), no numpy/scipy
11
+ in the hot-path. Localised: touches only the few hundred nodes that accumulate
12
+ mass above ``eps`` for a focused query -> sub-30ms on the real KG.
13
+ * **Substrate = the existing ``kg_nodes`` / ``kg_edges`` (cognitive.db)**. No new
14
+ graph. One bulk load of active edges (``valid_until IS NULL``, ~13k rows) per
15
+ build; never ``get_neighbors`` in a loop.
16
+ * **Column-stochastic transition** ``w_uv = (weight*confidence) / Sigma_out`` is
17
+ MANDATORY: the KG is hub-dominated (e.g. ``area:general`` out-degree ~893). A
18
+ naive PPR would dump all mass on those hubs and always return them. Normalising
19
+ by the per-node outgoing mass neutralises that. Relation weights additionally
20
+ down-weight the noisy structural hubs (``belongs_to_area`` / ``describes_session``).
21
+ * **Fail-open absolute**: every public entrypoint is bounded (``max_push``) and
22
+ wrapped so a large/slow/broken graph degrades to a 1-hop neighbour list or an
23
+ empty result, and NEVER raises into the answer-path.
24
+
25
+ The module is import-light and side-effect free at import time. It only reads the
26
+ KG; it never writes.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import heapq
32
+ import threading
33
+ from dataclasses import dataclass, field
34
+ from typing import Any, Iterable
35
+
36
+
37
# --- Tunables (all bounded; see plan section 4.1) ----------------------------
#
# These are the default keyword values of ``push_ppr`` / ``rank_related``.

# Teleport / restart probability: the share of a node's residual that is
# converted into its PPR estimate on each push; the rest is spread to neighbours.
DEFAULT_ALPHA: float = 0.15      # teleport / restart probability
# Residual threshold for the push frontier; compared against
# ``eps * out-degree`` of the node being considered.
DEFAULT_EPS: float = 1e-4        # residual mass threshold for the push frontier
DEFAULT_MAX_PUSH: int = 2000     # hard cap on push operations (bounds latency)
DEFAULT_TOP_N: int = 12          # max ranked nodes returned
# NOTE(review): not referenced by the code visible in this module; presumably
# enforced by the caller that assembles the seed dict -- confirm.
DEFAULT_MAX_SEEDS: int = 8       # cap on personalization seeds
# Fraction of an edge's mass that flows along the *reverse* direction. KG
# relations are asymmetric (ops:produced != causal:motivated_by), so we follow
# outgoing edges primarily but let a small share traverse backwards so a query
# seeded on a leaf (e.g. a learning) can still reach its parents.
DEFAULT_REVERSE_FRACTION: float = 0.35

# Per-relation multipliers applied BEFORE column normalisation. The big
# structural relations (area / session membership) are the hubs; damping them
# keeps the walk on the semantically informative edges (causal / provenance /
# file-application). Unknown relations default to 1.0.
RELATION_WEIGHTS: dict[str, float] = {
    "causal:verified_by": 1.6,
    "causal:resolved_by": 1.6,
    "causal:motivated_by": 1.4,
    "ops:produced": 1.3,
    "applies_to_file": 1.1,
    "touched": 1.1,
    "belongs_to": 1.0,
    "mentions_email": 0.9,
    "in_domain": 0.8,
    "belongs_to_area": 0.35,
    "describes_session": 0.30,
}
67
+
68
+
69
@dataclass
class PPRGraph:
    """Adjacency snapshot of the active KG edges, held entirely in memory.

    ``out`` and ``inc`` map a node id to ``(neighbour_id, mass)`` pairs for its
    outgoing and incoming edges respectively; ``mass`` is the un-normalised,
    relation-weighted ``weight * confidence`` of the edge. ``out_sum`` keeps the
    summed outgoing mass per node, ``node_meta`` the type/ref/label of every
    node that appears in an edge (so results need no extra SQLite lookup), and
    ``edge_count`` the number of edges that were kept.
    """

    out: dict[int, list[tuple[int, float]]] = field(default_factory=dict)
    inc: dict[int, list[tuple[int, float]]] = field(default_factory=dict)
    out_sum: dict[int, float] = field(default_factory=dict)
    node_meta: dict[int, dict[str, Any]] = field(default_factory=dict)
    edge_count: int = 0

    def degree_out(self, node_id: int) -> int:
        """Return how many outgoing edges ``node_id`` has (0 when unknown)."""
        successors = self.out.get(node_id)
        return len(successors) if successors else 0

    def transitions(self, node_id: int, *, reverse_fraction: float) -> Iterable[tuple[int, float]]:
        """Return ``(neighbour, probability)`` pairs for one step out of a node.

        Outgoing edges contribute their full mass; incoming edges contribute
        ``reverse_fraction`` of theirs (and nothing when that fraction is not
        positive). Contributions to the same neighbour are summed and the
        result is normalised so the probabilities add up to 1. A node with no
        usable mass yields an empty tuple.
        """
        forward = self.out.get(node_id, ())
        backward = self.inc.get(node_id, ())
        if not (forward or backward):
            return ()

        # Forward edges first, then the damped reverse edges, so neighbours
        # keep the same first-seen order in the returned list.
        contributions = list(forward)
        if reverse_fraction > 0:
            contributions.extend(
                (neighbour, edge_mass * reverse_fraction)
                for neighbour, edge_mass in backward
            )

        weights: dict[int, float] = {}
        for neighbour, share in contributions:
            weights[neighbour] = weights.get(neighbour, 0.0) + share

        norm = sum(weights.values())
        if norm <= 0:
            return ()
        return [(neighbour, share / norm) for neighbour, share in weights.items()]
109
+
110
+
111
def _relation_weight(relation: str) -> float:
    """Look up the multiplier for ``relation``; unknown or empty ones get 1.0."""
    key = str(relation) if relation else ""
    return RELATION_WEIGHTS.get(key, 1.0)
113
+
114
+
115
def build_graph(*, max_edges: int | None = None) -> PPRGraph:
    """Load every active KG edge into a fresh ``PPRGraph``.

    Runs one ``SELECT`` over ``kg_edges`` rows whose ``valid_until`` is NULL
    (optionally capped with ``LIMIT max_edges`` when that is a positive number)
    and, if any edge was kept, one ``SELECT`` over ``kg_nodes`` to attach
    metadata to the nodes that appear in an edge. Self-loops and edges whose
    relation-weighted mass is not positive are dropped.

    Nothing is caught here: database errors propagate so callers can degrade.

    NOTE(review): the ``LIMIT`` is applied without an ``ORDER BY``, so which
    edges survive a cap is up to the database -- confirm that is acceptable.
    """
    import knowledge_graph as kg

    conn = kg._get_db()
    graph = PPRGraph()

    query = (
        "SELECT source_id, target_id, relation, weight, confidence "
        "FROM kg_edges WHERE valid_until IS NULL"
    )
    if max_edges and max_edges > 0:
        query += f" LIMIT {int(max_edges)}"

    for edge in conn.execute(query).fetchall():
        source = int(edge["source_id"])
        target = int(edge["target_id"])
        if source == target:
            # A self-loop carries no associative signal.
            continue

        # NULL weight / confidence both default to 1.0; negatives clamp to 0.
        raw_weight = edge["weight"]
        raw_confidence = edge["confidence"]
        weight = float(1.0 if raw_weight is None else raw_weight)
        confidence = float(1.0 if raw_confidence is None else raw_confidence)
        mass = max(0.0, weight) * max(0.0, confidence) * _relation_weight(edge["relation"])
        if mass <= 0:
            continue

        graph.out.setdefault(source, []).append((target, mass))
        graph.inc.setdefault(target, []).append((source, mass))
        graph.out_sum[source] = graph.out_sum.get(source, 0.0) + mass
        graph.edge_count += 1

    # Metadata is only kept for nodes that take part in at least one edge.
    linked = set(graph.out) | set(graph.inc)
    if not linked:
        return graph

    node_rows = conn.execute(
        "SELECT id, node_type, node_ref, label FROM kg_nodes"
    ).fetchall()
    for node in node_rows:
        node_id = int(node["id"])
        if node_id not in linked:
            continue
        graph.node_meta[node_id] = {
            "id": node_id,
            "node_type": node["node_type"],
            "node_ref": node["node_ref"],
            "label": node["label"],
        }
    return graph
164
+
165
+
166
+ # --- Per-process graph cache + safe background pre-warm ----------------------
167
+ #
168
+ # WHY A CACHE. ``build_graph`` is a bulk load of all active KG edges (~13k rows
169
+ # on the real KG, ~20ms) plus a node-metadata SELECT. The adapter used to pay
170
+ # that on EVERY answer. Worse, a cold process pays it on top of imports +
171
+ # entity resolution + the first SQLite touch, and the combined ~167ms blows the
172
+ # 120ms per-source step timeout, so the dispatcher aborts the step
173
+ # (``aborted_reason='timeout'``) and the feature contributes nothing on query-1.
174
+ # Caching the built graph per process drops warm queries to ~5-7ms.
175
+ #
176
+ # WHY (MAX(id), COUNT) AND NOT ``get_change_watermark``. The working-memory
177
+ # resolution cache invalidates on ``db.get_change_watermark`` == ``MAX(id) FROM
178
+ # change_log``. But the KG write path (``knowledge_graph.upsert_edge`` /
179
+ # ``delete_edge``) mutates ``kg_edges`` DIRECTLY and never appends to
180
+ # ``change_log`` — so the global watermark does NOT move when edges are added,
181
+ # superseded or retired, and reusing it would serve a stale graph forever. The
182
+ # correct, equally-cheap KG-local fingerprint is a single SELECT of
183
+ # ``(MAX(id), COUNT(*))`` over active edges (``valid_until IS NULL``). It catches
184
+ # every mutation shape coherently with the anti-stale discipline:
185
+ # * ADD -> a new row: MAX(id) rises (and COUNT rises)
186
+ # * UPDATE -> supersede: old row gets valid_until + new row inserted ->
187
+ # MAX(id) rises
188
+ # * DELETE -> supersede: old row gets valid_until -> active COUNT drops
189
+ # So whenever the KG changes, the fingerprint changes and the cache rebuilds.
190
+ #
191
+ # The cache is keyed by ``(db_path, fingerprint)`` so a test that swaps
192
+ # ``COGNITIVE_DB`` to a tmp file (conftest isolation) never reads another DB's
193
+ # graph, and a moved/rotated prod DB rebuilds rather than serving the old one.
194
+
195
# db path -> (fingerprint the graph was built under, the built graph).
_GRAPH_CACHE: dict[str, tuple[tuple[int, int], "PPRGraph"]] = {}
# Held while a graph is built and stored, and while the cache is cleared.
_GRAPH_CACHE_LOCK = threading.Lock()
# Guards the check-and-start of the background pre-warm thread.
_PREWARM_LOCK = threading.Lock()
# The most recently started pre-warm thread, or None if none was started yet.
_prewarm_thread: threading.Thread | None = None

# Cache statistics (process-local; used by tests/benchmarks to assert that a
# warm call did NOT rebuild). Best-effort, not thread-perfect.
_CACHE_STATS: dict[str, int] = {"builds": 0, "hits": 0}
203
+
204
+
205
+ def _kg_db_path() -> str:
206
+ """Resolved cognitive.db path — the cache namespace. Cheap, no connection."""
207
+ try:
208
+ import cognitive._core as cog_core
209
+
210
+ return str(cog_core.COGNITIVE_DB)
211
+ except Exception:
212
+ return ""
213
+
214
+
215
def kg_fingerprint() -> tuple[int, int]:
    """Return ``(MAX(id), COUNT(*))`` over the active rows of ``kg_edges``.

    This pair is the invalidation signal for the graph cache. ``(0, 0)`` is
    returned when the query yields nothing, when a value is NULL (that
    component becomes 0), when a value cannot be converted to ``int``, or when
    importing / querying the knowledge graph fails.
    """
    unavailable = (0, 0)

    try:
        import knowledge_graph as kg

        row = kg._get_db().execute(
            "SELECT MAX(id), COUNT(*) FROM kg_edges WHERE valid_until IS NULL"
        ).fetchone()
    except Exception:
        return unavailable

    if not row:
        return unavailable

    try:
        newest_id, active_count = (
            0 if cell is None else int(cell) for cell in (row[0], row[1])
        )
    except (TypeError, ValueError):
        return unavailable
    return (newest_id, active_count)
239
+
240
+
241
def get_cached_graph(*, max_edges: int | None = None) -> PPRGraph:
    """Return the cached graph for the current DB, building it when stale.

    The cache entry for the current DB path is reused while its stored
    fingerprint equals ``kg_fingerprint()``; otherwise the graph is rebuilt
    under ``_GRAPH_CACHE_LOCK`` and stored with the fingerprint read at the
    start of this call. Errors from ``build_graph`` propagate; callers provide
    the fail-open behaviour.

    NOTE(review): ``max_edges`` is forwarded to ``build_graph`` but is not part
    of the cache key, so a graph built with one cap is returned to later callers
    that ask for a different cap (or none) -- confirm that is intended.
    """
    namespace = _kg_db_path()
    current = kg_fingerprint()

    def _fresh_entry() -> PPRGraph | None:
        # The stored graph, but only if it was built under the same fingerprint.
        entry = _GRAPH_CACHE.get(namespace)
        if entry is None:
            return None
        stamp, stored = entry
        return stored if stamp == current else None

    # Fast path: no lock needed for a plain read of a current entry.
    hit = _fresh_entry()
    if hit is not None:
        _CACHE_STATS["hits"] += 1
        return hit

    # Builds are serialised; a caller that waited here re-checks first so it
    # can reuse what the previous holder of the lock just stored.
    with _GRAPH_CACHE_LOCK:
        hit = _fresh_entry()
        if hit is not None:
            _CACHE_STATS["hits"] += 1
            return hit
        built = build_graph(max_edges=max_edges)
        _GRAPH_CACHE[namespace] = (current, built)
        _CACHE_STATS["builds"] += 1
        return built
266
+
267
+
268
def cache_is_warm() -> bool:
    """Report whether a graph for the current DB path and fingerprint is cached.

    Never builds anything: it only compares the stored fingerprint with a fresh
    ``kg_fingerprint()``, and skips even that query when nothing is cached for
    this DB path.
    """
    entry = _GRAPH_CACHE.get(_kg_db_path())
    if entry is None:
        return False
    return entry[0] == kg_fingerprint()
276
+
277
+
278
def reset_graph_cache() -> None:
    """Empty the graph cache and zero the build/hit counters."""
    with _GRAPH_CACHE_LOCK:
        _GRAPH_CACHE.clear()
        _CACHE_STATS.update(builds=0, hits=0)
284
+
285
+
286
def prewarm_async() -> None:
    """Start a daemon thread that fills the graph cache, then return at once.

    Nothing is started when the cache is already warm or when a previously
    started pre-warm thread is still alive, so repeated calls are cheap. Any
    exception raised by the build inside the thread is discarded; a later
    ``get_cached_graph`` call simply builds (or fails) on its own.
    """
    global _prewarm_thread

    if cache_is_warm():
        return

    def _build_quietly() -> None:
        try:
            get_cached_graph()
        except Exception:
            pass  # fail-open: inline path will retry/degrade

    with _PREWARM_LOCK:
        still_running = _prewarm_thread is not None and _prewarm_thread.is_alive()
        # Re-check warmth under the lock: another caller may have finished a
        # build between the unlocked check above and acquiring the lock.
        if still_running or cache_is_warm():
            return
        worker = threading.Thread(target=_build_quietly, name="ppr-prewarm", daemon=True)
        _prewarm_thread = worker
        worker.start()
319
+
320
+
321
def push_ppr(
    graph: PPRGraph,
    seeds: dict[int, float],
    *,
    alpha: float = DEFAULT_ALPHA,
    eps: float = DEFAULT_EPS,
    max_push: int = DEFAULT_MAX_PUSH,
    reverse_fraction: float = DEFAULT_REVERSE_FRACTION,
) -> dict[int, float]:
    """Forward-push approximate Personalized PageRank (Andersen-Chung-Lang).

    Args:
        graph: Adjacency to walk; only ``degree_out`` and ``transitions`` are used.
        seeds: Personalization vector ``{node_id: weight}``. It need not be
            normalised; non-positive weights are ignored and the rest are
            scaled to sum to 1.
        alpha: Share of a node's residual turned into its estimate per push.
        eps: Residual threshold; a node is pushed only when its residual is at
            least ``eps * max(1, out-degree)``.
        max_push: Hard cap on the number of push operations.
        reverse_fraction: Forwarded to ``graph.transitions``.

    Returns:
        ``{node_id: score}`` for the nodes that were pushed at least once.
        Empty when ``seeds`` is empty or carries no positive weight.

    Deterministic for a fixed graph and seeds: the frontier is a heap of
    ``(-residual, node_id)`` tuples, so ties are broken by node id.
    """
    if not seeds:
        return {}
    total = sum(max(0.0, v) for v in seeds.values())
    if total <= 0:
        return {}

    estimate: dict[int, float] = {}
    residual: dict[int, float] = {n: v / total for n, v in seeds.items() if v > 0}

    # Max-heap on residual mass (negate for heapq min-heap). The (-, id) tuple
    # makes ordering total and the result deterministic.
    frontier: list[tuple[float, int]] = [(-r, n) for n, r in residual.items()]
    heapq.heapify(frontier)
    in_frontier = set(residual)

    # Every seed is pushed once regardless of ``eps`` so that a high-degree
    # seed still spreads its mass. The exemption is deliberately one-shot:
    # exempting seeds on every pop lets two adjacent seeds bounce an
    # ever-shrinking residual between them until ``max_push`` is exhausted.
    awaiting_first_push = set(residual)

    # ``transitions`` rebuilds and normalises a dict on each call, and a node
    # may be pushed many times, so the result is remembered per node for the
    # duration of this call.
    transition_cache: dict[int, list[tuple[int, float]]] = {}

    pushes = 0
    while frontier and pushes < max_push:
        _, u = heapq.heappop(frontier)
        in_frontier.discard(u)
        r = residual.get(u, 0.0)
        if r <= 0:
            continue
        if u in awaiting_first_push:
            awaiting_first_push.discard(u)
        elif r < eps * (graph.degree_out(u) or 1):
            # Below the per-degree threshold (standard ACL stopping condition):
            # leave the residual in place; it is re-queued if more mass arrives.
            continue

        estimate[u] = estimate.get(u, 0.0) + alpha * r
        mass = (1.0 - alpha) * r
        residual[u] = 0.0
        pushes += 1

        edges = transition_cache.get(u)
        if edges is None:
            edges = list(graph.transitions(u, reverse_fraction=reverse_fraction))
            transition_cache[u] = edges

        for v, p_uv in edges:
            add = mass * p_uv
            if add <= 0:
                continue
            new_r = residual.get(v, 0.0) + add
            residual[v] = new_r
            # A node already queued keeps its old heap priority; its up-to-date
            # residual is read from ``residual`` when it is popped.
            if v not in in_frontier:
                heapq.heappush(frontier, (-new_r, v))
                in_frontier.add(v)

    return estimate
381
+
382
+
383
@dataclass
class RankedNode:
    """One related KG node as returned by ``rank_related`` / ``fallback_neighbors``."""

    # Id of the node in the knowledge graph.
    node_id: int
    # PPR score rounded to 6 decimals by ``rank_related``; ``fallback_neighbors``
    # always stores 0.0 here because it does no ranking.
    score: float
    # Node metadata; each is the empty string when the value is missing.
    node_type: str
    node_ref: str
    label: str
390
+
391
+
392
def rank_related(
    seeds: dict[int, float],
    *,
    graph: PPRGraph | None = None,
    top_n: int = DEFAULT_TOP_N,
    alpha: float = DEFAULT_ALPHA,
    eps: float = DEFAULT_EPS,
    max_push: int = DEFAULT_MAX_PUSH,
    reverse_fraction: float = DEFAULT_REVERSE_FRACTION,
) -> list[RankedNode]:
    """Rank the nodes most related to ``seeds`` with Personalized PageRank.

    Uses ``graph`` when given, otherwise the per-process cached graph. The seed
    nodes themselves and nodes with a non-positive score are left out; the rest
    are ordered by score (highest first, node id ascending on ties) and cut to
    ``top_n`` entries, with scores rounded to 6 decimals.

    Fail-open: an empty ``seeds`` or any exception yields ``[]``.
    """
    try:
        if not seeds:
            return []

        active_graph = get_cached_graph() if graph is None else graph
        scores = push_ppr(
            active_graph,
            seeds,
            alpha=alpha,
            eps=eps,
            max_push=max_push,
            reverse_fraction=reverse_fraction,
        )

        excluded = set(seeds)
        candidates = [
            (node_id, value)
            for node_id, value in scores.items()
            if value > 0 and node_id not in excluded
        ]
        # Deterministic order: score desc, then node id asc.
        ordered = sorted(candidates, key=lambda pair: (-pair[1], pair[0]))
        keep = max(0, int(top_n))

        def _to_ranked(node_id: int, value: float) -> RankedNode:
            # Missing metadata degrades to empty strings rather than failing.
            meta = active_graph.node_meta.get(node_id) or {}
            return RankedNode(
                node_id=node_id,
                score=round(float(value), 6),
                node_type=str(meta.get("node_type") or ""),
                node_ref=str(meta.get("node_ref") or ""),
                label=str(meta.get("label") or ""),
            )

        return [_to_ranked(node_id, value) for node_id, value in ordered[:keep]]
    except Exception:
        return []
442
+
443
+
444
def fallback_neighbors(seed_ids: Iterable[int], *, limit: int = 6) -> list[RankedNode]:
    """1-hop degraded path (parity with ``kg_neighbors``) if PPR can't run.

    For each seed, takes up to ``limit`` rows from
    ``knowledge_graph.get_neighbors(seed, active_only=True)`` and keeps the
    endpoint that is not the seed, skipping seeds and duplicates.

    Args:
        seed_ids: Node ids to expand. Any iterable works, including a one-shot
            iterator; values are converted with ``int``.
        limit: Per-seed cap on neighbour rows; the overall result is capped at
            ``limit * 2`` entries.

    Returns:
        Unranked neighbours (``score`` is always 0.0) in discovery order.
        Fail-open: returns ``[]`` on any error.
    """
    try:
        import knowledge_graph as kg

        # Materialise the seeds once. The caller may pass a one-shot iterator,
        # and re-reading it for the membership test would both drain it while
        # it is being looped over and rebuild a set per neighbour. Converting to
        # int here also makes the membership test compare like with like.
        # ``dict.fromkeys`` drops duplicate seeds while keeping their order.
        seeds = list(dict.fromkeys(int(sid) for sid in seed_ids))
        seed_set = set(seeds)
        cap = max(0, limit * 2)

        seen: set[int] = set()
        out: list[RankedNode] = []
        for sid in seeds:
            if len(out) >= cap:
                break  # anything further would be sliced off below anyway
            for nb in kg.get_neighbors(sid, active_only=True)[:limit]:
                # neighbour node id is the *other* endpoint
                source = int(nb["source_id"])
                nid = int(nb["target_id"]) if source == sid else source
                if nid in seen or nid in seed_set:
                    continue
                seen.add(nid)
                out.append(
                    RankedNode(
                        node_id=nid,
                        score=0.0,
                        node_type=str(nb.get("node_type") or ""),
                        node_ref=str(nb.get("node_ref") or ""),
                        label=str(nb.get("label") or ""),
                    )
                )
        return out[:cap]
    except Exception:
        return []