@pentatonic-ai/ai-agent-sdk 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/packages/memory/openclaw-plugin/index.js +7 -0
- package/packages/memory/openclaw-plugin/openclaw.plugin.json +9 -1
- package/packages/memory/openclaw-plugin/package.json +1 -1
- package/packages/memory/src/__tests__/engine.test.js +142 -0
- package/packages/memory/src/engine.js +65 -0
- package/packages/memory-engine/compat/server.py +90 -5
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +596 -58
- package/packages/memory-engine/scripts/wipe-legacy-l3-entities.py +128 -0
- package/packages/memory-engine/tests/e2e_arena.sh +28 -4
- package/packages/memory-engine/tests/test_aggregate.py +333 -0
- package/packages/memory-engine/tests/test_arena_safety.py +232 -0
- package/packages/memory-engine/tests/test_channel_stat_reader.py +437 -0
- package/packages/memory-engine/tests/test_channel_stat_rollups.py +308 -0
- package/packages/memory-engine/tests/test_l3_arena_isolation.py +412 -0
|
@@ -230,6 +230,12 @@ class ChatCompletionRequest(BaseModel):
|
|
|
230
230
|
model: str = "gpt-3.5-turbo"
|
|
231
231
|
max_tokens: int = 1000
|
|
232
232
|
temperature: float = 0.1
|
|
233
|
+
# Optional tenant scope. When absent, the L3 graph layer returns no
|
|
234
|
+
# results (rather than walking the global graph) — other layers
|
|
235
|
+
# still respond, so the call succeeds but with reduced L3 context.
|
|
236
|
+
# Existing single-tenant callers (benchmarks, dev) keep working.
|
|
237
|
+
arena: Optional[str] = None
|
|
238
|
+
arenas: Optional[List[str]] = None
|
|
233
239
|
|
|
234
240
|
class EmbeddingRequest(BaseModel):
|
|
235
241
|
input: Any
|
|
@@ -261,9 +267,15 @@ def extract_query_entities(query: str) -> List[str]:
|
|
|
261
267
|
log.info(f"Extracted entities: {potential_entities}")
|
|
262
268
|
return potential_entities
|
|
263
269
|
|
|
264
|
-
def _hebbian_strengthen(session, node_names: List[str], increment: float = 0.05) -> None:
|
|
265
|
-
"""Hebbian: strengthen edges between co-accessed nodes during query.
|
|
266
|
-
|
|
270
|
+
def _hebbian_strengthen(session, arenas: List[str], node_names: List[str], increment: float = 0.05) -> None:
|
|
271
|
+
"""Hebbian: strengthen edges between co-accessed nodes during query.
|
|
272
|
+
|
|
273
|
+
Scoped by arena so a search inside tenant A can't reinforce edges
|
|
274
|
+
inside tenant B's graph (which would happen via shared entity-name
|
|
275
|
+
nodes pre-arena). When `arenas` is empty (single-tenant local dev,
|
|
276
|
+
benchmarks) we no-op rather than risk a cross-tenant write.
|
|
277
|
+
"""
|
|
278
|
+
if len(node_names) < 2 or not arenas:
|
|
267
279
|
return
|
|
268
280
|
now = datetime.utcnow().isoformat() + "Z"
|
|
269
281
|
for i, n1 in enumerate(node_names):
|
|
@@ -271,16 +283,28 @@ def _hebbian_strengthen(session, node_names: List[str], increment: float = 0.05)
|
|
|
271
283
|
try:
|
|
272
284
|
session.run(
|
|
273
285
|
"""MATCH (a {name: $n1})-[r]-(b {name: $n2})
|
|
286
|
+
WHERE a.arena IN $arenas AND b.arena IN $arenas
|
|
274
287
|
SET r.weight = coalesce(r.weight, 1.0) + $inc,
|
|
275
288
|
r.last_accessed = $now""",
|
|
276
|
-
n1=n1, n2=n2, inc=increment, now=now
|
|
289
|
+
n1=n1, n2=n2, arenas=arenas, inc=increment, now=now
|
|
277
290
|
)
|
|
278
291
|
except Exception:
|
|
279
292
|
pass # non-critical
|
|
280
293
|
|
|
281
294
|
|
|
282
|
-
def search_neo4j_sequential(query: str, entities: List[str], limit: int = 12) -> Dict:
|
|
283
|
-
"""Phase 1: Neo4j graph search with spreading activation + Hebbian.
|
|
295
|
+
def search_neo4j_sequential(query: str, entities: List[str], arenas: List[str], limit: int = 12) -> Dict:
|
|
296
|
+
"""Phase 1: Neo4j graph search with spreading activation + Hebbian.
|
|
297
|
+
|
|
298
|
+
`arenas` is the tenant-scope set the caller is authorised for —
|
|
299
|
+
typically [clientId] or [clientId, clientId:userId]. Every Cypher
|
|
300
|
+
clause filters on `n.arena IN $arenas`, so a search from tenant A
|
|
301
|
+
can never traverse into entity nodes belonging to tenant B even
|
|
302
|
+
when their names collide. Empty `arenas` short-circuits to no
|
|
303
|
+
results — that's safer than walking the entire graph in dev/test.
|
|
304
|
+
"""
|
|
305
|
+
if not arenas:
|
|
306
|
+
log.warning("search_neo4j_sequential called without arenas — returning empty results")
|
|
307
|
+
return {"results": [], "graph_entities": [], "entity_count": 0}
|
|
284
308
|
try:
|
|
285
309
|
driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
|
|
286
310
|
results = []
|
|
@@ -289,19 +313,20 @@ def search_neo4j_sequential(query: str, entities: List[str], limit: int = 12) ->
|
|
|
289
313
|
with driver.session() as session:
|
|
290
314
|
# Search for specific entities — use weighted spreading activation
|
|
291
315
|
for entity in entities:
|
|
292
|
-
# Direct match first
|
|
316
|
+
# Direct match first — arena-scoped on every node we touch.
|
|
293
317
|
cypher = """
|
|
294
318
|
MATCH (n)
|
|
295
|
-
WHERE n.name CONTAINS $entity
|
|
319
|
+
WHERE n.name CONTAINS $entity AND n.arena IN $arenas
|
|
296
320
|
OPTIONAL MATCH (n)-[r]-(connected)
|
|
297
|
-
WHERE
|
|
321
|
+
WHERE connected.arena IN $arenas
|
|
322
|
+
AND coalesce(r.weight, 1.0) >= 0.2
|
|
298
323
|
RETURN n, r, connected, $entity as search_entity,
|
|
299
324
|
coalesce(r.weight, 1.0) AS edge_weight
|
|
300
325
|
ORDER BY edge_weight DESC
|
|
301
326
|
LIMIT $limit
|
|
302
327
|
"""
|
|
303
328
|
|
|
304
|
-
records = session.run(cypher, entity=entity, limit=8)
|
|
329
|
+
records = session.run(cypher, entity=entity, arenas=arenas, limit=8)
|
|
305
330
|
|
|
306
331
|
for record in records:
|
|
307
332
|
node = _serialize_neo4j_value(dict(record["n"]))
|
|
@@ -336,11 +361,17 @@ def search_neo4j_sequential(query: str, entities: List[str], limit: int = 12) ->
|
|
|
336
361
|
"node_data": node
|
|
337
362
|
})
|
|
338
363
|
|
|
339
|
-
# 2-hop spreading activation for high-weight paths
|
|
364
|
+
# 2-hop spreading activation for high-weight paths.
|
|
365
|
+
# Every node along the walk must be in-arena. Without
|
|
366
|
+
# the filter, an activation could walk into another
|
|
367
|
+
# tenant's graph via a name-collision on the start node.
|
|
340
368
|
if entity:
|
|
341
369
|
activation_results = session.run("""
|
|
342
370
|
MATCH (start)-[r1]-(mid)-[r2]-(end)
|
|
343
371
|
WHERE start.name CONTAINS $entity
|
|
372
|
+
AND start.arena IN $arenas
|
|
373
|
+
AND mid.arena IN $arenas
|
|
374
|
+
AND end.arena IN $arenas
|
|
344
375
|
AND coalesce(r1.weight, 1.0) >= 0.5
|
|
345
376
|
AND coalesce(r2.weight, 1.0) >= 0.5
|
|
346
377
|
AND start <> end
|
|
@@ -349,7 +380,7 @@ def search_neo4j_sequential(query: str, entities: List[str], limit: int = 12) ->
|
|
|
349
380
|
mid.name AS via
|
|
350
381
|
ORDER BY activation DESC
|
|
351
382
|
LIMIT 5
|
|
352
|
-
""", entity=entity)
|
|
383
|
+
""", entity=entity, arenas=arenas)
|
|
353
384
|
|
|
354
385
|
for rec in activation_results:
|
|
355
386
|
end_node = _serialize_neo4j_value(dict(rec["end"])) if rec["end"] else {}
|
|
@@ -365,20 +396,24 @@ def search_neo4j_sequential(query: str, entities: List[str], limit: int = 12) ->
|
|
|
365
396
|
"node_data": end_node
|
|
366
397
|
})
|
|
367
398
|
|
|
368
|
-
# General query search if no specific entities found
|
|
399
|
+
# General query search if no specific entities found —
|
|
400
|
+
# arena-gated so the fallback can't walk other tenants'
|
|
401
|
+
# nodes when the heuristic entity extractor returned nothing.
|
|
369
402
|
if not results:
|
|
370
403
|
general_words = [w for w in query.split() if len(w) > 3 and w.lower() not in ['what', 'who', 'where', 'when', 'how']]
|
|
371
404
|
|
|
372
405
|
for word in general_words[:2]:
|
|
373
406
|
cypher = """
|
|
374
407
|
MATCH (n)
|
|
375
|
-
WHERE
|
|
408
|
+
WHERE n.arena IN $arenas
|
|
409
|
+
AND ANY(prop IN keys(n) WHERE n[prop] IS :: STRING AND n[prop] CONTAINS $term)
|
|
376
410
|
OPTIONAL MATCH (n)-[r]-(connected)
|
|
411
|
+
WHERE connected.arena IN $arenas
|
|
377
412
|
RETURN n, r, connected
|
|
378
413
|
LIMIT $limit
|
|
379
414
|
"""
|
|
380
415
|
|
|
381
|
-
records = session.run(cypher, term=word, limit=4)
|
|
416
|
+
records = session.run(cypher, term=word, arenas=arenas, limit=4)
|
|
382
417
|
|
|
383
418
|
for record in records:
|
|
384
419
|
node = _serialize_neo4j_value(dict(record["n"]))
|
|
@@ -395,7 +430,7 @@ def search_neo4j_sequential(query: str, entities: List[str], limit: int = 12) ->
|
|
|
395
430
|
})
|
|
396
431
|
|
|
397
432
|
# Hebbian: strengthen edges between all accessed entities
|
|
398
|
-
_hebbian_strengthen(session, list(graph_entities))
|
|
433
|
+
_hebbian_strengthen(session, arenas, list(graph_entities))
|
|
399
434
|
|
|
400
435
|
driver.close()
|
|
401
436
|
|
|
@@ -974,9 +1009,12 @@ def sequential_hybridrag_search(query: str, limit: int = 16,
|
|
|
974
1009
|
log.info(f"L1 System files: {len(system_results)} results")
|
|
975
1010
|
|
|
976
1011
|
# L2: HybridRAG orchestration
|
|
977
|
-
# L3: Graph search (entity extraction + Neo4j)
|
|
1012
|
+
# L3: Graph search (entity extraction + Neo4j) — arena-scoped so a
|
|
1013
|
+
# tenant's search can never traverse another tenant's entity graph
|
|
1014
|
+
# via name collisions on shared :Entity nodes. The post-filter shim
|
|
1015
|
+
# protects chunks; this protects the entity-walking layer too.
|
|
978
1016
|
entities = extract_query_entities(query)
|
|
979
|
-
graph_context = search_neo4j_sequential(query, entities, limit=8)
|
|
1017
|
+
graph_context = search_neo4j_sequential(query, entities, arena_list, limit=8)
|
|
980
1018
|
log.info(f"L3 Graph search: {len(graph_context['results'])} results, {graph_context['entity_count']} entities")
|
|
981
1019
|
|
|
982
1020
|
# HyDE: expand query for better vector embeddings
|
|
@@ -1058,9 +1096,12 @@ async def search_endpoint(request: Request) -> dict:
|
|
|
1058
1096
|
|
|
1059
1097
|
results = sequential_hybridrag_search(query, limit=limit, arena=arena, arenas=arenas)
|
|
1060
1098
|
|
|
1061
|
-
# Also return raw graph entities for context enrichment
|
|
1099
|
+
# Also return raw graph entities for context enrichment.
|
|
1100
|
+
# Same arena scope as the cascade search above — without it
|
|
1101
|
+
# the entities returned could include cross-tenant rows.
|
|
1102
|
+
arena_list = list(arenas) if arenas else ([arena] if arena else [])
|
|
1062
1103
|
entities = extract_query_entities(query)
|
|
1063
|
-
graph_context = search_neo4j_sequential(query, entities, limit=8)
|
|
1104
|
+
graph_context = search_neo4j_sequential(query, entities, arena_list, limit=8)
|
|
1064
1105
|
|
|
1065
1106
|
return {
|
|
1066
1107
|
"results": results,
|
|
@@ -1125,9 +1166,15 @@ async def chat_completions(request: ChatCompletionRequest) -> dict:
|
|
|
1125
1166
|
|
|
1126
1167
|
query = user_messages[-1].content
|
|
1127
1168
|
|
|
1128
|
-
# Perform sequential HybridRAG search
|
|
1169
|
+
# Perform sequential HybridRAG search — pass through tenant
|
|
1170
|
+
# scope from the request so L3 graph traversal stays inside the
|
|
1171
|
+
# caller's arena. The search function short-circuits L3 to
|
|
1172
|
+
# empty when no arenas are supplied; callers that need L3 must
|
|
1173
|
+
# pass `arena` or `arenas` on the request body.
|
|
1129
1174
|
start_time = time.time()
|
|
1130
|
-
results = sequential_hybridrag_search(
|
|
1175
|
+
results = sequential_hybridrag_search(
|
|
1176
|
+
query, limit=16, arena=request.arena, arenas=request.arenas,
|
|
1177
|
+
)
|
|
1131
1178
|
search_time = time.time() - start_time
|
|
1132
1179
|
|
|
1133
1180
|
# Format results with correct layer structure
|
|
@@ -1183,38 +1230,57 @@ async def chat_completions(request: ChatCompletionRequest) -> dict:
|
|
|
1183
1230
|
raise HTTPException(status_code=500, detail=str(e))
|
|
1184
1231
|
|
|
1185
1232
|
@app.get("/contradictions/{node_name}")
|
|
1186
|
-
async def check_contradictions(node_name: str) -> dict:
|
|
1187
|
-
"""Detect contradictions around a named node.
|
|
1233
|
+
async def check_contradictions(node_name: str, arena: Optional[str] = None) -> dict:
|
|
1234
|
+
"""Detect contradictions around a named node.
|
|
1235
|
+
|
|
1236
|
+
`arena` is required to scope the lookup to one tenant's graph. The
|
|
1237
|
+
endpoint returns a 400 when called without it — silently spanning
|
|
1238
|
+
the entire graph here would leak entity names across tenants via
|
|
1239
|
+
the `node_name` lookup.
|
|
1240
|
+
"""
|
|
1241
|
+
if not arena:
|
|
1242
|
+
raise HTTPException(
|
|
1243
|
+
status_code=400,
|
|
1244
|
+
detail="arena query parameter is required to scope contradiction lookup",
|
|
1245
|
+
)
|
|
1188
1246
|
try:
|
|
1189
1247
|
driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
|
|
1190
1248
|
contradictions = []
|
|
1191
1249
|
with driver.session() as session:
|
|
1192
|
-
# Find the node
|
|
1250
|
+
# Find the node — must be in the caller's arena.
|
|
1193
1251
|
node = session.run(
|
|
1194
|
-
"MATCH (n) WHERE toLower(n.name) = toLower($name)
|
|
1252
|
+
"""MATCH (n) WHERE toLower(n.name) = toLower($name) AND n.arena = $arena
|
|
1253
|
+
RETURN elementId(n) AS id""",
|
|
1254
|
+
name=node_name, arena=arena,
|
|
1195
1255
|
).single()
|
|
1196
1256
|
if not node:
|
|
1197
1257
|
return {"node": node_name, "contradictions": [], "error": "Node not found"}
|
|
1198
1258
|
nid = node["id"]
|
|
1199
1259
|
|
|
1200
|
-
# Explicit CONTRADICTS
|
|
1260
|
+
# Explicit CONTRADICTS — both endpoints must be in the same arena.
|
|
1201
1261
|
for rec in session.run(
|
|
1202
|
-
"""MATCH (a)-[r:CONTRADICTS]-(b)
|
|
1203
|
-
|
|
1262
|
+
"""MATCH (a)-[r:CONTRADICTS]-(b)
|
|
1263
|
+
WHERE elementId(a) = $nid AND b.arena = $arena
|
|
1264
|
+
RETURN a.name AS a, b.name AS b, r.reason AS reason""",
|
|
1265
|
+
nid=nid, arena=arena,
|
|
1204
1266
|
):
|
|
1205
1267
|
contradictions.append({"type": "explicit", "a": rec["a"], "b": rec["b"], "reason": rec["reason"]})
|
|
1206
1268
|
|
|
1207
|
-
# Property conflicts via shared neighbour
|
|
1269
|
+
# Property conflicts via shared neighbour — every node along
|
|
1270
|
+
# the (a)--(shared)--(b) path filtered by arena so a shared
|
|
1271
|
+
# neighbour from another tenant can't trigger a false-positive
|
|
1272
|
+
# conflict in this tenant's view.
|
|
1208
1273
|
for rec in session.run(
|
|
1209
1274
|
"""MATCH (a)--(shared)--(b)
|
|
1210
1275
|
WHERE elementId(a) = $nid AND a <> b
|
|
1276
|
+
AND shared.arena = $arena AND b.arena = $arena
|
|
1211
1277
|
WITH a, b, shared, properties(a) AS pa, properties(b) AS pb
|
|
1212
1278
|
WITH a, b, shared,
|
|
1213
1279
|
[k IN keys(pa) WHERE k IN keys(pb) AND pa[k] <> pb[k]
|
|
1214
1280
|
AND NOT k IN ['last_accessed','embedding','created_at','updated_at','id','weight']] AS ck
|
|
1215
1281
|
WHERE size(ck) > 0
|
|
1216
1282
|
RETURN a.name AS a, b.name AS b, shared.name AS via, ck
|
|
1217
|
-
LIMIT 10""", nid=nid
|
|
1283
|
+
LIMIT 10""", nid=nid, arena=arena,
|
|
1218
1284
|
):
|
|
1219
1285
|
contradictions.append({
|
|
1220
1286
|
"type": "property_conflict", "a": rec["a"], "b": rec["b"],
|
|
@@ -1472,22 +1538,70 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
|
|
|
1472
1538
|
log.error(f"L4 QMD write failed: {e}")
|
|
1473
1539
|
|
|
1474
1540
|
# ---- L3 Neo4j KG ----------------------------------------------------
|
|
1541
|
+
# Every node and edge written here is arena-scoped. Two paths:
|
|
1542
|
+
#
|
|
1543
|
+
# 1. Heuristic Concept extraction — title-case + bigrams over the
|
|
1544
|
+
# chunk body, same as before. Concepts MERGE on (arena, name)
|
|
1545
|
+
# so two tenants can independently mint a "Pricing" concept
|
|
1546
|
+
# without colliding.
|
|
1547
|
+
#
|
|
1548
|
+
# 2. Metadata-driven Person extraction — when the chunk's metadata
|
|
1549
|
+
# carries contact_email / contact_name (Pip emits these from
|
|
1550
|
+
# its ingest pipeline; other clients can do the same), we MERGE
|
|
1551
|
+
# a typed (:Entity:Person) node and connect it to the chunk via
|
|
1552
|
+
# a (:COMMUNICATED) edge that carries channel + direction. This
|
|
1553
|
+
# is the path the relationships UI reads from — it's reliable
|
|
1554
|
+
# because the writer knows exactly who the person is, no NLP
|
|
1555
|
+
# guessing required.
|
|
1556
|
+
#
|
|
1557
|
+
# The compound (arena, name) MERGE guarantees no cross-tenant entity
|
|
1558
|
+
# collapse. Pre-existing unscoped entities (arena IS NULL) are left
|
|
1559
|
+
# alone; the wipe-legacy migration script handles them out of band.
|
|
1475
1560
|
l3_entities = 0
|
|
1476
1561
|
l3_chunks = 0
|
|
1477
1562
|
try:
|
|
1478
1563
|
driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
|
|
1479
1564
|
with driver.session() as session:
|
|
1480
|
-
#
|
|
1565
|
+
# Indexes — idempotent. The compound (arena, name) is the
|
|
1566
|
+
# right shape now that entities are arena-scoped; the legacy
|
|
1567
|
+
# entity_name index stays for the wipe-migration to work
|
|
1568
|
+
# against pre-arena rows, then can be dropped in a follow-up.
|
|
1481
1569
|
try:
|
|
1482
|
-
session.run("CREATE INDEX
|
|
1570
|
+
session.run("CREATE INDEX entity_arena_name IF NOT EXISTS FOR (n:Entity) ON (n.arena, n.name)")
|
|
1571
|
+
session.run("CREATE INDEX person_arena_email IF NOT EXISTS FOR (n:Person) ON (n.arena, n.email)")
|
|
1572
|
+
session.run("CREATE INDEX chunk_arena IF NOT EXISTS FOR (c:Chunk) ON (c.arena)")
|
|
1483
1573
|
session.run("CREATE INDEX chunk_id IF NOT EXISTS FOR (c:Chunk) ON (c.id)")
|
|
1574
|
+
# ChannelStat is the denormalised aggregate read by
|
|
1575
|
+
# /aggregate on the fast path. Compound index covers
|
|
1576
|
+
# the (arena, person_email) lookup that the reader
|
|
1577
|
+
# uses; the per-channel rows are returned in one
|
|
1578
|
+
# range scan.
|
|
1579
|
+
session.run("CREATE INDEX channelstat_arena_email IF NOT EXISTS FOR (s:ChannelStat) ON (s.arena, s.person_email)")
|
|
1580
|
+
# UNIQUE constraint on the writer's MERGE key. Without
|
|
1581
|
+
# this, two concurrent index-internal-batch transactions
|
|
1582
|
+
# can both decide a ChannelStat doesn't exist and create
|
|
1583
|
+
# rival nodes — the index doesn't lock, the constraint
|
|
1584
|
+
# does. The constraint also implies an index on the
|
|
1585
|
+
# full key so the MERGE locks efficiently.
|
|
1586
|
+
session.run("CREATE CONSTRAINT channelstat_unique IF NOT EXISTS FOR (s:ChannelStat) REQUIRE (s.arena, s.person_email, s.channel) IS UNIQUE")
|
|
1484
1587
|
except Exception:
|
|
1485
1588
|
pass
|
|
1486
1589
|
for n in norm:
|
|
1487
|
-
|
|
1488
|
-
|
|
1590
|
+
heuristic_entities = _extract_entities_for_kg(n["content"])
|
|
1591
|
+
meta = n.get("metadata") or {}
|
|
1592
|
+
contact_email = meta.get("contact_email")
|
|
1593
|
+
contact_name = meta.get("contact_name")
|
|
1594
|
+
channel = meta.get("channel")
|
|
1595
|
+
direction = meta.get("direction")
|
|
1596
|
+
occurred_at = meta.get("timestamp") or meta.get("occurred_at") or now_iso
|
|
1597
|
+
# Skip the chunk only when there is genuinely nothing to
|
|
1598
|
+
# connect — heuristic entities AND no person metadata.
|
|
1599
|
+
if not heuristic_entities and not contact_email and not contact_name:
|
|
1489
1600
|
continue
|
|
1490
|
-
# Create the chunk node
|
|
1601
|
+
# Create the chunk node — arena property is the
|
|
1602
|
+
# tenant-isolation anchor. Every read traverses through
|
|
1603
|
+
# this node, so getting the arena right here is the
|
|
1604
|
+
# single most important invariant of this whole block.
|
|
1491
1605
|
session.run(
|
|
1492
1606
|
"""
|
|
1493
1607
|
MERGE (c:Chunk {id: $cid})
|
|
@@ -1501,38 +1615,150 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
|
|
|
1501
1615
|
arena=arena, now=now_iso,
|
|
1502
1616
|
)
|
|
1503
1617
|
l3_chunks += 1
|
|
1504
|
-
|
|
1505
|
-
|
|
1618
|
+
|
|
1619
|
+
# Concept entities — heuristic, arena-scoped.
|
|
1620
|
+
for ent in heuristic_entities:
|
|
1506
1621
|
session.run(
|
|
1507
1622
|
"""
|
|
1508
|
-
MERGE (e:Entity {name: $name})
|
|
1623
|
+
MERGE (e:Entity:Concept {arena: $arena, name: $name})
|
|
1509
1624
|
ON CREATE SET e.type = 'Concept',
|
|
1510
1625
|
e.created_at = $now,
|
|
1511
1626
|
e.weight = 1.0
|
|
1512
1627
|
WITH e
|
|
1513
|
-
MATCH (c:Chunk {id: $cid})
|
|
1628
|
+
MATCH (c:Chunk {arena: $arena, id: $cid})
|
|
1514
1629
|
MERGE (e)-[r:MENTIONS]->(c)
|
|
1515
1630
|
ON CREATE SET r.weight = 1.0, r.created_at = $now
|
|
1516
1631
|
ON MATCH SET r.weight = coalesce(r.weight, 1.0) + 0.1
|
|
1517
1632
|
""",
|
|
1518
|
-
name=ent, cid=n["id"], now=now_iso,
|
|
1633
|
+
arena=arena, name=ent, cid=n["id"], now=now_iso,
|
|
1519
1634
|
)
|
|
1520
1635
|
l3_entities += 1
|
|
1521
|
-
#
|
|
1522
|
-
# so
|
|
1523
|
-
if
|
|
1524
|
-
|
|
1525
|
-
|
|
1636
|
+
# Concept-concept co-occurrence — same arena on both
|
|
1637
|
+
# ends so cross-tenant CO_OCCURS edges can't form even
|
|
1638
|
+
# if two tenants happen to extract the same concept name.
|
|
1639
|
+
if len(heuristic_entities) >= 2:
|
|
1640
|
+
for i in range(len(heuristic_entities)):
|
|
1641
|
+
for j in range(i + 1, len(heuristic_entities)):
|
|
1526
1642
|
session.run(
|
|
1527
1643
|
"""
|
|
1528
|
-
MATCH (a:Entity {name: $a})
|
|
1529
|
-
MATCH (b:Entity {name: $b})
|
|
1644
|
+
MATCH (a:Entity:Concept {arena: $arena, name: $a})
|
|
1645
|
+
MATCH (b:Entity:Concept {arena: $arena, name: $b})
|
|
1530
1646
|
MERGE (a)-[r:CO_OCCURS]->(b)
|
|
1531
1647
|
ON CREATE SET r.weight = 0.5, r.created_at = $now
|
|
1532
1648
|
ON MATCH SET r.weight = coalesce(r.weight, 0.5) + 0.05
|
|
1533
1649
|
""",
|
|
1534
|
-
|
|
1650
|
+
arena=arena, a=heuristic_entities[i],
|
|
1651
|
+
b=heuristic_entities[j], now=now_iso,
|
|
1535
1652
|
)
|
|
1653
|
+
|
|
1654
|
+
# Person entities — typed via writer-supplied metadata.
|
|
1655
|
+
# Email gets its own node (canonical id for a person);
|
|
1656
|
+
# name gets its own node (display surface). When both
|
|
1657
|
+
# are present they're linked via KNOWN_AS so a query
|
|
1658
|
+
# against either resolves the same person.
|
|
1659
|
+
person_email_node = None
|
|
1660
|
+
if isinstance(contact_email, str) and contact_email.strip():
|
|
1661
|
+
norm_email = contact_email.strip().lower()
|
|
1662
|
+
# Two-phase write: MERGE the Person + COMMUNICATED
|
|
1663
|
+
# edge, then update the ChannelStat aggregate IFF
|
|
1664
|
+
# the edge was just created. The `r._counted` flag
|
|
1665
|
+
# is the idempotency rail — set false on CREATE and
|
|
1666
|
+
# flipped to true after the stat update, so replays
|
|
1667
|
+
# of the same eventId never double-count even when
|
|
1668
|
+
# the chunk already exists.
|
|
1669
|
+
session.run(
|
|
1670
|
+
"""
|
|
1671
|
+
MERGE (p:Entity:Person {arena: $arena, email: $email})
|
|
1672
|
+
ON CREATE SET p.created_at = $now,
|
|
1673
|
+
p.first_seen = $occurred_at,
|
|
1674
|
+
p.last_seen = $occurred_at
|
|
1675
|
+
ON MATCH SET p.last_seen = CASE
|
|
1676
|
+
WHEN $occurred_at > coalesce(p.last_seen, '')
|
|
1677
|
+
THEN $occurred_at
|
|
1678
|
+
ELSE p.last_seen END
|
|
1679
|
+
WITH p
|
|
1680
|
+
MATCH (c:Chunk {arena: $arena, id: $cid})
|
|
1681
|
+
MERGE (p)-[r:COMMUNICATED]->(c)
|
|
1682
|
+
ON CREATE SET r.channel = $channel,
|
|
1683
|
+
r.direction = $direction,
|
|
1684
|
+
r.occurred_at = $occurred_at,
|
|
1685
|
+
r.weight = 1.0,
|
|
1686
|
+
r._counted = false
|
|
1687
|
+
WITH p, r
|
|
1688
|
+
// ChannelStat denormalises Person-COMMUNICATED
|
|
1689
|
+
// edge counts so /aggregate becomes a property
|
|
1690
|
+
// read instead of a per-query Cypher walk over
|
|
1691
|
+
// every edge. Read path falls back to the edge
|
|
1692
|
+
// walk for older tenants whose stats haven't
|
|
1693
|
+
// been backfilled, so this is a forward-only
|
|
1694
|
+
// optimisation — no migration needed for stats
|
|
1695
|
+
// to start materialising.
|
|
1696
|
+
FOREACH (_ IN CASE WHEN r._counted = false THEN [1] ELSE [] END |
|
|
1697
|
+
MERGE (s:ChannelStat {arena: $arena, person_email: $email, channel: $channel})
|
|
1698
|
+
ON CREATE SET s.count = 0,
|
|
1699
|
+
s.inbound = 0,
|
|
1700
|
+
s.outbound = 0,
|
|
1701
|
+
s.first_seen = $occurred_at,
|
|
1702
|
+
s.last_seen = $occurred_at,
|
|
1703
|
+
s.created_at = $now
|
|
1704
|
+
SET s.count = s.count + 1,
|
|
1705
|
+
s.inbound = s.inbound + (CASE WHEN $direction = 'inbound' THEN 1 ELSE 0 END),
|
|
1706
|
+
s.outbound = s.outbound + (CASE WHEN $direction = 'outbound' THEN 1 ELSE 0 END),
|
|
1707
|
+
s.first_seen = CASE
|
|
1708
|
+
WHEN $occurred_at < coalesce(s.first_seen, $occurred_at)
|
|
1709
|
+
THEN $occurred_at
|
|
1710
|
+
ELSE s.first_seen END,
|
|
1711
|
+
s.last_seen = CASE
|
|
1712
|
+
WHEN $occurred_at > coalesce(s.last_seen, '')
|
|
1713
|
+
THEN $occurred_at
|
|
1714
|
+
ELSE s.last_seen END,
|
|
1715
|
+
s.updated_at = $now
|
|
1716
|
+
MERGE (p)-[:HAS_STAT]->(s)
|
|
1717
|
+
SET r._counted = true
|
|
1718
|
+
)
|
|
1719
|
+
""",
|
|
1720
|
+
arena=arena, email=norm_email, cid=n["id"],
|
|
1721
|
+
channel=channel, direction=direction,
|
|
1722
|
+
occurred_at=occurred_at, now=now_iso,
|
|
1723
|
+
)
|
|
1724
|
+
person_email_node = norm_email
|
|
1725
|
+
l3_entities += 1
|
|
1726
|
+
if isinstance(contact_name, str) and contact_name.strip():
|
|
1727
|
+
cname = contact_name.strip()
|
|
1728
|
+
session.run(
|
|
1729
|
+
"""
|
|
1730
|
+
MERGE (p:Entity:Person {arena: $arena, name: $name})
|
|
1731
|
+
ON CREATE SET p.created_at = $now,
|
|
1732
|
+
p.first_seen = $occurred_at,
|
|
1733
|
+
p.last_seen = $occurred_at
|
|
1734
|
+
ON MATCH SET p.last_seen = CASE
|
|
1735
|
+
WHEN $occurred_at > coalesce(p.last_seen, '')
|
|
1736
|
+
THEN $occurred_at
|
|
1737
|
+
ELSE p.last_seen END
|
|
1738
|
+
WITH p
|
|
1739
|
+
MATCH (c:Chunk {arena: $arena, id: $cid})
|
|
1740
|
+
MERGE (p)-[r:COMMUNICATED]->(c)
|
|
1741
|
+
ON CREATE SET r.channel = $channel,
|
|
1742
|
+
r.direction = $direction,
|
|
1743
|
+
r.occurred_at = $occurred_at,
|
|
1744
|
+
r.weight = 1.0
|
|
1745
|
+
""",
|
|
1746
|
+
arena=arena, name=cname, cid=n["id"],
|
|
1747
|
+
channel=channel, direction=direction,
|
|
1748
|
+
occurred_at=occurred_at, now=now_iso,
|
|
1749
|
+
)
|
|
1750
|
+
l3_entities += 1
|
|
1751
|
+
# Link name→email node so the relationships query
|
|
1752
|
+
# can resolve either alias to the same person.
|
|
1753
|
+
if person_email_node:
|
|
1754
|
+
session.run(
|
|
1755
|
+
"""
|
|
1756
|
+
MATCH (n:Person {arena: $arena, name: $name})
|
|
1757
|
+
MATCH (e:Person {arena: $arena, email: $email})
|
|
1758
|
+
MERGE (n)-[:KNOWN_AS]->(e)
|
|
1759
|
+
""",
|
|
1760
|
+
arena=arena, name=cname, email=person_email_node,
|
|
1761
|
+
)
|
|
1536
1762
|
driver.close()
|
|
1537
1763
|
except Exception as e:
|
|
1538
1764
|
log.error(f"L3 KG write failed: {e}")
|
|
@@ -1551,16 +1777,43 @@ async def index_internal_batch(req: IndexInternalBatchRequest) -> dict:
|
|
|
1551
1777
|
|
|
1552
1778
|
@app.post("/forget-internal")
|
|
1553
1779
|
async def forget_internal(request: Request) -> dict:
|
|
1554
|
-
"""Wipe L0 + L4-qmd + L3.
|
|
1780
|
+
"""Wipe L0 + L4-qmd + L3.
|
|
1781
|
+
|
|
1782
|
+
Two modes:
|
|
1783
|
+
- Tenant-scoped (default, safe): pass `{"arena": "<tenant>"}` and
|
|
1784
|
+
only that tenant's rows are deleted. Used by tenant offboarding
|
|
1785
|
+
and by tests.
|
|
1786
|
+
- Global (unsafe): the bench harness needs to wipe everything
|
|
1787
|
+
between runs. Require an explicit `{"confirm": "GLOBAL_WIPE"}`
|
|
1788
|
+
flag — without it we refuse rather than nuke shared infra.
|
|
1789
|
+
|
|
1790
|
+
Pre-fix this endpoint silently ignored the arena param and always
|
|
1791
|
+
deleted globally. That meant a tenant offboarding script — or any
|
|
1792
|
+
caller that read the param-name and trusted it — would erase every
|
|
1793
|
+
other tenant's L3 graph and wipe the shared sqlite stores. Hence
|
|
1794
|
+
the explicit confirm gate now.
|
|
1795
|
+
"""
|
|
1555
1796
|
try:
|
|
1556
1797
|
body = await request.json()
|
|
1557
1798
|
except Exception:
|
|
1558
1799
|
body = {}
|
|
1559
|
-
arena = body.get("arena")
|
|
1800
|
+
arena = body.get("arena")
|
|
1801
|
+
confirm = body.get("confirm")
|
|
1802
|
+
if not arena and confirm != "GLOBAL_WIPE":
|
|
1803
|
+
raise HTTPException(
|
|
1804
|
+
status_code=400,
|
|
1805
|
+
detail="forget-internal requires either 'arena' (tenant-scoped) "
|
|
1806
|
+
"or 'confirm: GLOBAL_WIPE' (unsafe, deletes everything).",
|
|
1807
|
+
)
|
|
1560
1808
|
deleted = {"l0": 0, "l4_qmd": 0, "l3_entities": 0, "l3_chunks": 0}
|
|
1809
|
+
|
|
1810
|
+
# ---- L0 BM25 (sqlite) ----------------------------------------------
|
|
1811
|
+
# The L0 chunks table doesn't carry an arena column today, so we
|
|
1812
|
+
# only support GLOBAL_WIPE here. Tenant-scoped L0 deletes are a
|
|
1813
|
+
# follow-up (needs schema migration to add `arena` to L0 rows).
|
|
1561
1814
|
try:
|
|
1562
1815
|
l0_db = Path(os.environ.get("PME_MEMORY_DB", str(L0_MEMORY_DB)))
|
|
1563
|
-
if l0_db.exists():
|
|
1816
|
+
if l0_db.exists() and confirm == "GLOBAL_WIPE":
|
|
1564
1817
|
conn = sqlite3.connect(str(l0_db), timeout=5)
|
|
1565
1818
|
cur = conn.execute("DELETE FROM chunks")
|
|
1566
1819
|
deleted["l0"] = cur.rowcount
|
|
@@ -1571,25 +1824,310 @@ async def forget_internal(request: Request) -> dict:
|
|
|
1571
1824
|
conn.commit(); conn.close()
|
|
1572
1825
|
except Exception as e:
|
|
1573
1826
|
log.error(f"L0 forget failed: {e}")
|
|
1827
|
+
|
|
1828
|
+
# ---- L4 sqlite-vec --------------------------------------------------
|
|
1829
|
+
# Same situation as L0 — no per-arena column on chunks. Global only
|
|
1830
|
+
# for now; tenant-scoped delete is a follow-up.
|
|
1574
1831
|
try:
|
|
1575
|
-
if Path(QMD_DB_PATH).exists():
|
|
1832
|
+
if Path(QMD_DB_PATH).exists() and confirm == "GLOBAL_WIPE":
|
|
1576
1833
|
conn = sqlite3.connect(QMD_DB_PATH, timeout=5)
|
|
1577
1834
|
cur = conn.execute("DELETE FROM chunks")
|
|
1578
1835
|
deleted["l4_qmd"] = cur.rowcount
|
|
1579
1836
|
conn.commit(); conn.close()
|
|
1580
1837
|
except Exception as e:
|
|
1581
1838
|
log.error(f"L4 QMD forget failed: {e}")
|
|
1839
|
+
|
|
1840
|
+
# ---- L3 Neo4j -------------------------------------------------------
|
|
1841
|
+
# Neo4j chunks AND entities both carry arena now, so tenant-scoped
|
|
1842
|
+
# delete works correctly here even if L0/L4 still need a migration.
|
|
1582
1843
|
try:
|
|
1583
1844
|
driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
|
|
1584
1845
|
with driver.session() as session:
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
|
|
1588
|
-
|
|
1846
|
+
if arena:
|
|
1847
|
+
r1 = session.run(
|
|
1848
|
+
"MATCH (c:Chunk {arena: $arena}) DETACH DELETE c RETURN count(c) AS n",
|
|
1849
|
+
arena=arena,
|
|
1850
|
+
)
|
|
1851
|
+
deleted["l3_chunks"] = r1.single()["n"]
|
|
1852
|
+
r2 = session.run(
|
|
1853
|
+
"MATCH (e:Entity {arena: $arena}) DETACH DELETE e RETURN count(e) AS n",
|
|
1854
|
+
arena=arena,
|
|
1855
|
+
)
|
|
1856
|
+
deleted["l3_entities"] = r2.single()["n"]
|
|
1857
|
+
else: # confirm == "GLOBAL_WIPE", validated above
|
|
1858
|
+
r1 = session.run("MATCH (c:Chunk) DETACH DELETE c RETURN count(c) AS n")
|
|
1859
|
+
deleted["l3_chunks"] = r1.single()["n"]
|
|
1860
|
+
r2 = session.run("MATCH (e:Entity) DETACH DELETE e RETURN count(e) AS n")
|
|
1861
|
+
deleted["l3_entities"] = r2.single()["n"]
|
|
1589
1862
|
driver.close()
|
|
1590
1863
|
except Exception as e:
|
|
1591
1864
|
log.error(f"L3 forget failed: {e}")
|
|
1592
|
-
return {"status": "ok", "deleted": deleted, "arena": arena}
|
|
1865
|
+
return {"status": "ok", "deleted": deleted, "arena": arena, "global_wipe": confirm == "GLOBAL_WIPE"}
|
|
1866
|
+
|
|
1867
|
+
|
|
1868
|
+
class AggregateInternalRequest(BaseModel):
    """Aggregate (:Person)-[:COMMUNICATED]->(:Chunk) edges by group_by keys.

    The relationships UI pre-#28 went through a metadata-filtered
    /search and grouped client-side, capped at the engine over-fetch
    ceiling. With typed-Person nodes in L3 we can run a single Cypher
    aggregate that scales to any volume — no over-fetch, no cap.

    Required: arena (the tenant scope) plus enough metadata to identify
    the Person node we're rolling up. Today that means contact_email
    (the canonical Person key), but the shape leaves room for future
    Person identifiers (e.g. slack_user_id, hubspot_contact_id) without
    a wire change.
    """

    # Mandatory tenant scope; the endpoint rejects blank/whitespace values
    # with a 400 before touching the graph.
    arena: str
    # Person identifiers: at least one of these must be supplied (the
    # endpoint 400s otherwise). Email is lower-cased server-side; name is
    # only stripped.
    contact_email: Optional[str] = None
    contact_name: Optional[str] = None
    # Group by these properties on the COMMUNICATED edge. Only the
    # relationship-page-supported keys are honoured; unknown keys are
    # silently dropped (no useful aggregate shape for them).
    group_by: List[str] = ["channel"]
|
|
1890
|
+
|
|
1891
|
+
|
|
1892
|
+
class AggregateBucket(BaseModel):
    """One aggregate row: a group_by key combination and its totals."""

    # Maps each requested group_by key to its value for this bucket;
    # empty dict when the request had no group_by (single global bucket).
    keys: Dict[str, Optional[str]]
    # Total COMMUNICATED edges in this bucket.
    count: int
    # Split of `count` by the edge's direction property.
    inbound: int
    outbound: int
    # Time bounds (stringified graph timestamps); None when the edges
    # carry no occurred_at / the stat node has no value.
    last_seen: Optional[str] = None
    first_seen: Optional[str] = None
|
|
1899
|
+
|
|
1900
|
+
|
|
1901
|
+
class AggregateInternalResponse(BaseModel):
    """Response envelope for /aggregate-internal."""

    # Echo of the request's (stripped) arena.
    arena: str
    # Sum of bucket counts across the whole response.
    total: int
    # Most recent last_seen across all buckets; None when no bucket has one.
    last_seen: Optional[str] = None
    buckets: List[AggregateBucket]
|
|
1906
|
+
|
|
1907
|
+
|
|
1908
|
+
# Whitelist of group_by keys we know how to project. Cypher
|
|
1909
|
+
# parameter-substitution doesn't work on property names, so we
|
|
1910
|
+
# template the keys into the query — this whitelist is the safety
|
|
1911
|
+
# rail that keeps the templating from accepting arbitrary input.
|
|
1912
|
+
_AGGREGATE_GROUP_BY_KEYS = {"channel", "direction"}
|
|
1913
|
+
|
|
1914
|
+
|
|
1915
|
+
@app.post("/aggregate-internal", response_model=AggregateInternalResponse)
async def aggregate_internal(req: AggregateInternalRequest) -> AggregateInternalResponse:
    """Aggregate Person→Chunk COMMUNICATED edges by edge properties.

    Returns one bucket per (group_by key combination) with count,
    inbound/outbound split, and time bounds. The Person match is
    arena-scoped (mandatory) and additionally filtered by whatever
    Person identifier the caller supplies.

    No fallback to chunk scanning — if the typed-Person nodes don't
    exist for this contact, the response is `total: 0` with no
    buckets, and the caller falls back to whatever it had before.
    That's intentional: the over-fetch path is in TES (#273); this
    endpoint is the scaling answer that doesn't have one.

    Raises:
        HTTPException 400: arena blank, or neither contact identifier given.
        HTTPException 500: Neo4j driver construction or query failure.
    """
    # Validate tenant scope first — nothing below runs unscoped.
    arena = (req.arena or "").strip()
    if not arena:
        raise HTTPException(status_code=400, detail="arena is required")
    # Email is the canonical Person key, so normalise to lowercase;
    # name is matched as-is after stripping.
    contact_email = (req.contact_email or "").strip().lower()
    contact_name = (req.contact_name or "").strip()
    if not contact_email and not contact_name:
        raise HTTPException(
            status_code=400,
            detail="provide contact_email and/or contact_name to identify the Person",
        )

    # Filter group_by to the supported keys; preserve order so a caller
    # asking for ["direction", "channel"] gets buckets keyed in that
    # order on the response.
    seen: set[str] = set()
    safe_group_by: List[str] = []
    for k in req.group_by or []:
        if k in _AGGREGATE_GROUP_BY_KEYS and k not in seen:
            seen.add(k)
            safe_group_by.append(k)

    try:
        driver = GraphDatabase.driver(NEO4J_URI, auth=NEO4J_AUTH)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"neo4j connect: {e}")

    try:
        with driver.session() as session:
            # Fast path: read from the ChannelStat denormalisation
            # whenever the caller has an email and is grouping by
            # channel. ChannelStats are written by /index-internal-batch
            # on every store with contact_email metadata, so any tenant
            # with new ingest gets O(channels) reads instead of an
            # edge walk over every COMMUNICATED relationship.
            #
            # Conditions for the fast path:
            #   - contact_email set (stats are email-keyed; name-only
            #     contacts fall through to the edge walk).
            #   - group_by is exactly ["channel"] OR no group_by (single
            #     bucket). Other group_by combinations (e.g. with
            #     direction) need the edge granularity the stats
            #     don't carry.
            fast_path_eligible = bool(contact_email) and (
                not safe_group_by or safe_group_by == ["channel"]
            )
            if fast_path_eligible:
                stats_rows = list(session.run(
                    "MATCH (s:ChannelStat {arena: $arena, person_email: $email})\n"
                    "RETURN s.channel AS channel,\n"
                    " s.count AS count,\n"
                    " s.inbound AS inbound,\n"
                    " s.outbound AS outbound,\n"
                    " s.last_seen AS last_seen,\n"
                    " s.first_seen AS first_seen\n"
                    "ORDER BY s.count DESC\n",
                    arena=arena, email=contact_email,
                ))
                if stats_rows:
                    # Build buckets directly. When group_by=[] we
                    # collapse to a single overall bucket; otherwise
                    # one bucket per channel.
                    if safe_group_by == ["channel"]:
                        buckets = [
                            AggregateBucket(
                                keys={"channel": rec["channel"]},
                                count=int(rec["count"] or 0),
                                inbound=int(rec["inbound"] or 0),
                                outbound=int(rec["outbound"] or 0),
                                last_seen=str(rec["last_seen"]) if rec["last_seen"] else None,
                                first_seen=str(rec["first_seen"]) if rec["first_seen"] else None,
                            )
                            for rec in stats_rows
                        ]
                        total = sum(b.count for b in buckets)
                        # Response-level last_seen = max across buckets
                        # (string comparison on stringified timestamps).
                        latest = None
                        for b in buckets:
                            if b.last_seen and (latest is None or b.last_seen > latest):
                                latest = b.last_seen
                    else:
                        # Single global bucket — sum across channels.
                        total = sum(int(rec["count"] or 0) for rec in stats_rows)
                        inbound = sum(int(rec["inbound"] or 0) for rec in stats_rows)
                        outbound = sum(int(rec["outbound"] or 0) for rec in stats_rows)
                        last_seens = [rec["last_seen"] for rec in stats_rows if rec["last_seen"]]
                        first_seens = [rec["first_seen"] for rec in stats_rows if rec["first_seen"]]
                        latest = max((str(x) for x in last_seens), default=None)
                        earliest = min((str(x) for x in first_seens), default=None)
                        buckets = [AggregateBucket(
                            keys={},
                            count=total,
                            inbound=inbound,
                            outbound=outbound,
                            last_seen=latest,
                            first_seen=earliest,
                        )]
                    return AggregateInternalResponse(
                        arena=arena,
                        total=total,
                        last_seen=latest,
                        buckets=buckets,
                    )
                # else: stats absent (older tenant pre-rollup, or this
                # contact has no email-keyed Person yet) → fall through
                # to the edge-walk path.

            # Edge-walk path (original Cypher). Used when:
            #   - caller has only contact_name (no email-keyed stats)
            #   - caller asked for a group_by we don't denormalise (e.g.
            #     direction)
            #   - tenant predates the rollup writer (no stats nodes yet)
            # Both paths return the same response shape, so callers
            # don't need to know which served them.
            #
            # Build the Person match. We want either email-keyed or
            # name-keyed Person nodes; when both are supplied we OR
            # them so a caller can hit either alias. Both branches
            # arena-scope the Person.
            person_clauses: List[str] = []
            params: Dict[str, Any] = {"arena": arena}
            if contact_email:
                person_clauses.append("(p.email = $contact_email)")
                params["contact_email"] = contact_email
            if contact_name:
                person_clauses.append("(p.name = $contact_name)")
                params["contact_name"] = contact_name
            person_filter = " OR ".join(person_clauses)

            # group_by keys go into the WITH clause. Cypher doesn't
            # support property-name parameters, so we template them
            # in — the whitelist above is the safety rail against
            # injection. Built up separately rather than via f-string
            # so the static MATCH clause stays a plain string and the
            # arena-safety lint can parse it cleanly.
            with_keys = ", ".join(f"r.{k} AS {k}" for k in safe_group_by)
            return_keys = ", ".join(safe_group_by)

            # Static base — arena scope on both Person and Chunk so the
            # lint catches any future copy-paste that forgets it.
            base = (
                "MATCH (p:Person {arena: $arena})-[r:COMMUNICATED]->(c:Chunk {arena: $arena})\n"
                "WHERE " + person_filter + "\n"
            )
            agg_select = (
                "count(*) AS count,\n"
                "sum(CASE WHEN _direction = 'inbound' THEN 1 ELSE 0 END) AS inbound,\n"
                "sum(CASE WHEN _direction = 'outbound' THEN 1 ELSE 0 END) AS outbound,\n"
                "max(_occurred_at) AS last_seen,\n"
                "min(_occurred_at) AS first_seen\n"
            )

            if safe_group_by:
                cypher = (
                    base
                    + f"WITH {with_keys}, r.direction AS _direction, r.occurred_at AS _occurred_at\n"
                    + f"RETURN {return_keys},\n"
                    + agg_select
                    + "ORDER BY count DESC\n"
                )
            else:
                # No group_by → one global bucket (just the overall
                # totals for this Person). Useful for "total comms
                # with X" without per-channel breakdown.
                # NOTE(review): a Cypher aggregate without grouping keys
                # returns one row even with zero matches, so this branch
                # presumably yields a single count=0 bucket (not "no
                # buckets" as the docstring says) — confirm against a
                # live graph.
                cypher = (
                    base
                    + "WITH r.direction AS _direction, r.occurred_at AS _occurred_at\n"
                    + "RETURN " + agg_select
                )

            buckets: List[AggregateBucket] = []
            total = 0
            latest: Optional[str] = None
            for rec in session.run(cypher, **params):
                count = int(rec["count"] or 0)
                total += count
                last_seen = rec["last_seen"]
                if last_seen and (latest is None or str(last_seen) > latest):
                    latest = str(last_seen)
                bucket_keys: Dict[str, Optional[str]] = (
                    {k: rec[k] for k in safe_group_by} if safe_group_by else {}
                )
                buckets.append(AggregateBucket(
                    keys=bucket_keys,
                    count=count,
                    inbound=int(rec["inbound"] or 0),
                    outbound=int(rec["outbound"] or 0),
                    last_seen=str(last_seen) if last_seen else None,
                    first_seen=str(rec["first_seen"]) if rec["first_seen"] else None,
                ))
            return AggregateInternalResponse(
                arena=arena,
                total=total,
                last_seen=latest,
                buckets=buckets,
            )
    except HTTPException:
        # Re-raise our own 4xx/5xx untouched so the generic handler
        # below doesn't wrap them in a 500.
        raise
    except Exception as e:
        log.error(f"aggregate-internal failed: {e}")
        raise HTTPException(status_code=500, detail=f"aggregate failed: {e}")
    finally:
        # Driver is constructed per-request above; always release it.
        driver.close()
|
|
1593
2131
|
|
|
1594
2132
|
|
|
1595
2133
|
@app.get("/index-internal-stats")
|