@pentatonic-ai/ai-agent-sdk 0.8.0 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/packages/memory/openclaw-plugin/index.js +7 -0
- package/packages/memory/openclaw-plugin/openclaw.plugin.json +9 -1
- package/packages/memory/openclaw-plugin/package.json +1 -1
- package/packages/memory/src/__tests__/engine.test.js +142 -0
- package/packages/memory/src/engine.js +65 -0
- package/packages/memory-engine/compat/server.py +98 -7
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +596 -58
- package/packages/memory-engine/scripts/wipe-legacy-l3-entities.py +128 -0
- package/packages/memory-engine/tests/e2e_arena.sh +28 -4
- package/packages/memory-engine/tests/test_aggregate.py +333 -0
- package/packages/memory-engine/tests/test_arena_safety.py +232 -0
- package/packages/memory-engine/tests/test_channel_stat_reader.py +437 -0
- package/packages/memory-engine/tests/test_channel_stat_rollups.py +308 -0
- package/packages/memory-engine/tests/test_compat_nv_embed_probe.py +48 -0
- package/packages/memory-engine/tests/test_l3_arena_isolation.py +412 -0
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""Static safety check: every Cypher node pattern that targets a
|
|
2
|
+
tenant-data label must scope by `arena` — not just somewhere in the
|
|
3
|
+
surrounding block, but on the same variable.
|
|
4
|
+
|
|
5
|
+
Run with:
|
|
6
|
+
cd packages/memory-engine
|
|
7
|
+
python -m pytest tests/test_arena_safety.py -v
|
|
8
|
+
|
|
9
|
+
How the check works:
|
|
10
|
+
|
|
11
|
+
1. Walk the live engine source and pull out every Cypher block (both
|
|
12
|
+
triple-quoted strings and inline ``session.run("…")`` calls).
|
|
13
|
+
|
|
14
|
+
2. For each block, find every node pattern that names one of the
|
|
15
|
+
tenant labels — patterns like ``(p:Person {...})`` or
|
|
16
|
+
``(e:Entity:Concept {...})``.
|
|
17
|
+
|
|
18
|
+
3. For each such pattern, the variable bound by that pattern (e.g.
|
|
19
|
+
``p`` / ``e``) must be tied to ``arena`` somewhere in the block:
|
|
20
|
+
either inside the pattern's own property bag (``{arena: $arena,
|
|
21
|
+
…}``) or via a WHERE clause that references ``<var>.arena``.
|
|
22
|
+
|
|
23
|
+
The earlier weaker version of this lint checked "the block contains
|
|
24
|
+
the string `arena` *somewhere*", which let a Person MERGE without
|
|
25
|
+
arena slip through if any neighbouring chunk-join in the same block
|
|
26
|
+
referenced `arena`. The bug-day repro was injecting
|
|
27
|
+
``MERGE (p:Entity:Person {email: $email})`` while the rest of the
|
|
28
|
+
block kept ``MATCH (c:Chunk {arena: $arena, …})`` — block contained
|
|
29
|
+
"arena", lint was happy, the Person node was global.
|
|
30
|
+
|
|
31
|
+
If a future change introduces a Cypher pattern on these labels without
|
|
32
|
+
arena (e.g. a debug helper that genuinely needs to span all tenants),
|
|
33
|
+
allow-list it via ``_ALLOWED_NO_ARENA_REASONS`` with a justification.
|
|
34
|
+
"""
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
import re
|
|
38
|
+
from pathlib import Path
|
|
39
|
+
|
|
40
|
+
import pytest
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Package root (this test lives in tests/, so two levels up).
REPO_ROOT = Path(__file__).resolve().parent.parent
# The two engine entry points this lint scans.
ENGINE_LIVE = REPO_ROOT / "engine" / "services" / "l2" / "l2-hybridrag-proxy.py"
COMPAT_SHIM = REPO_ROOT / "compat" / "server.py"

# Labels that carry tenant data. Any Cypher pattern naming these MUST
# bind the variable to `arena` — either as a property in the pattern
# itself or via a WHERE clause on the same variable.
TENANT_LABELS = ("Entity", "Person", "Concept", "Channel", "Chunk", "ChannelStat")
_LABEL_ALT = "|".join(TENANT_LABELS)

# Triple-quoted strings. The body group tolerates lone `"` characters
# as long as they don't start a closing `"""`.
_TRIPLE_STRING = re.compile(
    r'"""(?P<body>[^"]*?(?:"(?!"")[^"]*?)*?)"""',
    re.DOTALL,
)

# Inline `session.run("…")` calls that aren't already in a triple-quote.
_SINGLELINE_RUN = re.compile(
    r'session\.run\(\s*"((?:[^"\\]|\\.)+)"',
    re.MULTILINE,
)

# Anything that smells like Cypher inside a string literal.
_OP_PATTERN = re.compile(r"\b(MERGE|MATCH|DETACH\s+DELETE)\b", re.IGNORECASE)

# Node pattern: (var:Label1:Label2 {props}) or (var:Label)
# The var is optional in Cypher, but anonymous patterns can't carry a
# WHERE clause anyway — flag them as unsafe unless the inline property
# bag scopes by arena.
_NODE_PATTERN = re.compile(
    r"""
    \(
    \s*(?P<var>[A-Za-z_][A-Za-z0-9_]*)?               # optional variable
    \s*(?P<labels>(?::(?:""" + _LABEL_ALT + r"""))+)\b  # one+ tenant labels
    \s*(?P<props>\{[^{}]*\})?                          # optional property bag
    """,
    re.VERBOSE,
)

# Allow-list: cross-tenant Cypher that we deliberately want to keep.
# Map a unique substring of the offending pattern to a justification.
_ALLOWED_NO_ARENA_REASONS: dict[str, str] = {
    # /index-internal-stats — global ops counters that return ints.
    "MATCH (c:Chunk) RETURN count(c) AS n":
        "ops counter — returns a single int, no tenant data exposed",
    "MATCH (e:Entity) RETURN count(e) AS n":
        "ops counter — returns a single int, no tenant data exposed",
    # /forget-internal global-wipe path — gated by confirm: GLOBAL_WIPE.
    "MATCH (c:Chunk) DETACH DELETE c RETURN count(c) AS n":
        "global-wipe, gated by explicit confirm: GLOBAL_WIPE",
    "MATCH (e:Entity) DETACH DELETE e RETURN count(e) AS n":
        "global-wipe, gated by explicit confirm: GLOBAL_WIPE",
    # Migration target — pre-arena legacy entities have no arena.
    "MATCH (e:Entity) WHERE e.arena IS NULL DETACH DELETE e":
        "legacy-wipe migration target (entities pre-arena scoping)",
}


# Cypher line comments. Strip these from extracted blocks before
# running the tenant-label scan so that prose mentions of pattern
# syntax (e.g. "Person-COMMUNICATED edges") inside `// …` comments
# don't get parsed as real node patterns.
_CYPHER_LINE_COMMENT = re.compile(r"//[^\n]*")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _strip_cypher_comments(block: str) -> str:
|
|
109
|
+
return _CYPHER_LINE_COMMENT.sub("", block)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _extract_cypher_blocks(source: str) -> list[tuple[int, str]]:
    """Return ``[(approx_line_no, body)]`` for every Cypher block in *source*.

    Scans triple-quoted string literals first, then inline
    ``session.run("…")`` literals; only strings containing a Cypher
    operation keyword (MERGE/MATCH/DETACH DELETE) are kept.
    """
    found: list[tuple[int, str]] = []
    for match in _TRIPLE_STRING.finditer(source):
        text = match.group("body")
        if not _OP_PATTERN.search(text):
            continue
        # Approximate line number of the opening quote.
        start_line = 1 + source.count("\n", 0, match.start())
        # Strip Cypher // comments so prose patterns inside comments
        # don't get scanned as actual queries.
        found.append((start_line, _strip_cypher_comments(text)))
    # session.run("…") matches are skipped if they fell inside a triple
    # string already covered above. Cheap dedup: if the body of a
    # singleline match is a substring of any triple body, skip it.
    triple_texts = [text for _, text in found]
    for match in _SINGLELINE_RUN.finditer(source):
        text = match.group(1)
        if not _OP_PATTERN.search(text):
            continue
        if any(text in covered for covered in triple_texts):
            continue
        start_line = 1 + source.count("\n", 0, match.start())
        found.append((start_line, text))
    return found
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _is_allowed(block: str) -> str | None:
    """Return the allow-list justification for *block*, or None if not listed."""
    hits = (
        reason
        for fragment, reason in _ALLOWED_NO_ARENA_REASONS.items()
        if fragment in block
    )
    return next(hits, None)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _pattern_scopes_arena(block: str, var: str | None, props: str | None) -> bool:
|
|
145
|
+
"""True if this specific pattern is arena-scoped.
|
|
146
|
+
|
|
147
|
+
A pattern is arena-scoped when EITHER:
|
|
148
|
+
- The inline property bag contains `arena:`, OR
|
|
149
|
+
- A `WHERE` clause in the surrounding block references
|
|
150
|
+
`<var>.arena`.
|
|
151
|
+
"""
|
|
152
|
+
if props and re.search(r"\barena\s*:", props):
|
|
153
|
+
return True
|
|
154
|
+
if var is None:
|
|
155
|
+
# Anonymous pattern with no property bag — there's no way to
|
|
156
|
+
# scope it via WHERE since there's no var to reference.
|
|
157
|
+
return False
|
|
158
|
+
# Look for `<var>.arena` anywhere in the block. Crude but the
|
|
159
|
+
# variable name is unambiguous within a single Cypher block.
|
|
160
|
+
if re.search(rf"\b{re.escape(var)}\.arena\b", block):
|
|
161
|
+
return True
|
|
162
|
+
return False
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
@pytest.mark.parametrize(
    "source_path",
    [
        pytest.param(ENGINE_LIVE, id="l2-hybridrag-proxy"),
        pytest.param(COMPAT_SHIM, id="compat-shim"),
    ],
)
def test_every_tenant_pattern_is_arena_scoped(source_path: Path) -> None:
    """Each tenant-label node pattern is scoped by arena."""
    if not source_path.exists():
        pytest.skip(f"{source_path} not present")

    offenders: list[str] = []
    for line_no, block in _extract_cypher_blocks(source_path.read_text()):
        # Honour block-level allow-list before per-pattern checks; that
        # way an entire global-wipe block can be allow-listed once.
        if _is_allowed(block) is not None:
            continue
        for match in _NODE_PATTERN.finditer(block):
            scoped = _pattern_scopes_arena(
                block, match.group("var"), match.group("props")
            )
            if scoped:
                continue
            offenders.append(
                f"{source_path.name}:~{line_no} pattern `{match.group(0).strip()}` "
                f"in block:\n{block.strip()[:240]}"
            )

    assert not offenders, (
        f"{len(offenders)} tenant-labelled Cypher pattern(s) miss arena scoping:\n\n"
        + "\n\n---\n\n".join(offenders)
        + "\n\nAdd `arena` to the pattern (e.g. `{arena: $arena, …}`) or to a "
        "WHERE clause on the same variable. If the pattern genuinely needs "
        "to span tenants, add an entry to _ALLOWED_NO_ARENA_REASONS with a "
        "justification."
    )
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# A self-test: the lint should fail when given a block that's clearly
|
|
204
|
+
# unscoped. This guards against future refactors of the lint silently
|
|
205
|
+
# turning into a no-op.
|
|
206
|
+
def test_lint_self_test_catches_obvious_bug() -> None:
    """Inject an unscoped pattern into a fake source and assert lint flags it."""
    bad_source = '''
def writer():
    session.run("""
        MERGE (p:Entity:Person {email: $email})
        ON CREATE SET p.created_at = $now
        MATCH (c:Chunk {arena: $arena, id: $cid})
        MERGE (p)-[:MENTIONS]->(c)
    """, email="x", arena="acme", cid="1", now="t")
'''
    extracted = _extract_cypher_blocks(bad_source)
    assert extracted, "lint helper failed to extract the test block"
    body = extracted[0][1]
    assert _is_allowed(body) is None
    flagged = [
        m.group(0)
        for m in _NODE_PATTERN.finditer(body)
        if not _pattern_scopes_arena(body, m.group("var"), m.group("props"))
    ]
    # The Person MERGE has no arena anywhere on `p` — must be flagged.
    assert any("Person" in hit for hit in flagged), \
        "self-test: unscoped Person pattern should have been flagged"
    # The Chunk MATCH has arena in the property bag — must NOT be flagged.
    assert not any("Chunk" in hit for hit in flagged), \
        "self-test: arena-scoped Chunk pattern should not have been flagged"
|
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
"""Integration tests for the /aggregate-internal reader fast path.
|
|
2
|
+
|
|
3
|
+
Sister file to ``test_channel_stat_rollups.py``: where that file
|
|
4
|
+
covers the *writer* Cypher (ChannelStat nodes are maintained on every
|
|
5
|
+
store), this file covers the *reader* — proves that
|
|
6
|
+
``aggregate_internal`` actually reads from the denormalisation when
|
|
7
|
+
the conditions are right, falls through to the edge walk when they
|
|
8
|
+
aren't, and produces the same response shape either way.
|
|
9
|
+
|
|
10
|
+
The PR review for #33 flagged that the writer was thoroughly tested
|
|
11
|
+
but the reader's fast path only had indirect coverage via the
|
|
12
|
+
endpoint integration test in ``test_aggregate.py``. This file closes
|
|
13
|
+
that gap by importing the proxy module and calling ``aggregate_internal``
|
|
14
|
+
directly, so a regression in the fast-path branch can't be masked by
|
|
15
|
+
the silent fall-through to the edge walk.
|
|
16
|
+
|
|
17
|
+
Gated on NEO4J_TEST_URI + NEO4J_TEST_PASSWORD; skip cleanly when
|
|
18
|
+
those env vars are absent so unit-only test runs stay fast.
|
|
19
|
+
|
|
20
|
+
Run:
|
|
21
|
+
|
|
22
|
+
cd packages/memory-engine
|
|
23
|
+
NEO4J_TEST_URI=bolt://localhost:17687 \\
|
|
24
|
+
NEO4J_TEST_PASSWORD=testpassword \\
|
|
25
|
+
.venv/bin/python -m pytest tests/test_channel_stat_reader.py -v
|
|
26
|
+
"""
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import asyncio
|
|
30
|
+
import importlib.util
|
|
31
|
+
import os
|
|
32
|
+
import sys
|
|
33
|
+
import uuid
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
|
|
36
|
+
import pytest
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Connection details for the integration Neo4j instance; read from the
# environment so unit-only runs never dial a database.
_NEO4J_URI = os.environ.get("NEO4J_TEST_URI")
_NEO4J_USER = os.environ.get("NEO4J_TEST_USER", "neo4j")
_NEO4J_PASSWORD = os.environ.get("NEO4J_TEST_PASSWORD")

# Applied to every integration test below: skip unless both the URI and
# the password are configured.
_skip_no_neo4j = pytest.mark.skipif(
    not (_NEO4J_URI and _NEO4J_PASSWORD),
    reason="set NEO4J_TEST_URI + NEO4J_TEST_PASSWORD to run integration tests",
)


# Directory containing the l2 proxy; put it on sys.path so the fixture
# below can load the dash-named file via importlib.
ENGINE_ROOT = Path(__file__).resolve().parent.parent / "engine" / "services" / "l2"
sys.path.insert(0, str(ENGINE_ROOT))
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@pytest.fixture(scope="module")
def proxy_module():
    """Load l2-hybridrag-proxy as a module and point it at the test
    Neo4j. The module reads NEO4J_URI/NEO4J_AUTH as module-level
    constants at import time; we override them after load so the
    request handler dials the test instance instead of the default
    localhost:7687.

    NOTE(review): this assumes the proxy does not open a driver at
    import time — if it did, the post-load override would come too
    late. Confirm against the proxy module.

    Skip cleanly if fastapi/neo4j aren't importable (matches the
    pattern in test_aggregate.py)."""
    spec = importlib.util.spec_from_file_location(
        "l2_proxy_module",
        ENGINE_ROOT / "l2-hybridrag-proxy.py",
    )
    assert spec and spec.loader
    try:
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
    except ImportError:
        pytest.skip("l2 proxy deps unavailable in this venv (fine for unit-only runs)")
    # Re-point the module-level connection constants at the test DB.
    mod.NEO4J_URI = _NEO4J_URI
    mod.NEO4J_AUTH = (_NEO4J_USER, _NEO4J_PASSWORD)
    return mod
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@pytest.fixture
def neo4j_driver():
    """Per-test driver + cleanup. Two arenas so isolation tests can
    run side by side without trampling each other.

    Yields ``(driver, [arena_a, arena_b])``. Teardown detach-deletes
    every node whose ``arena`` property matches either arena, then
    closes the driver."""
    from neo4j import GraphDatabase

    driver = GraphDatabase.driver(_NEO4J_URI, auth=(_NEO4J_USER, _NEO4J_PASSWORD))
    # Random suffixes keep parallel/repeated runs from colliding.
    arenas = [f"rdr_a_{uuid.uuid4().hex[:8]}", f"rdr_b_{uuid.uuid4().hex[:8]}"]
    yield driver, arenas
    with driver.session() as session:
        for arena in arenas:
            session.run(
                "MATCH (n) WHERE n.arena = $arena DETACH DELETE n",
                arena=arena,
            )
    driver.close()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _write_with_stats(
    session,
    arena: str,
    cid: str,
    email: str,
    channel: str,
    direction: str,
    occurred_at: str,
) -> None:
    """Mirror the writer Cypher in /index-internal-batch's email path,
    including the ChannelStat rollup. Keeping this inline rather than
    importing from test_channel_stat_rollups.py so this file can be
    read top-to-bottom without cross-file detective work.

    Args:
        session: Open neo4j session to write through.
        arena: Tenant arena for the chunk/person/stat rows.
        cid: Chunk id to MERGE.
        email: Person email (rollup key together with arena + channel).
        channel: Channel recorded on the edge and the stat row.
        direction: 'inbound' or 'outbound' — bumps the matching counter.
        occurred_at: ISO timestamp folded into first_seen/last_seen.
    """
    # Fixed "now" keeps created_at/updated_at deterministic across runs.
    now_iso = "2026-05-11T00:00:00Z"
    session.run(
        """
        MERGE (c:Chunk {id: $cid})
        SET c.text = 't', c.path = 'p', c.arena = $arena,
            c.created_at = $now
        """,
        cid=cid, arena=arena, now=now_iso,
    )
    # The r._counted flag makes the rollup idempotent: the FOREACH
    # conditional only bumps ChannelStat the first time an edge is seen.
    session.run(
        """
        MERGE (p:Entity:Person {arena: $arena, email: $email})
        ON CREATE SET p.created_at = $now
        WITH p
        MATCH (c:Chunk {arena: $arena, id: $cid})
        MERGE (p)-[r:COMMUNICATED]->(c)
        ON CREATE SET r.channel = $channel,
                      r.direction = $direction,
                      r.occurred_at = $occurred_at,
                      r.weight = 1.0,
                      r._counted = false
        WITH p, r
        FOREACH (_ IN CASE WHEN r._counted = false THEN [1] ELSE [] END |
            MERGE (s:ChannelStat {arena: $arena, person_email: $email, channel: $channel})
            ON CREATE SET s.count = 0,
                          s.inbound = 0,
                          s.outbound = 0,
                          s.first_seen = $occurred_at,
                          s.last_seen = $occurred_at,
                          s.created_at = $now
            SET s.count = s.count + 1,
                s.inbound = s.inbound + (CASE WHEN $direction = 'inbound' THEN 1 ELSE 0 END),
                s.outbound = s.outbound + (CASE WHEN $direction = 'outbound' THEN 1 ELSE 0 END),
                s.first_seen = CASE
                    WHEN $occurred_at < coalesce(s.first_seen, $occurred_at)
                    THEN $occurred_at
                    ELSE s.first_seen END,
                s.last_seen = CASE
                    WHEN $occurred_at > coalesce(s.last_seen, '')
                    THEN $occurred_at
                    ELSE s.last_seen END,
                s.updated_at = $now
            MERGE (p)-[:HAS_STAT]->(s)
            SET r._counted = true
        )
        """,
        arena=arena, email=email, cid=cid,
        channel=channel, direction=direction,
        occurred_at=occurred_at, now=now_iso,
    )
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _write_edges_only(
    session,
    arena: str,
    cid: str,
    email: str,
    channel: str,
    direction: str,
    occurred_at: str,
) -> None:
    """Write Person + COMMUNICATED + Chunk *without* a ChannelStat.
    Simulates pre-rollout data: older tenants whose ingest never went
    through the new writer. Reader fast path must fall through to the
    edge walk and still return correct results for these rows.

    Parameters match ``_write_with_stats``; only the rollup is omitted
    (no ChannelStat node and no ``r._counted`` bookkeeping flag).
    """
    # Fixed "now" keeps created_at deterministic across runs.
    now_iso = "2026-05-11T00:00:00Z"
    session.run(
        """
        MERGE (c:Chunk {id: $cid})
        SET c.text = 't', c.path = 'p', c.arena = $arena,
            c.created_at = $now
        MERGE (p:Entity:Person {arena: $arena, email: $email})
        ON CREATE SET p.created_at = $now
        MERGE (p)-[r:COMMUNICATED]->(c)
        ON CREATE SET r.channel = $channel,
                      r.direction = $direction,
                      r.occurred_at = $occurred_at,
                      r.weight = 1.0
        """,
        cid=cid, arena=arena, email=email,
        channel=channel, direction=direction,
        occurred_at=occurred_at, now=now_iso,
    )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _ensure_indexes(session) -> None:
    """Run the same index/constraint statements the writer runs at the
    top of /index-internal-batch. Tests that exercise the constraint
    behaviour need it present; the writer fixture in
    test_channel_stat_rollups.py doesn't fire this path.

    NOTE(review): multi-property uniqueness constraints are not
    supported on every Neo4j version/edition — confirm the test image
    accepts ``REQUIRE (..., ...) IS UNIQUE``.
    """
    # Lookup index used by the reader fast path (arena keyed first).
    session.run(
        "CREATE INDEX channelstat_arena_email IF NOT EXISTS "
        "FOR (s:ChannelStat) ON (s.arena, s.person_email)"
    )
    # Uniqueness over the full rollup key; MERGE relies on this to
    # serialize concurrent writers on the same (arena, email, channel).
    session.run(
        "CREATE CONSTRAINT channelstat_unique IF NOT EXISTS "
        "FOR (s:ChannelStat) REQUIRE (s.arena, s.person_email, s.channel) IS UNIQUE"
    )
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _call_aggregate(proxy_module, **kwargs):
    """Invoke ``aggregate_internal`` exactly as the FastAPI route would,
    minus HTTP: build the request model from *kwargs* and drive the
    async handler to completion. Keeps individual tests terse."""
    request = proxy_module.AggregateInternalRequest(**kwargs)
    response = asyncio.run(proxy_module.aggregate_internal(request))
    return response
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# ---------------------------------------------------------------------------
|
|
218
|
+
# UNIQUE constraint.
|
|
219
|
+
# ---------------------------------------------------------------------------
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
@_skip_no_neo4j
def test_unique_constraint_present_after_index_setup(neo4j_driver) -> None:
    """Pin the constraint exists. Without it, concurrent writers can
    create rival ChannelStat nodes for the same (arena, email,
    channel) tuple — MERGE doesn't lock without a constraint."""
    driver, _ = neo4j_driver
    with driver.session() as session:
        _ensure_indexes(session)
        # SHOW CONSTRAINTS yields one row per constraint; names suffice.
        names = {
            rec["name"]
            for rec in session.run("SHOW CONSTRAINTS YIELD name")
        }
    assert "channelstat_unique" in names
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
@_skip_no_neo4j
def test_unique_constraint_rejects_duplicate_channelstat(neo4j_driver) -> None:
    """End-to-end: with the constraint in place, an attempt to CREATE
    a second node for the same key fails. Belt-and-braces against the
    MERGE pattern ever falling back to CREATE under contention."""
    from neo4j.exceptions import ConstraintError

    driver, (arena, _) = neo4j_driver
    with driver.session() as session:
        _ensure_indexes(session)
        session.run(
            "CREATE (s:ChannelStat {arena: $arena, person_email: $email, "
            "channel: $channel, count: 1})",
            arena=arena, email="alex@x.io", channel="email",
        ).consume()
        with pytest.raises(ConstraintError):
            # .consume() forces the result round-trip inside the
            # pytest.raises block: the driver fetches results lazily,
            # so without it the server-side ConstraintError could
            # surface only after this context manager has exited and
            # the test would pass vacuously.
            session.run(
                "CREATE (s:ChannelStat {arena: $arena, person_email: $email, "
                "channel: $channel, count: 1})",
                arena=arena, email="alex@x.io", channel="email",
            ).consume()
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# ---------------------------------------------------------------------------
|
|
261
|
+
# Fast-path reader behaviour.
|
|
262
|
+
# ---------------------------------------------------------------------------
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
@_skip_no_neo4j
def test_fast_path_returns_per_channel_buckets_from_stats(
    neo4j_driver, proxy_module
) -> None:
    """When ChannelStats exist for the contact, /aggregate-internal
    should return one bucket per channel, ordered by count desc."""
    driver, (arena, _) = neo4j_driver
    email = "alex@x.io"
    with driver.session() as session:
        _ensure_indexes(session)
        # 3 emails (2 in, 1 out), 1 slack (in).
        _write_with_stats(session, arena, "c1", email, "email", "inbound", "2026-05-09T09:00:00Z")
        _write_with_stats(session, arena, "c2", email, "email", "outbound", "2026-05-09T10:00:00Z")
        _write_with_stats(session, arena, "c3", email, "email", "inbound", "2026-05-09T11:00:00Z")
        _write_with_stats(session, arena, "c4", email, "slack", "inbound", "2026-05-09T08:00:00Z")

    out = _call_aggregate(
        proxy_module,
        arena=arena, contact_email=email, group_by=["channel"],
    )
    assert out.total == 4
    # last_seen reflects the newest occurred_at across all channels.
    assert out.last_seen == "2026-05-09T11:00:00Z"
    assert len(out.buckets) == 2
    # Busiest first.
    email_bucket = out.buckets[0]
    slack_bucket = out.buckets[1]
    assert email_bucket.keys == {"channel": "email"}
    assert email_bucket.count == 3
    assert email_bucket.inbound == 2
    assert email_bucket.outbound == 1
    assert slack_bucket.keys == {"channel": "slack"}
    assert slack_bucket.count == 1
    assert slack_bucket.inbound == 1
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
@_skip_no_neo4j
def test_fast_path_returns_single_bucket_when_group_by_empty(
    neo4j_driver, proxy_module
) -> None:
    """Empty group_by collapses ChannelStat rows into one global
    bucket — totals summed across channels, inbound/outbound likewise,
    last_seen = max across all channels, first_seen = min."""
    driver, (arena, _) = neo4j_driver
    email = "alex@x.io"
    with driver.session() as session:
        _ensure_indexes(session)
        # Deliberately spread timestamps so min/max differ per channel.
        _write_with_stats(session, arena, "c1", email, "email", "inbound", "2026-05-08T09:00:00Z")
        _write_with_stats(session, arena, "c2", email, "email", "outbound", "2026-05-09T10:00:00Z")
        _write_with_stats(session, arena, "c3", email, "slack", "inbound", "2026-05-07T15:00:00Z")

    out = _call_aggregate(
        proxy_module,
        arena=arena, contact_email=email, group_by=[],
    )
    assert out.total == 3
    assert len(out.buckets) == 1
    bucket = out.buckets[0]
    assert bucket.keys == {}
    assert bucket.count == 3
    assert bucket.inbound == 2
    assert bucket.outbound == 1
    # Time bounds span the full range across channels.
    assert bucket.first_seen == "2026-05-07T15:00:00Z"
    assert bucket.last_seen == "2026-05-09T10:00:00Z"
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
@_skip_no_neo4j
def test_fast_path_falls_through_to_edge_walk_when_stats_absent(
    neo4j_driver, proxy_module
) -> None:
    """The forward-only optimisation: pre-rollout data has
    COMMUNICATED edges but no ChannelStat nodes. The fast-path check
    must fall through silently and the edge-walk path must still
    return the correct buckets — this is the contract that means no
    migration is needed."""
    driver, (arena, _) = neo4j_driver
    email = "legacy@example.com"
    with driver.session() as session:
        _ensure_indexes(session)
        # Edges only — no ChannelStat written, simulating an older
        # tenant whose data was ingested before this PR.
        _write_edges_only(session, arena, "c1", email, "email", "inbound", "2026-05-09T09:00:00Z")
        _write_edges_only(session, arena, "c2", email, "email", "outbound", "2026-05-09T10:00:00Z")
        # Sanity: confirm no ChannelStat exists for this contact.
        rows = list(session.run(
            "MATCH (s:ChannelStat {arena: $arena, person_email: $email}) RETURN s",
            arena=arena, email=email,
        ))
        assert rows == []

    out = _call_aggregate(
        proxy_module,
        arena=arena, contact_email=email, group_by=["channel"],
    )
    # Edge-walk path took over and returned the same shape.
    assert out.total == 2
    assert len(out.buckets) == 1
    assert out.buckets[0].keys == {"channel": "email"}
    assert out.buckets[0].inbound == 1
    assert out.buckets[0].outbound == 1
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
@_skip_no_neo4j
def test_fast_path_arena_isolated(neo4j_driver, proxy_module) -> None:
    """A's aggregate must never reflect B's stats even when both
    arenas have the same email — the channelstat_arena_email index is
    keyed on arena first to enforce this. Companion to the writer-
    side isolation test in test_channel_stat_rollups.py."""
    driver, (arena_a, arena_b) = neo4j_driver
    email = "shared@example.com"
    with driver.session() as session:
        _ensure_indexes(session)
        # Same contact email in both arenas, different activity mixes.
        _write_with_stats(session, arena_a, "ca1", email, "email", "inbound", "2026-05-09T09:00:00Z")
        _write_with_stats(session, arena_b, "cb1", email, "email", "inbound", "2026-05-09T10:00:00Z")
        _write_with_stats(session, arena_b, "cb2", email, "slack", "outbound", "2026-05-09T11:00:00Z")

    out_a = _call_aggregate(
        proxy_module, arena=arena_a, contact_email=email, group_by=["channel"],
    )
    out_b = _call_aggregate(
        proxy_module, arena=arena_b, contact_email=email, group_by=["channel"],
    )
    assert out_a.total == 1
    assert len(out_a.buckets) == 1
    assert out_a.buckets[0].keys == {"channel": "email"}
    assert out_b.total == 2
    assert len(out_b.buckets) == 2
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
@_skip_no_neo4j
def test_fast_path_and_edge_walk_produce_equivalent_totals(
    neo4j_driver, proxy_module
) -> None:
    """Equivalence: for data that has BOTH stats and edges (the
    normal post-rollout case), the fast-path response must match what
    the edge walk would have returned. Caught a class of bug where
    the writer's rollup drifts from the edges — surface it via
    response divergence rather than waiting for users to notice
    relationships-UI inconsistency."""
    driver, (arena, _) = neo4j_driver
    email = "alex@x.io"
    with driver.session() as session:
        _ensure_indexes(session)
        _write_with_stats(session, arena, "c1", email, "email", "inbound", "2026-05-08T09:00:00Z")
        _write_with_stats(session, arena, "c2", email, "email", "outbound", "2026-05-09T10:00:00Z")
        _write_with_stats(session, arena, "c3", email, "slack", "inbound", "2026-05-07T15:00:00Z")

        # Compute the edge-walk answer by hand from the COMMUNICATED
        # rows. This is the ground truth the rollup should reflect.
        edge_rows = list(session.run(
            "MATCH (p:Person {arena: $arena, email: $email})-[r:COMMUNICATED]->(:Chunk {arena: $arena})\n"
            "WITH r.channel AS channel, r.direction AS direction\n"
            "RETURN channel,\n"
            "       count(*) AS count,\n"
            "       sum(CASE WHEN direction = 'inbound' THEN 1 ELSE 0 END) AS inbound,\n"
            "       sum(CASE WHEN direction = 'outbound' THEN 1 ELSE 0 END) AS outbound\n"
            "ORDER BY count DESC",
            arena=arena, email=email,
        ))
        ground_truth = {
            rec["channel"]: (int(rec["count"]), int(rec["inbound"]), int(rec["outbound"]))
            for rec in edge_rows
        }

    out = _call_aggregate(
        proxy_module, arena=arena, contact_email=email, group_by=["channel"],
    )
    fast_path = {
        b.keys["channel"]: (b.count, b.inbound, b.outbound)
        for b in out.buckets
    }
    assert fast_path == ground_truth
    assert out.total == sum(c for c, _, _ in ground_truth.values())
|