@pentatonic-ai/ai-agent-sdk 0.10.0 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/compat/server.py +38 -6
- package/packages/memory-engine-v2/extractor-async/Dockerfile +5 -3
- package/packages/memory-engine-v2/extractor-async/entity_id.py +57 -0
- package/packages/memory-engine-v2/extractor-async/sensitive_filter.py +51 -0
- package/packages/memory-engine-v2/extractor-async/test_async_ent_parser.py +258 -0
- package/packages/memory-engine-v2/extractor-async/test_sensitive_filter.py +61 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +307 -43
- package/packages/memory-engine-v2/extractor-sync/Dockerfile +1 -1
- package/packages/memory-engine-v2/extractor-sync/entity_id.py +57 -0
- package/packages/memory-engine-v2/extractor-sync/server.py +231 -55
- package/packages/memory-engine-v2/extractor-sync/test_entity_id.py +88 -0
- package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +208 -0
- package/packages/memory-engine-v2/org-model/migrations/002_entity_merges_audit.sql +53 -0
- package/packages/memory-engine-v2/org-model/migrations/003_distillation_traces.sql +60 -0
- package/packages/memory-engine-v2/scripts/backfill_entity_reconciliation.py +581 -0
- package/packages/memory-engine-v2/tests/test_entity_id_parity.py +57 -0
package/dist/index.cjs
CHANGED
|
@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
878
878
|
}
|
|
879
879
|
|
|
880
880
|
// src/telemetry.js
|
|
881
|
-
var VERSION = "0.10.0";
|
|
881
|
+
var VERSION = "0.10.2";
|
|
882
882
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
883
883
|
function machineId() {
|
|
884
884
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/dist/index.js
CHANGED
|
@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
847
847
|
}
|
|
848
848
|
|
|
849
849
|
// src/telemetry.js
|
|
850
|
-
var VERSION = "0.10.0";
|
|
850
|
+
var VERSION = "0.10.2";
|
|
851
851
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
852
852
|
function machineId() {
|
|
853
853
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pentatonic-ai/ai-agent-sdk",
|
|
3
|
-
"version": "0.10.0",
|
|
3
|
+
"version": "0.10.2",
|
|
4
4
|
"description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -355,16 +355,48 @@ def _content_hash(arena: str, content: str) -> str:
|
|
|
355
355
|
# gateway healthy and queues the rest in compat instead of pushing
|
|
356
356
|
# the failure back through the DO retry loop (which causes DLQ on
|
|
357
357
|
# repeated 502s — observed 2026-05-17). Pair with retry below.
|
|
358
|
-
|
|
358
|
+
#
|
|
359
|
+
# ── Embed lane separation ────────────────────────────────────────────
|
|
360
|
+
# Live /search query embeds and bulk ingest /store(-batch) content embeds
|
|
361
|
+
# share one embedder. A bulk re-embed/ingest job can then saturate the
|
|
362
|
+
# embedder (GPU + the shared semaphore) and starve interactive chat —
|
|
363
|
+
# observed 2026-06-06: a corpus re-embed pinned the embedder, every chat
|
|
364
|
+
# query-embed timed out, semantic search returned 0. Two backward-
|
|
365
|
+
# compatible levers fix it:
|
|
366
|
+
# 1. NV_EMBED_URL_BULK — optional SEPARATE embedder for the bulk lane.
|
|
367
|
+
# Defaults to NV_EMBED_URL, so behaviour is unchanged until a second
|
|
368
|
+
# embedder is provisioned and this is set.
|
|
369
|
+
# 2. Per-lane semaphores — the interactive lane gets its own reserved
|
|
370
|
+
# in-flight slots, so a saturated bulk lane cannot consume the slots
|
|
371
|
+
# live chat needs, EVEN when both lanes share one embedder.
|
|
372
|
+
NV_EMBED_URL_BULK = os.environ.get("NV_EMBED_URL_BULK", NV_EMBED_URL)
|
|
373
|
+
_EMBED_SEMAPHORE = asyncio.Semaphore(
|
|
374
|
+
int(os.environ.get("NV_EMBED_BULK_CONCURRENCY", "4"))
|
|
375
|
+
)
|
|
376
|
+
_EMBED_SEMAPHORE_INTERACTIVE = asyncio.Semaphore(
|
|
377
|
+
int(os.environ.get("NV_EMBED_INTERACTIVE_CONCURRENCY", "4"))
|
|
378
|
+
)
|
|
359
379
|
_EMBED_RETRY_STATUSES = {502, 503, 504, 429}
|
|
360
380
|
_EMBED_MAX_ATTEMPTS = 5
|
|
361
381
|
|
|
362
382
|
|
|
363
|
-
async def _embed_batch(texts: list[str]) -> list[list[float]]:
|
|
383
|
+
async def _embed_batch(
|
|
384
|
+
texts: list[str], lane: str = "bulk"
|
|
385
|
+
) -> list[list[float]]:
|
|
364
386
|
"""Call the external embed gateway. Both 'openai' and
|
|
365
|
-
'pentatonic-gateway' provider shapes supported."""
|
|
387
|
+
'pentatonic-gateway' provider shapes supported.
|
|
388
|
+
|
|
389
|
+
`lane` selects the embed lane (see NV_EMBED_URL_BULK above):
|
|
390
|
+
- 'interactive' — live /search query embeds. Uses NV_EMBED_URL and
|
|
391
|
+
a dedicated semaphore so chat is never starved by bulk ingest.
|
|
392
|
+
- 'bulk' (default) — ingest /store(-batch) content embeds. Uses
|
|
393
|
+
NV_EMBED_URL_BULK (defaults to NV_EMBED_URL).
|
|
394
|
+
"""
|
|
366
395
|
if not texts:
|
|
367
396
|
return []
|
|
397
|
+
interactive = lane == "interactive"
|
|
398
|
+
url = NV_EMBED_URL if interactive else NV_EMBED_URL_BULK
|
|
399
|
+
sem = _EMBED_SEMAPHORE_INTERACTIVE if interactive else _EMBED_SEMAPHORE
|
|
368
400
|
headers = {"Content-Type": "application/json"}
|
|
369
401
|
if NV_EMBED_API_KEY:
|
|
370
402
|
if NV_EMBED_PROVIDER == "pentatonic-gateway":
|
|
@@ -374,7 +406,7 @@ async def _embed_batch(texts: list[str]) -> list[list[float]]:
|
|
|
374
406
|
|
|
375
407
|
body = {"input": texts, "model": "nv-embed-v2"}
|
|
376
408
|
|
|
377
|
-
async with _EMBED_SEMAPHORE:
|
|
409
|
+
async with sem:
|
|
378
410
|
# Retry transient gateway failures (502/503/504/429) with
|
|
379
411
|
# exponential backoff before bubbling up to the caller. Without
|
|
380
412
|
# this a single GPU hiccup propagates a 500 to the TES DO,
|
|
@@ -382,7 +414,7 @@ async def _embed_batch(texts: list[str]) -> list[list[float]]:
|
|
|
382
414
|
last_exc: Exception | None = None
|
|
383
415
|
for attempt in range(_EMBED_MAX_ATTEMPTS):
|
|
384
416
|
try:
|
|
385
|
-
r = await _http.post(NV_EMBED_URL, json=body, headers=headers)
|
|
417
|
+
r = await _http.post(url, json=body, headers=headers)
|
|
386
418
|
if r.status_code in _EMBED_RETRY_STATUSES:
|
|
387
419
|
last_exc = httpx.HTTPStatusError(
|
|
388
420
|
f"embed gateway {r.status_code}", request=r.request, response=r,
|
|
@@ -813,7 +845,7 @@ async def search(req: SearchRequest):
|
|
|
813
845
|
# rejects to force callers to be explicit.
|
|
814
846
|
raise HTTPException(400, "arena or arenas required")
|
|
815
847
|
|
|
816
|
-
qvec = (await _embed_batch([req.query]))[0]
|
|
848
|
+
qvec = (await _embed_batch([req.query], lane="interactive"))[0]
|
|
817
849
|
# Compose Qdrant Filter: arena scope is always required, plus any
|
|
818
850
|
# caller-supplied metadata_filter keys ANDed in. Mirrors how
|
|
819
851
|
# /forget's `metadata_contains` already builds containment filters
|
|
@@ -7,8 +7,10 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|
|
7
7
|
|
|
8
8
|
COPY worker.py .
|
|
9
9
|
# Pure helper modules — sibling imports inside worker.py
|
|
10
|
-
# (noise_filter, confidence).
|
|
11
|
-
#
|
|
12
|
-
|
|
10
|
+
# (noise_filter, confidence, entity_id). entity_id.py is byte-identical to
|
|
11
|
+
# extractor-sync's copy (per-service build contexts prevent a shared module;
|
|
12
|
+
# tests/test_entity_id_parity.py guards drift). The test_*.py files are
|
|
13
|
+
# intentionally excluded; they're for local pytest, not container runtime.
|
|
14
|
+
COPY noise_filter.py confidence.py entity_id.py ./
|
|
13
15
|
|
|
14
16
|
CMD ["python", "worker.py"]
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Canonical entity-ID scheme — SHARED, byte-identical across extractor-sync and
|
|
2
|
+
extractor-async.
|
|
3
|
+
|
|
4
|
+
The two extractors run as separate Docker services with PER-SERVICE build contexts
|
|
5
|
+
(docker-compose `context: ./extractor-sync` / `./extractor-async`), so a single
|
|
6
|
+
importable module can't be COPY'd into both. This file is therefore DUPLICATED in
|
|
7
|
+
each service dir, and tests/test_entity_id_parity.py fails if the copies ever drift.
|
|
8
|
+
|
|
9
|
+
Why this exists: both passes must key an entity (person / org / …) by the SAME id so
|
|
10
|
+
the same entity converges across the deterministic (sync) and LLM (async) passes.
|
|
11
|
+
Before this, the two services keyed entities DIFFERENTLY — sync as
|
|
12
|
+
`e_` + sha256("{arena}|{type}|{name.lower().strip()}")[:24]; async as
|
|
13
|
+
sha256("\\x1f".join(parts))[:32] (no lowercasing, no prefix) — so even identical
|
|
14
|
+
names produced different ids and never merged. We unify on the sync scheme: sync's
|
|
15
|
+
existing rows are unaffected, and the async pass converges onto them.
|
|
16
|
+
|
|
17
|
+
Step 1 of RFC-entity-reconciliation.md (the foundation for alias-aware resolution).
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import hashlib
|
|
23
|
+
import re
|
|
24
|
+
import unicodedata
|
|
25
|
+
|
|
26
|
+
# One or more consecutive whitespace characters; used to collapse internal runs.
_WHITESPACE = re.compile(r"\s+")


def normalize_surface_form(value: str) -> str:
    """Return the identity-keying form of a surface string.

    Applies to person names, emails, org names and similar. The steps run in
    this order:

    1. Falsy input (``None`` or ``""``) becomes the empty string, so a
       producer handing over a missing field does not raise.
    2. Unicode NFKC normalization folds compatibility variants onto one
       representation. Precomposed "Café" (U+00E9) and decomposed
       "Cafe\\u0301" (e + combining acute) key identically, as do fullwidth
       Latin letters ("ＣＡＲＬＹ", U+FF21..U+FF3A range, -> "CARLY") and
       ligatures ("ﬁ", U+FB01, -> "fi").
    3. Outer whitespace is stripped and each internal whitespace run is
       collapsed to a single space ("Carly  Snider" -> "Carly Snider").
    4. The result is lowercased, so producer-specific casing of names and
       emails does not split one entity into several.
    """
    if not value:
        return ""
    composed = unicodedata.normalize("NFKC", value)
    trimmed = composed.strip()
    single_spaced = _WHITESPACE.sub(" ", trimmed)
    return single_spaced.lower()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def entity_id(arena: str, entity_type: str, canonical_name: str) -> str:
    """Derive the deterministic id for an entity.

    The id is ``"e_"`` followed by the first 24 hex digits of the SHA-256 of
    ``"{arena}|{entity_type}|{normalized name}"``. Only ``canonical_name`` is
    passed through ``normalize_surface_form``; ``arena`` and ``entity_type``
    are interpolated as given. Equal inputs therefore always produce the same
    id, which is what lets both extractor passes converge on one row.

    NOTE(review): the module docstring describes the legacy sync key as
    ``name.lower().strip()``. Ids match that scheme only for names that NFKC
    and whitespace collapsing leave unchanged -- confirm no existing rows
    depend on the difference.
    """
    normalized_name = normalize_surface_form(canonical_name)
    key = f"{arena}|{entity_type}|{normalized_name}"
    digest = hashlib.sha256(key.encode()).hexdigest()
    return "e_" + digest[:24]
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""sensitive_filter — content-guardrail skip for the distillation worker.
|
|
2
|
+
|
|
3
|
+
Seesa's ingest classifier tags content that is directed interpersonal
|
|
4
|
+
sentiment / characterization about a named colleague (gossip) with
|
|
5
|
+
`sensitivity_class == 'interpersonal'` and a `sensitive_about` subject
|
|
6
|
+
set (see Seesa docs/permissions/content-guardrail-spec.md). Such content
|
|
7
|
+
must NEVER be distilled into the entity graph: the SUBJECT of gossip has
|
|
8
|
+
no standing in an entity graph, and turning "X thinks Y is checked out"
|
|
9
|
+
into a fact ABOUT Y is the worst pollution — and a real HR / legal
|
|
10
|
+
hazard. The extractor filters these events at claim time (a SQL pre-pass)
|
|
11
|
+
AND per-event via the pure predicate here (defense in depth).
|
|
12
|
+
|
|
13
|
+
commercial_secret is deliberately NOT filtered: it stays in the
|
|
14
|
+
producer's own per-user arena; its cross-user spread is governed upstream
|
|
15
|
+
(L0 / Layer-P), not by the per-arena graph.
|
|
16
|
+
|
|
17
|
+
Pure module — no I/O, no deps. Importable from worker.py without pulling
|
|
18
|
+
in psycopg / httpx, and importable from tests without fixtures (mirrors
|
|
19
|
+
noise_filter.py).
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import os
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
# Guardrail switch. ON by default; set DISTILL_SKIP_SENSITIVE_CONTENT to
# "false" / "0" / "no" / "off" (any case, surrounding whitespace ignored) to
# turn it off for back-compat. It is inert until Seesa stamps the tag upstream
# (the interpersonal detector is opt-in), but shipping it enabled means it
# takes effect the moment tags appear.
_skip_flag_raw = os.environ.get("DISTILL_SKIP_SENSITIVE_CONTENT", "true")
SKIP_SENSITIVE_CONTENT: bool = _skip_flag_raw.strip().lower() not in (
    "false",
    "0",
    "no",
    "off",
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def is_sensitive_event(event: dict[str, Any]) -> bool:
    """Report whether an event must be kept out of distillation.

    Returns True when the event's ``attributes`` mapping either has
    ``sensitivity_class == 'interpersonal'`` or carries a ``sensitive_about``
    list containing at least one non-blank string.

    Every other shape returns False rather than raising: a non-dict event,
    missing / falsy / non-dict ``attributes``, or a ``sensitive_about`` that
    is not a list. This is deliberately fail-open -- the SQL pre-filter is the
    primary line, and a malformed attribute bag must never crash the worker.
    """
    if not isinstance(event, dict):
        return False

    attributes = event.get("attributes")
    if not attributes or not isinstance(attributes, dict):
        return False

    if attributes.get("sensitivity_class") == "interpersonal":
        return True

    subjects = attributes.get("sensitive_about")
    if not isinstance(subjects, list):
        return False
    for subject in subjects:
        # Blank / whitespace-only and non-string entries do not count.
        if isinstance(subject, str) and subject.strip():
            return True
    return False
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
"""Unit tests for entity reconciliation logic in extractor-async.
|
|
2
|
+
|
|
3
|
+
Covers the parts of RFC §1 (normalization), §2 (alias-aware upsert
|
|
4
|
+
helper construction), and §4 (LLM prompt 4-field ENT parsing) that
|
|
5
|
+
don't require a real Postgres connection. The integration scenarios
|
|
6
|
+
(advisory-lock concurrency, end-to-end alias resolution) need a DB
|
|
7
|
+
and live in `packages/memory-engine-v2/tests/`.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import importlib.util
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
import pytest
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Load the sibling worker.py by file path under the module name
# "extractor_async_worker"; the tests below reach its helpers through the
# module-level name `worker`.
_THIS = Path(__file__).resolve().parent
_SPEC = importlib.util.spec_from_file_location("extractor_async_worker",
                                               _THIS / "worker.py")
assert _SPEC and _SPEC.loader
worker = importlib.util.module_from_spec(_SPEC)
try:
    _SPEC.loader.exec_module(worker)
except ImportError as e:
    # An import inside worker.py failed in this environment: skip the whole
    # test module instead of erroring at collection time.
    pytest.skip(f"extractor-async deps unavailable: {e}", allow_module_level=True)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_content_id_does_not_normalize() -> None:
    """`_content_id` must stay case-sensitive.

    Fact / relationship ids are content hashes, so inputs differing only in
    case have to produce different ids. Entity ids take the separate,
    normalizing `entity_id()` path (entity_id.py). This test pins that split:
    making `_content_id` normalize would silently break fact / relationship
    content addressing.
    """
    lower_id = worker._content_id("foo", "bar")
    upper_id = worker._content_id("FOO", "bar")
    assert lower_id != upper_id
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ----------------------------------------------------------------------
|
|
42
|
+
# Prompt parser — ENT|type|name|email? (RFC §Async extractor)
|
|
43
|
+
# ----------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
def test_ent_parser_three_fields_unchanged() -> None:
    """Back-compat: a 3-field ENT line (no email) yields type + name only."""
    llm_output = "=== event 0 ===\nENT|person|Alex Wong\n"
    records = worker._parse_kv_records(llm_output, expected_n=1)
    expected = [{"type": "person", "name": "Alex Wong"}]
    assert records[0]["entities"] == expected
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_ent_parser_four_fields_promotes_email_to_aliases() -> None:
    """A 4-field `ENT|person|<name>|<email>` line puts the email in `aliases`."""
    llm_output = "=== event 0 ===\nENT|person|Carly Snider|carly@example.com\n"
    records = worker._parse_kv_records(llm_output, expected_n=1)
    carly = {
        "type": "person",
        "name": "Carly Snider",
        "aliases": ["carly@example.com"],
    }
    assert records[0]["entities"] == [carly]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_ent_parser_four_fields_non_email_dropped() -> None:
    """A 4th field that is not an email is discarded, not stored as an alias.

    Dropping the junk token is preferred over polluting the aliases list
    with non-email values.
    """
    llm_output = "=== event 0 ===\nENT|person|Sam Patel|something random\n"
    records = worker._parse_kv_records(llm_output, expected_n=1)
    entities = records[0]["entities"]
    assert entities == [{"type": "person", "name": "Sam Patel"}]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_ent_parser_mixed_three_and_four_field() -> None:
    """3-field and 4-field ENT lines can be mixed within one event.

    The model emits an email only when one is visible for that entity, so
    both shapes have to parse side by side.
    """
    lines = [
        "=== event 0 ===",
        "ENT|person|Alex Wong",
        "ENT|person|Carly Snider|carly@example.com",
        "ENT|org|Acme Corp",
    ]
    llm_output = "\n".join(lines) + "\n"
    records = worker._parse_kv_records(llm_output, expected_n=1)
    entities = records[0]["entities"]
    expected_entities = (
        {"type": "person", "name": "Alex Wong"},
        {"type": "person", "name": "Carly Snider",
         "aliases": ["carly@example.com"]},
        {"type": "org", "name": "Acme Corp"},
    )
    for expected in expected_entities:
        assert expected in entities
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def test_ent_parser_email_with_dot_and_subdomain() -> None:
    """Realistic email shapes (dots, subdomains, plus-tags) land in aliases."""
    lines = [
        "=== event 0 ===",
        "ENT|person|Dot Person|first.last@sub.example.com",
        "ENT|person|Plus Tag|user+tag@example.co.uk",
    ]
    llm_output = "\n".join(lines) + "\n"
    records = worker._parse_kv_records(llm_output, expected_n=1)
    alias_lists = [entity.get("aliases") for entity in records[0]["entities"]]
    for email in ("first.last@sub.example.com", "user+tag@example.co.uk"):
        assert [email] in alias_lists
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# ----------------------------------------------------------------------
|
|
112
|
+
# Raw-slice splitter — for distillation trace logging (migration 003).
|
|
113
|
+
# ----------------------------------------------------------------------
|
|
114
|
+
|
|
115
|
+
def test_split_event_blocks_basic() -> None:
    """Two populated events come back as two verbatim slices.

    Each slice is the event body without its header line and without
    trailing whitespace.
    """
    lines = [
        "=== event 0 ===",
        "ENT|person|Alex",
        "FCT|mention|Alex|works at|Acme|Alex works at Acme",
        "=== event 1 ===",
        "ENT|org|Acme",
    ]
    llm_output = "\n".join(lines) + "\n"
    slices = worker._split_event_blocks(llm_output, expected_n=2)
    first_body = "ENT|person|Alex\nFCT|mention|Alex|works at|Acme|Alex works at Acme"
    assert slices[0] == first_body
    assert slices[1] == "ENT|org|Acme"
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def test_split_event_blocks_pads_missing() -> None:
    """Events the model skipped are padded with empty strings.

    The splitter always returns `expected_n` entries, in parity with
    `_parse_kv_records`, so trace logging and parsing line up at the same
    indices.
    """
    llm_output = "=== event 0 ===\nENT|person|Alex\n"  # events 1 and 2 absent
    slices = worker._split_event_blocks(llm_output, expected_n=3)
    assert slices[0] == "ENT|person|Alex"
    for missing_idx in (1, 2):
        assert slices[missing_idx] == ""
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def test_split_event_blocks_ignores_preamble() -> None:
    """Text before the first event header never leaks into a slice.

    Covers chatty model preambles such as 'Sure, here are the extractions:'.
    """
    preamble = "Here are the extractions:\n\n"
    event_zero = "=== event 0 ===\nENT|person|Alex\n"
    slices = worker._split_event_blocks(preamble + event_zero, expected_n=1)
    assert slices[0] == "ENT|person|Alex"
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def test_split_event_blocks_out_of_range_header_dropped() -> None:
    """A header index beyond `expected_n` is discarded together with its body.

    With N=2, a rogue `=== event 7 ===` must neither crash the splitter nor
    bleed into the slices of the valid events around it.
    """
    lines = [
        "=== event 0 ===",
        "ENT|person|Alex",
        "=== event 7 ===",
        "ENT|person|Mystery",
        "=== event 1 ===",
        "ENT|org|Acme",
    ]
    llm_output = "\n".join(lines) + "\n"
    slices = worker._split_event_blocks(llm_output, expected_n=2)
    assert slices[0] == "ENT|person|Alex"
    assert slices[1] == "ENT|org|Acme"
    assert not any("Mystery" in body for body in slices)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def test_split_and_parse_align_at_same_indices() -> None:
    """Splitter and parser agree on indices for the same text and expected_n.

    The worker attaches raw_slice[i] to parsed record i, so the i-th slice
    has to describe the same event as the i-th parsed-records dict.
    """
    llm_output = "\n".join([
        "=== event 0 ===",
        "ENT|person|Alex",
        "=== event 1 ===",
        "ENT|org|Acme",
    ]) + "\n"
    records = worker._parse_kv_records(llm_output, expected_n=2)
    slices = worker._split_event_blocks(llm_output, expected_n=2)
    assert len(records) == len(slices) == 2
    # Event 0: Alex shows up in the parsed entities and in the raw slice.
    assert any(entity["name"] == "Alex" for entity in records[0]["entities"])
    assert "Alex" in slices[0]
    # Event 1: the raw slice carries Acme.
    assert "Acme" in slices[1]
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# ----------------------------------------------------------------------
|
|
193
|
+
# build_event_block format contract — frozen reference for out-of-tree
|
|
194
|
+
# tooling (the Pentatonic-internal training-data export, future student
|
|
195
|
+
# inference, etc.) to match against. The script that generates training
|
|
196
|
+
# data lives OUTSIDE the SDK so it can't import this directly; this
|
|
197
|
+
# test instead asserts a fixed (event_dict, rendered) pair as the
|
|
198
|
+
# canonical contract. If you change build_event_block, also update
|
|
199
|
+
# anything external that reproduces the format.
|
|
200
|
+
# ----------------------------------------------------------------------
|
|
201
|
+
|
|
202
|
+
def test_build_event_block_format_contract_full() -> None:
    """Rendered block when both optional attributes are present.

    `emitted_at` is expected under the `when:` header and `author` under
    `author:`, ahead of the `---` separator and the content.
    """
    attributes = {
        "emitted_at": "2026-06-01T12:00:00Z",
        "author": "phil@example.com",
    }
    event = {
        "source_kind": "chat",
        "content": "hello world",
        "attributes": attributes,
    }
    expected = "\n".join([
        "[event 0]",
        "source_kind: chat",
        "when: 2026-06-01T12:00:00Z",
        "author: phil@example.com",
        "---",
        "hello world",
    ])
    assert worker.build_event_block(0, event) == expected
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def test_build_event_block_format_contract_minimal() -> None:
    """Rendered block with only source_kind and content.

    Absent optional attributes drop their header lines entirely: no blank
    lines and no '-' placeholders.
    """
    event = {"source_kind": "doc", "content": "abc", "attributes": {}}
    expected = "\n".join(["[event 0]", "source_kind: doc", "---", "abc"])
    assert worker.build_event_block(0, event) == expected
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def test_build_event_block_format_contract_idx_passthrough() -> None:
    """The idx argument is echoed in the `[event N]` header.

    Batched extractions rely on this round-trip to reattach output to the
    right event.
    """
    event = {"source_kind": "note", "content": "x", "attributes": {}}
    rendered = worker.build_event_block(7, event)
    assert rendered.startswith("[event 7]\n")
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def test_build_event_block_format_contract_truncation() -> None:
    """Content longer than MAX_CONTENT_CHARS is cut to exactly that length.

    External reconstructors must mirror this, otherwise the student model
    sees a different input distribution at training and inference time.
    """
    oversized = "x" * (worker.MAX_CONTENT_CHARS + 500)
    event = {"source_kind": "doc", "content": oversized, "attributes": {}}
    rendered = worker.build_event_block(0, event)
    # Everything after the first separator line is the (truncated) content.
    rendered_content = rendered.split("---\n", 1)[1]
    assert len(rendered_content) == worker.MAX_CONTENT_CHARS
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Tests for sensitive_filter — content-guardrail distillation skip.
|
|
2
|
+
|
|
3
|
+
The rule: never distil interpersonal gossip about a colleague into the
|
|
4
|
+
entity graph. Mirrors Seesa docs/permissions/content-guardrail-spec.md.
|
|
5
|
+
|
|
6
|
+
Run: pytest packages/memory-engine-v2/extractor-async/test_sensitive_filter.py
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from sensitive_filter import is_sensitive_event
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TestSensitiveEventsAreFiltered:
    """Attribute shapes the guardrail must flag as sensitive."""

    def test_interpersonal_class(self):
        """The interpersonal sensitivity class alone is enough."""
        attributes = {"sensitivity_class": "interpersonal"}
        assert is_sensitive_event({"attributes": attributes})

    def test_non_empty_sensitive_about(self):
        """A sensitive_about list naming a subject is enough."""
        attributes = {"sensitive_about": ["usr_sarah"]}
        assert is_sensitive_event({"attributes": attributes})

    def test_both(self):
        """Class and subject list together are still flagged."""
        attributes = {
            "sensitivity_class": "interpersonal",
            "sensitive_about": ["usr_x"],
        }
        assert is_sensitive_event({"attributes": attributes})
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class TestNonSensitiveEventsPass:
    """Attribute shapes that must flow through to distillation."""

    def test_clean_event(self):
        """Ordinary attributes with no sensitivity tags pass."""
        attributes = {"source": "pip-granola-ingest", "content": "Q3 roadmap"}
        assert not is_sensitive_event({"attributes": attributes})

    def test_commercial_secret_is_NOT_filtered(self):
        """commercial_secret stays in the producer's own arena, so the
        graph filter leaves it alone."""
        attributes = {"sensitivity_class": "commercial_secret"}
        assert not is_sensitive_event({"attributes": attributes})

    def test_empty_sensitive_about(self):
        """An empty subject list is not a tag."""
        attributes = {"sensitive_about": []}
        assert not is_sensitive_event({"attributes": attributes})

    def test_blank_only_subjects(self):
        """Blank and whitespace-only subjects do not count."""
        attributes = {"sensitive_about": ["", "   "]}
        assert not is_sensitive_event({"attributes": attributes})
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class TestMalformedShapesDefaultFalse:
    """Malformed inputs must yield False (fail-open) instead of raising."""

    def test_no_attributes(self):
        """An event with no attributes key."""
        event = {}
        assert not is_sensitive_event(event)

    def test_attributes_none(self):
        """attributes explicitly set to None."""
        event = {"attributes": None}
        assert not is_sensitive_event(event)

    def test_attributes_not_dict(self):
        """attributes holding a non-mapping value."""
        event = {"attributes": "oops"}
        assert not is_sensitive_event(event)

    def test_event_not_dict(self):
        """The event itself is not a dict."""
        assert not is_sensitive_event("nope")  # type: ignore[arg-type]

    def test_sensitive_about_not_list(self):
        """A bare string in sensitive_about is not treated as a subject list."""
        event = {"attributes": {"sensitive_about": "usr_x"}}
        assert not is_sensitive_event(event)
|