@pentatonic-ai/ai-agent-sdk 0.10.0 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
878
878
  }
879
879
 
880
880
  // src/telemetry.js
881
- var VERSION = "0.10.0";
881
+ var VERSION = "0.10.2";
882
882
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
883
883
  function machineId() {
884
884
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
847
847
  }
848
848
 
849
849
  // src/telemetry.js
850
- var VERSION = "0.10.0";
850
+ var VERSION = "0.10.2";
851
851
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
852
852
  function machineId() {
853
853
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.10.0",
3
+ "version": "0.10.2",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -355,16 +355,48 @@ def _content_hash(arena: str, content: str) -> str:
355
355
  # gateway healthy and queues the rest in compat instead of pushing
356
356
  # the failure back through the DO retry loop (which causes DLQ on
357
357
  # repeated 502s — observed 2026-05-17). Pair with retry below.
358
- _EMBED_SEMAPHORE = asyncio.Semaphore(4)
358
+ #
359
+ # ── Embed lane separation ────────────────────────────────────────────
360
+ # Live /search query embeds and bulk ingest /store(-batch) content embeds
361
+ # share one embedder. A bulk re-embed/ingest job can then saturate the
362
+ # embedder (GPU + the shared semaphore) and starve interactive chat —
363
+ # observed 2026-06-06: a corpus re-embed pinned the embedder, every chat
364
+ # query-embed timed out, semantic search returned 0. Two backward-
365
+ # compatible levers fix it:
366
+ # 1. NV_EMBED_URL_BULK — optional SEPARATE embedder for the bulk lane.
367
+ # Defaults to NV_EMBED_URL, so behaviour is unchanged until a second
368
+ # embedder is provisioned and this is set.
369
+ # 2. Per-lane semaphores — the interactive lane gets its own reserved
370
+ # in-flight slots, so a saturated bulk lane cannot consume the slots
371
+ # live chat needs, EVEN when both lanes share one embedder.
372
+ NV_EMBED_URL_BULK = os.environ.get("NV_EMBED_URL_BULK", NV_EMBED_URL)
373
+ _EMBED_SEMAPHORE = asyncio.Semaphore(
374
+ int(os.environ.get("NV_EMBED_BULK_CONCURRENCY", "4"))
375
+ )
376
+ _EMBED_SEMAPHORE_INTERACTIVE = asyncio.Semaphore(
377
+ int(os.environ.get("NV_EMBED_INTERACTIVE_CONCURRENCY", "4"))
378
+ )
359
379
  _EMBED_RETRY_STATUSES = {502, 503, 504, 429}
360
380
  _EMBED_MAX_ATTEMPTS = 5
361
381
 
362
382
 
363
- async def _embed_batch(texts: list[str]) -> list[list[float]]:
383
+ async def _embed_batch(
384
+ texts: list[str], lane: str = "bulk"
385
+ ) -> list[list[float]]:
364
386
  """Call the external embed gateway. Both 'openai' and
365
- 'pentatonic-gateway' provider shapes supported."""
387
+ 'pentatonic-gateway' provider shapes supported.
388
+
389
+ `lane` selects the embed lane (see NV_EMBED_URL_BULK above):
390
+ - 'interactive' — live /search query embeds. Uses NV_EMBED_URL and
391
+ a dedicated semaphore so chat is never starved by bulk ingest.
392
+ - 'bulk' (default) — ingest /store(-batch) content embeds. Uses
393
+ NV_EMBED_URL_BULK (defaults to NV_EMBED_URL).
394
+ """
366
395
  if not texts:
367
396
  return []
397
+ interactive = lane == "interactive"
398
+ url = NV_EMBED_URL if interactive else NV_EMBED_URL_BULK
399
+ sem = _EMBED_SEMAPHORE_INTERACTIVE if interactive else _EMBED_SEMAPHORE
368
400
  headers = {"Content-Type": "application/json"}
369
401
  if NV_EMBED_API_KEY:
370
402
  if NV_EMBED_PROVIDER == "pentatonic-gateway":
@@ -374,7 +406,7 @@ async def _embed_batch(texts: list[str]) -> list[list[float]]:
374
406
 
375
407
  body = {"input": texts, "model": "nv-embed-v2"}
376
408
 
377
- async with _EMBED_SEMAPHORE:
409
+ async with sem:
378
410
  # Retry transient gateway failures (502/503/504/429) with
379
411
  # exponential backoff before bubbling up to the caller. Without
380
412
  # this a single GPU hiccup propagates a 500 to the TES DO,
@@ -382,7 +414,7 @@ async def _embed_batch(texts: list[str]) -> list[list[float]]:
382
414
  last_exc: Exception | None = None
383
415
  for attempt in range(_EMBED_MAX_ATTEMPTS):
384
416
  try:
385
- r = await _http.post(NV_EMBED_URL, json=body, headers=headers)
417
+ r = await _http.post(url, json=body, headers=headers)
386
418
  if r.status_code in _EMBED_RETRY_STATUSES:
387
419
  last_exc = httpx.HTTPStatusError(
388
420
  f"embed gateway {r.status_code}", request=r.request, response=r,
@@ -813,7 +845,7 @@ async def search(req: SearchRequest):
813
845
  # rejects to force callers to be explicit.
814
846
  raise HTTPException(400, "arena or arenas required")
815
847
 
816
- qvec = (await _embed_batch([req.query]))[0]
848
+ qvec = (await _embed_batch([req.query], lane="interactive"))[0]
817
849
  # Compose Qdrant Filter: arena scope is always required, plus any
818
850
  # caller-supplied metadata_filter keys ANDed in. Mirrors how
819
851
  # /forget's `metadata_contains` already builds containment filters
@@ -7,8 +7,10 @@ RUN pip install --no-cache-dir -r requirements.txt
7
7
 
8
8
  COPY worker.py .
9
9
  # Pure helper modules — sibling imports inside worker.py
10
- # (noise_filter, confidence). The test_*.py files are intentionally
11
- # excluded; they're for local pytest, not container runtime.
12
- COPY noise_filter.py confidence.py ./
10
+ # (noise_filter, confidence, entity_id). entity_id.py is byte-identical to
11
+ # extractor-sync's copy (per-service build contexts prevent a shared module;
12
+ # tests/test_entity_id_parity.py guards drift). The test_*.py files are
13
+ # intentionally excluded; they're for local pytest, not container runtime.
14
+ COPY noise_filter.py confidence.py entity_id.py ./
13
15
 
14
16
  CMD ["python", "worker.py"]
@@ -0,0 +1,57 @@
1
+ """Canonical entity-ID scheme — SHARED, byte-identical across extractor-sync and
2
+ extractor-async.
3
+
4
+ The two extractors run as separate Docker services with PER-SERVICE build contexts
5
+ (docker-compose `context: ./extractor-sync` / `./extractor-async`), so a single
6
+ importable module can't be COPY'd into both. This file is therefore DUPLICATED in
7
+ each service dir, and tests/test_entity_id_parity.py fails if the copies ever drift.
8
+
9
+ Why this exists: both passes must key an entity (person / org / …) by the SAME id so
10
+ the same entity converges across the deterministic (sync) and LLM (async) passes.
11
+ Before this, the two services keyed entities DIFFERENTLY — sync as
12
+ `e_` + sha256("{arena}|{type}|{name.lower().strip()}")[:24]; async as
13
+ sha256("\\x1f".join(parts))[:32] (no lowercasing, no prefix) — so even identical
14
+ names produced different ids and never merged. We unify on the sync scheme: sync's
15
+ existing rows are unaffected, and the async pass converges onto them.
16
+
17
+ Step 1 of RFC-entity-reconciliation.md (the foundation for alias-aware resolution).
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import hashlib
23
+ import re
24
+ import unicodedata
25
+
26
+ _WHITESPACE = re.compile(r"\s+")
27
+
28
+
29
+ def normalize_surface_form(value: str) -> str:
30
+ """Normalize a surface form (person name, email, org name, …) for identity
31
+ keying. Steps, in order:
32
+
33
+ 1. None → "" (defensive; some producers can hand a missing field as None).
34
+ 2. Unicode NFKC (compatibility decomposition + canonical composition) —
35
+ collapses width / ligature / decomposed-accent variants. Without this
36
+ "Café" (precomposed U+00E9) and "Cafe\\u0301" (decomposed e+combining
37
+ acute) — which render identically — would key as different entities.
38
+ Same for fullwidth Latin ("ＣＡＲＬＹ" ↔ "CARLY") and ligatures
39
+ ("ﬁ" U+FB01 ↔ "fi"). Real-world relevant for vCard imports, Mac
40
+ pasteboard, IME inputs, internationalised name sources.
41
+ 3. Trim outer whitespace, then collapse internal `\\s+` to a single space —
42
+ "Carly  Snider" (slack-autocomplete double-space) ↔ "Carly Snider".
43
+ 4. Lowercase. Email casing is case-insensitive per spec; person-name
44
+ casing varies by producer (gmail header casing vs slack profile).
45
+ """
46
+ s = unicodedata.normalize("NFKC", value or "")
47
+ return _WHITESPACE.sub(" ", s.strip()).lower()
48
+
49
+
50
+ def entity_id(arena: str, entity_type: str, canonical_name: str) -> str:
51
+ """Deterministic entity id. The same (arena, entity_type, normalized
52
+ canonical_name) yields the same id across BOTH extractor passes, so re-extraction
53
+ and cross-pass extraction converge. Format is preserved from extractor-sync
54
+ (`e_` + 24 hex of sha256) so its existing rows are unaffected.
55
+ """
56
+ key = f"{arena}|{entity_type}|{normalize_surface_form(canonical_name)}"
57
+ return "e_" + hashlib.sha256(key.encode()).hexdigest()[:24]
@@ -0,0 +1,51 @@
1
+ """sensitive_filter — content-guardrail skip for the distillation worker.
2
+
3
+ Seesa's ingest classifier tags content that is directed interpersonal
4
+ sentiment / characterization about a named colleague (gossip) with
5
+ `sensitivity_class == 'interpersonal'` and a `sensitive_about` subject
6
+ set (see Seesa docs/permissions/content-guardrail-spec.md). Such content
7
+ must NEVER be distilled into the entity graph: the SUBJECT of gossip has
8
+ no standing in an entity graph, and turning "X thinks Y is checked out"
9
+ into a fact ABOUT Y is the worst pollution — and a real HR / legal
10
+ hazard. The extractor filters these events at claim time (a SQL pre-pass)
11
+ AND per-event via the pure predicate here (defense in depth).
12
+
13
+ commercial_secret is deliberately NOT filtered: it stays in the
14
+ producer's own per-user arena; its cross-user spread is governed upstream
15
+ (L0 / Layer-P), not by the per-arena graph.
16
+
17
+ Pure module — no I/O, no deps. Importable from worker.py without pulling
18
+ in psycopg / httpx, and importable from tests without fixtures (mirrors
19
+ noise_filter.py).
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import os
25
+ from typing import Any
26
+
27
+ # Default ON; tunable off for back-compat. Inert until Seesa stamps the
28
+ # tag upstream (the interpersonal detector is opt-in), but safe to ship on
29
+ # so it takes effect the moment tags appear.
30
+ SKIP_SENSITIVE_CONTENT: bool = os.environ.get(
31
+ "DISTILL_SKIP_SENSITIVE_CONTENT", "true"
32
+ ).strip().lower() not in ("false", "0", "no", "off")
33
+
34
+
35
+ def is_sensitive_event(event: dict[str, Any]) -> bool:
36
+ """True iff the event was tagged interpersonal gossip about a colleague
37
+ (`sensitivity_class == 'interpersonal'`) or carries a non-empty
38
+ `sensitive_about` subject set. Tolerant of missing / odd attribute
39
+ shapes — defaults to False (fail-open: the SQL pre-filter is the
40
+ primary line; a malformed attribute bag never crashes the worker)."""
41
+ if not isinstance(event, dict):
42
+ return False
43
+ attrs = event.get("attributes") or {}
44
+ if not isinstance(attrs, dict):
45
+ return False
46
+ if attrs.get("sensitivity_class") == "interpersonal":
47
+ return True
48
+ sa = attrs.get("sensitive_about")
49
+ return isinstance(sa, list) and any(
50
+ isinstance(s, str) and s.strip() for s in sa
51
+ )
@@ -0,0 +1,258 @@
1
+ """Unit tests for entity reconciliation logic in extractor-async.
2
+
3
+ Covers the parts of RFC §1 (normalization), §2 (alias-aware upsert
4
+ helper construction), and §4 (LLM prompt 4-field ENT parsing) that
5
+ don't require a real Postgres connection. The integration scenarios
6
+ (advisory-lock concurrency, end-to-end alias resolution) need a DB
7
+ and live in `packages/memory-engine-v2/tests/`.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import importlib.util
13
+ from pathlib import Path
14
+
15
+ import pytest
16
+
17
+
18
+ _THIS = Path(__file__).resolve().parent
19
+ _SPEC = importlib.util.spec_from_file_location("extractor_async_worker",
20
+ _THIS / "worker.py")
21
+ assert _SPEC and _SPEC.loader
22
+ worker = importlib.util.module_from_spec(_SPEC)
23
+ try:
24
+ _SPEC.loader.exec_module(worker)
25
+ except ImportError as e:
26
+ pytest.skip(f"extractor-async deps unavailable: {e}", allow_module_level=True)
27
+
28
+
29
+
30
+ def test_content_id_does_not_normalize() -> None:
31
+ """`_content_id` is used for fact / relationship ids — those are
32
+ content-hashed and intentionally case-sensitive. Entity ids go
33
+ through `entity_id()` (entity_id.py) instead, which DOES normalize.
34
+ Locks in that separation: changing `_content_id` to normalize
35
+ would silently break fact/rel content addressing."""
36
+ a = worker._content_id("foo", "bar")
37
+ b = worker._content_id("FOO", "bar")
38
+ assert a != b
39
+
40
+
41
+ # ----------------------------------------------------------------------
42
+ # Prompt parser — ENT|type|name|email? (RFC §Async extractor)
43
+ # ----------------------------------------------------------------------
44
+
45
+ def test_ent_parser_three_fields_unchanged() -> None:
46
+ """Back-compat: 3-field ENT lines (no email) parse as before."""
47
+ out = worker._parse_kv_records(
48
+ "=== event 0 ===\nENT|person|Alex Wong\n", expected_n=1
49
+ )
50
+ assert out[0]["entities"] == [{"type": "person", "name": "Alex Wong"}]
51
+
52
+
53
+ def test_ent_parser_four_fields_promotes_email_to_aliases() -> None:
54
+ """New: 4-field ENT|person|<name>|<email> → email goes into aliases."""
55
+ out = worker._parse_kv_records(
56
+ "=== event 0 ===\nENT|person|Carly Snider|carly@example.com\n",
57
+ expected_n=1,
58
+ )
59
+ assert out[0]["entities"] == [{
60
+ "type": "person",
61
+ "name": "Carly Snider",
62
+ "aliases": ["carly@example.com"],
63
+ }]
64
+
65
+
66
+ def test_ent_parser_four_fields_non_email_dropped() -> None:
67
+ """Junk 4th field (not an email) is silently dropped — better to
68
+ strip noise than poison the aliases list with non-email tokens."""
69
+ out = worker._parse_kv_records(
70
+ "=== event 0 ===\nENT|person|Sam Patel|something random\n",
71
+ expected_n=1,
72
+ )
73
+ assert out[0]["entities"] == [{"type": "person", "name": "Sam Patel"}]
74
+
75
+
76
+ def test_ent_parser_mixed_three_and_four_field() -> None:
77
+ """A batch can mix 3-field and 4-field ENT lines (model behaviour
78
+ will vary by what's visible per event)."""
79
+ out = worker._parse_kv_records(
80
+ (
81
+ "=== event 0 ===\n"
82
+ "ENT|person|Alex Wong\n"
83
+ "ENT|person|Carly Snider|carly@example.com\n"
84
+ "ENT|org|Acme Corp\n"
85
+ ),
86
+ expected_n=1,
87
+ )
88
+ ents = out[0]["entities"]
89
+ assert {"type": "person", "name": "Alex Wong"} in ents
90
+ assert {"type": "person", "name": "Carly Snider",
91
+ "aliases": ["carly@example.com"]} in ents
92
+ assert {"type": "org", "name": "Acme Corp"} in ents
93
+
94
+
95
+ def test_ent_parser_email_with_dot_and_subdomain() -> None:
96
+ """Realistic email forms (subdomains, plus-tags, dots) get
97
+ accepted into aliases."""
98
+ out = worker._parse_kv_records(
99
+ (
100
+ "=== event 0 ===\n"
101
+ "ENT|person|Dot Person|first.last@sub.example.com\n"
102
+ "ENT|person|Plus Tag|user+tag@example.co.uk\n"
103
+ ),
104
+ expected_n=1,
105
+ )
106
+ aliases = [e.get("aliases") for e in out[0]["entities"]]
107
+ assert ["first.last@sub.example.com"] in aliases
108
+ assert ["user+tag@example.co.uk"] in aliases
109
+
110
+
111
+ # ----------------------------------------------------------------------
112
+ # Raw-slice splitter — for distillation trace logging (migration 003).
113
+ # ----------------------------------------------------------------------
114
+
115
+ def test_split_event_blocks_basic() -> None:
116
+ """Two events, each with content; both slices come back verbatim
117
+ (sans trailing whitespace)."""
118
+ text = (
119
+ "=== event 0 ===\n"
120
+ "ENT|person|Alex\n"
121
+ "FCT|mention|Alex|works at|Acme|Alex works at Acme\n"
122
+ "=== event 1 ===\n"
123
+ "ENT|org|Acme\n"
124
+ )
125
+ slices = worker._split_event_blocks(text, expected_n=2)
126
+ assert slices[0] == (
127
+ "ENT|person|Alex\nFCT|mention|Alex|works at|Acme|Alex works at Acme"
128
+ )
129
+ assert slices[1] == "ENT|org|Acme"
130
+
131
+
132
+ def test_split_event_blocks_pads_missing() -> None:
133
+ """If the model skips an event (header absent) we still return
134
+ expected_n entries — missing ones come back as empty strings.
135
+ Parity with `_parse_kv_records` so trace logging and parsing line
136
+ up at the same indices."""
137
+ text = "=== event 0 ===\nENT|person|Alex\n" # no event 1, 2
138
+ slices = worker._split_event_blocks(text, expected_n=3)
139
+ assert slices[0] == "ENT|person|Alex"
140
+ assert slices[1] == ""
141
+ assert slices[2] == ""
142
+
143
+
144
+ def test_split_event_blocks_ignores_preamble() -> None:
145
+ """Lines before the first event header (model preamble like
146
+ 'Sure, here are the extractions:') don't poison any slice."""
147
+ text = (
148
+ "Here are the extractions:\n\n"
149
+ "=== event 0 ===\n"
150
+ "ENT|person|Alex\n"
151
+ )
152
+ slices = worker._split_event_blocks(text, expected_n=1)
153
+ assert slices[0] == "ENT|person|Alex"
154
+
155
+
156
+ def test_split_event_blocks_out_of_range_header_dropped() -> None:
157
+ """If the model emits `=== event 7 ===` when N=2, the rogue header
158
+ + its body get dropped without crashing or corrupting other slices."""
159
+ text = (
160
+ "=== event 0 ===\n"
161
+ "ENT|person|Alex\n"
162
+ "=== event 7 ===\n"
163
+ "ENT|person|Mystery\n"
164
+ "=== event 1 ===\n"
165
+ "ENT|org|Acme\n"
166
+ )
167
+ slices = worker._split_event_blocks(text, expected_n=2)
168
+ assert slices[0] == "ENT|person|Alex"
169
+ assert slices[1] == "ENT|org|Acme"
170
+ assert all("Mystery" not in s for s in slices)
171
+
172
+
173
+ def test_split_and_parse_align_at_same_indices() -> None:
174
+ """Contract: for the same text + expected_n, the i-th slice
175
+ corresponds to the i-th parsed-records dict. The worker relies on
176
+ this to attach raw_slice to the right record."""
177
+ text = (
178
+ "=== event 0 ===\n"
179
+ "ENT|person|Alex\n"
180
+ "=== event 1 ===\n"
181
+ "ENT|org|Acme\n"
182
+ )
183
+ parsed = worker._parse_kv_records(text, expected_n=2)
184
+ slices = worker._split_event_blocks(text, expected_n=2)
185
+ assert len(parsed) == len(slices) == 2
186
+ # event 0 — Alex is in both the parsed entities AND the slice
187
+ assert any(e["name"] == "Alex" for e in parsed[0]["entities"])
188
+ assert "Alex" in slices[0]
189
+ assert "Acme" in slices[1]
190
+
191
+
192
+ # ----------------------------------------------------------------------
193
+ # build_event_block format contract — frozen reference for out-of-tree
194
+ # tooling (the Pentatonic-internal training-data export, future student
195
+ # inference, etc.) to match against. The script that generates training
196
+ # data lives OUTSIDE the SDK so it can't import this directly; this
197
+ # test instead asserts a fixed (event_dict, rendered) pair as the
198
+ # canonical contract. If you change build_event_block, also update
199
+ # anything external that reproduces the format.
200
+ # ----------------------------------------------------------------------
201
+
202
+ def test_build_event_block_format_contract_full() -> None:
203
+ """All optional attributes (emitted_at, author) present."""
204
+ event = {
205
+ "source_kind": "chat",
206
+ "content": "hello world",
207
+ "attributes": {
208
+ "emitted_at": "2026-06-01T12:00:00Z",
209
+ "author": "phil@example.com",
210
+ },
211
+ }
212
+ assert worker.build_event_block(0, event) == (
213
+ "[event 0]\n"
214
+ "source_kind: chat\n"
215
+ "when: 2026-06-01T12:00:00Z\n"
216
+ "author: phil@example.com\n"
217
+ "---\n"
218
+ "hello world"
219
+ )
220
+
221
+
222
+ def test_build_event_block_format_contract_minimal() -> None:
223
+ """Only source_kind + content. Missing optional attrs drop their
224
+ header lines entirely — not blank lines, not '-' placeholders."""
225
+ event = {
226
+ "source_kind": "doc",
227
+ "content": "abc",
228
+ "attributes": {},
229
+ }
230
+ assert worker.build_event_block(0, event) == (
231
+ "[event 0]\n"
232
+ "source_kind: doc\n"
233
+ "---\n"
234
+ "abc"
235
+ )
236
+
237
+
238
+ def test_build_event_block_format_contract_idx_passthrough() -> None:
239
+ """The idx argument names the event in the header — must round-trip
240
+ so that batched extractions can reattach to the right event."""
241
+ block = worker.build_event_block(7, {
242
+ "source_kind": "note", "content": "x", "attributes": {},
243
+ })
244
+ assert block.startswith("[event 7]\n")
245
+
246
+
247
+ def test_build_event_block_format_contract_truncation() -> None:
248
+ """Content longer than MAX_CONTENT_CHARS gets truncated. External
249
+ reconstructors must mirror this — otherwise the student sees a
250
+ different input distribution at training vs inference time."""
251
+ long_content = "x" * (worker.MAX_CONTENT_CHARS + 500)
252
+ block = worker.build_event_block(0, {
253
+ "source_kind": "doc",
254
+ "content": long_content,
255
+ "attributes": {},
256
+ })
257
+ content_part = block.split("---\n", 1)[1]
258
+ assert len(content_part) == worker.MAX_CONTENT_CHARS
@@ -0,0 +1,61 @@
1
+ """Tests for sensitive_filter — content-guardrail distillation skip.
2
+
3
+ The rule: never distil interpersonal gossip about a colleague into the
4
+ entity graph. Mirrors Seesa docs/permissions/content-guardrail-spec.md.
5
+
6
+ Run: pytest packages/memory-engine-v2/extractor-async/test_sensitive_filter.py
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from sensitive_filter import is_sensitive_event
12
+
13
+
14
+ class TestSensitiveEventsAreFiltered:
15
+ def test_interpersonal_class(self):
16
+ ev = {"attributes": {"sensitivity_class": "interpersonal"}}
17
+ assert is_sensitive_event(ev)
18
+
19
+ def test_non_empty_sensitive_about(self):
20
+ ev = {"attributes": {"sensitive_about": ["usr_sarah"]}}
21
+ assert is_sensitive_event(ev)
22
+
23
+ def test_both(self):
24
+ ev = {"attributes": {"sensitivity_class": "interpersonal", "sensitive_about": ["usr_x"]}}
25
+ assert is_sensitive_event(ev)
26
+
27
+
28
+ class TestNonSensitiveEventsPass:
29
+ def test_clean_event(self):
30
+ ev = {"attributes": {"source": "pip-granola-ingest", "content": "Q3 roadmap"}}
31
+ assert not is_sensitive_event(ev)
32
+
33
+ def test_commercial_secret_is_NOT_filtered(self):
34
+ # commercial_secret stays in the producer's own arena — not graph-filtered.
35
+ ev = {"attributes": {"sensitivity_class": "commercial_secret"}}
36
+ assert not is_sensitive_event(ev)
37
+
38
+ def test_empty_sensitive_about(self):
39
+ ev = {"attributes": {"sensitive_about": []}}
40
+ assert not is_sensitive_event(ev)
41
+
42
+ def test_blank_only_subjects(self):
43
+ ev = {"attributes": {"sensitive_about": ["", " "]}}
44
+ assert not is_sensitive_event(ev)
45
+
46
+
47
+ class TestMalformedShapesDefaultFalse:
48
+ def test_no_attributes(self):
49
+ assert not is_sensitive_event({})
50
+
51
+ def test_attributes_none(self):
52
+ assert not is_sensitive_event({"attributes": None})
53
+
54
+ def test_attributes_not_dict(self):
55
+ assert not is_sensitive_event({"attributes": "oops"})
56
+
57
+ def test_event_not_dict(self):
58
+ assert not is_sensitive_event("nope") # type: ignore[arg-type]
59
+
60
+ def test_sensitive_about_not_list(self):
61
+ assert not is_sensitive_event({"attributes": {"sensitive_about": "usr_x"}})