@pentatonic-ai/ai-agent-sdk 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/compat/server.py +38 -6
- package/packages/memory-engine-v2/extractor-async/Dockerfile +5 -3
- package/packages/memory-engine-v2/extractor-async/entity_id.py +57 -0
- package/packages/memory-engine-v2/extractor-async/sensitive_filter.py +51 -0
- package/packages/memory-engine-v2/extractor-async/test_async_ent_parser.py +258 -0
- package/packages/memory-engine-v2/extractor-async/test_sensitive_filter.py +61 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +276 -42
- package/packages/memory-engine-v2/extractor-sync/Dockerfile +1 -1
- package/packages/memory-engine-v2/extractor-sync/entity_id.py +57 -0
- package/packages/memory-engine-v2/extractor-sync/server.py +231 -55
- package/packages/memory-engine-v2/extractor-sync/test_entity_id.py +88 -0
- package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +208 -0
- package/packages/memory-engine-v2/org-model/migrations/002_entity_merges_audit.sql +53 -0
- package/packages/memory-engine-v2/org-model/migrations/003_distillation_traces.sql +60 -0
- package/packages/memory-engine-v2/scripts/backfill_entity_reconciliation.py +581 -0
- package/packages/memory-engine-v2/tests/test_entity_id_parity.py +57 -0
|
@@ -40,7 +40,9 @@ import psycopg
|
|
|
40
40
|
import psycopg.rows
|
|
41
41
|
|
|
42
42
|
from confidence import corroborated_confidence
|
|
43
|
+
from entity_id import entity_id, normalize_surface_form
|
|
43
44
|
from noise_filter import is_noise_entity_name
|
|
45
|
+
from sensitive_filter import SKIP_SENSITIVE_CONTENT, is_sensitive_event
|
|
44
46
|
|
|
45
47
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
46
48
|
log = logging.getLogger("extractor-async")
|
|
@@ -74,6 +76,13 @@ LLM_MAX_TOKENS_PER_EVENT = int(os.environ.get("LLM_MAX_TOKENS_PER_EVENT", "300")
|
|
|
74
76
|
|
|
75
77
|
WORKER_ID = f"{socket.gethostname()}:{os.getpid()}"
|
|
76
78
|
|
|
79
|
+
# Distillation trace logging switch. When enabled, the worker records the
# teacher model's raw per-event input/output so a student model
# (BART/FLAN-T5) can later be trained on the teacher's distribution.
# Off unless the environment opts in with a truthy value
# (true / 1 / yes / on, case-insensitive). See migration 003.
DISTILL_TRACE_ENABLED = (
    os.environ.get("DISTILL_TRACE_ENABLED", "false").strip().lower()
    in ("true", "1", "yes", "on")
)
|
|
85
|
+
|
|
77
86
|
|
|
78
87
|
# KV-text output format constants. We dropped JSON output (and the
|
|
79
88
|
# `guided_json` schema enforcement that went with it) because a single
|
|
@@ -124,8 +133,16 @@ RULES:
|
|
|
124
133
|
- Each event MUST start with a `=== event K ===` header (zero-indexed, \
|
|
125
134
|
matching the input index). NEVER skip an event — if an event has \
|
|
126
135
|
nothing to extract, emit ONLY the header.
|
|
127
|
-
- ENT lines have
|
|
136
|
+
- ENT lines have 3 or 4 fields: literal `ENT`, type, name, [email].
|
|
128
137
|
type ∈ {person, org, product, place, project, concept, topic, date, other}
|
|
138
|
+
email (OPTIONAL, person only): when the event body or attributes
|
|
139
|
+
show an email address that unambiguously identifies the person,
|
|
140
|
+
append it as the 4th field. This pairs the name+email forms so a
|
|
141
|
+
later event seeing only the email resolves to the same entity.
|
|
142
|
+
Examples:
|
|
143
|
+
ENT|person|Alex Wong|alex@example.com
|
|
144
|
+
ENT|org|Acme Corp (org, no email)
|
|
145
|
+
ENT|person|Sam Patel (person, email not visible)
|
|
129
146
|
- FCT lines have exactly 6 fields: `FCT`, category, subject, \
|
|
130
147
|
predicate, object, statement.
|
|
131
148
|
category ∈ {decision, commitment, state, mention, observation, preference}
|
|
@@ -144,6 +161,12 @@ A whole file is one entity, not twenty.
|
|
|
144
161
|
- Output ONLY the formatted records. No header, no footer, no prose."""
|
|
145
162
|
|
|
146
163
|
|
|
164
|
+
# Teacher-prompt fingerprint for trace logging: the first 16 hex characters
# of the SHA-256 of BATCH_SYSTEM_PROMPT. Any edit to the prompt changes the
# hash, so training-data exports can filter by teacher version and never mix
# outputs produced under a retired prompt.
SYSTEM_PROMPT_HASH = hashlib.sha256(BATCH_SYSTEM_PROMPT.encode()).hexdigest()[:16]
|
|
168
|
+
|
|
169
|
+
|
|
147
170
|
def build_event_block(idx: int, event: dict[str, Any]) -> str:
|
|
148
171
|
"""Render one event as `[event K]\nheader\n---\ncontent` block."""
|
|
149
172
|
src = event.get("source_kind", "unknown")
|
|
@@ -200,11 +223,24 @@ def _parse_kv_records(text: str, expected_n: int) -> list[dict[str, Any]]:
|
|
|
200
223
|
# maxsplit so statement / name fields can contain colons or
|
|
201
224
|
# other reserved-looking content without breaking parsing.
|
|
202
225
|
if line.startswith("ENT|"):
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
226
|
+
# ENT|type|name|email? — email is optional, person-only.
|
|
227
|
+
# Use maxsplit=3 so a literal `|` in name (which the prompt
|
|
228
|
+
# forbids but the model might still emit) doesn't get
|
|
229
|
+
# parsed as an email field.
|
|
230
|
+
parts = line.split("|", 3)
|
|
231
|
+
if len(parts) >= 3 and parts[2].strip():
|
|
232
|
+
etype = parts[1].strip().lower()
|
|
233
|
+
name = parts[2].strip()
|
|
234
|
+
ent: dict[str, Any] = {"type": etype, "name": name}
|
|
235
|
+
# Promote 4th-field email into aliases when present
|
|
236
|
+
# and it actually looks like an email. Non-email
|
|
237
|
+
# 4th fields (random text the model added) are dropped
|
|
238
|
+
# — better to silently strip junk than poison aliases.
|
|
239
|
+
if len(parts) == 4:
|
|
240
|
+
email = parts[3].strip()
|
|
241
|
+
if email and "@" in email and " " not in email:
|
|
242
|
+
ent["aliases"] = [email]
|
|
243
|
+
current["entities"].append(ent)
|
|
208
244
|
elif line.startswith("FCT|"):
|
|
209
245
|
parts = line.split("|", 5)
|
|
210
246
|
if len(parts) == 6 and parts[5].strip():
|
|
@@ -232,6 +268,38 @@ def _parse_kv_records(text: str, expected_n: int) -> list[dict[str, Any]]:
|
|
|
232
268
|
return results
|
|
233
269
|
|
|
234
270
|
|
|
271
|
+
def _split_event_blocks(text: str, expected_n: int) -> list[str]:
    """Cut raw LLM output into one verbatim slice per event.

    The output is scanned line by line for `=== event K ===` headers
    (as recognised by EVENT_HEADER_RE). Everything between a header and
    the next header (or end-of-text) becomes that event's slice, with
    trailing whitespace removed. Lines before the first header are
    discarded.

    Returns a list of exactly `expected_n` strings, indexed by event
    number. Events the model never emitted stay as empty strings, and
    headers whose index falls outside `0..expected_n-1` are ignored —
    the same shape contract as _parse_kv_records. If an index appears
    more than once, the last occurrence wins.

    Deliberately independent of the parser: the parser silently drops
    malformed lines (correctness), whereas trace logging needs the
    model's output untouched (training fidelity)."""
    # Pass 1: group body lines under the header that precedes them,
    # preserving the order in which headers appeared.
    sections: list[tuple[int, list[str]]] = []
    for line in text.splitlines():
        stripped = line.strip()
        header = EVENT_HEADER_RE.match(stripped) if stripped else None
        if header is not None:
            sections.append((int(header.group(1)), []))
        elif sections:
            # Keep the raw (unstripped) line so the slice stays verbatim.
            sections[-1][1].append(line)

    # Pass 2: drop each section into its slot; later duplicates overwrite
    # earlier ones, out-of-range indices are skipped.
    blocks: list[str] = [""] * expected_n
    for event_idx, body_lines in sections:
        if 0 <= event_idx < expected_n:
            blocks[event_idx] = "\n".join(body_lines).rstrip()
    return blocks
|
|
301
|
+
|
|
302
|
+
|
|
235
303
|
async def call_llm_batch(
|
|
236
304
|
client: httpx.AsyncClient, events: list[dict[str, Any]]
|
|
237
305
|
) -> list[dict[str, Any]]:
|
|
@@ -279,7 +347,15 @@ async def call_llm_batch(
|
|
|
279
347
|
text = data.get("message", {}).get("content", "")
|
|
280
348
|
if not text:
|
|
281
349
|
raise RuntimeError(f"llm returned no content: {json.dumps(data)[:300]}")
|
|
282
|
-
|
|
350
|
+
parsed = _parse_kv_records(text, n)
|
|
351
|
+
# Attach the per-event raw slice so downstream trace logging gets
|
|
352
|
+
# the model's verbatim output for THIS event without re-splitting
|
|
353
|
+
# the chunk-level text. Parser semantics are unaffected — the
|
|
354
|
+
# raw_slice key is ignored by upsert paths.
|
|
355
|
+
slices = _split_event_blocks(text, n)
|
|
356
|
+
for record, slice_text in zip(parsed, slices):
|
|
357
|
+
record["raw_slice"] = slice_text
|
|
358
|
+
return parsed
|
|
283
359
|
|
|
284
360
|
|
|
285
361
|
# --------------------------------------------------------------------
|
|
@@ -288,6 +364,10 @@ async def call_llm_batch(
|
|
|
288
364
|
|
|
289
365
|
|
|
290
366
|
def _content_id(*parts: str) -> str:
    """Return a deterministic, content-addressed id for facts / relationships.

    The parts are joined with the ASCII unit separator (0x1F) so that
    ("ab", "c") and ("a", "bc") hash differently, then SHA-256 hashed;
    the first 32 hex characters of the digest are the id.

    Entity ids are NOT produced here — they come from `entity_id()` in
    entity_id.py (see `upsert_entities` below). This helper only serves
    the non-entity content-hash needs."""
    joined = "\x1f".join(parts)
    digest = hashlib.sha256(joined.encode())
    return digest.hexdigest()[:32]
|
|
292
372
|
|
|
293
373
|
|
|
@@ -299,12 +379,33 @@ def upsert_entities(
|
|
|
299
379
|
disclosure_class: str,
|
|
300
380
|
entities: list[dict],
|
|
301
381
|
) -> dict[str, str]:
|
|
302
|
-
"""
|
|
303
|
-
relationships can link to the inserted rows.
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
382
|
+
"""Alias-aware insert (or merge) of entities; returns a name→id
|
|
383
|
+
map so facts and relationships can link to the inserted rows.
|
|
384
|
+
|
|
385
|
+
Two concerns layered together:
|
|
386
|
+
|
|
387
|
+
1. **ID derivation** uses the shared `entity_id()` helper from
|
|
388
|
+
entity_id.py: `e_` + 24 hex of sha256("{arena}|{entity_type}|
|
|
389
|
+
{normalize_surface_form(name)}"). BYTE-IDENTICAL to extractor-
|
|
390
|
+
sync's id derivation, so the same person extracted by both
|
|
391
|
+
passes converges to the same row instead of fragmenting across
|
|
392
|
+
two id schemes. (RFC step 1.)
|
|
393
|
+
|
|
394
|
+
2. **Resolution at upsert** — before INSERT, check for existing
|
|
395
|
+
rows in the same (arena, entity_type) whose canonical_name or
|
|
396
|
+
aliases overlap any incoming surface form. If matched, merge
|
|
397
|
+
into the existing row. Per-form `pg_advisory_xact_lock`
|
|
398
|
+
serialises concurrent writers (sync + async on the same event)
|
|
399
|
+
on the same surface form. (RFC steps 2 + 2a.)
|
|
400
|
+
|
|
401
|
+
MIRROR of extractor-sync/server.py:_upsert_entities — same
|
|
402
|
+
resolution algorithm. Kept as separate Python because the sync
|
|
403
|
+
extractor uses async psycopg and the async worker uses sync
|
|
404
|
+
psycopg; the SQL is identical.
|
|
405
|
+
|
|
406
|
+
Returns name→id where `name` is the LLM-emitted surface form
|
|
407
|
+
(canonical) so facts/relationships using the same surface form
|
|
408
|
+
in the same LLM batch resolve to the right id."""
|
|
308
409
|
name_to_id: dict[str, str] = {}
|
|
309
410
|
if not entities:
|
|
310
411
|
return name_to_id
|
|
@@ -314,39 +415,84 @@ def upsert_entities(
|
|
|
314
415
|
name = (e.get("name") or "").strip()
|
|
315
416
|
if not name:
|
|
316
417
|
continue
|
|
317
|
-
# Drop junk names before they enter the graph.
|
|
318
|
-
# noise_filter.py — patterns are anchored to live-arena
|
|
319
|
-
# noise (pronouns, hostnames, paths, agent-worktree
|
|
320
|
-
# labels). Skipping here means name_to_id never carries
|
|
321
|
-
# the bad name, so any fact/relationship the LLM tried to
|
|
322
|
-
# attach to it gets dropped downstream (subj/obj resolve
|
|
323
|
-
# to None ⇒ filtered out by upsert_facts /
|
|
324
|
-
# upsert_relationships).
|
|
418
|
+
# Drop junk names before they enter the graph.
|
|
325
419
|
if is_noise_entity_name(etype, name):
|
|
326
420
|
continue
|
|
327
421
|
aliases = [a for a in (e.get("aliases") or []) if a]
|
|
328
|
-
|
|
329
|
-
|
|
422
|
+
|
|
423
|
+
# Sort (don't `list(set(...))`) so lock acquisition order
|
|
424
|
+
# is deterministic across processes — set-iteration order
|
|
425
|
+
# depends on Python's per-process hash randomisation, so
|
|
426
|
+
# sync and async extractors processing the same person
|
|
427
|
+
# could otherwise acquire the same locks in opposite
|
|
428
|
+
# orders and deadlock.
|
|
429
|
+
forms_original = sorted({name, *aliases})
|
|
430
|
+
forms_normalized = sorted({normalize_surface_form(f) for f in forms_original})
|
|
431
|
+
|
|
432
|
+
# 1. Advisory lock per surface form. Serialises concurrent
|
|
433
|
+
# writers (sync + async on the same event) on the same
|
|
434
|
+
# person without blocking anything else. See RFC §2a.
|
|
435
|
+
for f in forms_normalized:
|
|
436
|
+
cur.execute(
|
|
437
|
+
"SELECT pg_advisory_xact_lock(hashtext(%s))",
|
|
438
|
+
(f"{arena}|{etype}|{f}",),
|
|
439
|
+
)
|
|
440
|
+
|
|
441
|
+
# 2. Resolve via canonical-name (normalised, case-insensitive)
|
|
442
|
+
# or aliases overlap.
|
|
330
443
|
cur.execute(
|
|
331
444
|
"""
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
provenance_event_ids = (
|
|
341
|
-
SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
|
|
342
|
-
),
|
|
343
|
-
last_seen = NOW()
|
|
445
|
+
SELECT id FROM entities
|
|
446
|
+
WHERE arena = %s
|
|
447
|
+
AND entity_type = %s
|
|
448
|
+
AND (
|
|
449
|
+
lower(canonical_name) = ANY(%s::text[])
|
|
450
|
+
OR aliases && %s::text[]
|
|
451
|
+
)
|
|
452
|
+
LIMIT 1
|
|
344
453
|
""",
|
|
345
|
-
(
|
|
346
|
-
eid, arena, etype, name, aliases,
|
|
347
|
-
[event_id], participant_set, disclosure_class,
|
|
348
|
-
),
|
|
454
|
+
(arena, etype, forms_normalized, forms_original),
|
|
349
455
|
)
|
|
456
|
+
row = cur.fetchone()
|
|
457
|
+
|
|
458
|
+
if row is not None:
|
|
459
|
+
# 3a. Existing match — merge aliases + provenance.
|
|
460
|
+
# Canonical stays as-was (accrete-only per RFC §2b).
|
|
461
|
+
eid = row[0]
|
|
462
|
+
cur.execute(
|
|
463
|
+
"""
|
|
464
|
+
UPDATE entities SET
|
|
465
|
+
aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
|
|
466
|
+
provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
|
|
467
|
+
last_seen = NOW()
|
|
468
|
+
WHERE id = %s
|
|
469
|
+
""",
|
|
470
|
+
(aliases, [event_id], eid),
|
|
471
|
+
)
|
|
472
|
+
else:
|
|
473
|
+
# 3b. No match — insert new.
|
|
474
|
+
eid = entity_id(arena, etype, name)
|
|
475
|
+
cur.execute(
|
|
476
|
+
"""
|
|
477
|
+
INSERT INTO entities (
|
|
478
|
+
id, arena, entity_type, canonical_name, aliases,
|
|
479
|
+
provenance_event_ids, participant_set, disclosure_class
|
|
480
|
+
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class)
|
|
481
|
+
ON CONFLICT (id) DO UPDATE SET
|
|
482
|
+
aliases = (
|
|
483
|
+
SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
|
|
484
|
+
),
|
|
485
|
+
provenance_event_ids = (
|
|
486
|
+
SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
|
|
487
|
+
),
|
|
488
|
+
last_seen = NOW()
|
|
489
|
+
""",
|
|
490
|
+
(
|
|
491
|
+
eid, arena, etype, name, aliases,
|
|
492
|
+
[event_id], participant_set, disclosure_class,
|
|
493
|
+
),
|
|
494
|
+
)
|
|
495
|
+
name_to_id[name] = eid
|
|
350
496
|
return name_to_id
|
|
351
497
|
|
|
352
498
|
|
|
@@ -486,6 +632,43 @@ def upsert_relationships(
|
|
|
486
632
|
return inserted
|
|
487
633
|
|
|
488
634
|
|
|
635
|
+
# --------------------------------------------------------------------
|
|
636
|
+
# Distillation trace logging
|
|
637
|
+
# --------------------------------------------------------------------
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
def _insert_trace(
    conn: psycopg.Connection,
    *,
    event_id: str,
    user_prompt: str,
    raw_response: str,
    llm_chunk_ms: float | None,
) -> None:
    """Record one (user_prompt, raw_response) pair in distillation_traces.

    The row also carries the module-level LLM_MODEL and
    SYSTEM_PROMPT_HASH so exports can be filtered by teacher version,
    plus the chunk-level LLM latency passed in as `llm_chunk_ms`.

    This is audit-only and sits outside distillation semantics: the
    caller wraps the call in try/except so a failed trace insert never
    poisons the upsert path. Responses that are empty or whitespace-only
    are skipped — the model occasionally emits a header with no body,
    which carries no training signal."""
    if not raw_response.strip():
        return

    params = (
        event_id, user_prompt, raw_response,
        LLM_MODEL, SYSTEM_PROMPT_HASH, llm_chunk_ms,
    )
    with conn.cursor() as cursor:
        cursor.execute(
            """
            INSERT INTO distillation_traces (
                event_id, user_prompt, raw_response,
                llm_model, system_prompt_hash, llm_chunk_ms
            ) VALUES (%s, %s, %s, %s, %s, %s)
            """,
            params,
        )
|
|
670
|
+
|
|
671
|
+
|
|
489
672
|
# --------------------------------------------------------------------
|
|
490
673
|
# Queue mechanics
|
|
491
674
|
# --------------------------------------------------------------------
|
|
@@ -570,6 +753,29 @@ def claim_next_batch(conn: psycopg.Connection) -> list[dict[str, Any]]:
|
|
|
570
753
|
""",
|
|
571
754
|
(list(SKIP_ATTRIBUTE_SOURCES),),
|
|
572
755
|
)
|
|
756
|
+
# Pre-filter: content-guardrail sensitive events (interpersonal
|
|
757
|
+
# gossip about a colleague). Never distil gossip into the entity
|
|
758
|
+
# graph — the subject has no standing there. attributes is jsonb;
|
|
759
|
+
# cast defensively in case a producer wrote json.
|
|
760
|
+
if SKIP_SENSITIVE_CONTENT:
|
|
761
|
+
cur.execute(
|
|
762
|
+
"""
|
|
763
|
+
UPDATE distillation_queue dq SET
|
|
764
|
+
status = 'done',
|
|
765
|
+
completed_at = NOW(),
|
|
766
|
+
last_error = 'filtered: content-guardrail sensitive'
|
|
767
|
+
FROM events e
|
|
768
|
+
WHERE dq.event_id = e.id
|
|
769
|
+
AND dq.status = 'pending'
|
|
770
|
+
AND (
|
|
771
|
+
(e.attributes::jsonb)->>'sensitivity_class' = 'interpersonal'
|
|
772
|
+
OR (
|
|
773
|
+
jsonb_typeof((e.attributes::jsonb)->'sensitive_about') = 'array'
|
|
774
|
+
AND jsonb_array_length((e.attributes::jsonb)->'sensitive_about') > 0
|
|
775
|
+
)
|
|
776
|
+
)
|
|
777
|
+
"""
|
|
778
|
+
)
|
|
573
779
|
# Pre-filter: events older than the window.
|
|
574
780
|
cur.execute(
|
|
575
781
|
"""
|
|
@@ -704,14 +910,24 @@ async def process_batch(
|
|
|
704
910
|
for item in items:
|
|
705
911
|
events_by_qid[item["id"]] = fetch_event(conn, item["event_id"])
|
|
706
912
|
|
|
707
|
-
# Drop items whose event is missing (mark done up-front, no LLM call)
|
|
913
|
+
# Drop items whose event is missing (mark done up-front, no LLM call),
|
|
914
|
+
# and — content guardrail — any sensitive event that slipped past the
|
|
915
|
+
# claim-time pre-filter (defense in depth; the pure predicate is the
|
|
916
|
+
# testable contract for the rule). Never distil interpersonal gossip.
|
|
708
917
|
callable_items: list[dict[str, Any]] = []
|
|
709
918
|
for item in items:
|
|
710
|
-
|
|
919
|
+
ev = events_by_qid[item["id"]]
|
|
920
|
+
if ev is None:
|
|
711
921
|
log.warning(
|
|
712
922
|
f"event {item['event_id']} missing — marking queue {item['id']} done"
|
|
713
923
|
)
|
|
714
924
|
mark_done(conn, item["id"])
|
|
925
|
+
elif SKIP_SENSITIVE_CONTENT and is_sensitive_event(ev):
|
|
926
|
+
log.info(
|
|
927
|
+
f"content-guardrail: filtering sensitive event {item['event_id']} "
|
|
928
|
+
f"— not distilling gossip into the graph"
|
|
929
|
+
)
|
|
930
|
+
mark_done(conn, item["id"])
|
|
715
931
|
else:
|
|
716
932
|
callable_items.append(item)
|
|
717
933
|
|
|
@@ -740,7 +956,7 @@ async def process_batch(
|
|
|
740
956
|
|
|
741
957
|
# Flatten chunk_outcomes back to per-item results, paired with items.
|
|
742
958
|
for (chunk_items, _chunk_events), (per_item, llm_ms) in zip(chunks, chunk_outcomes):
|
|
743
|
-
for item, result in zip(chunk_items, per_item):
|
|
959
|
+
for local_idx, (item, result) in enumerate(zip(chunk_items, per_item)):
|
|
744
960
|
queue_id = item["id"]
|
|
745
961
|
event_id = item["event_id"]
|
|
746
962
|
attempts = item["attempts"]
|
|
@@ -781,6 +997,24 @@ async def process_batch(
|
|
|
781
997
|
f"relationships={n_rels}"
|
|
782
998
|
+ (f" llm_ms={llm_ms:.0f}/chunk" if not stub_mode else "")
|
|
783
999
|
)
|
|
1000
|
+
# Trace logging — best-effort, never breaks the worker.
|
|
1001
|
+
# Captures (input, output) so a student model can be
|
|
1002
|
+
# trained on the teacher's distribution. Skipped in
|
|
1003
|
+
# stub mode (no real LLM output to record).
|
|
1004
|
+
if DISTILL_TRACE_ENABLED and not stub_mode:
|
|
1005
|
+
try:
|
|
1006
|
+
_insert_trace(
|
|
1007
|
+
conn,
|
|
1008
|
+
event_id=event_id,
|
|
1009
|
+
user_prompt=build_event_block(local_idx, event),
|
|
1010
|
+
raw_response=result.get("raw_slice", ""),
|
|
1011
|
+
llm_chunk_ms=llm_ms,
|
|
1012
|
+
)
|
|
1013
|
+
except Exception as trace_exc:
|
|
1014
|
+
log.warning(
|
|
1015
|
+
f"trace insert failed queue_id={queue_id} "
|
|
1016
|
+
f"event_id={event_id}: {trace_exc}"
|
|
1017
|
+
)
|
|
784
1018
|
except Exception as exc:
|
|
785
1019
|
err = f"{type(exc).__name__}: {exc}"
|
|
786
1020
|
log.warning(
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Canonical entity-ID scheme — SHARED, byte-identical across extractor-sync and
|
|
2
|
+
extractor-async.
|
|
3
|
+
|
|
4
|
+
The two extractors run as separate Docker services with PER-SERVICE build contexts
|
|
5
|
+
(docker-compose `context: ./extractor-sync` / `./extractor-async`), so a single
|
|
6
|
+
importable module can't be COPY'd into both. This file is therefore DUPLICATED in
|
|
7
|
+
each service dir, and tests/test_entity_id_parity.py fails if the copies ever drift.
|
|
8
|
+
|
|
9
|
+
Why this exists: both passes must key an entity (person / org / …) by the SAME id so
|
|
10
|
+
the same entity converges across the deterministic (sync) and LLM (async) passes.
|
|
11
|
+
Before this, the two services keyed entities DIFFERENTLY — sync as
|
|
12
|
+
`e_` + sha256("{arena}|{type}|{name.lower().strip()}")[:24]; async as
|
|
13
|
+
sha256("\\x1f".join(parts))[:32] (no lowercasing, no prefix) — so even identical
|
|
14
|
+
names produced different ids and never merged. We unify on the sync scheme: sync's
|
|
15
|
+
existing rows are unaffected, and the async pass converges onto them.
|
|
16
|
+
|
|
17
|
+
Step 1 of RFC-entity-reconciliation.md (the foundation for alias-aware resolution).
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import hashlib
|
|
23
|
+
import re
|
|
24
|
+
import unicodedata
|
|
25
|
+
|
|
26
|
+
_WHITESPACE = re.compile(r"\s+")


def normalize_surface_form(value: str) -> str:
    """Reduce a surface form (person name, email, org name, …) to the
    canonical string used for identity keying.

    The steps run in this order:

    1. A missing value (None or empty) becomes "" — defensive, since some
       producers hand over a missing field as None.
    2. Unicode NFKC normalisation, which folds width, ligature and
       decomposed-accent variants together. Without it, "Café"
       (precomposed U+00E9) and "Cafe\\u0301" (e + combining acute) render
       identically yet would key as different entities. The same applies
       to fullwidth Latin ("ＣＡＲＬＹ" ↔ "CARLY") and ligatures
       ("ﬁ" U+FB01 ↔ "fi"). This matters for vCard imports, the Mac
       pasteboard, IME input and internationalised name sources.
    3. Outer whitespace is trimmed, then every internal run of `\\s+` is
       collapsed to one space — "Carly  Snider" (slack-autocomplete
       double space) ↔ "Carly Snider".
    4. Lowercasing. Email casing is case-insensitive per spec, and
       person-name casing varies by producer (gmail header casing vs
       slack profile).
    """
    text = "" if not value else value
    composed = unicodedata.normalize("NFKC", text)
    trimmed = composed.strip()
    single_spaced = _WHITESPACE.sub(" ", trimmed)
    return single_spaced.lower()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def entity_id(arena: str, entity_type: str, canonical_name: str) -> str:
    """Derive the deterministic id for an entity.

    The id is `e_` followed by the first 24 hex characters of the
    SHA-256 of "{arena}|{entity_type}|{normalized name}", where the
    name is passed through normalize_surface_form first. Identical
    (arena, entity_type, normalized canonical_name) triples therefore
    map to the same id in BOTH extractor passes, so re-extraction and
    cross-pass extraction converge on one row. The format is the one
    extractor-sync already used, which leaves its existing rows
    unaffected.
    """
    normalized_name = normalize_surface_form(canonical_name)
    hash_input = f"{arena}|{entity_type}|{normalized_name}".encode()
    hex_digest = hashlib.sha256(hash_input).hexdigest()
    return f"e_{hex_digest[:24]}"
|