@pentatonic-ai/ai-agent-sdk 0.10.1 → 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -40,7 +40,9 @@ import psycopg
40
40
  import psycopg.rows
41
41
 
42
42
  from confidence import corroborated_confidence
43
+ from entity_id import entity_id, normalize_surface_form
43
44
  from noise_filter import is_noise_entity_name
45
+ from sensitive_filter import SKIP_SENSITIVE_CONTENT, is_sensitive_event
44
46
 
45
47
  logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
46
48
  log = logging.getLogger("extractor-async")
@@ -74,6 +76,13 @@ LLM_MAX_TOKENS_PER_EVENT = int(os.environ.get("LLM_MAX_TOKENS_PER_EVENT", "300")
74
76
 
75
77
  WORKER_ID = f"{socket.gethostname()}:{os.getpid()}"
76
78
 
79
# Distillation trace logging switch. When enabled, the raw teacher
# input/output for each distilled event is recorded so a student model
# (BART/FLAN-T5) can be trained on the teacher's distribution.
# Off unless the environment opts in; see migration 003.
# Accepted truthy spellings (case-insensitive, surrounding whitespace
# ignored): true, 1, yes, on. Anything else means disabled.
DISTILL_TRACE_ENABLED = (
    os.environ.get("DISTILL_TRACE_ENABLED", "false").strip().lower()
    in ("true", "1", "yes", "on")
)
85
+
77
86
 
78
87
  # KV-text output format constants. We dropped JSON output (and the
79
88
  # `guided_json` schema enforcement that went with it) because a single
@@ -124,14 +133,26 @@ RULES:
124
133
  - Each event MUST start with a `=== event K ===` header (zero-indexed, \
125
134
  matching the input index). NEVER skip an event — if an event has \
126
135
  nothing to extract, emit ONLY the header.
127
- - ENT lines have exactly 3 fields: literal `ENT`, type, name.
136
+ - ENT lines have 3 or 4 fields: literal `ENT`, type, name, [email].
128
137
  type ∈ {person, org, product, place, project, concept, topic, date, other}
129
- - FCT lines have exactly 6 fields: `FCT`, category, subject, \
130
- predicate, object, statement.
138
+ email (OPTIONAL, person only): when the event body or attributes
139
+ show an email address that unambiguously identifies the person,
140
+ append it as the 4th field. This pairs the name+email forms so a
141
+ later event seeing only the email resolves to the same entity.
142
+ Examples:
143
+ ENT|person|Alex Wong|alex@example.com
144
+ ENT|person|Acme Corp (org, no email)
145
+ ENT|person|Sam Patel (person, email not visible)
146
+ - FCT lines have EXACTLY 6 pipe-separated fields: `FCT`, category, subject, \
147
+ predicate, object, statement. COUNT THE PIPES: there must be 6 `|` segments. \
148
+ predicate and object are SEPARATE fields — NEVER merge them into the statement, \
149
+ and NEVER drop a field.
131
150
  category ∈ {decision, commitment, state, mention, observation, preference}
132
151
  subject MUST be an entity name declared in THIS event's ENT lines.
152
+ predicate is a short verb phrase (e.g. "agreed to", "owns", "works at").
133
153
  object MAY be an entity name OR a literal string OR `-` if absent.
134
- statement ≤ 140 characters.
154
+ statement ≤ 140 characters, a self-contained sentence.
155
+ WORKED EXAMPLE: `FCT|commitment|Timothy Bradley|agreed to|SAFE amendments|Timothy confirmed the SAFE amendments are set (14 May 2026)`
135
156
  - REL lines have exactly 4 fields: `REL`, from, to, rel_type.
136
157
  from and to MUST be entity names declared in THIS event's ENT lines.
137
158
  rel_type is a short verb / preposition phrase.
@@ -144,10 +165,65 @@ A whole file is one entity, not twenty.
144
165
  - Output ONLY the formatted records. No header, no footer, no prose."""
145
166
 
146
167
 
168
+ # Teacher-prompt fingerprint for trace logging. If the prompt changes,
169
+ # the hash changes — lets training-data exports filter by teacher
170
+ # version so we never mix outputs from a retired prompt.
171
+ SYSTEM_PROMPT_HASH = hashlib.sha256(BATCH_SYSTEM_PROMPT.encode()).hexdigest()[:16]
172
+
173
+
174
+ # --------------------------------------------------------------------
175
+ # Content cleaner — strip HTML/CSS so email + doc styling never reaches
176
+ # the LLM as text to extract. Without this, events containing Outlook /
177
+ # Gmail / docx-export markup get distilled into junk concept entities
178
+ # (`font-face`, `mso-font-alt`, `panose-1`, `src`) that pollute the
179
+ # graph. clean_content() is a no-op fast path on plain text — only
180
+ # events whose body contains `<` or `{` pay the regex cost.
181
+ # --------------------------------------------------------------------
182
+
183
# Whole <style>/<script> elements, including their bodies.
_CC_STYLE = re.compile(r"<(style|script)\b[^>]*>.*?</\1>", re.IGNORECASE | re.DOTALL)
# Standalone `selector { ... }` CSS rules that appear outside a <style> tag.
# NOTE(review): the selector character class includes letters and spaces, so
# ordinary prose directly preceding a `{...}` group on the same line is removed
# together with it. Left as-is here; confirm this is acceptable for event bodies.
_CC_CSSRULE = re.compile(r"[.#@]?[A-Za-z0-9_.:#> -]+\s*\{[^{}]*\}")
# MS-Office / font property runs up to the next `;` or end of line.
_CC_MSO = re.compile(r"\b(mso-[\w-]+|panose-1|font-family|font-face)\b[^;\n]*;?", re.IGNORECASE)
# Any remaining tag.
_CC_TAG = re.compile(r"<[^>]+>")
# Runs of horizontal whitespace (newlines are handled separately).
_CC_WS = re.compile(r"[ \t\r\f]+")
# Three or more consecutive newlines.
_CC_NL = re.compile(r"\n{3,}")
# Entity table, applied in order. `&amp;` MUST stay last: decoding it first
# turns escaped text such as `&amp;lt;` into `&lt;`, which a later entry would
# then decode a second time into `<` — i.e. literal text would come back as
# markup-looking content after the tag pass has already run.
_CC_ENT = (
    ("&nbsp;", " "), ("&lt;", "<"), ("&gt;", ">"),
    ("&quot;", '"'), ("&#39;", "'"), ("&apos;", "'"),
    ("&amp;", "&"),
)


def clean_content(text: str) -> str:
    """Strip HTML/CSS so email + doc styling doesn't distil into junk
    `concept` entities (font-face, mso-font-alt, etc.).

    Fast path: input that is falsy, or that contains neither `<` nor
    `{`, is returned unchanged (entities are NOT decoded on this path).

    On marked-up content the passes run in this order:

    1. remove `<style>` / `<script>` blocks,
    2. remove standalone CSS rules,
    3. remove all remaining tags,
    4. remove MS-Office / panose / font-face property runs,
    5. decode a small fixed set of HTML entities — after the tag pass,
       so a decoded `&lt;` is never mistaken for a tag; each entity is
       decoded exactly once (`&amp;` is handled last),
    6. collapse runs of spaces/tabs to one space and 3+ newlines to two,
    7. strip leading/trailing whitespace.

    Returns the cleaned text.
    """
    if not text or ("<" not in text and "{" not in text):
        return text
    t = _CC_STYLE.sub(" ", text)
    t = _CC_CSSRULE.sub(" ", t)
    t = _CC_TAG.sub(" ", t)
    t = _CC_MSO.sub(" ", t)
    for entity, replacement in _CC_ENT:
        t = t.replace(entity, replacement)
    t = _CC_WS.sub(" ", t)
    t = _CC_NL.sub("\n\n", t)
    return t.strip()
217
+
218
+
147
219
  def build_event_block(idx: int, event: dict[str, Any]) -> str:
148
- """Render one event as `[event K]\nheader\n---\ncontent` block."""
220
+ """Render one event as `[event K]\nheader\n---\ncontent` block.
221
+
222
+ Content is passed through `clean_content()` before truncation so
223
+ that the MAX_CONTENT_CHARS slice doesn't end up containing pure
224
+ HTML markup with no extractable signal."""
149
225
  src = event.get("source_kind", "unknown")
150
- content = (event.get("content") or "")[:MAX_CONTENT_CHARS]
226
+ content = clean_content(event.get("content") or "")[:MAX_CONTENT_CHARS]
151
227
  attrs = event.get("attributes") or {}
152
228
  when = attrs.get("emitted_at") or attrs.get("timestamp")
153
229
  author = attrs.get("author") or attrs.get("user_id")
@@ -200,11 +276,24 @@ def _parse_kv_records(text: str, expected_n: int) -> list[dict[str, Any]]:
200
276
  # maxsplit so statement / name fields can contain colons or
201
277
  # other reserved-looking content without breaking parsing.
202
278
  if line.startswith("ENT|"):
203
- parts = line.split("|", 2)
204
- if len(parts) == 3 and parts[2].strip():
205
- current["entities"].append(
206
- {"type": parts[1].strip().lower(), "name": parts[2].strip()}
207
- )
279
+ # ENT|type|name|email? — email is optional, person-only.
280
+ # Use maxsplit=3 so a literal `|` in name (which the prompt
281
+ # forbids but the model might still emit) doesn't get
282
+ # parsed as an email field.
283
+ parts = line.split("|", 3)
284
+ if len(parts) >= 3 and parts[2].strip():
285
+ etype = parts[1].strip().lower()
286
+ name = parts[2].strip()
287
+ ent: dict[str, Any] = {"type": etype, "name": name}
288
+ # Promote 4th-field email into aliases when present
289
+ # and it actually looks like an email. Non-email
290
+ # 4th fields (random text the model added) are dropped
291
+ # — better to silently strip junk than poison aliases.
292
+ if len(parts) == 4:
293
+ email = parts[3].strip()
294
+ if email and "@" in email and " " not in email:
295
+ ent["aliases"] = [email]
296
+ current["entities"].append(ent)
208
297
  elif line.startswith("FCT|"):
209
298
  parts = line.split("|", 5)
210
299
  if len(parts) == 6 and parts[5].strip():
@@ -232,6 +321,38 @@ def _parse_kv_records(text: str, expected_n: int) -> list[dict[str, Any]]:
232
321
  return results
233
322
 
234
323
 
324
def _split_event_blocks(text: str, expected_n: int) -> list[str]:
    """Cut raw LLM output into per-event slices at `=== event K ===` headers.

    The result always has `expected_n` entries, indexed by event number.
    Entry K holds the lines that followed header K up to the next header
    (or the end of the text), joined with newlines and right-stripped.
    Events with no header in the output stay as empty strings, and
    headers whose index falls outside `0..expected_n-1` are discarded.
    Lines before the first header are ignored.

    This is deliberately independent of the record parser: the parser
    silently drops malformed lines, whereas trace logging wants the
    model's output verbatim.
    """
    blocks: list[str] = [""] * expected_n
    active_idx: int | None = None
    buffered: list[str] = []

    def _store(idx: int | None, lines: list[str]) -> None:
        # Only in-range indices are kept; a repeated header overwrites.
        if idx is not None and 0 <= idx < expected_n:
            blocks[idx] = "\n".join(lines).rstrip()

    for line in text.splitlines():
        stripped = line.strip()
        header = EVENT_HEADER_RE.match(stripped) if stripped else None
        if header is None:
            # Body line: keep it verbatim, but only once a header was seen.
            if active_idx is not None:
                buffered.append(line)
            continue
        # New header: close out the previous event before switching.
        _store(active_idx, buffered)
        active_idx = int(header.group(1))
        buffered = []
    _store(active_idx, buffered)
    return blocks
354
+
355
+
235
356
  async def call_llm_batch(
236
357
  client: httpx.AsyncClient, events: list[dict[str, Any]]
237
358
  ) -> list[dict[str, Any]]:
@@ -279,7 +400,15 @@ async def call_llm_batch(
279
400
  text = data.get("message", {}).get("content", "")
280
401
  if not text:
281
402
  raise RuntimeError(f"llm returned no content: {json.dumps(data)[:300]}")
282
- return _parse_kv_records(text, n)
403
+ parsed = _parse_kv_records(text, n)
404
+ # Attach the per-event raw slice so downstream trace logging gets
405
+ # the model's verbatim output for THIS event without re-splitting
406
+ # the chunk-level text. Parser semantics are unaffected — the
407
+ # raw_slice key is ignored by upsert paths.
408
+ slices = _split_event_blocks(text, n)
409
+ for record, slice_text in zip(parsed, slices):
410
+ record["raw_slice"] = slice_text
411
+ return parsed
283
412
 
284
413
 
285
414
  # --------------------------------------------------------------------
@@ -288,6 +417,10 @@ async def call_llm_batch(
288
417
 
289
418
 
290
419
  def _content_id(*parts: str) -> str:
420
+ """Deterministic content-addressed id for facts / relationships.
421
+ Entity ids are minted via `entity_id()` from entity_id.py (see
422
+ `upsert_entities` below); this helper covers the non-entity
423
+ content-hash needs."""
291
424
  return hashlib.sha256("\x1f".join(parts).encode()).hexdigest()[:32]
292
425
 
293
426
 
@@ -299,12 +432,33 @@ def upsert_entities(
299
432
  disclosure_class: str,
300
433
  entities: list[dict],
301
434
  ) -> dict[str, str]:
302
- """Insert (or merge) entities; return a name→id map so facts and
303
- relationships can link to the inserted rows.
304
-
305
- ID is sha256(arena:entity_type:canonical_name)[:32] so the same
306
- entity in the same arena converges across events. Aliases and
307
- provenance_event_ids array-append on conflict; never replace."""
435
+ """Alias-aware insert (or merge) of entities; returns a name→id
436
+ map so facts and relationships can link to the inserted rows.
437
+
438
+ Two concerns layered together:
439
+
440
+ 1. **ID derivation** uses the shared `entity_id()` helper from
441
+ entity_id.py: `e_` + 24 hex of sha256("{arena}|{entity_type}|
442
+ {normalize_surface_form(name)}"). BYTE-IDENTICAL to extractor-
443
+ sync's id derivation, so the same person extracted by both
444
+ passes converges to the same row instead of fragmenting across
445
+ two id schemes. (RFC step 1.)
446
+
447
+ 2. **Resolution at upsert** — before INSERT, check for existing
448
+ rows in the same (arena, entity_type) whose canonical_name or
449
+ aliases overlap any incoming surface form. If matched, merge
450
+ into the existing row. Per-form `pg_advisory_xact_lock`
451
+ serialises concurrent writers (sync + async on the same event)
452
+ on the same surface form. (RFC steps 2 + 2a.)
453
+
454
+ MIRROR of extractor-sync/server.py:_upsert_entities — same
455
+ resolution algorithm. Kept as separate Python because the sync
456
+ extractor uses async psycopg and the async worker uses sync
457
+ psycopg; the SQL is identical.
458
+
459
+ Returns name→id where `name` is the LLM-emitted surface form
460
+ (canonical) so facts/relationships using the same surface form
461
+ in the same LLM batch resolve to the right id."""
308
462
  name_to_id: dict[str, str] = {}
309
463
  if not entities:
310
464
  return name_to_id
@@ -314,39 +468,84 @@ def upsert_entities(
314
468
  name = (e.get("name") or "").strip()
315
469
  if not name:
316
470
  continue
317
- # Drop junk names before they enter the graph. See
318
- # noise_filter.py — patterns are anchored to live-arena
319
- # noise (pronouns, hostnames, paths, agent-worktree
320
- # labels). Skipping here means name_to_id never carries
321
- # the bad name, so any fact/relationship the LLM tried to
322
- # attach to it gets dropped downstream (subj/obj resolve
323
- # to None ⇒ filtered out by upsert_facts /
324
- # upsert_relationships).
471
+ # Drop junk names before they enter the graph.
325
472
  if is_noise_entity_name(etype, name):
326
473
  continue
327
474
  aliases = [a for a in (e.get("aliases") or []) if a]
328
- eid = _content_id(arena, etype, name)
329
- name_to_id[name] = eid
475
+
476
+ # Sort (don't `list(set(...))`) so lock acquisition order
477
+ # is deterministic across processes — set-iteration order
478
+ # depends on Python's per-process hash randomisation, so
479
+ # sync and async extractors processing the same person
480
+ # could otherwise acquire the same locks in opposite
481
+ # orders and deadlock.
482
+ forms_original = sorted({name, *aliases})
483
+ forms_normalized = sorted({normalize_surface_form(f) for f in forms_original})
484
+
485
+ # 1. Advisory lock per surface form. Serialises concurrent
486
+ # writers (sync + async on the same event) on the same
487
+ # person without blocking anything else. See RFC §2a.
488
+ for f in forms_normalized:
489
+ cur.execute(
490
+ "SELECT pg_advisory_xact_lock(hashtext(%s))",
491
+ (f"{arena}|{etype}|{f}",),
492
+ )
493
+
494
+ # 2. Resolve via canonical-name (normalised, case-insensitive)
495
+ # or aliases overlap.
330
496
  cur.execute(
331
497
  """
332
- INSERT INTO entities (
333
- id, arena, entity_type, canonical_name, aliases,
334
- provenance_event_ids, participant_set, disclosure_class
335
- ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class)
336
- ON CONFLICT (id) DO UPDATE SET
337
- aliases = (
338
- SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
339
- ),
340
- provenance_event_ids = (
341
- SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
342
- ),
343
- last_seen = NOW()
498
+ SELECT id FROM entities
499
+ WHERE arena = %s
500
+ AND entity_type = %s
501
+ AND (
502
+ lower(canonical_name) = ANY(%s::text[])
503
+ OR aliases && %s::text[]
504
+ )
505
+ LIMIT 1
344
506
  """,
345
- (
346
- eid, arena, etype, name, aliases,
347
- [event_id], participant_set, disclosure_class,
348
- ),
507
+ (arena, etype, forms_normalized, forms_original),
349
508
  )
509
+ row = cur.fetchone()
510
+
511
+ if row is not None:
512
+ # 3a. Existing match — merge aliases + provenance.
513
+ # Canonical stays as-was (accrete-only per RFC §2b).
514
+ eid = row[0]
515
+ cur.execute(
516
+ """
517
+ UPDATE entities SET
518
+ aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
519
+ provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
520
+ last_seen = NOW()
521
+ WHERE id = %s
522
+ """,
523
+ (aliases, [event_id], eid),
524
+ )
525
+ else:
526
+ # 3b. No match — insert new.
527
+ eid = entity_id(arena, etype, name)
528
+ cur.execute(
529
+ """
530
+ INSERT INTO entities (
531
+ id, arena, entity_type, canonical_name, aliases,
532
+ provenance_event_ids, participant_set, disclosure_class
533
+ ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class)
534
+ ON CONFLICT (id) DO UPDATE SET
535
+ aliases = (
536
+ SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
537
+ ),
538
+ provenance_event_ids = (
539
+ SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
540
+ ),
541
+ last_seen = NOW()
542
+ """,
543
+ (
544
+ eid, arena, etype, name, aliases,
545
+ [event_id], participant_set, disclosure_class,
546
+ ),
547
+ )
548
+ name_to_id[name] = eid
350
549
  return name_to_id
351
550
 
352
551
 
@@ -486,6 +685,43 @@ def upsert_relationships(
486
685
  return inserted
487
686
 
488
687
 
688
+ # --------------------------------------------------------------------
689
+ # Distillation trace logging
690
+ # --------------------------------------------------------------------
691
+
692
+
693
def _insert_trace(
    conn: psycopg.Connection,
    *,
    event_id: str,
    user_prompt: str,
    raw_response: str,
    llm_chunk_ms: float | None,
) -> None:
    """Record one (user_prompt, raw_response) pair in distillation_traces.

    The row also stores the module-level LLM_MODEL and SYSTEM_PROMPT_HASH
    alongside `llm_chunk_ms`. Nothing is written when `raw_response` is
    empty or whitespace-only — a header with no body carries no training
    signal.

    This function does not catch database errors itself; it is meant to
    be called inside a try/except so a failed trace insert cannot break
    the upsert path.
    """
    if not raw_response.strip():
        return
    sql = """
        INSERT INTO distillation_traces (
            event_id, user_prompt, raw_response,
            llm_model, system_prompt_hash, llm_chunk_ms
        ) VALUES (%s, %s, %s, %s, %s, %s)
        """
    params = (
        event_id, user_prompt, raw_response,
        LLM_MODEL, SYSTEM_PROMPT_HASH, llm_chunk_ms,
    )
    with conn.cursor() as cur:
        cur.execute(sql, params)
723
+
724
+
489
725
  # --------------------------------------------------------------------
490
726
  # Queue mechanics
491
727
  # --------------------------------------------------------------------
@@ -570,6 +806,29 @@ def claim_next_batch(conn: psycopg.Connection) -> list[dict[str, Any]]:
570
806
  """,
571
807
  (list(SKIP_ATTRIBUTE_SOURCES),),
572
808
  )
809
+ # Pre-filter: content-guardrail sensitive events (interpersonal
810
+ # gossip about a colleague). Never distil gossip into the entity
811
+ # graph — the subject has no standing there. attributes is jsonb;
812
+ # cast defensively in case a producer wrote json.
813
+ if SKIP_SENSITIVE_CONTENT:
814
+ cur.execute(
815
+ """
816
+ UPDATE distillation_queue dq SET
817
+ status = 'done',
818
+ completed_at = NOW(),
819
+ last_error = 'filtered: content-guardrail sensitive'
820
+ FROM events e
821
+ WHERE dq.event_id = e.id
822
+ AND dq.status = 'pending'
823
+ AND (
824
+ (e.attributes::jsonb)->>'sensitivity_class' = 'interpersonal'
825
+ OR (
826
+ jsonb_typeof((e.attributes::jsonb)->'sensitive_about') = 'array'
827
+ AND jsonb_array_length((e.attributes::jsonb)->'sensitive_about') > 0
828
+ )
829
+ )
830
+ """
831
+ )
573
832
  # Pre-filter: events older than the window.
574
833
  cur.execute(
575
834
  """
@@ -704,14 +963,24 @@ async def process_batch(
704
963
  for item in items:
705
964
  events_by_qid[item["id"]] = fetch_event(conn, item["event_id"])
706
965
 
707
- # Drop items whose event is missing (mark done up-front, no LLM call).
966
+ # Drop items whose event is missing (mark done up-front, no LLM call),
967
+ # and — content guardrail — any sensitive event that slipped past the
968
+ # claim-time pre-filter (defense in depth; the pure predicate is the
969
+ # testable contract for the rule). Never distil interpersonal gossip.
708
970
  callable_items: list[dict[str, Any]] = []
709
971
  for item in items:
710
- if events_by_qid[item["id"]] is None:
972
+ ev = events_by_qid[item["id"]]
973
+ if ev is None:
711
974
  log.warning(
712
975
  f"event {item['event_id']} missing — marking queue {item['id']} done"
713
976
  )
714
977
  mark_done(conn, item["id"])
978
+ elif SKIP_SENSITIVE_CONTENT and is_sensitive_event(ev):
979
+ log.info(
980
+ f"content-guardrail: filtering sensitive event {item['event_id']} "
981
+ f"— not distilling gossip into the graph"
982
+ )
983
+ mark_done(conn, item["id"])
715
984
  else:
716
985
  callable_items.append(item)
717
986
 
@@ -740,7 +1009,7 @@ async def process_batch(
740
1009
 
741
1010
  # Flatten chunk_outcomes back to per-item results, paired with items.
742
1011
  for (chunk_items, _chunk_events), (per_item, llm_ms) in zip(chunks, chunk_outcomes):
743
- for item, result in zip(chunk_items, per_item):
1012
+ for local_idx, (item, result) in enumerate(zip(chunk_items, per_item)):
744
1013
  queue_id = item["id"]
745
1014
  event_id = item["event_id"]
746
1015
  attempts = item["attempts"]
@@ -781,6 +1050,24 @@ async def process_batch(
781
1050
  f"relationships={n_rels}"
782
1051
  + (f" llm_ms={llm_ms:.0f}/chunk" if not stub_mode else "")
783
1052
  )
1053
+ # Trace logging — best-effort, never breaks the worker.
1054
+ # Captures (input, output) so a student model can be
1055
+ # trained on the teacher's distribution. Skipped in
1056
+ # stub mode (no real LLM output to record).
1057
+ if DISTILL_TRACE_ENABLED and not stub_mode:
1058
+ try:
1059
+ _insert_trace(
1060
+ conn,
1061
+ event_id=event_id,
1062
+ user_prompt=build_event_block(local_idx, event),
1063
+ raw_response=result.get("raw_slice", ""),
1064
+ llm_chunk_ms=llm_ms,
1065
+ )
1066
+ except Exception as trace_exc:
1067
+ log.warning(
1068
+ f"trace insert failed queue_id={queue_id} "
1069
+ f"event_id={event_id}: {trace_exc}"
1070
+ )
784
1071
  except Exception as exc:
785
1072
  err = f"{type(exc).__name__}: {exc}"
786
1073
  log.warning(
@@ -5,7 +5,7 @@ WORKDIR /app
5
5
  COPY requirements.txt .
6
6
  RUN pip install --no-cache-dir -r requirements.txt
7
7
 
8
- COPY server.py .
8
+ COPY entity_id.py server.py .
9
9
 
10
10
  EXPOSE 8101
11
11
  CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8101", "--workers", "2"]
@@ -0,0 +1,57 @@
1
+ """Canonical entity-ID scheme — SHARED, byte-identical across extractor-sync and
2
+ extractor-async.
3
+
4
+ The two extractors run as separate Docker services with PER-SERVICE build contexts
5
+ (docker-compose `context: ./extractor-sync` / `./extractor-async`), so a single
6
+ importable module can't be COPY'd into both. This file is therefore DUPLICATED in
7
+ each service dir, and tests/test_entity_id_parity.py fails if the copies ever drift.
8
+
9
+ Why this exists: both passes must key an entity (person / org / …) by the SAME id so
10
+ the same entity converges across the deterministic (sync) and LLM (async) passes.
11
+ Before this, the two services keyed entities DIFFERENTLY — sync as
12
+ `e_` + sha256("{arena}|{type}|{name.lower().strip()}")[:24]; async as
13
+ sha256("\\x1f".join(parts))[:32] (no lowercasing, no prefix) — so even identical
14
+ names produced different ids and never merged. We unify on the sync scheme: sync's
15
+ existing rows are unaffected, and the async pass converges onto them.
16
+
17
+ Step 1 of RFC-entity-reconciliation.md (the foundation for alias-aware resolution).
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import hashlib
23
+ import re
24
+ import unicodedata
25
+
26
_WHITESPACE = re.compile(r"\s+")


def normalize_surface_form(value: str) -> str:
    """Normalize a surface form (person name, email, org name, ...) for
    identity keying.

    Steps, applied in this order:

    1. A falsy value (None, "") is treated as "" — some producers hand
       over a missing field as None.
    2. Unicode NFKC normalisation, which folds compatibility variants
       that render alike: precomposed "Caf\\u00e9" vs decomposed
       "Cafe\\u0301", fullwidth Latin (U+FF23 ... ) vs ASCII "CARLY",
       the "fi" ligature U+FB01 vs "fi".
    3. Outer whitespace is trimmed and every internal whitespace run
       collapses to one space, so a double-spaced "Carly  Snider"
       equals "Carly Snider".
    4. Lowercasing, so producer-specific casing of names and emails
       does not split one entity into several keys.

    The exact steps and their order are part of the entity-id contract;
    changing them changes every derived id.
    """
    composed = unicodedata.normalize("NFKC", value or "")
    trimmed = composed.strip()
    single_spaced = _WHITESPACE.sub(" ", trimmed)
    return single_spaced.lower()
48
+
49
+
50
def entity_id(arena: str, entity_type: str, canonical_name: str) -> str:
    """Return the deterministic id for an entity.

    The id is `e_` followed by the first 24 hex characters of the
    SHA-256 of "{arena}|{entity_type}|{normalized name}", where the
    name is passed through `normalize_surface_form`. Identical
    (arena, entity_type, normalized name) triples therefore always map
    to the same id, which is what lets re-extraction and cross-pass
    extraction converge on one row.
    """
    normalized_name = normalize_surface_form(canonical_name)
    key = f"{arena}|{entity_type}|{normalized_name}"
    digest = hashlib.sha256(key.encode()).hexdigest()
    return "e_" + digest[:24]
+ return "e_" + hashlib.sha256(key.encode()).hexdigest()[:24]