@pentatonic-ai/ai-agent-sdk 0.10.0 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -40,7 +40,9 @@ import psycopg
40
40
  import psycopg.rows
41
41
 
42
42
  from confidence import corroborated_confidence
43
+ from entity_id import entity_id, normalize_surface_form
43
44
  from noise_filter import is_noise_entity_name
45
+ from sensitive_filter import SKIP_SENSITIVE_CONTENT, is_sensitive_event
44
46
 
45
47
  logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
46
48
  log = logging.getLogger("extractor-async")
@@ -74,6 +76,13 @@ LLM_MAX_TOKENS_PER_EVENT = int(os.environ.get("LLM_MAX_TOKENS_PER_EVENT", "300")
74
76
 
75
77
  WORKER_ID = f"{socket.gethostname()}:{os.getpid()}"
76
78
 
79
+ # Trace logging — captures raw teacher I/O per distilled event so we can
80
+ # train a student model (BART/FLAN-T5) on the teacher's distribution.
81
+ # Default off; opt-in per environment. See migration 003.
82
+ DISTILL_TRACE_ENABLED = os.environ.get(
83
+ "DISTILL_TRACE_ENABLED", "false"
84
+ ).strip().lower() in ("true", "1", "yes", "on")
85
+
77
86
 
78
87
  # KV-text output format constants. We dropped JSON output (and the
79
88
  # `guided_json` schema enforcement that went with it) because a single
@@ -124,8 +133,16 @@ RULES:
124
133
  - Each event MUST start with a `=== event K ===` header (zero-indexed, \
125
134
  matching the input index). NEVER skip an event — if an event has \
126
135
  nothing to extract, emit ONLY the header.
127
- - ENT lines have exactly 3 fields: literal `ENT`, type, name.
136
+ - ENT lines have 3 or 4 fields: literal `ENT`, type, name, [email].
128
137
  type ∈ {person, org, product, place, project, concept, topic, date, other}
138
+ email (OPTIONAL, person only): when the event body or attributes
139
+ show an email address that unambiguously identifies the person,
140
+ append it as the 4th field. This pairs the name+email forms so a
141
+ later event seeing only the email resolves to the same entity.
142
+ Examples:
143
+ ENT|person|Alex Wong|alex@example.com
144
+ ENT|org|Acme Corp (org, no email)
145
+ ENT|person|Sam Patel (person, email not visible)
129
146
  - FCT lines have exactly 6 fields: `FCT`, category, subject, \
130
147
  predicate, object, statement.
131
148
  category ∈ {decision, commitment, state, mention, observation, preference}
@@ -144,6 +161,12 @@ A whole file is one entity, not twenty.
144
161
  - Output ONLY the formatted records. No header, no footer, no prose."""
145
162
 
146
163
 
164
+ # Teacher-prompt fingerprint for trace logging. If the prompt changes,
165
+ # the hash changes — lets training-data exports filter by teacher
166
+ # version so we never mix outputs from a retired prompt.
167
+ SYSTEM_PROMPT_HASH = hashlib.sha256(BATCH_SYSTEM_PROMPT.encode()).hexdigest()[:16]
168
+
169
+
147
170
  def build_event_block(idx: int, event: dict[str, Any]) -> str:
148
171
  """Render one event as `[event K]\nheader\n---\ncontent` block."""
149
172
  src = event.get("source_kind", "unknown")
@@ -200,11 +223,24 @@ def _parse_kv_records(text: str, expected_n: int) -> list[dict[str, Any]]:
200
223
  # maxsplit so statement / name fields can contain colons or
201
224
  # other reserved-looking content without breaking parsing.
202
225
  if line.startswith("ENT|"):
203
- parts = line.split("|", 2)
204
- if len(parts) == 3 and parts[2].strip():
205
- current["entities"].append(
206
- {"type": parts[1].strip().lower(), "name": parts[2].strip()}
207
- )
226
+ # ENT|type|name|email? — email is optional, person-only.
227
+ # maxsplit=3 caps the split at 4 fields: a literal `|` in name
228
+ # (which the prompt forbids but the model might still emit)
229
+ # truncates the name; the remainder lands in field 4 and is dropped unless it looks like an email.
230
+ parts = line.split("|", 3)
231
+ if len(parts) >= 3 and parts[2].strip():
232
+ etype = parts[1].strip().lower()
233
+ name = parts[2].strip()
234
+ ent: dict[str, Any] = {"type": etype, "name": name}
235
+ # Promote 4th-field email into aliases when present
236
+ # and it actually looks like an email. Non-email
237
+ # 4th fields (random text the model added) are dropped
238
+ # — better to silently strip junk than poison aliases.
239
+ if len(parts) == 4:
240
+ email = parts[3].strip()
241
+ if email and "@" in email and " " not in email:
242
+ ent["aliases"] = [email]
243
+ current["entities"].append(ent)
208
244
  elif line.startswith("FCT|"):
209
245
  parts = line.split("|", 5)
210
246
  if len(parts) == 6 and parts[5].strip():
@@ -232,6 +268,38 @@ def _parse_kv_records(text: str, expected_n: int) -> list[dict[str, Any]]:
232
268
  return results
233
269
 
234
270
 
271
+ def _split_event_blocks(text: str, expected_n: int) -> list[str]:
272
+ """Slice raw LLM output by `=== event K ===` headers.
273
+
274
+ Returns expected_n slices in event-index order. Each slice is the
275
+ verbatim text between a header and the next header (or end-of-text),
276
+ stripped of trailing whitespace. Events the model omitted come back
277
+ as empty strings — same shape contract as _parse_kv_records.
278
+
279
+ Separate from the parser so trace logging stays decoupled from
280
+ extraction semantics: the parser drops malformed lines silently
281
+ (correctness), but the trace wants the raw output verbatim
282
+ (training fidelity)."""
283
+ slices: list[str] = [""] * expected_n
284
+ current_idx: int | None = None
285
+ current_lines: list[str] = []
286
+
287
+ def flush() -> None:
288
+ if current_idx is not None and 0 <= current_idx < expected_n:
289
+ slices[current_idx] = "\n".join(current_lines).rstrip()
290
+
291
+ for raw in text.splitlines():
292
+ m = EVENT_HEADER_RE.match(raw.strip()) if raw.strip() else None
293
+ if m:
294
+ flush()
295
+ current_idx = int(m.group(1))
296
+ current_lines = []
297
+ elif current_idx is not None:
298
+ current_lines.append(raw)
299
+ flush()
300
+ return slices
301
+
302
+
235
303
  async def call_llm_batch(
236
304
  client: httpx.AsyncClient, events: list[dict[str, Any]]
237
305
  ) -> list[dict[str, Any]]:
@@ -279,7 +347,15 @@ async def call_llm_batch(
279
347
  text = data.get("message", {}).get("content", "")
280
348
  if not text:
281
349
  raise RuntimeError(f"llm returned no content: {json.dumps(data)[:300]}")
282
- return _parse_kv_records(text, n)
350
+ parsed = _parse_kv_records(text, n)
351
+ # Attach the per-event raw slice so downstream trace logging gets
352
+ # the model's verbatim output for THIS event without re-splitting
353
+ # the chunk-level text. Parser semantics are unaffected — the
354
+ # raw_slice key is ignored by upsert paths.
355
+ slices = _split_event_blocks(text, n)
356
+ for record, slice_text in zip(parsed, slices):
357
+ record["raw_slice"] = slice_text
358
+ return parsed
283
359
 
284
360
 
285
361
  # --------------------------------------------------------------------
@@ -288,6 +364,10 @@ async def call_llm_batch(
288
364
 
289
365
 
290
366
  def _content_id(*parts: str) -> str:
367
+ """Deterministic content-addressed id for facts / relationships.
368
+ Entity ids are minted via `entity_id()` from entity_id.py (see
369
+ `upsert_entities` below); this helper covers the non-entity
370
+ content-hash needs."""
291
371
  return hashlib.sha256("\x1f".join(parts).encode()).hexdigest()[:32]
292
372
 
293
373
 
@@ -299,12 +379,33 @@ def upsert_entities(
299
379
  disclosure_class: str,
300
380
  entities: list[dict],
301
381
  ) -> dict[str, str]:
302
- """Insert (or merge) entities; return a name→id map so facts and
303
- relationships can link to the inserted rows.
304
-
305
- ID is sha256(arena:entity_type:canonical_name)[:32] so the same
306
- entity in the same arena converges across events. Aliases and
307
- provenance_event_ids array-append on conflict; never replace."""
382
+ """Alias-aware insert (or merge) of entities; returns a name→id
383
+ map so facts and relationships can link to the inserted rows.
384
+
385
+ Two concerns layered together:
386
+
387
+ 1. **ID derivation** uses the shared `entity_id()` helper from
388
+ entity_id.py: `e_` + 24 hex of sha256("{arena}|{entity_type}|
389
+ {normalize_surface_form(name)}"). BYTE-IDENTICAL to extractor-
390
+ sync's id derivation, so the same person extracted by both
391
+ passes converges to the same row instead of fragmenting across
392
+ two id schemes. (RFC step 1.)
393
+
394
+ 2. **Resolution at upsert** — before INSERT, check for existing
395
+ rows in the same (arena, entity_type) whose canonical_name or
396
+ aliases overlap any incoming surface form. If matched, merge
397
+ into the existing row. Per-form `pg_advisory_xact_lock`
398
+ serialises concurrent writers (sync + async on the same event)
399
+ on the same surface form. (RFC steps 2 + 2a.)
400
+
401
+ MIRROR of extractor-sync/server.py:_upsert_entities — same
402
+ resolution algorithm. Kept as separate Python because the sync
403
+ extractor uses async psycopg and the async worker uses sync
404
+ psycopg; the SQL is identical.
405
+
406
+ Returns name→id where `name` is the LLM-emitted surface form
407
+ (canonical) so facts/relationships using the same surface form
408
+ in the same LLM batch resolve to the right id."""
308
409
  name_to_id: dict[str, str] = {}
309
410
  if not entities:
310
411
  return name_to_id
@@ -314,39 +415,84 @@ def upsert_entities(
314
415
  name = (e.get("name") or "").strip()
315
416
  if not name:
316
417
  continue
317
- # Drop junk names before they enter the graph. See
318
- # noise_filter.py — patterns are anchored to live-arena
319
- # noise (pronouns, hostnames, paths, agent-worktree
320
- # labels). Skipping here means name_to_id never carries
321
- # the bad name, so any fact/relationship the LLM tried to
322
- # attach to it gets dropped downstream (subj/obj resolve
323
- # to None ⇒ filtered out by upsert_facts /
324
- # upsert_relationships).
418
+ # Drop junk names before they enter the graph.
325
419
  if is_noise_entity_name(etype, name):
326
420
  continue
327
421
  aliases = [a for a in (e.get("aliases") or []) if a]
328
- eid = _content_id(arena, etype, name)
329
- name_to_id[name] = eid
422
+
423
+ # Sort (don't `list(set(...))`) so lock acquisition order
424
+ # is deterministic across processes — set-iteration order
425
+ # depends on Python's per-process hash randomisation, so
426
+ # sync and async extractors processing the same person
427
+ # could otherwise acquire the same locks in opposite
428
+ # orders and deadlock.
429
+ forms_original = sorted({name, *aliases})
430
+ forms_normalized = sorted({normalize_surface_form(f) for f in forms_original})
431
+
432
+ # 1. Advisory lock per surface form. Serialises concurrent
433
+ # writers (sync + async on the same event) on the same
434
+ # person without blocking anything else. See RFC §2a.
435
+ for f in forms_normalized:
436
+ cur.execute(
437
+ "SELECT pg_advisory_xact_lock(hashtext(%s))",
438
+ (f"{arena}|{etype}|{f}",),
439
+ )
440
+
441
+ # 2. Resolve via canonical-name (lower()-ed column vs. normalised
442
+ # incoming forms) or exact overlap of aliases with the raw forms.
330
443
  cur.execute(
331
444
  """
332
- INSERT INTO entities (
333
- id, arena, entity_type, canonical_name, aliases,
334
- provenance_event_ids, participant_set, disclosure_class
335
- ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class)
336
- ON CONFLICT (id) DO UPDATE SET
337
- aliases = (
338
- SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
339
- ),
340
- provenance_event_ids = (
341
- SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
342
- ),
343
- last_seen = NOW()
445
+ SELECT id FROM entities
446
+ WHERE arena = %s
447
+ AND entity_type = %s
448
+ AND (
449
+ lower(canonical_name) = ANY(%s::text[])
450
+ OR aliases && %s::text[]
451
+ )
452
+ LIMIT 1
344
453
  """,
345
- (
346
- eid, arena, etype, name, aliases,
347
- [event_id], participant_set, disclosure_class,
348
- ),
454
+ (arena, etype, forms_normalized, forms_original),
349
455
  )
456
+ row = cur.fetchone()
457
+
458
+ if row is not None:
459
+ # 3a. Existing match — merge aliases + provenance.
460
+ # Canonical stays as-was (accrete-only per RFC §2b).
461
+ eid = row[0]
462
+ cur.execute(
463
+ """
464
+ UPDATE entities SET
465
+ aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
466
+ provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
467
+ last_seen = NOW()
468
+ WHERE id = %s
469
+ """,
470
+ (aliases, [event_id], eid),
471
+ )
472
+ else:
473
+ # 3b. No match — insert new.
474
+ eid = entity_id(arena, etype, name)
475
+ cur.execute(
476
+ """
477
+ INSERT INTO entities (
478
+ id, arena, entity_type, canonical_name, aliases,
479
+ provenance_event_ids, participant_set, disclosure_class
480
+ ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class)
481
+ ON CONFLICT (id) DO UPDATE SET
482
+ aliases = (
483
+ SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
484
+ ),
485
+ provenance_event_ids = (
486
+ SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
487
+ ),
488
+ last_seen = NOW()
489
+ """,
490
+ (
491
+ eid, arena, etype, name, aliases,
492
+ [event_id], participant_set, disclosure_class,
493
+ ),
494
+ )
495
+ name_to_id[name] = eid
350
496
  return name_to_id
351
497
 
352
498
 
@@ -486,6 +632,43 @@ def upsert_relationships(
486
632
  return inserted
487
633
 
488
634
 
635
+ # --------------------------------------------------------------------
636
+ # Distillation trace logging
637
+ # --------------------------------------------------------------------
638
+
639
+
640
+ def _insert_trace(
641
+ conn: psycopg.Connection,
642
+ *,
643
+ event_id: str,
644
+ user_prompt: str,
645
+ raw_response: str,
646
+ llm_chunk_ms: float | None,
647
+ ) -> None:
648
+ """Append a (user_prompt, raw_response) pair to distillation_traces.
649
+
650
+ Audit-only — not on the hot path of distillation semantics. The
651
+ caller wraps this in a try/except and never lets a trace-insert
652
+ failure poison the upsert path. Skip rows with empty raw_response
653
+ (no signal to train on); the model occasionally emits a header
654
+ with no body."""
655
+ if not raw_response.strip():
656
+ return
657
+ with conn.cursor() as cur:
658
+ cur.execute(
659
+ """
660
+ INSERT INTO distillation_traces (
661
+ event_id, user_prompt, raw_response,
662
+ llm_model, system_prompt_hash, llm_chunk_ms
663
+ ) VALUES (%s, %s, %s, %s, %s, %s)
664
+ """,
665
+ (
666
+ event_id, user_prompt, raw_response,
667
+ LLM_MODEL, SYSTEM_PROMPT_HASH, llm_chunk_ms,
668
+ ),
669
+ )
670
+
671
+
489
672
  # --------------------------------------------------------------------
490
673
  # Queue mechanics
491
674
  # --------------------------------------------------------------------
@@ -496,6 +679,16 @@ def upsert_relationships(
496
679
  # env so we can revisit per-source value over time.
497
680
  #
498
681
  # Skip rules:
682
+ # - source_kind == 'agent'. These events are an AGENT'S OWN output — a
683
+ # coding-assistant transcript, an orchestrator/triage run, a briefing the
684
+ # agent wrote. Distilling them turns the assistant's chatter into "facts"
685
+ # ("the user wants a summary", "PR #228 merged", "high priority"), which
686
+ # then pollute every per-user/tenant arena the agent later READS — a
687
+ # feedback loop (live evidence 2026-06-02: a Claude-Code SDK that writes
688
+ # its transcript back into memory surfaced its own messages as graph
689
+ # facts). Agent output is not knowledge ABOUT the world; never distil it.
690
+ # This is by source_kind (not an enumerated source list) so any new agent
691
+ # producer is covered automatically. Tunable off via env for back-compat.
499
692
  # - source attribute matches a known code-only ingest (pip-code-ingest
500
693
  # and friends). Code chunks generate noisy entities — class names,
501
694
  # file paths, variables — that pollute the graph and don't surface
@@ -503,6 +696,9 @@ def upsert_relationships(
503
696
  # - received_at older than DISTILL_MAX_AGE_DAYS. Stale events have low
504
697
  # facet value and burn LLM budget. Forward-only + 90-day window is
505
698
  # the right default; old events stay vector-searchable.
699
+ SKIP_AGENT_SOURCE_KIND = os.environ.get(
700
+ "DISTILL_SKIP_AGENT_SOURCE_KIND", "true"
701
+ ).strip().lower() not in ("false", "0", "no", "off")
506
702
  SKIP_ATTRIBUTE_SOURCES = set(
507
703
  s.strip()
508
704
  for s in os.environ.get(
@@ -518,13 +714,30 @@ def claim_next_batch(conn: psycopg.Connection) -> list[dict[str, Any]]:
518
714
  concurrent workers never race.
519
715
 
520
716
  Filters at claim time:
717
+ - Events whose source_kind is 'agent' (the agent's own output) when
718
+ SKIP_AGENT_SOURCE_KIND is set — never distil assistant chatter.
521
719
  - Events from skip-sources (attributes.source in SKIP_ATTRIBUTE_SOURCES)
522
720
  are marked done with `filtered:<source>` rather than claimed.
523
721
  - Events older than DISTILL_MAX_AGE_DAYS are similarly skipped.
524
- Both pre-passes run BEFORE the claim so the worker never wastes an
722
+ All pre-passes run BEFORE the claim so the worker never wastes an
525
723
  LLM call on filtered events. They're cheap UPDATE statements scoped
526
724
  to the current pending set."""
527
725
  with conn.cursor() as cur:
726
+ # Pre-filter: agent's-own-output events (by source_kind). Covers any
727
+ # agent producer without enumerating its source label.
728
+ if SKIP_AGENT_SOURCE_KIND:
729
+ cur.execute(
730
+ """
731
+ UPDATE distillation_queue dq SET
732
+ status = 'done',
733
+ completed_at = NOW(),
734
+ last_error = 'filtered: source_kind=agent'
735
+ FROM events e
736
+ WHERE dq.event_id = e.id
737
+ AND dq.status = 'pending'
738
+ AND e.source_kind = 'agent'
739
+ """
740
+ )
528
741
  # Pre-filter: skip-source events.
529
742
  if SKIP_ATTRIBUTE_SOURCES:
530
743
  cur.execute(
@@ -540,6 +753,29 @@ def claim_next_batch(conn: psycopg.Connection) -> list[dict[str, Any]]:
540
753
  """,
541
754
  (list(SKIP_ATTRIBUTE_SOURCES),),
542
755
  )
756
+ # Pre-filter: content-guardrail sensitive events (interpersonal
757
+ # gossip about a colleague). Never distil gossip into the entity
758
+ # graph — the subject has no standing there. attributes is jsonb;
759
+ # cast defensively in case a producer wrote json.
760
+ if SKIP_SENSITIVE_CONTENT:
761
+ cur.execute(
762
+ """
763
+ UPDATE distillation_queue dq SET
764
+ status = 'done',
765
+ completed_at = NOW(),
766
+ last_error = 'filtered: content-guardrail sensitive'
767
+ FROM events e
768
+ WHERE dq.event_id = e.id
769
+ AND dq.status = 'pending'
770
+ AND (
771
+ (e.attributes::jsonb)->>'sensitivity_class' = 'interpersonal'
772
+ OR (
773
+ jsonb_typeof((e.attributes::jsonb)->'sensitive_about') = 'array'
774
+ AND jsonb_array_length((e.attributes::jsonb)->'sensitive_about') > 0
775
+ )
776
+ )
777
+ """
778
+ )
543
779
  # Pre-filter: events older than the window.
544
780
  cur.execute(
545
781
  """
@@ -674,14 +910,24 @@ async def process_batch(
674
910
  for item in items:
675
911
  events_by_qid[item["id"]] = fetch_event(conn, item["event_id"])
676
912
 
677
- # Drop items whose event is missing (mark done up-front, no LLM call).
913
+ # Drop items whose event is missing (mark done up-front, no LLM call),
914
+ # and — content guardrail — any sensitive event that slipped past the
915
+ # claim-time pre-filter (defense in depth; the pure predicate is the
916
+ # testable contract for the rule). Never distil interpersonal gossip.
678
917
  callable_items: list[dict[str, Any]] = []
679
918
  for item in items:
680
- if events_by_qid[item["id"]] is None:
919
+ ev = events_by_qid[item["id"]]
920
+ if ev is None:
681
921
  log.warning(
682
922
  f"event {item['event_id']} missing — marking queue {item['id']} done"
683
923
  )
684
924
  mark_done(conn, item["id"])
925
+ elif SKIP_SENSITIVE_CONTENT and is_sensitive_event(ev):
926
+ log.info(
927
+ f"content-guardrail: filtering sensitive event {item['event_id']} "
928
+ f"— not distilling gossip into the graph"
929
+ )
930
+ mark_done(conn, item["id"])
685
931
  else:
686
932
  callable_items.append(item)
687
933
 
@@ -710,7 +956,7 @@ async def process_batch(
710
956
 
711
957
  # Flatten chunk_outcomes back to per-item results, paired with items.
712
958
  for (chunk_items, _chunk_events), (per_item, llm_ms) in zip(chunks, chunk_outcomes):
713
- for item, result in zip(chunk_items, per_item):
959
+ for local_idx, (item, result) in enumerate(zip(chunk_items, per_item)):
714
960
  queue_id = item["id"]
715
961
  event_id = item["event_id"]
716
962
  attempts = item["attempts"]
@@ -751,6 +997,24 @@ async def process_batch(
751
997
  f"relationships={n_rels}"
752
998
  + (f" llm_ms={llm_ms:.0f}/chunk" if not stub_mode else "")
753
999
  )
1000
+ # Trace logging — best-effort, never breaks the worker.
1001
+ # Captures (input, output) so a student model can be
1002
+ # trained on the teacher's distribution. Skipped in
1003
+ # stub mode (no real LLM output to record).
1004
+ if DISTILL_TRACE_ENABLED and not stub_mode:
1005
+ try:
1006
+ _insert_trace(
1007
+ conn,
1008
+ event_id=event_id,
1009
+ user_prompt=build_event_block(local_idx, event),
1010
+ raw_response=result.get("raw_slice", ""),
1011
+ llm_chunk_ms=llm_ms,
1012
+ )
1013
+ except Exception as trace_exc:
1014
+ log.warning(
1015
+ f"trace insert failed queue_id={queue_id} "
1016
+ f"event_id={event_id}: {trace_exc}"
1017
+ )
754
1018
  except Exception as exc:
755
1019
  err = f"{type(exc).__name__}: {exc}"
756
1020
  log.warning(
@@ -5,7 +5,7 @@ WORKDIR /app
5
5
  COPY requirements.txt .
6
6
  RUN pip install --no-cache-dir -r requirements.txt
7
7
 
8
- COPY server.py .
8
+ COPY entity_id.py server.py .
9
9
 
10
10
  EXPOSE 8101
11
11
  CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8101", "--workers", "2"]
@@ -0,0 +1,57 @@
1
+ """Canonical entity-ID scheme — SHARED, byte-identical across extractor-sync and
2
+ extractor-async.
3
+
4
+ The two extractors run as separate Docker services with PER-SERVICE build contexts
5
+ (docker-compose `context: ./extractor-sync` / `./extractor-async`), so a single
6
+ importable module can't be COPY'd into both. This file is therefore DUPLICATED in
7
+ each service dir, and tests/test_entity_id_parity.py fails if the copies ever drift.
8
+
9
+ Why this exists: both passes must key an entity (person / org / …) by the SAME id so
10
+ the same entity converges across the deterministic (sync) and LLM (async) passes.
11
+ Before this, the two services keyed entities DIFFERENTLY — sync as
12
+ `e_` + sha256("{arena}|{type}|{name.lower().strip()}")[:24]; async as
13
+ sha256("\\x1f".join(parts))[:32] (no lowercasing, no prefix) — so even identical
14
+ names produced different ids and never merged. We unify on the sync scheme: sync's
15
+ existing rows are unaffected, and the async pass converges onto them.
16
+
17
+ Step 1 of RFC-entity-reconciliation.md (the foundation for alias-aware resolution).
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import hashlib
23
+ import re
24
+ import unicodedata
25
+
26
+ _WHITESPACE = re.compile(r"\s+")
27
+
28
+
29
+ def normalize_surface_form(value: str) -> str:
30
+ """Normalize a surface form (person name, email, org name, …) for identity
31
+ keying. Steps, in order:
32
+
33
+ 1. None → "" (defensive; some producers can hand a missing field as None).
34
+ 2. Unicode NFKC (compatibility decomposition + canonical composition) —
35
+ collapses width / ligature / decomposed-accent variants. Without this
36
+ "Café" (precomposed U+00E9) and "Cafe\\u0301" (decomposed e+combining
37
+ acute) — which render identically — would key as different entities.
38
+ Same for fullwidth Latin ("ＣＡＲＬＹ" ↔ "CARLY") and ligatures
39
+ ("ﬁ" U+FB01 ↔ "fi"). Real-world relevant for vCard imports, Mac
40
+ pasteboard, IME inputs, internationalised name sources.
41
+ 3. Trim outer whitespace, then collapse internal `\\s+` to a single space —
42
+ "Carly  Snider" (slack-autocomplete double-space) ↔ "Carly Snider".
43
+ 4. Lowercase. Email casing is case-insensitive per spec; person-name
44
+ casing varies by producer (gmail header casing vs slack profile).
45
+ """
46
+ s = unicodedata.normalize("NFKC", value or "")
47
+ return _WHITESPACE.sub(" ", s.strip()).lower()
48
+
49
+
50
+ def entity_id(arena: str, entity_type: str, canonical_name: str) -> str:
51
+ """Deterministic entity id. The same (arena, entity_type, normalized
52
+ canonical_name) yields the same id across BOTH extractor passes, so re-extraction
53
+ and cross-pass extraction converge. Format is preserved from extractor-sync
54
+ (`e_` + 24 hex of sha256) so its existing rows are unaffected.
55
+ """
56
+ key = f"{arena}|{entity_type}|{normalize_surface_form(canonical_name)}"
57
+ return "e_" + hashlib.sha256(key.encode()).hexdigest()[:24]