@pentatonic-ai/ai-agent-sdk 0.10.0 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,6 +29,9 @@ import time
29
29
  from contextlib import asynccontextmanager
30
30
  from typing import Any
31
31
 
32
+ # Canonical entity-ID scheme — byte-identical copy in extractor-async (entity_id.py).
33
+ from entity_id import entity_id, normalize_surface_form # noqa: F401
34
+
32
35
  import psycopg
33
36
  import psycopg.rows
34
37
  from fastapi import FastAPI, HTTPException
@@ -114,10 +117,11 @@ def _content_hash(arena: str, content: str) -> str:
114
117
 
115
118
 
116
119
  def _entity_id(arena: str, entity_type: str, canonical_name: str) -> str:
117
- """Deterministic entity ID — same canonical name in the same arena
118
- always produces the same entity, so re-extractions converge."""
119
- key = f"{arena}|{entity_type}|{canonical_name.lower().strip()}"
120
- return "e_" + hashlib.sha256(key.encode()).hexdigest()[:24]
120
+ """Deterministic entity ID — same (arena, type, normalized name) always produces
121
+ the same entity, so re-extractions converge. Delegates to the shared `entity_id`
122
+ helper (entity_id.py) so extractor-sync and extractor-async key entities
123
+ identically; previously they did not (RFC-entity-reconciliation.md step 1)."""
124
+ return entity_id(arena, entity_type, canonical_name)
121
125
 
122
126
 
123
127
  def _fact_id(arena: str, category: str, subject: str | None, predicate: str | None,
@@ -212,44 +216,143 @@ def _extract_doc(req: ExtractRequest, event_id: str) -> tuple[list, list, list]:
212
216
  return entities, facts, relationships
213
217
 
214
218
 
219
+ def _person_entity(
220
+ req: ExtractRequest,
221
+ event_id: str,
222
+ *,
223
+ name: str | None,
224
+ email: str | None,
225
+ extra_aliases: list[str] | None = None,
226
+ ) -> dict | None:
227
+ """Build a person entity record from a (name, email) pair, either
228
+ or both of which may be present. When both are present, the entity
229
+ is keyed by the *name* (more human-readable canonical) with the
230
+ email carried as an alias — so a later event that only carries the
231
+ email resolves to this same entity via the alias overlap check in
232
+ `_upsert_entities`.
233
+
234
+ See RFC §1 (extract-time pairing). The producer-agnostic shape:
235
+ callers extract whatever name+email pair their source carries
236
+ (gmail from_name/from_email, calendar displayName/email, slack
237
+ real_name/email, etc.) and hand it here. No producer-specific
238
+ knowledge in this helper.
239
+ """
240
+ name = (name or "").strip() or None
241
+ email = (email or "").strip() or None
242
+ if not name and not email:
243
+ return None
244
+
245
+ # Prefer name as canonical when both present. RFC §1 + §2b
246
+ # (accrete-only). When only email is available, fall back to email
247
+ # canonical — a later event carrying the pair will alias-resolve
248
+ # to this row and add the name as an alias, but won't rename the
249
+ # canonical (deferred to a follow-up; see §2b).
250
+ canonical = name if name else email
251
+ aliases_set: set[str] = set()
252
+ if name:
253
+ aliases_set.add(name)
254
+ if email:
255
+ aliases_set.add(email)
256
+ for a in extra_aliases or []:
257
+ if a:
258
+ aliases_set.add(a)
259
+
260
+ return {
261
+ "id": _entity_id(req.arena, "person", canonical),
262
+ "arena": req.arena,
263
+ "entity_type": "person",
264
+ "canonical_name": canonical,
265
+ "aliases": sorted(aliases_set),
266
+ "provenance_event_ids": [event_id],
267
+ "participant_set": req.attributes.get("participant_set", [req.arena]),
268
+ "disclosure_class": req.attributes.get("disclosure_class", "private"),
269
+ }
270
+
271
+
272
+ # (email_key, name_key) attribute-key pairs, one per role, for paired lookup.
273
+ # Looking up `from_email` ⟶ also look for `from_name`. Same shape for
274
+ # `to_*`, `cc_*`, `reply_to_*`. Producer-agnostic: any producer that
275
+ # follows the conventional `<role>_email` / `<role>_name` pattern (gmail
276
+ # bridge, email-ingest tools, calendar producers) gets pairing for free.
277
+ _PERSON_ROLE_PAIRS = [
278
+ ("from_email", "from_name"),
279
+ ("to_email", "to_name"),
280
+ ("cc_email", "cc_name"),
281
+ ("reply_to_email", "reply_to_name"),
282
+ ("sender_email", "sender_name"),
283
+ ]
284
+
285
+
215
286
  def _extract_note(req: ExtractRequest, event_id: str) -> tuple[list, list, list]:
216
- """gmail / drafts: extract from + to emails, subject."""
287
+ """gmail / drafts / other email-shape sources: extract participants.
288
+
289
+ For each `<role>_email` in the event attributes, pair with the
290
+ corresponding `<role>_name` if present so the resulting entity
291
+ carries both surface forms (RFC §1). Falls back to email-only
292
+ canonical when no name is paired.
293
+ """
217
294
  entities, facts, relationships = [], [], []
218
- for key in ("from_email", "to_email", "cc_email"):
219
- val = req.attributes.get(key)
220
- if isinstance(val, str) and "@" in val:
221
- eid = _entity_id(req.arena, "person", val)
222
- entities.append({
223
- "id": eid,
224
- "arena": req.arena,
225
- "entity_type": "person",
226
- "canonical_name": val,
227
- "aliases": [val],
228
- "provenance_event_ids": [event_id],
229
- "participant_set": req.attributes.get("participant_set", [req.arena]),
230
- "disclosure_class": req.attributes.get("disclosure_class", "private"),
231
- })
295
+ attrs = req.attributes
296
+ for email_key, name_key in _PERSON_ROLE_PAIRS:
297
+ val = attrs.get(email_key)
298
+ if not isinstance(val, str) or "@" not in val:
299
+ continue
300
+ ent = _person_entity(req, event_id, name=attrs.get(name_key), email=val)
301
+ if ent is not None:
302
+ entities.append(ent)
232
303
  return entities, facts, relationships
233
304
 
234
305
 
235
306
  def _extract_event(req: ExtractRequest, event_id: str) -> tuple[list, list, list]:
236
- """calendar: extract organizer + attendees as people entities."""
307
+ """calendar: extract organizer + attendees as people entities.
308
+
309
+ Pairs name+email when the producer carries both — modern calendar
310
+ producers (Google Calendar, Outlook) emit attendees as
311
+ `{email, displayName, responseStatus, ...}` objects. We accept
312
+ several common shapes (RFC §1).
313
+ """
237
314
  entities, facts, relationships = [], [], []
238
- organizer = req.attributes.get("organizer_email")
239
- attendees = req.attributes.get("attendee_emails") or []
240
- for email in [organizer, *attendees]:
241
- if isinstance(email, str) and "@" in email:
242
- eid = _entity_id(req.arena, "person", email)
243
- entities.append({
244
- "id": eid,
245
- "arena": req.arena,
246
- "entity_type": "person",
247
- "canonical_name": email,
248
- "aliases": [email],
249
- "provenance_event_ids": [event_id],
250
- "participant_set": req.attributes.get("participant_set", [req.arena]),
251
- "disclosure_class": req.attributes.get("disclosure_class", "private"),
252
- })
315
+ attrs = req.attributes
316
+
317
+ # Organizer: may be a paired (organizer_email, organizer_name) or
318
+ # a structured organizer object.
319
+ organizer_email = attrs.get("organizer_email")
320
+ organizer_name = attrs.get("organizer_name") or attrs.get("organizer_display_name")
321
+ organizer_obj = attrs.get("organizer")
322
+ if isinstance(organizer_obj, dict):
323
+ organizer_email = organizer_email or organizer_obj.get("email")
324
+ organizer_name = organizer_name or organizer_obj.get("displayName") \
325
+ or organizer_obj.get("name")
326
+ ent = _person_entity(req, event_id, name=organizer_name, email=organizer_email)
327
+ if ent is not None:
328
+ entities.append(ent)
329
+
330
+ # Attendees: prefer structured objects (carry displayName), fall
331
+ # back to flat email list if the producer only sends emails.
332
+ attendees = attrs.get("attendees") or attrs.get("attendee_objects")
333
+ if isinstance(attendees, list):
334
+ for a in attendees:
335
+ if isinstance(a, dict):
336
+ ent = _person_entity(
337
+ req, event_id,
338
+ name=a.get("displayName") or a.get("name"),
339
+ email=a.get("email"),
340
+ )
341
+ if ent is not None:
342
+ entities.append(ent)
343
+ elif isinstance(a, str) and "@" in a:
344
+ ent = _person_entity(req, event_id, name=None, email=a)
345
+ if ent is not None:
346
+ entities.append(ent)
347
+ else:
348
+ # Legacy flat-email-list path. Kept for back-compat with
349
+ # producers that haven't moved to structured attendee objects.
350
+ for email in (attrs.get("attendee_emails") or []):
351
+ if isinstance(email, str) and "@" in email:
352
+ ent = _person_entity(req, event_id, name=None, email=email)
353
+ if ent is not None:
354
+ entities.append(ent)
355
+
253
356
  return entities, facts, relationships
254
357
 
255
358
 
@@ -314,31 +417,104 @@ async def _upsert_event(cur: psycopg.AsyncCursor, req: ExtractRequest,
314
417
 
315
418
 
316
419
  async def _upsert_entities(cur: psycopg.AsyncCursor, entities: list[dict]) -> None:
317
- """Idempotent entity upsert: same canonical_name in same arena
318
- converges to the same row. Aliases + provenance_event_ids grow
319
- via array_append; never replace."""
420
+ """Alias-aware idempotent entity upsert.
421
+
422
+ For each entity, before inserting, look for an existing row in the
423
+ same (arena, entity_type) whose canonical_name OR aliases overlap
424
+ any of the incoming surface forms. If found, merge aliases +
425
+ provenance into the existing row. Otherwise insert a new row.
426
+
427
+ Concurrency safety: a Postgres advisory lock keyed on
428
+ `(arena, entity_type, normalised_form)` is taken for every surface
429
+ form before resolution, so concurrent writers (sync + async
430
+ extractors on the same event) serialise on the same person and
431
+ can't both race past the SELECT to two separate INSERTs.
432
+
433
+ Surface forms ("forms" below) = the canonical name + every alias,
434
+ after normalize_surface_form. The match SQL uses both the normalised
435
+ forms (for case-insensitive canonical_name match) and the original
436
+ forms (for aliases @>; aliases retain their display casing).
437
+
438
+ See RFC §2 (alias-aware resolution) and §2a (concurrency).
439
+ """
320
440
  for e in entities:
441
+ # Sort (don't `list(set(...))`) so lock acquisition order is
442
+ # deterministic across processes — set-iteration order depends
443
+ # on Python's per-process hash randomisation, so sync and
444
+ # async extractors processing the same person could otherwise
445
+ # acquire the same locks in opposite orders and deadlock.
446
+ forms_original = sorted({e["canonical_name"], *e.get("aliases", [])})
447
+ forms_normalized = sorted({normalize_surface_form(f) for f in forms_original})
448
+
449
+ # 1. Advisory lock on every form. pg_advisory_xact_lock takes a
450
+ # bigint; hashtext gives us one keyed on a string. Held until
451
+ # the transaction ends (commit or rollback), so the statements below run under it.
452
+ # Same form locked twice in the same txn is a no-op.
453
+ for f in forms_normalized:
454
+ await cur.execute(
455
+ "SELECT pg_advisory_xact_lock(hashtext(%s))",
456
+ (f"{e['arena']}|{e['entity_type']}|{f}",),
457
+ )
458
+
459
+ # 2. Resolve: any existing entity match any of our forms?
460
+ # lower(canonical_name) approximates the normalized form _entity_id() hashes, so
461
+ # equality against forms_normalized catches the canonical
462
+ # match. aliases && forms_original catches alias overlap.
321
463
  await cur.execute(
322
464
  """
323
- INSERT INTO entities (
324
- id, arena, entity_type, canonical_name, aliases,
325
- provenance_event_ids, participant_set, disclosure_class
326
- ) VALUES (
327
- %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class
328
- )
329
- ON CONFLICT (id) DO UPDATE SET
330
- aliases = (
331
- SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
332
- ),
333
- provenance_event_ids = (
334
- SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
335
- ),
336
- last_seen = NOW()
465
+ SELECT id FROM entities
466
+ WHERE arena = %s
467
+ AND entity_type = %s
468
+ AND (
469
+ lower(canonical_name) = ANY(%s::text[])
470
+ OR aliases && %s::text[]
471
+ )
472
+ LIMIT 1
337
473
  """,
338
- (e["id"], e["arena"], e["entity_type"], e["canonical_name"],
339
- e["aliases"], e["provenance_event_ids"],
340
- e["participant_set"], e["disclosure_class"]),
474
+ (e["arena"], e["entity_type"], forms_normalized, forms_original),
341
475
  )
476
+ row = await cur.fetchone()
477
+
478
+ if row is not None:
479
+ # 3a. Existing match — merge aliases + provenance, keep
480
+ # canonical as it was (accrete-only per RFC §2b).
481
+ existing_id = row[0]
482
+ await cur.execute(
483
+ """
484
+ UPDATE entities SET
485
+ aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
486
+ provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
487
+ last_seen = NOW()
488
+ WHERE id = %s
489
+ """,
490
+ (e["aliases"], e["provenance_event_ids"], existing_id),
491
+ )
492
+ else:
493
+ # 3b. No match — insert new. ON CONFLICT (id) is a belt-
494
+ # and-braces fallback for the rare case where two writers
495
+ # collide on the same id under different surface forms;
496
+ # the advisory lock above is the primary defence.
497
+ await cur.execute(
498
+ """
499
+ INSERT INTO entities (
500
+ id, arena, entity_type, canonical_name, aliases,
501
+ provenance_event_ids, participant_set, disclosure_class
502
+ ) VALUES (
503
+ %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class
504
+ )
505
+ ON CONFLICT (id) DO UPDATE SET
506
+ aliases = (
507
+ SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
508
+ ),
509
+ provenance_event_ids = (
510
+ SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
511
+ ),
512
+ last_seen = NOW()
513
+ """,
514
+ (e["id"], e["arena"], e["entity_type"], e["canonical_name"],
515
+ e["aliases"], e["provenance_event_ids"],
516
+ e["participant_set"], e["disclosure_class"]),
517
+ )
342
518
 
343
519
 
344
520
  async def _enqueue_distillation(cur: psycopg.AsyncCursor, event_id: str) -> None:
@@ -0,0 +1,88 @@
1
+ """Tests for the shared entity-ID scheme (entity_id.py).
2
+
3
+ Step 1 of RFC-entity-reconciliation.md: extractor-sync and extractor-async must key
4
+ entities IDENTICALLY so the same entity converges across the deterministic and LLM
5
+ passes. These cover the normalization contract + the id format.
6
+
7
+ Run: pytest packages/memory-engine-v2/extractor-sync/test_entity_id.py
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from entity_id import entity_id, normalize_surface_form
13
+
14
+
15
+ class TestNormalizeSurfaceForm:
16
+ def test_lowercases(self):
17
+ assert normalize_surface_form("Carly Snider") == "carly snider"
18
+
19
+ def test_trims_and_collapses_internal_whitespace(self):
20
+ assert normalize_surface_form(" Carly Snider ") == "carly snider"
21
+
22
+ def test_email_case_insensitive(self):
23
+ assert normalize_surface_form("Carly@Pact.ORG") == "carly@pact.org"
24
+
25
+ def test_tolerates_none_and_empty(self):
26
+ assert normalize_surface_form(None) == "" # type: ignore[arg-type]
27
+ assert normalize_surface_form("") == ""
28
+ assert normalize_surface_form(" ") == ""
29
+
30
+ # ---- Unicode normalisation (NFKC) ------------------------------------
31
+ # Without NFKC, names that render identically can have different byte
32
+ # sequences (precomposed vs decomposed Unicode, fullwidth vs halfwidth,
33
+ # ligatures vs separate glyphs) — and fragment into separate entities.
34
+ # These cases come up in practice with vCard imports, Mac pasteboard,
35
+ # IME inputs, internationalised name sources.
36
+
37
+ def test_nfkc_decomposed_to_precomposed(self):
38
+ # "Café" precomposed (U+00E9) vs decomposed (U+0065 + U+0301)
39
+ precomposed = "Café"
40
+ decomposed = "Cafe\u0301"
41
+ assert precomposed != decomposed # different byte sequences
42
+ assert normalize_surface_form(precomposed) == normalize_surface_form(decomposed)
43
+
44
+ def test_nfkc_fullwidth_to_halfwidth_latin(self):
45
+ # Fullwidth Latin (often emitted by CJK-locale IMEs) collapses to
46
+ # halfwidth so fullwidth "CARLY" (U+FF23 U+FF21 U+FF32 U+FF2C U+FF39) keys the same as "Carly".
47
+ assert normalize_surface_form("\uff23\uff21\uff32\uff2c\uff39") == normalize_surface_form("Carly")
48
+
49
+ def test_nfkc_ligature_to_separate_glyphs(self):
50
+ # Compatibility ligatures (e.g. the single-glyph "fi" ligature, U+FB01) decompose to the two letters "fi".
51
+ assert normalize_surface_form("Of\ufb01ce") == normalize_surface_form("Office")
52
+
53
+
54
+ class TestEntityId:
55
+ def test_format_is_e_prefix_plus_24_hex(self):
56
+ eid = entity_id("arena1", "person", "Carly Snider")
57
+ assert eid.startswith("e_")
58
+ assert len(eid) == 26 # "e_" + 24 hex
59
+
60
+ def test_casing_and_spacing_variants_converge(self):
61
+ a = entity_id("arena1", "person", "Carly Snider")
62
+ b = entity_id("arena1", "person", "carly snider")
63
+ c = entity_id("arena1", "person", " Carly Snider ")
64
+ assert a == b == c
65
+
66
+ def test_email_casing_converges(self):
67
+ a = entity_id("arena1", "person", "carly@pactcollective.org")
68
+ b = entity_id("arena1", "person", "Carly@PactCollective.org")
69
+ assert a == b
70
+
71
+ def test_distinct_people_differ(self):
72
+ assert entity_id("a", "person", "Ben Gordon") != entity_id("a", "person", "Ben Smith")
73
+
74
+ def test_arena_scoped(self):
75
+ assert entity_id("a", "person", "Carly Snider") != entity_id("b", "person", "Carly Snider")
76
+
77
+ def test_type_scoped(self):
78
+ # An email could be a person OR (in theory) another type; the type is part
79
+ # of the key, so cross-type collisions can't merge accidentally.
80
+ assert entity_id("a", "person", "acme.com") != entity_id("a", "org", "acme.com")
81
+
82
+ def test_email_and_name_remain_distinct_pre_resolution(self):
83
+ # Step 1 unifies the *scheme*, it does NOT yet merge email<->name (that is
84
+ # step 2's alias resolution). The email form and the name form are still
85
+ # different surface strings, hence different ids — by design at this stage.
86
+ assert entity_id("a", "person", "carly@pactcollective.org") != entity_id(
87
+ "a", "person", "Carly Snider"
88
+ )
@@ -0,0 +1,208 @@
1
+ """Unit tests for the step-2-onwards extractor-sync work — the
2
+ paired-extraction helper (_person_entity) and the per-source rules
3
+ (_extract_note, _extract_event) that drive it.
4
+
5
+ Step 1 (normalize_surface_form + entity_id) is covered by
6
+ test_entity_id.py + tests/test_entity_id_parity.py; not retested here
7
+ to avoid drift between the two.
8
+
9
+ The integration scenarios that touch the DB live in
10
+ `packages/memory-engine-v2/tests/` and spin up a real postgres via
11
+ testcontainers; this file is the cheap pytest layer that runs in
12
+ unit-test CI without infra.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import importlib.util
18
+ import sys
19
+ from pathlib import Path
20
+
21
+ import pytest
22
+
23
+
24
+ # Load extractor-sync's server.py as a module so we can call its
25
+ # private helpers directly.
26
+ _THIS = Path(__file__).resolve().parent
27
+ _SPEC = importlib.util.spec_from_file_location("extractor_sync_server",
28
+ _THIS / "server.py")
29
+ assert _SPEC and _SPEC.loader
30
+ sync_server = importlib.util.module_from_spec(_SPEC)
31
+ try:
32
+ _SPEC.loader.exec_module(sync_server)
33
+ except ImportError as e:
34
+ # If extractor-sync's runtime deps (psycopg, fastapi) aren't
35
+ # installed in this env, skip the whole module. The integration
36
+ # tests in tests/ install the full container deps.
37
+ pytest.skip(f"extractor-sync deps unavailable: {e}", allow_module_level=True)
38
+
39
+
40
+
41
+ # ----------------------------------------------------------------------
42
+ # _person_entity — RFC §1 paired (name, email) emission
43
+ # ----------------------------------------------------------------------
44
+
45
+ @pytest.fixture
46
+ def stub_req():
47
+ """Minimal stub for ExtractRequest — only the attrs _person_entity
48
+ reads (arena, attributes.participant_set, .disclosure_class)."""
49
+ class _Req:
50
+ arena = "tenant:test"
51
+ attributes = {
52
+ "participant_set": ["tenant:test"],
53
+ "disclosure_class": "private",
54
+ }
55
+ return _Req()
56
+
57
+
58
+ def test_person_entity_name_and_email_pair(stub_req) -> None:
59
+ """When both name and email present → entity keyed by NAME,
60
+ BOTH forms in aliases."""
61
+ e = sync_server._person_entity(
62
+ stub_req, "evt1", name="Carly Snider", email="carly@example.com"
63
+ )
64
+ assert e is not None
65
+ assert e["canonical_name"] == "Carly Snider"
66
+ assert set(e["aliases"]) == {"Carly Snider", "carly@example.com"}
67
+ # Id is derived from the name (not the email).
68
+ assert e["id"] == sync_server._entity_id("tenant:test", "person", "Carly Snider")
69
+
70
+
71
+ def test_person_entity_email_only(stub_req) -> None:
72
+ """Email-only event (no name) → email-keyed entity. A later
73
+ paired event will alias-resolve to this via the email."""
74
+ e = sync_server._person_entity(stub_req, "evt1", name=None, email="x@example.com")
75
+ assert e is not None
76
+ assert e["canonical_name"] == "x@example.com"
77
+ assert e["aliases"] == ["x@example.com"]
78
+
79
+
80
+ def test_person_entity_name_only(stub_req) -> None:
81
+ """Name-only (no email) → name-keyed entity, name as only alias."""
82
+ e = sync_server._person_entity(stub_req, "evt1", name="Alex Wong", email=None)
83
+ assert e is not None
84
+ assert e["canonical_name"] == "Alex Wong"
85
+ assert e["aliases"] == ["Alex Wong"]
86
+
87
+
88
+ def test_person_entity_empty_returns_none(stub_req) -> None:
89
+ """Neither name nor email → None (caller skips)."""
90
+ assert sync_server._person_entity(stub_req, "evt1", name="", email=None) is None
91
+ assert sync_server._person_entity(stub_req, "evt1", name=None, email="") is None
92
+ assert sync_server._person_entity(stub_req, "evt1", name=" ", email=" ") is None
93
+
94
+
95
+ def test_person_entity_extra_aliases_merge(stub_req) -> None:
96
+ """`extra_aliases` (e.g. slack id) merge with name + email in aliases."""
97
+ e = sync_server._person_entity(
98
+ stub_req, "evt1",
99
+ name="Sam Patel", email="sam@example.com",
100
+ extra_aliases=["slack:U01ABC"],
101
+ )
102
+ assert e is not None
103
+ assert set(e["aliases"]) == {"Sam Patel", "sam@example.com", "slack:U01ABC"}
104
+
105
+
106
+ # ----------------------------------------------------------------------
107
+ # _extract_note — paired emission from gmail-shape attributes
108
+ # ----------------------------------------------------------------------
109
+
110
+ def _stub_request(attributes: dict, content: str = "") -> object:
111
+ """Build a minimal stub of ExtractRequest for unit tests."""
112
+ class _Req:
113
+ arena = "tenant:test"
114
+ clientId = "client"
115
+ userId = "user"
116
+ event_type = "STORE_MEMORY"
117
+ source_kind = "note"
118
+ source_id = "src1"
119
+ r = _Req()
120
+ r.content = content
121
+ r.attributes = {"participant_set": ["tenant:test"],
122
+ "disclosure_class": "private", **attributes}
123
+ return r
124
+
125
+
126
+ def test_extract_note_pairs_name_and_email() -> None:
127
+ """gmail-shape: from_email + from_name → single entity name-keyed."""
128
+ req = _stub_request({
129
+ "from_email": "carly@example.com",
130
+ "from_name": "Carly Snider",
131
+ })
132
+ entities, _, _ = sync_server._extract_note(req, "evt1")
133
+ assert len(entities) == 1
134
+ assert entities[0]["canonical_name"] == "Carly Snider"
135
+ assert set(entities[0]["aliases"]) == {"Carly Snider", "carly@example.com"}
136
+
137
+
138
+ def test_extract_note_email_only_when_no_name() -> None:
139
+ """gmail-shape: from_email only (no from_name) → email-keyed.
140
+ Same as v1 behaviour for this case (the fix is for when name
141
+ *is* available)."""
142
+ req = _stub_request({
143
+ "from_email": "x@example.com",
144
+ })
145
+ entities, _, _ = sync_server._extract_note(req, "evt1")
146
+ assert len(entities) == 1
147
+ assert entities[0]["canonical_name"] == "x@example.com"
148
+
149
+
150
+ def test_extract_note_multiple_role_pairs() -> None:
151
+ """from_name+email + to_name+email + cc-email only → three entities."""
152
+ req = _stub_request({
153
+ "from_email": "a@example.com", "from_name": "Alice One",
154
+ "to_email": "b@example.com", "to_name": "Bob Two",
155
+ "cc_email": "c@example.com", # no cc_name
156
+ })
157
+ entities, _, _ = sync_server._extract_note(req, "evt1")
158
+ assert len(entities) == 3
159
+ canonicals = {e["canonical_name"] for e in entities}
160
+ assert canonicals == {"Alice One", "Bob Two", "c@example.com"}
161
+
162
+
163
+ # ----------------------------------------------------------------------
164
+ # _extract_event — paired emission from calendar-shape attributes
165
+ # ----------------------------------------------------------------------
166
+
167
+ def test_extract_event_structured_attendees() -> None:
168
+ """Modern calendar shape: attendees as {email, displayName} objects."""
169
+ req = _stub_request({
170
+ "organizer_email": "host@example.com",
171
+ "organizer_name": "Host Person",
172
+ "attendees": [
173
+ {"email": "a@example.com", "displayName": "Alice One"},
174
+ {"email": "b@example.com"}, # no displayName
175
+ ],
176
+ })
177
+ entities, _, _ = sync_server._extract_event(req, "evt1")
178
+ assert len(entities) == 3
179
+ by_canon = {e["canonical_name"]: e for e in entities}
180
+ assert "Host Person" in by_canon
181
+ assert "Alice One" in by_canon
182
+ assert "b@example.com" in by_canon
183
+ # Paired ones carry both forms in aliases.
184
+ assert "host@example.com" in by_canon["Host Person"]["aliases"]
185
+ assert "a@example.com" in by_canon["Alice One"]["aliases"]
186
+
187
+
188
+ def test_extract_event_legacy_flat_attendee_emails() -> None:
189
+ """Back-compat: producers that only send a flat list of emails."""
190
+ req = _stub_request({
191
+ "organizer_email": "host@example.com",
192
+ "attendee_emails": ["a@example.com", "b@example.com"],
193
+ })
194
+ entities, _, _ = sync_server._extract_event(req, "evt1")
195
+ assert len(entities) == 3
196
+ canonicals = {e["canonical_name"] for e in entities}
197
+ assert canonicals == {"host@example.com", "a@example.com", "b@example.com"}
198
+
199
+
200
+ def test_extract_event_organizer_object_form() -> None:
201
+ """Some calendar producers emit organizer as {email, displayName}."""
202
+ req = _stub_request({
203
+ "organizer": {"email": "x@example.com", "displayName": "X Person"},
204
+ })
205
+ entities, _, _ = sync_server._extract_event(req, "evt1")
206
+ assert len(entities) == 1
207
+ assert entities[0]["canonical_name"] == "X Person"
208
+ assert "x@example.com" in entities[0]["aliases"]