@pentatonic-ai/ai-agent-sdk 0.10.0 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/compat/server.py +38 -6
- package/packages/memory-engine-v2/extractor-async/Dockerfile +5 -3
- package/packages/memory-engine-v2/extractor-async/entity_id.py +57 -0
- package/packages/memory-engine-v2/extractor-async/sensitive_filter.py +51 -0
- package/packages/memory-engine-v2/extractor-async/test_async_ent_parser.py +258 -0
- package/packages/memory-engine-v2/extractor-async/test_sensitive_filter.py +61 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +307 -43
- package/packages/memory-engine-v2/extractor-sync/Dockerfile +1 -1
- package/packages/memory-engine-v2/extractor-sync/entity_id.py +57 -0
- package/packages/memory-engine-v2/extractor-sync/server.py +231 -55
- package/packages/memory-engine-v2/extractor-sync/test_entity_id.py +88 -0
- package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +208 -0
- package/packages/memory-engine-v2/org-model/migrations/002_entity_merges_audit.sql +53 -0
- package/packages/memory-engine-v2/org-model/migrations/003_distillation_traces.sql +60 -0
- package/packages/memory-engine-v2/scripts/backfill_entity_reconciliation.py +581 -0
- package/packages/memory-engine-v2/tests/test_entity_id_parity.py +57 -0
|
@@ -29,6 +29,9 @@ import time
|
|
|
29
29
|
from contextlib import asynccontextmanager
|
|
30
30
|
from typing import Any
|
|
31
31
|
|
|
32
|
+
# Canonical entity-ID scheme — byte-identical copy in extractor-async (entity_id.py).
|
|
33
|
+
from entity_id import entity_id, normalize_surface_form # noqa: F401
|
|
34
|
+
|
|
32
35
|
import psycopg
|
|
33
36
|
import psycopg.rows
|
|
34
37
|
from fastapi import FastAPI, HTTPException
|
|
@@ -114,10 +117,11 @@ def _content_hash(arena: str, content: str) -> str:
|
|
|
114
117
|
|
|
115
118
|
|
|
116
119
|
def _entity_id(arena: str, entity_type: str, canonical_name: str) -> str:
    """Return the deterministic entity ID for (arena, type, canonical name).

    The same (arena, type, normalized name) always produces the same
    entity, so re-extractions converge. Delegates to the shared `entity_id`
    helper (entity_id.py) so extractor-sync and extractor-async key entities
    identically — previously they did not (RFC-entity-reconciliation.md step 1).
    """
    return entity_id(arena, entity_type, canonical_name)
|
|
121
125
|
|
|
122
126
|
|
|
123
127
|
def _fact_id(arena: str, category: str, subject: str | None, predicate: str | None,
|
|
@@ -212,44 +216,143 @@ def _extract_doc(req: ExtractRequest, event_id: str) -> tuple[list, list, list]:
|
|
|
212
216
|
return entities, facts, relationships
|
|
213
217
|
|
|
214
218
|
|
|
219
|
+
def _person_entity(
    req: ExtractRequest,
    event_id: str,
    *,
    name: str | None,
    email: str | None,
    extra_aliases: list[str] | None = None,
) -> dict | None:
    """Build a person entity record from a (name, email) pair.

    Either or both of `name` and `email` may be present. When both are
    present, the entity is keyed by the *name* (more human-readable
    canonical) with the email carried as an alias — so a later event that
    only carries the email resolves to this same entity via the alias
    overlap check in `_upsert_entities`.

    See RFC §1 (extract-time pairing). The shape is producer-agnostic:
    callers extract whatever name+email pair their source carries (gmail
    from_name/from_email, calendar displayName/email, slack
    real_name/email, etc.) and hand it here. No producer-specific
    knowledge lives in this helper.

    Args:
        req: The extract request; `arena` and `attributes` are read.
        event_id: ID of the event this entity was observed in.
        name: Display name, if the source carried one.
        email: Email address, if the source carried one.
        extra_aliases: Additional surface forms (e.g. a chat-platform id)
            to store alongside the name and email.

    Returns:
        The entity record, or None when neither a usable name nor a usable
        email was supplied.
    """
    # Values arrive straight from producer-supplied JSON attributes, so a
    # non-string (number, list, nested object) is treated as "absent"
    # instead of crashing on `.strip()`.
    name = name.strip() if isinstance(name, str) else ""
    email = email.strip() if isinstance(email, str) else ""
    if not name and not email:
        return None

    # Prefer name as canonical when both present. RFC §1 + §2b
    # (accrete-only). When only email is available, fall back to email
    # canonical — a later event carrying the pair will alias-resolve
    # to this row and add the name as an alias, but won't rename the
    # canonical (deferred to a follow-up; see §2b).
    canonical = name or email

    aliases_set: set[str] = {form for form in (name, email) if form}
    for alias in extra_aliases or []:
        # Skip non-strings and blanks so `sorted()` below never sees mixed
        # types and no empty alias is stored; extras are stripped exactly
        # like name/email are.
        if isinstance(alias, str) and alias.strip():
            aliases_set.add(alias.strip())

    return {
        "id": _entity_id(req.arena, "person", canonical),
        "arena": req.arena,
        "entity_type": "person",
        "canonical_name": canonical,
        "aliases": sorted(aliases_set),
        "provenance_event_ids": [event_id],
        "participant_set": req.attributes.get("participant_set", [req.arena]),
        "disclosure_class": req.attributes.get("disclosure_class", "private"),
    }
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
# Ordered list of (email_key, name_key) attribute-key pairs, one per
# participant role. For each `<role>_email` key the extractor also looks
# up the matching `<role>_name` key (`from_*`, `to_*`, `cc_*`,
# `reply_to_*`, `sender_*`). Producer-agnostic: any producer that follows
# the conventional `<role>_email` / `<role>_name` pattern (gmail bridge,
# email-ingest tools, calendar producers) gets pairing for free.
_PERSON_ROLE_PAIRS: list[tuple[str, str]] = [
    ("from_email", "from_name"),
    ("to_email", "to_name"),
    ("cc_email", "cc_name"),
    ("reply_to_email", "reply_to_name"),
    ("sender_email", "sender_name"),
]
|
|
284
|
+
|
|
285
|
+
|
|
215
286
|
def _extract_note(req: ExtractRequest, event_id: str) -> tuple[list, list, list]:
    """gmail / drafts / other email-shape sources: extract participants.

    For each `<role>_email` in the event attributes, pair with the
    corresponding `<role>_name` if present so the resulting entity
    carries both surface forms (RFC §1). Falls back to email-only
    canonical when no name is paired.

    Returns:
        `(entities, facts, relationships)`; only `entities` is populated
        by this rule.
    """
    entities, facts, relationships = [], [], []
    attrs = req.attributes
    for email_key, name_key in _PERSON_ROLE_PAIRS:
        email = attrs.get(email_key)
        if not isinstance(email, str) or "@" not in email:
            continue
        # Attributes are producer-supplied JSON: a name that is not a string
        # (e.g. a list or nested object) is ignored rather than forwarded,
        # so a malformed name cannot abort extraction of a valid email.
        name = attrs.get(name_key)
        if not isinstance(name, str):
            name = None
        ent = _person_entity(req, event_id, name=name, email=email)
        if ent is not None:
            entities.append(ent)
    return entities, facts, relationships
|
|
233
304
|
|
|
234
305
|
|
|
235
306
|
def _extract_event(req: ExtractRequest, event_id: str) -> tuple[list, list, list]:
    """calendar: extract organizer + attendees as people entities.

    Pairs name+email when the producer carries both — modern calendar
    producers (Google Calendar, Outlook) emit attendees as
    `{email, displayName, responseStatus, ...}` objects. Several common
    shapes are accepted (RFC §1):

    * organizer as flat `organizer_email` / `organizer_name` /
      `organizer_display_name` keys, or as an `organizer` object;
    * attendees as a list of objects or email strings under `attendees`
      (or `attendee_objects`);
    * legacy flat list of emails under `attendee_emails`.

    Returns:
        `(entities, facts, relationships)`; only `entities` is populated
        by this rule.
    """
    entities, facts, relationships = [], [], []
    attrs = req.attributes

    def _text(value: object) -> str | None:
        """Return *value* if it is a string, else None (malformed JSON field)."""
        return value if isinstance(value, str) else None

    def _emit(name: object, email: object) -> None:
        """Build a person entity from the pair and collect it if non-empty."""
        ent = _person_entity(req, event_id, name=_text(name), email=_text(email))
        if ent is not None:
            entities.append(ent)

    # Organizer: may be a paired (organizer_email, organizer_name) or
    # a structured organizer object. Flat keys win over the object.
    organizer_email = _text(attrs.get("organizer_email"))
    organizer_name = (
        _text(attrs.get("organizer_name"))
        or _text(attrs.get("organizer_display_name"))
    )
    organizer_obj = attrs.get("organizer")
    if isinstance(organizer_obj, dict):
        organizer_email = organizer_email or _text(organizer_obj.get("email"))
        organizer_name = (
            organizer_name
            or _text(organizer_obj.get("displayName"))
            or _text(organizer_obj.get("name"))
        )
    _emit(organizer_name, organizer_email)

    # Attendees: prefer structured objects (carry displayName), fall
    # back to flat email list if the producer only sends emails.
    attendees = attrs.get("attendees") or attrs.get("attendee_objects")
    if isinstance(attendees, list):
        for attendee in attendees:
            if isinstance(attendee, dict):
                _emit(
                    _text(attendee.get("displayName")) or _text(attendee.get("name")),
                    attendee.get("email"),
                )
            elif isinstance(attendee, str) and "@" in attendee:
                _emit(None, attendee)
    else:
        # Legacy flat-email-list path. Kept for back-compat with
        # producers that haven't moved to structured attendee objects.
        # Only a real list is iterated: a bare string would otherwise be
        # walked character by character (and a lone "@" would become an
        # entity), and a scalar would raise TypeError.
        legacy_emails = attrs.get("attendee_emails")
        if isinstance(legacy_emails, (list, tuple)):
            for email in legacy_emails:
                if isinstance(email, str) and "@" in email:
                    _emit(None, email)

    return entities, facts, relationships
|
|
254
357
|
|
|
255
358
|
|
|
@@ -314,31 +417,104 @@ async def _upsert_event(cur: psycopg.AsyncCursor, req: ExtractRequest,
|
|
|
314
417
|
|
|
315
418
|
|
|
316
419
|
async def _upsert_entities(cur: psycopg.AsyncCursor, entities: list[dict]) -> None:
    """Alias-aware idempotent entity upsert.

    For each entity, before inserting, look for an existing row in the
    same (arena, entity_type) whose canonical_name OR aliases overlap
    any of the incoming surface forms. If found, merge surface forms +
    provenance into the existing row. Otherwise insert a new row.

    Concurrency safety: a Postgres advisory lock keyed on
    `(arena, entity_type, normalised_form)` is taken for every surface
    form before resolution, so concurrent writers (sync + async
    extractors on the same event) serialise on the same person and
    can't both race past the SELECT to two separate INSERTs.

    Surface forms ("forms" below) = the canonical name + every alias.
    The match SQL uses both the normalised forms (via
    normalize_surface_form, for the case-insensitive canonical_name match)
    and the original forms (for the `aliases &&` overlap; aliases retain
    their display casing).

    See RFC §2 (alias-aware resolution) and §2a (concurrency).

    Args:
        cur: Open async cursor; all statements run in its transaction.
        entities: Entity records as produced by the `_extract_*` rules.
    """
    for e in entities:
        # Tolerate a missing/None alias list uniformly; the lookup, the
        # merge and the insert below all use this same value.
        incoming_aliases = e.get("aliases") or []

        # Sort (don't `list(set(...))`) so lock acquisition order is
        # deterministic across processes — set-iteration order depends
        # on Python's per-process hash randomisation, so sync and
        # async extractors processing the same person could otherwise
        # acquire the same locks in opposite orders and deadlock.
        forms_original = sorted({e["canonical_name"], *incoming_aliases})
        forms_normalized = sorted({normalize_surface_form(f) for f in forms_original})

        # 1. Advisory lock on every form. pg_advisory_xact_lock takes a
        #    bigint; hashtext gives us an integer hash of the key string.
        #    The lock is transaction-scoped: it is held across the
        #    SELECT/UPDATE/INSERT below and released when the transaction
        #    ends. Locking the same form twice in one txn is harmless.
        for f in forms_normalized:
            await cur.execute(
                "SELECT pg_advisory_xact_lock(hashtext(%s))",
                (f"{e['arena']}|{e['entity_type']}|{f}",),
            )

        # 2. Resolve: does any existing entity match any of our forms?
        #    lower(canonical_name) against forms_normalized catches the
        #    canonical match; aliases && forms_original catches alias
        #    overlap. ORDER BY id makes the chosen merge target
        #    deterministic when more than one row matches (LIMIT without
        #    ORDER BY may return any of them).
        #    NOTE(review): normalize_surface_form appears to do more than
        #    lower-casing (whitespace/NFKC per its tests) — confirm the
        #    SQL-side lower() is an adequate approximation.
        await cur.execute(
            """
            SELECT id FROM entities
            WHERE arena = %s
              AND entity_type = %s
              AND (
                lower(canonical_name) = ANY(%s::text[])
                OR aliases && %s::text[]
              )
            ORDER BY id
            LIMIT 1
            """,
            (e["arena"], e["entity_type"], forms_normalized, forms_original),
        )
        row = await cur.fetchone()

        if row is not None:
            # 3a. Existing match — merge surface forms + provenance, keep
            #     canonical as it was (accrete-only per RFC §2b). The
            #     incoming canonical name is merged in as an alias too:
            #     the existing row keeps its own canonical, so without
            #     this a later event carrying only that name could not
            #     resolve to the row via the alias overlap above.
            existing_id = row[0]
            await cur.execute(
                """
                UPDATE entities SET
                    aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
                    provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
                    last_seen = NOW()
                WHERE id = %s
                """,
                (forms_original, e["provenance_event_ids"], existing_id),
            )
        else:
            # 3b. No match — insert new. ON CONFLICT (id) is a belt-
            #     and-braces fallback for the rare case where two writers
            #     collide on the same id under different surface forms;
            #     the advisory lock above is the primary defence.
            await cur.execute(
                """
                INSERT INTO entities (
                    id, arena, entity_type, canonical_name, aliases,
                    provenance_event_ids, participant_set, disclosure_class
                ) VALUES (
                    %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class
                )
                ON CONFLICT (id) DO UPDATE SET
                    aliases = (
                        SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
                    ),
                    provenance_event_ids = (
                        SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
                    ),
                    last_seen = NOW()
                """,
                (e["id"], e["arena"], e["entity_type"], e["canonical_name"],
                 incoming_aliases, e["provenance_event_ids"],
                 e["participant_set"], e["disclosure_class"]),
            )
|
|
342
518
|
|
|
343
519
|
|
|
344
520
|
async def _enqueue_distillation(cur: psycopg.AsyncCursor, event_id: str) -> None:
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Tests for the shared entity-ID scheme (entity_id.py).
|
|
2
|
+
|
|
3
|
+
Step 1 of RFC-entity-reconciliation.md: extractor-sync and extractor-async must key
|
|
4
|
+
entities IDENTICALLY so the same entity converges across the deterministic and LLM
|
|
5
|
+
passes. These cover the normalization contract + the id format.
|
|
6
|
+
|
|
7
|
+
Run: pytest packages/memory-engine-v2/extractor-sync/test_entity_id.py
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from entity_id import entity_id, normalize_surface_form
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TestNormalizeSurfaceForm:
    """Contract tests for `normalize_surface_form`.

    Non-ASCII inputs are written with explicit escape sequences: literal
    characters that merely *render* the same can be silently re-normalised
    by editors, diff viewers or copy/paste, which would turn these tests
    into tautologies without anyone noticing.
    """

    def test_lowercases(self):
        assert normalize_surface_form("Carly Snider") == "carly snider"

    def test_trims_and_collapses_internal_whitespace(self):
        # Leading/trailing padding AND a multi-space internal run, so both
        # halves of the test name are actually exercised.
        assert normalize_surface_form("  Carly   Snider  ") == "carly snider"

    def test_email_case_insensitive(self):
        assert normalize_surface_form("Carly@Pact.ORG") == "carly@pact.org"

    def test_tolerates_none_and_empty(self):
        assert normalize_surface_form(None) == ""  # type: ignore[arg-type]
        assert normalize_surface_form("") == ""
        assert normalize_surface_form("   ") == ""

    # ---- Unicode normalisation (NFKC) ------------------------------------
    # Without NFKC, names that render identically can have different byte
    # sequences (precomposed vs decomposed Unicode, fullwidth vs halfwidth,
    # ligatures vs separate glyphs) — and fragment into separate entities.
    # These cases come up in practice with vCard imports, Mac pasteboard,
    # IME inputs, internationalised name sources.

    def test_nfkc_decomposed_to_precomposed(self):
        # "Café" precomposed (U+00E9) vs decomposed (U+0065 + U+0301)
        precomposed = "Caf\u00e9"
        decomposed = "Cafe\u0301"
        assert precomposed != decomposed  # different code point sequences
        assert normalize_surface_form(precomposed) == normalize_surface_form(decomposed)

    def test_nfkc_fullwidth_to_halfwidth_latin(self):
        # Fullwidth Latin (often emitted by CJK-locale IMEs) collapses to
        # halfwidth so fullwidth "CARLY" keys the same as "Carly".
        fullwidth = "\uff23\uff21\uff32\uff2c\uff39"  # U+FF23 U+FF21 U+FF32 U+FF2C U+FF39
        assert fullwidth != "CARLY"  # guard: input really is fullwidth
        assert normalize_surface_form(fullwidth) == normalize_surface_form("Carly")

    def test_nfkc_ligature_to_separate_glyphs(self):
        # Compatibility ligatures (e.g. "fi" U+FB01) decompose to "fi".
        ligature = "Of\ufb01ce"
        assert ligature != "Office"  # guard: input really contains the ligature
        assert normalize_surface_form(ligature) == normalize_surface_form("Office")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class TestEntityId:
    """Format and scoping guarantees of the shared `entity_id` helper."""

    def test_format_is_e_prefix_plus_24_hex(self):
        generated = entity_id("arena1", "person", "Carly Snider")
        assert generated.startswith("e_")
        # "e_" prefix (2 chars) followed by 24 hex chars.
        assert len(generated) == 26

    def test_casing_and_spacing_variants_converge(self):
        variants = ("Carly Snider", "carly snider", " Carly Snider ")
        ids = {entity_id("arena1", "person", form) for form in variants}
        # Every variant must hash to one and the same id.
        assert len(ids) == 1

    def test_email_casing_converges(self):
        lower_id = entity_id("arena1", "person", "carly@pactcollective.org")
        mixed_id = entity_id("arena1", "person", "Carly@PactCollective.org")
        assert lower_id == mixed_id

    def test_distinct_people_differ(self):
        gordon = entity_id("a", "person", "Ben Gordon")
        smith = entity_id("a", "person", "Ben Smith")
        assert gordon != smith

    def test_arena_scoped(self):
        in_a = entity_id("a", "person", "Carly Snider")
        in_b = entity_id("b", "person", "Carly Snider")
        assert in_a != in_b

    def test_type_scoped(self):
        # The entity type is part of the key, so the same surface string
        # under two different types can never merge by accident.
        as_person = entity_id("a", "person", "acme.com")
        as_org = entity_id("a", "org", "acme.com")
        assert as_person != as_org

    def test_email_and_name_remain_distinct_pre_resolution(self):
        # Step 1 only unifies the id *scheme*. Email<->name merging is the
        # job of step 2's alias resolution, so at this stage the email form
        # and the name form are different surface strings with different
        # ids — by design.
        by_email = entity_id("a", "person", "carly@pactcollective.org")
        by_name = entity_id("a", "person", "Carly Snider")
        assert by_email != by_name
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""Unit tests for the step-2-onwards extractor-sync work — the
|
|
2
|
+
paired-extraction helper (_person_entity) and the per-source rules
|
|
3
|
+
(_extract_note, _extract_event) that drive it.
|
|
4
|
+
|
|
5
|
+
Step 1 (normalize_surface_form + entity_id) is covered by
|
|
6
|
+
test_entity_id.py + tests/test_entity_id_parity.py; not retested here
|
|
7
|
+
to avoid drift between the two.
|
|
8
|
+
|
|
9
|
+
The integration scenarios that touch the DB live in
|
|
10
|
+
`packages/memory-engine-v2/tests/` and spin up a real postgres via
|
|
11
|
+
testcontainers; this file is the cheap pytest layer that runs in
|
|
12
|
+
unit-test CI without infra.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import importlib.util
|
|
18
|
+
import sys
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
import pytest
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Load extractor-sync's server.py as a module so we can call its
# private helpers directly.
_THIS = Path(__file__).resolve().parent

# server.py imports its sibling `entity_id` module by bare name, which only
# resolves when this directory is on sys.path. Don't rely on pytest's import
# mode to have put it there — otherwise the ImportError handler below would
# silently skip the whole module instead of running it.
if str(_THIS) not in sys.path:
    sys.path.insert(0, str(_THIS))

_SPEC = importlib.util.spec_from_file_location("extractor_sync_server",
                                               _THIS / "server.py")
assert _SPEC and _SPEC.loader
sync_server = importlib.util.module_from_spec(_SPEC)
# Register the module before executing it, as the importlib recipe for
# loading a source file directly does: code that looks the module up via
# sys.modules during import (e.g. lazy annotation resolution) then works.
sys.modules[_SPEC.name] = sync_server
try:
    _SPEC.loader.exec_module(sync_server)
except ImportError as e:
    # Don't leave a half-initialised module registered.
    sys.modules.pop(_SPEC.name, None)
    # If extractor-sync's runtime deps (psycopg, fastapi) aren't
    # installed in this env, skip the whole module. The integration
    # tests in tests/ install the full container deps.
    pytest.skip(f"extractor-sync deps unavailable: {e}", allow_module_level=True)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ----------------------------------------------------------------------
|
|
42
|
+
# _person_entity — RFC §1 paired (name, email) emission
|
|
43
|
+
# ----------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
@pytest.fixture
def stub_req():
    """Provide a bare-bones stand-in for ExtractRequest.

    Only what `_person_entity` reads is populated: `arena`, plus the
    `participant_set` and `disclosure_class` entries of `attributes`.
    """
    attrs = {"participant_set": ["tenant:test"], "disclosure_class": "private"}

    class _Req:
        arena = "tenant:test"
        attributes = attrs

    return _Req()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_person_entity_name_and_email_pair(stub_req) -> None:
    """Name + email together: keyed by the NAME, both forms stored as aliases."""
    person = sync_server._person_entity(
        stub_req,
        "evt1",
        name="Carly Snider",
        email="carly@example.com",
    )
    assert person is not None
    assert person["canonical_name"] == "Carly Snider"
    assert set(person["aliases"]) == {"Carly Snider", "carly@example.com"}
    # The id must come from the name, not from the email.
    expected_id = sync_server._entity_id("tenant:test", "person", "Carly Snider")
    assert person["id"] == expected_id
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def test_person_entity_email_only(stub_req) -> None:
    """No name supplied: the entity is keyed by the email alone.

    A later event that carries the pair alias-resolves to it via the email.
    """
    person = sync_server._person_entity(
        stub_req, "evt1", name=None, email="x@example.com"
    )
    assert person is not None
    assert person["canonical_name"] == "x@example.com"
    assert person["aliases"] == ["x@example.com"]
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_person_entity_name_only(stub_req) -> None:
    """No email supplied: keyed by the name, which is also the sole alias."""
    person = sync_server._person_entity(
        stub_req, "evt1", name="Alex Wong", email=None
    )
    assert person is not None
    assert person["canonical_name"] == "Alex Wong"
    assert person["aliases"] == ["Alex Wong"]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_person_entity_empty_returns_none(stub_req) -> None:
    """Empty or blank name and email yield None so the caller can skip."""
    build = sync_server._person_entity
    empty_name = build(stub_req, "evt1", name="", email=None)
    assert empty_name is None
    empty_email = build(stub_req, "evt1", name=None, email="")
    assert empty_email is None
    both_blank = build(stub_req, "evt1", name=" ", email=" ")
    assert both_blank is None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def test_person_entity_extra_aliases_merge(stub_req) -> None:
    """`extra_aliases` (e.g. a slack id) join the name and email in aliases."""
    person = sync_server._person_entity(
        stub_req,
        "evt1",
        name="Sam Patel",
        email="sam@example.com",
        extra_aliases=["slack:U01ABC"],
    )
    assert person is not None
    expected = {"Sam Patel", "sam@example.com", "slack:U01ABC"}
    assert set(person["aliases"]) == expected
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# ----------------------------------------------------------------------
|
|
107
|
+
# _extract_note — paired emission from gmail-shape attributes
|
|
108
|
+
# ----------------------------------------------------------------------
|
|
109
|
+
|
|
110
|
+
def _stub_request(attributes: dict, content: str = "") -> object:
    """Create a lightweight ExtractRequest look-alike for unit tests.

    `attributes` is layered over default `participant_set` and
    `disclosure_class` entries, so callers may override either.
    """
    class _Req:
        arena = "tenant:test"
        clientId = "client"
        userId = "user"
        event_type = "STORE_MEMORY"
        source_kind = "note"
        source_id = "src1"

    stub = _Req()
    stub.content = content
    merged = {
        "participant_set": ["tenant:test"],
        "disclosure_class": "private",
    }
    merged.update(attributes)
    stub.attributes = merged
    return stub
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def test_extract_note_pairs_name_and_email() -> None:
    """gmail shape: from_email + from_name collapse into one name-keyed entity."""
    attributes = {
        "from_email": "carly@example.com",
        "from_name": "Carly Snider",
    }
    entities, _, _ = sync_server._extract_note(_stub_request(attributes), "evt1")
    assert len(entities) == 1
    sender = entities[0]
    assert sender["canonical_name"] == "Carly Snider"
    assert set(sender["aliases"]) == {"Carly Snider", "carly@example.com"}
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def test_extract_note_email_only_when_no_name() -> None:
    """gmail shape with from_email but no from_name: email-keyed entity.

    This matches the v1 behaviour for the same input; pairing only changes
    the outcome when a name *is* available.
    """
    request = _stub_request({"from_email": "x@example.com"})
    entities, _, _ = sync_server._extract_note(request, "evt1")
    assert len(entities) == 1
    assert entities[0]["canonical_name"] == "x@example.com"
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def test_extract_note_multiple_role_pairs() -> None:
    """Paired from_*, paired to_*, and an unpaired cc_email: three entities."""
    attributes = {
        "from_email": "a@example.com",
        "from_name": "Alice One",
        "to_email": "b@example.com",
        "to_name": "Bob Two",
        # Deliberately no cc_name.
        "cc_email": "c@example.com",
    }
    entities, _, _ = sync_server._extract_note(_stub_request(attributes), "evt1")
    assert len(entities) == 3
    canonical_names = set()
    for entity in entities:
        canonical_names.add(entity["canonical_name"])
    assert canonical_names == {"Alice One", "Bob Two", "c@example.com"}
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# ----------------------------------------------------------------------
|
|
164
|
+
# _extract_event — paired emission from calendar-shape attributes
|
|
165
|
+
# ----------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
def test_extract_event_structured_attendees() -> None:
    """Modern calendar shape: attendees arrive as {email, displayName} objects."""
    attendee_objects = [
        {"email": "a@example.com", "displayName": "Alice One"},
        # No displayName on this one.
        {"email": "b@example.com"},
    ]
    request = _stub_request({
        "organizer_email": "host@example.com",
        "organizer_name": "Host Person",
        "attendees": attendee_objects,
    })
    entities, _, _ = sync_server._extract_event(request, "evt1")
    assert len(entities) == 3
    by_canonical = {entity["canonical_name"]: entity for entity in entities}
    for expected in ("Host Person", "Alice One", "b@example.com"):
        assert expected in by_canonical
    # Entities built from a pair keep the email as an alias as well.
    assert "host@example.com" in by_canonical["Host Person"]["aliases"]
    assert "a@example.com" in by_canonical["Alice One"]["aliases"]
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def test_extract_event_legacy_flat_attendee_emails() -> None:
    """Back-compat path for producers that send only a flat list of emails."""
    request = _stub_request({
        "organizer_email": "host@example.com",
        "attendee_emails": ["a@example.com", "b@example.com"],
    })
    entities, _, _ = sync_server._extract_event(request, "evt1")
    assert len(entities) == 3
    canonical_names = set(entity["canonical_name"] for entity in entities)
    assert canonical_names == {"host@example.com", "a@example.com", "b@example.com"}
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def test_extract_event_organizer_object_form() -> None:
    """Organizer supplied as an {email, displayName} object rather than flat keys."""
    organizer = {"email": "x@example.com", "displayName": "X Person"}
    request = _stub_request({"organizer": organizer})
    entities, _, _ = sync_server._extract_event(request, "evt1")
    assert len(entities) == 1
    only = entities[0]
    assert only["canonical_name"] == "X Person"
    assert "x@example.com" in only["aliases"]
|