@pentatonic-ai/ai-agent-sdk 0.10.14 → 0.10.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
878
878
  }
879
879
 
880
880
  // src/telemetry.js
881
- var VERSION = "0.10.14";
881
+ var VERSION = "0.10.16";
882
882
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
883
883
  function machineId() {
884
884
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
847
847
  }
848
848
 
849
849
  // src/telemetry.js
850
- var VERSION = "0.10.14";
850
+ var VERSION = "0.10.16";
851
851
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
852
852
  function machineId() {
853
853
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.10.14",
3
+ "version": "0.10.16",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -928,7 +928,7 @@ async def list_entities(req: GraphQueryRequest):
928
928
  params.extend([pattern, pattern])
929
929
  sql = f"""
930
930
  SELECT id, arena, entity_type, canonical_name, aliases,
931
- provenance_event_ids, last_seen
931
+ provenance_event_ids, attributes, last_seen
932
932
  FROM entities
933
933
  WHERE {' AND '.join(conditions)}
934
934
  ORDER BY last_seen DESC
@@ -55,3 +55,23 @@ def entity_id(arena: str, entity_type: str, canonical_name: str) -> str:
55
55
  """
56
56
  key = f"{arena}|{entity_type}|{normalize_surface_form(canonical_name)}"
57
57
  return "e_" + hashlib.sha256(key.encode()).hexdigest()[:24]
58
+
59
+
60
def person_id_key(name: str | None, email: str | None) -> str:
    """Choose the string `entity_id()` should hash for a PERSON node.

    The email is the person's deterministic hard key, so it wins whenever it
    is present (non-blank after trimming); the name is only the fallback for
    a person with no email (Fusion still merges those fuzzily).

    This helper lives in the byte-identical shared file because both
    extractors mint person nodes and must agree: the sync pass builds them
    from the envelope at ingest (it has the email), the async pass builds
    them from prose (email promoted to an alias). Keying both on the email
    makes the same person mint the SAME id from either pass, in any
    processing order, instead of racing between a name-keyed and an
    email-keyed node. `entity_id()` normalises (lowercase + trim) whatever is
    returned here, so casing/whitespace variants of an email collapse.
    (Org keying is the async-only `org_node_id_key`; person keying is
    cross-pass, hence this parity-guarded file.)

    Returns:
        The trimmed email when one is given, otherwise the name as-is, or
        "" when neither is available.
    """
    trimmed_email = (email or "").strip()
    if trimmed_email:
        return trimmed_email
    return name or ""
@@ -35,9 +35,26 @@ from typing import Any
35
35
  # Allowed-value enums. Moved here from worker.py (which now imports
36
36
  # them) so the schema pins to the SAME constants the KV prompt and
37
37
  # downstream normalisation use — change them in one place only.
38
+ #
39
+ # 2026-06-16 — ONTOLOGY ALIGNMENT (entity-ontology-the-spine.md). This enum is
40
+ # specifically the set of types the LLM extracts FROM PROSE as named entities.
41
+ # Removed the NLP byproducts that polluted ~28% of the graph and are not
42
+ # business entities:
43
+ # - `place`, `date` → ATTRIBUTES of real entities (a meeting's location/time),
44
+ # never standalone entities. The guided enum no longer admits them, so the
45
+ # model stops minting bare place/date nodes (the info still lands in facts).
46
+ # - `concept` → folds into `topic` (the model now emits `topic`).
47
+ # NOT added here (deliberately): meeting / document / thread / task / decision.
48
+ # Those are NOT LLM-prose entities — they are created by structured-event paths
49
+ # (meetings/actions/thread module projections; the sync extractor already emits
50
+ # `document`) or modelled as facts (`decision` category). Forcing the LLM to
51
+ # mint them from prose would create spurious nodes. They join the ontology via
52
+ # their own paths, not this enum.
53
+ # Forward-only: existing place/date/concept rows are untouched and demoted at
54
+ # READ time by the ontology ENGINE_TYPE_MAP (concept→topic, place/date→attribute,
55
+ # other→unresolved). No re-distill required for this change to take effect.
38
56
  ALLOWED_ENT_TYPES = {
39
- "person", "org", "product", "place", "project",
40
- "concept", "topic", "date", "other",
57
+ "person", "org", "product", "project", "topic", "other",
41
58
  }
42
59
  ALLOWED_FCT_CATEGORIES = {
43
60
  "decision", "commitment", "state", "mention",
@@ -0,0 +1,62 @@
1
+ """Tests for fact_source — deriving the source label stamped onto facts.
2
+
3
+ The contract under test (SoR-drift foundation): prefer the finer
4
+ producer label `attributes.source` (gmail / hubspot / ...) over the
5
+ coarse `source_kind` enum; fall back to `source_kind` when no finer
6
+ label; return None only when neither is present (NULL == source-unknown,
7
+ the pre-009 state). Pure + total: never raises.
8
+
9
+ Run: pytest packages/memory-engine-v2/extractor-async/test_fact_source.py
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import pytest
15
+
16
+ from worker import fact_source
17
+
18
+
19
+ class TestFactSource:
20
+ def test_prefers_finer_attributes_source(self):
21
+ # attributes.source (hubspot) wins over the coarse source_kind
22
+ # (system) — this is the CRM-vs-email granularity SoR-drift needs.
23
+ ev = {"source_kind": "system", "attributes": {"source": "hubspot"}}
24
+ assert fact_source(ev) == "hubspot"
25
+
26
+ def test_email_finer_than_note_kind(self):
27
+ ev = {"source_kind": "note", "attributes": {"source": "gmail"}}
28
+ assert fact_source(ev) == "gmail"
29
+
30
+ def test_falls_back_to_source_kind_when_no_attribute(self):
31
+ ev = {"source_kind": "chat", "attributes": {}}
32
+ assert fact_source(ev) == "chat"
33
+
34
+ def test_falls_back_when_attributes_missing(self):
35
+ ev = {"source_kind": "doc"}
36
+ assert fact_source(ev) == "doc"
37
+
38
+ def test_strips_whitespace(self):
39
+ ev = {"source_kind": "system", "attributes": {"source": " slack "}}
40
+ assert fact_source(ev) == "slack"
41
+
42
+ def test_blank_attribute_falls_through_to_kind(self):
43
+ # An empty/whitespace `source` must NOT win over a real kind.
44
+ ev = {"source_kind": "doc", "attributes": {"source": " "}}
45
+ assert fact_source(ev) == "doc"
46
+
47
+ # --- None cases: source-unknown, column stays NULL ---
48
+
49
+ @pytest.mark.parametrize(
50
+ "ev",
51
+ [
52
+ {},
53
+ {"attributes": {}},
54
+ {"attributes": {"source": ""}},
55
+ {"source_kind": "", "attributes": {"source": None}},
56
+ {"source_kind": None, "attributes": None},
57
+ # Non-string types must not crash and must not be stamped.
58
+ {"source_kind": 7, "attributes": {"source": 42}},
59
+ ],
60
+ )
61
+ def test_returns_none_when_no_usable_source(self, ev):
62
+ assert fact_source(ev) is None
@@ -61,6 +61,20 @@ def test_schema_enums_pin_to_shared_constants() -> None:
61
61
  assert fct_enum == sorted(fct_enum)
62
62
 
63
63
 
64
+ def test_entity_type_enum_is_ontology_aligned() -> None:
65
+ """Ontology alignment (entity-ontology-the-spine.md): the LLM-extracted
66
+ entity types are the genuine named-entity work types — NOT the NLP
67
+ byproducts. Pins the decision so a future edit can't silently re-admit
68
+ place/date/concept (which polluted ~28% of the graph). meeting/document/
69
+ thread/task/decision are deliberately NOT here — they come from
70
+ structured-event paths / are facts, not LLM prose."""
71
+ assert xs.ALLOWED_ENT_TYPES == {
72
+ "person", "org", "product", "project", "topic", "other",
73
+ }
74
+ for byproduct in ("place", "date", "concept"):
75
+ assert byproduct not in xs.ALLOWED_ENT_TYPES
76
+
77
+
64
78
  def test_schema_caps_mirror_prompt_hard_caps() -> None:
65
79
  """8 ENT / 6 FCT / 6 REL per event, statement <= 140 — what
66
80
  BATCH_SYSTEM_PROMPT requests, the schema enforces."""
@@ -0,0 +1,329 @@
1
+ """Org-domain hard-key stamping — the deterministic helpers that turn an
2
+ event's structured email envelope into an org's domain hard key
3
+ (entity-ontology-the-spine.md §IV; registry §B).
4
+
5
+ Pure-logic tests only (no DB) — they exercise the precision guard that makes
6
+ envelope-derived stamping safe (`match_org_domain`) plus envelope parsing
7
+ (`event_org_domains`) and label extraction (`domain_label`). The upsert SQL that
8
+ consumes them is covered by the engine image build+import check in CI."""
9
+
10
+ from __future__ import annotations
11
+
12
+ import importlib.util
13
+ from pathlib import Path
14
+
15
+ import pytest
16
+
17
+
18
+ _THIS = Path(__file__).resolve().parent
19
+ _SPEC = importlib.util.spec_from_file_location("extractor_async_worker",
20
+ _THIS / "worker.py")
21
+ assert _SPEC and _SPEC.loader
22
+ worker = importlib.util.module_from_spec(_SPEC)
23
+ try:
24
+ _SPEC.loader.exec_module(worker)
25
+ except ImportError as e:
26
+ pytest.skip(f"extractor-async deps unavailable: {e}", allow_module_level=True)
27
+
28
+
29
+ # ----------------------------------------------------------------------
30
+ # _domain_of — pull the domain out of an email-ish string
31
+ # ----------------------------------------------------------------------
32
+
33
+ def test_domain_of_plain_email() -> None:
34
+ assert worker._domain_of("bob@lego.com") == "lego.com"
35
+
36
+
37
+ def test_domain_of_display_name_form() -> None:
38
+ assert worker._domain_of("Bob Smith <bob@LEGO.com>") == "lego.com"
39
+
40
+
41
+ def test_domain_of_subdomain_preserved() -> None:
42
+ # domain_label, not _domain_of, reduces to the registrable label.
43
+ assert worker._domain_of("a@mail.acme.co.uk") == "mail.acme.co.uk"
44
+
45
+
46
+ def test_domain_of_non_email_is_none() -> None:
47
+ assert worker._domain_of("Bob Smith") is None
48
+ assert worker._domain_of("") is None
49
+
50
+
51
+ # ----------------------------------------------------------------------
52
+ # domain_label — registrable label
53
+ # ----------------------------------------------------------------------
54
+
55
+ def test_domain_label_simple() -> None:
56
+ assert worker.domain_label("lego.com") == "lego"
57
+
58
+
59
+ def test_domain_label_multi_part_tld() -> None:
60
+ assert worker.domain_label("boots.co.uk") == "boots"
61
+
62
+
63
+ def test_domain_label_subdomain() -> None:
64
+ assert worker.domain_label("mail.acme.com") == "acme"
65
+
66
+
67
+ def test_domain_label_subdomain_multi_part_tld() -> None:
68
+ assert worker.domain_label("careers.boots.co.uk") == "boots"
69
+
70
+
71
+ def test_domain_label_degenerate() -> None:
72
+ assert worker.domain_label("localhost") is None
73
+ assert worker.domain_label("") is None
74
+
75
+
76
+ # ----------------------------------------------------------------------
77
+ # event_org_domains — corporate domains from the structured envelope
78
+ # ----------------------------------------------------------------------
79
+
80
+ def test_event_domains_from_contact_email() -> None:
81
+ assert worker.event_org_domains({"contact_email": "ceo@lego.com"}) == {"lego.com"}
82
+
83
+
84
+ def test_event_domains_recipient_list_string() -> None:
85
+ attrs = {"to_emails": "a@acme.com, b@acme.com; c@globex.com"}
86
+ assert worker.event_org_domains(attrs) == {"acme.com", "globex.com"}
87
+
88
+
89
+ def test_event_domains_recipient_list_array() -> None:
90
+ attrs = {"cc_emails": ["a@acme.com", "b@globex.com"]}
91
+ assert worker.event_org_domains(attrs) == {"acme.com", "globex.com"}
92
+
93
+
94
+ def test_event_domains_drops_freemail() -> None:
95
+ attrs = {"contact_email": "someone@gmail.com", "to_emails": "boss@acme.com"}
96
+ assert worker.event_org_domains(attrs) == {"acme.com"}
97
+
98
+
99
+ def test_event_domains_drops_tenant_domain() -> None:
100
+ # Our own people are not an external org.
101
+ attrs = {"to_emails": "phil@pentatonic.com, ext@acme.com"}
102
+ assert worker.event_org_domains(attrs) == {"acme.com"}
103
+
104
+
105
+ def test_event_domains_author_only_if_email() -> None:
106
+ # `author` is often a name/user-id, not an email — only count it when it is one.
107
+ assert worker.event_org_domains({"author": "Phil Hauser"}) == set()
108
+ assert worker.event_org_domains({"author": "phil@acme.com"}) == {"acme.com"}
109
+
110
+
111
+ def test_event_domains_empty_inputs() -> None:
112
+ assert worker.event_org_domains(None) == set()
113
+ assert worker.event_org_domains({}) == set()
114
+
115
+
116
+ # ----------------------------------------------------------------------
117
+ # event_org_domains — direct SoR domain attribute (no email envelope)
118
+ # ----------------------------------------------------------------------
119
+
120
+ def test_event_domains_from_direct_domain_attr() -> None:
121
+ # A CRM company record carries the org's domain directly (no envelope email).
122
+ assert worker.event_org_domains(
123
+ {"domain": "Acme.com", "hubspot_kind": "company"}
124
+ ) == {"acme.com"}
125
+ assert worker.event_org_domains({"company_domain": "acme.com"}) == {"acme.com"}
126
+ assert worker.event_org_domains({"org_domain": "acme.com"}) == {"acme.com"}
127
+
128
+
129
+ def test_event_domains_domain_attr_normalises_url_and_www() -> None:
130
+ assert worker.event_org_domains(
131
+ {"domain": "https://www.acme.com/about?x=1"}
132
+ ) == {"acme.com"}
133
+
134
+
135
+ def test_event_domains_domain_attr_drops_freemail_and_tenant() -> None:
136
+ assert worker.event_org_domains({"domain": "gmail.com"}) == set()
137
+ assert worker.event_org_domains({"company_domain": "pentatonic.com"}) == set()
138
+
139
+
140
+ def test_event_domains_domain_attr_rejects_non_domains() -> None:
141
+ assert worker.event_org_domains({"domain": "not a domain"}) == set()
142
+ assert worker.event_org_domains({"domain": "bob@acme.com"}) == set()
143
+ assert worker.event_org_domains({"domain": "localhost"}) == set()
144
+
145
+
146
+ def test_event_domains_merges_envelope_and_direct_attr() -> None:
147
+ assert worker.event_org_domains(
148
+ {"contact_email": "a@globex.com", "domain": "acme.com"}
149
+ ) == {"globex.com", "acme.com"}
150
+
151
+
152
+ # ----------------------------------------------------------------------
153
+ # match_org_domain — the precision guard
154
+ # ----------------------------------------------------------------------
155
+
156
+ def test_match_exact_name_to_domain() -> None:
157
+ assert worker.match_org_domain("LEGO", {"lego.com"}) == "lego.com"
158
+
159
+
160
+ def test_match_name_with_suffix_word() -> None:
161
+ # "LEGO Group" → token "lego" matches lego.com.
162
+ assert worker.match_org_domain("LEGO Group", {"lego.com"}) == "lego.com"
163
+
164
+
165
+ def test_match_squished_name_equals_label() -> None:
166
+ # de-spaced name "legogroup" EQUALS the domain label "legogroup"
167
+ # (whole-name match, not a prefix).
168
+ assert worker.match_org_domain("Lego Group", {"legogroup.com"}) == "legogroup.com"
169
+
170
+
171
+ def test_no_match_when_domain_is_a_bystander() -> None:
172
+ # THE key safety case: an email FROM acme.com that merely *mentions* Globex
173
+ # must not stamp acme.com onto Globex.
174
+ assert worker.match_org_domain("Globex", {"acme.com"}) is None
175
+
176
+
177
+ # Prefix-match false positives — the reason the original `squished.startswith`
178
+ # clause was removed. One org's name starting with another org's label must NOT
179
+ # stamp that other org's domain (a mis-merge at ingest, via the alias-overlap
180
+ # path, that isn't behind the Fusion Drive's dry-run/audit gate).
181
+ def test_no_match_name_starts_with_other_org_label() -> None:
182
+ # "applebees".startswith("apple") — must NOT stamp apple.com onto Applebee's.
183
+ assert worker.match_org_domain("Applebee's", {"apple.com"}) is None
184
+ # "legoland".startswith("lego") — must NOT stamp lego.com onto Legoland.
185
+ assert worker.match_org_domain("Legoland", {"lego.com"}) is None
186
+ # "microsoft".startswith("micro") — must NOT stamp micro.com onto Microsoft.
187
+ assert worker.match_org_domain("Microsoft", {"micro.com"}) is None
188
+
189
+
190
+ def test_match_still_holds_for_legit_token_and_whole_name() -> None:
191
+ # the cases the prefix clause was *meant* for are covered without it:
192
+ assert worker.match_org_domain("LEGO Group", {"lego.com"}) == "lego.com" # token
193
+ assert worker.match_org_domain("Microsoft", {"microsoft.com"}) == "microsoft.com" # whole name
194
+ assert worker.match_org_domain("Salesforce", {"salesforce.com"}) == "salesforce.com" # whole name
195
+
196
+
197
+ def test_match_picks_the_right_domain_among_several() -> None:
198
+ domains = {"acme.com", "lego.com", "globex.com"}
199
+ assert worker.match_org_domain("LEGO", domains) == "lego.com"
200
+
201
+
202
+ def test_no_match_on_short_label_substring() -> None:
203
+ # short labels (<4) must match a token exactly, never as a substring —
204
+ # avoids "hp" matching "championship" etc.
205
+ assert worker.match_org_domain("Championship Org", {"hp.com"}) is None
206
+
207
+
208
+ def test_match_short_label_exact_token() -> None:
209
+ # but a genuine short-name org still matches its own domain as a token.
210
+ assert worker.match_org_domain("HP", {"hp.com"}) == "hp.com"
211
+
212
+
213
+ def test_ambiguous_two_matching_domains_is_none() -> None:
214
+ # genuine ambiguity → defer to fusion, don't guess.
215
+ assert worker.match_org_domain("Acme", {"acme.com", "acme.co.uk"}) is None
216
+
217
+
218
+ def test_no_match_empty() -> None:
219
+ assert worker.match_org_domain("", {"lego.com"}) is None
220
+ assert worker.match_org_domain("LEGO", set()) is None
221
+
222
+
223
+ # ----------------------------------------------------------------------
224
+ # org_node_id_key — the string entity_id() hashes for the node (task #2)
225
+ # ----------------------------------------------------------------------
226
+
227
+ def test_id_key_is_the_domain_for_a_stamped_org() -> None:
228
+ # The whole point: name variants don't matter — the id keys on the domain.
229
+ assert worker.org_node_id_key("org", "LEGO Group", "lego.com") == "lego.com"
230
+ assert worker.org_node_id_key("org", "LEGO", "lego.com") == "lego.com"
231
+
232
+
233
+ def test_id_key_falls_back_to_name_without_a_domain() -> None:
234
+ assert worker.org_node_id_key("org", "Some Prose Org", None) == "Some Prose Org"
235
+
236
+
237
+ def test_id_key_never_touches_non_orgs() -> None:
238
+ # A person carries an email hard key, but person id-keying is a separate
239
+ # follow-up — this helper must leave non-orgs on their name.
240
+ assert worker.org_node_id_key("person", "Johann", "pentatonic.com") == "Johann"
241
+ assert worker.org_node_id_key("product", "Widget", "widget.com") == "Widget"
242
+
243
+
244
+ def test_id_key_drives_a_domain_keyed_entity_id() -> None:
245
+ # End-to-end: two LEGO name-variants with the same domain mint the SAME id.
246
+ a = worker.entity_id(
247
+ "arena1", "org", worker.org_node_id_key("org", "LEGO Group", "lego.com")
248
+ )
249
+ b = worker.entity_id(
250
+ "arena1", "org", worker.org_node_id_key("org", "LEGO", "lego.com")
251
+ )
252
+ assert a == b
253
+ # …and it differs from the old name-keyed id (so a re-distill re-homes them).
254
+ name_keyed = worker.entity_id("arena1", "org", "LEGO Group")
255
+ assert a != name_keyed
256
+
257
+
258
+ # ----------------------------------------------------------------------
259
+ # person_id_key — email hard key, shared with the sync pass (task #2)
260
+ # ----------------------------------------------------------------------
261
+
262
+ def test_person_id_key_prefers_email() -> None:
263
+ assert worker.person_id_key("Johann", "johann@p.com") == "johann@p.com"
264
+ assert worker.person_id_key("Johann", None) == "Johann"
265
+ assert worker.person_id_key(None, None) == ""
266
+
267
+
268
+ def test_person_id_key_drives_an_email_keyed_entity_id() -> None:
269
+ # Two name-variants of one person, same email → identical node id, distinct
270
+ # from the name-keyed one (so sync + async + a re-distill all converge).
271
+ a = worker.entity_id(
272
+ "arena1", "person", worker.person_id_key("Johann Boedecker", "j@p.com")
273
+ )
274
+ b = worker.entity_id(
275
+ "arena1", "person", worker.person_id_key("BOEDECKER, JOHANN", "J@P.com")
276
+ )
277
+ assert a == b
278
+ assert a != worker.entity_id("arena1", "person", "Johann Boedecker")
279
+
280
+
281
+ # ----------------------------------------------------------------------
282
+ # event_record_subject — SoR external-id fallback (registry A.1 step 5)
283
+ # ----------------------------------------------------------------------
284
+
285
+ def test_record_subject_company_is_org() -> None:
286
+ assert worker.event_record_subject(
287
+ {"source_id": "hubspot:company:12345", "hubspot_kind": "company"}
288
+ ) == ("org", "hubspot:company:12345")
289
+
290
+
291
+ def test_record_subject_contact_is_person() -> None:
292
+ assert worker.event_record_subject(
293
+ {"source_id": "hubspot:contact:777", "hubspot_kind": "contact"}
294
+ ) == ("person", "hubspot:contact:777")
295
+
296
+
297
+ def test_record_subject_deal_is_not_a_core_entity() -> None:
298
+ # deal/ticket/custom types aren't core graph entities → no external-id key.
299
+ assert worker.event_record_subject(
300
+ {"source_id": "hubspot:deal:9", "hubspot_kind": "deal"}
301
+ ) == (None, None)
302
+
303
+
304
+ def test_record_subject_requires_a_source_id() -> None:
305
+ assert worker.event_record_subject({"hubspot_kind": "company"}) == (None, None)
306
+ assert worker.event_record_subject({"source_id": " ", "hubspot_kind": "company"}) == (
307
+ None,
308
+ None,
309
+ )
310
+
311
+
312
+ def test_record_subject_plain_comms_event_is_none() -> None:
313
+ # An ordinary email/slack event is not a single record.
314
+ assert worker.event_record_subject({"contact_email": "a@acme.com"}) == (None, None)
315
+ assert worker.event_record_subject(None) == (None, None)
316
+ assert worker.event_record_subject({}) == (None, None)
317
+
318
+
319
+ def test_external_id_keys_a_domainless_record_node() -> None:
320
+ # The fallback: a company record with no resolvable domain keys on its
321
+ # external id, so two imports of the same record converge to one node.
322
+ ext = "hubspot:company:12345"
323
+ assert worker.entity_id("arena1", "org", ext) == worker.entity_id(
324
+ "arena1", "org", ext
325
+ )
326
+ # …and that's distinct from the name-keyed node it replaces.
327
+ assert worker.entity_id("arena1", "org", ext) != worker.entity_id(
328
+ "arena1", "org", "Acme Corp"
329
+ )
@@ -41,7 +41,7 @@ import psycopg
41
41
  import psycopg.rows
42
42
 
43
43
  from confidence import born_salience, corroborated_confidence
44
- from entity_id import entity_id, normalize_surface_form
44
+ from entity_id import entity_id, normalize_surface_form, person_id_key
45
45
  from source_time import event_source_time, parse_source_time
46
46
  from extraction_schema import (
47
47
  ALLOWED_ENT_TYPES,
@@ -222,7 +222,10 @@ RULES:
222
222
  matching the input index). NEVER skip an event — if an event has \
223
223
  nothing to extract, emit ONLY the header.
224
224
  - ENT lines have 3 or 4 fields: literal `ENT`, type, name, [email].
225
- type ∈ {person, org, product, place, project, concept, topic, date, other}
225
+ type ∈ {person, org, product, project, topic, other}
226
+ Do NOT emit a bare date or place as an entity — those are attributes of
227
+ other entities (a meeting's time/location), not entities themselves. An
228
+ abstract idea or theme is a `topic`. Use `other` only when nothing fits.
226
229
  email (OPTIONAL, person only): when the event body or attributes
227
230
  show an email address that unambiguously identifies the person,
228
231
  append it as the 4th field. This pairs the name+email forms so a
@@ -277,8 +280,10 @@ Each per-event object has:
277
280
  RULES:
278
281
  - NEVER skip an event — if an event has nothing to extract, emit its \
279
282
  object with "index" set and empty arrays.
280
- - entities: type ∈ {person, org, product, place, project, concept, \
281
- topic, date, other}.
283
+ - entities: type ∈ {person, org, product, project, topic, other}. \
284
+ Do NOT emit a bare date or place as an entity (those are attributes of other \
285
+ entities, not entities); an abstract idea or theme is a `topic`; use `other` \
286
+ only when nothing else fits.
282
287
  email (OPTIONAL, person only): when the event body or attributes
283
288
  show an email address that unambiguously identifies the person,
284
289
  include it. This pairs the name+email forms so a later event seeing
@@ -796,6 +801,217 @@ def _digit_ratio(s: str) -> float:
796
801
  return sum(c.isdigit() for c in stripped) / len(stripped)
797
802
 
798
803
 
804
+ # ----------------------------------------------------------------------
805
+ # Org-domain hard-key stamping (entity-ontology-the-spine §IV / registry §B)
806
+ #
807
+ # An organization's email DOMAIN is its deterministic hard key — the join that
808
+ # unifies the same org seen across sources and kills name-fragmentation (the
809
+ # LEGO-×12 / Johann-×5 problem). The async LLM pass mints org entities by NAME
810
+ # from prose; the DOMAIN lives in the event's structured envelope (contact_email
811
+ # / from_email / to_emails / cc_emails / author). The structured-data-graph audit found Seesa
812
+ # *emits* this bag but the graph *drops* it. Here we bring the two together:
813
+ # derive corporate domains from the envelope, and — only when a domain's
814
+ # registrable label matches the org's name — stamp it onto that org as a
815
+ # resolution alias (so domain-sharing orgs merge via the existing alias-overlap
816
+ # path) and as an `attributes.domain` hard key on the node.
817
+ #
818
+ # Forward-only: new events get stamped; a re-distill backfills history. Org-only
819
+ # — the sync pass never mints orgs (it emits person/document from the envelope),
820
+ # so this lives entirely in the async worker and leaves entity-id parity intact.
821
+ # ----------------------------------------------------------------------
822
+
823
# Free / consumer mailbox providers. Two addresses sharing one of these only
# means "both use Gmail", never "same org" — so an org is never keyed on them.
FREEMAIL_DOMAINS = frozenset((
    # Google
    "gmail.com",
    "googlemail.com",
    # Microsoft
    "outlook.com",
    "hotmail.com",
    "live.com",
    "msn.com",
    # Yahoo
    "yahoo.com",
    "yahoo.co.uk",
    "ymail.com",
    # Apple
    "icloud.com",
    "me.com",
    "mac.com",
    # Everyone else
    "aol.com",
    "proton.me",
    "protonmail.com",
    "gmx.com",
    "gmx.net",
    "mail.com",
    "zoho.com",
    "yandex.com",
    "fastmail.com",
    "qq.com",
    "163.com",
    "126.com",
))

# The tenant's own domain(s): those addresses are our people, not an external
# org, and keying an org on them would collapse unrelated externals under us.
# Read once at import from the comma-separated TENANT_EMAIL_DOMAINS env var
# (entries trimmed + lowercased, blanks dropped); defaults to the known tenant.
TENANT_EMAIL_DOMAINS = frozenset(
    filter(
        None,
        (
            piece.strip().lower()
            for piece in os.environ.get(
                "TENANT_EMAIL_DOMAINS", "pentatonic.com"
            ).split(",")
        ),
    )
)

# The multi-label public suffixes we actually see. Enough to find the
# registrable label ("boots" in boots.co.uk) without depending on a full
# public-suffix list.
_MULTI_LABEL_TLDS = frozenset((
    "co.uk",
    "org.uk",
    "ac.uk",
    "gov.uk",
    "me.uk",
    "co.jp",
    "com.au",
    "net.au",
    "org.au",
    "co.nz",
    "com.br",
    "co.in",
    "com.cn",
    "co.za",
    "com.sg",
    "com.hk",
))
848
+
849
# local-part "@" domain; the capture group is the domain, which must contain
# at least one dot. Case-insensitive so "bob@LEGO.com" is recognised.
_EMAIL_DOMAIN_RE = re.compile(
    r"[\w.+-]+@([a-z0-9](?:[a-z0-9-]*[a-z0-9])?(?:\.[a-z0-9-]+)+)",
    re.I,
)


def _domain_of(value: str) -> str | None:
    """Return the lowercased domain of the first email address in ``value``.

    Works on bare addresses and on display-name forms such as
    ``"Bob Smith <bob@LEGO.com>"``. Subdomains are kept as-is. Returns None
    for empty input or when no email address is present.
    """
    if value and "@" in value:
        found = _EMAIL_DOMAIN_RE.search(value)
        if found is not None:
            return found.group(1).lower()
    return None
858
+
859
+
860
def _bare_domain(value: str) -> str | None:
    """Normalise a structured domain field to a bare, lowercased host.

    Used for system-of-record records (e.g. a CRM company) that carry the
    org's domain DIRECTLY rather than via an email envelope. Accepts a bare
    host ('Acme.com') or a URL ('https://www.acme.com/about?x=1') and returns
    'acme.com' for both.

    Only a LEADING scheme is stripped. Splitting on every "//" and keeping the
    last piece would return the host of a URL embedded in the path or query
    (``?next=https://other.example``) instead of the record's own host, and
    would lose the host entirely for a path containing "//".

    Returns:
        The host without scheme, path, query, fragment, port, trailing root
        dot or leading ``www.``; None when the value is not a string, is
        blank, contains a space or an "@" (emails go through `_domain_of`),
        or has no dot.
    """
    if not isinstance(value, str):
        return None
    host = value.strip().lower()
    if not host:
        return None

    # Strip the scheme only when "://" really is a prefix, i.e. nothing that
    # ends an authority ("/", "?", "#") comes before it.
    scheme, sep, rest = host.partition("://")
    if sep and not any(ch in scheme for ch in "/?#"):
        host = rest
    elif host.startswith("//"):  # scheme-relative URL
        host = host[2:]

    # Everything from the first path / query / fragment delimiter on is noise.
    for delimiter in ("/", "?", "#"):
        host = host.split(delimiter, 1)[0]

    if "@" in host or " " in host:
        return None

    # Drop an explicit port and the DNS root dot so "acme.com:443" and
    # "acme.com." key identically to "acme.com".
    host = host.split(":", 1)[0].rstrip(".")
    if host.startswith("www."):
        host = host[4:]
    return host if "." in host else None
877
+
878
+
879
def event_org_domains(attrs: dict[str, Any] | None) -> set[str]:
    """Collect the corporate domains found in an event's structured data.

    These domains are the deterministic source of org hard keys. Two
    structured sources are read:

    - the participant ENVELOPE: ``contact_email`` / ``from_email`` /
      ``author`` (only when they are strings) plus the recipient lists
      ``to_emails`` / ``cc_emails`` (a list, or a ``,``/``;``-delimited
      string); the domain is taken from each address via `_domain_of`;
    - a system-of-record's DIRECT domain field (``domain`` /
      ``company_domain`` / ``org_domain``), normalised via `_bare_domain` —
      e.g. a CRM company record that has no email envelope at all.

    Freemail domains and the tenant's own domains are dropped. Missing or
    garbage input yields an empty set rather than an error (this runs in the
    hot upsert path).
    """
    if not attrs:
        return set()

    # Single-valued envelope fields; non-strings are ignored.
    addresses: list[str] = []
    for field in ("contact_email", "from_email", "author"):
        single = attrs.get(field)
        if isinstance(single, str):
            addresses.append(single)

    # Recipient lists arrive either as a real list or as one delimited string.
    for field in ("to_emails", "cc_emails"):
        recipients = attrs.get(field)
        if isinstance(recipients, str):
            addresses += recipients.replace(";", ",").split(",")
        elif isinstance(recipients, list):
            addresses += [item for item in recipients if isinstance(item, str)]

    found = {_domain_of(address) for address in addresses}
    found |= {
        _bare_domain(attrs.get(field))
        for field in ("domain", "company_domain", "org_domain")
    }

    # Helpers signal "no domain" with None; drop those, then the exclusions.
    usable = {domain for domain in found if domain}
    return usable - FREEMAIL_DOMAINS - TENANT_EMAIL_DOMAINS
912
+
913
+
914
def domain_label(domain: str) -> str | None:
    """Return the registrable label of a domain, for the name↔domain match.

    'lego' from 'lego.com', 'boots' from 'boots.co.uk', 'acme' from
    'mail.acme.com'. The multi-label suffixes in `_MULTI_LABEL_TLDS` are
    recognised so the label is taken from in front of the whole suffix.

    Returns:
        The lowercased label, or None when there is none: empty input, a
        single-label host ('localhost'), a bare multi-label suffix ('co.uk'),
        or an empty label in the registrable position.
    """
    if not domain:
        return None
    labels = domain.lower().strip(".").split(".")
    if len(labels) < 2:
        return None
    if ".".join(labels[-2:]) in _MULTI_LABEL_TLDS:
        # A bare public suffix has no registrable label. Falling through to
        # labels[-2] would yield "co" for "co.uk", which could then match any
        # org whose name contains the token "Co".
        if len(labels) < 3:
            return None
        return labels[-3] or None
    return labels[-2] or None
925
+
926
+
927
def match_org_domain(org_name: str, domains: set[str]) -> str | None:
    """Pick the single envelope domain that deterministically belongs to
    ``org_name`` — the one whose registrable label matches the name.

    This is the precision guard that makes envelope-derived stamping SAFE: an
    email *from* acme.com that merely mentions "Globex" must not stamp
    acme.com onto Globex. A domain matches only when its label is a whole
    token of the normalised name ("lego" ∈ "LEGO Group") or equals the
    de-spaced name ("mastercard" ↔ "Mastercard").

    Prefix matching on the de-spaced name is deliberately NOT done. A
    `startswith` rule (the original #111 form) over-fires whenever one org's
    name starts with another org's label: apple.com→Applebee's,
    lego.com→Legoland, micro.com→Microsoft. Because the stamp becomes a
    resolution alias, that would mis-merge two real orgs at INGEST —
    immediately, not behind the Fusion Drive's dry-run/audit gate. The
    legitimate "LEGO Group" case is already covered by the whole-token rule.

    Returns:
        The matching domain when exactly one matches; None when nothing
        matches, when the inputs are empty, or when several domains match
        (genuine ambiguity is left for fusion to decide rather than guessed).
    """
    if not (org_name and domains):
        return None

    normalized = normalize_surface_form(org_name)
    # Name tokens of 2+ characters, split on anything that isn't [a-z0-9].
    name_tokens = {
        piece for piece in re.split(r"[^a-z0-9]+", normalized) if len(piece) >= 2
    }
    despaced = normalized.replace(" ", "")

    def _belongs(candidate: str) -> bool:
        # True when the candidate's registrable label names this org.
        label = domain_label(candidate)
        if not label or len(label) < 2:
            return False
        return label in name_tokens or label == despaced

    hits = {candidate for candidate in domains if _belongs(candidate)}
    if len(hits) != 1:
        return None
    (only,) = hits
    return only
960
+
961
+
962
+ # System-of-record record-type → core ontology type. A SoR event (a CRM record,
963
+ # a finance record, …) represents ONE primary entity of a known type; we map the
964
+ # producer's record-type tag onto the core type so we can key that entity on the
965
+ # record's external id. Extend per connector as new SoRs land.
966
+ _HUBSPOT_KIND_TO_TYPE = {"company": "org", "contact": "person"}
967
+
968
+
969
+ def event_record_subject(attrs: dict[str, Any] | None) -> tuple[str | None, str | None]:
970
+ """For an event that IS a system-of-record record, return
971
+ `(core_type, external_id)` for its single primary entity — else `(None, None)`.
972
+
973
+ The external id (`source_id` — already a composite `system:kind:id`, e.g.
974
+ `hubspot:company:12345`) is the registry's FALLBACK org/person hard key
975
+ (A.1): used when no email/domain resolves, so re-importing the same record
976
+ converges to one node (idempotency) and a domainless SoR entity isn't
977
+ re-minted on every sync. Only fires for records whose type maps to a core
978
+ type (company→org, contact→person); deal/ticket/custom-plugin types return
979
+ None (they aren't core graph entities). The caller additionally requires the
980
+ event to extract exactly ONE entity of `core_type` before keying on this —
981
+ so a record that also mentions other orgs/people never mis-stamps them."""
982
+ if not attrs:
983
+ return (None, None)
984
+ external_id = attrs.get("source_id")
985
+ if not isinstance(external_id, str) or not external_id.strip():
986
+ return (None, None)
987
+ hubspot_kind = attrs.get("hubspot_kind")
988
+ if isinstance(hubspot_kind, str):
989
+ core = _HUBSPOT_KIND_TO_TYPE.get(hubspot_kind.lower())
990
+ if core:
991
+ return (core, external_id.strip())
992
+ return (None, None)
993
+
994
+
995
+ def org_node_id_key(entity_type: str, name: str, stamped_domain: str | None) -> str:
996
+ """The string `entity_id()` hashes to mint an entity's NODE id — its domain
997
+ hard key when we resolved one (org), else its name.
998
+
999
+ This is the id-level complement to the alias stamp (`match_org_domain`):
1000
+ the stamp makes the domain a resolution *alias* (caught by the upsert
1001
+ SELECT/lock); keying the node *id* on the domain makes convergence
1002
+ DETERMINISTIC and order-independent. Every event that resolves the same
1003
+ domain mints the SAME id, so they collapse to ONE node via `ON CONFLICT
1004
+ (id)` even if the SELECT and the advisory lock both miss — and a re-distill
1005
+ of the corpus yields exactly one org node per domain regardless of event
1006
+ order, instead of a name-keyed pile Fusion has to chase (the entity-ontology
1007
+ spine's "the big structural fix"). Org-only for now; person→email and
1008
+ external-id are the symmetric follow-ups. Falls back to the name for
1009
+ non-orgs and for orgs we couldn't resolve a domain for (prose-only orgs)."""
1010
+ if entity_type == "org" and stamped_domain:
1011
+ return stamped_domain
1012
+ return name
1013
+
1014
+
799
1015
  def upsert_entities(
800
1016
  conn: psycopg.Connection,
801
1017
  arena: str,
@@ -804,6 +1020,7 @@ def upsert_entities(
804
1020
  disclosure_class: str,
805
1021
  entities: list[dict],
806
1022
  event_time: datetime | None,
1023
+ event_attrs: dict[str, Any] | None = None,
807
1024
  ) -> dict[str, str]:
808
1025
  """Alias-aware insert (or merge) of entities; returns a name→id
809
1026
  map so facts and relationships can link to the inserted rows.
@@ -831,10 +1048,28 @@ def upsert_entities(
831
1048
  serialises concurrent writers (sync + async on the same event)
832
1049
  on the same surface form. (RFC steps 2 + 2a.)
833
1050
 
834
- MIRROR of extractor-sync/server.py:_upsert_entities — same
835
- resolution algorithm. Kept as separate Python because the sync
836
- extractor uses async psycopg and the async worker uses sync
837
- psycopg; the SQL is identical.
1051
+ 3. **Org-domain hard-key stamping + id keying** — when `event_attrs` carries
1052
+ the structured envelope, the corporate email domain that matches an `org`
1053
+ entity's name is stamped onto it as a resolution alias (so domain-sharing
1054
+ orgs merge via the alias-overlap path above) and as `attributes.domain`
1055
+ on the node. The new node's *id* is then minted from that domain
1056
+ (`org_node_id_key`) rather than the name, so same-domain orgs converge to
1057
+ ONE id deterministically — robust to a SELECT/lock miss and order-
1058
+ independent under a re-distill. This is the deterministic hard key the
1059
+ registry wants for org; see `event_org_domains`/`match_org_domain` and
1060
+ entity-ontology-the-spine.md §IV.
1061
+
1062
+ 4. **SoR external-id fallback** — when the event IS a system-of-record record
1063
+ (`event_record_subject`) and extracts exactly ONE entity of the record's
1064
+ core type, that subject — if it has no email/domain hard key — is keyed on
1065
+ the record's external id and carries `attributes.external_id`, so
1066
+ re-imports of the same record converge (registry precedence: email/domain
1067
+ → external-id → name). email/domain still win when present.
1068
+
1069
+ MIRROR of extractor-sync/server.py:_upsert_entities for the resolution
1070
+ algorithm. The sync pass never mints `org` entities (it emits
1071
+ person/document from the envelope), so org-domain stamping is async-only and
1072
+ leaves the entity-id parity test untouched.
838
1073
 
839
1074
  Returns name→id where `name` is the LLM-emitted surface form
840
1075
  (canonical) so facts/relationships using the same surface form
@@ -842,6 +1077,22 @@ def upsert_entities(
842
1077
  name_to_id: dict[str, str] = {}
843
1078
  if not entities:
844
1079
  return name_to_id
1080
+ # Corporate domains in this event's structured envelope — the deterministic
1081
+ # source of org hard keys (computed once per event, not per entity).
1082
+ org_domains = event_org_domains(event_attrs)
1083
+ # System-of-record external-id fallback: if this event IS a single record
1084
+ # (e.g. a HubSpot company/contact) AND it extracts exactly ONE entity of the
1085
+ # record's core type, that entity is the record's subject and may be keyed on
1086
+ # the record's external id (the registry fallback, after email/domain). The
1087
+ # exactly-one guard is the precision gate — a record that also names other
1088
+ # orgs/people leaves `subject_ext_id` None so nothing is mis-stamped.
1089
+ subject_type, subject_ext_id = event_record_subject(event_attrs)
1090
+ if subject_type:
1091
+ n_of_type = sum(
1092
+ 1 for e in entities if (e.get("type") or "other").lower() == subject_type
1093
+ )
1094
+ if n_of_type != 1:
1095
+ subject_ext_id = None
845
1096
  with conn.cursor() as cur:
846
1097
  for e in entities:
847
1098
  etype = (e.get("type") or "other").lower()
@@ -853,6 +1104,27 @@ def upsert_entities(
853
1104
  continue
854
1105
  aliases = [a for a in (e.get("aliases") or []) if a]
855
1106
 
1107
+ # Hard-key stamps for THIS entity, merged onto the node's attributes
1108
+ # and (for domain) into the resolution aliases. Adding domain to
1109
+ # aliases before forms are computed is deliberate — that's what makes
1110
+ # the alias-overlap merge pick it up.
1111
+ attrs_patch: dict[str, str] = {}
1112
+ stamped_domain: str | None = None
1113
+ if etype == "org" and org_domains:
1114
+ stamped_domain = match_org_domain(name, org_domains)
1115
+ if stamped_domain:
1116
+ if stamped_domain not in aliases:
1117
+ aliases.append(stamped_domain)
1118
+ attrs_patch["domain"] = stamped_domain
1119
+ # SoR external-id: the single subject of a record event carries the
1120
+ # record's external id (resolution alias + `attributes.external_id`).
1121
+ is_record_subject = bool(subject_ext_id) and etype == subject_type
1122
+ if is_record_subject and subject_ext_id:
1123
+ if subject_ext_id not in aliases:
1124
+ aliases.append(subject_ext_id)
1125
+ attrs_patch["external_id"] = subject_ext_id
1126
+ attrs_update = json.dumps(attrs_patch)
1127
+
856
1128
  # Sort (don't `list(set(...))`) so lock acquisition order
857
1129
  # is deterministic across processes — set-iteration order
858
1130
  # depends on Python's per-process hash randomisation, so
@@ -897,17 +1169,46 @@ def upsert_entities(
897
1169
  UPDATE entities SET
898
1170
  aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
899
1171
  provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
1172
+ -- Merge the org-domain hard key (no-op for the `{}` default
1173
+ -- on non-orgs / unmatched orgs; never clobbers an existing key).
1174
+ attributes = entities.attributes || %s::jsonb,
900
1175
  -- Widen the seen-window with this event's SOURCE
901
1176
  -- time, not NOW(): newest evidence = newest source.
902
1177
  last_seen = GREATEST(last_seen, COALESCE(%s, NOW())),
903
1178
  first_seen = LEAST(first_seen, COALESCE(%s, NOW()))
904
1179
  WHERE id = %s
905
1180
  """,
906
- (aliases, [event_id], event_time, event_time, eid),
1181
+ (aliases, [event_id], attrs_update, event_time, event_time, eid),
907
1182
  )
908
1183
  else:
909
- # 3b. No match — insert new.
910
- eid = entity_id(arena, etype, name)
1184
+ # 3b. No match — insert new. Key the node on its HARD KEY so the
1185
+ # same real thing mints the SAME id and converges via ON CONFLICT
1186
+ # even if the SELECT/lock missed (task #2): org → its resolved
1187
+ # DOMAIN (org_node_id_key); person → its EMAIL via the SHARED
1188
+ # person_id_key, so this matches the sync pass byte-for-byte
1189
+ # (sync mints person nodes at ingest — see entity_id.py). Email
1190
+ # is carried as an alias on async persons (LLM-promoted); pull it
1191
+ # back out here. Everything else falls back to the name.
1192
+ if etype == "person":
1193
+ person_email = next(
1194
+ (a for a in aliases if a and "@" in a and " " not in a),
1195
+ None,
1196
+ )
1197
+ id_key = person_id_key(name, person_email)
1198
+ has_hard_key = person_email is not None
1199
+ elif etype == "org":
1200
+ id_key = org_node_id_key(etype, name, stamped_domain)
1201
+ has_hard_key = stamped_domain is not None
1202
+ else:
1203
+ id_key = name
1204
+ has_hard_key = False
1205
+ # FALLBACK: a SoR record's single subject with no email/domain
1206
+ # keys on the record's external id, so re-imports of that record
1207
+ # converge to one node (registry precedence: email/domain →
1208
+ # external-id → name). email/domain still win when present.
1209
+ if not has_hard_key and is_record_subject and subject_ext_id:
1210
+ id_key = subject_ext_id
1211
+ eid = entity_id(arena, etype, id_key)
911
1212
  # Fusion Drive born-salience: a numeric-ID-as-person (classic
912
1213
  # 7B junk that slips past noise_filter, e.g. "1716801984") is
913
1214
  # born near the floor so the decay pass can evict it on a short
@@ -921,10 +1222,10 @@ def upsert_entities(
921
1222
  INSERT INTO entities (
922
1223
  id, arena, entity_type, canonical_name, aliases,
923
1224
  provenance_event_ids, participant_set, disclosure_class, salience,
924
- first_seen, last_seen
1225
+ attributes, first_seen, last_seen
925
1226
  ) VALUES (
926
1227
  %s, %s, %s, %s, %s, %s, %s, %s::disclosure_class, %s,
927
- COALESCE(%s, NOW()), COALESCE(%s, NOW())
1228
+ %s::jsonb, COALESCE(%s, NOW()), COALESCE(%s, NOW())
928
1229
  )
929
1230
  ON CONFLICT (id) DO UPDATE SET
930
1231
  aliases = (
@@ -933,6 +1234,8 @@ def upsert_entities(
933
1234
  provenance_event_ids = (
934
1235
  SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
935
1236
  ),
1237
+ -- merge the org-domain hard key (no-op for `{}`)
1238
+ attributes = entities.attributes || EXCLUDED.attributes,
936
1239
  -- re-corroboration can only RAISE salience, never lower it
937
1240
  salience = GREATEST(entities.salience, EXCLUDED.salience),
938
1241
  -- widen the seen-window on SOURCE time, not NOW()
@@ -942,13 +1245,36 @@ def upsert_entities(
942
1245
  (
943
1246
  eid, arena, etype, name, aliases,
944
1247
  [event_id], participant_set, disclosure_class, _sal,
945
- event_time, event_time,
1248
+ attrs_update, event_time, event_time,
946
1249
  ),
947
1250
  )
948
1251
  name_to_id[name] = eid
949
1252
  return name_to_id
950
1253
 
951
1254
 
1255
+ def fact_source(event: dict[str, Any]) -> str | None:
1256
+ """Derive the originating SOURCE label to stamp onto a fact.
1257
+
1258
+ Prefer the finer producer label `attributes.source` (gmail / slack /
1259
+ hubspot / granola / drive / github) over the coarse `source_kind`
1260
+ enum (chat / note / system / ...), because SoR-drift detection needs
1261
+ "CRM vs email" granularity, which the enum can't express (both gmail
1262
+ and a draft are `note`; both HubSpot and an ERP snapshot are
1263
+ `system`). Fall back to `source_kind` when the producer didn't supply
1264
+ a finer label, and to None only when neither is present — NULL ==
1265
+ source-unknown, matching the pre-009 state of existing rows so the
1266
+ nullable column never lies. Pure + total: never raises, so it can't
1267
+ break the distill path; the caller stamps whatever it returns."""
1268
+ attrs = event.get("attributes") or {}
1269
+ src = attrs.get("source")
1270
+ if isinstance(src, str) and src.strip():
1271
+ return src.strip()
1272
+ kind = event.get("source_kind")
1273
+ if isinstance(kind, str) and kind.strip():
1274
+ return kind.strip()
1275
+ return None
1276
+
1277
+
952
1278
  def upsert_facts(
953
1279
  conn: psycopg.Connection,
954
1280
  arena: str,
@@ -959,6 +1285,7 @@ def upsert_facts(
959
1285
  name_to_id: dict[str, str],
960
1286
  event_time: datetime | None,
961
1287
  due_at: datetime | None = None,
1288
+ source: str | None = None,
962
1289
  ) -> int:
963
1290
  """Facts are content-hashed on (arena, statement). Same statement
964
1291
  extracted from any event in the arena converges to the same row,
@@ -984,7 +1311,15 @@ def upsert_facts(
984
1311
  corroborating events: facts have no `last_seen`, so #92's decay uses
985
1312
  `asserted_at` as the recency clock and resets it on re-corroboration
986
1313
  — order-stable regardless of distill order. `due_at` (the source
987
- event's structured deadline, if any) populates `effective_until`."""
1314
+ event's structured deadline, if any) populates `effective_until`.
1315
+
1316
+ `source` is the originating producer label (`fact_source()` — finer
1317
+ `attributes.source` else coarse `source_kind`), stamped so a reader
1318
+ can tell "this came from the CRM" from "this came from an email" —
1319
+ the foundation for SoR-drift detection. Accrete-only on conflict:
1320
+ COALESCE keeps the first-known source and only fills it when a prior
1321
+ extraction left it NULL, so corroboration never rewrites provenance
1322
+ (and a NULL — pre-009 rows, or a source-less event — stays NULL)."""
988
1323
  if not facts:
989
1324
  return 0
990
1325
  inserted = 0
@@ -1010,11 +1345,11 @@ def upsert_facts(
1010
1345
  id, arena, category, subject_entity_id, predicate,
1011
1346
  object_entity_id, statement, provenance_event_ids,
1012
1347
  stage, confidence, participant_set, disclosure_class, salience,
1013
- asserted_at, effective_until
1348
+ asserted_at, effective_until, source
1014
1349
  ) VALUES (
1015
1350
  %s, %s, %s, %s, %s, %s, %s, %s,
1016
1351
  'provisional'::extraction_stage, %s, %s, %s::disclosure_class, %s,
1017
- COALESCE(%s, NOW()), %s
1352
+ COALESCE(%s, NOW()), %s, %s
1018
1353
  )
1019
1354
  ON CONFLICT (id) DO UPDATE SET
1020
1355
  provenance_event_ids = (
@@ -1052,7 +1387,11 @@ def upsert_facts(
1052
1387
  -- This also makes it order-stable (independent of
1053
1388
  -- distill order). EXCLUDED.asserted_at is the
1054
1389
  -- COALESCE(event_time, NOW()) from the INSERT above.
1055
- asserted_at = GREATEST(facts.asserted_at, EXCLUDED.asserted_at)
1390
+ asserted_at = GREATEST(facts.asserted_at, EXCLUDED.asserted_at),
1391
+ -- Accrete-only: keep the first-known source, only fill
1392
+ -- it if a prior extraction left it NULL. Corroboration
1393
+ -- must not rewrite where the fact first came from.
1394
+ source = COALESCE(facts.source, EXCLUDED.source)
1056
1395
  """,
1057
1396
  (
1058
1397
  _content_id(arena, stmt),
@@ -1069,6 +1408,7 @@ def upsert_facts(
1069
1408
  _fsal,
1070
1409
  event_time,
1071
1410
  due_at,
1411
+ source,
1072
1412
  ),
1073
1413
  )
1074
1414
  inserted += 1
@@ -1546,15 +1886,20 @@ async def process_batch(
1546
1886
  # behaviour). Only `attributes.due_at` is honoured; we do NOT
1547
1887
  # guess deadlines from free text here.
1548
1888
  due_at = parse_source_time((event.get("attributes") or {}).get("due_at"))
1889
+ # ORIGINATING SOURCE of this event, stamped onto its facts so
1890
+ # downstream can tell CRM-asserted from email-asserted (the
1891
+ # SoR-drift foundation). Finer `attributes.source` else coarse
1892
+ # `source_kind`; None ⇒ column stays NULL (source-unknown).
1893
+ src = fact_source(event)
1549
1894
 
1550
1895
  try:
1551
1896
  name_to_id = upsert_entities(
1552
1897
  conn, arena, event_id, participant_set, disclosure, ents,
1553
- event_time,
1898
+ event_time, event.get("attributes"),
1554
1899
  )
1555
1900
  n_facts = upsert_facts(
1556
1901
  conn, arena, event_id, participant_set, disclosure, facts, name_to_id,
1557
- event_time, due_at,
1902
+ event_time, due_at, src,
1558
1903
  )
1559
1904
  n_rels = upsert_relationships(
1560
1905
  conn, arena, event_id, participant_set, disclosure, rels, name_to_id,
@@ -55,3 +55,23 @@ def entity_id(arena: str, entity_type: str, canonical_name: str) -> str:
55
55
  """
56
56
  key = f"{arena}|{entity_type}|{normalize_surface_form(canonical_name)}"
57
57
  return "e_" + hashlib.sha256(key.encode()).hexdigest()[:24]
58
+
59
+
60
+ def person_id_key(name: str | None, email: str | None) -> str:
61
+ """The string `entity_id()` should hash to mint a PERSON's node id — the
62
+ EMAIL (the person's deterministic hard key) when present, else the name.
63
+
64
+ Lives HERE, in the byte-identical shared file, BECAUSE both extractors mint
65
+ person nodes and must agree: the sync pass builds them from the envelope at
66
+ ingest (it has the email), the async pass builds them from prose (email
67
+ promoted to an alias). Keying both on the email means the same person mints
68
+ the SAME id from either pass and in any processing order — converging to one
69
+ node instead of sync's name-keyed node and async's node racing, or a
70
+ re-distill re-homing them differently. The name is the fallback for a person
71
+ with no email (Fusion still merges those fuzzily). `entity_id()` normalises
72
+ (lowercase + trim) the result, so casing/whitespace variants of an email
73
+ collapse. (Org keying is the async-only `org_node_id_key`; person keying is
74
+ cross-pass, so it belongs in this parity-guarded file.)
75
+ """
76
+ e = (email or "").strip()
77
+ return e if e else (name or "")
@@ -32,7 +32,7 @@ from typing import Any
32
32
 
33
33
  # Canonical entity-ID scheme — byte-identical copy in extractor-async (entity_id.py).
34
34
  from confidence import born_salience
35
- from entity_id import entity_id, normalize_surface_form # noqa: F401
35
+ from entity_id import entity_id, normalize_surface_form, person_id_key # noqa: F401
36
36
  # Source-time parsing — byte-identical copy in extractor-async (source_time.py).
37
37
  from source_time import event_source_time
38
38
 
@@ -250,12 +250,17 @@ def _person_entity(
250
250
  if not name and not email:
251
251
  return None
252
252
 
253
- # Prefer name as canonical when both present. RFC §1 + §2b
254
- # (accrete-only). When only email is available, fall back to email
255
- # canonical — a later event carrying the pair will alias-resolve
256
- # to this row and add the name as an alias, but won't rename the
257
- # canonical (deferred to a follow-up; see §2b).
253
+ # Prefer name as canonical_name (the human-readable display) when both
254
+ # present. RFC §1 + §2b (accrete-only). When only email is available, fall
255
+ # back to email canonical — a later event carrying the pair will
256
+ # alias-resolve to this row and add the name as an alias, but won't rename
257
+ # the canonical (deferred to a follow-up; see §2b).
258
258
  canonical = name if name else email
259
+ # …but key the node ID on the EMAIL hard key when present (task #2), via the
260
+ # SHARED person_id_key so sync (here) and the async pass mint the SAME id for
261
+ # the same person regardless of order — deterministic convergence, not a
262
+ # name-vs-email race. Display name and id key are now decoupled.
263
+ id_key = person_id_key(name, email)
259
264
  aliases_set: set[str] = set()
260
265
  if name:
261
266
  aliases_set.add(name)
@@ -266,7 +271,7 @@ def _person_entity(
266
271
  aliases_set.add(a)
267
272
 
268
273
  return {
269
- "id": _entity_id(req.arena, "person", canonical),
274
+ "id": _entity_id(req.arena, "person", id_key),
270
275
  "arena": req.arena,
271
276
  "entity_type": "person",
272
277
  "canonical_name": canonical,
@@ -62,16 +62,21 @@ def stub_req():
62
62
 
63
63
 
64
64
  def test_person_entity_name_and_email_pair(stub_req) -> None:
65
- """When both name and email present → entity keyed by NAME,
66
- BOTH forms in aliases."""
65
+ """When both name and email present → display name is the canonical_name,
66
+ but the node ID keys on the EMAIL hard key (task #2), with BOTH forms in
67
+ aliases."""
67
68
  e = sync_server._person_entity(
68
69
  stub_req, "evt1", name="Carly Snider", email="carly@example.com"
69
70
  )
70
71
  assert e is not None
71
72
  assert e["canonical_name"] == "Carly Snider"
72
73
  assert set(e["aliases"]) == {"Carly Snider", "carly@example.com"}
73
- # Id is derived from the name (not the email).
74
- assert e["id"] == sync_server._entity_id("tenant:test", "person", "Carly Snider")
74
+ # Id is derived from the EMAIL hard key (not the name) — so the same person
75
+ # converges across passes/sources/spellings. Display name is decoupled.
76
+ assert e["id"] == sync_server._entity_id(
77
+ "tenant:test", "person", "carly@example.com"
78
+ )
79
+ assert e["id"] != sync_server._entity_id("tenant:test", "person", "Carly Snider")
75
80
 
76
81
 
77
82
  def test_person_entity_email_only(stub_req) -> None:
@@ -0,0 +1,24 @@
1
+ -- 009: stamp the originating SOURCE onto each fact (SoR-drift foundation).
2
+ --
3
+ -- The distiller extracts facts from events but drops WHICH SOURCE each
4
+ -- fact came from. The source lives on the event (`events.source_kind` +
5
+ -- the finer `attributes.source`) but never reaches the `facts` table, so
6
+ -- a reader can't tell "this came from the CRM" from "this came from an
7
+ -- email". That distinction is the foundation for system-of-record drift
8
+ -- detection (e.g. CRM says a deal is active, a newer email says it was
9
+ -- rejected). This column makes the signal available; the drift READ /
10
+ -- detection path is an explicit follow-up and is NOT built here.
11
+ --
12
+ -- Forward-only + additive, per 001_init.sql header note 1 ("to iterate
13
+ -- the schema, add columns; never alter existing ones"). The column is
14
+ -- NULLABLE with no backfill: every historical fact and every existing
15
+ -- reader is unaffected (NULL = source-unknown, the pre-009 state). New
16
+ -- distillations stamp it from the finer `attributes.source` when the
17
+ -- producer supplied one, else the coarse `source_kind` enum value — see
18
+ -- `fact_source()` in extractor-async/worker.py for the derivation rule.
19
+ --
20
+ -- TEXT (not the `source_kind` enum) on purpose: we want the finer
21
+ -- producer label (gmail / slack / hubspot / granola / drive / github)
22
+ -- where available, which is exactly the granularity SoR-drift needs and
23
+ -- which the enum can't represent.
24
+ ALTER TABLE facts ADD COLUMN IF NOT EXISTS source TEXT;
@@ -55,3 +55,27 @@ def test_identical_output_across_copies():
55
55
  for arena, etype, name in cases:
56
56
  assert a.entity_id(arena, etype, name) == b.entity_id(arena, etype, name)
57
57
  assert a.normalize_surface_form(name) == b.normalize_surface_form(name)
58
+
59
+
60
def test_person_id_key_parity_and_behaviour():
    """Both copies of person_id_key must agree, and the email must win.

    person_id_key is the cross-pass person hard key, so the sync and async
    copies MUST behave identically, and it must prefer the email so a person
    mints the same id from either extractor.
    """
    sync_mod = _load(_SYNC, "entity_id_sync2")
    async_mod = _load(_ASYNC, "entity_id_async2")

    # Parity: every (name, email) pair yields the same key in both copies.
    pairs = [
        ("Johann Boedecker", "johann@pentatonic.com"),
        ("Johann Boedecker", None),
        (None, "carly@pact.org"),
        ("  Ben Gordon ", " Ben@Acme.com "),
        (None, None),
    ]
    for name, email in pairs:
        sync_key = sync_mod.person_id_key(name, email)
        async_key = async_mod.person_id_key(name, email)
        assert sync_key == async_key

    # Prefers the email; falls back to the name.
    assert sync_mod.person_id_key("Johann", "johann@p.com") == "johann@p.com"
    assert sync_mod.person_id_key("Johann", None) == "Johann"

    # Name-variants with the same email → the SAME node id (the whole point).
    minted_ids = {
        sync_mod.entity_id(
            "arena1", "person", sync_mod.person_id_key(variant, "j@p.com")
        )
        for variant in ("Johann Boedecker", "BOEDECKER, JOHANN")
    }
    assert len(minted_ids) == 1
+ assert id1 == id2