@pentatonic-ai/ai-agent-sdk 0.10.15 → 0.10.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/compat/server.py +1 -1
- package/packages/memory-engine-v2/extractor-async/entity_id.py +20 -0
- package/packages/memory-engine-v2/extractor-async/test_fact_source.py +62 -0
- package/packages/memory-engine-v2/extractor-async/test_org_domain.py +329 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +357 -17
- package/packages/memory-engine-v2/extractor-sync/entity_id.py +20 -0
- package/packages/memory-engine-v2/extractor-sync/server.py +12 -7
- package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +9 -4
- package/packages/memory-engine-v2/org-model/migrations/009_fact_source.sql +24 -0
- package/packages/memory-engine-v2/tests/test_entity_id_parity.py +24 -0
package/dist/index.cjs
CHANGED
|
@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
878
878
|
}
|
|
879
879
|
|
|
880
880
|
// src/telemetry.js
|
|
881
|
-
var VERSION = "0.10.15";
|
|
881
|
+
var VERSION = "0.10.16";
|
|
882
882
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
883
883
|
function machineId() {
|
|
884
884
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/dist/index.js
CHANGED
|
@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
|
|
|
847
847
|
}
|
|
848
848
|
|
|
849
849
|
// src/telemetry.js
|
|
850
|
-
var VERSION = "0.10.15";
|
|
850
|
+
var VERSION = "0.10.16";
|
|
851
851
|
var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
|
|
852
852
|
function machineId() {
|
|
853
853
|
const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pentatonic-ai/ai-agent-sdk",
|
|
3
|
-
"version": "0.10.15",
|
|
3
|
+
"version": "0.10.16",
|
|
4
4
|
"description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -928,7 +928,7 @@ async def list_entities(req: GraphQueryRequest):
|
|
|
928
928
|
params.extend([pattern, pattern])
|
|
929
929
|
sql = f"""
|
|
930
930
|
SELECT id, arena, entity_type, canonical_name, aliases,
|
|
931
|
-
provenance_event_ids, last_seen
|
|
931
|
+
provenance_event_ids, attributes, last_seen
|
|
932
932
|
FROM entities
|
|
933
933
|
WHERE {' AND '.join(conditions)}
|
|
934
934
|
ORDER BY last_seen DESC
|
|
@@ -55,3 +55,23 @@ def entity_id(arena: str, entity_type: str, canonical_name: str) -> str:
|
|
|
55
55
|
"""
|
|
56
56
|
key = f"{arena}|{entity_type}|{normalize_surface_form(canonical_name)}"
|
|
57
57
|
return "e_" + hashlib.sha256(key.encode()).hexdigest()[:24]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def person_id_key(name: str | None, email: str | None) -> str:
|
|
61
|
+
"""The string `entity_id()` should hash to mint a PERSON's node id — the
|
|
62
|
+
EMAIL (the person's deterministic hard key) when present, else the name.
|
|
63
|
+
|
|
64
|
+
Lives HERE, in the byte-identical shared file, BECAUSE both extractors mint
|
|
65
|
+
person nodes and must agree: the sync pass builds them from the envelope at
|
|
66
|
+
ingest (it has the email), the async pass builds them from prose (email
|
|
67
|
+
promoted to an alias). Keying both on the email means the same person mints
|
|
68
|
+
the SAME id from either pass and in any processing order — converging to one
|
|
69
|
+
node instead of sync's name-keyed node and async's node racing, or a
|
|
70
|
+
re-distill re-homing them differently. The name is the fallback for a person
|
|
71
|
+
with no email (Fusion still merges those fuzzily). `entity_id()` normalises
|
|
72
|
+
(lowercase + trim) the result, so casing/whitespace variants of an email
|
|
73
|
+
collapse. (Org keying is the async-only `org_node_id_key`; person keying is
|
|
74
|
+
cross-pass, so it belongs in this parity-guarded file.)
|
|
75
|
+
"""
|
|
76
|
+
e = (email or "").strip()
|
|
77
|
+
return e if e else (name or "")
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Tests for fact_source — deriving the source label stamped onto facts.
|
|
2
|
+
|
|
3
|
+
The contract under test (SoR-drift foundation): prefer the finer
|
|
4
|
+
producer label `attributes.source` (gmail / hubspot / ...) over the
|
|
5
|
+
coarse `source_kind` enum; fall back to `source_kind` when no finer
|
|
6
|
+
label; return None only when neither is present (NULL == source-unknown,
|
|
7
|
+
the pre-009 state). Pure + total: never raises.
|
|
8
|
+
|
|
9
|
+
Run: pytest packages/memory-engine-v2/extractor-async/test_fact_source.py
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
from worker import fact_source
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class TestFactSource:
|
|
20
|
+
def test_prefers_finer_attributes_source(self):
|
|
21
|
+
# attributes.source (hubspot) wins over the coarse source_kind
|
|
22
|
+
# (system) — this is the CRM-vs-email granularity SoR-drift needs.
|
|
23
|
+
ev = {"source_kind": "system", "attributes": {"source": "hubspot"}}
|
|
24
|
+
assert fact_source(ev) == "hubspot"
|
|
25
|
+
|
|
26
|
+
def test_email_finer_than_note_kind(self):
|
|
27
|
+
ev = {"source_kind": "note", "attributes": {"source": "gmail"}}
|
|
28
|
+
assert fact_source(ev) == "gmail"
|
|
29
|
+
|
|
30
|
+
def test_falls_back_to_source_kind_when_no_attribute(self):
|
|
31
|
+
ev = {"source_kind": "chat", "attributes": {}}
|
|
32
|
+
assert fact_source(ev) == "chat"
|
|
33
|
+
|
|
34
|
+
def test_falls_back_when_attributes_missing(self):
|
|
35
|
+
ev = {"source_kind": "doc"}
|
|
36
|
+
assert fact_source(ev) == "doc"
|
|
37
|
+
|
|
38
|
+
def test_strips_whitespace(self):
|
|
39
|
+
ev = {"source_kind": "system", "attributes": {"source": " slack "}}
|
|
40
|
+
assert fact_source(ev) == "slack"
|
|
41
|
+
|
|
42
|
+
def test_blank_attribute_falls_through_to_kind(self):
|
|
43
|
+
# An empty/whitespace `source` must NOT win over a real kind.
|
|
44
|
+
ev = {"source_kind": "doc", "attributes": {"source": " "}}
|
|
45
|
+
assert fact_source(ev) == "doc"
|
|
46
|
+
|
|
47
|
+
# --- None cases: source-unknown, column stays NULL ---
|
|
48
|
+
|
|
49
|
+
@pytest.mark.parametrize(
|
|
50
|
+
"ev",
|
|
51
|
+
[
|
|
52
|
+
{},
|
|
53
|
+
{"attributes": {}},
|
|
54
|
+
{"attributes": {"source": ""}},
|
|
55
|
+
{"source_kind": "", "attributes": {"source": None}},
|
|
56
|
+
{"source_kind": None, "attributes": None},
|
|
57
|
+
# Non-string types must not crash and must not be stamped.
|
|
58
|
+
{"source_kind": 7, "attributes": {"source": 42}},
|
|
59
|
+
],
|
|
60
|
+
)
|
|
61
|
+
def test_returns_none_when_no_usable_source(self, ev):
|
|
62
|
+
assert fact_source(ev) is None
|
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
"""Org-domain hard-key stamping — the deterministic helpers that turn an
|
|
2
|
+
event's structured email envelope into an org's domain hard key
|
|
3
|
+
(entity-ontology-the-spine.md §IV; registry §B).
|
|
4
|
+
|
|
5
|
+
Pure-logic tests only (no DB) — they exercise the precision guard that makes
|
|
6
|
+
envelope-derived stamping safe (`match_org_domain`) plus envelope parsing
|
|
7
|
+
(`event_org_domains`) and label extraction (`domain_label`). The upsert SQL that
|
|
8
|
+
consumes them is covered by the engine image build+import check in CI."""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import importlib.util
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
import pytest
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
_THIS = Path(__file__).resolve().parent
|
|
19
|
+
_SPEC = importlib.util.spec_from_file_location("extractor_async_worker",
|
|
20
|
+
_THIS / "worker.py")
|
|
21
|
+
assert _SPEC and _SPEC.loader
|
|
22
|
+
worker = importlib.util.module_from_spec(_SPEC)
|
|
23
|
+
try:
|
|
24
|
+
_SPEC.loader.exec_module(worker)
|
|
25
|
+
except ImportError as e:
|
|
26
|
+
pytest.skip(f"extractor-async deps unavailable: {e}", allow_module_level=True)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ----------------------------------------------------------------------
|
|
30
|
+
# _domain_of — pull the domain out of an email-ish string
|
|
31
|
+
# ----------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
def test_domain_of_plain_email() -> None:
|
|
34
|
+
assert worker._domain_of("bob@lego.com") == "lego.com"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_domain_of_display_name_form() -> None:
|
|
38
|
+
assert worker._domain_of("Bob Smith <bob@LEGO.com>") == "lego.com"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_domain_of_subdomain_preserved() -> None:
|
|
42
|
+
# domain_label, not _domain_of, reduces to the registrable label.
|
|
43
|
+
assert worker._domain_of("a@mail.acme.co.uk") == "mail.acme.co.uk"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_domain_of_non_email_is_none() -> None:
|
|
47
|
+
assert worker._domain_of("Bob Smith") is None
|
|
48
|
+
assert worker._domain_of("") is None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ----------------------------------------------------------------------
|
|
52
|
+
# domain_label — registrable label
|
|
53
|
+
# ----------------------------------------------------------------------
|
|
54
|
+
|
|
55
|
+
def test_domain_label_simple() -> None:
|
|
56
|
+
assert worker.domain_label("lego.com") == "lego"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_domain_label_multi_part_tld() -> None:
|
|
60
|
+
assert worker.domain_label("boots.co.uk") == "boots"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_domain_label_subdomain() -> None:
|
|
64
|
+
assert worker.domain_label("mail.acme.com") == "acme"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_domain_label_subdomain_multi_part_tld() -> None:
|
|
68
|
+
assert worker.domain_label("careers.boots.co.uk") == "boots"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def test_domain_label_degenerate() -> None:
|
|
72
|
+
assert worker.domain_label("localhost") is None
|
|
73
|
+
assert worker.domain_label("") is None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ----------------------------------------------------------------------
|
|
77
|
+
# event_org_domains — corporate domains from the structured envelope
|
|
78
|
+
# ----------------------------------------------------------------------
|
|
79
|
+
|
|
80
|
+
def test_event_domains_from_contact_email() -> None:
|
|
81
|
+
assert worker.event_org_domains({"contact_email": "ceo@lego.com"}) == {"lego.com"}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def test_event_domains_recipient_list_string() -> None:
|
|
85
|
+
attrs = {"to_emails": "a@acme.com, b@acme.com; c@globex.com"}
|
|
86
|
+
assert worker.event_org_domains(attrs) == {"acme.com", "globex.com"}
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def test_event_domains_recipient_list_array() -> None:
|
|
90
|
+
attrs = {"cc_emails": ["a@acme.com", "b@globex.com"]}
|
|
91
|
+
assert worker.event_org_domains(attrs) == {"acme.com", "globex.com"}
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_event_domains_drops_freemail() -> None:
|
|
95
|
+
attrs = {"contact_email": "someone@gmail.com", "to_emails": "boss@acme.com"}
|
|
96
|
+
assert worker.event_org_domains(attrs) == {"acme.com"}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def test_event_domains_drops_tenant_domain() -> None:
|
|
100
|
+
# Our own people are not an external org.
|
|
101
|
+
attrs = {"to_emails": "phil@pentatonic.com, ext@acme.com"}
|
|
102
|
+
assert worker.event_org_domains(attrs) == {"acme.com"}
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def test_event_domains_author_only_if_email() -> None:
|
|
106
|
+
# `author` is often a name/user-id, not an email — only count it when it is one.
|
|
107
|
+
assert worker.event_org_domains({"author": "Phil Hauser"}) == set()
|
|
108
|
+
assert worker.event_org_domains({"author": "phil@acme.com"}) == {"acme.com"}
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def test_event_domains_empty_inputs() -> None:
|
|
112
|
+
assert worker.event_org_domains(None) == set()
|
|
113
|
+
assert worker.event_org_domains({}) == set()
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# ----------------------------------------------------------------------
|
|
117
|
+
# event_org_domains — direct SoR domain attribute (no email envelope)
|
|
118
|
+
# ----------------------------------------------------------------------
|
|
119
|
+
|
|
120
|
+
def test_event_domains_from_direct_domain_attr() -> None:
|
|
121
|
+
# A CRM company record carries the org's domain directly (no envelope email).
|
|
122
|
+
assert worker.event_org_domains(
|
|
123
|
+
{"domain": "Acme.com", "hubspot_kind": "company"}
|
|
124
|
+
) == {"acme.com"}
|
|
125
|
+
assert worker.event_org_domains({"company_domain": "acme.com"}) == {"acme.com"}
|
|
126
|
+
assert worker.event_org_domains({"org_domain": "acme.com"}) == {"acme.com"}
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def test_event_domains_domain_attr_normalises_url_and_www() -> None:
|
|
130
|
+
assert worker.event_org_domains(
|
|
131
|
+
{"domain": "https://www.acme.com/about?x=1"}
|
|
132
|
+
) == {"acme.com"}
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def test_event_domains_domain_attr_drops_freemail_and_tenant() -> None:
|
|
136
|
+
assert worker.event_org_domains({"domain": "gmail.com"}) == set()
|
|
137
|
+
assert worker.event_org_domains({"company_domain": "pentatonic.com"}) == set()
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def test_event_domains_domain_attr_rejects_non_domains() -> None:
|
|
141
|
+
assert worker.event_org_domains({"domain": "not a domain"}) == set()
|
|
142
|
+
assert worker.event_org_domains({"domain": "bob@acme.com"}) == set()
|
|
143
|
+
assert worker.event_org_domains({"domain": "localhost"}) == set()
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def test_event_domains_merges_envelope_and_direct_attr() -> None:
|
|
147
|
+
assert worker.event_org_domains(
|
|
148
|
+
{"contact_email": "a@globex.com", "domain": "acme.com"}
|
|
149
|
+
) == {"globex.com", "acme.com"}
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# ----------------------------------------------------------------------
|
|
153
|
+
# match_org_domain — the precision guard
|
|
154
|
+
# ----------------------------------------------------------------------
|
|
155
|
+
|
|
156
|
+
def test_match_exact_name_to_domain() -> None:
|
|
157
|
+
assert worker.match_org_domain("LEGO", {"lego.com"}) == "lego.com"
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def test_match_name_with_suffix_word() -> None:
|
|
161
|
+
# "LEGO Group" → token "lego" matches lego.com.
|
|
162
|
+
assert worker.match_org_domain("LEGO Group", {"lego.com"}) == "lego.com"
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def test_match_squished_name_equals_label() -> None:
|
|
166
|
+
# de-spaced name "legogroup" EQUALS the domain label "legogroup"
|
|
167
|
+
# (whole-name match, not a prefix).
|
|
168
|
+
assert worker.match_org_domain("Lego Group", {"legogroup.com"}) == "legogroup.com"
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def test_no_match_when_domain_is_a_bystander() -> None:
|
|
172
|
+
# THE key safety case: an email FROM acme.com that merely *mentions* Globex
|
|
173
|
+
# must not stamp acme.com onto Globex.
|
|
174
|
+
assert worker.match_org_domain("Globex", {"acme.com"}) is None
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# Prefix-match false positives — the reason the original `squished.startswith`
|
|
178
|
+
# clause was removed. One org's name starting with another org's label must NOT
|
|
179
|
+
# stamp that other org's domain (a mis-merge at ingest, via the alias-overlap
|
|
180
|
+
# path, that isn't behind the Fusion Drive's dry-run/audit gate).
|
|
181
|
+
def test_no_match_name_starts_with_other_org_label() -> None:
|
|
182
|
+
# "applebees".startswith("apple") — must NOT stamp apple.com onto Applebee's.
|
|
183
|
+
assert worker.match_org_domain("Applebee's", {"apple.com"}) is None
|
|
184
|
+
# "legoland".startswith("lego") — must NOT stamp lego.com onto Legoland.
|
|
185
|
+
assert worker.match_org_domain("Legoland", {"lego.com"}) is None
|
|
186
|
+
# "microsoft".startswith("micro") — must NOT stamp micro.com onto Microsoft.
|
|
187
|
+
assert worker.match_org_domain("Microsoft", {"micro.com"}) is None
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def test_match_still_holds_for_legit_token_and_whole_name() -> None:
|
|
191
|
+
# the cases the prefix clause was *meant* for are covered without it:
|
|
192
|
+
assert worker.match_org_domain("LEGO Group", {"lego.com"}) == "lego.com" # token
|
|
193
|
+
assert worker.match_org_domain("Microsoft", {"microsoft.com"}) == "microsoft.com" # whole name
|
|
194
|
+
assert worker.match_org_domain("Salesforce", {"salesforce.com"}) == "salesforce.com" # whole name
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def test_match_picks_the_right_domain_among_several() -> None:
|
|
198
|
+
domains = {"acme.com", "lego.com", "globex.com"}
|
|
199
|
+
assert worker.match_org_domain("LEGO", domains) == "lego.com"
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def test_no_match_on_short_label_substring() -> None:
|
|
203
|
+
# short labels (<4) must match a token exactly, never as a substring —
|
|
204
|
+
# avoids "hp" matching "championship" etc.
|
|
205
|
+
assert worker.match_org_domain("Championship Org", {"hp.com"}) is None
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def test_match_short_label_exact_token() -> None:
|
|
209
|
+
# but a genuine short-name org still matches its own domain as a token.
|
|
210
|
+
assert worker.match_org_domain("HP", {"hp.com"}) == "hp.com"
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def test_ambiguous_two_matching_domains_is_none() -> None:
|
|
214
|
+
# genuine ambiguity → defer to fusion, don't guess.
|
|
215
|
+
assert worker.match_org_domain("Acme", {"acme.com", "acme.co.uk"}) is None
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def test_no_match_empty() -> None:
|
|
219
|
+
assert worker.match_org_domain("", {"lego.com"}) is None
|
|
220
|
+
assert worker.match_org_domain("LEGO", set()) is None
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# ----------------------------------------------------------------------
|
|
224
|
+
# org_node_id_key — the string entity_id() hashes for the node (task #2)
|
|
225
|
+
# ----------------------------------------------------------------------
|
|
226
|
+
|
|
227
|
+
def test_id_key_is_the_domain_for_a_stamped_org() -> None:
|
|
228
|
+
# The whole point: name variants don't matter — the id keys on the domain.
|
|
229
|
+
assert worker.org_node_id_key("org", "LEGO Group", "lego.com") == "lego.com"
|
|
230
|
+
assert worker.org_node_id_key("org", "LEGO", "lego.com") == "lego.com"
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def test_id_key_falls_back_to_name_without_a_domain() -> None:
|
|
234
|
+
assert worker.org_node_id_key("org", "Some Prose Org", None) == "Some Prose Org"
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def test_id_key_never_touches_non_orgs() -> None:
|
|
238
|
+
# A person carries an email hard key, but person id-keying is a separate
|
|
239
|
+
# follow-up — this helper must leave non-orgs on their name.
|
|
240
|
+
assert worker.org_node_id_key("person", "Johann", "pentatonic.com") == "Johann"
|
|
241
|
+
assert worker.org_node_id_key("product", "Widget", "widget.com") == "Widget"
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def test_id_key_drives_a_domain_keyed_entity_id() -> None:
|
|
245
|
+
# End-to-end: two LEGO name-variants with the same domain mint the SAME id.
|
|
246
|
+
a = worker.entity_id(
|
|
247
|
+
"arena1", "org", worker.org_node_id_key("org", "LEGO Group", "lego.com")
|
|
248
|
+
)
|
|
249
|
+
b = worker.entity_id(
|
|
250
|
+
"arena1", "org", worker.org_node_id_key("org", "LEGO", "lego.com")
|
|
251
|
+
)
|
|
252
|
+
assert a == b
|
|
253
|
+
# …and it differs from the old name-keyed id (so a re-distill re-homes them).
|
|
254
|
+
name_keyed = worker.entity_id("arena1", "org", "LEGO Group")
|
|
255
|
+
assert a != name_keyed
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
# ----------------------------------------------------------------------
|
|
259
|
+
# person_id_key — email hard key, shared with the sync pass (task #2)
|
|
260
|
+
# ----------------------------------------------------------------------
|
|
261
|
+
|
|
262
|
+
def test_person_id_key_prefers_email() -> None:
|
|
263
|
+
assert worker.person_id_key("Johann", "johann@p.com") == "johann@p.com"
|
|
264
|
+
assert worker.person_id_key("Johann", None) == "Johann"
|
|
265
|
+
assert worker.person_id_key(None, None) == ""
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def test_person_id_key_drives_an_email_keyed_entity_id() -> None:
|
|
269
|
+
# Two name-variants of one person, same email → identical node id, distinct
|
|
270
|
+
# from the name-keyed one (so sync + async + a re-distill all converge).
|
|
271
|
+
a = worker.entity_id(
|
|
272
|
+
"arena1", "person", worker.person_id_key("Johann Boedecker", "j@p.com")
|
|
273
|
+
)
|
|
274
|
+
b = worker.entity_id(
|
|
275
|
+
"arena1", "person", worker.person_id_key("BOEDECKER, JOHANN", "J@P.com")
|
|
276
|
+
)
|
|
277
|
+
assert a == b
|
|
278
|
+
assert a != worker.entity_id("arena1", "person", "Johann Boedecker")
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
# ----------------------------------------------------------------------
|
|
282
|
+
# event_record_subject — SoR external-id fallback (registry A.1 step 5)
|
|
283
|
+
# ----------------------------------------------------------------------
|
|
284
|
+
|
|
285
|
+
def test_record_subject_company_is_org() -> None:
|
|
286
|
+
assert worker.event_record_subject(
|
|
287
|
+
{"source_id": "hubspot:company:12345", "hubspot_kind": "company"}
|
|
288
|
+
) == ("org", "hubspot:company:12345")
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def test_record_subject_contact_is_person() -> None:
|
|
292
|
+
assert worker.event_record_subject(
|
|
293
|
+
{"source_id": "hubspot:contact:777", "hubspot_kind": "contact"}
|
|
294
|
+
) == ("person", "hubspot:contact:777")
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def test_record_subject_deal_is_not_a_core_entity() -> None:
|
|
298
|
+
# deal/ticket/custom types aren't core graph entities → no external-id key.
|
|
299
|
+
assert worker.event_record_subject(
|
|
300
|
+
{"source_id": "hubspot:deal:9", "hubspot_kind": "deal"}
|
|
301
|
+
) == (None, None)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def test_record_subject_requires_a_source_id() -> None:
|
|
305
|
+
assert worker.event_record_subject({"hubspot_kind": "company"}) == (None, None)
|
|
306
|
+
assert worker.event_record_subject({"source_id": " ", "hubspot_kind": "company"}) == (
|
|
307
|
+
None,
|
|
308
|
+
None,
|
|
309
|
+
)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def test_record_subject_plain_comms_event_is_none() -> None:
|
|
313
|
+
# An ordinary email/slack event is not a single record.
|
|
314
|
+
assert worker.event_record_subject({"contact_email": "a@acme.com"}) == (None, None)
|
|
315
|
+
assert worker.event_record_subject(None) == (None, None)
|
|
316
|
+
assert worker.event_record_subject({}) == (None, None)
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def test_external_id_keys_a_domainless_record_node() -> None:
|
|
320
|
+
# The fallback: a company record with no resolvable domain keys on its
|
|
321
|
+
# external id, so two imports of the same record converge to one node.
|
|
322
|
+
ext = "hubspot:company:12345"
|
|
323
|
+
assert worker.entity_id("arena1", "org", ext) == worker.entity_id(
|
|
324
|
+
"arena1", "org", ext
|
|
325
|
+
)
|
|
326
|
+
# …and that's distinct from the name-keyed node it replaces.
|
|
327
|
+
assert worker.entity_id("arena1", "org", ext) != worker.entity_id(
|
|
328
|
+
"arena1", "org", "Acme Corp"
|
|
329
|
+
)
|
|
@@ -41,7 +41,7 @@ import psycopg
|
|
|
41
41
|
import psycopg.rows
|
|
42
42
|
|
|
43
43
|
from confidence import born_salience, corroborated_confidence
|
|
44
|
-
from entity_id import entity_id, normalize_surface_form
|
|
44
|
+
from entity_id import entity_id, normalize_surface_form, person_id_key
|
|
45
45
|
from source_time import event_source_time, parse_source_time
|
|
46
46
|
from extraction_schema import (
|
|
47
47
|
ALLOWED_ENT_TYPES,
|
|
@@ -801,6 +801,217 @@ def _digit_ratio(s: str) -> float:
|
|
|
801
801
|
return sum(c.isdigit() for c in stripped) / len(stripped)
|
|
802
802
|
|
|
803
803
|
|
|
804
|
+
# ----------------------------------------------------------------------
|
|
805
|
+
# Org-domain hard-key stamping (entity-ontology-the-spine §IV / registry §B)
|
|
806
|
+
#
|
|
807
|
+
# An organization's email DOMAIN is its deterministic hard key — the join that
|
|
808
|
+
# unifies the same org seen across sources and kills name-fragmentation (the
|
|
809
|
+
# LEGO-×12 / Johann-×5 problem). The async LLM pass mints org entities by NAME
|
|
810
|
+
# from prose; the DOMAIN lives in the event's structured envelope (contact_email
|
|
811
|
+
# / to_emails / cc_emails / author). The structured-data-graph audit found Seesa
|
|
812
|
+
# *emits* this bag but the graph *drops* it. Here we bring the two together:
|
|
813
|
+
# derive corporate domains from the envelope, and — only when a domain's
|
|
814
|
+
# registrable label matches the org's name — stamp it onto that org as a
|
|
815
|
+
# resolution alias (so domain-sharing orgs merge via the existing alias-overlap
|
|
816
|
+
# path) and as an `attributes.domain` hard key on the node.
|
|
817
|
+
#
|
|
818
|
+
# Forward-only: new events get stamped; a re-distill backfills history. Org-only
|
|
819
|
+
# — the sync pass never mints orgs (it emits person/document from the envelope),
|
|
820
|
+
# so this lives entirely in the async worker and leaves entity-id parity intact.
|
|
821
|
+
# ----------------------------------------------------------------------
|
|
822
|
+
|
|
823
|
+
# Consumer / free email providers — a shared domain here means "both use Gmail",
|
|
824
|
+
# NOT "same org", so an org must never be keyed on these.
|
|
825
|
+
FREEMAIL_DOMAINS = frozenset({
|
|
826
|
+
"gmail.com", "googlemail.com", "outlook.com", "hotmail.com", "live.com",
|
|
827
|
+
"msn.com", "yahoo.com", "yahoo.co.uk", "ymail.com", "icloud.com", "me.com",
|
|
828
|
+
"mac.com", "aol.com", "proton.me", "protonmail.com", "gmx.com", "gmx.net",
|
|
829
|
+
"mail.com", "zoho.com", "yandex.com", "fastmail.com", "qq.com", "163.com",
|
|
830
|
+
"126.com",
|
|
831
|
+
})
|
|
832
|
+
|
|
833
|
+
# The tenant's own domain(s) — our people, not an external org; never key an org
|
|
834
|
+
# on our domain (it would collapse unrelated externals under us). Comma-list via
|
|
835
|
+
# env; defaults to the known tenant.
|
|
836
|
+
TENANT_EMAIL_DOMAINS = frozenset(
|
|
837
|
+
d.strip().lower()
|
|
838
|
+
for d in os.environ.get("TENANT_EMAIL_DOMAINS", "pentatonic.com").split(",")
|
|
839
|
+
if d.strip()
|
|
840
|
+
)
|
|
841
|
+
|
|
842
|
+
# Multi-label public suffixes we actually see — enough to pull the registrable
|
|
843
|
+
# label ("boots" from boots.co.uk) without a full public-suffix-list dependency.
|
|
844
|
+
_MULTI_LABEL_TLDS = frozenset({
|
|
845
|
+
"co.uk", "org.uk", "ac.uk", "gov.uk", "me.uk", "co.jp", "com.au", "net.au",
|
|
846
|
+
"org.au", "co.nz", "com.br", "co.in", "com.cn", "co.za", "com.sg", "com.hk",
|
|
847
|
+
})
|
|
848
|
+
|
|
849
|
+
_EMAIL_DOMAIN_RE = re.compile(r"[\w.+-]+@([a-z0-9](?:[a-z0-9-]*[a-z0-9])?(?:\.[a-z0-9-]+)+)", re.I)
|
|
850
|
+
|
|
851
|
+
|
|
852
|
+
def _domain_of(value: str) -> str | None:
|
|
853
|
+
"""Lowercased domain of an email address found in ``value``, else None."""
|
|
854
|
+
if not value or "@" not in value:
|
|
855
|
+
return None
|
|
856
|
+
m = _EMAIL_DOMAIN_RE.search(value)
|
|
857
|
+
return m.group(1).lower() if m else None
|
|
858
|
+
|
|
859
|
+
|
|
860
|
+
def _bare_domain(value: str) -> str | None:
|
|
861
|
+
"""Normalise a structured domain field to a bare host — 'acme.com' from
|
|
862
|
+
'Acme.com', 'https://www.acme.com/about', etc. Used for system-of-record
|
|
863
|
+
records (e.g. a CRM company) that carry the org's domain DIRECTLY, not via an
|
|
864
|
+
email envelope. None when it isn't a plausible domain (no dot, has a space,
|
|
865
|
+
or looks like an email — those go through `_domain_of`)."""
|
|
866
|
+
if not isinstance(value, str):
|
|
867
|
+
return None
|
|
868
|
+
d = value.strip().lower()
|
|
869
|
+
if not d:
|
|
870
|
+
return None
|
|
871
|
+
d = d.split("//")[-1].split("/")[0].split("?")[0] # strip scheme + path
|
|
872
|
+
if d.startswith("www."):
|
|
873
|
+
d = d[4:]
|
|
874
|
+
if "." not in d or " " in d or "@" in d:
|
|
875
|
+
return None
|
|
876
|
+
return d
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
def event_org_domains(attrs: dict[str, Any] | None) -> set[str]:
|
|
880
|
+
"""Corporate domains in an event's structured data — the deterministic
|
|
881
|
+
source of org hard keys. Two structured sources:
|
|
882
|
+
- the participant ENVELOPE: ``contact_email``/``from_email``/``author`` and
|
|
883
|
+
the recipient lists ``to_emails``/``cc_emails`` (list or delimited
|
|
884
|
+
string), domain taken from each address;
|
|
885
|
+
- a system-of-record's DIRECT domain field (``domain``/``company_domain``/
|
|
886
|
+
``org_domain``) — e.g. a CRM company record carries the org's domain even
|
|
887
|
+
though there's no email envelope (it would otherwise only sit in prose).
|
|
888
|
+
Drops freemail + the tenant's own domain. Empty set on missing/garbage input
|
|
889
|
+
— never raises (this runs in the hot upsert path)."""
|
|
890
|
+
if not attrs:
|
|
891
|
+
return set()
|
|
892
|
+
domains: set[str] = set()
|
|
893
|
+
raw: list[str] = []
|
|
894
|
+
for key in ("contact_email", "from_email", "author"):
|
|
895
|
+
v = attrs.get(key)
|
|
896
|
+
if isinstance(v, str):
|
|
897
|
+
raw.append(v)
|
|
898
|
+
for key in ("to_emails", "cc_emails"):
|
|
899
|
+
v = attrs.get(key)
|
|
900
|
+
if isinstance(v, str):
|
|
901
|
+
raw.extend(v.replace(";", ",").split(","))
|
|
902
|
+
elif isinstance(v, list):
|
|
903
|
+
raw.extend(x for x in v if isinstance(x, str))
|
|
904
|
+
for r in raw:
|
|
905
|
+
domains.add(_domain_of(r))
|
|
906
|
+
for key in ("domain", "company_domain", "org_domain"):
|
|
907
|
+
domains.add(_bare_domain(attrs.get(key)))
|
|
908
|
+
return {
|
|
909
|
+
d for d in domains
|
|
910
|
+
if d and d not in FREEMAIL_DOMAINS and d not in TENANT_EMAIL_DOMAINS
|
|
911
|
+
}
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
def domain_label(domain: str) -> str | None:
|
|
915
|
+
"""Registrable label of a domain — 'lego' from 'lego.com', 'boots' from
|
|
916
|
+
'boots.co.uk'. Used only for the name↔domain match below."""
|
|
917
|
+
if not domain:
|
|
918
|
+
return None
|
|
919
|
+
parts = domain.lower().strip(".").split(".")
|
|
920
|
+
if len(parts) < 2:
|
|
921
|
+
return None
|
|
922
|
+
if ".".join(parts[-2:]) in _MULTI_LABEL_TLDS and len(parts) >= 3:
|
|
923
|
+
return parts[-3]
|
|
924
|
+
return parts[-2]
|
|
925
|
+
|
|
926
|
+
|
|
927
|
+
def match_org_domain(org_name: str, domains: set[str]) -> str | None:
|
|
928
|
+
"""Return the one envelope domain that deterministically belongs to this
|
|
929
|
+
org — i.e. whose registrable label matches the org's name — else None.
|
|
930
|
+
|
|
931
|
+
This is the precision guard that makes envelope-derived stamping SAFE: an
|
|
932
|
+
email *from* acme.com that merely mentions "Globex" must not stamp acme.com
|
|
933
|
+
onto Globex. We match only when the domain's label is a whole name token
|
|
934
|
+
("lego" ∈ "LEGO Group") or equals the de-spaced name ("mastercard" ↔
|
|
935
|
+
"Mastercard"). If two different domains match (genuine ambiguity), return
|
|
936
|
+
None and let fusion decide rather than guess.
|
|
937
|
+
|
|
938
|
+
We deliberately do NOT prefix-match the squished name. A `startswith`
|
|
939
|
+
rule (the original #111 form) over-fires whenever one org's name starts
|
|
940
|
+
with another org's label — "apple".startswith on "applebees" stamps
|
|
941
|
+
apple.com onto Applebee's; likewise lego.com→Legoland, micro.com→Microsoft.
|
|
942
|
+
Because the stamp becomes a resolution alias, that mis-merges two real orgs
|
|
943
|
+
at INGEST (immediate, not behind the Fusion Drive's dry-run/audit gate) —
|
|
944
|
+
the one path here that can silently fuse distinct orgs. The legitimate
|
|
945
|
+
"LEGO Group" case is already covered by the whole-token rule, so the prefix
|
|
946
|
+
clause added only false-positive surface for no real gain."""
|
|
947
|
+
if not org_name or not domains:
|
|
948
|
+
return None
|
|
949
|
+
norm = normalize_surface_form(org_name)
|
|
950
|
+
tokens = {t for t in re.split(r"[^a-z0-9]+", norm) if len(t) >= 2}
|
|
951
|
+
squished = norm.replace(" ", "")
|
|
952
|
+
matched: set[str] = set()
|
|
953
|
+
for d in domains:
|
|
954
|
+
label = domain_label(d)
|
|
955
|
+
if not label or len(label) < 2:
|
|
956
|
+
continue
|
|
957
|
+
if label in tokens or label == squished:
|
|
958
|
+
matched.add(d)
|
|
959
|
+
return next(iter(matched)) if len(matched) == 1 else None
|
|
960
|
+
|
|
961
|
+
|
|
962
|
+
# System-of-record record-type → core ontology type. A SoR event (a CRM record,
|
|
963
|
+
# a finance record, …) represents ONE primary entity of a known type; we map the
|
|
964
|
+
# producer's record-type tag onto the core type so we can key that entity on the
|
|
965
|
+
# record's external id. Extend per connector as new SoRs land.
|
|
966
|
+
_HUBSPOT_KIND_TO_TYPE = {"company": "org", "contact": "person"}
|
|
967
|
+
|
|
968
|
+
|
|
969
|
+
def event_record_subject(attrs: dict[str, Any] | None) -> tuple[str | None, str | None]:
|
|
970
|
+
"""For an event that IS a system-of-record record, return
|
|
971
|
+
`(core_type, external_id)` for its single primary entity — else `(None, None)`.
|
|
972
|
+
|
|
973
|
+
The external id (`source_id` — already a composite `system:kind:id`, e.g.
|
|
974
|
+
`hubspot:company:12345`) is the registry's FALLBACK org/person hard key
|
|
975
|
+
(A.1): used when no email/domain resolves, so re-importing the same record
|
|
976
|
+
converges to one node (idempotency) and a domainless SoR entity isn't
|
|
977
|
+
re-minted on every sync. Only fires for records whose type maps to a core
|
|
978
|
+
type (company→org, contact→person); deal/ticket/custom-plugin types return
|
|
979
|
+
None (they aren't core graph entities). The caller additionally requires the
|
|
980
|
+
event to extract exactly ONE entity of `core_type` before keying on this —
|
|
981
|
+
so a record that also mentions other orgs/people never mis-stamps them."""
|
|
982
|
+
if not attrs:
|
|
983
|
+
return (None, None)
|
|
984
|
+
external_id = attrs.get("source_id")
|
|
985
|
+
if not isinstance(external_id, str) or not external_id.strip():
|
|
986
|
+
return (None, None)
|
|
987
|
+
hubspot_kind = attrs.get("hubspot_kind")
|
|
988
|
+
if isinstance(hubspot_kind, str):
|
|
989
|
+
core = _HUBSPOT_KIND_TO_TYPE.get(hubspot_kind.lower())
|
|
990
|
+
if core:
|
|
991
|
+
return (core, external_id.strip())
|
|
992
|
+
return (None, None)
|
|
993
|
+
|
|
994
|
+
|
|
995
|
+
def org_node_id_key(entity_type: str, name: str, stamped_domain: str | None) -> str:
|
|
996
|
+
"""The string `entity_id()` hashes to mint an entity's NODE id — its domain
|
|
997
|
+
hard key when we resolved one (org), else its name.
|
|
998
|
+
|
|
999
|
+
This is the id-level complement to the alias stamp (`match_org_domain`):
|
|
1000
|
+
the stamp makes the domain a resolution *alias* (caught by the upsert
|
|
1001
|
+
SELECT/lock); keying the node *id* on the domain makes convergence
|
|
1002
|
+
DETERMINISTIC and order-independent. Every event that resolves the same
|
|
1003
|
+
domain mints the SAME id, so they collapse to ONE node via `ON CONFLICT
|
|
1004
|
+
(id)` even if the SELECT and the advisory lock both miss — and a re-distill
|
|
1005
|
+
of the corpus yields exactly one org node per domain regardless of event
|
|
1006
|
+
order, instead of a name-keyed pile Fusion has to chase (the entity-ontology
|
|
1007
|
+
spine's "the big structural fix"). Org-only for now; person→email and
|
|
1008
|
+
external-id are the symmetric follow-ups. Falls back to the name for
|
|
1009
|
+
non-orgs and for orgs we couldn't resolve a domain for (prose-only orgs)."""
|
|
1010
|
+
if entity_type == "org" and stamped_domain:
|
|
1011
|
+
return stamped_domain
|
|
1012
|
+
return name
|
|
1013
|
+
|
|
1014
|
+
|
|
804
1015
|
def upsert_entities(
|
|
805
1016
|
conn: psycopg.Connection,
|
|
806
1017
|
arena: str,
|
|
@@ -809,6 +1020,7 @@ def upsert_entities(
|
|
|
809
1020
|
disclosure_class: str,
|
|
810
1021
|
entities: list[dict],
|
|
811
1022
|
event_time: datetime | None,
|
|
1023
|
+
event_attrs: dict[str, Any] | None = None,
|
|
812
1024
|
) -> dict[str, str]:
|
|
813
1025
|
"""Alias-aware insert (or merge) of entities; returns a name→id
|
|
814
1026
|
map so facts and relationships can link to the inserted rows.
|
|
@@ -836,10 +1048,28 @@ def upsert_entities(
|
|
|
836
1048
|
serialises concurrent writers (sync + async on the same event)
|
|
837
1049
|
on the same surface form. (RFC steps 2 + 2a.)
|
|
838
1050
|
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
1051
|
+
3. **Org-domain hard-key stamping + id keying** — when `event_attrs` carries
|
|
1052
|
+
the structured envelope, the corporate email domain that matches an `org`
|
|
1053
|
+
entity's name is stamped onto it as a resolution alias (so domain-sharing
|
|
1054
|
+
orgs merge via the alias-overlap path above) and as `attributes.domain`
|
|
1055
|
+
on the node. The new node's *id* is then minted from that domain
|
|
1056
|
+
(`org_node_id_key`) rather than the name, so same-domain orgs converge to
|
|
1057
|
+
ONE id deterministically — robust to a SELECT/lock miss and order-
|
|
1058
|
+
independent under a re-distill. This is the deterministic hard key the
|
|
1059
|
+
registry wants for org; see `event_org_domains`/`match_org_domain` and
|
|
1060
|
+
entity-ontology-the-spine.md §IV.
|
|
1061
|
+
|
|
1062
|
+
4. **SoR external-id fallback** — when the event IS a system-of-record record
|
|
1063
|
+
(`event_record_subject`) and extracts exactly ONE entity of the record's
|
|
1064
|
+
core type, that subject — if it has no email/domain hard key — is keyed on
|
|
1065
|
+
the record's external id and carries `attributes.external_id`, so
|
|
1066
|
+
re-imports of the same record converge (registry precedence: email/domain
|
|
1067
|
+
→ external-id → name). email/domain still win when present.
|
|
1068
|
+
|
|
1069
|
+
MIRROR of extractor-sync/server.py:_upsert_entities for the resolution
|
|
1070
|
+
algorithm. The sync pass never mints `org` entities (it emits
|
|
1071
|
+
person/document from the envelope), so org-domain stamping is async-only and
|
|
1072
|
+
leaves the entity-id parity test untouched.
|
|
843
1073
|
|
|
844
1074
|
Returns name→id where `name` is the LLM-emitted surface form
|
|
845
1075
|
(canonical) so facts/relationships using the same surface form
|
|
@@ -847,6 +1077,22 @@ def upsert_entities(
|
|
|
847
1077
|
name_to_id: dict[str, str] = {}
|
|
848
1078
|
if not entities:
|
|
849
1079
|
return name_to_id
|
|
1080
|
+
# Corporate domains in this event's structured envelope — the deterministic
|
|
1081
|
+
# source of org hard keys (computed once per event, not per entity).
|
|
1082
|
+
org_domains = event_org_domains(event_attrs)
|
|
1083
|
+
# System-of-record external-id fallback: if this event IS a single record
|
|
1084
|
+
# (e.g. a HubSpot company/contact) AND it extracts exactly ONE entity of the
|
|
1085
|
+
# record's core type, that entity is the record's subject and may be keyed on
|
|
1086
|
+
# the record's external id (the registry fallback, after email/domain). The
|
|
1087
|
+
# exactly-one guard is the precision gate — a record that also names other
|
|
1088
|
+
# orgs/people leaves `subject_ext_id` None so nothing is mis-stamped.
|
|
1089
|
+
subject_type, subject_ext_id = event_record_subject(event_attrs)
|
|
1090
|
+
if subject_type:
|
|
1091
|
+
n_of_type = sum(
|
|
1092
|
+
1 for e in entities if (e.get("type") or "other").lower() == subject_type
|
|
1093
|
+
)
|
|
1094
|
+
if n_of_type != 1:
|
|
1095
|
+
subject_ext_id = None
|
|
850
1096
|
with conn.cursor() as cur:
|
|
851
1097
|
for e in entities:
|
|
852
1098
|
etype = (e.get("type") or "other").lower()
|
|
@@ -858,6 +1104,27 @@ def upsert_entities(
|
|
|
858
1104
|
continue
|
|
859
1105
|
aliases = [a for a in (e.get("aliases") or []) if a]
|
|
860
1106
|
|
|
1107
|
+
# Hard-key stamps for THIS entity, merged onto the node's attributes
|
|
1108
|
+
# and (for domain) into the resolution aliases. Adding domain to
|
|
1109
|
+
# aliases before forms are computed is deliberate — that's what makes
|
|
1110
|
+
# the alias-overlap merge pick it up.
|
|
1111
|
+
attrs_patch: dict[str, str] = {}
|
|
1112
|
+
stamped_domain: str | None = None
|
|
1113
|
+
if etype == "org" and org_domains:
|
|
1114
|
+
stamped_domain = match_org_domain(name, org_domains)
|
|
1115
|
+
if stamped_domain:
|
|
1116
|
+
if stamped_domain not in aliases:
|
|
1117
|
+
aliases.append(stamped_domain)
|
|
1118
|
+
attrs_patch["domain"] = stamped_domain
|
|
1119
|
+
# SoR external-id: the single subject of a record event carries the
|
|
1120
|
+
# record's external id (resolution alias + `attributes.external_id`).
|
|
1121
|
+
is_record_subject = bool(subject_ext_id) and etype == subject_type
|
|
1122
|
+
if is_record_subject and subject_ext_id:
|
|
1123
|
+
if subject_ext_id not in aliases:
|
|
1124
|
+
aliases.append(subject_ext_id)
|
|
1125
|
+
attrs_patch["external_id"] = subject_ext_id
|
|
1126
|
+
attrs_update = json.dumps(attrs_patch)
|
|
1127
|
+
|
|
861
1128
|
# Sort (don't `list(set(...))`) so lock acquisition order
|
|
862
1129
|
# is deterministic across processes — set-iteration order
|
|
863
1130
|
# depends on Python's per-process hash randomisation, so
|
|
@@ -902,17 +1169,46 @@ def upsert_entities(
|
|
|
902
1169
|
UPDATE entities SET
|
|
903
1170
|
aliases = ARRAY(SELECT DISTINCT UNNEST(aliases || %s::text[])),
|
|
904
1171
|
provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(provenance_event_ids || %s::text[])),
|
|
1172
|
+
-- Merge the org-domain hard key (no-op for the `{}` default
|
|
1173
|
+
-- on non-orgs / unmatched orgs; never clobbers an existing key).
|
|
1174
|
+
attributes = entities.attributes || %s::jsonb,
|
|
905
1175
|
-- Widen the seen-window with this event's SOURCE
|
|
906
1176
|
-- time, not NOW(): newest evidence = newest source.
|
|
907
1177
|
last_seen = GREATEST(last_seen, COALESCE(%s, NOW())),
|
|
908
1178
|
first_seen = LEAST(first_seen, COALESCE(%s, NOW()))
|
|
909
1179
|
WHERE id = %s
|
|
910
1180
|
""",
|
|
911
|
-
(aliases, [event_id], event_time, event_time, eid),
|
|
1181
|
+
(aliases, [event_id], attrs_update, event_time, event_time, eid),
|
|
912
1182
|
)
|
|
913
1183
|
else:
|
|
914
|
-
# 3b. No match — insert new.
|
|
915
|
-
|
|
1184
|
+
# 3b. No match — insert new. Key the node on its HARD KEY so the
|
|
1185
|
+
# same real thing mints the SAME id and converges via ON CONFLICT
|
|
1186
|
+
# even if the SELECT/lock missed (task #2): org → its resolved
|
|
1187
|
+
# DOMAIN (org_node_id_key); person → its EMAIL via the SHARED
|
|
1188
|
+
# person_id_key, so this matches the sync pass byte-for-byte
|
|
1189
|
+
# (sync mints person nodes at ingest — see entity_id.py). Email
|
|
1190
|
+
# is carried as an alias on async persons (LLM-promoted); pull it
|
|
1191
|
+
# back out here. Everything else falls back to the name.
|
|
1192
|
+
if etype == "person":
|
|
1193
|
+
person_email = next(
|
|
1194
|
+
(a for a in aliases if a and "@" in a and " " not in a),
|
|
1195
|
+
None,
|
|
1196
|
+
)
|
|
1197
|
+
id_key = person_id_key(name, person_email)
|
|
1198
|
+
has_hard_key = person_email is not None
|
|
1199
|
+
elif etype == "org":
|
|
1200
|
+
id_key = org_node_id_key(etype, name, stamped_domain)
|
|
1201
|
+
has_hard_key = stamped_domain is not None
|
|
1202
|
+
else:
|
|
1203
|
+
id_key = name
|
|
1204
|
+
has_hard_key = False
|
|
1205
|
+
# FALLBACK: a SoR record's single subject with no email/domain
|
|
1206
|
+
# keys on the record's external id, so re-imports of that record
|
|
1207
|
+
# converge to one node (registry precedence: email/domain →
|
|
1208
|
+
# external-id → name). email/domain still win when present.
|
|
1209
|
+
if not has_hard_key and is_record_subject and subject_ext_id:
|
|
1210
|
+
id_key = subject_ext_id
|
|
1211
|
+
eid = entity_id(arena, etype, id_key)
|
|
916
1212
|
# Fusion Drive born-salience: a numeric-ID-as-person (classic
|
|
917
1213
|
# 7B junk that slips past noise_filter, e.g. "1716801984") is
|
|
918
1214
|
# born near the floor so the decay pass can evict it on a short
|
|
@@ -926,10 +1222,10 @@ def upsert_entities(
|
|
|
926
1222
|
INSERT INTO entities (
|
|
927
1223
|
id, arena, entity_type, canonical_name, aliases,
|
|
928
1224
|
provenance_event_ids, participant_set, disclosure_class, salience,
|
|
929
|
-
first_seen, last_seen
|
|
1225
|
+
attributes, first_seen, last_seen
|
|
930
1226
|
) VALUES (
|
|
931
1227
|
%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class, %s,
|
|
932
|
-
COALESCE(%s, NOW()), COALESCE(%s, NOW())
|
|
1228
|
+
%s::jsonb, COALESCE(%s, NOW()), COALESCE(%s, NOW())
|
|
933
1229
|
)
|
|
934
1230
|
ON CONFLICT (id) DO UPDATE SET
|
|
935
1231
|
aliases = (
|
|
@@ -938,6 +1234,8 @@ def upsert_entities(
|
|
|
938
1234
|
provenance_event_ids = (
|
|
939
1235
|
SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
|
|
940
1236
|
),
|
|
1237
|
+
-- merge the org-domain hard key (no-op for `{}`)
|
|
1238
|
+
attributes = entities.attributes || EXCLUDED.attributes,
|
|
941
1239
|
-- re-corroboration can only RAISE salience, never lower it
|
|
942
1240
|
salience = GREATEST(entities.salience, EXCLUDED.salience),
|
|
943
1241
|
-- widen the seen-window on SOURCE time, not NOW()
|
|
@@ -947,13 +1245,36 @@ def upsert_entities(
|
|
|
947
1245
|
(
|
|
948
1246
|
eid, arena, etype, name, aliases,
|
|
949
1247
|
[event_id], participant_set, disclosure_class, _sal,
|
|
950
|
-
event_time, event_time,
|
|
1248
|
+
attrs_update, event_time, event_time,
|
|
951
1249
|
),
|
|
952
1250
|
)
|
|
953
1251
|
name_to_id[name] = eid
|
|
954
1252
|
return name_to_id
|
|
955
1253
|
|
|
956
1254
|
|
|
1255
|
+
def fact_source(event: dict[str, Any]) -> str | None:
|
|
1256
|
+
"""Derive the originating SOURCE label to stamp onto a fact.
|
|
1257
|
+
|
|
1258
|
+
Prefer the finer producer label `attributes.source` (gmail / slack /
|
|
1259
|
+
hubspot / granola / drive / github) over the coarse `source_kind`
|
|
1260
|
+
enum (chat / note / system / ...), because SoR-drift detection needs
|
|
1261
|
+
"CRM vs email" granularity, which the enum can't express (both gmail
|
|
1262
|
+
and a draft are `note`; both HubSpot and an ERP snapshot are
|
|
1263
|
+
`system`). Fall back to `source_kind` when the producer didn't supply
|
|
1264
|
+
a finer label, and to None only when neither is present — NULL ==
|
|
1265
|
+
source-unknown, matching the pre-009 state of existing rows so the
|
|
1266
|
+
nullable column never lies. Pure + total: never raises, so it can't
|
|
1267
|
+
break the distill path; the caller stamps whatever it returns."""
|
|
1268
|
+
attrs = event.get("attributes") or {}
|
|
1269
|
+
src = attrs.get("source")
|
|
1270
|
+
if isinstance(src, str) and src.strip():
|
|
1271
|
+
return src.strip()
|
|
1272
|
+
kind = event.get("source_kind")
|
|
1273
|
+
if isinstance(kind, str) and kind.strip():
|
|
1274
|
+
return kind.strip()
|
|
1275
|
+
return None
|
|
1276
|
+
|
|
1277
|
+
|
|
957
1278
|
def upsert_facts(
|
|
958
1279
|
conn: psycopg.Connection,
|
|
959
1280
|
arena: str,
|
|
@@ -964,6 +1285,7 @@ def upsert_facts(
|
|
|
964
1285
|
name_to_id: dict[str, str],
|
|
965
1286
|
event_time: datetime | None,
|
|
966
1287
|
due_at: datetime | None = None,
|
|
1288
|
+
source: str | None = None,
|
|
967
1289
|
) -> int:
|
|
968
1290
|
"""Facts are content-hashed on (arena, statement). Same statement
|
|
969
1291
|
extracted from any event in the arena converges to the same row,
|
|
@@ -989,7 +1311,15 @@ def upsert_facts(
|
|
|
989
1311
|
corroborating events: facts have no `last_seen`, so #92's decay uses
|
|
990
1312
|
`asserted_at` as the recency clock and resets it on re-corroboration
|
|
991
1313
|
— order-stable regardless of distill order. `due_at` (the source
|
|
992
|
-
event's structured deadline, if any) populates `effective_until`.
|
|
1314
|
+
event's structured deadline, if any) populates `effective_until`.
|
|
1315
|
+
|
|
1316
|
+
`source` is the originating producer label (`fact_source()` — finer
|
|
1317
|
+
`attributes.source` else coarse `source_kind`), stamped so a reader
|
|
1318
|
+
can tell "this came from the CRM" from "this came from an email" —
|
|
1319
|
+
the foundation for SoR-drift detection. Accrete-only on conflict:
|
|
1320
|
+
COALESCE keeps the first-known source and only fills it when a prior
|
|
1321
|
+
extraction left it NULL, so corroboration never rewrites provenance
|
|
1322
|
+
(and a NULL — pre-009 rows, or a source-less event — stays NULL)."""
|
|
993
1323
|
if not facts:
|
|
994
1324
|
return 0
|
|
995
1325
|
inserted = 0
|
|
@@ -1015,11 +1345,11 @@ def upsert_facts(
|
|
|
1015
1345
|
id, arena, category, subject_entity_id, predicate,
|
|
1016
1346
|
object_entity_id, statement, provenance_event_ids,
|
|
1017
1347
|
stage, confidence, participant_set, disclosure_class, salience,
|
|
1018
|
-
asserted_at, effective_until
|
|
1348
|
+
asserted_at, effective_until, source
|
|
1019
1349
|
) VALUES (
|
|
1020
1350
|
%s, %s, %s, %s, %s, %s, %s, %s,
|
|
1021
1351
|
'provisional'::extraction_stage, %s, %s, %s::disclosure_class, %s,
|
|
1022
|
-
COALESCE(%s, NOW()), %s
|
|
1352
|
+
COALESCE(%s, NOW()), %s, %s
|
|
1023
1353
|
)
|
|
1024
1354
|
ON CONFLICT (id) DO UPDATE SET
|
|
1025
1355
|
provenance_event_ids = (
|
|
@@ -1057,7 +1387,11 @@ def upsert_facts(
|
|
|
1057
1387
|
-- This also makes it order-stable (independent of
|
|
1058
1388
|
-- distill order). EXCLUDED.asserted_at is the
|
|
1059
1389
|
-- COALESCE(event_time, NOW()) from the INSERT above.
|
|
1060
|
-
asserted_at = GREATEST(facts.asserted_at, EXCLUDED.asserted_at)
|
|
1390
|
+
asserted_at = GREATEST(facts.asserted_at, EXCLUDED.asserted_at),
|
|
1391
|
+
-- Accrete-only: keep the first-known source, only fill
|
|
1392
|
+
-- it if a prior extraction left it NULL. Corroboration
|
|
1393
|
+
-- must not rewrite where the fact first came from.
|
|
1394
|
+
source = COALESCE(facts.source, EXCLUDED.source)
|
|
1061
1395
|
""",
|
|
1062
1396
|
(
|
|
1063
1397
|
_content_id(arena, stmt),
|
|
@@ -1074,6 +1408,7 @@ def upsert_facts(
|
|
|
1074
1408
|
_fsal,
|
|
1075
1409
|
event_time,
|
|
1076
1410
|
due_at,
|
|
1411
|
+
source,
|
|
1077
1412
|
),
|
|
1078
1413
|
)
|
|
1079
1414
|
inserted += 1
|
|
@@ -1551,15 +1886,20 @@ async def process_batch(
|
|
|
1551
1886
|
# behaviour). Only `attributes.due_at` is honoured; we do NOT
|
|
1552
1887
|
# guess deadlines from free text here.
|
|
1553
1888
|
due_at = parse_source_time((event.get("attributes") or {}).get("due_at"))
|
|
1889
|
+
# ORIGINATING SOURCE of this event, stamped onto its facts so
|
|
1890
|
+
# downstream can tell CRM-asserted from email-asserted (the
|
|
1891
|
+
# SoR-drift foundation). Finer `attributes.source` else coarse
|
|
1892
|
+
# `source_kind`; None ⇒ column stays NULL (source-unknown).
|
|
1893
|
+
src = fact_source(event)
|
|
1554
1894
|
|
|
1555
1895
|
try:
|
|
1556
1896
|
name_to_id = upsert_entities(
|
|
1557
1897
|
conn, arena, event_id, participant_set, disclosure, ents,
|
|
1558
|
-
event_time,
|
|
1898
|
+
event_time, event.get("attributes"),
|
|
1559
1899
|
)
|
|
1560
1900
|
n_facts = upsert_facts(
|
|
1561
1901
|
conn, arena, event_id, participant_set, disclosure, facts, name_to_id,
|
|
1562
|
-
event_time, due_at,
|
|
1902
|
+
event_time, due_at, src,
|
|
1563
1903
|
)
|
|
1564
1904
|
n_rels = upsert_relationships(
|
|
1565
1905
|
conn, arena, event_id, participant_set, disclosure, rels, name_to_id,
|
|
@@ -55,3 +55,23 @@ def entity_id(arena: str, entity_type: str, canonical_name: str) -> str:
|
|
|
55
55
|
"""
|
|
56
56
|
key = f"{arena}|{entity_type}|{normalize_surface_form(canonical_name)}"
|
|
57
57
|
return "e_" + hashlib.sha256(key.encode()).hexdigest()[:24]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def person_id_key(name: str | None, email: str | None) -> str:
|
|
61
|
+
"""The string `entity_id()` should hash to mint a PERSON's node id — the
|
|
62
|
+
EMAIL (the person's deterministic hard key) when present, else the name.
|
|
63
|
+
|
|
64
|
+
Lives HERE, in the byte-identical shared file, BECAUSE both extractors mint
|
|
65
|
+
person nodes and must agree: the sync pass builds them from the envelope at
|
|
66
|
+
ingest (it has the email), the async pass builds them from prose (email
|
|
67
|
+
promoted to an alias). Keying both on the email means the same person mints
|
|
68
|
+
the SAME id from either pass and in any processing order — converging to one
|
|
69
|
+
node instead of sync's name-keyed node and async's node racing, or a
|
|
70
|
+
re-distill re-homing them differently. The name is the fallback for a person
|
|
71
|
+
with no email (Fusion still merges those fuzzily). `entity_id()` normalises
|
|
72
|
+
(lowercase + trim) the result, so casing/whitespace variants of an email
|
|
73
|
+
collapse. (Org keying is the async-only `org_node_id_key`; person keying is
|
|
74
|
+
cross-pass, so it belongs in this parity-guarded file.)
|
|
75
|
+
"""
|
|
76
|
+
e = (email or "").strip()
|
|
77
|
+
return e if e else (name or "")
|
|
@@ -32,7 +32,7 @@ from typing import Any
|
|
|
32
32
|
|
|
33
33
|
# Canonical entity-ID scheme — byte-identical copy in extractor-async (entity_id.py).
|
|
34
34
|
from confidence import born_salience
|
|
35
|
-
from entity_id import entity_id, normalize_surface_form # noqa: F401
|
|
35
|
+
from entity_id import entity_id, normalize_surface_form, person_id_key # noqa: F401
|
|
36
36
|
# Source-time parsing — byte-identical copy in extractor-async (source_time.py).
|
|
37
37
|
from source_time import event_source_time
|
|
38
38
|
|
|
@@ -250,12 +250,17 @@ def _person_entity(
|
|
|
250
250
|
if not name and not email:
|
|
251
251
|
return None
|
|
252
252
|
|
|
253
|
-
# Prefer name as
|
|
254
|
-
# (accrete-only). When only email is available, fall
|
|
255
|
-
# canonical — a later event carrying the pair will
|
|
256
|
-
# to this row and add the name as an alias, but won't rename
|
|
257
|
-
# canonical (deferred to a follow-up; see §2b).
|
|
253
|
+
# Prefer name as canonical_name (the human-readable display) when both
|
|
254
|
+
# present. RFC §1 + §2b (accrete-only). When only email is available, fall
|
|
255
|
+
# back to email canonical — a later event carrying the pair will
|
|
256
|
+
# alias-resolve to this row and add the name as an alias, but won't rename
|
|
257
|
+
# the canonical (deferred to a follow-up; see §2b).
|
|
258
258
|
canonical = name if name else email
|
|
259
|
+
# …but key the node ID on the EMAIL hard key when present (task #2), via the
|
|
260
|
+
# SHARED person_id_key so sync (here) and the async pass mint the SAME id for
|
|
261
|
+
# the same person regardless of order — deterministic convergence, not a
|
|
262
|
+
# name-vs-email race. Display name and id key are now decoupled.
|
|
263
|
+
id_key = person_id_key(name, email)
|
|
259
264
|
aliases_set: set[str] = set()
|
|
260
265
|
if name:
|
|
261
266
|
aliases_set.add(name)
|
|
@@ -266,7 +271,7 @@ def _person_entity(
|
|
|
266
271
|
aliases_set.add(a)
|
|
267
272
|
|
|
268
273
|
return {
|
|
269
|
-
"id": _entity_id(req.arena, "person",
|
|
274
|
+
"id": _entity_id(req.arena, "person", id_key),
|
|
270
275
|
"arena": req.arena,
|
|
271
276
|
"entity_type": "person",
|
|
272
277
|
"canonical_name": canonical,
|
|
@@ -62,16 +62,21 @@ def stub_req():
|
|
|
62
62
|
|
|
63
63
|
|
|
64
64
|
def test_person_entity_name_and_email_pair(stub_req) -> None:
|
|
65
|
-
"""When both name and email present →
|
|
66
|
-
BOTH forms in
|
|
65
|
+
"""When both name and email present → display name is the canonical_name,
|
|
66
|
+
but the node ID keys on the EMAIL hard key (task #2), with BOTH forms in
|
|
67
|
+
aliases."""
|
|
67
68
|
e = sync_server._person_entity(
|
|
68
69
|
stub_req, "evt1", name="Carly Snider", email="carly@example.com"
|
|
69
70
|
)
|
|
70
71
|
assert e is not None
|
|
71
72
|
assert e["canonical_name"] == "Carly Snider"
|
|
72
73
|
assert set(e["aliases"]) == {"Carly Snider", "carly@example.com"}
|
|
73
|
-
# Id is derived from the
|
|
74
|
-
|
|
74
|
+
# Id is derived from the EMAIL hard key (not the name) — so the same person
|
|
75
|
+
# converges across passes/sources/spellings. Display name is decoupled.
|
|
76
|
+
assert e["id"] == sync_server._entity_id(
|
|
77
|
+
"tenant:test", "person", "carly@example.com"
|
|
78
|
+
)
|
|
79
|
+
assert e["id"] != sync_server._entity_id("tenant:test", "person", "Carly Snider")
|
|
75
80
|
|
|
76
81
|
|
|
77
82
|
def test_person_entity_email_only(stub_req) -> None:
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
-- 009: stamp the originating SOURCE onto each fact (SoR-drift foundation).
|
|
2
|
+
--
|
|
3
|
+
-- The distiller extracts facts from events but drops WHICH SOURCE each
|
|
4
|
+
-- fact came from. The source lives on the event (`events.source_kind` +
|
|
5
|
+
-- the finer `attributes.source`) but never reaches the `facts` table, so
|
|
6
|
+
-- a reader can't tell "this came from the CRM" from "this came from an
|
|
7
|
+
-- email". That distinction is the foundation for system-of-record drift
|
|
8
|
+
-- detection (e.g. CRM says a deal is active, a newer email says it was
|
|
9
|
+
-- rejected). This column makes the signal available; the drift READ /
|
|
10
|
+
-- detection path is an explicit follow-up and is NOT built here.
|
|
11
|
+
--
|
|
12
|
+
-- Forward-only + additive, per 001_init.sql header note 1 ("to iterate
|
|
13
|
+
-- the schema, add columns; never alter existing ones"). The column is
|
|
14
|
+
-- NULLABLE with no backfill: every historical fact and every existing
|
|
15
|
+
-- reader is unaffected (NULL = source-unknown, the pre-009 state). New
|
|
16
|
+
-- distillations stamp it from the finer `attributes.source` when the
|
|
17
|
+
-- producer supplied one, else the coarse `source_kind` enum value — see
|
|
18
|
+
-- `fact_source()` in extractor-async/worker.py for the derivation rule.
|
|
19
|
+
--
|
|
20
|
+
-- TEXT (not the `source_kind` enum) on purpose: we want the finer
|
|
21
|
+
-- producer label (gmail / slack / hubspot / granola / drive / github)
|
|
22
|
+
-- where available, which is exactly the granularity SoR-drift needs and
|
|
23
|
+
-- which the enum can't represent.
|
|
24
|
+
-- Additive and re-runnable: IF NOT EXISTS makes a repeat application a no-op,
-- and the column carries no NOT NULL / DEFAULT, so existing rows read NULL.
ALTER TABLE facts ADD COLUMN IF NOT EXISTS source TEXT;
|
|
@@ -55,3 +55,27 @@ def test_identical_output_across_copies():
|
|
|
55
55
|
for arena, etype, name in cases:
|
|
56
56
|
assert a.entity_id(arena, etype, name) == b.entity_id(arena, etype, name)
|
|
57
57
|
assert a.normalize_surface_form(name) == b.normalize_surface_form(name)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_person_id_key_parity_and_behaviour():
    """Both copies of `person_id_key` must agree, and the email must win.

    The function is the cross-pass person hard key, duplicated in the sync and
    async extractors; a person has to mint the same node id from either one.
    """
    sync_mod = _load(_SYNC, "entity_id_sync2")
    async_mod = _load(_ASYNC, "entity_id_async2")

    # Parity: identical output from both copies for every (name, email) shape.
    pairs = [
        ("Johann Boedecker", "johann@pentatonic.com"),
        ("Johann Boedecker", None),
        (None, "carly@pact.org"),
        ("  Ben Gordon ", " Ben@Acme.com "),
        (None, None),
    ]
    for person_name, person_email in pairs:
        from_sync = sync_mod.person_id_key(person_name, person_email)
        from_async = async_mod.person_id_key(person_name, person_email)
        assert from_sync == from_async

    # Behaviour: prefers the email; falls back to the name.
    assert sync_mod.person_id_key("Johann", "johann@p.com") == "johann@p.com"
    assert sync_mod.person_id_key("Johann", None) == "Johann"

    # Name variants sharing one email must mint the SAME node id.
    minted = [
        sync_mod.entity_id("arena1", "person", sync_mod.person_id_key(variant, "j@p.com"))
        for variant in ("Johann Boedecker", "BOEDECKER, JOHANN")
    ]
    assert minted[0] == minted[1]
|