@pentatonic-ai/ai-agent-sdk 0.10.18 → 0.10.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/RFC-decay-and-fusion.md +122 -8
- package/packages/memory-engine-v2/compat/server.py +18 -4
- package/packages/memory-engine-v2/docs/redistill-execution-plan-2026-06-22.md +269 -0
- package/packages/memory-engine-v2/docs/redistill-plan-2026-06-21.md +101 -0
- package/packages/memory-engine-v2/extractor-async/extraction_diff.py +218 -0
- package/packages/memory-engine-v2/extractor-async/test_email_alias_guard.py +78 -0
- package/packages/memory-engine-v2/extractor-async/test_extraction_diff.py +180 -0
- package/packages/memory-engine-v2/extractor-async/test_prompt_rules.py +58 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +116 -0
- package/packages/memory-engine-v2/scripts/build_retrain_corpus.py +240 -0
- package/packages/memory-engine-v2/scripts/fusion_defrag.py +440 -0
- package/packages/memory-engine-v2/scripts/redistill.py +236 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""Structured diff between two extractions (student vs teacher gold).
|
|
2
|
+
|
|
3
|
+
Replaces the crude agreement proxies we'd been quoting (entity-name-exact-match;
|
|
4
|
+
word-Jaccard on whole statements) with a STRUCTURED comparison that:
|
|
5
|
+
|
|
6
|
+
- matches entities on fuzzy name + compatible type (not exact lowercased string,
|
|
7
|
+
which penalised "Acme" vs "Acme Corp" / normalisation variants),
|
|
8
|
+
- matches facts on their s·p·o STRUCTURE plus the statement as a fallback
|
|
9
|
+
(not bag-of-words on the statement, which ignored who-did-what),
|
|
10
|
+
- matches relationships as (from, type, to) triples,
|
|
11
|
+
- reports precision / recall / F1 PER AXIS, and facts broken down PER CATEGORY
|
|
12
|
+
(so `decision`/`commitment` agreement is isolated — the cascade's
|
|
13
|
+
high-value gate question).
|
|
14
|
+
|
|
15
|
+
Deterministic + stdlib-only (difflib) so it runs offline, in CI, and on any box
|
|
16
|
+
without a GPU. Semantic-embedding matching is a deliberate non-goal here: a
|
|
17
|
+
deterministic structural diff is the defensible, un-game-able baseline (no model
|
|
18
|
+
judging another model's output); an embedding tiebreak can layer on later if the
|
|
19
|
+
fuzzy threshold proves too strict.
|
|
20
|
+
|
|
21
|
+
Shapes (mirror _parse_guided_json output):
|
|
22
|
+
entity = {"name", "type", "aliases"?: [emails]}
|
|
23
|
+
fact = {"category", "subject", "predicate", "object"?, "statement"}
|
|
24
|
+
relationship = {"from", "to", "type"}
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import re
|
|
30
|
+
from dataclasses import dataclass
|
|
31
|
+
from difflib import SequenceMatcher
|
|
32
|
+
from typing import Any
|
|
33
|
+
|
|
34
|
+
# Org/legal-form suffixes stripped before name comparison so "Acme" == "Acme Inc".
# NOTE: the pattern matches these words anywhere in the name, not only at the end.
_ORG_SUFFIX = re.compile(
    r"\b(inc|incorporated|ltd|limited|llc|llp|plc|corp|corporation|co|gmbh|ag|sa|"
    r"sas|bv|nv|pty|group|holdings?|company)\b\.?",
    re.IGNORECASE,
)
# Everything that is not a letter, digit or space — in ANY script. An ASCII-only
# class would erase accented/CJK/Cyrillic names entirely, so that every such
# name normalises to "" and compares equal to every other one.
_NONWORD = re.compile(r"[^\w ]+|_+")
_WS = re.compile(r"\s+")


def _squash(s: str) -> str:
    """Replace punctuation runs with a space, collapse whitespace, trim."""
    return _WS.sub(" ", _NONWORD.sub(" ", s)).strip()


def normalize_name(s: str | None) -> str:
    """Lowercase, drop punctuation + org suffixes, collapse whitespace.

    Returns "" for a falsy input. Letters and digits of every script are kept
    (only punctuation/symbols are dropped). If removing the org/legal-form
    words would leave nothing at all (the name IS such a word, e.g. "Group"),
    the un-stripped form is returned instead, so that distinct suffix-only
    names do not all collapse to the same empty string.
    """
    if not s:
        return ""
    lowered = s.lower()
    stripped = _squash(_ORG_SUFFIX.sub(" ", lowered))
    # Fall back to the plain normalisation when suffix stripping ate the name.
    return stripped or _squash(lowered)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _tokens(s: str) -> set[str]:
    """Return the distinct whitespace-separated words of the normalised `s`."""
    words = normalize_name(s).split()
    return set(words)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def sim(a: str | None, b: str | None) -> float:
    """Similarity in [0,1]: max of char-level ratio and token-set Jaccard on the
    normalised strings.

    The token-set arm rewards word overlap regardless of order/length
    ("ship the Q3 release" vs "Q3 release will ship"); the char-ratio arm
    rewards near-identical short strings.

    Edge cases: both inputs normalising to "" score 1.0, exactly one empty
    scores 0.0, and identical normalised strings short-circuit to 1.0.
    """
    na, nb = normalize_name(a), normalize_name(b)
    if not na and not nb:
        return 1.0
    if not na or not nb:
        return 0.0
    if na == nb:
        return 1.0
    # autojunk=False: difflib's default heuristic starts discarding "popular"
    # characters once the second sequence reaches 200 items, which makes the
    # ratio of long, near-identical statements erratic.
    char = SequenceMatcher(None, na, nb, autojunk=False).ratio()
    ta, tb = set(na.split()), set(nb.split())
    union = ta | tb
    jac = len(ta & tb) / len(union) if union else 0.0
    return max(char, jac)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class PRF:
    """Precision / recall / F1 for a single axis.

    `n_gold` and `n_pred` are the item counts on each side; `matched` is the
    number of aligned pairs.
    """

    n_gold: int
    n_pred: int
    matched: int

    @property
    def precision(self) -> float:
        """matched / n_pred; with no predictions, 1.0 only if gold is empty too."""
        if self.n_pred:
            return self.matched / self.n_pred
        return 0.0 if self.n_gold else 1.0

    @property
    def recall(self) -> float:
        """matched / n_gold; with no gold items, 1.0 only if pred is empty too."""
        if self.n_gold:
            return self.matched / self.n_gold
        return 0.0 if self.n_pred else 1.0

    @property
    def f1(self) -> float:
        """Harmonic mean of precision and recall (0.0 when both are zero)."""
        p = self.precision
        r = self.recall
        total = p + r
        if total:
            return 2 * p * r / total
        both_empty = self.n_gold == 0 and self.n_pred == 0
        return 1.0 if both_empty else 0.0

    def as_dict(self) -> dict[str, float | int]:
        """Plain-dict view: raw counts plus the three scores rounded to 4 dp."""
        summary: dict[str, float | int] = {
            "n_gold": self.n_gold,
            "n_pred": self.n_pred,
            "matched": self.matched,
        }
        summary["precision"] = round(self.precision, 4)
        summary["recall"] = round(self.recall, 4)
        summary["f1"] = round(self.f1, 4)
        return summary
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _greedy_match(gold: list, pred: list, score_fn, threshold: float) -> int:
|
|
109
|
+
"""Count matched gold items via greedy 1:1 alignment. All (gold, pred) pairs
|
|
110
|
+
scored, sorted desc, claimed highest-first so each item matches at most once.
|
|
111
|
+
Deterministic (stable sort, then index tiebreak)."""
|
|
112
|
+
pairs = []
|
|
113
|
+
for gi, g in enumerate(gold):
|
|
114
|
+
for pi, p in enumerate(pred):
|
|
115
|
+
s = score_fn(g, p)
|
|
116
|
+
if s >= threshold:
|
|
117
|
+
pairs.append((-s, gi, pi))
|
|
118
|
+
pairs.sort()
|
|
119
|
+
used_g: set[int] = set()
|
|
120
|
+
used_p: set[int] = set()
|
|
121
|
+
matched = 0
|
|
122
|
+
for _, gi, pi in pairs:
|
|
123
|
+
if gi in used_g or pi in used_p:
|
|
124
|
+
continue
|
|
125
|
+
used_g.add(gi)
|
|
126
|
+
used_p.add(pi)
|
|
127
|
+
matched += 1
|
|
128
|
+
return matched
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ── per-item scorers ─────────────────────────────────────────────────────
|
|
132
|
+
|
|
133
|
+
def _entity_score(g: dict, p: dict) -> float:
    """Score an entity pair by name similarity, gated on type compatibility.

    Types are compatible when they are equal (case-insensitively), when either
    is missing/empty, or when either is 'other'. Incompatible types score 0.0.
    """
    gold_type = (g.get("type") or "").lower()
    pred_type = (p.get("type") or "").lower()
    both_typed = bool(gold_type) and bool(pred_type)
    wildcard = gold_type == "other" or pred_type == "other"
    if both_typed and gold_type != pred_type and not wildcard:
        return 0.0
    return sim(g.get("name"), p.get("name"))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _fact_score(g: dict, p: dict) -> float:
    """Structural s·p·o similarity, OR statement similarity as a fallback.

    Structure: mean of subject/predicate/object sims (object absent on both =
    neutral 1.0 for that term). The result is the max of structural and
    statement similarity, so a well-phrased statement still matches even if
    s/p/o were split differently.

    Subject, predicate and statement only count when the gold side actually
    has content: `sim` scores two empty values as 1.0, which would otherwise
    let malformed facts (fields missing on both sides) agree vacuously and
    match unrelated facts.
    """

    def _required(field: str) -> float:
        # Empty gold value → no evidence (0.0). If only the pred side is
        # empty, sim() already returns 0.0.
        gold_value = g.get(field)
        if not normalize_name(gold_value):
            return 0.0
        return sim(gold_value, p.get(field))

    subj = _required("subject")
    pred = _required("predicate")
    go, po = g.get("object"), p.get("object")
    # The object is optional in the fact shape, so "absent on both" is neutral.
    obj = 1.0 if not go and not po else sim(go, po)
    structural = (subj + pred + obj) / 3
    statement = _required("statement")
    return max(structural, statement)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _rel_score(g: dict, p: dict) -> float:
    """Score a relationship pair as the mean sim of its from / type / to terms.

    Each term weighs one third, so two exactly matching endpoints contribute
    2/3 on their own, whatever the relationship type.
    """
    term_sims = [sim(g.get(field), p.get(field)) for field in ("from", "type", "to")]
    return sum(term_sims) / 3
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# ── public API ─────────────────────────────────────────────────────────────
|
|
165
|
+
|
|
166
|
+
# Minimum pair score for two items to count as a match, per axis.
ENTITY_THRESHOLD = 0.85
FACT_THRESHOLD = 0.60
REL_THRESHOLD = 0.60


def diff_axis(gold: list[dict], pred: list[dict], kind: str) -> PRF:
    """Align `gold` against `pred` on one axis and return its PRF.

    `kind` is 'entities', 'facts' or 'relationships' and selects the scorer
    and threshold. Any other value raises KeyError.
    """
    if kind == "entities":
        score_fn, cutoff = _entity_score, ENTITY_THRESHOLD
    elif kind == "facts":
        score_fn, cutoff = _fact_score, FACT_THRESHOLD
    elif kind == "relationships":
        score_fn, cutoff = _rel_score, REL_THRESHOLD
    else:
        raise KeyError(kind)
    hits = _greedy_match(gold, pred, score_fn, cutoff)
    return PRF(n_gold=len(gold), n_pred=len(pred), matched=hits)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _facts_in(extraction: dict, categories: set[str] | None) -> list[dict]:
|
|
183
|
+
facts = extraction.get("facts") or []
|
|
184
|
+
if categories is None:
|
|
185
|
+
return facts
|
|
186
|
+
return [f for f in facts if (f.get("category") or "").lower() in categories]
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def diff_extractions(
    gold: dict, pred: dict, fact_categories: set[str] | None = None
) -> dict[str, Any]:
    """Compare two extraction dicts and return the structured diff.

    The result holds one PRF dict per axis ('entities', 'facts',
    'relationships') and 'facts_by_category', a PRF dict for every non-empty
    fact category seen on either side. When `fact_categories` is given, it
    also holds 'facts_filtered' (PRF over facts in just those categories, e.g.
    {'decision','commitment'} for the high-value-gate question) and
    'facts_filtered_categories' (the sorted category names).
    """

    def _axis(kind: str) -> dict:
        return diff_axis(gold.get(kind) or [], pred.get(kind) or [], kind).as_dict()

    out: dict[str, Any] = {kind: _axis(kind) for kind in ("entities", "facts", "relationships")}

    # Per-category fact breakdown over every category present on either side.
    seen: set[str] = set()
    for side in (gold, pred):
        for fact in side.get("facts") or []:
            seen.add((fact.get("category") or "").lower())
    seen.discard("")  # uncategorised facts get no bucket of their own
    out["facts_by_category"] = {
        cat: diff_axis(_facts_in(gold, {cat}), _facts_in(pred, {cat}), "facts").as_dict()
        for cat in sorted(seen)
    }

    if fact_categories is not None:
        gold_subset = _facts_in(gold, fact_categories)
        pred_subset = _facts_in(pred, fact_categories)
        out["facts_filtered"] = diff_axis(gold_subset, pred_subset, "facts").as_dict()
        out["facts_filtered_categories"] = sorted(fact_categories)
    return out
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Unit tests for the email-alias guard (_email_plausibly_belongs).
|
|
2
|
+
|
|
3
|
+
Pins the live pollution case (the "Johann Boedecker" node, 2026-06-22): keep the
|
|
4
|
+
person's own addresses; drop the bystander emails (a hotel, newsletters, unrelated
|
|
5
|
+
gmails) the LLM stapled on from co-occurring documents.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import importlib.util
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
_THIS = Path(__file__).resolve().parent
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _load(name="extractor_async_worker_aliasguard"):
    """Import the sibling worker.py under the private module name `name`.

    The module is registered in sys.modules before it is executed; if
    execution fails, the half-initialised entry is removed again so a later
    import attempt does not pick it up. Raises ImportError when no import
    spec/loader can be built for the file.
    """
    path = _THIS / "worker.py"
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for {path}")
    mod = importlib.util.module_from_spec(spec)
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        sys.modules.pop(name, None)
        raise
    return mod
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Skip the whole module when worker.py cannot be imported (missing deps).
try:
    worker = _load()
except ImportError as e:
    pytest.skip(f"extractor-async deps unavailable: {e}", allow_module_level=True)


def belongs(n, e):
    """Shorthand for the guard under test: does email `e` belong to person `n`?"""
    return worker._email_plausibly_belongs(n, e)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ── KEEP: the person's own addresses ─────────────────────────────────────
|
|
36
|
+
@pytest.mark.parametrize(
    "email",
    [
        "johann@pentatonic.com",
        "johann.boedecker@pentatonic.com",
        "boedeckerjohann@gmail.com",
        "JOHANN@pentatonic.com",  # case-insensitive
        "jb@pentatonic.com",  # initials
        "j.boedecker@pentatonic.com",  # surname token
    ],
)
def test_keeps_owner_emails(email):
    """Addresses whose local-part relates to the name are kept."""
    verdict = belongs("Johann Boedecker", email)
    assert verdict is True
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ── DROP: the actual bystander emails found on the live Johann node ──────
|
|
49
|
+
@pytest.mark.parametrize(
    "email",
    [
        "reservations.nyc@acehotel.com",
        "marketingadmin@sustainablebrands.com",
        "martinvasquez87@gmail.com",
        "schwaabd@yahoo.de",
        "cvanderlip@redish.com",
        "leechihshan33@gmail.com",
    ],
)
def test_drops_bystander_emails(email):
    """Addresses unrelated to the person's name are rejected."""
    verdict = belongs("Johann Boedecker", email)
    assert verdict is False
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ── edges ────────────────────────────────────────────────────────────────
|
|
62
|
+
def test_initials_either_order():
    """Initials match in reversed order too (surname initial first)."""
    reversed_initials = "bj@pentatonic.com"
    assert belongs("Johann Boedecker", reversed_initials) is True
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_no_usable_name_does_not_overfilter():
    """A name with no usable token gives nothing to check against, so keep."""
    # "" is empty; "J" is a single letter (< 2 chars), so it yields no tokens.
    for unusable_name in ("", "J"):
        assert belongs(unusable_name, "anything@x.com") is True
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_surname_only_person_keeps_surname_email():
    """A one-token name keeps addresses containing it and drops the rest."""
    person = "Vickers"
    assert belongs(person, "will.vickers@vickers-oil.com") is True
    assert belongs(person, "reservations.nyc@acehotel.com") is False
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def test_guard_flag_default_on():
    """The EMAIL_ALIAS_GUARD flag is expected to be True in this test run."""
    flag = worker.EMAIL_ALIAS_GUARD
    assert flag is True
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Unit tests for the structured-diff agreement metric (extraction_diff).
|
|
2
|
+
|
|
3
|
+
Pins the behaviours that make it a real metric rather than the old proxies:
|
|
4
|
+
fuzzy/normalised name matching, structural s·p·o fact matching, per-axis P/R/F1,
|
|
5
|
+
per-category fact breakdown, and the high-value filtered view.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import importlib.util
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
_THIS = Path(__file__).resolve().parent
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _load(name="extraction_diff_mod"):
    """Import the sibling extraction_diff.py under the private name `name`.

    If execution fails, the half-initialised sys.modules entry is removed
    again. Raises ImportError when no import spec/loader can be built.
    """
    path = _THIS / "extraction_diff.py"
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for {path}")
    mod = importlib.util.module_from_spec(spec)
    # Register before exec so @dataclass's type resolution (which walks
    # sys.modules[cls.__module__]) works under the importlib custom-name load
    # on Python 3.12+/3.14. A normal `import extraction_diff` (CI/prod) is fine.
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        sys.modules.pop(name, None)
        raise
    return mod


ed = _load()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ── normalize_name / sim ─────────────────────────────────────────────────
|
|
34
|
+
|
|
35
|
+
def test_normalize_strips_org_suffix_and_punct():
    """Legal-form suffixes, punctuation and case are all normalised away."""
    for raw in ("Acme Corp.", "Acme, Inc.", "ACME Limited"):
        assert ed.normalize_name(raw) == "acme"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_sim_normalisation_variants_are_high():
    """Variants the exact-string proxy scored 0 should come out at ~1."""
    variants = [("Acme", "Acme Corp"), ("Acme Inc.", "ACME")]
    for left, right in variants:
        assert ed.sim(left, right) >= 0.99
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_sim_token_overlap_order_invariant():
    """Shared words in a different order still clear 0.6."""
    score = ed.sim("ship the Q3 release", "Q3 release will ship")
    assert score >= 0.6
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def test_sim_unrelated_low():
    """Unrelated names stay below 0.5."""
    score = ed.sim("Acme", "Globex")
    assert score < 0.5
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_sim_both_empty_is_one():
    """Two empty strings are identical (1.0); empty vs non-empty is 0.0."""
    both_empty = ed.sim("", "")
    one_missing = ed.sim(None, "x")
    assert both_empty == 1.0
    assert one_missing == 0.0
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# ── PRF arithmetic ───────────────────────────────────────────────────────
|
|
61
|
+
|
|
62
|
+
def test_prf_basic():
    """3 matches out of 4 gold / 5 pred: R=0.75, P=0.6, F1≈0.667."""
    scores = ed.PRF(n_gold=4, n_pred=5, matched=3)
    assert scores.recall == 0.75
    assert scores.precision == 0.6
    assert round(scores.f1, 3) == 0.667
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def test_prf_empty_both_is_perfect():
    """Nothing expected and nothing predicted counts as perfect agreement."""
    scores = ed.PRF(n_gold=0, n_pred=0, matched=0)
    assert scores.precision == 1.0
    assert scores.recall == 1.0
    assert scores.f1 == 1.0
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_prf_pred_without_gold_is_zero_precision():
    """Predictions against an empty gold set have zero precision."""
    scores = ed.PRF(n_gold=0, n_pred=2, matched=0)
    assert scores.precision == 0.0
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# ── entity matching ──────────────────────────────────────────────────────
|
|
80
|
+
|
|
81
|
+
def test_entity_match_fuzzy_name_same_type():
    """'Acme Corp' and 'Acme' of the same type are one entity."""
    gold = [{"name": "Acme Corp", "type": "org"}]
    predicted = [{"name": "Acme", "type": "org"}]
    result = ed.diff_axis(gold, predicted, "entities")
    assert result.matched == 1
    assert result.f1 == 1.0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_entity_type_mismatch_blocks_match():
    """Same name but conflicting concrete types must not match."""
    gold = [{"name": "Apple", "type": "org"}]
    predicted = [{"name": "Apple", "type": "person"}]
    result = ed.diff_axis(gold, predicted, "entities")
    assert result.matched == 0
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_entity_other_type_is_compatible():
    """Type 'other' is compatible with any concrete type."""
    gold = [{"name": "Apple", "type": "org"}]
    predicted = [{"name": "Apple", "type": "other"}]
    result = ed.diff_axis(gold, predicted, "entities")
    assert result.matched == 1
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def test_entity_greedy_one_to_one():
    """Two golds, one pred: a pred item can be claimed at most once."""
    gold = [{"name": "Acme", "type": "org"}, {"name": "Acme", "type": "org"}]
    predicted = [{"name": "Acme", "type": "org"}]
    result = ed.diff_axis(gold, predicted, "entities")
    assert result.matched == 1
    assert result.recall == 0.5
    assert result.precision == 1.0
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ── fact matching (structural vs statement) ──────────────────────────────
|
|
109
|
+
|
|
110
|
+
def test_fact_match_on_structure_despite_statement_rephrase():
    """Close s·p·o terms match a fact even when the statement is rephrased."""
    gold_fact = {
        "category": "decision",
        "subject": "Acme",
        "predicate": "will renew",
        "object": "the contract",
        "statement": "Acme decided to renew the contract for 2027.",
    }
    pred_fact = {
        "category": "decision",
        "subject": "Acme",
        "predicate": "renews",
        "object": "contract",
        "statement": "The 2027 contract renewal was agreed by Acme.",
    }
    result = ed.diff_axis([gold_fact], [pred_fact], "facts")
    assert result.matched == 1
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_fact_match_on_statement_when_spo_split_differs():
    """An identical statement matches even if s/p/o were split differently."""
    shared_statement = "Bob will lead the migration starting in March."
    gold_fact = {
        "category": "commitment",
        "subject": "Bob",
        "predicate": "owns",
        "object": "migration",
        "statement": shared_statement,
    }
    pred_fact = {
        "category": "commitment",
        "subject": "Bob Chen",
        "predicate": "leads",
        "object": "the data migration",
        "statement": shared_statement,
    }
    result = ed.diff_axis([gold_fact], [pred_fact], "facts")
    assert result.matched == 1
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def test_fact_unrelated_no_match():
    """Facts sharing only a category must not match."""
    gold_fact = {
        "category": "decision",
        "subject": "Acme",
        "predicate": "hired",
        "object": "a CFO",
        "statement": "Acme hired a new CFO.",
    }
    pred_fact = {
        "category": "decision",
        "subject": "Globex",
        "predicate": "closed",
        "object": "the Berlin office",
        "statement": "Globex shut its Berlin office.",
    }
    result = ed.diff_axis([gold_fact], [pred_fact], "facts")
    assert result.matched == 0
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ── relationships ────────────────────────────────────────────────────────
|
|
136
|
+
|
|
137
|
+
def test_relationship_triple_match():
    """Matching endpoints carry a relationship even when the type is reworded.

    'Acme Corp' normalises to 'acme', so `from` and `to` both score 1.0. The
    score is the mean of three terms, so it is at least 2/3 whatever the type
    similarity turns out to be — above the 0.60 relationship threshold.
    """
    g = [{"from": "Jane", "to": "Acme Corp", "type": "works at"}]
    p = [{"from": "Jane", "to": "Acme", "type": "employed by"}]
    prf = ed.diff_axis(g, p, "relationships")
    assert prf.n_gold == 1 and prf.n_pred == 1
    # Pin the match itself, not just the counts.
    assert prf.matched == 1
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ── full diff + per-category + filtered ──────────────────────────────────
|
|
146
|
+
|
|
147
|
+
def test_diff_extractions_per_category_and_filtered():
    """End-to-end: per-axis, per-category and filtered views of one diff."""
    gold_decision = {
        "category": "decision",
        "subject": "Acme",
        "predicate": "will renew",
        "object": "contract",
        "statement": "Acme will renew the contract.",
    }
    gold_state = {
        "category": "state",
        "subject": "Acme",
        "predicate": "is",
        "object": "a customer",
        "statement": "Acme is a customer.",
    }
    pred_decision = {
        "category": "decision",
        "subject": "Acme",
        "predicate": "renews",
        "object": "the contract",
        "statement": "Acme renews its contract.",
    }
    gold = {
        "entities": [{"name": "Acme", "type": "org"}],
        "facts": [gold_decision, gold_state],
        "relationships": [],
    }
    pred = {
        "entities": [{"name": "Acme Corp", "type": "org"}],
        "facts": [pred_decision],
        "relationships": [],
    }

    result = ed.diff_extractions(gold, pred, fact_categories={"decision", "commitment"})

    assert result["entities"]["matched"] == 1
    # per-category: decision matched 1/1; state missing (recall 0)
    per_category = result["facts_by_category"]
    assert per_category["decision"]["matched"] == 1
    assert per_category["state"]["recall"] == 0.0
    # filtered to decision/commitment: 1 gold, 1 pred, 1 matched → perfect
    filtered = result["facts_filtered"]
    assert filtered["recall"] == 1.0
    assert filtered["precision"] == 1.0
    assert result["facts_filtered_categories"] == ["commitment", "decision"]
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def test_diff_extractions_empty_both():
    """Two empty extractions agree perfectly on the facts axis."""
    blank_gold = {"entities": [], "facts": [], "relationships": []}
    blank_pred = {"entities": [], "facts": [], "relationships": []}
    result = ed.diff_extractions(blank_gold, blank_pred)
    assert result["facts"]["f1"] == 1.0
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Guard tests for the distiller system-prompt content rules.
|
|
2
|
+
|
|
3
|
+
Pins that the email-discipline + entity-separation rules (this change) and the
|
|
4
|
+
#126 modality/attribution rules are present in BOTH prompt variants — a cheap
|
|
5
|
+
regression guard so a future prompt edit can't silently drop them.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import importlib.util
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
_THIS = Path(__file__).resolve().parent
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _load(name="extractor_async_worker_prompts"):
    """Import the sibling worker.py under the private module name `name`.

    The module is registered in sys.modules before it is executed; if
    execution fails, the half-initialised entry is removed again so a later
    import attempt does not pick it up. Raises ImportError when no import
    spec/loader can be built for the file.
    """
    path = _THIS / "worker.py"
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise ImportError(f"cannot build an import spec for {path}")
    mod = importlib.util.module_from_spec(spec)
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        sys.modules.pop(name, None)
        raise
    return mod
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Skip the whole module when worker.py cannot be imported (missing deps).
try:
    worker = _load()
except ImportError as e:
    pytest.skip(f"extractor-async deps unavailable: {e}", allow_module_level=True)


def PROMPTS():
    """Return both system-prompt variants, read from `worker` at call time."""
    return (worker.BATCH_SYSTEM_PROMPT, worker.GUIDED_JSON_SYSTEM_PROMPT)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_email_discipline_in_both_prompts():
    """Both prompt variants carry the email-discipline wording."""
    required = (
        "An email address is NOT a person",
        "reservations@",  # role/transactional examples present
        "clearly THEIRS",  # bystander-attachment ban
    )
    for prompt in PROMPTS():
        for phrase in required:
            assert phrase in prompt
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_distinct_entities_rule_in_both_prompts():
    """Both prompt variants carry the entity-separation wording."""
    required = (
        "DISTINCT ENTITIES",
        "Acme & Globex",  # conflation-split example
        "warehouse",  # generic-token suppression
    )
    for prompt in PROMPTS():
        for phrase in required:
            assert phrase in prompt
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_126_rules_not_regressed():
    """The #126 modality/attribution headings are still in both prompts."""
    for prompt in PROMPTS():
        for heading in ("TENSE & MODALITY", "ATTRIBUTION FIDELITY"):
            assert heading in prompt
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_active_prompt_carries_the_rules_and_fresh_hash():
    """The active prompt carries the new rules; its hash is 16 chars long."""
    active = worker.ACTIVE_SYSTEM_PROMPT
    assert "An email address is NOT a person" in active
    assert "DISTINCT ENTITIES" in active
    # hash recomputed off ACTIVE prompt
    assert len(worker.SYSTEM_PROMPT_HASH) == 16
|
|
@@ -284,6 +284,38 @@ and NEVER drop a field.
|
|
|
284
284
|
object MAY be an entity name OR a literal string OR `-` if absent.
|
|
285
285
|
statement ≤ 140 characters, a self-contained sentence.
|
|
286
286
|
WORKED EXAMPLE: `FCT|commitment|Timothy Bradley|agreed to|SAFE amendments|Timothy confirmed the SAFE amendments are set (14 May 2026)`
|
|
287
|
+
- TENSE & MODALITY — never record the future or the merely-planned as done:
|
|
288
|
+
* A SCHEDULED or FUTURE event (a calendar invite, a meeting/call dated \
|
|
289
|
+
later than this event, a recurring-meeting instance) is UPCOMING, not \
|
|
290
|
+
completed. NEVER emit "attended / hosted / met / reported / decided" for a \
|
|
291
|
+
meeting that has not happened — use category=commitment with predicate "is \
|
|
292
|
+
scheduled to" / "plans to".
|
|
293
|
+
* A PLAN, PROPOSAL, INTENT or next-step ("will", "plans to", "aims to", \
|
|
294
|
+
"to organise", "we'll", "next steps:") is category=commitment, NOT state and \
|
|
295
|
+
NOT a completed act. Keep "will send" as a commitment — never record it as "sent".
|
|
296
|
+
- ATTRIBUTION FIDELITY — the subject must be who the source actually credits:
|
|
297
|
+
* Attribute a document/deck/message's content to a person ONLY if the source \
|
|
298
|
+
names them as its author or speaker. Do NOT attribute an unauthored doc, agenda \
|
|
299
|
+
or deck to whoever it merely mentions or whoever shared it.
|
|
300
|
+
* Do NOT promote a meeting ATTENDEE to organiser/host without explicit evidence.
|
|
301
|
+
* Do NOT attribute an ORGANISATION's activity (deals, intros, pipeline) to an \
|
|
302
|
+
individual person.
|
|
303
|
+
- IDENTITY & EMAILS — do NOT infer a person's employer/affiliation from an email \
|
|
304
|
+
signature or a company's standard footer/boilerplate; affiliation needs an \
|
|
305
|
+
explicit statement in the body. \
|
|
306
|
+
An email address is NOT a person: NEVER emit an entity whose name is an email \
|
|
307
|
+
address, and NEVER treat a role/transactional address (reservations@, bookings@, \
|
|
308
|
+
no-reply@, info@, office@, marketing@, support@, notifications@, admin@) as a \
|
|
309
|
+
person. Attach an email to a person ONLY when it is clearly THEIRS (the author, \
|
|
310
|
+
or a local-part matching their name) — NEVER attach an address that merely \
|
|
311
|
+
co-occurs in the same thread, CC list, or document (a hotel booking, a \
|
|
312
|
+
newsletter, an unrelated contact).
|
|
313
|
+
- DISTINCT ENTITIES — two names joined by "and" / "&" / "/" are TWO separate \
|
|
314
|
+
entities, never one merged node ("Acme & Globex" → emit Acme AND Globex). Do NOT \
|
|
315
|
+
turn a sentence fragment or phrase into an entity, and do NOT mint generic \
|
|
316
|
+
infrastructure / environment tokens (prod, staging, preview, UAT, warehouse, \
|
|
317
|
+
main, PRD, CRM, platform, localhost) as entities — they are not named people, \
|
|
318
|
+
orgs, products, or projects.
|
|
287
319
|
- REL lines have exactly 4 fields: `REL`, from, to, rel_type.
|
|
288
320
|
from and to MUST be entity names declared in THIS event's ENT lines.
|
|
289
321
|
rel_type is a short verb / preposition phrase.
|
|
@@ -337,6 +369,38 @@ observation, preference}.
|
|
|
337
369
|
WORKED EXAMPLE: {"category": "commitment", "subject": "Timothy \
|
|
338
370
|
Bradley", "predicate": "agreed to", "object": "SAFE amendments", \
|
|
339
371
|
"statement": "Timothy confirmed the SAFE amendments are set (14 May 2026)"}
|
|
372
|
+
- TENSE & MODALITY — never record the future or the merely-planned as done:
|
|
373
|
+
* A SCHEDULED or FUTURE event (a calendar invite, a meeting/call dated \
|
|
374
|
+
later than this event, a recurring-meeting instance) is UPCOMING, not \
|
|
375
|
+
completed. NEVER emit "attended / hosted / met / reported / decided" for a \
|
|
376
|
+
meeting that has not happened — use category=commitment with predicate "is \
|
|
377
|
+
scheduled to" / "plans to".
|
|
378
|
+
* A PLAN, PROPOSAL, INTENT or next-step ("will", "plans to", "aims to", \
|
|
379
|
+
"to organise", "we'll", "next steps:") is category=commitment, NOT state and \
|
|
380
|
+
NOT a completed act. Keep "will send" as a commitment — never record it as "sent".
|
|
381
|
+
- ATTRIBUTION FIDELITY — the subject must be who the source actually credits:
|
|
382
|
+
* Attribute a document/deck/message's content to a person ONLY if the source \
|
|
383
|
+
names them as its author or speaker. Do NOT attribute an unauthored doc, agenda \
|
|
384
|
+
or deck to whoever it merely mentions or whoever shared it.
|
|
385
|
+
* Do NOT promote a meeting ATTENDEE to organiser/host without explicit evidence.
|
|
386
|
+
* Do NOT attribute an ORGANISATION's activity (deals, intros, pipeline) to an \
|
|
387
|
+
individual person.
|
|
388
|
+
- IDENTITY & EMAILS — do NOT infer a person's employer/affiliation from an email \
|
|
389
|
+
signature or a company's standard footer/boilerplate; affiliation needs an \
|
|
390
|
+
explicit statement in the body. \
|
|
391
|
+
An email address is NOT a person: NEVER emit an entity whose name is an email \
|
|
392
|
+
address, and NEVER treat a role/transactional address (reservations@, bookings@, \
|
|
393
|
+
no-reply@, info@, office@, marketing@, support@, notifications@, admin@) as a \
|
|
394
|
+
person. Attach an email to a person ONLY when it is clearly THEIRS (the author, \
|
|
395
|
+
or a local-part matching their name) — NEVER attach an address that merely \
|
|
396
|
+
co-occurs in the same thread, CC list, or document (a hotel booking, a \
|
|
397
|
+
newsletter, an unrelated contact).
|
|
398
|
+
- DISTINCT ENTITIES — two names joined by "and" / "&" / "/" are TWO separate \
|
|
399
|
+
entities, never one merged node ("Acme & Globex" → emit Acme AND Globex). Do NOT \
|
|
400
|
+
turn a sentence fragment or phrase into an entity, and do NOT mint generic \
|
|
401
|
+
infrastructure / environment tokens (prod, staging, preview, UAT, warehouse, \
|
|
402
|
+
main, PRD, CRM, platform, localhost) as entities — they are not named people, \
|
|
403
|
+
orgs, products, or projects.
|
|
340
404
|
- relationships: "from" and "to" MUST be entity names declared in THIS \
|
|
341
405
|
event's "entities". "type" is a short verb / preposition phrase.
|
|
342
406
|
- HARD CAPS per event: 8 entities, 6 facts, 6 relationships. Pick the \
|
|
@@ -1189,6 +1253,43 @@ def org_node_id_key(entity_type: str, name: str, stamped_domain: str | None) ->
|
|
|
1189
1253
|
return name
|
|
1190
1254
|
|
|
1191
1255
|
|
|
1256
|
+
# --------------------------------------------------------------------
|
|
1257
|
+
# Email-alias guard — stop bystander emails polluting a person
|
|
1258
|
+
# --------------------------------------------------------------------
|
|
1259
|
+
# The async LLM pass sometimes emits a PERSON entity whose `email` is a BYSTANDER
|
|
1260
|
+
# address co-occurring in the same doc/thread (a hotel booking, a newsletter, an
|
|
1261
|
+
# unrelated gmail). _parse_guided_json promotes it into the entity's aliases and
|
|
1262
|
+
# upsert_entities then stores + RESOLVES on it — folding strangers' identities
|
|
1263
|
+
# (and their facts) onto the person. Measured live (pentatonic-team): a "Johann
|
|
1264
|
+
# Boedecker" node carrying reservations.nyc@acehotel.com + unrelated gmails, all
|
|
1265
|
+
# from STUDENT-distilled `doc` events. This guard keeps an email alias on a person
|
|
1266
|
+
# only when its local-part plausibly relates to the person's name; clear bystanders
|
|
1267
|
+
# are dropped BEFORE resolution/storage. Conservative: dropping a genuine but
|
|
1268
|
+
# non-name-matching alias is a mild loss; keeping a bystander is a confabulation
|
|
1269
|
+
# source. Flag-revertible (EMAIL_ALIAS_GUARD=false). Persons only — org domain
|
|
1270
|
+
# stamping is untouched.
|
|
1271
|
+
EMAIL_ALIAS_GUARD = _envflag("EMAIL_ALIAS_GUARD", "true")
|
|
1272
|
+
_ALIAS_NONALPHA = re.compile(r"[^a-z]")
|
|
1273
|
+
_ALIAS_SPLIT = re.compile(r"[^a-z]+")
|
|
1274
|
+
|
|
1275
|
+
|
|
1276
|
+
def _email_plausibly_belongs(person_name: str, email: str) -> bool:
|
|
1277
|
+
"""True ⇒ keep this email as an alias of `person_name`; False ⇒ drop (clear
|
|
1278
|
+
bystander). Match = a name token appears in the local-part, OR the local-part
|
|
1279
|
+
is the person's initials. Pure + deterministic."""
|
|
1280
|
+
local = email.split("@", 1)[0].lower()
|
|
1281
|
+
local_letters = _ALIAS_NONALPHA.sub("", local)
|
|
1282
|
+
name_tokens = {t for t in _ALIAS_SPLIT.split(person_name.lower()) if len(t) >= 2}
|
|
1283
|
+
if not name_tokens or not local_letters:
|
|
1284
|
+
return True # nothing to check against — don't over-filter
|
|
1285
|
+
if any(nt in local_letters for nt in name_tokens):
|
|
1286
|
+
return True # johann@…, johann.boedecker@…, boedeckerjohann@…
|
|
1287
|
+
initials = "".join(t[0] for t in person_name.lower().split() if t[:1].isalpha())
|
|
1288
|
+
if len(initials) >= 2 and local_letters in (initials, initials[::-1]):
|
|
1289
|
+
return True # jb@… / bj@… for "Johann Boedecker"
|
|
1290
|
+
return False
|
|
1291
|
+
|
|
1292
|
+
|
|
1192
1293
|
def upsert_entities(
|
|
1193
1294
|
conn: psycopg.Connection,
|
|
1194
1295
|
arena: str,
|
|
@@ -1281,6 +1382,21 @@ def upsert_entities(
|
|
|
1281
1382
|
continue
|
|
1282
1383
|
aliases = [a for a in (e.get("aliases") or []) if a]
|
|
1283
1384
|
|
|
1385
|
+
# Email-alias guard (persons only): drop bystander emails the LLM
|
|
1386
|
+
# stapled on from a co-occurring doc/thread, BEFORE they reach
|
|
1387
|
+
# resolution or storage. See _email_plausibly_belongs.
|
|
1388
|
+
if EMAIL_ALIAS_GUARD and etype == "person" and aliases:
|
|
1389
|
+
kept = []
|
|
1390
|
+
for a in aliases:
|
|
1391
|
+
if "@" in a and " " not in a and not _email_plausibly_belongs(name, a):
|
|
1392
|
+
log.info(
|
|
1393
|
+
f"alias-guard: dropped bystander email {a!r} from "
|
|
1394
|
+
f"person {name!r} (arena={arena})"
|
|
1395
|
+
)
|
|
1396
|
+
continue
|
|
1397
|
+
kept.append(a)
|
|
1398
|
+
aliases = kept
|
|
1399
|
+
|
|
1284
1400
|
# Hard-key stamps for THIS entity, merged onto the node's attributes
|
|
1285
1401
|
# and (for domain) into the resolution aliases. Adding domain to
|
|
1286
1402
|
# aliases before forms are computed is deliberate — that's what makes
|