@pentatonic-ai/ai-agent-sdk 0.10.4 → 0.10.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,507 @@
1
+ """Unit tests for scripts/entity_resolution_v2.py (BET 1b: blocking +
2
+ embedding similarity + LLM adjudication — dry-run tooling).
3
+
4
+ Pure-python: no DB, no network, no embeddings endpoint, no LLM. The
5
+ HTTP embedding backend and the Anthropic adjudicator are NEVER called
6
+ here — tests use fakes. Importable without psycopg (both scripts guard
7
+ the driver import).
8
+
9
+ Run: pytest packages/memory-engine-v2/tests/test_entity_resolution_v2.py
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import importlib.util
15
+ import json
16
+ import subprocess
17
+ import sys
18
+ from pathlib import Path
19
+
20
# Directory holding the scripts under test: the sibling "scripts" folder of
# this file's parent directory.
_SCRIPTS = Path(__file__).resolve().parent.parent.joinpath("scripts")
21
+
22
+
23
def _load(name: str):
    """Import ``<_SCRIPTS>/<name>.py`` as a top-level module called *name*.

    A module already registered in ``sys.modules`` under *name* is returned
    unchanged, so repeated calls share a single module object.

    Args:
        name: Script stem (no ``.py``); also used as the ``sys.modules`` key.

    Returns:
        The executed module object.

    Raises:
        AssertionError: if no import spec (or loader) can be built.
        BaseException: whatever executing the script raises. The
            half-initialised module is dropped from ``sys.modules`` first,
            so a later call retries the import instead of handing back a
            broken module.
    """
    if name in sys.modules:
        return sys.modules[name]
    spec = importlib.util.spec_from_file_location(name, _SCRIPTS / f"{name}.py")
    assert spec and spec.loader
    mod = importlib.util.module_from_spec(spec)
    sys.modules[name] = mod  # register before exec (py3.13+ dataclasses)
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a partially executed module behind for later lookups.
        sys.modules.pop(name, None)
        raise
    return mod
32
+
33
+
34
# Module handles used by every test below, loaded straight from the scripts
# directory (v2 first, then v1 — same order as before).
er2, v1 = (
    _load(script)
    for script in ("entity_resolution_v2", "backfill_entity_reconciliation")
)
36
+
37
+
38
def make_entity(eid: str, name: str, aliases: list[str] | None = None,
                facts: int = 0, rels: int = 0) -> "v1.Entity":
    """Build a ``v1.Entity`` fixture with an empty provenance list.

    ``norm_forms`` is the set of ``v1._normalize_surface`` results for the
    canonical name and every alias; *facts* / *rels* populate
    ``fact_count`` / ``rel_count``.
    """
    alias_list = aliases or []
    surfaces = [name, *alias_list]
    return v1.Entity(
        id=eid,
        canonical_name=name,
        aliases=alias_list,
        provenance_event_ids=[],
        fact_count=facts,
        rel_count=rels,
        norm_forms={v1._normalize_surface(surface) for surface in surfaces},
    )
46
+
47
+
48
class FakeAdjudicator(er2.Adjudicator):
    """Test double that returns canned verdicts instead of calling an LLM.

    Verdicts are looked up by ``frozenset`` of the two entity ids; a pair
    with no canned verdict gets ``"unsure"``. Every lookup key is appended
    to ``calls`` in call order so tests can assert who was consulted.
    """

    def __init__(self, verdicts: dict[frozenset, str]) -> None:
        self.verdicts = verdicts
        self.calls: list[frozenset] = []

    def adjudicate(self, a, a_facts, b, b_facts):
        pair_key = frozenset((a.id, b.id))
        self.calls.append(pair_key)
        verdict = self.verdicts.get(pair_key, "unsure")
        reason = f"fake verdict for {sorted(pair_key)}"
        return er2.Adjudication(verdict, reason)
60
+
61
+
62
+ # ----------------------------------------------------------------------
63
+ # 1. Candidate-generation normalization — the exact Johann variants.
64
+ # The blocking form is NEVER the identity normalization; a test below locks that in.
65
+ # ----------------------------------------------------------------------
66
+
67
def test_block_normalize_underscore():
    """An underscore-joined name blocks as lower-case, space-separated tokens."""
    normalized = er2.block_normalize("Johann_Boedecker")
    assert normalized == "johann boedecker"
69
+
70
+
71
def test_block_normalize_diacritic_and_comma():
    """The blocking form folds diacritics and strips punctuation."""
    # diacritic fold (NFD + strip combining) + punctuation strip
    blocked = er2.block_normalize("Bödecker, Johann")
    assert blocked == "bodecker johann"
74
+
75
+
76
def test_token_sort_handles_comma_inversion():
    """``token_sort`` maps the comma-inverted and plain spellings to one form."""
    inverted = er2.token_sort("Bödecker, Johann")
    plain = er2.token_sort("Bodecker Johann")
    assert inverted == plain
    assert inverted == "bodecker johann"
79
+
80
+
81
def test_char_trigrams_bridge_oe_vs_diacritic_fold():
    """The "oe" transliteration and the folded "ö" spelling share trigrams."""
    # "Boedecker" (oe transliteration) vs "Bödecker" (folds to bodecker)
    # differ as tokens but must share trigram buckets.
    transliterated = er2.char_trigrams("Boedecker")
    folded = er2.char_trigrams("Bödecker")
    common = transliterated & folded
    required = {"dec", "eck", "cke", "ker"}
    assert required <= common
86
+
87
+
88
def test_blocking_normalization_is_not_identity_normalization():
    """v1's identity normalization and v2's blocking form must stay distinct."""
    # The identity scheme (v1 / entity_id.py) preserves diacritics and
    # punctuation; the blocking form must NEVER replace it.
    surface = "Bödecker, Johann"
    identity_form = v1._normalize_surface(surface)
    blocking_form = er2.block_normalize(surface)
    assert identity_form == "bödecker, johann"  # identity: untouched ö + comma
    assert blocking_form == "bodecker johann"   # blocking only
    assert identity_form != blocking_form
95
+
96
+
97
def test_blocking_groups_johann_variants():
    """Candidate generation pairs the Johann spellings and isolates the outsider."""
    roster = [
        ("e_a", "Johann_Boedecker"),
        ("e_b", "Bödecker, Johann"),
        ("e_c", "Johann Boedecker"),
        ("e_bare", "Johann"),
        ("e_near", "Johanna Phil"),
        ("e_zoe", "Zoe Quist"),
    ]
    entities = [make_entity(eid, name) for eid, name in roster]

    pairs = er2.generate_candidate_pairs(entities)
    keys = {p.key for p in pairs}

    # All Boedecker spellings co-block pairwise.
    for left, right in (("e_a", "e_b"), ("e_a", "e_c"), ("e_b", "e_c")):
        assert frozenset({left, right}) in keys
    # Bare "Johann" blocks with the full names (first/last token key).
    assert frozenset({"e_bare", "e_c"}) in keys
    # Near-miss "Johanna Phil" co-blocks via trigrams — it must reach
    # the similarity/adjudication stage (and be rejected there), not
    # be silently invisible.
    assert frozenset({"e_near", "e_c"}) in keys
    # Unrelated person doesn't pair with the Boedecker cluster.
    zoe_pairs = [k for k in keys
                 if "e_zoe" in k and k != frozenset({"e_zoe"})]
    assert not zoe_pairs
120
+
121
+
122
def test_blocking_email_local_part_key():
    """An email-named entity becomes a candidate pair with the matching full name."""
    address = "johann.boedecker@pentatonic.com"
    mailbox = make_entity("e_a", address, aliases=[address])
    person = make_entity("e_b", "Johann Boedecker")
    keys = {p.key for p in er2.generate_candidate_pairs([mailbox, person])}
    assert frozenset({"e_a", "e_b"}) in keys
128
+
129
+
130
def test_blocking_skips_pairs_already_grouped_by_v1():
    """Pairs listed in ``already_grouped`` are not proposed again."""
    underscored = make_entity("e_a", "Johann_Boedecker")
    spaced = make_entity("e_b", "Johann Boedecker")
    grouped_by_v1 = {frozenset({"e_a", "e_b"})}
    pairs = er2.generate_candidate_pairs(
        [underscored, spaced], already_grouped=grouped_by_v1)
    assert pairs == []
136
+
137
+
138
def test_oversized_blocks_are_skipped():
    """Blocks larger than ``max_block`` must not produce candidate pairs.

    Ten entities share the first token "johann"; with ``max_block=3`` that
    block is oversized, so no emitted pair may cite it as a shared key.
    """
    clones = [make_entity(f"e_{i}", f"Johann Clone{i}") for i in range(10)]
    pairs = er2.generate_candidate_pairs(clones, max_block=3)
    # every key these share ("first:johann"/"last:..."/trigrams of
    # "johann...") is oversized or unique per clone → no pair explosion
    # from the shared-first-name block.
    # The shared first-name key is "first:johann". The previous check looked
    # for "first:clone0", but "clone0" is one entity's unique *last* token,
    # so that filter could never match and the assertion passed vacuously.
    shared_first = [p for p in pairs if "first:johann" in p.shared_keys]
    assert shared_first == []
146
+
147
+
148
+ # ----------------------------------------------------------------------
149
+ # 2. Threshold band routing
150
+ # ----------------------------------------------------------------------
151
+
152
def test_threshold_constants():
    """Pin the two similarity thresholds exposed by the v2 module."""
    pinned = {"HIGH_THRESHOLD": 0.92, "LOW_THRESHOLD": 0.75}
    for constant, value in pinned.items():
        assert getattr(er2, constant) == value
155
+
156
+
157
def test_route_band_boundaries():
    """``route_band`` treats both thresholds as inclusive lower bounds."""
    cases = [
        (0.95, "high"),
        (0.92, "high"),         # inclusive
        (0.9199, "ambiguous"),
        (0.75, "ambiguous"),    # inclusive
        (0.7499, "drop"),
        (0.10, "drop"),
    ]
    for similarity, band in cases:
        assert er2.route_band(similarity) == band
164
+
165
+
166
def _scored_pair(a, b, sim):
    """Return an ``er2.CandidatePair`` for (a, b) with ``similarity`` set to *sim*."""
    pair = er2.CandidatePair(a=a, b=b)
    pair.similarity = sim
    return pair
170
+
171
+
172
def test_route_pairs_band_routing():
    """High pairs auto-merge, ambiguous pairs go to the LLM, low pairs drop."""
    auto_key = frozenset({"e_1", "e_2"})
    llm_key = frozenset({"e_3", "e_4"})
    low_key = frozenset({"e_5", "e_6"})
    pairs = [
        # high → auto merge, no LLM
        _scored_pair(make_entity("e_1", "Johann Boedecker"),
                     make_entity("e_2", "Bödecker, Johann"), 0.96),
        # ambiguous → adjudicated yes
        _scored_pair(make_entity("e_3", "Carla Voss"),
                     make_entity("e_4", "Carla Vosse"), 0.85),
        # below low → dropped, no LLM
        _scored_pair(make_entity("e_5", "Johanna Phil"),
                     make_entity("e_6", "Johann Boedeker"), 0.60),
    ]
    adj = FakeAdjudicator({llm_key: "yes"})
    routed = er2.route_pairs(pairs, adj, {})

    assert {p.key for p in routed.merge} == {auto_key, llm_key}
    assert {p.key for p in routed.dropped} == {low_key}
    assert routed.human_review == []
    # LLM consulted ONLY for the ambiguous pair.
    assert adj.calls == [llm_key]
    # high-confidence merge carries its reasoning
    auto = next(p for p in routed.merge if p.key == auto_key)
    assert auto.verdict == "auto"
    assert "0.960" in auto.reason
193
+
194
+
195
def test_route_pairs_unsure_never_merges():
    """An "unsure" verdict sends the pair to human review, never to merge."""
    key = frozenset({"e_1", "e_2"})
    left = make_entity("e_1", "Johann Boedecker")
    right = make_entity("e_2", "Johanna Phil")
    adjudicator = FakeAdjudicator({key: "unsure"})
    routed = er2.route_pairs([_scored_pair(left, right, 0.85)], adjudicator, {})
    assert routed.merge == []
    assert [p.key for p in routed.human_review] == [key]
202
+
203
+
204
def test_route_pairs_no_verdict_means_no_merge():
    """A "no" verdict drops the pair: no merge and no human review."""
    key = frozenset({"e_1", "e_2"})
    left = make_entity("e_1", "Johann Boedecker")
    right = make_entity("e_2", "Johanna Phil")
    adjudicator = FakeAdjudicator({key: "no"})
    routed = er2.route_pairs([_scored_pair(left, right, 0.85)], adjudicator, {})
    assert routed.merge == []
    assert routed.human_review == []
    assert [p.key for p in routed.dropped] == [key]
211
+
212
+
213
def test_no_llm_routes_whole_ambiguous_band_to_human_review():
    """With ``NoLLMAdjudicator`` every ambiguous pair lands in human review."""
    roster = [("e_1", "Carla Voss"), ("e_2", "Carla Vosse"),
              ("e_3", "Mark Diaz"), ("e_4", "Marc Diaz")]
    a, b, c, d = (make_entity(eid, name) for eid, name in roster)
    ambiguous = [_scored_pair(a, b, 0.85), _scored_pair(c, d, 0.80)]
    routed = er2.route_pairs(ambiguous, er2.NoLLMAdjudicator(), {})
    assert routed.merge == []
    assert len(routed.human_review) == 2
    assert all(p.verdict == "unsure" for p in routed.human_review)
222
+
223
+
224
+ # ----------------------------------------------------------------------
225
+ # 3. Bare-first-name policy
226
+ # ----------------------------------------------------------------------
227
+
228
def test_is_bare_name():
    """Only a single-token name with no richer alias counts as bare."""
    assert er2.is_bare_name(make_entity("e", "Johann"))
    not_bare = [
        make_entity("e", "Johann Boedecker"),
        # an email-only entity is not a bare *name*
        make_entity("e", "johann@x.com"),
        # a single-token canonical with a multi-token alias is not bare
        make_entity("e", "Johann", aliases=["Johann Boedecker"]),
    ]
    for entity in not_bare:
        assert not er2.is_bare_name(entity)
236
+
237
+
238
def test_bare_name_single_candidate_and_yes_merges():
    """A bare name with one candidate merges once the adjudicator says yes."""
    key = frozenset({"e_bare", "e_full"})
    bare = make_entity("e_bare", "Johann")
    full = make_entity("e_full", "Johann Boedecker", facts=10)
    adjudicator = FakeAdjudicator({key: "yes"})
    routed = er2.route_pairs([_scored_pair(bare, full, 0.95)], adjudicator, {})
    assert [p.key for p in routed.merge] == [key]
245
+
246
+
247
def test_bare_name_high_similarity_still_requires_adjudication():
    """Even at 0.99 similarity a bare-name pair is sent to the adjudicator."""
    key = frozenset({"e_bare", "e_full"})
    bare = make_entity("e_bare", "Johann")
    full = make_entity("e_full", "Johann Boedecker")
    adj = FakeAdjudicator({key: "no"})
    scored = [_scored_pair(bare, full, 0.99)]
    routed = er2.route_pairs(scored, adj, {})
    # 0.99 >= high, but bare names never auto-merge: LLM said no → drop
    assert routed.merge == []
    assert adj.calls == [key]
255
+
256
+
257
def test_bare_name_two_candidates_never_merges():
    """A bare name with two candidates goes to review without any LLM call."""
    bare = make_entity("e_bare", "Johann")
    full1 = make_entity("e_f1", "Johann Boedecker")
    full2 = make_entity("e_f2", "Johann Mueller")
    # Both pairs would get "yes" if the adjudicator were ever asked.
    canned = {frozenset({"e_bare", other}): "yes" for other in ("e_f1", "e_f2")}
    adj = FakeAdjudicator(canned)
    scored = [_scored_pair(bare, full1, 0.93), _scored_pair(bare, full2, 0.90)]
    routed = er2.route_pairs(scored, adj, {})
    assert routed.merge == []
    assert len(routed.human_review) == 2
    assert adj.calls == []  # ambiguity short-circuits before the LLM
271
+
272
+
273
def test_bare_name_with_no_llm_goes_to_review():
    """Without an LLM, a bare-name pair is queued for human review."""
    bare = make_entity("e_bare", "Johann")
    full = make_entity("e_full", "Johann Boedecker")
    scored = [_scored_pair(bare, full, 0.95)]
    routed = er2.route_pairs(scored, er2.NoLLMAdjudicator(), {})
    assert routed.merge == []
    assert len(routed.human_review) == 1
280
+
281
+
282
+ # ----------------------------------------------------------------------
283
+ # 4. Proposal assembly (richest-row-wins, same ordering as v1)
284
+ # ----------------------------------------------------------------------
285
+
286
def test_pairs_to_proposals_richest_wins_and_transitive():
    """Chained pairs collapse into one proposal led by the richest entity."""
    a = make_entity("e_a", "Johann_Boedecker", facts=2)
    b = make_entity("e_b", "Bödecker, Johann", facts=50, rels=3)
    c = make_entity("e_c", "Johann Boedecker", facts=7)
    # a–b and b–c share b, so all three must end up in a single proposal.
    chain = [_scored_pair(a, b, 0.95), _scored_pair(b, c, 0.94)]
    proposals = er2.pairs_to_proposals(chain)
    assert len(proposals) == 1
    merged = proposals[0]
    assert merged.canonical.id == "e_b"  # richest
    assert {d.id for d in merged.deprecated} == {"e_a", "e_c"}
    assert merged.signal == "embedding_llm"
296
+
297
+
298
+ # ----------------------------------------------------------------------
299
+ # 5. Adjudication JSON parsing — strict; malformed → unsure
300
+ # ----------------------------------------------------------------------
301
+
302
def test_parse_adjudication_valid():
    """A well-formed JSON reply yields its verdict and reason."""
    reply = '{"same_person": "yes", "reason": "same email domain and role"}'
    out = er2.parse_adjudication(reply)
    assert out.same_person == "yes"
    assert "email domain" in out.reason
307
+
308
+
309
def test_parse_adjudication_json_embedded_in_prose():
    """A JSON object preceded by chatty prose is still parsed."""
    reply = ('Sure! Here is my answer: '
             '{"same_person": "no", "reason": "different people"}')
    out = er2.parse_adjudication(reply)
    assert out.same_person == "no"
313
+
314
+
315
def test_parse_adjudication_malformed_is_unsure():
    """Prose-only, unknown-verdict and truncated replies all parse as "unsure"."""
    malformed_replies = (
        "I think they are the same",
        '{"same_person": "maybe"}',
        '{"broken json',
    )
    for reply in malformed_replies:
        assert er2.parse_adjudication(reply).same_person == "unsure"
319
+
320
+
321
+ # ----------------------------------------------------------------------
322
+ # 6. Embedding plumbing (no network — backends are constructed only)
323
+ # ----------------------------------------------------------------------
324
+
325
def test_cosine():
    """Cosine is 1 for identical, 0 for orthogonal, and 0.0 for a zero vector."""
    x_axis, y_axis, zero = [1.0, 0.0], [0.0, 1.0], [0.0, 0.0]
    tolerance = 1e-9
    assert abs(er2.cosine(x_axis, [1.0, 0.0]) - 1.0) < tolerance
    assert abs(er2.cosine(x_axis, y_axis)) < tolerance
    assert er2.cosine(zero, x_axis) == 0.0
329
+
330
+
331
def test_embedding_bundle_contains_surfaces_and_facts():
    """The bundle text includes the canonical name, aliases and fact text."""
    entity = make_entity("e", "Johann Boedecker", aliases=["johann@x.com"])
    bundle = er2.embedding_bundle(entity, ["Johann leads the Berlin office"])
    for fragment in ("Johann Boedecker", "johann@x.com", "Berlin office"):
        assert fragment in bundle
337
+
338
+
339
def test_http_backend_requires_url_and_is_never_defaulted():
    """Constructing the HTTP backend with an empty URL raises ``ValueError``."""
    try:
        er2.HttpEmbeddingBackend("")
    except ValueError:
        return
    raise AssertionError("expected ValueError for empty --embed-url")
345
+
346
+
347
def test_local_backend_is_an_explicit_stub():
    """The local backend refuses to construct and points at the HTTP backend."""
    try:
        er2.LocalEmbeddingBackend()
    except NotImplementedError as exc:
        message = str(exc)
    else:
        raise AssertionError("expected NotImplementedError")
    assert "--embed-backend http" in message
353
+
354
+
355
+ # ----------------------------------------------------------------------
356
+ # 7. Safety gates: --apply refused without --i-have-a-snapshot
357
+ # ----------------------------------------------------------------------
358
+
359
def test_validate_args_refuses_apply_without_snapshot():
    """``--apply`` alone yields an error that names ``--i-have-a-snapshot``."""
    argv = ["--arena", "test-arena", "--pg-dsn", "x",
            "--embed-url", "http://e", "--apply"]
    err = er2.validate_args(er2.parse_args(argv))
    assert err is not None
    assert "--i-have-a-snapshot" in err
364
+
365
+
366
def test_validate_args_accepts_apply_with_snapshot():
    """``--apply`` together with ``--i-have-a-snapshot`` validates cleanly."""
    argv = ["--arena", "test-arena", "--pg-dsn", "x",
            "--embed-url", "http://e",
            "--apply", "--i-have-a-snapshot"]
    parsed = er2.parse_args(argv)
    assert er2.validate_args(parsed) is None
371
+
372
+
373
def test_cli_subprocess_refuses_apply_without_snapshot():
    """The real CLI exits 2 and names the missing flag when ``--apply`` is unguarded.

    Runs the script in a child interpreter and inspects only its exit code
    and stderr.
    """
    cmd = [sys.executable, str(_SCRIPTS / "entity_resolution_v2.py"),
           "--arena", "test-arena", "--pg-dsn", "postgresql://x", "--apply"]
    # Bounded run: if the safety gate ever regressed, the child could carry
    # on past argument validation; fail the test instead of hanging the suite.
    proc = subprocess.run(cmd, capture_output=True, text=True,
                          timeout=60, check=False)
    assert proc.returncode == 2
    assert "--i-have-a-snapshot" in proc.stderr
380
+
381
+
382
def test_cli_requires_arena():
    """The real CLI exits 2 and mentions ``--arena`` when the flag is omitted.

    Runs the script in a child interpreter and inspects only its exit code
    and stderr.
    """
    cmd = [sys.executable, str(_SCRIPTS / "entity_resolution_v2.py"),
           "--pg-dsn", "postgresql://x"]
    # Bounded run: a child that got past argument parsing must fail the test
    # rather than block the suite indefinitely.
    proc = subprocess.run(cmd, capture_output=True, text=True,
                          timeout=60, check=False)
    assert proc.returncode == 2
    assert "--arena" in proc.stderr
389
+
390
+
391
+ # ----------------------------------------------------------------------
392
+ # 8. Arena scoping — every SQL statement carries the arena predicate
393
+ # ----------------------------------------------------------------------
394
+
395
def test_v2_sql_registry_is_arena_scoped():
    """Every statement in v2's SQL registry carries the arena predicate."""
    registry = er2.ARENA_SCOPED_SQL
    assert registry, "registry must not be empty"
    for name, sql in registry.items():
        assert "arena = %s" in sql, f"v2 SQL '{name}' lost its arena predicate"
    er2.assert_arena_scoped()  # must not raise
400
+
401
+
402
def test_v1_load_and_repoint_sql_is_arena_scoped():
    """Source-level check that v1's repoint and load SQL filter by arena."""
    import re

    src = (_SCRIPTS / "backfill_entity_reconciliation.py").read_text()
    # Every fact/relationship repoint in apply_proposals is arena-scoped.
    repoint_pattern = re.compile(
        r"UPDATE (?:facts|relationships) SET \w+ = %s\s+WHERE ([^\"]+?)\n")
    predicates = repoint_pattern.findall(src)
    assert len(predicates) == 4, "expected the 4 v1 repoint statements"
    for where_clause in predicates:
        assert "arena = %s" in where_clause
    # The entity load + co-occurrence scan are arena-scoped too.
    required_snippets = (
        "WHERE arena = %s AND entity_type = %s",
        '"SELECT attributes FROM events WHERE arena = %s"',
    )
    for snippet in required_snippets:
        assert snippet in src
415
+
416
+
417
+ # ----------------------------------------------------------------------
418
+ # 9. v1 CLI byte-compatibility — v2 must not have changed it
419
+ # ----------------------------------------------------------------------
420
+
421
def test_v1_cli_surface_unchanged():
    """v1's parsed namespace for a minimal command line is exactly as pinned."""
    expected = {
        "arena": "a", "pg_dsn": "dsn", "entity_type": "person",
        "apply": False, "heuristic_merge": False, "out": None,
        "merged_by": None,
    }
    saved_argv = sys.argv
    # v1.parse_args() is called without arguments, so the command line is
    # supplied through sys.argv and restored afterwards.
    sys.argv = ["backfill_entity_reconciliation.py",
                "--arena", "a", "--pg-dsn", "dsn"]
    try:
        ns = v1.parse_args()
    finally:
        sys.argv = saved_argv
    assert vars(ns) == expected, \
        "v1's CLI namespace changed — it must stay byte-compatible"
434
+
435
+
436
def test_v1_cli_rejects_v2_only_flags():
    """v1's parser exits with code 2 when given ``--i-have-a-snapshot``."""
    saved_argv = sys.argv
    sys.argv = ["backfill_entity_reconciliation.py",
                "--arena", "a", "--pg-dsn", "dsn", "--i-have-a-snapshot"]
    try:
        v1.parse_args()
    except SystemExit as exc:
        assert exc.code == 2
    else:
        raise AssertionError("v1 accepted a v2-only flag")
    finally:
        sys.argv = saved_argv
448
+
449
+
450
def test_v1_machinery_is_imported_not_copied():
    """v2 shares v1's module and types and defines none of v1's functions."""
    # v2 must reuse v1's load/apply machinery, not fork it.
    assert er2.v1 is v1
    for shared in ("Entity", "MergeProposal"):
        assert getattr(er2, shared) is getattr(v1, shared)
    src = (_SCRIPTS / "entity_resolution_v2.py").read_text()
    v1_definitions = (
        "def load_entities",
        "def apply_proposals",
        "def collect_cooccurrence_pairs",
        "def build_proposals",
    )
    for fn in v1_definitions:
        assert fn not in src, f"v2 copied v1's {fn} instead of importing it"
459
+
460
+
461
+ # ----------------------------------------------------------------------
462
+ # 10. Tiered report
463
+ # ----------------------------------------------------------------------
464
+
465
def test_report_tiers_and_counts(tmp_path=None):
    """Report records carry a header, tiered proposals and per-pair evidence.

    ``tmp_path`` is not used by the body; it is kept so the signature is
    unchanged.
    """
    # One v1-tier proposal (co-occurrence) ...
    canonical = make_entity("e_rich", "Johann Boedecker", facts=50)
    duplicate = make_entity("e_dup", "Johann_Boedecker", facts=1)
    v1_prop = v1.MergeProposal(canonical=canonical, deprecated=[duplicate],
                               signal="co_occurrence")
    # ... and one v2-tier proposal built from an adjudicated pair.
    m1 = make_entity("e_m1", "Carla Voss", facts=9)
    m2 = make_entity("e_m2", "Carla Vosse")
    pair = _scored_pair(m1, m2, 0.85)
    pair.band = "ambiguous"
    pair.verdict = "yes"
    pair.reason = "same person per facts"
    routed = er2.RoutedPairs(merge=[pair])
    v2_props = er2.pairs_to_proposals([pair])

    records = er2.build_report_records(
        "test-arena", [v1_prop], v2_props, routed, before_count=10)

    header = records[0]
    assert header["arena"] == "test-arena"
    assert header["entity_count_before"] == 10
    assert header["entity_count_after_if_applied"] == 8  # 2 deprecations
    assert "untouched" in header["other_arenas"]
    assert header["tiers"] == ["co_occurrence", "alias_overlap",
                               "embedding_llm", "heuristic"]

    proposals = [r for r in records if r["type"] == "merge_proposal"]
    assert [p["tier"] for p in proposals] == ["co_occurrence", "embedding_llm"]
    evidence = proposals[1]["evidence"][0]
    assert evidence["similarity"] == 0.85
    assert evidence["verdict"] == "yes"
    assert "same person per facts" in evidence["reason"]
    # All records JSONL-serializable.
    for record in records:
        json.dumps(record)
495
+
496
+
497
def test_markdown_summary_mentions_frozen_arena(tmp_path=None):
    """The markdown summary names the target arena and the frozen one.

    ``tmp_path`` is not used by the body; the file is written to a
    ``tempfile.TemporaryDirectory`` instead.
    """
    import os
    import tempfile

    records = er2.build_report_records(
        "test-arena", [], [], er2.RoutedPairs(), before_count=3)
    with tempfile.TemporaryDirectory() as workdir:
        summary_path = os.path.join(workdir, "summary.md")
        er2.write_markdown_summary(records, summary_path)
        # Read while the directory still exists; it is removed on exit.
        text = Path(summary_path).read_text()
    assert "Other arenas" in text
    assert "pip-agents" in text
    assert "FROZEN" in text
    assert "`test-arena`" in text