pdfhell 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfhell/__init__.py +34 -0
- pdfhell/auditpack.py +182 -0
- pdfhell/case.py +87 -0
- pdfhell/cli.py +216 -0
- pdfhell/generators/__init__.py +49 -0
- pdfhell/generators/_common.py +183 -0
- pdfhell/generators/footnote_override.py +212 -0
- pdfhell/generators/hidden_ocr_mismatch.py +129 -0
- pdfhell/generators/split_table_across_pages.py +174 -0
- pdfhell/junit.py +94 -0
- pdfhell/runner.py +142 -0
- pdfhell/scorer.py +214 -0
- pdfhell/suite.py +104 -0
- pdfhell/vision.py +231 -0
- pdfhell-0.1.0.dist-info/METADATA +208 -0
- pdfhell-0.1.0.dist-info/RECORD +20 -0
- pdfhell-0.1.0.dist-info/WHEEL +5 -0
- pdfhell-0.1.0.dist-info/entry_points.txt +2 -0
- pdfhell-0.1.0.dist-info/licenses/LICENSE +17 -0
- pdfhell-0.1.0.dist-info/top_level.txt +1 -0
pdfhell/junit.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Render a :class:`SuiteReport` as JUnit XML for CI consumption.
|
|
2
|
+
|
|
3
|
+
GitHub Actions, GitLab CI, Jenkins, CircleCI, and most other CI runners
|
|
4
|
+
display JUnit XML natively in their PR / merge-request panel. Failures
|
|
5
|
+
show up as red rows on the PR with the model output and expected
|
|
6
|
+
answer in the failure message.
|
|
7
|
+
|
|
8
|
+
The rendered XML follows the de-facto JUnit dialect (Ant/Maven-style)
|
|
9
|
+
that everyone parses. We classify outcomes as:
|
|
10
|
+
|
|
11
|
+
- ``correct=True`` → passing testcase, no children
|
|
12
|
+
- ``fell_for_trap`` → ``<failure>`` (this is the diagnostic signal)
|
|
13
|
+
- ``refused`` → ``<skipped>`` (model wouldn't answer — not a quality fail)
|
|
14
|
+
- everything else → ``<failure>``
|
|
15
|
+
|
|
16
|
+
We deliberately don't emit ``<error>`` — pdfhell's upstream errors
|
|
17
|
+
(provider down, SDK missing) get caught in the runner and recorded as
|
|
18
|
+
the model's text output starting with ``[error]``. Surfacing them as
|
|
19
|
+
``<error>`` would add noise to the dashboard for transient infra issues.
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import datetime
|
|
24
|
+
import xml.etree.ElementTree as ET
|
|
25
|
+
|
|
26
|
+
from .scorer import SuiteReport
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _suite_name(report: SuiteReport) -> str:
    """Build the dotted JUnit suite name ``pdfhell.<suite>.<model>``.

    Every ``/`` and ``:`` in the report's model string is mapped to ``.``
    so the model part uses the same separator as the rest of the name.
    """
    # One translate pass instead of two chained replace() calls.
    model_slug = report.model.translate(str.maketrans("/:", ".."))
    return f"pdfhell.{report.suite}.{model_slug}"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Code points that XML 1.0 forbids in character data: C0 controls other
# than TAB/LF/CR, UTF-16 surrogates, and the U+FFFE/U+FFFF non-characters.
# ElementTree serialises them verbatim, so each is mapped to U+FFFD here.
_XML_ILLEGAL_CHARS = dict.fromkeys(
    [cp for cp in range(0x20) if cp not in (0x09, 0x0A, 0x0D)]
    + list(range(0xD800, 0xE000))
    + [0xFFFE, 0xFFFF],
    "\ufffd",
)


def _xml_safe(text: str) -> str:
    """Return ``text`` with XML-1.0-illegal characters replaced by U+FFFD."""
    return text.translate(_XML_ILLEGAL_CHARS)


def report_to_junit(report: SuiteReport) -> str:
    """Return a JUnit XML document (as ``str``) for ``report``.

    Layout: one ``<testsuites>`` root holding a single ``<testsuite>``
    with one ``<testcase>`` per entry in ``report.cases``:

    - refused cases get a ``<skipped>`` child carrying the first 200
      characters of the model output;
    - other incorrect cases get a ``<failure>`` child whose ``type`` is
      ``fell_for_trap`` or ``hallucination`` and whose body lists the
      expected answer, the full model output and, when present, the
      matched forbidden answers and the failure mode;
    - correct cases get no children.

    ``errors`` is always ``"0"``; no ``<error>`` element is emitted.

    Model output is free text, so it is passed through ``_xml_safe``
    before being stored in an element body: a single control character
    would otherwise make the whole document unparseable.
    """
    failures = sum(1 for c in report.cases if not c.correct and not c.refused)
    skipped = sum(1 for c in report.cases if c.refused)
    testsuite = ET.Element(
        "testsuite",
        {
            "name": _suite_name(report),
            "tests": str(report.n),
            "failures": str(failures),
            "errors": "0",
            "skipped": str(skipped),
            "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        },
    )
    for c in report.cases:
        case_el = ET.SubElement(
            testsuite,
            "testcase",
            {
                "name": c.case_id,
                "classname": c.trap_family,
            },
        )
        if c.refused:
            skipped_el = ET.SubElement(case_el, "skipped", {"message": "model refused"})
            skipped_el.text = _xml_safe(c.model_output[:200])
        elif not c.correct:
            kind = "fell_for_trap" if c.fell_for_trap else "hallucination"
            failure_el = ET.SubElement(
                case_el,
                "failure",
                {
                    "type": kind,
                    # repr() already escapes control characters, so the
                    # attribute value needs no extra sanitising.
                    "message": (
                        f"expected={c.expected!r}; got={c.model_output[:80]!r}"
                    ),
                },
            )
            details = [
                f"expected_answer: {c.expected}",
                f"model_output: {c.model_output}",
            ]
            if c.matched_forbidden:
                details.append(f"matched_forbidden: {c.matched_forbidden}")
            if c.failure_mode:
                details.append(f"failure_mode: {c.failure_mode}")
            failure_el.text = _xml_safe("\n".join(details))
    testsuites = ET.Element("testsuites")
    testsuites.append(testsuite)
    # The declaration is written explicitly rather than through
    # ``xml_declaration=True``: with ``encoding="unicode"`` the declared
    # encoding is chosen by ElementTree and has depended on the locale in
    # some Python versions. NOTE(review): this assumes callers write the
    # returned string to disk as UTF-8 — confirm at the call site.
    return "<?xml version='1.0' encoding='UTF-8'?>\n" + ET.tostring(
        testsuites, encoding="unicode"
    )
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# Public API of this module: only the renderer is exported.
__all__ = ["report_to_junit"]
|
pdfhell/runner.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Run a model against a pdfhell suite.
|
|
2
|
+
|
|
3
|
+
The runner is intentionally thin. It:
|
|
4
|
+
|
|
5
|
+
1. Loads cases from disk (or builds them on demand by re-seeding).
|
|
6
|
+
2. Sends each ``(question, pdf)`` pair to a vision-capable model.
|
|
7
|
+
3. Scores the model's free-text answer against code-based ground truth.
|
|
8
|
+
|
|
9
|
+
It does NOT do its own scoring methodology — that lives in
|
|
10
|
+
:mod:`pdfhell.scorer`. It does NOT do its own provider dispatch — that
|
|
11
|
+
lives in :mod:`multivon_eval.judge` (we reuse the vision-call dispatch
|
|
12
|
+
from the multimodal evaluators).
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Iterable
|
|
21
|
+
|
|
22
|
+
from multivon_eval import JudgeConfig
|
|
23
|
+
|
|
24
|
+
from .case import HellCase
|
|
25
|
+
from .scorer import CaseScore, SuiteReport, score_case, summarise
|
|
26
|
+
from .vision import call_vision
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def parse_model_spec(spec: str) -> JudgeConfig:
    """Parse ``"provider:model"`` shorthand into a ``JudgeConfig``.

    Examples::

        anthropic:claude-sonnet-4-6
        openai:gpt-4o
        google:gemini-2.5-pro

    Only the first ``:`` separates provider from model, so model names may
    themselves contain colons. The provider is stripped and lower-cased;
    the model is stripped.

    The shorthand is opinionated: temperature is fixed at 0.0 (we want
    deterministic answers for a benchmark) and max_tokens at 2048.

    Raises:
        ValueError: if ``spec`` contains no ``:``, names a provider other
            than anthropic/openai/google, or has an empty model name.
    """
    provider, separator, model = spec.partition(":")
    if not separator:
        raise ValueError(
            f"model spec must be 'provider:model', got {spec!r}. "
            "Example: anthropic:claude-sonnet-4-6"
        )
    provider = provider.strip().lower()
    if provider not in {"anthropic", "openai", "google"}:
        raise ValueError(
            f"unsupported provider {provider!r}; "
            "use anthropic, openai, or google"
        )
    model = model.strip()
    if not model:
        # "openai:" used to pass through with an empty model name; reject
        # it here, at the same point as the other malformed specs.
        raise ValueError(
            f"model spec {spec!r} has an empty model name. "
            "Example: anthropic:claude-sonnet-4-6"
        )
    # max_tokens=2048 gives prose answers room (e.g. footnote_override
    # carve-out summaries) without letting models ramble. Gemini 2.5
    # Flash allocates output tokens to internal "thinking"; tight
    # budgets either produce empty responses or truncate mid-sentence.
    # 2k is sufficient headroom in practice.
    return JudgeConfig(
        provider=provider,
        model=model,
        temperature=0.0,
        max_tokens=2048,
    )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass(slots=True)
class _Job:
    """One unit of work for the runner: a case and the PDF that goes with it."""

    case: HellCase  # case whose question is sent to the model
    pdf_path: Path  # PDF file handed to the model alongside the question
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _ask_model(job: _Job, judge: JudgeConfig) -> tuple[HellCase, str]:
    """Ask the model one question about one PDF.

    Returns ``(case, answer)`` where ``answer`` is the model's reply with
    surrounding whitespace stripped. Nothing is caught here: any exception
    raised by ``call_vision`` propagates to the caller.
    """
    # Fall back to 2048 when the config leaves max_tokens unset or zero.
    token_budget = judge.max_tokens or 2048
    pdf_sources = [str(job.pdf_path)]
    reply = call_vision(
        prompt=job.case.question,
        sources=pdf_sources,
        judge=judge,
        max_tokens=token_budget,
    )
    return job.case, reply.strip()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def run_suite(
    cases_dir: Path,
    model_spec: str,
    *,
    workers: int = 4,
    progress: bool = True,
    suite_name: str = "mini",
) -> SuiteReport:
    """Evaluate every case under ``cases_dir`` against ``model_spec``.

    ``cases_dir`` must contain ``<case_id>.json`` and ``<case_id>.pdf``
    pairs produced by :func:`pdfhell.suite.build_suite`.

    Cases are sent to the model concurrently on ``workers`` threads. The
    returned report lists cases in the order ``_load_jobs`` yields them,
    regardless of which request finishes first, so two runs over the same
    directory produce identically ordered reports. Progress lines, when
    ``progress`` is true, are still printed in completion order.

    A case whose model call raises is kept in the report: its output is
    recorded as ``"[error] <ExceptionType>: <message>"`` and it is marked
    incorrect and refused.

    Raises:
        ValueError: if ``model_spec`` is malformed (from ``parse_model_spec``).
        FileNotFoundError: if ``cases_dir`` yields no cases.
    """
    judge = parse_model_spec(model_spec)
    jobs = list(_load_jobs(cases_dir))
    if not jobs:
        raise FileNotFoundError(f"no cases found in {cases_dir}")
    # One slot per job, filled as futures complete, so the final list
    # follows job order rather than completion order.
    slots: list[CaseScore | None] = [None] * len(jobs)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(_ask_model, job, judge): index
            for index, job in enumerate(jobs)
        }
        for fut in as_completed(futures):
            index = futures[fut]
            case = jobs[index].case
            try:
                _, answer = fut.result()
            except Exception as exc:  # JudgeUnavailable, provider error, etc.
                # Treat upstream errors as refusals so the run still
                # produces a complete report. The flags are set
                # explicitly: the error text is not a model answer, so
                # whatever it happens to match must not decide the
                # outcome.
                answer = f"[error] {type(exc).__name__}: {exc}"
                score = score_case(case, answer)
                score.correct = False
                score.fell_for_trap = False
                score.refused = True
                score.failure_mode = ""
            else:
                score = score_case(case, answer)
            slots[index] = score
            if progress:
                mark = "✓" if score.correct else ("⚠" if score.fell_for_trap else "✗")
                print(f" {mark} {score.case_id:36s} expected={score.expected!r:30s} got={answer[:60]!r}")
    scores = [score for score in slots if score is not None]
    return summarise(model_spec, suite_name, scores)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _load_jobs(cases_dir: Path) -> Iterable[_Job]:
    """Yield one ``_Job`` per ``*.json`` file in ``cases_dir``, in sorted path order.

    The PDF is looked up next to the JSON file (same stem, ``.pdf``
    suffix). If that file is missing, the ``pdf_path`` recorded in the
    case is resolved relative to ``cases_dir`` instead.

    Raises:
        FileNotFoundError: when neither location exists. Because this is a
            generator, the error surfaces when the offending case is reached.
    """
    for meta_path in sorted(cases_dir.glob("*.json")):
        case = HellCase.load_json(meta_path)
        sibling_pdf = meta_path.with_suffix(".pdf")
        if sibling_pdf.exists():
            yield _Job(case=case, pdf_path=sibling_pdf)
            continue
        # The case JSON tracks its own pdf_path relative to the suite
        # root; honour that as a fallback.
        recorded_pdf = (cases_dir / case.pdf_path).resolve()
        if not recorded_pdf.exists():
            raise FileNotFoundError(
                f"PDF not found for case {case.id}; expected {recorded_pdf}"
            )
        yield _Job(case=case, pdf_path=recorded_pdf)
|
pdfhell/scorer.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""Code-based scoring for pdfhell cases.
|
|
2
|
+
|
|
3
|
+
LLM-as-judge is circular: the same complexity that fools the agent
|
|
4
|
+
often fools the judge. pdfhell's primary correctness signal therefore
|
|
5
|
+
does *not* go through an LLM. The PDF was generated from code, so the
|
|
6
|
+
answer is exactly known and the scorer compares strings directly.
|
|
7
|
+
|
|
8
|
+
QAG (multivon-eval's :class:`~multivon_eval.DocumentGrounding`) is
|
|
9
|
+
available separately as the *explanation* of why a model failed — "the
|
|
10
|
+
model returned $19,900.25, matching the hidden-OCR layer rather than
|
|
11
|
+
the visible $18,900.25" — but it never affects pass/fail.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from .case import HellCase
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Any run of whitespace (collapsed to a single space).
_WHITESPACE_RE = re.compile(r"\s+")
# A run of . , ; : at the very end of the string, plus trailing whitespace.
_PUNCT_NORMALIZE_RE = re.compile(r"[.,;:]+\s*$")


def _normalize(s: str) -> str:
    """Loosely normalise free text for substring matching.

    Steps, in order: strip the ends, lower-case, collapse every whitespace
    run to one space, then drop a trailing run of ``.,;:``. Only
    punctuation at the end of the string is removed, so ``"$18,900.25."``
    becomes ``"$18,900.25"`` while the comma and inner period are kept.
    """
    lowered = s.strip().lower()
    single_spaced = _WHITESPACE_RE.sub(" ", lowered)
    return _PUNCT_NORMALIZE_RE.sub("", single_spaced)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _contains_loose(haystack: str, needle: str) -> bool:
    """Return True when the normalised ``needle`` is a substring of the normalised ``haystack``."""
    wanted = _normalize(needle)
    text = _normalize(haystack)
    return wanted in text
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass(slots=True)
|
|
41
|
+
class CaseScore:
|
|
42
|
+
"""Score for one (case, model_output) pair.
|
|
43
|
+
|
|
44
|
+
``correct`` is the headline binary signal pdfhell publishes. The
|
|
45
|
+
other fields disambiguate *how* the model got it wrong, which is
|
|
46
|
+
what makes pdfhell useful as a diagnostic tool — not just a number.
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
case_id: str
|
|
50
|
+
trap_family: str
|
|
51
|
+
correct: bool
|
|
52
|
+
fell_for_trap: bool
|
|
53
|
+
refused: bool
|
|
54
|
+
matched_expected: bool
|
|
55
|
+
matched_forbidden: list[str] = field(default_factory=list)
|
|
56
|
+
model_output: str = ""
|
|
57
|
+
expected: str = ""
|
|
58
|
+
failure_mode: str = "" # human-readable, drawn from the case metadata when relevant
|
|
59
|
+
|
|
60
|
+
def to_dict(self) -> dict[str, Any]:
|
|
61
|
+
return {
|
|
62
|
+
"case_id": self.case_id,
|
|
63
|
+
"trap_family": self.trap_family,
|
|
64
|
+
"correct": self.correct,
|
|
65
|
+
"fell_for_trap": self.fell_for_trap,
|
|
66
|
+
"refused": self.refused,
|
|
67
|
+
"matched_expected": self.matched_expected,
|
|
68
|
+
"matched_forbidden": self.matched_forbidden,
|
|
69
|
+
"model_output": self.model_output,
|
|
70
|
+
"expected": self.expected,
|
|
71
|
+
"failure_mode": self.failure_mode,
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# Lower-case phrases whose presence anywhere in an output marks it as a refusal.
_REFUSAL_PATTERNS = (
    "i cannot",
    "i can't",
    "unable to",
    "i do not have access",
    "i don't have access",
    "no information",
    "not visible",
    "cannot determine",
    "can't determine",
)


def _looks_like_refusal(output: str) -> bool:
    """Heuristically decide whether ``output`` is a refusal.

    An output shorter than four characters after stripping counts as a
    refusal; otherwise it is one when any ``_REFUSAL_PATTERNS`` phrase
    occurs in it, compared case-insensitively.
    """
    text = output.strip().lower()
    if len(text) < 4:
        return True
    for phrase in _REFUSAL_PATTERNS:
        if phrase in text:
            return True
    return False
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def score_case(case: HellCase, model_output: str) -> CaseScore:
    """Score one model output against the case's code-based ground truth.

    How the verdict is reached:

    - Expected match: if the case defines ``expected_tokens``, *every*
      token must appear in the output (loose, case- and
      whitespace-tolerant containment). Otherwise the output must contain
      ``expected_answer``.
    - Forbidden match: each entry of ``forbidden_answers`` found in the
      output is recorded, except when the case uses ``expected_tokens``
      and all of them matched. In that situation the forbidden phrase is
      often a literal substring of any complete correct answer, so the
      check is skipped.
    - ``correct``: expected matched and nothing forbidden matched.
    - ``fell_for_trap``: something forbidden matched and expected did not;
      ``failure_mode`` is then read from
      ``case.metadata["expected_failure_mode"]``.
    - ``refused``: neither expected nor forbidden matched and the output
      looks like a refusal.
    """

    def found(fragment: str) -> bool:
        return _contains_loose(model_output, fragment)

    tokens = case.expected_tokens
    if tokens:
        hit_expected = all(found(token) for token in tokens)
    else:
        hit_expected = found(case.expected_answer)

    # Token-based cases that matched every token skip the forbidden scan.
    # For single-value traps expected and forbidden are mutually exclusive
    # by construction, so the scan stays active.
    if tokens and hit_expected:
        hit_forbidden: list[str] = []
    else:
        hit_forbidden = [bad for bad in case.forbidden_answers if found(bad)]

    trapped = bool(hit_forbidden) and not hit_expected
    nothing_matched = not (hit_expected or hit_forbidden)
    return CaseScore(
        case_id=case.id,
        trap_family=case.trap_family,
        correct=hit_expected and not hit_forbidden,
        fell_for_trap=trapped,
        # Refusal detection only applies to otherwise-unmatched outputs.
        refused=nothing_matched and _looks_like_refusal(model_output),
        matched_expected=hit_expected,
        matched_forbidden=hit_forbidden,
        model_output=model_output,
        expected=case.expected_answer,
        failure_mode=(
            case.metadata.get("expected_failure_mode", "") if trapped else ""
        ),
    )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@dataclass(slots=True)
|
|
153
|
+
class SuiteReport:
|
|
154
|
+
"""Aggregate scoring summary for a run.
|
|
155
|
+
|
|
156
|
+
Designed to serialise to the leaderboard JSON multivon.ai already
|
|
157
|
+
consumes (one entry per (suite, model, judge) tuple).
|
|
158
|
+
"""
|
|
159
|
+
|
|
160
|
+
model: str
|
|
161
|
+
suite: str
|
|
162
|
+
n: int
|
|
163
|
+
pass_rate: float
|
|
164
|
+
per_trap_pass: dict[str, float]
|
|
165
|
+
per_trap_fell_for_trap: dict[str, float]
|
|
166
|
+
refused_rate: float
|
|
167
|
+
cases: list[CaseScore] = field(default_factory=list)
|
|
168
|
+
|
|
169
|
+
def to_dict(self) -> dict[str, Any]:
|
|
170
|
+
return {
|
|
171
|
+
"model": self.model,
|
|
172
|
+
"suite": self.suite,
|
|
173
|
+
"n": self.n,
|
|
174
|
+
"pass_rate": self.pass_rate,
|
|
175
|
+
"per_trap_pass": self.per_trap_pass,
|
|
176
|
+
"per_trap_fell_for_trap": self.per_trap_fell_for_trap,
|
|
177
|
+
"refused_rate": self.refused_rate,
|
|
178
|
+
"cases": [c.to_dict() for c in self.cases],
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
def worst_failures(self, k: int = 5) -> list[CaseScore]:
|
|
182
|
+
"""Return up to ``k`` cases that fell into the designed trap.
|
|
183
|
+
|
|
184
|
+
Useful for the launch share-card: "Worst failures: hidden-OCR
|
|
185
|
+
mismatch caught the model 8/10 times."
|
|
186
|
+
"""
|
|
187
|
+
return [c for c in self.cases if c.fell_for_trap][:k]
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def summarise(model: str, suite: str, scores: list[CaseScore]) -> SuiteReport:
    """Aggregate per-case scores into a ``SuiteReport``.

    Rates are plain fractions: overall pass and refusal rates over all
    scores, and per-trap-family pass / fell-for-trap rates over that
    family's scores. An empty ``scores`` gives a report with ``n=0``, all
    rates 0.0 and empty per-trap maps.
    """
    if not scores:
        return SuiteReport(
            model=model,
            suite=suite,
            n=0,
            pass_rate=0.0,
            per_trap_pass={},
            per_trap_fell_for_trap={},
            refused_rate=0.0,
            cases=[],
        )

    # Group by trap family, keeping first-seen order of the families.
    grouped: dict[str, list[CaseScore]] = {}
    for score in scores:
        if score.trap_family not in grouped:
            grouped[score.trap_family] = []
        grouped[score.trap_family].append(score)

    pass_by_trap: dict[str, float] = {}
    trapped_by_trap: dict[str, float] = {}
    for family, members in grouped.items():
        size = len(members)
        pass_by_trap[family] = sum(m.correct for m in members) / size
        trapped_by_trap[family] = sum(m.fell_for_trap for m in members) / size

    total = len(scores)
    return SuiteReport(
        model=model,
        suite=suite,
        n=total,
        pass_rate=sum(s.correct for s in scores) / total,
        per_trap_pass=pass_by_trap,
        per_trap_fell_for_trap=trapped_by_trap,
        refused_rate=sum(s.refused for s in scores) / total,
        cases=scores,
    )
|
pdfhell/suite.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Suite builder.
|
|
2
|
+
|
|
3
|
+
Suites are sets of (PDF, case-JSON) pairs on disk. A suite is fully
|
|
4
|
+
described by ``SuiteSpec`` — a deterministic recipe — so anyone with
|
|
5
|
+
the suite name and ``pdfhell`` can re-derive byte-identical PDFs and
|
|
6
|
+
answer keys.
|
|
7
|
+
|
|
8
|
+
This is part of the "code-based ground truth" promise: the suite isn't
|
|
9
|
+
a static blob, it's a recipe + a verifiable hash.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Iterable
|
|
16
|
+
|
|
17
|
+
from .case import HellCase
|
|
18
|
+
from .generators import generate_case
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(slots=True)
|
|
22
|
+
class SuiteSpec:
|
|
23
|
+
"""Recipe for a reproducible suite.
|
|
24
|
+
|
|
25
|
+
``traps`` maps a trap family name to a list of seeds — those exact
|
|
26
|
+
seeds produce those exact PDFs. Run ``pdfhell build-suite --suite
|
|
27
|
+
mini`` to materialise to disk.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
name: str
|
|
31
|
+
traps: dict[str, list[int]] = field(default_factory=dict)
|
|
32
|
+
|
|
33
|
+
@property
|
|
34
|
+
def total_cases(self) -> int:
|
|
35
|
+
return sum(len(s) for s in self.traps.values())
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def mini_suite() -> SuiteSpec:
    """Return the canonical ``mini`` suite: 30 cases, 10 per trap family.

    Each family uses ten consecutive seeds starting at a fixed base
    (1001, 2001 and 3001). The seeds are arbitrary but fixed, so the spec
    is the same on every call.
    """
    first_seed_by_family = {
        "hidden_ocr_mismatch": 1001,
        "footnote_override": 2001,
        "split_table_across_pages": 3001,
    }
    seeds_by_family = {
        family: list(range(first, first + 10))
        for family, first in first_seed_by_family.items()
    }
    return SuiteSpec(name="mini", traps=seeds_by_family)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def smoke_suite() -> SuiteSpec:
    """Return the 3-case ``smoke`` suite: one case per trap family.

    Each family uses the first seed of the corresponding ``mini`` family
    (1001, 2001, 3001), so smoke cases are a strict subset of mini cases.
    Intended as a quick first run that still exercises every trap family.
    """
    first_seeds = (
        ("hidden_ocr_mismatch", 1001),
        ("footnote_override", 2001),
        ("split_table_across_pages", 3001),
    )
    return SuiteSpec(
        name="smoke",
        traps={family: [seed] for family, seed in first_seeds},
    )
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# Registry of the built-in suites, keyed by suite name. Each spec is
# built once, when this module is imported.
SUITES: dict[str, SuiteSpec] = dict(
    smoke=smoke_suite(),
    mini=mini_suite(),
)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def build_suite(spec: SuiteSpec, out_dir: Path) -> list[HellCase]:
    """Materialise ``spec`` under ``out_dir`` and return the generated cases.

    ``out_dir`` is created if needed. For every (trap family, seed) pair
    in ``spec.traps`` a case is generated and written as
    ``<case_id>.pdf`` plus ``<case_id>.json``. Each returned case has
    ``pdf_path`` set to the PDF's bare file name, i.e. relative to
    ``out_dir``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    generated: list[HellCase] = []
    recipe = (
        (family, seed)
        for family, seeds in spec.traps.items()
        for seed in seeds
    )
    for family, seed in recipe:
        pdf_bytes, case = generate_case(family, seed)
        pdf_file = out_dir / f"{case.id}.pdf"
        pdf_file.write_bytes(pdf_bytes)
        # Store the relative name only; runners join it with cases_dir.
        case.pdf_path = pdf_file.name
        case.dump_json(out_dir / f"{case.id}.json")
        generated.append(case)
    return generated
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def iter_cases(cases_dir: Path) -> Iterable[HellCase]:
    """Lazily yield every case stored as ``*.json`` in ``cases_dir``, in sorted path order."""
    yield from map(HellCase.load_json, sorted(cases_dir.glob("*.json")))
|