pdfhell 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdfhell/junit.py ADDED
@@ -0,0 +1,94 @@
1
+ """Render a :class:`SuiteReport` as JUnit XML for CI consumption.
2
+
3
+ GitHub Actions, GitLab CI, Jenkins, CircleCI, and most other CI runners
4
+ display JUnit XML natively in their PR / merge-request panel. Failures
5
+ show up as red rows on the PR with the model output and expected
6
+ answer in the failure message.
7
+
8
+ The rendered XML follows the de-facto JUnit dialect (Ant/Maven-style)
9
+ that everyone parses. We classify outcomes as:
10
+
11
+ - ``correct=True`` → passing testcase, no children
12
+ - ``fell_for_trap`` → ``<failure>`` (this is the diagnostic signal)
13
+ - ``refused`` → ``<skipped>`` (model wouldn't answer — not a quality fail)
14
+ - everything else → ``<failure>``
15
+
16
+ We deliberately don't emit ``<error>`` — pdfhell's upstream errors
17
+ (provider down, SDK missing) get caught in the runner and recorded as
18
+ the model's text output starting with ``[error]``. Surfacing them as
19
+ ``<error>`` would clutter the dashboard with noise from transient infra issues.
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import datetime
24
+ import xml.etree.ElementTree as ET
25
+
26
+ from .scorer import SuiteReport
27
+
28
+
29
def _suite_name(report: SuiteReport) -> str:
    """Slug-ish name for the JUnit suite — 'pdfhell.<suite>.<model>'.

    ``/`` and ``:`` in the model spec are replaced with ``.`` so the name
    reads as a dotted path.
    """
    safe_model = report.model.replace("/", ".").replace(":", ".")
    return f"pdfhell.{report.suite}.{safe_model}"


# Code points that XML 1.0 does not allow anywhere in a document: the C0
# controls other than tab / LF / CR, the surrogate range, and the two
# non-characters U+FFFE / U+FFFF. ElementTree serialises them verbatim, which
# yields a file strict parsers reject, so each one is mapped to U+FFFD.
_XML_ILLEGAL_CHARS = dict.fromkeys(
    (
        *range(0x00, 0x09),
        0x0B,
        0x0C,
        *range(0x0E, 0x20),
        *range(0xD800, 0xE000),
        0xFFFE,
        0xFFFF,
    ),
    "\ufffd",
)


def _xml_safe(text: str) -> str:
    """Return ``text`` with XML-1.0-illegal characters replaced by U+FFFD."""
    return text.translate(_XML_ILLEGAL_CHARS)


def report_to_junit(report: SuiteReport) -> str:
    """Return a JUnit XML string for ``report``.

    Each case becomes one ``<testcase>``:

    - refused cases get a ``<skipped>`` child holding the first 200
      characters of the model output;
    - other incorrect cases get a ``<failure>`` child whose ``type`` is
      ``fell_for_trap`` or ``hallucination`` and whose text lists the
      expected answer, the full model output and, when present, the matched
      forbidden answers and failure mode;
    - correct cases have no children.

    Model output is free text, so every value derived from a case is passed
    through :func:`_xml_safe` first; a stray control character would
    otherwise make the whole document unparseable.
    """
    failures = sum(1 for c in report.cases if not c.correct and not c.refused)
    skipped = sum(1 for c in report.cases if c.refused)
    testsuite = ET.Element(
        "testsuite",
        {
            "name": _xml_safe(_suite_name(report)),
            "tests": str(report.n),
            "failures": str(failures),
            # Errors are never emitted as <error>, so the count is fixed at zero.
            "errors": "0",
            "skipped": str(skipped),
            "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        },
    )
    for c in report.cases:
        case_el = ET.SubElement(
            testsuite,
            "testcase",
            {
                "name": _xml_safe(c.case_id),
                "classname": _xml_safe(c.trap_family),
            },
        )
        if c.refused:
            skipped_el = ET.SubElement(case_el, "skipped", {"message": "model refused"})
            skipped_el.text = _xml_safe(c.model_output[:200])
        elif not c.correct:
            kind = "fell_for_trap" if c.fell_for_trap else "hallucination"
            failure_el = ET.SubElement(
                case_el,
                "failure",
                {
                    "type": kind,
                    "message": _xml_safe(
                        f"expected={c.expected!r}; got={c.model_output[:80]!r}"
                    ),
                },
            )
            details = [
                f"expected_answer: {c.expected}",
                f"model_output: {c.model_output}",
            ]
            if c.matched_forbidden:
                details.append(f"matched_forbidden: {c.matched_forbidden}")
            if c.failure_mode:
                details.append(f"failure_mode: {c.failure_mode}")
            failure_el.text = _xml_safe("\n".join(details))
    testsuites = ET.Element("testsuites")
    testsuites.append(testsuite)
    # Ask for an explicit XML declaration: it is what CI parsers expect.
    return ET.tostring(testsuites, encoding="unicode", xml_declaration=True)
92
+
93
+
94
+ __all__ = ["report_to_junit"]
pdfhell/runner.py ADDED
@@ -0,0 +1,142 @@
1
+ """Run a model against a pdfhell suite.
2
+
3
+ The runner is intentionally thin. It:
4
+
5
+ 1. Loads cases from disk (or builds them on demand by re-seeding).
6
+ 2. Sends each ``(question, pdf)`` pair to a vision-capable model.
7
+ 3. Scores the model's free-text answer against code-based ground truth.
8
+
9
+ It does NOT do its own scoring methodology — that lives in
10
+ :mod:`pdfhell.scorer`. It does NOT do its own provider dispatch — that
11
+ lives in :mod:`multivon_eval.judge` (we reuse the vision-call dispatch
12
+ from the multimodal evaluators).
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ from concurrent.futures import ThreadPoolExecutor, as_completed
18
+ from dataclasses import dataclass
19
+ from pathlib import Path
20
+ from typing import Iterable
21
+
22
+ from multivon_eval import JudgeConfig
23
+
24
+ from .case import HellCase
25
+ from .scorer import CaseScore, SuiteReport, score_case, summarise
26
+ from .vision import call_vision
27
+
28
+
29
def parse_model_spec(spec: str) -> JudgeConfig:
    """Parse ``"provider:model"`` shorthand into a ``JudgeConfig``.

    Examples::

        anthropic:claude-sonnet-4-6
        openai:gpt-4o
        google:gemini-2.5-pro

    Only the first ``:`` separates provider from model, so model names may
    themselves contain colons. The provider is lower-cased and both parts
    are stripped of surrounding whitespace.

    The shorthand is opinionated about temperature (0.0 — we want
    deterministic answers for a benchmark) and max_tokens (2048 — see the
    comment above the return statement).

    Raises:
        ValueError: if ``spec`` has no ``:``, names a provider other than
            anthropic / openai / google, or has an empty model name.
    """
    provider, separator, model = spec.partition(":")
    if not separator:
        raise ValueError(
            f"model spec must be 'provider:model', got {spec!r}. "
            "Example: anthropic:claude-sonnet-4-6"
        )
    provider = provider.strip().lower()
    if provider not in {"anthropic", "openai", "google"}:
        raise ValueError(
            f"unsupported provider {provider!r}; "
            "use anthropic, openai, or google"
        )
    model = model.strip()
    if not model:
        # Fail here with a clear message instead of sending an empty model
        # name to the provider for every case in the suite.
        raise ValueError(
            f"model spec is missing a model name, got {spec!r}. "
            "Example: anthropic:claude-sonnet-4-6"
        )
    # max_tokens=2048 gives prose answers room (e.g. footnote_override
    # carve-out summaries) without letting models ramble. Gemini 2.5
    # Flash allocates output tokens to internal "thinking"; tight
    # budgets either produce empty responses or truncate mid-sentence.
    # 2k is sufficient headroom in practice.
    return JudgeConfig(
        provider=provider,
        model=model,
        temperature=0.0,
        max_tokens=2048,
    )
66
+
67
+
68
+ @dataclass(slots=True)
69
+ class _Job:
70
+ case: HellCase
71
+ pdf_path: Path
72
+
73
+
74
def _ask_model(job: _Job, judge: JudgeConfig) -> tuple[HellCase, str]:
    """Pose one case's question to the model and return ``(case, answer)``.

    The answer is the text returned by ``call_vision`` with surrounding
    whitespace stripped. Exceptions raised by ``call_vision`` are not caught
    here; they propagate to the caller.
    """
    request = {
        "prompt": job.case.question,
        "sources": [str(job.pdf_path)],
        "judge": judge,
        # Fall back to 2048 when the config carries no (or a zero) budget.
        "max_tokens": judge.max_tokens or 2048,
    }
    raw_answer = call_vision(**request)
    return job.case, raw_answer.strip()
87
+
88
+
89
def run_suite(
    cases_dir: Path,
    model_spec: str,
    *,
    workers: int = 4,
    progress: bool = True,
    suite_name: str = "mini",
) -> SuiteReport:
    """Evaluate every case under ``cases_dir`` against ``model_spec``.

    ``cases_dir`` must contain ``<case_id>.json`` and ``<case_id>.pdf``
    pairs produced by :func:`pdfhell.suite.build_suite`.

    Cases are sent to the model concurrently on ``workers`` threads. When
    ``progress`` is true one line per case is printed as it completes (so in
    completion order), but the returned report lists cases in the order
    ``_load_jobs`` yielded them, which keeps reports reproducible from run
    to run.

    Raises:
        FileNotFoundError: if ``cases_dir`` holds no cases (or a case's PDF
            is missing, raised by ``_load_jobs``).
        ValueError: if ``model_spec`` is malformed.
    """
    judge = parse_model_spec(model_spec)
    jobs = list(_load_jobs(cases_dir))
    if not jobs:
        raise FileNotFoundError(f"no cases found in {cases_dir}")
    # One slot per job: results arrive in completion order but are stored at
    # the job's original index so the report order is deterministic.
    ordered: list[CaseScore | None] = [None] * len(jobs)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {
            pool.submit(_ask_model, job, judge): index
            for index, job in enumerate(jobs)
        }
        for fut in as_completed(futures):
            index = futures[fut]
            try:
                case, answer = fut.result()
            except Exception as exc:  # JudgeUnavailable, provider error, etc.
                # Record upstream errors as an "[error] ..." answer so the run
                # still produces a complete report. score_case then classifies
                # that text like any other output; it is only flagged as a
                # refusal if it happens to look like one.
                case = jobs[index].case
                answer = f"[error] {type(exc).__name__}: {exc}"
            score = score_case(case, answer)
            ordered[index] = score
            if progress:
                mark = "✓" if score.correct else ("⚠" if score.fell_for_trap else "✗")
                print(f"  {mark} {score.case_id:36s} expected={score.expected!r:30s} got={answer[:60]!r}")
    scores = [score for score in ordered if score is not None]
    return summarise(model_spec, suite_name, scores)
127
+
128
+
129
def _load_jobs(cases_dir: Path) -> Iterable[_Job]:
    """Yield one ``_Job`` per ``*.json`` file in ``cases_dir``, in sorted path order.

    The PDF is looked up next to the JSON file first (same name, ``.pdf``
    suffix). If that file is absent, the ``pdf_path`` recorded in the case is
    resolved relative to ``cases_dir`` instead.

    Raises:
        FileNotFoundError: if neither location exists.
    """
    for json_path in sorted(cases_dir.glob("*.json")):
        case = HellCase.load_json(json_path)

        sibling_pdf = json_path.with_suffix(".pdf")
        if sibling_pdf.exists():
            yield _Job(case=case, pdf_path=sibling_pdf)
            continue

        # The case JSON tracks its own pdf_path relative to the suite root;
        # honour that as a fallback.
        recorded_pdf = (cases_dir / case.pdf_path).resolve()
        if recorded_pdf.exists():
            yield _Job(case=case, pdf_path=recorded_pdf)
            continue

        raise FileNotFoundError(
            f"PDF not found for case {case.id}; expected {recorded_pdf}"
        )
pdfhell/scorer.py ADDED
@@ -0,0 +1,214 @@
1
+ """Code-based scoring for pdfhell cases.
2
+
3
+ LLM-as-judge is circular: the same complexity that fools the agent
4
+ often fools the judge. pdfhell's primary correctness signal therefore
5
+ does *not* go through an LLM. The PDF was generated from code, so the
6
+ answer is exactly known and the scorer compares strings directly.
7
+
8
+ QAG (multivon-eval's :class:`~multivon_eval.DocumentGrounding`) is
9
+ available separately as the *explanation* of why a model failed — "the
10
+ model returned $19,900.25, matching the hidden-OCR layer rather than
11
+ the visible $18,900.25" — but it never affects pass/fail.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import re
16
+ from dataclasses import dataclass, field
17
+ from typing import Any
18
+
19
+ from .case import HellCase
20
+
21
+
22
# Any run of whitespace (collapsed to a single space during normalisation).
_WHITESPACE_RE = re.compile(r"\s+")
# Trailing sentence punctuation, optionally followed by whitespace.
_PUNCT_NORMALIZE_RE = re.compile(r"[.,;:]+\s*$")


def _normalize(s: str) -> str:
    """Loosely normalise free text for substring matching.

    Steps, in order: strip the ends, lower-case, collapse every whitespace
    run to one space, then drop a trailing run of ``. , ; :``. This is what
    lets "The answer is $18,900.25." match "$18,900.25".
    """
    lowered = s.strip().lower()
    single_spaced = _WHITESPACE_RE.sub(" ", lowered)
    return _PUNCT_NORMALIZE_RE.sub("", single_spaced)
34
+
35
+
36
def _contains_loose(haystack: str, needle: str) -> bool:
    """Return True if normalised ``needle`` is a substring of normalised ``haystack``."""
    wanted = _normalize(needle)
    searched = _normalize(haystack)
    return wanted in searched
38
+
39
+
40
+ @dataclass(slots=True)
41
+ class CaseScore:
42
+ """Score for one (case, model_output) pair.
43
+
44
+ ``correct`` is the headline binary signal pdfhell publishes. The
45
+ other fields disambiguate *how* the model got it wrong, which is
46
+ what makes pdfhell useful as a diagnostic tool — not just a number.
47
+ """
48
+
49
+ case_id: str
50
+ trap_family: str
51
+ correct: bool
52
+ fell_for_trap: bool
53
+ refused: bool
54
+ matched_expected: bool
55
+ matched_forbidden: list[str] = field(default_factory=list)
56
+ model_output: str = ""
57
+ expected: str = ""
58
+ failure_mode: str = "" # human-readable, drawn from the case metadata when relevant
59
+
60
+ def to_dict(self) -> dict[str, Any]:
61
+ return {
62
+ "case_id": self.case_id,
63
+ "trap_family": self.trap_family,
64
+ "correct": self.correct,
65
+ "fell_for_trap": self.fell_for_trap,
66
+ "refused": self.refused,
67
+ "matched_expected": self.matched_expected,
68
+ "matched_forbidden": self.matched_forbidden,
69
+ "model_output": self.model_output,
70
+ "expected": self.expected,
71
+ "failure_mode": self.failure_mode,
72
+ }
73
+
74
+
75
# Lower-case phrases whose presence marks an output as a refusal.
_REFUSAL_PATTERNS = (
    "i cannot",
    "i can't",
    "unable to",
    "i do not have access",
    "i don't have access",
    "no information",
    "not visible",
    "cannot determine",
    "can't determine",
)

# Apostrophe look-alikes that models commonly emit in place of ASCII "'".
_APOSTROPHE_FOLD = {0x2018: "'", 0x2019: "'", 0x02BC: "'"}


def _looks_like_refusal(output: str) -> bool:
    """Heuristically decide whether ``output`` is a refusal.

    An output counts as a refusal when, after stripping and lower-casing, it
    is shorter than four characters or contains any phrase from
    ``_REFUSAL_PATTERNS``. Typographic apostrophes are folded to ``'`` first
    so that "I can’t" matches the same patterns as "I can't".
    """
    low = output.strip().lower().translate(_APOSTROPHE_FOLD)
    if len(low) < 4:
        # Empty or near-empty outputs carry no answer at all.
        return True
    return any(p in low for p in _REFUSAL_PATTERNS)
93
+
94
+
95
def score_case(case: HellCase, model_output: str) -> CaseScore:
    """Score one ``(case, model_output)`` pair against code-based ground truth.

    How the flags are derived:

    - ``matched_expected``: when the case defines ``expected_tokens``, every
      token must appear in the output (loose, case-/whitespace-tolerant
      match); otherwise the output must contain ``expected_answer``.
    - ``matched_forbidden``: the forbidden answers found in the output. The
      check is skipped for token-based cases whose tokens all matched,
      because a forbidden phrase is often a literal substring of a complete
      correct prose answer. For single-value cases it always runs.
    - ``correct``: expected matched and nothing forbidden matched.
    - ``fell_for_trap``: something forbidden matched and expected did not.
    - ``refused``: neither expected nor forbidden matched and the output
      looks like a refusal.
    - ``failure_mode``: ``metadata["expected_failure_mode"]`` when the model
      fell for the trap, else empty.
    """
    tokens = case.expected_tokens
    if tokens:
        matched_expected = all(_contains_loose(model_output, tok) for tok in tokens)
    else:
        matched_expected = _contains_loose(model_output, case.expected_answer)

    # All tokens present means the model demonstrably captured the right
    # facts, so the forbidden-substring signal is ignored in that case.
    tokens_satisfied = bool(tokens) and matched_expected
    matched_forbidden: list[str] = (
        []
        if tokens_satisfied
        else [bad for bad in case.forbidden_answers if _contains_loose(model_output, bad)]
    )
    hit_forbidden = bool(matched_forbidden)

    correct = matched_expected and not hit_forbidden
    fell_for_trap = hit_forbidden and not matched_expected
    # Refusal detection only applies to outputs that matched nothing at all.
    refused = not (matched_expected or hit_forbidden) and _looks_like_refusal(model_output)
    failure_mode = case.metadata.get("expected_failure_mode", "") if fell_for_trap else ""

    return CaseScore(
        case_id=case.id,
        trap_family=case.trap_family,
        correct=correct,
        fell_for_trap=fell_for_trap,
        refused=refused,
        matched_expected=matched_expected,
        matched_forbidden=matched_forbidden,
        model_output=model_output,
        expected=case.expected_answer,
        failure_mode=failure_mode,
    )
150
+
151
+
152
+ @dataclass(slots=True)
153
+ class SuiteReport:
154
+ """Aggregate scoring summary for a run.
155
+
156
+ Designed to serialise to the leaderboard JSON multivon.ai already
157
+ consumes (one entry per (suite, model, judge) tuple).
158
+ """
159
+
160
+ model: str
161
+ suite: str
162
+ n: int
163
+ pass_rate: float
164
+ per_trap_pass: dict[str, float]
165
+ per_trap_fell_for_trap: dict[str, float]
166
+ refused_rate: float
167
+ cases: list[CaseScore] = field(default_factory=list)
168
+
169
+ def to_dict(self) -> dict[str, Any]:
170
+ return {
171
+ "model": self.model,
172
+ "suite": self.suite,
173
+ "n": self.n,
174
+ "pass_rate": self.pass_rate,
175
+ "per_trap_pass": self.per_trap_pass,
176
+ "per_trap_fell_for_trap": self.per_trap_fell_for_trap,
177
+ "refused_rate": self.refused_rate,
178
+ "cases": [c.to_dict() for c in self.cases],
179
+ }
180
+
181
+ def worst_failures(self, k: int = 5) -> list[CaseScore]:
182
+ """Return up to ``k`` cases that fell into the designed trap.
183
+
184
+ Useful for the launch share-card: "Worst failures: hidden-OCR
185
+ mismatch caught the model 8/10 times."
186
+ """
187
+ return [c for c in self.cases if c.fell_for_trap][:k]
188
+
189
+
190
def summarise(model: str, suite: str, scores: list[CaseScore]) -> SuiteReport:
    """Fold per-case scores into a :class:`SuiteReport`.

    Rates are fractions of the relevant case count: overall for
    ``pass_rate`` / ``refused_rate``, per trap family for the two
    ``per_trap_*`` maps. An empty ``scores`` list yields an all-zero report.
    """
    if not scores:
        return SuiteReport(
            model=model,
            suite=suite,
            n=0,
            pass_rate=0.0,
            per_trap_pass={},
            per_trap_fell_for_trap={},
            refused_rate=0.0,
            cases=[],
        )

    # Group scores by trap family, preserving first-seen family order.
    grouped: dict[str, list[CaseScore]] = {}
    for score in scores:
        if score.trap_family not in grouped:
            grouped[score.trap_family] = []
        grouped[score.trap_family].append(score)

    per_trap_pass: dict[str, float] = {}
    per_trap_fell: dict[str, float] = {}
    for family, members in grouped.items():
        size = len(members)
        per_trap_pass[family] = sum(m.correct for m in members) / size
        per_trap_fell[family] = sum(m.fell_for_trap for m in members) / size

    total = len(scores)
    return SuiteReport(
        model=model,
        suite=suite,
        n=total,
        pass_rate=sum(s.correct for s in scores) / total,
        per_trap_pass=per_trap_pass,
        per_trap_fell_for_trap=per_trap_fell,
        refused_rate=sum(s.refused for s in scores) / total,
        cases=scores,
    )
pdfhell/suite.py ADDED
@@ -0,0 +1,104 @@
1
+ """Suite builder.
2
+
3
+ Suites are sets of (PDF, case-JSON) pairs on disk. A suite is fully
4
+ described by ``SuiteSpec`` — a deterministic recipe — so anyone with
5
+ the suite name and ``pdfhell`` can re-derive byte-identical PDFs and
6
+ answer keys.
7
+
8
+ This is part of the "code-based ground truth" promise: the suite isn't
9
+ a static blob, it's a recipe + a verifiable hash.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ from dataclasses import dataclass, field
14
+ from pathlib import Path
15
+ from typing import Iterable
16
+
17
+ from .case import HellCase
18
+ from .generators import generate_case
19
+
20
+
21
+ @dataclass(slots=True)
22
+ class SuiteSpec:
23
+ """Recipe for a reproducible suite.
24
+
25
+ ``traps`` maps a trap family name to a list of seeds — those exact
26
+ seeds produce those exact PDFs. Run ``pdfhell build-suite --suite
27
+ mini`` to materialise to disk.
28
+ """
29
+
30
+ name: str
31
+ traps: dict[str, list[int]] = field(default_factory=dict)
32
+
33
+ @property
34
+ def total_cases(self) -> int:
35
+ return sum(len(s) for s in self.traps.values())
36
+
37
+
38
def mini_suite() -> SuiteSpec:
    """Return the canonical ``mini`` suite: 30 cases, 10 per trap family.

    Seeds are arbitrary but fixed: each family uses ten consecutive seeds
    (1001-1010, 2001-2010, 3001-3010). The published leaderboard at
    ``multivon.ai/leaderboard`` runs this exact spec.
    """
    family_bases = (
        ("hidden_ocr_mismatch", 1000),
        ("footnote_override", 2000),
        ("split_table_across_pages", 3000),
    )
    seeds_by_family = {
        family: list(range(base + 1, base + 11)) for family, base in family_bases
    }
    return SuiteSpec(name="mini", traps=seeds_by_family)
53
+
54
+
55
def smoke_suite() -> SuiteSpec:
    """Return the 3-case ``smoke`` suite — one case per trap family.

    Meant as a quick first run (``uvx pdfhell run --suite smoke``) that
    still exercises every trap family. Each family uses the first seed of
    the corresponding ``mini`` family, so smoke results are a strict subset
    of mini results.
    """
    first_seed = {
        "hidden_ocr_mismatch": 1001,
        "footnote_override": 2001,
        "split_table_across_pages": 3001,
    }
    return SuiteSpec(
        name="smoke",
        traps={family: [seed] for family, seed in first_seed.items()},
    )
73
+
74
+
75
# Registry of built-in suites, keyed by suite name ("smoke" first, then "mini").
SUITES: dict[str, SuiteSpec] = {
    spec.name: spec for spec in (smoke_suite(), mini_suite())
}
79
+
80
+
81
def build_suite(spec: SuiteSpec, out_dir: Path) -> list[HellCase]:
    """Materialise ``spec`` into ``out_dir`` and return the generated cases.

    ``out_dir`` is created if needed. For every (trap family, seed) in the
    spec a ``<case_id>.pdf`` / ``<case_id>.json`` pair is written. Each
    returned case has ``pdf_path`` set to the bare PDF file name.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    recipe = (
        (family, seed)
        for family, seeds in spec.traps.items()
        for seed in seeds
    )
    built: list[HellCase] = []
    for family, seed in recipe:
        pdf_bytes, case = generate_case(family, seed)
        pdf_file = out_dir / f"{case.id}.pdf"
        pdf_file.write_bytes(pdf_bytes)
        # Stored relative to the suite dir — runners join it with cases_dir.
        case.pdf_path = pdf_file.name
        case.dump_json(out_dir / f"{case.id}.json")
        built.append(case)
    return built
99
+
100
+
101
def iter_cases(cases_dir: Path) -> Iterable[HellCase]:
    """Lazily load every ``*.json`` case in ``cases_dir``, in sorted path order."""
    case_files = sorted(cases_dir.glob("*.json"))
    yield from map(HellCase.load_json, case_files)