pdfhell 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pdfhell-0.1.0 → pdfhell-0.1.2}/PKG-INFO +47 -17
- {pdfhell-0.1.0 → pdfhell-0.1.2}/README.md +46 -16
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/__init__.py +1 -1
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/auditpack.py +27 -10
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/cli.py +41 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/runner.py +11 -1
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/scorer.py +68 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/suite.py +40 -1
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell.egg-info/PKG-INFO +47 -17
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell.egg-info/SOURCES.txt +2 -1
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pyproject.toml +1 -1
- pdfhell-0.1.2/tests/test_statistical.py +151 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/LICENSE +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/case.py +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/generators/__init__.py +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/generators/_common.py +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/generators/footnote_override.py +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/generators/hidden_ocr_mismatch.py +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/generators/split_table_across_pages.py +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/junit.py +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell/vision.py +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell.egg-info/dependency_links.txt +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell.egg-info/entry_points.txt +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell.egg-info/requires.txt +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/pdfhell.egg-info/top_level.txt +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/setup.cfg +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/tests/test_auditpack.py +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/tests/test_cli.py +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/tests/test_generators.py +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/tests/test_junit.py +0 -0
- {pdfhell-0.1.0 → pdfhell-0.1.2}/tests/test_scorer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pdfhell
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: PDF Hell — adversarial PDFs that break AI document readers. Procedural ground truth, not LLM-as-judge.
|
|
5
5
|
Author: Multivon
|
|
6
6
|
License: Apache-2.0
|
|
@@ -29,38 +29,68 @@ Dynamic: license-file
|
|
|
29
29
|
|
|
30
30
|
# PDF Hell
|
|
31
31
|
|
|
32
|
-
**Adversarial PDFs that
|
|
32
|
+
**Adversarial PDFs that stress-test AI document readers — with procedural ground truth, not LLM-as-judge.**
|
|
33
33
|
|
|
34
|
-
PDF Hell is a small,
|
|
34
|
+
PDF Hell is a small, focused benchmark for three specific failure modes in AI document pipelines. Every test case is a PDF generated *from code*, so the correct answer is known exactly. There's no LLM judging another LLM's interpretation — the same complexity that fools the model isn't asked to grade it.
|
|
35
35
|
|
|
36
|
-
|
|
36
|
+
## The headline finding (mini-v1, 30 cases, 2026-05-17)
|
|
37
|
+
|
|
38
|
+
GPT-4o falls for the hidden-OCR trap on **10 out of 10 cases (95% Wilson CI [72%, 100%])** — it consistently returns the *invisible* amount from the PDF's text layer instead of the *visible* amount rendered on the page:
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
Trap: hidden_ocr_mismatch (invoice — visible total $12,345.67, hidden OCR total $22,345.67)
|
|
42
|
+
Question: What is the TOTAL AMOUNT DUE?
|
|
43
|
+
|
|
44
|
+
→ openai:gpt-4o $22,345.67 ← fell for trap (10/10 in this trap family)
|
|
45
|
+
→ openai:gpt-5.4-mini $22,345.67 ← fell for trap (9/10)
|
|
46
|
+
→ openai:gpt-5.4 $12,345.67 ← correct (8/10 across trap)
|
|
47
|
+
→ google:gemini-2.5-flash $12,345.67 ← correct (10/10)
|
|
48
|
+
→ anthropic:claude-sonnet-4-6 $12,345.67 ← correct (10/10)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
The visible page, the hidden text layer, and an agent that fuses both will give three different answers. pdfhell exists to catch that.
|
|
37
52
|
|
|
38
53
|
## Quickstart (30 seconds)
|
|
39
54
|
|
|
40
55
|
```bash
|
|
41
|
-
# 3-case smoke run against the cheapest vision model
|
|
56
|
+
# 3-case smoke run against the cheapest vision model
|
|
42
57
|
export GOOGLE_API_KEY=...
|
|
43
58
|
uvx pdfhell run --model google:gemini-2.5-flash --suite smoke
|
|
44
59
|
|
|
45
|
-
# Or
|
|
60
|
+
# Or the full mini-v1 suite (30 cases, ~10s on Flash, ~$0.01)
|
|
46
61
|
uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini
|
|
47
62
|
|
|
48
|
-
# Or
|
|
63
|
+
# Or generate one trap PDF and inspect it
|
|
49
64
|
uvx pdfhell make --trap hidden_ocr_mismatch --seed 42
|
|
50
65
|
open ./cases/hidden_ocr_mismatch-0042.pdf
|
|
51
66
|
```
|
|
52
67
|
|
|
53
|
-
|
|
68
|
+
`pdfhell run` builds the suite on first use, sends each PDF to the vision model, and grades the answer against code-based ground truth.
|
|
54
69
|
|
|
55
|
-
|
|
70
|
+
## Mini-v1 leaderboard (8 models, 30 cases)
|
|
56
71
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
72
|
+
| Model | Pass rate | 95% CI | Hidden OCR | Footnote | Split table |
|
|
73
|
+
|---|---:|---:|---:|---:|---:|
|
|
74
|
+
| `anthropic:claude-sonnet-4-6` | 29/30 (97%) | [83%, 99%] | 10/10 | 9/10 | 10/10 |
|
|
75
|
+
| `google:gemini-3.1-pro-preview` | 28/30 (93%) | [78%, 98%] | 10/10 | 8/10 | 10/10 |
|
|
76
|
+
| `google:gemini-3.1-flash-lite` | 28/30 (93%) | [78%, 98%] | 10/10 | 8/10 | 10/10 |
|
|
77
|
+
| `google:gemini-2.5-pro` | 28/30 (93%) | [78%, 98%] | 10/10 | 8/10 | 10/10 |
|
|
78
|
+
| `google:gemini-2.5-flash` | 28/30 (93%) | [78%, 98%] | 10/10 | 8/10 | 10/10 |
|
|
79
|
+
| `openai:gpt-5.4` | 27/30 (90%) | [74%, 97%] | 8/10 | 9/10 | 10/10 |
|
|
80
|
+
| `openai:gpt-5.4-mini` | 20/30 (67%) | [49%, 81%] | 1/10 | 9/10 | 10/10 |
|
|
81
|
+
| `openai:gpt-4o` | 14/30 (47%) | [30%, 64%] | **0/10** | 8/10 | 6/10 |
|
|
82
|
+
|
|
83
|
+
**What is and isn't supported by this data:**
|
|
84
|
+
|
|
85
|
+
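- ✅ GPT-4o is materially worse than the six top-ranked models on this suite — its CI [30%, 64%] does not overlap with any of theirs. It does overlap GPT-5.4-mini's [49%, 81%], so those two are not statistically separable here.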
|
|
86
|
+
- ✅ GPT-4o falls for the hidden-OCR trap in 100% of cases (CI [72%, 100%]). Every failure returned the hidden-OCR amount specifically.
|
|
87
|
+
- ✅ GPT-5.4 fixes most of it (80% pass on hidden OCR) — a real generational improvement.
|
|
88
|
+
- ❌ "Claude leads" — Sonnet's CI [83%, 99%] overlaps with Gemini's [78%, 98%]. The two are statistically indistinguishable on this suite. Don't read ordinal rankings from 30 cases.
|
|
89
|
+
- ❌ "PDF Hell is sufficient to evaluate document AI." It's a stress test for three specific failure modes. Pair it with a domain benchmark (DocVQA, your own regression suite) for coverage.
|
|
90
|
+
|
|
91
|
+
Suite hash: `8ad87b8d` (mini-v1, 30 cases). Every leaderboard row above was measured on the same hash. Raw run JSON at <https://github.com/multivon-ai/multivon-web/tree/main/public/data/pdfhell-runs>.
|
|
62
92
|
|
|
63
|
-
## What's in
|
|
93
|
+
## What's in mini-v1
|
|
64
94
|
|
|
65
95
|
| Trap family | Cases | What breaks |
|
|
66
96
|
|---|---|---|
|
|
@@ -68,9 +98,9 @@ pass: 3/3 (100.0%)
|
|
|
68
98
|
| `footnote_override` | 10 | Legal clauses where a 6pt footnote overrides the body — liability caps with carve-outs, terminations with restrictions, data-residency with disaster-recovery exceptions. |
|
|
69
99
|
| `split_table_across_pages` | 10 | Financial tables where the header row sits on page 1 and the body rows on page 2. RAG loaders that paginate independently lose column context. |
|
|
70
100
|
|
|
71
|
-
Every case has a deterministic seed. Re-running with the same seed regenerates **byte-identical PDFs** and identical answer keys
|
|
101
|
+
Every case has a deterministic seed. Re-running with the same seed regenerates **byte-identical PDFs** and identical answer keys (`Canvas(invariant=True)` on every generator).
|
|
72
102
|
|
|
73
|
-
The
|
|
103
|
+
**Suite versioning.** The `mini-v1` label + suite hash (`8ad87b8d`) fingerprints the exact (trap_family, seed) pairs measured. Adding a new trap family produces `mini-v2` with a different hash — runs across different hashes are not directly comparable. See the next section for the roadmap.
|
|
74
104
|
|
|
75
105
|
## Why this exists
|
|
76
106
|
|
|
@@ -1,37 +1,67 @@
|
|
|
1
1
|
# PDF Hell
|
|
2
2
|
|
|
3
|
-
**Adversarial PDFs that
|
|
3
|
+
**Adversarial PDFs that stress-test AI document readers — with procedural ground truth, not LLM-as-judge.**
|
|
4
4
|
|
|
5
|
-
PDF Hell is a small,
|
|
5
|
+
PDF Hell is a small, focused benchmark for three specific failure modes in AI document pipelines. Every test case is a PDF generated *from code*, so the correct answer is known exactly. There's no LLM judging another LLM's interpretation — the same complexity that fools the model isn't asked to grade it.
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
## The headline finding (mini-v1, 30 cases, 2026-05-17)
|
|
8
|
+
|
|
9
|
+
GPT-4o falls for the hidden-OCR trap on **10 out of 10 cases (95% Wilson CI [72%, 100%])** — it consistently returns the *invisible* amount from the PDF's text layer instead of the *visible* amount rendered on the page:
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
Trap: hidden_ocr_mismatch (invoice — visible total $12,345.67, hidden OCR total $22,345.67)
|
|
13
|
+
Question: What is the TOTAL AMOUNT DUE?
|
|
14
|
+
|
|
15
|
+
→ openai:gpt-4o $22,345.67 ← fell for trap (10/10 in this trap family)
|
|
16
|
+
→ openai:gpt-5.4-mini $22,345.67 ← fell for trap (9/10)
|
|
17
|
+
→ openai:gpt-5.4 $12,345.67 ← correct (8/10 across trap)
|
|
18
|
+
→ google:gemini-2.5-flash $12,345.67 ← correct (10/10)
|
|
19
|
+
→ anthropic:claude-sonnet-4-6 $12,345.67 ← correct (10/10)
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
The visible page, the hidden text layer, and an agent that fuses both will give three different answers. pdfhell exists to catch that.
|
|
8
23
|
|
|
9
24
|
## Quickstart (30 seconds)
|
|
10
25
|
|
|
11
26
|
```bash
|
|
12
|
-
# 3-case smoke run against the cheapest vision model
|
|
27
|
+
# 3-case smoke run against the cheapest vision model
|
|
13
28
|
export GOOGLE_API_KEY=...
|
|
14
29
|
uvx pdfhell run --model google:gemini-2.5-flash --suite smoke
|
|
15
30
|
|
|
16
|
-
# Or
|
|
31
|
+
# Or the full mini-v1 suite (30 cases, ~10s on Flash, ~$0.01)
|
|
17
32
|
uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini
|
|
18
33
|
|
|
19
|
-
# Or
|
|
34
|
+
# Or generate one trap PDF and inspect it
|
|
20
35
|
uvx pdfhell make --trap hidden_ocr_mismatch --seed 42
|
|
21
36
|
open ./cases/hidden_ocr_mismatch-0042.pdf
|
|
22
37
|
```
|
|
23
38
|
|
|
24
|
-
|
|
39
|
+
`pdfhell run` builds the suite on first use, sends each PDF to the vision model, and grades the answer against code-based ground truth.
|
|
25
40
|
|
|
26
|
-
|
|
41
|
+
## Mini-v1 leaderboard (8 models, 30 cases)
|
|
27
42
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
43
|
+
| Model | Pass rate | 95% CI | Hidden OCR | Footnote | Split table |
|
|
44
|
+
|---|---:|---:|---:|---:|---:|
|
|
45
|
+
| `anthropic:claude-sonnet-4-6` | 29/30 (97%) | [83%, 99%] | 10/10 | 9/10 | 10/10 |
|
|
46
|
+
| `google:gemini-3.1-pro-preview` | 28/30 (93%) | [78%, 98%] | 10/10 | 8/10 | 10/10 |
|
|
47
|
+
| `google:gemini-3.1-flash-lite` | 28/30 (93%) | [78%, 98%] | 10/10 | 8/10 | 10/10 |
|
|
48
|
+
| `google:gemini-2.5-pro` | 28/30 (93%) | [78%, 98%] | 10/10 | 8/10 | 10/10 |
|
|
49
|
+
| `google:gemini-2.5-flash` | 28/30 (93%) | [78%, 98%] | 10/10 | 8/10 | 10/10 |
|
|
50
|
+
| `openai:gpt-5.4` | 27/30 (90%) | [74%, 97%] | 8/10 | 9/10 | 10/10 |
|
|
51
|
+
| `openai:gpt-5.4-mini` | 20/30 (67%) | [49%, 81%] | 1/10 | 9/10 | 10/10 |
|
|
52
|
+
| `openai:gpt-4o` | 14/30 (47%) | [30%, 64%] | **0/10** | 8/10 | 6/10 |
|
|
53
|
+
|
|
54
|
+
**What is and isn't supported by this data:**
|
|
55
|
+
|
|
56
|
+
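- ✅ GPT-4o is materially worse than the six top-ranked models on this suite — its CI [30%, 64%] does not overlap with any of theirs. It does overlap GPT-5.4-mini's [49%, 81%], so those two are not statistically separable here.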
|
|
57
|
+
- ✅ GPT-4o falls for the hidden-OCR trap in 100% of cases (CI [72%, 100%]). Every failure returned the hidden-OCR amount specifically.
|
|
58
|
+
- ✅ GPT-5.4 fixes most of it (80% pass on hidden OCR) — a real generational improvement.
|
|
59
|
+
- ❌ "Claude leads" — Sonnet's CI [83%, 99%] overlaps with Gemini's [78%, 98%]. The two are statistically indistinguishable on this suite. Don't read ordinal rankings from 30 cases.
|
|
60
|
+
- ❌ "PDF Hell is sufficient to evaluate document AI." It's a stress test for three specific failure modes. Pair it with a domain benchmark (DocVQA, your own regression suite) for coverage.
|
|
61
|
+
|
|
62
|
+
Suite hash: `8ad87b8d` (mini-v1, 30 cases). Every leaderboard row above was measured on the same hash. Raw run JSON at <https://github.com/multivon-ai/multivon-web/tree/main/public/data/pdfhell-runs>.
|
|
33
63
|
|
|
34
|
-
## What's in
|
|
64
|
+
## What's in mini-v1
|
|
35
65
|
|
|
36
66
|
| Trap family | Cases | What breaks |
|
|
37
67
|
|---|---|---|
|
|
@@ -39,9 +69,9 @@ pass: 3/3 (100.0%)
|
|
|
39
69
|
| `footnote_override` | 10 | Legal clauses where a 6pt footnote overrides the body — liability caps with carve-outs, terminations with restrictions, data-residency with disaster-recovery exceptions. |
|
|
40
70
|
| `split_table_across_pages` | 10 | Financial tables where the header row sits on page 1 and the body rows on page 2. RAG loaders that paginate independently lose column context. |
|
|
41
71
|
|
|
42
|
-
Every case has a deterministic seed. Re-running with the same seed regenerates **byte-identical PDFs** and identical answer keys
|
|
72
|
+
Every case has a deterministic seed. Re-running with the same seed regenerates **byte-identical PDFs** and identical answer keys (`Canvas(invariant=True)` on every generator).
|
|
43
73
|
|
|
44
|
-
The
|
|
74
|
+
**Suite versioning.** The `mini-v1` label + suite hash (`8ad87b8d`) fingerprints the exact (trap_family, seed) pairs measured. Adding a new trap family produces `mini-v2` with a different hash — runs across different hashes are not directly comparable. See the next section for the roadmap.
|
|
45
75
|
|
|
46
76
|
## Why this exists
|
|
47
77
|
|
|
@@ -39,14 +39,24 @@ from .scorer import SuiteReport
|
|
|
39
39
|
_README_TEMPLATE = """\
|
|
40
40
|
# pdfhell audit pack
|
|
41
41
|
|
|
42
|
-
This ZIP is a complete, self-
|
|
42
|
+
This ZIP is a complete, self-verifying record of one PDF Hell run. It
|
|
43
43
|
contains every PDF the model was asked to read, every answer key, the
|
|
44
|
-
raw model output, and a tamper-evident manifest.
|
|
44
|
+
raw model output, and a SHA-256 manifest you can recompute from the
|
|
45
|
+
ZIP contents to check that nothing has been edited since delivery.
|
|
46
|
+
|
|
47
|
+
## A note on threat model
|
|
48
|
+
|
|
49
|
+
This is a self-verifying pack, not a tamper-PROOF one: an adversary
|
|
50
|
+
with access to the ZIP can edit any file AND re-write the manifest
|
|
51
|
+
hashes to match. To detect tampering by an external party, pin the
|
|
52
|
+
manifest's SHA-256 in your procurement record (out-of-band) and
|
|
53
|
+
verify on receipt. For full tamper-proof attestation, sign the
|
|
54
|
+
manifest with an external GPG / Sigstore key (out of scope for v1).
|
|
45
55
|
|
|
46
56
|
## What's in this pack
|
|
47
57
|
|
|
48
58
|
- manifest.json — Run metadata + SHA-256 of every file in this ZIP.
|
|
49
|
-
- run.json — Full run report (per-case scores, model outputs).
|
|
59
|
+
- run.json — Full run report (per-case scores, model outputs, CIs).
|
|
50
60
|
- run.xml — JUnit XML (renders in CI dashboards).
|
|
51
61
|
- cases/*.pdf — The adversarial PDFs the model was tested against.
|
|
52
62
|
- cases/*.json — The answer keys + per-case metadata.
|
|
@@ -54,9 +64,6 @@ raw model output, and a tamper-evident manifest.
|
|
|
54
64
|
|
|
55
65
|
## How to verify
|
|
56
66
|
|
|
57
|
-
The manifest contains a SHA-256 for every file in this ZIP. To verify
|
|
58
|
-
nothing was edited after delivery:
|
|
59
|
-
|
|
60
67
|
unzip -p audit-pack.zip manifest.json | jq .files
|
|
61
68
|
sha256sum cases/*.pdf cases/*.json run.json run.xml README.txt
|
|
62
69
|
|
|
@@ -70,13 +77,17 @@ byte-identical PDFs and re-run the same model:
|
|
|
70
77
|
{repro_command}
|
|
71
78
|
|
|
72
79
|
pdfhell uses Canvas(invariant=True) on every generator so PDFs are
|
|
73
|
-
byte-identical across runs with the same seed.
|
|
80
|
+
byte-identical across runs with the same seed. The manifest's
|
|
81
|
+
`suite_hash` fingerprints the exact (trap_family, seed) pairs that
|
|
82
|
+
were measured — re-runs with a different hash measured different
|
|
83
|
+
cases and are not directly comparable.
|
|
74
84
|
|
|
75
85
|
## Scope
|
|
76
86
|
|
|
77
87
|
pdfhell {pdfhell_version}, suite {suite}, model {model}. Generated
|
|
78
|
-
{timestamp}. {n} cases, {passed}/{n} passed ({pass_rate:.0%}
|
|
79
|
-
manifest.json for per-trap
|
|
88
|
+
{timestamp}. {n} cases, {passed}/{n} passed ({pass_rate:.0%}, 95%
|
|
89
|
+
Wilson CI shown in manifest.json). See manifest.json for per-trap
|
|
90
|
+
breakdown and per-trap CIs.
|
|
80
91
|
"""
|
|
81
92
|
|
|
82
93
|
|
|
@@ -149,16 +160,22 @@ def build_audit_pack(
|
|
|
149
160
|
"generated_at": timestamp,
|
|
150
161
|
"model": report.model,
|
|
151
162
|
"suite": report.suite,
|
|
163
|
+
"suite_version": report.suite_version,
|
|
164
|
+
"suite_hash": report.suite_hash,
|
|
152
165
|
"n": report.n,
|
|
153
166
|
"passed": passed,
|
|
154
167
|
"pass_rate": report.pass_rate,
|
|
168
|
+
"pass_rate_ci_95": list(report.pass_rate_ci),
|
|
155
169
|
"per_trap_pass": report.per_trap_pass,
|
|
170
|
+
"per_trap_pass_ci_95": {k: list(v) for k, v in report.per_trap_pass_ci.items()},
|
|
156
171
|
"per_trap_fell_for_trap": report.per_trap_fell_for_trap,
|
|
157
172
|
"reproduction": {
|
|
158
173
|
"command": repro_command,
|
|
159
174
|
"note": (
|
|
160
175
|
"PDFs are regenerated byte-identically via Canvas(invariant=True). "
|
|
161
|
-
"Same seed → same PDF → same answer key."
|
|
176
|
+
"Same seed → same PDF → same answer key. The suite_hash above "
|
|
177
|
+
"fingerprints the exact (trap, seed) pairs measured — auditors "
|
|
178
|
+
"should refuse any run with a mismatched hash."
|
|
162
179
|
),
|
|
163
180
|
},
|
|
164
181
|
"files": [
|
|
@@ -34,6 +34,40 @@ def _cmd_list_traps(args: argparse.Namespace) -> int:
|
|
|
34
34
|
return 0
|
|
35
35
|
|
|
36
36
|
|
|
37
|
+
def _cmd_discover(args: argparse.Namespace) -> int:
|
|
38
|
+
"""Print the machine-readable pdfhell capability catalog as JSON.
|
|
39
|
+
|
|
40
|
+
Same shape an agent gets via the multivon-mcp ``eval_discover`` tool —
|
|
41
|
+
surfaced as a CLI command so agents that don't speak MCP can pipe
|
|
42
|
+
``pdfhell discover | jq ...`` to plan a run (JSON is the only output format; add ``--compact`` for single-line output).
|
|
43
|
+
"""
|
|
44
|
+
from .generators import GENERATORS
|
|
45
|
+
catalog = {
|
|
46
|
+
"package": "pdfhell",
|
|
47
|
+
"version": __version__,
|
|
48
|
+
"traps": [],
|
|
49
|
+
"suites": [],
|
|
50
|
+
}
|
|
51
|
+
for trap in TRAP_FAMILIES:
|
|
52
|
+
_, example_case = GENERATORS[trap](seed=1)
|
|
53
|
+
catalog["traps"].append({
|
|
54
|
+
"name": trap,
|
|
55
|
+
"example_question": example_case.question,
|
|
56
|
+
"example_expected_answer": example_case.expected_answer,
|
|
57
|
+
})
|
|
58
|
+
for name, spec in SUITES.items():
|
|
59
|
+
catalog["suites"].append({
|
|
60
|
+
"name": name,
|
|
61
|
+
"version": spec.version,
|
|
62
|
+
"suite_hash": spec.suite_hash,
|
|
63
|
+
"total_cases": spec.total_cases,
|
|
64
|
+
"trap_seeds": {trap: list(seeds) for trap, seeds in spec.traps.items()},
|
|
65
|
+
})
|
|
66
|
+
json.dump(catalog, sys.stdout, indent=2 if not args.compact else None)
|
|
67
|
+
print()
|
|
68
|
+
return 0
|
|
69
|
+
|
|
70
|
+
|
|
37
71
|
def _cmd_make(args: argparse.Namespace) -> int:
|
|
38
72
|
try:
|
|
39
73
|
pdf_bytes, case = generate_case(args.trap, args.seed)
|
|
@@ -158,6 +192,13 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
158
192
|
p_list = sub.add_parser("list-traps", help="list available trap families")
|
|
159
193
|
p_list.set_defaults(func=_cmd_list_traps)
|
|
160
194
|
|
|
195
|
+
p_discover = sub.add_parser(
|
|
196
|
+
"discover",
|
|
197
|
+
help="emit pdfhell capability catalog as JSON (for agents that don't speak MCP)",
|
|
198
|
+
)
|
|
199
|
+
p_discover.add_argument("--compact", action="store_true", help="single-line JSON, no indent")
|
|
200
|
+
p_discover.set_defaults(func=_cmd_discover)
|
|
201
|
+
|
|
161
202
|
p_make = sub.add_parser("make", help="generate one case (pdf + json)")
|
|
162
203
|
p_make.add_argument("--trap", required=True, choices=TRAP_FAMILIES)
|
|
163
204
|
p_make.add_argument("--seed", required=True, type=int)
|
|
@@ -23,6 +23,7 @@ from multivon_eval import JudgeConfig
|
|
|
23
23
|
|
|
24
24
|
from .case import HellCase
|
|
25
25
|
from .scorer import CaseScore, SuiteReport, score_case, summarise
|
|
26
|
+
from .suite import SUITES
|
|
26
27
|
from .vision import call_vision
|
|
27
28
|
|
|
28
29
|
|
|
@@ -98,6 +99,10 @@ def run_suite(
|
|
|
98
99
|
|
|
99
100
|
``cases_dir`` must contain ``<case_id>.json`` and ``<case_id>.pdf``
|
|
100
101
|
pairs produced by :func:`pdfhell.suite.build_suite`.
|
|
102
|
+
|
|
103
|
+
The returned :class:`SuiteReport` is annotated with the canonical
|
|
104
|
+
``suite_version`` and ``suite_hash`` from the named suite, so
|
|
105
|
+
consumers can verify the run measured the expected cases.
|
|
101
106
|
"""
|
|
102
107
|
judge = parse_model_spec(model_spec)
|
|
103
108
|
jobs = list(_load_jobs(cases_dir))
|
|
@@ -123,7 +128,12 @@ def run_suite(
|
|
|
123
128
|
if progress:
|
|
124
129
|
mark = "✓" if score.correct else ("⚠" if score.fell_for_trap else "✗")
|
|
125
130
|
print(f" {mark} {score.case_id:36s} expected={score.expected!r:30s} got={answer[:60]!r}")
|
|
126
|
-
|
|
131
|
+
report = summarise(model_spec, suite_name, scores)
|
|
132
|
+
spec = SUITES.get(suite_name)
|
|
133
|
+
if spec is not None:
|
|
134
|
+
report.suite_version = spec.version
|
|
135
|
+
report.suite_hash = spec.suite_hash
|
|
136
|
+
return report
|
|
127
137
|
|
|
128
138
|
|
|
129
139
|
def _load_jobs(cases_dir: Path) -> Iterable[_Job]:
|
|
@@ -9,9 +9,17 @@ QAG (multivon-eval's :class:`~multivon_eval.DocumentGrounding`) is
|
|
|
9
9
|
available separately as the *explanation* of why a model failed — "the
|
|
10
10
|
model returned $19,900.25, matching the hidden-OCR layer rather than
|
|
11
11
|
the visible $18,900.25" — but it never affects pass/fail.
|
|
12
|
+
|
|
13
|
+
Every reported pass rate is paired with a 95% Wilson confidence
|
|
14
|
+
interval. A 10-case trap-family run at 100% pass has Wilson 95% CI
|
|
15
|
+
[0.72, 1.00] — meaning the *true* per-trap pass rate could plausibly
|
|
16
|
+
be as low as 72%. Differences of <~10pp at n=30 are not statistically
|
|
17
|
+
distinguishable. We surface the CI everywhere we surface the rate so
|
|
18
|
+
nobody draws ordinal conclusions from indistinguishable runs.
|
|
12
19
|
"""
|
|
13
20
|
from __future__ import annotations
|
|
14
21
|
|
|
22
|
+
import math
|
|
15
23
|
import re
|
|
16
24
|
from dataclasses import dataclass, field
|
|
17
25
|
from typing import Any
|
|
@@ -19,6 +27,33 @@ from typing import Any
|
|
|
19
27
|
from .case import HellCase
|
|
20
28
|
|
|
21
29
|
|
|
30
|
+
# ─── Statistical-rigor utility ─────────────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
def wilson_ci(passes: int, n: int, *, z: float = 1.959963984540054) -> tuple[float, float]:
|
|
33
|
+
"""Return the (lower, upper) Wilson confidence interval (95% by default) for a
|
|
34
|
+
binomial proportion of ``passes`` successes out of ``n`` trials.
|
|
35
|
+
|
|
36
|
+
Defaults to z = 1.96 (95% CI). Pass z=2.576 for 99% CI. Returns
|
|
37
|
+
(0.0, 1.0) when ``n == 0`` — vacuous CI for an empty run.
|
|
38
|
+
|
|
39
|
+
Why Wilson over the Wald / normal-approximation interval? At our
|
|
40
|
+
sample sizes (n=10 per trap, n=30 per suite) the Wald interval is
|
|
41
|
+
*wrong* near 0 and 1 (it can return negative lower bounds or
|
|
42
|
+
upper bounds > 1, both nonsensical for a probability). Wilson is
|
|
43
|
+
well-behaved across the entire [0, 1] domain and is the standard
|
|
44
|
+
interval for small-sample proportion estimates.
|
|
45
|
+
"""
|
|
46
|
+
if n <= 0:
|
|
47
|
+
return (0.0, 1.0)
|
|
48
|
+
p = passes / n
|
|
49
|
+
denom = 1.0 + (z * z) / n
|
|
50
|
+
centre = (p + (z * z) / (2.0 * n)) / denom
|
|
51
|
+
half = (z / denom) * math.sqrt((p * (1.0 - p) + (z * z) / (4.0 * n)) / n)
|
|
52
|
+
lo = max(0.0, centre - half)
|
|
53
|
+
hi = min(1.0, centre + half)
|
|
54
|
+
return (lo, hi)
|
|
55
|
+
|
|
56
|
+
|
|
22
57
|
_WHITESPACE_RE = re.compile(r"\s+")
|
|
23
58
|
_PUNCT_NORMALIZE_RE = re.compile(r"[.,;:]+\s*$")
|
|
24
59
|
|
|
@@ -165,14 +200,47 @@ class SuiteReport:
|
|
|
165
200
|
per_trap_fell_for_trap: dict[str, float]
|
|
166
201
|
refused_rate: float
|
|
167
202
|
cases: list[CaseScore] = field(default_factory=list)
|
|
203
|
+
suite_version: str = "" # e.g. "mini-v1" — see pdfhell.suite.SuiteSpec.version
|
|
204
|
+
suite_hash: str = "" # 8-char SHA-256 prefix of the sorted (trap, seed) pairs
|
|
205
|
+
|
|
206
|
+
# ─── Confidence intervals ──────────────────────────────────────────────
|
|
207
|
+
|
|
208
|
+
@property
|
|
209
|
+
def pass_rate_ci(self) -> tuple[float, float]:
|
|
210
|
+
"""95% Wilson confidence interval on the overall pass rate."""
|
|
211
|
+
return wilson_ci(int(round(self.pass_rate * self.n)), self.n)
|
|
212
|
+
|
|
213
|
+
@property
|
|
214
|
+
def per_trap_pass_ci(self) -> dict[str, tuple[float, float]]:
|
|
215
|
+
"""Per-trap-family Wilson 95% CIs.
|
|
216
|
+
|
|
217
|
+
Uses the actual case counts (typically 10 per family in the mini
|
|
218
|
+
suite). Surfaced on the leaderboard so 100% pass on n=10 isn't
|
|
219
|
+
confused with "the model never fails."
|
|
220
|
+
"""
|
|
221
|
+
if not self.cases:
|
|
222
|
+
return {}
|
|
223
|
+
# Count cases per family rather than guessing.
|
|
224
|
+
by_family: dict[str, list[CaseScore]] = {}
|
|
225
|
+
for c in self.cases:
|
|
226
|
+
by_family.setdefault(c.trap_family, []).append(c)
|
|
227
|
+
out: dict[str, tuple[float, float]] = {}
|
|
228
|
+
for family, scores in by_family.items():
|
|
229
|
+
passes = sum(1 for s in scores if s.correct)
|
|
230
|
+
out[family] = wilson_ci(passes, len(scores))
|
|
231
|
+
return out
|
|
168
232
|
|
|
169
233
|
def to_dict(self) -> dict[str, Any]:
|
|
170
234
|
return {
|
|
171
235
|
"model": self.model,
|
|
172
236
|
"suite": self.suite,
|
|
237
|
+
"suite_version": self.suite_version,
|
|
238
|
+
"suite_hash": self.suite_hash,
|
|
173
239
|
"n": self.n,
|
|
174
240
|
"pass_rate": self.pass_rate,
|
|
241
|
+
"pass_rate_ci": list(self.pass_rate_ci),
|
|
175
242
|
"per_trap_pass": self.per_trap_pass,
|
|
243
|
+
"per_trap_pass_ci": {k: list(v) for k, v in self.per_trap_pass_ci.items()},
|
|
176
244
|
"per_trap_fell_for_trap": self.per_trap_fell_for_trap,
|
|
177
245
|
"refused_rate": self.refused_rate,
|
|
178
246
|
"cases": [c.to_dict() for c in self.cases],
|
|
@@ -7,9 +7,20 @@ answer keys.
|
|
|
7
7
|
|
|
8
8
|
This is part of the "code-based ground truth" promise: the suite isn't
|
|
9
9
|
a static blob, it's a recipe + a verifiable hash.
|
|
10
|
+
|
|
11
|
+
# Versioning
|
|
12
|
+
|
|
13
|
+
Suites are versioned (e.g. ``mini-v1``) so adding a new trap family
|
|
14
|
+
doesn't silently invalidate published leaderboard numbers. Each suite
|
|
15
|
+
also carries a :attr:`SuiteSpec.suite_hash` — an 8-char SHA-256 prefix
|
|
16
|
+
of the sorted ``(trap_family, seed)`` pairs. Two runs with the same
|
|
17
|
+
``suite_hash`` measured the *exact* same cases; runs with different
|
|
18
|
+
hashes are not directly comparable. The hash is included in every
|
|
19
|
+
``SuiteReport`` and the audit pack ``manifest.json``.
|
|
10
20
|
"""
|
|
11
21
|
from __future__ import annotations
|
|
12
22
|
|
|
23
|
+
import hashlib
|
|
13
24
|
from dataclasses import dataclass, field
|
|
14
25
|
from pathlib import Path
|
|
15
26
|
from typing import Iterable
|
|
@@ -25,25 +36,52 @@ class SuiteSpec:
|
|
|
25
36
|
``traps`` maps a trap family name to a list of seeds — those exact
|
|
26
37
|
seeds produce those exact PDFs. Run ``pdfhell build-suite --suite
|
|
27
38
|
mini`` to materialise to disk.
|
|
39
|
+
|
|
40
|
+
``version`` is the human-readable label that gets published in
|
|
41
|
+
leaderboard rows (e.g. ``mini-v1``). Bump the version (and the name)
|
|
42
|
+
when adding trap families so historical comparisons stay valid.
|
|
28
43
|
"""
|
|
29
44
|
|
|
30
45
|
name: str
|
|
31
46
|
traps: dict[str, list[int]] = field(default_factory=dict)
|
|
47
|
+
version: str = ""
|
|
32
48
|
|
|
33
49
|
@property
|
|
34
50
|
def total_cases(self) -> int:
|
|
35
51
|
return sum(len(s) for s in self.traps.values())
|
|
36
52
|
|
|
53
|
+
@property
|
|
54
|
+
def suite_hash(self) -> str:
|
|
55
|
+
"""8-char SHA-256 prefix of the sorted ``(trap, seed)`` pairs.
|
|
56
|
+
|
|
57
|
+
Two suites with the same ``suite_hash`` evaluated the EXACT same
|
|
58
|
+
cases; runs across different hashes are not directly comparable.
|
|
59
|
+
Surfaced in every SuiteReport + the audit-pack manifest.
|
|
60
|
+
"""
|
|
61
|
+
items = sorted(
|
|
62
|
+
(trap, seed)
|
|
63
|
+
for trap, seeds in self.traps.items()
|
|
64
|
+
for seed in seeds
|
|
65
|
+
)
|
|
66
|
+
payload = "\n".join(f"{trap}\t{seed}" for trap, seed in items).encode("utf-8")
|
|
67
|
+
return hashlib.sha256(payload).hexdigest()[:8]
|
|
68
|
+
|
|
37
69
|
|
|
38
70
|
def mini_suite() -> SuiteSpec:
|
|
39
|
-
"""The canonical ``mini`` suite: 30 cases, 10 per trap family.
|
|
71
|
+
"""The canonical ``mini-v1`` suite: 30 cases, 10 per trap family.
|
|
40
72
|
|
|
41
73
|
Seeds are arbitrary but fixed. The published leaderboard at
|
|
42
74
|
``multivon.ai/leaderboard`` runs this exact spec — re-running it on
|
|
43
75
|
any machine produces identical PDFs.
|
|
76
|
+
|
|
77
|
+
Versioning: adding a new trap family to the mini suite produces a
|
|
78
|
+
new spec (``mini-v2``, etc.). Older leaderboard rows tagged
|
|
79
|
+
``mini-v1`` remain directly comparable across machines, dates, and
|
|
80
|
+
judge versions; rows tagged different versions are not.
|
|
44
81
|
"""
|
|
45
82
|
return SuiteSpec(
|
|
46
83
|
name="mini",
|
|
84
|
+
version="mini-v1",
|
|
47
85
|
traps={
|
|
48
86
|
"hidden_ocr_mismatch": list(range(1001, 1011)),
|
|
49
87
|
"footnote_override": list(range(2001, 2011)),
|
|
@@ -64,6 +102,7 @@ def smoke_suite() -> SuiteSpec:
|
|
|
64
102
|
"""
|
|
65
103
|
return SuiteSpec(
|
|
66
104
|
name="smoke",
|
|
105
|
+
version="smoke-v1",
|
|
67
106
|
traps={
|
|
68
107
|
"hidden_ocr_mismatch": [1001],
|
|
69
108
|
"footnote_override": [2001],
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pdfhell
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: PDF Hell — adversarial PDFs that break AI document readers. Procedural ground truth, not LLM-as-judge.
|
|
5
5
|
Author: Multivon
|
|
6
6
|
License: Apache-2.0
|
|
@@ -29,38 +29,68 @@ Dynamic: license-file
|
|
|
29
29
|
|
|
30
30
|
# PDF Hell
|
|
31
31
|
|
|
32
|
-
**Adversarial PDFs that
|
|
32
|
+
**Adversarial PDFs that stress-test AI document readers — with procedural ground truth, not LLM-as-judge.**
|
|
33
33
|
|
|
34
|
-
PDF Hell is a small,
|
|
34
|
+
PDF Hell is a small, focused benchmark for three specific failure modes in AI document pipelines. Every test case is a PDF generated *from code*, so the correct answer is known exactly. There's no LLM judging another LLM's interpretation — the same complexity that fools the model isn't asked to grade it.
|
|
35
35
|
|
|
36
|
-
|
|
36
|
+
## The headline finding (mini-v1, 30 cases, 2026-05-17)
|
|
37
|
+
|
|
38
|
+
GPT-4o falls for the hidden-OCR trap on **10 out of 10 cases (95% Wilson CI [72%, 100%])** — it consistently returns the *invisible* amount from the PDF's text layer instead of the *visible* amount rendered on the page:
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
Trap: hidden_ocr_mismatch (invoice — visible total $12,345.67, hidden OCR total $22,345.67)
|
|
42
|
+
Question: What is the TOTAL AMOUNT DUE?
|
|
43
|
+
|
|
44
|
+
→ openai:gpt-4o $22,345.67 ← fell for trap (10/10 in this trap family)
|
|
45
|
+
→ openai:gpt-5.4-mini $22,345.67 ← fell for trap (9/10)
|
|
46
|
+
→ openai:gpt-5.4 $12,345.67 ← correct (8/10 across trap)
|
|
47
|
+
→ google:gemini-2.5-flash $12,345.67 ← correct (10/10)
|
|
48
|
+
→ anthropic:claude-sonnet-4-6 $12,345.67 ← correct (10/10)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
The visible page, the hidden text layer, and an agent that fuses both will give three different answers. pdfhell exists to catch that.
|
|
37
52
|
|
|
38
53
|
## Quickstart (30 seconds)
|
|
39
54
|
|
|
40
55
|
```bash
|
|
41
|
-
# 3-case smoke run against the cheapest vision model
|
|
56
|
+
# 3-case smoke run against the cheapest vision model
|
|
42
57
|
export GOOGLE_API_KEY=...
|
|
43
58
|
uvx pdfhell run --model google:gemini-2.5-flash --suite smoke
|
|
44
59
|
|
|
45
|
-
# Or
|
|
60
|
+
# Or the full mini-v1 suite (30 cases, ~10s on Flash, ~$0.01)
|
|
46
61
|
uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini
|
|
47
62
|
|
|
48
|
-
# Or
|
|
63
|
+
# Or generate one trap PDF and inspect it
|
|
49
64
|
uvx pdfhell make --trap hidden_ocr_mismatch --seed 42
|
|
50
65
|
open ./cases/hidden_ocr_mismatch-0042.pdf
|
|
51
66
|
```
|
|
52
67
|
|
|
53
|
-
|
|
68
|
+
`pdfhell run` builds the suite on first use, sends each PDF to the vision model, and grades the answer against code-based ground truth.
|
|
54
69
|
|
|
55
|
-
|
|
70
|
+
## Mini-v1 leaderboard (8 models, 30 cases)
|
|
56
71
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
72
|
+
| Model | Pass rate | 95% CI | Hidden OCR | Footnote | Split table |
|
|
73
|
+
|---|---:|---:|---:|---:|---:|
|
|
74
|
+
| `anthropic:claude-sonnet-4-6` | 29/30 (97%) | [83%, 99%] | 10/10 | 9/10 | 10/10 |
|
|
75
|
+
| `google:gemini-3.1-pro-preview` | 28/30 (93%) | [78%, 98%] | 10/10 | 8/10 | 10/10 |
|
|
76
|
+
| `google:gemini-3.1-flash-lite` | 28/30 (93%) | [78%, 98%] | 10/10 | 8/10 | 10/10 |
|
|
77
|
+
| `google:gemini-2.5-pro` | 28/30 (93%) | [78%, 98%] | 10/10 | 8/10 | 10/10 |
|
|
78
|
+
| `google:gemini-2.5-flash` | 28/30 (93%) | [78%, 98%] | 10/10 | 8/10 | 10/10 |
|
|
79
|
+
| `openai:gpt-5.4` | 27/30 (90%) | [74%, 97%] | 8/10 | 9/10 | 10/10 |
|
|
80
|
+
| `openai:gpt-5.4-mini` | 20/30 (67%) | [49%, 81%] | 1/10 | 9/10 | 10/10 |
|
|
81
|
+
| `openai:gpt-4o` | 14/30 (47%) | [30%, 64%] | **0/10** | 8/10 | 6/10 |
|
|
82
|
+
|
|
83
|
+
**What is and isn't supported by this data:**
|
|
84
|
+
|
|
85
|
+
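- ✅ GPT-4o is materially worse than the top six models on this suite — its CI [30%, 64%] lies entirely below each of theirs (the lowest lower bound among them is 74%). It does overlap GPT-5.4-mini's [49%, 81%], so those two are not cleanly separated.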
|
|
86
|
+
- ✅ GPT-4o falls for the hidden-OCR trap in 100% of cases (10/10; CI [72%, 100%]). Every failure returned the hidden-OCR amount specifically.
|
|
87
|
+
- ✅ GPT-5.4 fixes most of it (80% pass on hidden OCR) — a real generational improvement.
|
|
88
|
+
- ❌ "Claude leads" — Sonnet's CI [83%, 99%] overlaps with Gemini's [78%, 98%]. The two are statistically indistinguishable on this suite. Don't read ordinal rankings from 30 cases.
|
|
89
|
+
- ❌ "PDF Hell is sufficient to evaluate document AI." It's a stress test for three specific failure modes. Pair it with a domain benchmark (DocVQA, your own regression suite) for coverage.
|
|
90
|
+
|
|
91
|
+
Suite hash: `8ad87b8d` (mini-v1, 30 cases). Every leaderboard row above was measured on the same hash. Raw run JSON at <https://github.com/multivon-ai/multivon-web/tree/main/public/data/pdfhell-runs>.
|
|
62
92
|
|
|
63
|
-
## What's in
|
|
93
|
+
## What's in mini-v1
|
|
64
94
|
|
|
65
95
|
| Trap family | Cases | What breaks |
|
|
66
96
|
|---|---|---|
|
|
@@ -68,9 +98,9 @@ pass: 3/3 (100.0%)
|
|
|
68
98
|
| `footnote_override` | 10 | Legal clauses where a 6pt footnote overrides the body — liability caps with carve-outs, terminations with restrictions, data-residency with disaster-recovery exceptions. |
|
|
69
99
|
| `split_table_across_pages` | 10 | Financial tables where the header row sits on page 1 and the body rows on page 2. RAG loaders that paginate independently lose column context. |
|
|
70
100
|
|
|
71
|
-
Every case has a deterministic seed. Re-running with the same seed regenerates **byte-identical PDFs** and identical answer keys
|
|
101
|
+
Every case has a deterministic seed. Re-running with the same seed regenerates **byte-identical PDFs** and identical answer keys (`Canvas(invariant=True)` on every generator).
|
|
72
102
|
|
|
73
|
-
The
|
|
103
|
+
**Suite versioning.** The `mini-v1` label and the suite hash (`8ad87b8d`) together fingerprint the exact (trap_family, seed) pairs measured. Adding a new trap family produces `mini-v2` with a different hash — runs across different hashes are not directly comparable. See the next section for the roadmap.
|
|
74
104
|
|
|
75
105
|
## Why this exists
|
|
76
106
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "pdfhell"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.1.2"
|
|
8
8
|
description = "PDF Hell — adversarial PDFs that break AI document readers. Procedural ground truth, not LLM-as-judge."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Tests for the statistical-rigor additions: Wilson CIs + suite versioning.
|
|
2
|
+
|
|
3
|
+
The professor-persona review of pdfhell flagged two methodology gaps:
|
|
4
|
+
single-point pass rates without confidence intervals, and unversioned
|
|
5
|
+
suites that mutate as we add trap families. These tests guard the
|
|
6
|
+
fixes.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import pytest
|
|
11
|
+
|
|
12
|
+
from pdfhell.case import HellCase
|
|
13
|
+
from pdfhell.scorer import score_case, summarise, wilson_ci
|
|
14
|
+
from pdfhell.suite import SUITES, mini_suite, smoke_suite
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# ─── Wilson CI math ────────────────────────────────────────────────────────
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_wilson_ci_perfect_score_small_n():
|
|
21
|
+
"""10/10 passes — the CI lower bound is well below 1.0 (small-sample
|
|
22
|
+
uncertainty). This is the case that motivated adding CIs in the
|
|
23
|
+
first place — a per-trap 10/10 is not statistically distinguishable
|
|
24
|
+
from a true rate of 75%."""
|
|
25
|
+
lo, hi = wilson_ci(10, 10)
|
|
26
|
+
assert 0.65 < lo < 0.80, f"unexpected lower bound: {lo}"
|
|
27
|
+
assert hi == pytest.approx(1.0)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_wilson_ci_zero_score_small_n():
|
|
31
|
+
"""0/10 passes — symmetric to the 10/10 case. Upper bound is well
|
|
32
|
+
above 0.0."""
|
|
33
|
+
lo, hi = wilson_ci(0, 10)
|
|
34
|
+
assert lo == pytest.approx(0.0)
|
|
35
|
+
assert 0.20 < hi < 0.35
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_wilson_ci_thirty_case_ci_width():
|
|
39
|
+
"""The mini-suite n=30 at 28/30 (93%) — Wilson CI width must be wide
|
|
40
|
+
enough that 28/30 vs 29/30 is NOT clearly separable. This guards
|
|
41
|
+
against accidentally tightening to a narrower interval (e.g. Wald)
|
|
42
|
+
that would mislead users."""
|
|
43
|
+
lo_28, hi_28 = wilson_ci(28, 30)
|
|
44
|
+
lo_29, hi_29 = wilson_ci(29, 30)
|
|
45
|
+
# The two intervals overlap substantially.
|
|
46
|
+
assert lo_29 < hi_28, "97% vs 93% CIs should overlap at n=30"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_wilson_ci_empty_run_is_vacuous():
|
|
50
|
+
"""n=0 → CI is the full [0, 1]. Don't crash on empty runs."""
|
|
51
|
+
lo, hi = wilson_ci(0, 0)
|
|
52
|
+
assert lo == 0.0
|
|
53
|
+
assert hi == 1.0
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def test_wilson_ci_z_parameter():
|
|
57
|
+
"""99% CI is wider than 95% CI for the same data."""
|
|
58
|
+
lo95, hi95 = wilson_ci(7, 10)
|
|
59
|
+
lo99, hi99 = wilson_ci(7, 10, z=2.576)
|
|
60
|
+
assert lo99 < lo95
|
|
61
|
+
assert hi99 > hi95
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ─── Suite versioning ──────────────────────────────────────────────────────
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_mini_suite_is_versioned():
|
|
68
|
+
spec = mini_suite()
|
|
69
|
+
assert spec.version == "mini-v1"
|
|
70
|
+
assert spec.suite_hash, "suite_hash must be set"
|
|
71
|
+
assert len(spec.suite_hash) == 8
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_smoke_suite_is_versioned():
|
|
75
|
+
spec = smoke_suite()
|
|
76
|
+
assert spec.version == "smoke-v1"
|
|
77
|
+
assert spec.suite_hash
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_suite_hash_is_deterministic():
|
|
81
|
+
"""Same trap-seed contents → same hash."""
|
|
82
|
+
a = mini_suite().suite_hash
|
|
83
|
+
b = mini_suite().suite_hash
|
|
84
|
+
assert a == b
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def test_suite_hash_differs_with_different_seeds():
|
|
88
|
+
"""Mutating the seeds changes the hash. Adding a new trap family
|
|
89
|
+
must not silently keep the same suite_hash."""
|
|
90
|
+
a = mini_suite()
|
|
91
|
+
b = mini_suite()
|
|
92
|
+
b.traps["new_trap_family"] = [9001]
|
|
93
|
+
assert a.suite_hash != b.suite_hash
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def test_suites_registered():
|
|
97
|
+
assert "mini" in SUITES
|
|
98
|
+
assert "smoke" in SUITES
|
|
99
|
+
assert SUITES["mini"].version == "mini-v1"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ─── SuiteReport CI integration ────────────────────────────────────────────
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _make_case(expected: str) -> HellCase:
|
|
106
|
+
return HellCase(
|
|
107
|
+
id="t-0001",
|
|
108
|
+
trap_family="hidden_ocr_mismatch",
|
|
109
|
+
seed=1,
|
|
110
|
+
question="x?",
|
|
111
|
+
expected_answer=expected,
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def test_suite_report_carries_pass_rate_ci():
|
|
116
|
+
cases = [score_case(_make_case("$1.00"), "$1.00") for _ in range(10)]
|
|
117
|
+
cases += [score_case(_make_case("$2.00"), "wrong") for _ in range(5)]
|
|
118
|
+
report = summarise("test:model", "mini", cases)
|
|
119
|
+
lo, hi = report.pass_rate_ci
|
|
120
|
+
assert 0.0 <= lo <= report.pass_rate <= hi <= 1.0
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def test_suite_report_to_dict_includes_cis_and_version():
|
|
124
|
+
cases = [score_case(_make_case("$1.00"), "$1.00") for _ in range(3)]
|
|
125
|
+
report = summarise("test:model", "mini", cases)
|
|
126
|
+
report.suite_version = "mini-v1"
|
|
127
|
+
report.suite_hash = "deadbeef"
|
|
128
|
+
d = report.to_dict()
|
|
129
|
+
assert "pass_rate_ci" in d
|
|
130
|
+
assert "per_trap_pass_ci" in d
|
|
131
|
+
assert d["suite_version"] == "mini-v1"
|
|
132
|
+
assert d["suite_hash"] == "deadbeef"
|
|
133
|
+
# CIs are lists not tuples (JSON-friendly).
|
|
134
|
+
assert isinstance(d["pass_rate_ci"], list)
|
|
135
|
+
assert len(d["pass_rate_ci"]) == 2
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def test_per_trap_ci_uses_actual_case_counts():
|
|
139
|
+
"""Per-trap CI must reflect the number of cases in that family.
|
|
140
|
+
Mixing families with different N counts shouldn't collapse to one
|
|
141
|
+
aggregate."""
|
|
142
|
+
cases = [
|
|
143
|
+
# 10 hidden_ocr passes
|
|
144
|
+
*[score_case(_make_case("$1.00"), "$1.00") for _ in range(10)],
|
|
145
|
+
]
|
|
146
|
+
report = summarise("test:model", "mini", cases)
|
|
147
|
+
cis = report.per_trap_pass_ci
|
|
148
|
+
assert "hidden_ocr_mismatch" in cis
|
|
149
|
+
lo, hi = cis["hidden_ocr_mismatch"]
|
|
150
|
+
# 10/10 at n=10 → lower bound ~0.72
|
|
151
|
+
assert 0.65 < lo < 0.80
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|