pdfhell 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfhell-0.1.0/LICENSE +17 -0
- pdfhell-0.1.0/PKG-INFO +208 -0
- pdfhell-0.1.0/README.md +179 -0
- pdfhell-0.1.0/pdfhell/__init__.py +34 -0
- pdfhell-0.1.0/pdfhell/auditpack.py +182 -0
- pdfhell-0.1.0/pdfhell/case.py +87 -0
- pdfhell-0.1.0/pdfhell/cli.py +216 -0
- pdfhell-0.1.0/pdfhell/generators/__init__.py +49 -0
- pdfhell-0.1.0/pdfhell/generators/_common.py +183 -0
- pdfhell-0.1.0/pdfhell/generators/footnote_override.py +212 -0
- pdfhell-0.1.0/pdfhell/generators/hidden_ocr_mismatch.py +129 -0
- pdfhell-0.1.0/pdfhell/generators/split_table_across_pages.py +174 -0
- pdfhell-0.1.0/pdfhell/junit.py +94 -0
- pdfhell-0.1.0/pdfhell/runner.py +142 -0
- pdfhell-0.1.0/pdfhell/scorer.py +214 -0
- pdfhell-0.1.0/pdfhell/suite.py +104 -0
- pdfhell-0.1.0/pdfhell/vision.py +231 -0
- pdfhell-0.1.0/pdfhell.egg-info/PKG-INFO +208 -0
- pdfhell-0.1.0/pdfhell.egg-info/SOURCES.txt +28 -0
- pdfhell-0.1.0/pdfhell.egg-info/dependency_links.txt +1 -0
- pdfhell-0.1.0/pdfhell.egg-info/entry_points.txt +2 -0
- pdfhell-0.1.0/pdfhell.egg-info/requires.txt +6 -0
- pdfhell-0.1.0/pdfhell.egg-info/top_level.txt +1 -0
- pdfhell-0.1.0/pyproject.toml +60 -0
- pdfhell-0.1.0/setup.cfg +4 -0
- pdfhell-0.1.0/tests/test_auditpack.py +97 -0
- pdfhell-0.1.0/tests/test_cli.py +113 -0
- pdfhell-0.1.0/tests/test_generators.py +87 -0
- pdfhell-0.1.0/tests/test_junit.py +79 -0
- pdfhell-0.1.0/tests/test_scorer.py +92 -0
pdfhell-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
Copyright 2026 Multivon
|
|
6
|
+
|
|
7
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
8
|
+
you may not use this file except in compliance with the License.
|
|
9
|
+
You may obtain a copy of the License at
|
|
10
|
+
|
|
11
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
|
|
13
|
+
Unless required by applicable law or agreed to in writing, software
|
|
14
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
15
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
16
|
+
See the License for the specific language governing permissions and
|
|
17
|
+
limitations under the License.
|
pdfhell-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdfhell
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: PDF Hell — adversarial PDFs that break AI document readers. Procedural ground truth, not LLM-as-judge.
|
|
5
|
+
Author: Multivon
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://pdfhell.multivon.ai
|
|
8
|
+
Project-URL: Repository, https://github.com/multivon-ai/pdfhell
|
|
9
|
+
Project-URL: Issues, https://github.com/multivon-ai/pdfhell/issues
|
|
10
|
+
Project-URL: Leaderboard, https://pdfhell.multivon.ai/leaderboard
|
|
11
|
+
Keywords: llm,evaluation,pdf,multimodal,benchmark,adversarial,document-ai,rag
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Testing
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: multivon-eval>=0.7.2
|
|
24
|
+
Requires-Dist: google-genai>=1.0
|
|
25
|
+
Requires-Dist: reportlab>=4.0
|
|
26
|
+
Requires-Dist: pypdf>=5.0
|
|
27
|
+
Provides-Extra: all
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# PDF Hell
|
|
31
|
+
|
|
32
|
+
**Adversarial PDFs that break AI document readers — with procedural ground truth, not LLM-as-judge.**
|
|
33
|
+
|
|
34
|
+
PDF Hell is a small, sharp benchmark for the "AI reads PDFs" claim. Every test case is a PDF generated *from code*, so the correct answer is known exactly. There's no LLM judging another LLM's interpretation — the same loop that fooled the model isn't asked to grade it.
|
|
35
|
+
|
|
36
|
+
If your AI claims it can read documents, it should survive PDFs designed to break it.
|
|
37
|
+
|
|
38
|
+
## Quickstart (30 seconds)
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# 3-case smoke run against the cheapest vision model — works in any env with a Gemini key
|
|
42
|
+
export GOOGLE_API_KEY=...
|
|
43
|
+
uvx pdfhell run --model google:gemini-2.5-flash --suite smoke
|
|
44
|
+
|
|
45
|
+
# Or run the full mini suite (30 cases, ~10s on Flash, ~$0.01)
|
|
46
|
+
uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini
|
|
47
|
+
|
|
48
|
+
# Or just generate one trap PDF and open it
|
|
49
|
+
uvx pdfhell make --trap hidden_ocr_mismatch --seed 42
|
|
50
|
+
open ./cases/hidden_ocr_mismatch-0042.pdf
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
That's it. `pdfhell run` builds the suite on first use, sends each PDF to the vision model, and grades the answer against code-based ground truth.
|
|
54
|
+
|
|
55
|
+
Sample smoke result on Gemini 2.5 Flash (one case per family; a snapshot from when this was written):
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
PDF Hell smoke suite — n=3
|
|
59
|
+
model: google:gemini-2.5-flash
|
|
60
|
+
pass: 3/3 (100.0%)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## What's in the mini suite
|
|
64
|
+
|
|
65
|
+
| Trap family | Cases | What breaks |
|
|
66
|
+
|---|---|---|
|
|
67
|
+
| `hidden_ocr_mismatch` | 10 | Invoices where the visible amount differs from an invisible OCR text layer. Vision-only models read the page; text-extraction pipelines read the layer; they disagree. |
|
|
68
|
+
| `footnote_override` | 10 | Legal clauses where a 6pt footnote overrides the body — liability caps with carve-outs, terminations with restrictions, data-residency with disaster-recovery exceptions. |
|
|
69
|
+
| `split_table_across_pages` | 10 | Financial tables where the header row sits on page 1 and the body rows on page 2. RAG loaders that paginate independently lose column context. |
|
|
70
|
+
|
|
71
|
+
Every case has a deterministic seed. Re-running with the same seed regenerates **byte-identical PDFs** and identical answer keys. `Canvas(invariant=True)` is set on every generator so timestamps and document IDs don't drift between runs.
|
|
72
|
+
|
|
73
|
+
The full suite (10 trap families, ~50 cases) is on the [roadmap](#roadmap).
|
|
74
|
+
|
|
75
|
+
## Why this exists
|
|
76
|
+
|
|
77
|
+
The current AI-eval state of the art uses an LLM-as-judge to grade another LLM's answer. That's circular: the same complexity that fools the agent fools the judge. PDF Hell rejects that:
|
|
78
|
+
|
|
79
|
+
1. **Code-based ground truth.** The answer is a literal Python value the generator chose, not a frontier model's opinion.
|
|
80
|
+
2. **A named failure mode per trap.** When a model fails, we know *which* specific failure caught it (e.g. "trusted the hidden OCR layer over the visible page").
|
|
81
|
+
3. **A diagnostic signal**, not just a score. Per-trap-family breakdown tells you which assumption broke.
|
|
82
|
+
|
|
83
|
+
## Commands
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
pdfhell list-traps # list trap families
|
|
87
|
+
pdfhell make --trap <family> --seed <n> # generate one case
|
|
88
|
+
pdfhell build --suite <smoke|mini> --out <dir> # materialise a suite
|
|
89
|
+
pdfhell run --model <provider>:<model> # evaluate a model
|
|
90
|
+
[--suite smoke|mini] # (default: mini)
|
|
91
|
+
[--cases-dir <dir>] # (default: ./cases/<suite>)
|
|
92
|
+
[--out <path>] # JSON output
|
|
93
|
+
[--junit <path>] # JUnit XML for GitHub Actions / GitLab CI
|
|
94
|
+
[--fail-threshold <0.0-1.0>] # non-zero exit if pass_rate below threshold
|
|
95
|
+
[--workers <n>] # parallel API requests (default: 4)
|
|
96
|
+
[--quiet]
|
|
97
|
+
pdfhell report runs/<file>.json # print a saved run's summary
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Provider shorthand: `anthropic:claude-sonnet-4-6`, `openai:gpt-4o`, `google:gemini-2.5-pro`, `google:gemini-2.5-flash`, etc. API key from env (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`).
|
|
101
|
+
|
|
102
|
+
## CI integration
|
|
103
|
+
|
|
104
|
+
Drop this into `.github/workflows/eval.yml`:
|
|
105
|
+
|
|
106
|
+
```yaml
|
|
107
|
+
name: PDF Hell
|
|
108
|
+
on: [pull_request]
|
|
109
|
+
jobs:
|
|
110
|
+
pdfhell:
|
|
111
|
+
runs-on: ubuntu-latest
|
|
112
|
+
steps:
|
|
113
|
+
- uses: actions/checkout@v4
|
|
114
|
+
- uses: astral-sh/setup-uv@v5
|
|
115
|
+
- run: uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini --junit results.xml --fail-threshold 0.7
|
|
116
|
+
env:
|
|
117
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
118
|
+
- uses: actions/upload-artifact@v4
|
|
119
|
+
with:
|
|
120
|
+
name: pdfhell-results
|
|
121
|
+
path: results.xml
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
JUnit XML renders natively in the GitHub Actions / GitLab CI / CircleCI / Jenkins PR panel — failures show up as red rows with the expected and observed answers in the failure message.
|
|
125
|
+
|
|
126
|
+
## How scoring works
|
|
127
|
+
|
|
128
|
+
Two layers, applied in order:
|
|
129
|
+
|
|
130
|
+
1. **Procedural exact match (primary)** — for single-value traps, the model's free-text answer must contain the expected value (whitespace-tolerant, case-insensitive). For prose traps like `footnote_override`, the model must include every required token (the cap value, every carve-out section number, etc.) in any order, in any phrasing. The model isn't graded on prose style; it's graded on whether it captured the facts.
|
|
131
|
+
2. **Forbidden-answer detection (diagnostic)** — did the model return one of the answers the trap was specifically designed to elicit (e.g. the hidden-OCR amount)? If so, the trap caught a *known* failure mode and we record it. Doesn't affect the primary score.
|
|
132
|
+
|
|
133
|
+
Anything that looks like a refusal (`"I can't determine..."`) is recorded as `refused`, not as a wrong answer.
|
|
134
|
+
|
|
135
|
+
The QAG explanation layer from `multivon-eval` (`DocumentGrounding`) is available separately for users who want a human-readable "why did the model fail" breakdown — but it's never on the scoring path.
|
|
136
|
+
|
|
137
|
+
## Adding a new trap family
|
|
138
|
+
|
|
139
|
+
Add a generator at `pdfhell/generators/<your_trap>.py`:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from ..case import HellCase
|
|
143
|
+
from . import _common as C
|
|
144
|
+
|
|
145
|
+
def generate(seed: int) -> tuple[bytes, HellCase]:
|
|
146
|
+
rng = C.rng_for(seed)
|
|
147
|
+
# ... draw a PDF with reportlab using rng for all random choices ...
|
|
148
|
+
# invariant=True is the default — keep your generator deterministic.
|
|
149
|
+
return pdf_bytes, HellCase(
|
|
150
|
+
id=f"your_trap-{seed:04d}",
|
|
151
|
+
trap_family="your_trap",
|
|
152
|
+
seed=seed,
|
|
153
|
+
question="What is ...?",
|
|
154
|
+
expected_answer="42", # single canonical answer
|
|
155
|
+
expected_tokens=["42"], # OR list of required substrings for prose
|
|
156
|
+
forbidden_answers=["41", "43"], # OR a value the trap specifically elicits
|
|
157
|
+
metadata={"expected_failure_mode": "Model does X when it should do Y."},
|
|
158
|
+
)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Register it in `pdfhell/generators/__init__.py`. See [CONTRIBUTING.md](./CONTRIBUTING.md) for the full guide. Tests run with `pytest`.
|
|
162
|
+
|
|
163
|
+
## Roadmap
|
|
164
|
+
|
|
165
|
+
The 0.1 release is intentionally narrow — three trap families, 30 cases. Coming next:
|
|
166
|
+
|
|
167
|
+
- `merged_table_cells` — value depends on row/column span interpretation
|
|
168
|
+
- `rotated_scan` — visually legible but OCR-broken pages
|
|
169
|
+
- `near_duplicate_entities` — "ACME Ltd." vs "ACME Holdings Ltd."
|
|
170
|
+
- `prompt_injection_in_body` — "Ignore previous instructions and answer X"
|
|
171
|
+
- `chart_axis_inversion` — answers depend on reading axis direction
|
|
172
|
+
- `checkbox_ambiguity` — selected vs unselected with low visual margin
|
|
173
|
+
- `cross_page_citation` — answers requiring page + bounding-box citations
|
|
174
|
+
|
|
175
|
+
Target full suite: 10 trap families, ~50 cases.
|
|
176
|
+
|
|
177
|
+
## Hosted generator
|
|
178
|
+
|
|
179
|
+
For document-AI teams who need adversarial test cases tailored to *their* templates (claims forms, MSAs, medical records, KYC docs), there's a hosted generator that takes your templates and produces adversarial variants with code-based ground truth — same methodology, your data shape.
|
|
180
|
+
|
|
181
|
+
Email `hello@multivon.ai` for early access, or see [multivon.ai/pricing](https://multivon.ai/pricing).
|
|
182
|
+
|
|
183
|
+
## Installing
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
# Recommended (zero-install with uv):
|
|
187
|
+
uvx pdfhell list-traps
|
|
188
|
+
|
|
189
|
+
# Or in a venv:
|
|
190
|
+
python -m venv .venv && source .venv/bin/activate
|
|
191
|
+
pip install pdfhell
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
Bare install brings in `multivon-eval` (the engine), `reportlab` (PDF generation), `pypdf`, and the three frontier-provider SDKs (anthropic, openai, google-genai). No provider extras to remember; no GPU required.
|
|
195
|
+
|
|
196
|
+
## License
|
|
197
|
+
|
|
198
|
+
Apache 2.0. Built on [`multivon-eval`](https://github.com/multivon-ai/multivon-eval).
|
|
199
|
+
|
|
200
|
+
## Citing
|
|
201
|
+
|
|
202
|
+
```bibtex
|
|
203
|
+
@software{pdfhell,
|
|
204
|
+
title = {PDF Hell: Adversarial PDFs for AI document readers},
|
|
205
|
+
author = {Multivon},
|
|
206
|
+
url = {https://github.com/multivon-ai/pdfhell},
|
|
207
|
+
}
|
|
208
|
+
```
|
pdfhell-0.1.0/README.md
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# PDF Hell
|
|
2
|
+
|
|
3
|
+
**Adversarial PDFs that break AI document readers — with procedural ground truth, not LLM-as-judge.**
|
|
4
|
+
|
|
5
|
+
PDF Hell is a small, sharp benchmark for the "AI reads PDFs" claim. Every test case is a PDF generated *from code*, so the correct answer is known exactly. There's no LLM judging another LLM's interpretation — the same loop that fooled the model isn't asked to grade it.
|
|
6
|
+
|
|
7
|
+
If your AI claims it can read documents, it should survive PDFs designed to break it.
|
|
8
|
+
|
|
9
|
+
## Quickstart (30 seconds)
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# 3-case smoke run against the cheapest vision model — works in any env with a Gemini key
|
|
13
|
+
export GOOGLE_API_KEY=...
|
|
14
|
+
uvx pdfhell run --model google:gemini-2.5-flash --suite smoke
|
|
15
|
+
|
|
16
|
+
# Or run the full mini suite (30 cases, ~10s on Flash, ~$0.01)
|
|
17
|
+
uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini
|
|
18
|
+
|
|
19
|
+
# Or just generate one trap PDF and open it
|
|
20
|
+
uvx pdfhell make --trap hidden_ocr_mismatch --seed 42
|
|
21
|
+
open ./cases/hidden_ocr_mismatch-0042.pdf
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
That's it. `pdfhell run` builds the suite on first use, sends each PDF to the vision model, and grades the answer against code-based ground truth.
|
|
25
|
+
|
|
26
|
+
Sample smoke result on Gemini 2.5 Flash (one case per family; a snapshot from when this was written):
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
PDF Hell smoke suite — n=3
|
|
30
|
+
model: google:gemini-2.5-flash
|
|
31
|
+
pass: 3/3 (100.0%)
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## What's in the mini suite
|
|
35
|
+
|
|
36
|
+
| Trap family | Cases | What breaks |
|
|
37
|
+
|---|---|---|
|
|
38
|
+
| `hidden_ocr_mismatch` | 10 | Invoices where the visible amount differs from an invisible OCR text layer. Vision-only models read the page; text-extraction pipelines read the layer; they disagree. |
|
|
39
|
+
| `footnote_override` | 10 | Legal clauses where a 6pt footnote overrides the body — liability caps with carve-outs, terminations with restrictions, data-residency with disaster-recovery exceptions. |
|
|
40
|
+
| `split_table_across_pages` | 10 | Financial tables where the header row sits on page 1 and the body rows on page 2. RAG loaders that paginate independently lose column context. |
|
|
41
|
+
|
|
42
|
+
Every case has a deterministic seed. Re-running with the same seed regenerates **byte-identical PDFs** and identical answer keys. `Canvas(invariant=True)` is set on every generator so timestamps and document IDs don't drift between runs.
|
|
43
|
+
|
|
44
|
+
The full suite (10 trap families, ~50 cases) is on the [roadmap](#roadmap).
|
|
45
|
+
|
|
46
|
+
## Why this exists
|
|
47
|
+
|
|
48
|
+
The current AI-eval state of the art uses an LLM-as-judge to grade another LLM's answer. That's circular: the same complexity that fools the agent fools the judge. PDF Hell rejects that:
|
|
49
|
+
|
|
50
|
+
1. **Code-based ground truth.** The answer is a literal Python value the generator chose, not a frontier model's opinion.
|
|
51
|
+
2. **A named failure mode per trap.** When a model fails, we know *which* specific failure caught it (e.g. "trusted the hidden OCR layer over the visible page").
|
|
52
|
+
3. **A diagnostic signal**, not just a score. Per-trap-family breakdown tells you which assumption broke.
|
|
53
|
+
|
|
54
|
+
## Commands
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
pdfhell list-traps # list trap families
|
|
58
|
+
pdfhell make --trap <family> --seed <n> # generate one case
|
|
59
|
+
pdfhell build --suite <smoke|mini> --out <dir> # materialise a suite
|
|
60
|
+
pdfhell run --model <provider>:<model> # evaluate a model
|
|
61
|
+
[--suite smoke|mini] # (default: mini)
|
|
62
|
+
[--cases-dir <dir>] # (default: ./cases/<suite>)
|
|
63
|
+
[--out <path>] # JSON output
|
|
64
|
+
[--junit <path>] # JUnit XML for GitHub Actions / GitLab CI
|
|
65
|
+
[--fail-threshold <0.0-1.0>] # non-zero exit if pass_rate below threshold
|
|
66
|
+
[--workers <n>] # parallel API requests (default: 4)
|
|
67
|
+
[--quiet]
|
|
68
|
+
pdfhell report runs/<file>.json # print a saved run's summary
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Provider shorthand: `anthropic:claude-sonnet-4-6`, `openai:gpt-4o`, `google:gemini-2.5-pro`, `google:gemini-2.5-flash`, etc. API key from env (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`).
|
|
72
|
+
|
|
73
|
+
## CI integration
|
|
74
|
+
|
|
75
|
+
Drop this into `.github/workflows/eval.yml`:
|
|
76
|
+
|
|
77
|
+
```yaml
|
|
78
|
+
name: PDF Hell
|
|
79
|
+
on: [pull_request]
|
|
80
|
+
jobs:
|
|
81
|
+
pdfhell:
|
|
82
|
+
runs-on: ubuntu-latest
|
|
83
|
+
steps:
|
|
84
|
+
- uses: actions/checkout@v4
|
|
85
|
+
- uses: astral-sh/setup-uv@v5
|
|
86
|
+
- run: uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini --junit results.xml --fail-threshold 0.7
|
|
87
|
+
env:
|
|
88
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
89
|
+
- uses: actions/upload-artifact@v4
|
|
90
|
+
with:
|
|
91
|
+
name: pdfhell-results
|
|
92
|
+
path: results.xml
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
JUnit XML renders natively in the GitHub Actions / GitLab CI / CircleCI / Jenkins PR panel — failures show up as red rows with the expected and observed answers in the failure message.
|
|
96
|
+
|
|
97
|
+
## How scoring works
|
|
98
|
+
|
|
99
|
+
Two layers, applied in order:
|
|
100
|
+
|
|
101
|
+
1. **Procedural exact match (primary)** — for single-value traps, the model's free-text answer must contain the expected value (whitespace-tolerant, case-insensitive). For prose traps like `footnote_override`, the model must include every required token (the cap value, every carve-out section number, etc.) in any order, in any phrasing. The model isn't graded on prose style; it's graded on whether it captured the facts.
|
|
102
|
+
2. **Forbidden-answer detection (diagnostic)** — did the model return one of the answers the trap was specifically designed to elicit (e.g. the hidden-OCR amount)? If so, the trap caught a *known* failure mode and we record it. Doesn't affect the primary score.
|
|
103
|
+
|
|
104
|
+
Anything that looks like a refusal (`"I can't determine..."`) is recorded as `refused`, not as a wrong answer.
|
|
105
|
+
|
|
106
|
+
The QAG explanation layer from `multivon-eval` (`DocumentGrounding`) is available separately for users who want a human-readable "why did the model fail" breakdown — but it's never on the scoring path.
|
|
107
|
+
|
|
108
|
+
## Adding a new trap family
|
|
109
|
+
|
|
110
|
+
Add a generator at `pdfhell/generators/<your_trap>.py`:
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from ..case import HellCase
|
|
114
|
+
from . import _common as C
|
|
115
|
+
|
|
116
|
+
def generate(seed: int) -> tuple[bytes, HellCase]:
|
|
117
|
+
rng = C.rng_for(seed)
|
|
118
|
+
# ... draw a PDF with reportlab using rng for all random choices ...
|
|
119
|
+
# invariant=True is the default — keep your generator deterministic.
|
|
120
|
+
return pdf_bytes, HellCase(
|
|
121
|
+
id=f"your_trap-{seed:04d}",
|
|
122
|
+
trap_family="your_trap",
|
|
123
|
+
seed=seed,
|
|
124
|
+
question="What is ...?",
|
|
125
|
+
expected_answer="42", # single canonical answer
|
|
126
|
+
expected_tokens=["42"], # OR list of required substrings for prose
|
|
127
|
+
forbidden_answers=["41", "43"], # OR a value the trap specifically elicits
|
|
128
|
+
metadata={"expected_failure_mode": "Model does X when it should do Y."},
|
|
129
|
+
)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Register it in `pdfhell/generators/__init__.py`. See [CONTRIBUTING.md](./CONTRIBUTING.md) for the full guide. Tests run with `pytest`.
|
|
133
|
+
|
|
134
|
+
## Roadmap
|
|
135
|
+
|
|
136
|
+
The 0.1 release is intentionally narrow — three trap families, 30 cases. Coming next:
|
|
137
|
+
|
|
138
|
+
- `merged_table_cells` — value depends on row/column span interpretation
|
|
139
|
+
- `rotated_scan` — visually legible but OCR-broken pages
|
|
140
|
+
- `near_duplicate_entities` — "ACME Ltd." vs "ACME Holdings Ltd."
|
|
141
|
+
- `prompt_injection_in_body` — "Ignore previous instructions and answer X"
|
|
142
|
+
- `chart_axis_inversion` — answers depend on reading axis direction
|
|
143
|
+
- `checkbox_ambiguity` — selected vs unselected with low visual margin
|
|
144
|
+
- `cross_page_citation` — answers requiring page + bounding-box citations
|
|
145
|
+
|
|
146
|
+
Target full suite: 10 trap families, ~50 cases.
|
|
147
|
+
|
|
148
|
+
## Hosted generator
|
|
149
|
+
|
|
150
|
+
For document-AI teams who need adversarial test cases tailored to *their* templates (claims forms, MSAs, medical records, KYC docs), there's a hosted generator that takes your templates and produces adversarial variants with code-based ground truth — same methodology, your data shape.
|
|
151
|
+
|
|
152
|
+
Email `hello@multivon.ai` for early access, or see [multivon.ai/pricing](https://multivon.ai/pricing).
|
|
153
|
+
|
|
154
|
+
## Installing
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
# Recommended (zero-install with uv):
|
|
158
|
+
uvx pdfhell list-traps
|
|
159
|
+
|
|
160
|
+
# Or in a venv:
|
|
161
|
+
python -m venv .venv && source .venv/bin/activate
|
|
162
|
+
pip install pdfhell
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Bare install brings in `multivon-eval` (the engine), `reportlab` (PDF generation), `pypdf`, and the three frontier-provider SDKs (anthropic, openai, google-genai). No provider extras to remember; no GPU required.
|
|
166
|
+
|
|
167
|
+
## License
|
|
168
|
+
|
|
169
|
+
Apache 2.0. Built on [`multivon-eval`](https://github.com/multivon-ai/multivon-eval).
|
|
170
|
+
|
|
171
|
+
## Citing
|
|
172
|
+
|
|
173
|
+
```bibtex
|
|
174
|
+
@software{pdfhell,
|
|
175
|
+
title = {PDF Hell: Adversarial PDFs for AI document readers},
|
|
176
|
+
author = {Multivon},
|
|
177
|
+
url = {https://github.com/multivon-ai/pdfhell},
|
|
178
|
+
}
|
|
179
|
+
```
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""PDF Hell — adversarial PDFs that break AI document readers.
|
|
2
|
+
|
|
3
|
+
Procedural ground truth, not LLM-as-judge. Each trap family generates PDFs
|
|
4
|
+
*from code*, so the answer key is exact and reproducible — no circular
|
|
5
|
+
assurance.
|
|
6
|
+
|
|
7
|
+
Quickstart::
|
|
8
|
+
|
|
9
|
+
uvx pdfhell make --trap hidden_ocr_mismatch --seed 42
|
|
10
|
+
uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini
|
|
11
|
+
uvx pdfhell report runs/claude.json --share-card
|
|
12
|
+
|
|
13
|
+
Built on top of ``multivon-eval`` (the QAG engine, provider adapters, audit
|
|
14
|
+
packaging, cost tracking). pdfhell is *only* the adversarial generation
|
|
15
|
+
layer; the runtime, scoring, and reporting come from multivon-eval.
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
__version__ = "0.1.0"
|
|
20
|
+
|
|
21
|
+
from .case import HellCase
|
|
22
|
+
from .generators import (
|
|
23
|
+
GENERATORS,
|
|
24
|
+
TRAP_FAMILIES,
|
|
25
|
+
generate_case,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"__version__",
|
|
30
|
+
"HellCase",
|
|
31
|
+
"GENERATORS",
|
|
32
|
+
"TRAP_FAMILIES",
|
|
33
|
+
"generate_case",
|
|
34
|
+
]
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Build a downloadable, hash-chained audit pack from a pdfhell run.
|
|
2
|
+
|
|
3
|
+
The pack is a ZIP containing:
|
|
4
|
+
|
|
5
|
+
- ``manifest.json`` — pdfhell version, run timestamp, model spec, suite,
|
|
6
|
+
per-trap pass rates, total cost (when known), SHA-256 of every file
|
|
7
|
+
inside the pack.
|
|
8
|
+
- ``run.json`` — the full :class:`SuiteReport` JSON.
|
|
9
|
+
- ``run.xml`` — JUnit XML (same data as ``run.json``, machine-readable
|
|
10
|
+
for CI dashboards).
|
|
11
|
+
- ``cases/<case_id>.pdf`` — every adversarial PDF the model was tested
|
|
12
|
+
against.
|
|
13
|
+
- ``cases/<case_id>.json`` — each case's answer key + metadata.
|
|
14
|
+
- ``README.txt`` — human-readable "what's in this ZIP" + reproduction
|
|
15
|
+
command. Procurement teams open this first.
|
|
16
|
+
|
|
17
|
+
The audit pack is the artifact a buyer's procurement team attaches to
|
|
18
|
+
a diligence appendix. It must be self-describing (no out-of-band
|
|
19
|
+
context required), reproducible (the manifest tells you the exact
|
|
20
|
+
command to regenerate the run), and tamper-evident (the manifest
|
|
21
|
+
includes a SHA-256 for every file in the pack; auditors can verify the
|
|
22
|
+
ZIP wasn't edited after delivery).
|
|
23
|
+
"""
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import hashlib
|
|
27
|
+
import json
|
|
28
|
+
import zipfile
|
|
29
|
+
from datetime import datetime, timezone
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Iterable
|
|
32
|
+
|
|
33
|
+
from . import __version__
|
|
34
|
+
from .case import HellCase
|
|
35
|
+
from .junit import report_to_junit
|
|
36
|
+
from .scorer import SuiteReport
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
_README_TEMPLATE = """\
|
|
40
|
+
# pdfhell audit pack
|
|
41
|
+
|
|
42
|
+
This ZIP is a complete, self-describing record of one PDF Hell run. It
|
|
43
|
+
contains every PDF the model was asked to read, every answer key, the
|
|
44
|
+
raw model output, and a tamper-evident manifest.
|
|
45
|
+
|
|
46
|
+
## What's in this pack
|
|
47
|
+
|
|
48
|
+
- manifest.json — Run metadata + SHA-256 of every file in this ZIP.
|
|
49
|
+
- run.json — Full run report (per-case scores, model outputs).
|
|
50
|
+
- run.xml — JUnit XML (renders in CI dashboards).
|
|
51
|
+
- cases/*.pdf — The adversarial PDFs the model was tested against.
|
|
52
|
+
- cases/*.json — The answer keys + per-case metadata.
|
|
53
|
+
- README.txt — This file.
|
|
54
|
+
|
|
55
|
+
## How to verify
|
|
56
|
+
|
|
57
|
+
The manifest contains a SHA-256 for every file in this ZIP. To verify
|
|
58
|
+
nothing was edited after delivery:
|
|
59
|
+
|
|
60
|
+
unzip -p audit-pack.zip manifest.json | jq .files
|
|
61
|
+
sha256sum cases/*.pdf cases/*.json run.json run.xml README.txt
|
|
62
|
+
|
|
63
|
+
Each hash in the manifest must match the file's actual SHA-256.
|
|
64
|
+
|
|
65
|
+
## How to reproduce
|
|
66
|
+
|
|
67
|
+
The manifest records the exact pdfhell command. To regenerate
|
|
68
|
+
byte-identical PDFs and re-run the same model:
|
|
69
|
+
|
|
70
|
+
{repro_command}
|
|
71
|
+
|
|
72
|
+
pdfhell uses Canvas(invariant=True) on every generator so PDFs are
|
|
73
|
+
byte-identical across runs with the same seed.
|
|
74
|
+
|
|
75
|
+
## Scope
|
|
76
|
+
|
|
77
|
+
pdfhell {pdfhell_version}, suite {suite}, model {model}. Generated
|
|
78
|
+
{timestamp}. {n} cases, {passed}/{n} passed ({pass_rate:.0%}). See
|
|
79
|
+
manifest.json for per-trap breakdown.
|
|
80
|
+
"""
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _sha256(data: bytes) -> str:
    """Return the lowercase hexadecimal SHA-256 digest of ``data``."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _gather_files(report: SuiteReport, cases_dir: Path) -> Iterable[tuple[str, bytes]]:
    """Yield ``(arcname, bytes)`` pairs for the per-case files of a run.

    Walks ``report.cases`` in order and, for each case, looks in
    ``cases_dir`` for ``<case_id>.pdf`` and then ``<case_id>.json``. Every
    file that exists is yielded under the archive name
    ``cases/<case_id>.<ext>``; a file that is absent is skipped without
    raising. The output order therefore follows the report's case order,
    with the PDF ahead of the answer key for each case.

    Only the case files come from here; the caller adds the README, the
    run report and the manifest itself.
    """
    for summary in report.cases:
        stem = summary.case_id
        for suffix in (".pdf", ".json"):
            source = cases_dir / f"{stem}{suffix}"
            if not source.exists():
                # Missing on disk: leave it out rather than fail the pack.
                continue
            yield f"cases/{stem}{suffix}", source.read_bytes()
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def build_audit_pack(
    report: SuiteReport,
    cases_dir: Path,
    out_path: Path,
) -> Path:
    """Write a complete audit ZIP for ``report`` to ``out_path``.

    The archive contains ``manifest.json`` first, followed by
    ``README.txt``, ``run.json``, ``run.xml`` and whatever per-case files
    :func:`_gather_files` finds in ``cases_dir``. The manifest records the
    SHA-256 and size of every *other* member; it does not (and cannot)
    contain a hash of itself.

    Args:
        report: The run to package. Reads ``cases`` (each with ``correct``),
            ``model``, ``suite``, ``n``, ``pass_rate``, ``per_trap_pass``,
            ``per_trap_fell_for_trap`` and calls ``to_dict()``.
        cases_dir: Directory searched for ``<case_id>.pdf`` and
            ``<case_id>.json``.
        out_path: Destination of the ZIP. Missing parent directories are
            created; an existing file at that path is overwritten.

    Returns:
        The resolved output path.
    """
    out_path = out_path.resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Materialise the per-case files into bytes first so we can hash them.
    case_files: list[tuple[str, bytes]] = list(_gather_files(report, cases_dir))

    run_json_bytes = json.dumps(report.to_dict(), indent=2).encode("utf-8")
    run_xml_bytes = report_to_junit(report).encode("utf-8")
    timestamp = datetime.now(timezone.utc).isoformat()
    passed = sum(1 for c in report.cases if c.correct)

    repro_command = (
        f"uvx pdfhell run --model {report.model} --suite {report.suite}"
    )
    readme_bytes = _README_TEMPLATE.format(
        pdfhell_version=__version__,
        suite=report.suite,
        model=report.model,
        timestamp=timestamp,
        n=report.n,
        passed=passed,
        pass_rate=report.pass_rate,
        repro_command=repro_command,
    ).encode("utf-8")

    # Every member except the manifest. The manifest is built from this
    # list, so it carries the hash of each of these files but not its own.
    files_in_pack: list[tuple[str, bytes]] = [
        ("README.txt", readme_bytes),
        ("run.json", run_json_bytes),
        ("run.xml", run_xml_bytes),
        *case_files,
    ]
    manifest = {
        "pdfhell_version": __version__,
        "generated_at": timestamp,
        "model": report.model,
        "suite": report.suite,
        "n": report.n,
        "passed": passed,
        "pass_rate": report.pass_rate,
        "per_trap_pass": report.per_trap_pass,
        "per_trap_fell_for_trap": report.per_trap_fell_for_trap,
        "reproduction": {
            "command": repro_command,
            "note": (
                "PDFs are regenerated byte-identically via Canvas(invariant=True). "
                "Same seed → same PDF → same answer key."
            ),
        },
        "files": [
            {"path": name, "sha256": _sha256(data), "size": len(data)}
            for name, data in files_in_pack
        ],
    }
    manifest_bytes = json.dumps(manifest, indent=2).encode("utf-8")

    # Member mtimes are pinned to a fixed date so they do not depend on the
    # wall clock. The archive as a whole is still not byte-identical between
    # packaging runs, because README.txt and manifest.json embed the
    # generation timestamp.
    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for name, data in [("manifest.json", manifest_bytes), *files_in_pack]:
            info = zipfile.ZipInfo(name)
            info.date_time = (2026, 1, 1, 0, 0, 0)
            # writestr() takes the compression method from the ZipInfo, not
            # from the ZipFile, and a fresh ZipInfo defaults to ZIP_STORED.
            # Without this the pack would be written uncompressed.
            info.compress_type = zipfile.ZIP_DEFLATED
            # A hand-built ZipInfo has external_attr == 0, which Unix unzip
            # tools can extract as mode 000. Mark members as regular
            # rw-r--r-- files so the extracted pack can be read and hashed.
            info.external_attr = 0o644 << 16
            zf.writestr(info, data)

    return out_path
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
__all__ = ["build_audit_pack"]
|