pdfhell 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdfhell-0.1.0/LICENSE ADDED
@@ -0,0 +1,17 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ Copyright 2026 Multivon
6
+
7
+ Licensed under the Apache License, Version 2.0 (the "License");
8
+ you may not use this file except in compliance with the License.
9
+ You may obtain a copy of the License at
10
+
11
+ http://www.apache.org/licenses/LICENSE-2.0
12
+
13
+ Unless required by applicable law or agreed to in writing, software
14
+ distributed under the License is distributed on an "AS IS" BASIS,
15
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ See the License for the specific language governing permissions and
17
+ limitations under the License.
pdfhell-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,208 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdfhell
3
+ Version: 0.1.0
4
+ Summary: PDF Hell — adversarial PDFs that break AI document readers. Procedural ground truth, not LLM-as-judge.
5
+ Author: Multivon
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://pdfhell.multivon.ai
8
+ Project-URL: Repository, https://github.com/multivon-ai/pdfhell
9
+ Project-URL: Issues, https://github.com/multivon-ai/pdfhell/issues
10
+ Project-URL: Leaderboard, https://pdfhell.multivon.ai/leaderboard
11
+ Keywords: llm,evaluation,pdf,multimodal,benchmark,adversarial,document-ai,rag
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Software Development :: Testing
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: multivon-eval>=0.7.2
24
+ Requires-Dist: google-genai>=1.0
25
+ Requires-Dist: reportlab>=4.0
26
+ Requires-Dist: pypdf>=5.0
27
+ Provides-Extra: all
28
+ Dynamic: license-file
29
+
30
+ # PDF Hell
31
+
32
+ **Adversarial PDFs that break AI document readers — with procedural ground truth, not LLM-as-judge.**
33
+
34
+ PDF Hell is a small, sharp benchmark for the "AI reads PDFs" claim. Every test case is a PDF generated *from code*, so the correct answer is known exactly. There's no LLM judging another LLM's interpretation — the same loop that fooled the model isn't asked to grade it.
35
+
36
+ If your AI claims it can read documents, it should survive PDFs designed to break it.
37
+
38
+ ## Quickstart (30 seconds)
39
+
40
+ ```bash
41
+ # 3-case smoke run against the cheapest vision model — works in any env with a Gemini key
42
+ export GOOGLE_API_KEY=...
43
+ uvx pdfhell run --model google:gemini-2.5-flash --suite smoke
44
+
45
+ # Or run the full mini suite (30 cases, ~10s on Flash, ~$0.01)
46
+ uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini
47
+
48
+ # Or just generate one trap PDF and open it
49
+ uvx pdfhell make --trap hidden_ocr_mismatch --seed 42
50
+ open ./cases/hidden_ocr_mismatch-0042.pdf
51
+ ```
52
+
53
+ That's it. `pdfhell run` builds the suite on first use, sends each PDF to the vision model, and grades the answer against code-based ground truth.
54
+
55
+ Smoke result on Gemini 2.5 Flash (one case per family, at the time of writing):
56
+
57
+ ```
58
+ PDF Hell smoke suite — n=3
59
+ model: google:gemini-2.5-flash
60
+ pass: 3/3 (100.0%)
61
+ ```
62
+
63
+ ## What's in the mini suite
64
+
65
+ | Trap family | Cases | What breaks |
66
+ |---|---|---|
67
+ | `hidden_ocr_mismatch` | 10 | Invoices where the visible amount differs from an invisible OCR text layer. Vision-only models read the page; text-extraction pipelines read the layer; they disagree. |
68
+ | `footnote_override` | 10 | Legal clauses where a 6pt footnote overrides the body — liability caps with carve-outs, terminations with restrictions, data-residency with disaster-recovery exceptions. |
69
+ | `split_table_across_pages` | 10 | Financial tables where the header row sits on page 1 and the body rows on page 2. RAG loaders that paginate independently lose column context. |
70
+
71
+ Every case has a deterministic seed. Re-running with the same seed regenerates **byte-identical PDFs** and identical answer keys. `Canvas(invariant=True)` is set on every generator so timestamps and document IDs don't drift between runs.
72
+
73
+ The full suite (10 trap families, ~50 cases) is on the [roadmap](#roadmap).
74
+
75
+ ## Why this exists
76
+
77
+ The current AI-eval state of the art uses an LLM-as-judge to grade another LLM's answer. That's circular: the same complexity that fools the agent fools the judge. PDF Hell rejects that:
78
+
79
+ 1. **Code-based ground truth.** The answer is a literal Python value the generator chose, not a frontier model's opinion.
80
+ 2. **A named failure mode per trap.** When a model fails, we know *which* specific failure caught it (e.g. "trusted the hidden OCR layer over the visible page").
81
+ 3. **A diagnostic signal**, not just a score. Per-trap-family breakdown tells you which assumption broke.
82
+
83
+ ## Commands
84
+
85
+ ```
86
+ pdfhell list-traps # list trap families
87
+ pdfhell make --trap <family> --seed <n> # generate one case
88
+ pdfhell build --suite <smoke|mini> --out <dir> # materialise a suite
89
+ pdfhell run --model <provider>:<model> # evaluate a model
90
+ [--suite smoke|mini] # (default: mini)
91
+ [--cases-dir <dir>] # (default: ./cases/<suite>)
92
+ [--out <path>] # JSON output
93
+ [--junit <path>] # JUnit XML for GitHub Actions / GitLab CI
94
+ [--fail-threshold <0.0-1.0>] # non-zero exit if pass_rate below threshold
95
+ [--workers <n>] # parallel API requests (default: 4)
96
+ [--quiet]
97
+ pdfhell report runs/<file>.json # print a saved run's summary
98
+ ```
99
+
100
+ Provider shorthand: `anthropic:claude-sonnet-4-6`, `openai:gpt-4o`, `google:gemini-2.5-pro`, `google:gemini-2.5-flash`, etc. API key from env (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`).
101
+
102
+ ## CI integration
103
+
104
+ Drop this into `.github/workflows/eval.yml`:
105
+
106
+ ```yaml
107
+ name: PDF Hell
108
+ on: [pull_request]
109
+ jobs:
110
+ pdfhell:
111
+ runs-on: ubuntu-latest
112
+ steps:
113
+ - uses: actions/checkout@v4
114
+ - uses: astral-sh/setup-uv@v5
115
+ - run: uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini --junit results.xml --fail-threshold 0.7
116
+ env:
117
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
118
+ - uses: actions/upload-artifact@v4
119
+ with:
120
+ name: pdfhell-results
121
+ path: results.xml
122
+ ```
123
+
124
+ JUnit XML renders natively in the GitHub Actions / GitLab CI / CircleCI / Jenkins PR panel — failures show up as red rows with the expected and observed answers in the failure message.
125
+
126
+ ## How scoring works
127
+
128
+ Two layers, applied in order:
129
+
130
+ 1. **Procedural exact match (primary)** — for single-value traps, the model's free-text answer must contain the expected value (whitespace-tolerant, case-insensitive). For prose traps like `footnote_override`, the model must include every required token (the cap value, every carve-out section number, etc.) in any order, in any phrasing. The model isn't graded on prose style; it's graded on whether it captured the facts.
131
+ 2. **Forbidden-answer detection (diagnostic)** — did the model return one of the answers the trap was specifically designed to elicit (e.g. the hidden-OCR amount)? If so, the trap caught a *known* failure mode and we record it. Doesn't affect the primary score.
132
+
133
+ Anything that looks like a refusal (`"I can't determine..."`) is recorded as `refused`, not as a wrong answer.
134
+
135
+ The QAG explanation layer from `multivon-eval` (`DocumentGrounding`) is available separately for users who want a human-readable "why did the model fail" breakdown — but it's never on the scoring path.
136
+
137
+ ## Adding a new trap family
138
+
139
+ Add a generator at `pdfhell/generators/<your_trap>.py`:
140
+
141
+ ```python
142
+ from ..case import HellCase
143
+ from . import _common as C
144
+
145
+ def generate(seed: int) -> tuple[bytes, HellCase]:
146
+ rng = C.rng_for(seed)
147
+ # ... draw a PDF with reportlab using rng for all random choices ...
148
+ # invariant=True is the default — keep your generator deterministic.
149
+ return pdf_bytes, HellCase(
150
+ id=f"your_trap-{seed:04d}",
151
+ trap_family="your_trap",
152
+ seed=seed,
153
+ question="What is ...?",
154
+ expected_answer="42", # single canonical answer
155
+ expected_tokens=["42"], # OR list of required substrings for prose
156
+ forbidden_answers=["41", "43"], # OR a value the trap specifically elicits
157
+ metadata={"expected_failure_mode": "Model does X when it should do Y."},
158
+ )
159
+ ```
160
+
161
+ Register it in `pdfhell/generators/__init__.py`. See [CONTRIBUTING.md](./CONTRIBUTING.md) for the full guide. Tests run with `pytest`.
162
+
163
+ ## Roadmap
164
+
165
+ The 0.1 release is intentionally narrow — three trap families, 30 cases. Coming next:
166
+
167
+ - `merged_table_cells` — value depends on row/column span interpretation
168
+ - `rotated_scan` — visually legible but OCR-broken pages
169
+ - `near_duplicate_entities` — "ACME Ltd." vs "ACME Holdings Ltd."
170
+ - `prompt_injection_in_body` — "Ignore previous instructions and answer X"
171
+ - `chart_axis_inversion` — answers depend on reading axis direction
172
+ - `checkbox_ambiguity` — selected vs unselected with low visual margin
173
+ - `cross_page_citation` — answers requiring page + bounding-box citations
174
+
175
+ Target full suite: 10 trap families, ~50 cases.
176
+
177
+ ## Hosted generator
178
+
179
+ For document-AI teams who need adversarial test cases tailored to *their* templates (claims forms, MSAs, medical records, KYC docs), there's a hosted generator that takes your templates and produces adversarial variants with code-based ground truth — same methodology, your data shape.
180
+
181
+ Email `hello@multivon.ai` for early access, or see [multivon.ai/pricing](https://multivon.ai/pricing).
182
+
183
+ ## Installing
184
+
185
+ ```bash
186
+ # Recommended (zero-install with uv):
187
+ uvx pdfhell list-traps
188
+
189
+ # Or in a venv:
190
+ python -m venv .venv && source .venv/bin/activate
191
+ pip install pdfhell
192
+ ```
193
+
194
+ Bare install brings in `multivon-eval` (the engine), `reportlab` (PDF generation), `pypdf`, and the three frontier-provider SDKs (anthropic, openai, google-genai). No provider extras to remember; no GPU required.
195
+
196
+ ## License
197
+
198
+ Apache 2.0. Built on [`multivon-eval`](https://github.com/multivon-ai/multivon-eval).
199
+
200
+ ## Citing
201
+
202
+ ```bibtex
203
+ @software{pdfhell,
204
+ title = {PDF Hell: Adversarial PDFs for AI document readers},
205
+ author = {Multivon},
206
+ url = {https://github.com/multivon-ai/pdfhell},
207
+ }
208
+ ```
@@ -0,0 +1,179 @@
1
+ # PDF Hell
2
+
3
+ **Adversarial PDFs that break AI document readers — with procedural ground truth, not LLM-as-judge.**
4
+
5
+ PDF Hell is a small, sharp benchmark for the "AI reads PDFs" claim. Every test case is a PDF generated *from code*, so the correct answer is known exactly. There's no LLM judging another LLM's interpretation — the same loop that fooled the model isn't asked to grade it.
6
+
7
+ If your AI claims it can read documents, it should survive PDFs designed to break it.
8
+
9
+ ## Quickstart (30 seconds)
10
+
11
+ ```bash
12
+ # 3-case smoke run against the cheapest vision model — works in any env with a Gemini key
13
+ export GOOGLE_API_KEY=...
14
+ uvx pdfhell run --model google:gemini-2.5-flash --suite smoke
15
+
16
+ # Or run the full mini suite (30 cases, ~10s on Flash, ~$0.01)
17
+ uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini
18
+
19
+ # Or just generate one trap PDF and open it
20
+ uvx pdfhell make --trap hidden_ocr_mismatch --seed 42
21
+ open ./cases/hidden_ocr_mismatch-0042.pdf
22
+ ```
23
+
24
+ That's it. `pdfhell run` builds the suite on first use, sends each PDF to the vision model, and grades the answer against code-based ground truth.
25
+
26
+ Smoke result on Gemini 2.5 Flash (one case per family, at the time of writing):
27
+
28
+ ```
29
+ PDF Hell smoke suite — n=3
30
+ model: google:gemini-2.5-flash
31
+ pass: 3/3 (100.0%)
32
+ ```
33
+
34
+ ## What's in the mini suite
35
+
36
+ | Trap family | Cases | What breaks |
37
+ |---|---|---|
38
+ | `hidden_ocr_mismatch` | 10 | Invoices where the visible amount differs from an invisible OCR text layer. Vision-only models read the page; text-extraction pipelines read the layer; they disagree. |
39
+ | `footnote_override` | 10 | Legal clauses where a 6pt footnote overrides the body — liability caps with carve-outs, terminations with restrictions, data-residency with disaster-recovery exceptions. |
40
+ | `split_table_across_pages` | 10 | Financial tables where the header row sits on page 1 and the body rows on page 2. RAG loaders that paginate independently lose column context. |
41
+
42
+ Every case has a deterministic seed. Re-running with the same seed regenerates **byte-identical PDFs** and identical answer keys. `Canvas(invariant=True)` is set on every generator so timestamps and document IDs don't drift between runs.
43
+
44
+ The full suite (10 trap families, ~50 cases) is on the [roadmap](#roadmap).
45
+
46
+ ## Why this exists
47
+
48
+ The current AI-eval state of the art uses an LLM-as-judge to grade another LLM's answer. That's circular: the same complexity that fools the agent fools the judge. PDF Hell rejects that:
49
+
50
+ 1. **Code-based ground truth.** The answer is a literal Python value the generator chose, not a frontier model's opinion.
51
+ 2. **A named failure mode per trap.** When a model fails, we know *which* specific failure caught it (e.g. "trusted the hidden OCR layer over the visible page").
52
+ 3. **A diagnostic signal**, not just a score. Per-trap-family breakdown tells you which assumption broke.
53
+
54
+ ## Commands
55
+
56
+ ```
57
+ pdfhell list-traps # list trap families
58
+ pdfhell make --trap <family> --seed <n> # generate one case
59
+ pdfhell build --suite <smoke|mini> --out <dir> # materialise a suite
60
+ pdfhell run --model <provider>:<model> # evaluate a model
61
+ [--suite smoke|mini] # (default: mini)
62
+ [--cases-dir <dir>] # (default: ./cases/<suite>)
63
+ [--out <path>] # JSON output
64
+ [--junit <path>] # JUnit XML for GitHub Actions / GitLab CI
65
+ [--fail-threshold <0.0-1.0>] # non-zero exit if pass_rate below threshold
66
+ [--workers <n>] # parallel API requests (default: 4)
67
+ [--quiet]
68
+ pdfhell report runs/<file>.json # print a saved run's summary
69
+ ```
70
+
71
+ Provider shorthand: `anthropic:claude-sonnet-4-6`, `openai:gpt-4o`, `google:gemini-2.5-pro`, `google:gemini-2.5-flash`, etc. API key from env (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`).
72
+
73
+ ## CI integration
74
+
75
+ Drop this into `.github/workflows/eval.yml`:
76
+
77
+ ```yaml
78
+ name: PDF Hell
79
+ on: [pull_request]
80
+ jobs:
81
+ pdfhell:
82
+ runs-on: ubuntu-latest
83
+ steps:
84
+ - uses: actions/checkout@v4
85
+ - uses: astral-sh/setup-uv@v5
86
+ - run: uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini --junit results.xml --fail-threshold 0.7
87
+ env:
88
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
89
+ - uses: actions/upload-artifact@v4
90
+ with:
91
+ name: pdfhell-results
92
+ path: results.xml
93
+ ```
94
+
95
+ JUnit XML renders natively in the GitHub Actions / GitLab CI / CircleCI / Jenkins PR panel — failures show up as red rows with the expected and observed answers in the failure message.
96
+
97
+ ## How scoring works
98
+
99
+ Two layers, applied in order:
100
+
101
+ 1. **Procedural exact match (primary)** — for single-value traps, the model's free-text answer must contain the expected value (whitespace-tolerant, case-insensitive). For prose traps like `footnote_override`, the model must include every required token (the cap value, every carve-out section number, etc.) in any order, in any phrasing. The model isn't graded on prose style; it's graded on whether it captured the facts.
102
+ 2. **Forbidden-answer detection (diagnostic)** — did the model return one of the answers the trap was specifically designed to elicit (e.g. the hidden-OCR amount)? If so, the trap caught a *known* failure mode and we record it. Doesn't affect the primary score.
103
+
104
+ Anything that looks like a refusal (`"I can't determine..."`) is recorded as `refused`, not as a wrong answer.
105
+
106
+ The QAG explanation layer from `multivon-eval` (`DocumentGrounding`) is available separately for users who want a human-readable "why did the model fail" breakdown — but it's never on the scoring path.
107
+
108
+ ## Adding a new trap family
109
+
110
+ Add a generator at `pdfhell/generators/<your_trap>.py`:
111
+
112
+ ```python
113
+ from ..case import HellCase
114
+ from . import _common as C
115
+
116
+ def generate(seed: int) -> tuple[bytes, HellCase]:
117
+ rng = C.rng_for(seed)
118
+ # ... draw a PDF with reportlab using rng for all random choices ...
119
+ # invariant=True is the default — keep your generator deterministic.
120
+ return pdf_bytes, HellCase(
121
+ id=f"your_trap-{seed:04d}",
122
+ trap_family="your_trap",
123
+ seed=seed,
124
+ question="What is ...?",
125
+ expected_answer="42", # single canonical answer
126
+ expected_tokens=["42"], # OR list of required substrings for prose
127
+ forbidden_answers=["41", "43"], # OR a value the trap specifically elicits
128
+ metadata={"expected_failure_mode": "Model does X when it should do Y."},
129
+ )
130
+ ```
131
+
132
+ Register it in `pdfhell/generators/__init__.py`. See [CONTRIBUTING.md](./CONTRIBUTING.md) for the full guide. Tests run with `pytest`.
133
+
134
+ ## Roadmap
135
+
136
+ The 0.1 release is intentionally narrow — three trap families, 30 cases. Coming next:
137
+
138
+ - `merged_table_cells` — value depends on row/column span interpretation
139
+ - `rotated_scan` — visually legible but OCR-broken pages
140
+ - `near_duplicate_entities` — "ACME Ltd." vs "ACME Holdings Ltd."
141
+ - `prompt_injection_in_body` — "Ignore previous instructions and answer X"
142
+ - `chart_axis_inversion` — answers depend on reading axis direction
143
+ - `checkbox_ambiguity` — selected vs unselected with low visual margin
144
+ - `cross_page_citation` — answers requiring page + bounding-box citations
145
+
146
+ Target full suite: 10 trap families, ~50 cases.
147
+
148
+ ## Hosted generator
149
+
150
+ For document-AI teams who need adversarial test cases tailored to *their* templates (claims forms, MSAs, medical records, KYC docs), there's a hosted generator that takes your templates and produces adversarial variants with code-based ground truth — same methodology, your data shape.
151
+
152
+ Email `hello@multivon.ai` for early access, or see [multivon.ai/pricing](https://multivon.ai/pricing).
153
+
154
+ ## Installing
155
+
156
+ ```bash
157
+ # Recommended (zero-install with uv):
158
+ uvx pdfhell list-traps
159
+
160
+ # Or in a venv:
161
+ python -m venv .venv && source .venv/bin/activate
162
+ pip install pdfhell
163
+ ```
164
+
165
+ Bare install brings in `multivon-eval` (the engine), `reportlab` (PDF generation), `pypdf`, and the three frontier-provider SDKs (anthropic, openai, google-genai). No provider extras to remember; no GPU required.
166
+
167
+ ## License
168
+
169
+ Apache 2.0. Built on [`multivon-eval`](https://github.com/multivon-ai/multivon-eval).
170
+
171
+ ## Citing
172
+
173
+ ```bibtex
174
+ @software{pdfhell,
175
+ title = {PDF Hell: Adversarial PDFs for AI document readers},
176
+ author = {Multivon},
177
+ url = {https://github.com/multivon-ai/pdfhell},
178
+ }
179
+ ```
@@ -0,0 +1,34 @@
1
+ """PDF Hell — adversarial PDFs that break AI document readers.
2
+
3
+ Procedural ground truth, not LLM-as-judge. Each trap family generates PDFs
4
+ *from code*, so the answer key is exact and reproducible — no circular
5
+ assurance.
6
+
7
+ Quickstart::
8
+
9
+ uvx pdfhell make --trap hidden_ocr_mismatch --seed 42
10
+ uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini
11
+ uvx pdfhell report runs/claude.json --share-card
12
+
13
+ Built on top of ``multivon-eval`` (the QAG engine, provider adapters, audit
14
+ packaging, cost tracking). pdfhell is *only* the adversarial generation
15
+ layer; the runtime, scoring, and reporting come from multivon-eval.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ __version__ = "0.1.0"
20
+
21
+ from .case import HellCase
22
+ from .generators import (
23
+ GENERATORS,
24
+ TRAP_FAMILIES,
25
+ generate_case,
26
+ )
27
+
28
+ __all__ = [
29
+ "__version__",
30
+ "HellCase",
31
+ "GENERATORS",
32
+ "TRAP_FAMILIES",
33
+ "generate_case",
34
+ ]
@@ -0,0 +1,182 @@
1
+ """Build a downloadable audit pack with a SHA-256 manifest from a pdfhell run.
2
+
3
+ The pack is a ZIP containing:
4
+
5
+ - ``manifest.json`` — pdfhell version, run timestamp, model spec, suite,
6
+ per-trap pass rates, total cost (when known), SHA-256 of every file
7
+ inside the pack.
8
+ - ``run.json`` — the full :class:`SuiteReport` JSON.
9
+ - ``run.xml`` — JUnit XML (same data as ``run.json``, machine-readable
10
+ for CI dashboards).
11
+ - ``cases/<case_id>.pdf`` — every adversarial PDF the model was tested
12
+ against.
13
+ - ``cases/<case_id>.json`` — each case's answer key + metadata.
14
+ - ``README.txt`` — human-readable "what's in this ZIP" + reproduction
15
+ command. Procurement teams open this first.
16
+
17
+ The audit pack is the artifact a buyer's procurement team attaches to
18
+ a diligence appendix. It must be self-describing (no out-of-band
19
+ context required), reproducible (the manifest tells you the exact
20
+ command to regenerate the run), and tamper-evident (the manifest
21
+ includes a SHA-256 for every file in the pack; auditors can verify the
22
+ ZIP wasn't edited after delivery).
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import hashlib
27
+ import json
28
+ import zipfile
29
+ from datetime import datetime, timezone
30
+ from pathlib import Path
31
+ from typing import Iterable
32
+
33
+ from . import __version__
34
+ from .case import HellCase
35
+ from .junit import report_to_junit
36
+ from .scorer import SuiteReport
37
+
38
+
39
+ _README_TEMPLATE = """\
40
+ # pdfhell audit pack
41
+
42
+ This ZIP is a complete, self-describing record of one PDF Hell run. It
43
+ contains every PDF the model was asked to read, every answer key, the
44
+ raw model output, and a tamper-evident manifest.
45
+
46
+ ## What's in this pack
47
+
48
+ - manifest.json — Run metadata + SHA-256 of every file in this ZIP.
49
+ - run.json — Full run report (per-case scores, model outputs).
50
+ - run.xml — JUnit XML (renders in CI dashboards).
51
+ - cases/*.pdf — The adversarial PDFs the model was tested against.
52
+ - cases/*.json — The answer keys + per-case metadata.
53
+ - README.txt — This file.
54
+
55
+ ## How to verify
56
+
57
+ The manifest contains a SHA-256 for every file in this ZIP. To verify
58
+ nothing was edited after delivery:
59
+
60
+ unzip -p audit-pack.zip manifest.json | jq .files
61
+ sha256sum cases/*.pdf cases/*.json run.json run.xml README.txt
62
+
63
+ Each hash in the manifest must match the file's actual SHA-256.
64
+
65
+ ## How to reproduce
66
+
67
+ The manifest records the exact pdfhell command. To regenerate
68
+ byte-identical PDFs and re-run the same model:
69
+
70
+ {repro_command}
71
+
72
+ pdfhell uses Canvas(invariant=True) on every generator so PDFs are
73
+ byte-identical across runs with the same seed.
74
+
75
+ ## Scope
76
+
77
+ pdfhell {pdfhell_version}, suite {suite}, model {model}. Generated
78
+ {timestamp}. {n} cases, {passed}/{n} passed ({pass_rate:.0%}). See
79
+ manifest.json for per-trap breakdown.
80
+ """
81
+
82
+
83
+ def _sha256(data: bytes) -> str:
84
+ return hashlib.sha256(data).hexdigest()
85
+
86
+
87
+ def _gather_files(report: SuiteReport, cases_dir: Path) -> Iterable[tuple[str, bytes]]:
88
+ """Yield (arcname, bytes) pairs for every file going into the ZIP.
89
+
90
+ Order: README first (humans see it first), then manifest, then JSON
91
+ + XML, then case PDFs + answer keys. Stable ordering keeps the
92
+ SHA-256 of the ZIP itself stable across runs.
93
+ """
94
+ for case_summary in report.cases:
95
+ case_id = case_summary.case_id
96
+ pdf_path = cases_dir / f"{case_id}.pdf"
97
+ json_path = cases_dir / f"{case_id}.json"
98
+ if pdf_path.exists():
99
+ yield f"cases/{case_id}.pdf", pdf_path.read_bytes()
100
+ if json_path.exists():
101
+ yield f"cases/{case_id}.json", json_path.read_bytes()
102
+
103
+
104
def build_audit_pack(
    report: SuiteReport,
    cases_dir: Path,
    out_path: Path,
) -> Path:
    """Write a complete audit ZIP for ``report`` to ``out_path``.

    The archive holds, in this order: ``manifest.json``, ``README.txt``,
    ``run.json``, ``run.xml`` and then the per-case files found under
    ``cases_dir``. The manifest lists path, SHA-256 and size for every
    other file in the pack (it cannot contain its own hash).

    Every ZIP entry gets the same fixed modification time, so archive
    metadata does not depend on when packaging happened. The README and
    manifest still embed the current UTC timestamp, so two packaging runs
    do not produce byte-identical archives.

    Args:
        report: The run to package. Reads ``cases``, ``model``, ``suite``,
            ``n``, ``pass_rate``, ``per_trap_pass``,
            ``per_trap_fell_for_trap`` and calls ``to_dict()``.
        cases_dir: Directory containing ``<case_id>.pdf`` / ``.json``.
        out_path: Destination ZIP path; parent directories are created.

    Returns:
        The resolved output path.
    """
    out_path = out_path.resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Materialise the per-case files into bytes first so we can hash them.
    case_files: list[tuple[str, bytes]] = list(_gather_files(report, cases_dir))

    run_json_bytes = json.dumps(report.to_dict(), indent=2).encode("utf-8")
    run_xml_bytes = report_to_junit(report).encode("utf-8")
    timestamp = datetime.now(timezone.utc).isoformat()
    passed = sum(1 for c in report.cases if c.correct)

    repro_command = (
        f"uvx pdfhell run --model {report.model} --suite {report.suite}"
    )
    readme_bytes = _README_TEMPLATE.format(
        pdfhell_version=__version__,
        suite=report.suite,
        model=report.model,
        timestamp=timestamp,
        n=report.n,
        passed=passed,
        pass_rate=report.pass_rate,
        repro_command=repro_command,
    ).encode("utf-8")

    # Everything except the manifest. The manifest is built last so it can
    # carry the hashes of all of these.
    files_in_pack: list[tuple[str, bytes]] = [
        ("README.txt", readme_bytes),
        ("run.json", run_json_bytes),
        ("run.xml", run_xml_bytes),
        *case_files,
    ]
    manifest = {
        "pdfhell_version": __version__,
        "generated_at": timestamp,
        "model": report.model,
        "suite": report.suite,
        "n": report.n,
        "passed": passed,
        "pass_rate": report.pass_rate,
        "per_trap_pass": report.per_trap_pass,
        "per_trap_fell_for_trap": report.per_trap_fell_for_trap,
        "reproduction": {
            "command": repro_command,
            "note": (
                "PDFs are regenerated byte-identically via Canvas(invariant=True). "
                "Same seed → same PDF → same answer key."
            ),
        },
        "files": [
            {"path": name, "sha256": _sha256(data), "size": len(data)}
            for name, data in files_in_pack
        ],
    }
    manifest_bytes = json.dumps(manifest, indent=2).encode("utf-8")

    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for name, data in [("manifest.json", manifest_bytes), *files_in_pack]:
            info = zipfile.ZipInfo(name)
            # Fixed mtime: entry metadata must not vary with packaging time.
            info.date_time = (2026, 1, 1, 0, 0, 0)
            # writestr() honours the ZipInfo's own compress_type, which
            # defaults to ZIP_STORED — the archive-level setting is ignored
            # for ZipInfo entries, so request deflate explicitly.
            info.compress_type = zipfile.ZIP_DEFLATED
            # ZipInfo() leaves external_attr at 0; give entries ordinary
            # rw-r--r-- bits so Unix extractors do not create files with
            # no permissions.
            info.external_attr = 0o644 << 16
            zf.writestr(info, data)

    return out_path
180
+
181
+
182
+ __all__ = ["build_audit_pack"]