pdfhell 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfhell/__init__.py +34 -0
- pdfhell/auditpack.py +182 -0
- pdfhell/case.py +87 -0
- pdfhell/cli.py +216 -0
- pdfhell/generators/__init__.py +49 -0
- pdfhell/generators/_common.py +183 -0
- pdfhell/generators/footnote_override.py +212 -0
- pdfhell/generators/hidden_ocr_mismatch.py +129 -0
- pdfhell/generators/split_table_across_pages.py +174 -0
- pdfhell/junit.py +94 -0
- pdfhell/runner.py +142 -0
- pdfhell/scorer.py +214 -0
- pdfhell/suite.py +104 -0
- pdfhell/vision.py +231 -0
- pdfhell-0.1.0.dist-info/METADATA +208 -0
- pdfhell-0.1.0.dist-info/RECORD +20 -0
- pdfhell-0.1.0.dist-info/WHEEL +5 -0
- pdfhell-0.1.0.dist-info/entry_points.txt +2 -0
- pdfhell-0.1.0.dist-info/licenses/LICENSE +17 -0
- pdfhell-0.1.0.dist-info/top_level.txt +1 -0
pdfhell/vision.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""Vision-call dispatch for the three supported providers.
|
|
2
|
+
|
|
3
|
+
Each provider's vision API has a slightly different content-block
|
|
4
|
+
shape (Anthropic wants ``image`` + ``document`` blocks; OpenAI wants
|
|
5
|
+
``image_url`` + ``file``; Google wants inline ``Part.from_bytes``). We
|
|
6
|
+
dispatch on :attr:`JudgeConfig.provider` and let the per-provider
|
|
7
|
+
helpers do the format conversion. Errors surface as
|
|
8
|
+
:class:`JudgeUnavailable` so the runner can treat upstream failures as
|
|
9
|
+
refusals and still produce a complete report.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import base64
|
|
14
|
+
import mimetypes
|
|
15
|
+
import pathlib
|
|
16
|
+
import re
|
|
17
|
+
from typing import Iterable
|
|
18
|
+
|
|
19
|
+
from multivon_eval import JudgeConfig
|
|
20
|
+
from multivon_eval.exceptions import JudgeUnavailable
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Models we know support vision input. Conservative: when in doubt we
|
|
24
|
+
# don't gate (rely on the provider API to surface a real error).
|
|
25
|
+
# Prefix-match — "gpt-5" catches gpt-5, gpt-5-mini, gpt-5.1, gpt-5.4, etc.
|
|
26
|
+
_VISION_CAPABLE = {
|
|
27
|
+
"anthropic": {
|
|
28
|
+
"claude-haiku-4-5", "claude-sonnet-4-6", "claude-opus-4-7",
|
|
29
|
+
"claude-3-5-sonnet", "claude-3-5-haiku", "claude-3-opus",
|
|
30
|
+
},
|
|
31
|
+
"openai": {
|
|
32
|
+
"gpt-4o", "gpt-4.1", "gpt-5",
|
|
33
|
+
},
|
|
34
|
+
"google": {
|
|
35
|
+
"gemini-1.5", "gemini-2.5", "gemini-3", "gemini-3.1",
|
|
36
|
+
},
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _is_vision_capable(judge: JudgeConfig) -> bool:
    """Report whether ``judge`` may be sent image/PDF input.

    A judge with no model name, or whose provider has no entry in
    ``_VISION_CAPABLE``, is never gated. For a listed provider, the
    lower-cased model name must start with one of that provider's known
    prefixes.
    """
    name = (judge.model or "").lower()
    if not name:
        return True
    prefixes = _VISION_CAPABLE.get(judge.provider)
    if prefixes is None:
        # Unlisted provider: nothing to check against, so don't gate.
        return True
    return any(name.startswith(prefix.lower()) for prefix in prefixes)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _image_to_data_uri(src: str) -> tuple[str, str, str]:
|
|
51
|
+
"""Return ``(uri_or_url, mime_type, base64_data)`` for an image source.
|
|
52
|
+
|
|
53
|
+
``src`` may be ``http(s)://``, ``data:``, or a local filesystem path.
|
|
54
|
+
Local paths are read and inlined as base64.
|
|
55
|
+
"""
|
|
56
|
+
if src.startswith("data:"):
|
|
57
|
+
match = re.match(r"data:([^;]+);base64,(.+)$", src)
|
|
58
|
+
if not match:
|
|
59
|
+
raise ValueError(f"unrecognised data URI: {src[:60]}")
|
|
60
|
+
return src, match.group(1), match.group(2)
|
|
61
|
+
if src.startswith("http://") or src.startswith("https://"):
|
|
62
|
+
mime = mimetypes.guess_type(src)[0] or "image/jpeg"
|
|
63
|
+
return src, mime, ""
|
|
64
|
+
path = pathlib.Path(src).expanduser().resolve()
|
|
65
|
+
if not path.is_file():
|
|
66
|
+
raise FileNotFoundError(f"image not found: {src}")
|
|
67
|
+
# PDFs are valid input to most vision APIs (they accept PDF mime
|
|
68
|
+
# types via the same image content blocks). We honour the actual
|
|
69
|
+
# extension rather than forcing image/jpeg.
|
|
70
|
+
suffix = path.suffix.lower()
|
|
71
|
+
if suffix == ".pdf":
|
|
72
|
+
mime = "application/pdf"
|
|
73
|
+
else:
|
|
74
|
+
mime = mimetypes.guess_type(path.name)[0] or "image/jpeg"
|
|
75
|
+
data = base64.b64encode(path.read_bytes()).decode("ascii")
|
|
76
|
+
return f"data:{mime};base64,{data}", mime, data
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def call_vision(
    prompt: str,
    sources: list[str],
    judge: JudgeConfig,
    max_tokens: int = 2048,
) -> str:
    """Send a text prompt plus one or more image sources to the judge.

    ``sources`` are paths, URLs, or data URIs. The request is routed to
    the helper for ``judge.provider`` and the raw text answer is returned.

    Raises :class:`JudgeUnavailable` when the model fails the
    vision-capability check or the provider is not one of ``anthropic``,
    ``openai``, ``google``. The per-provider helpers raise it as well when
    their SDK cannot be imported.
    """
    if not _is_vision_capable(judge):
        raise JudgeUnavailable(
            f"vision-capable judge required; {judge.provider}/{judge.model} "
            "is text-only. Try google:gemini-2.5-flash (cheap), "
            "anthropic:claude-haiku-4-5, or openai:gpt-4o-mini."
        )
    provider = judge.provider
    handlers = {
        "anthropic": _anthropic_call,
        "openai": _openai_call,
        "google": _google_call,
    }
    handler = handlers.get(provider)
    if handler is None:
        raise JudgeUnavailable(
            f"provider {provider!r} is not wired for vision; use "
            "anthropic, openai, or google."
        )
    return handler(prompt, sources, judge, max_tokens)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _anthropic_call(
    prompt: str, sources: list[str], judge: JudgeConfig, max_tokens: int
) -> str:
    """Run one vision request through the Anthropic Messages API.

    Each source becomes a content block: PDFs use ``document`` blocks,
    everything else uses ``image`` blocks. Sources that resolve to base64
    data (local files, data URIs) are inlined; remote URLs are passed as
    ``url`` sources. The prompt is appended last as a ``text`` block.

    Returns the concatenated text of every ``text`` block in the reply.

    Raises:
        JudgeUnavailable: the ``anthropic`` SDK cannot be imported.
    """
    try:
        import anthropic  # type: ignore[import-not-found]
    except ImportError as exc:
        # The package metadata declares no per-provider extras, so point
        # at the plain SDK install only.
        raise JudgeUnavailable(
            "anthropic SDK not installed. Install with `pip install anthropic`."
        ) from exc
    content: list[dict] = []
    for src in sources:
        _, mime, b64 = _image_to_data_uri(src)
        block_type = "document" if mime == "application/pdf" else "image"
        if b64:
            source = {"type": "base64", "media_type": mime, "data": b64}
        else:
            # Remote URL: there is no inline payload, so reference the URL
            # instead of sending a base64 block with empty data (which is
            # what a remote PDF previously produced).
            source = {"type": "url", "url": src}
        content.append({"type": block_type, "source": source})
    content.append({"type": "text", "text": prompt})
    client = anthropic.Anthropic()
    msg = client.messages.create(
        model=judge.model,
        max_tokens=max_tokens,
        temperature=judge.temperature,
        messages=[{"role": "user", "content": content}],
    )
    return "".join(b.text for b in msg.content if getattr(b, "type", "") == "text")
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _openai_call(
    prompt: str, sources: list[str], judge: JudgeConfig, max_tokens: int
) -> str:
    """Run one vision request through the OpenAI Chat Completions API.

    The prompt is sent as a ``text`` part, followed by one part per
    source: PDFs as ``file`` parts carrying an inline base64 data URI,
    everything else as ``image_url`` parts.

    Returns the first choice's message content, or ``""`` when it is empty.

    Raises:
        JudgeUnavailable: the ``openai`` SDK cannot be imported, or a PDF
            source is a remote URL (no inline data to put in the file part).
    """
    try:
        import openai  # type: ignore[import-not-found]
    except ImportError as exc:
        # The package metadata declares no per-provider extras, so point
        # at the plain SDK install only.
        raise JudgeUnavailable(
            "openai SDK not installed. Install with `pip install openai`."
        ) from exc
    parts: list[dict] = [{"type": "text", "text": prompt}]
    for src in sources:
        data_uri, mime, b64 = _image_to_data_uri(src)
        if mime != "application/pdf":
            parts.append({"type": "image_url", "image_url": {"url": data_uri}})
            continue
        if not b64:
            # A remote PDF has no base64 payload; the file part expects
            # inline data, so refuse rather than send a bare URL as file_data.
            raise JudgeUnavailable(
                "openai requires local files or data URIs for PDF input; "
                f"got remote URL: {src}"
            )
        if src[:5].lower() == "data:":
            # Path(src).name on a data URI would slice the base64 payload
            # at its last "/"; use a neutral filename instead.
            filename = "document.pdf"
        else:
            filename = pathlib.Path(src).name
        parts.append(
            {"type": "file", "file": {"filename": filename, "file_data": data_uri}}
        )
    client = openai.OpenAI(
        base_url=judge.base_url if judge.base_url else None,
    )

    # GPT-5.x and the o-series reasoning models deprecated `max_tokens`
    # in favour of `max_completion_tokens`, and they reject the legacy
    # name with a 400. They also reserve some of the output budget for
    # internal "thinking" tokens, so we double the cap for reasoning
    # models to leave room for both reasoning and answer.
    model = (judge.model or "").lower()
    is_reasoning_model = model.startswith(("gpt-5", "o1", "o3", "o4"))
    kwargs: dict = {
        "model": judge.model,
        "messages": [{"role": "user", "content": parts}],
    }
    if is_reasoning_model:
        kwargs["max_completion_tokens"] = max_tokens * 2
        # Reasoning models reject temperature != 1 with a 400. Omit the
        # param entirely; the model picks its own default.
    else:
        kwargs["max_tokens"] = max_tokens
        kwargs["temperature"] = judge.temperature

    resp = client.chat.completions.create(**kwargs)
    return resp.choices[0].message.content or ""
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _google_call(
    prompt: str, sources: list[str], judge: JudgeConfig, max_tokens: int
) -> str:
    """Run one vision request through the google-genai SDK.

    Every source must resolve to inline data (a local file or a data
    URI); its bytes are wrapped with ``Part.from_bytes``. The prompt is
    appended after the parts.

    Returns ``resp.text``, or ``""`` when it is empty.

    Raises:
        JudgeUnavailable: the ``google-genai`` SDK cannot be imported, or
            a source is a remote URL.
    """
    try:
        from google import genai  # type: ignore[import-not-found]
        from google.genai import types as genai_types  # type: ignore[import-not-found]
    except ImportError as exc:
        # The package metadata declares no per-provider extras, so point
        # at the plain SDK install only.
        raise JudgeUnavailable(
            "google-genai SDK not installed. Install with `pip install google-genai`."
        ) from exc
    contents: list = []
    for src in sources:
        _, mime, b64 = _image_to_data_uri(src)
        if not b64:
            # An empty payload is how _image_to_data_uri marks a remote URL.
            raise JudgeUnavailable(
                "google-genai requires local files or data URIs for image input; "
                f"got remote URL: {src}"
            )
        contents.append(
            genai_types.Part.from_bytes(data=base64.b64decode(b64), mime_type=mime)
        )
    contents.append(prompt)
    client = genai.Client()
    resp = client.models.generate_content(
        model=judge.model,
        contents=contents,
        config=genai_types.GenerateContentConfig(
            temperature=judge.temperature,
            max_output_tokens=max_tokens,
        ),
    )
    return resp.text or ""
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# Public names of this module. ``JudgeUnavailable`` is imported from
# ``multivon_eval.exceptions`` and re-exported here next to ``call_vision``.
__all__ = ["call_vision", "JudgeUnavailable"]
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdfhell
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: PDF Hell — adversarial PDFs that break AI document readers. Procedural ground truth, not LLM-as-judge.
|
|
5
|
+
Author: Multivon
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://pdfhell.multivon.ai
|
|
8
|
+
Project-URL: Repository, https://github.com/multivon-ai/pdfhell
|
|
9
|
+
Project-URL: Issues, https://github.com/multivon-ai/pdfhell/issues
|
|
10
|
+
Project-URL: Leaderboard, https://pdfhell.multivon.ai/leaderboard
|
|
11
|
+
Keywords: llm,evaluation,pdf,multimodal,benchmark,adversarial,document-ai,rag
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Testing
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: multivon-eval>=0.7.2
|
|
24
|
+
Requires-Dist: google-genai>=1.0
|
|
25
|
+
Requires-Dist: reportlab>=4.0
|
|
26
|
+
Requires-Dist: pypdf>=5.0
|
|
27
|
+
Provides-Extra: all
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# PDF Hell
|
|
31
|
+
|
|
32
|
+
**Adversarial PDFs that break AI document readers — with procedural ground truth, not LLM-as-judge.**
|
|
33
|
+
|
|
34
|
+
PDF Hell is a small, sharp benchmark for the "AI reads PDFs" claim. Every test case is a PDF generated *from code*, so the correct answer is known exactly. There's no LLM judging another LLM's interpretation — the same loop that fooled the model isn't asked to grade it.
|
|
35
|
+
|
|
36
|
+
If your AI claims it can read documents, it should survive PDFs designed to break it.
|
|
37
|
+
|
|
38
|
+
## Quickstart (30 seconds)
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# 3-case smoke run against the cheapest vision model — works in any env with a Gemini key
|
|
42
|
+
export GOOGLE_API_KEY=...
|
|
43
|
+
uvx pdfhell run --model google:gemini-2.5-flash --suite smoke
|
|
44
|
+
|
|
45
|
+
# Or run the full mini suite against any provider (30 cases; ~10s and ~$0.01 on Gemini 2.5 Flash)
|
|
46
|
+
uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini
|
|
47
|
+
|
|
48
|
+
# Or just generate one trap PDF and open it
|
|
49
|
+
uvx pdfhell make --trap hidden_ocr_mismatch --seed 42
|
|
50
|
+
open ./cases/hidden_ocr_mismatch-0042.pdf
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
That's it. `pdfhell run` builds the suite on first use, sends each PDF to the vision model, and grades the answer against code-based ground truth.
|
|
54
|
+
|
|
55
|
+
Sample smoke result on Gemini 2.5 Flash (one case per family):
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
PDF Hell smoke suite — n=3
|
|
59
|
+
model: google:gemini-2.5-flash
|
|
60
|
+
pass: 3/3 (100.0%)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## What's in the mini suite
|
|
64
|
+
|
|
65
|
+
| Trap family | Cases | What breaks |
|
|
66
|
+
|---|---|---|
|
|
67
|
+
| `hidden_ocr_mismatch` | 10 | Invoices where the visible amount differs from an invisible OCR text layer. Vision-only models read the page; text-extraction pipelines read the layer; they disagree. |
|
|
68
|
+
| `footnote_override` | 10 | Legal clauses where a 6pt footnote overrides the body — liability caps with carve-outs, terminations with restrictions, data-residency with disaster-recovery exceptions. |
|
|
69
|
+
| `split_table_across_pages` | 10 | Financial tables where the header row sits on page 1 and the body rows on page 2. RAG loaders that paginate independently lose column context. |
|
|
70
|
+
|
|
71
|
+
Every case has a deterministic seed. Re-running with the same seed regenerates **byte-identical PDFs** and identical answer keys. `Canvas(invariant=True)` is set on every generator so timestamps and document IDs don't drift between runs.
|
|
72
|
+
|
|
73
|
+
The full suite (10 trap families, ~50 cases) is on the [roadmap](#roadmap).
|
|
74
|
+
|
|
75
|
+
## Why this exists
|
|
76
|
+
|
|
77
|
+
The current AI-eval state of the art uses an LLM-as-judge to grade another LLM's answer. That's circular: the same complexity that fools the agent fools the judge. PDF Hell rejects that:
|
|
78
|
+
|
|
79
|
+
1. **Code-based ground truth.** The answer is a literal Python value the generator chose, not a frontier model's opinion.
|
|
80
|
+
2. **A named failure mode per trap.** When a model fails, we know *which* specific failure caught it (e.g. "trusted the hidden OCR layer over the visible page").
|
|
81
|
+
3. **A diagnostic signal**, not just a score. Per-trap-family breakdown tells you which assumption broke.
|
|
82
|
+
|
|
83
|
+
## Commands
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
pdfhell list-traps # list trap families
|
|
87
|
+
pdfhell make --trap <family> --seed <n> # generate one case
|
|
88
|
+
pdfhell build --suite <smoke|mini> --out <dir> # materialise a suite
|
|
89
|
+
pdfhell run --model <provider>:<model> # evaluate a model
|
|
90
|
+
[--suite smoke|mini] # (default: mini)
|
|
91
|
+
[--cases-dir <dir>] # (default: ./cases/<suite>)
|
|
92
|
+
[--out <path>] # JSON output
|
|
93
|
+
[--junit <path>] # JUnit XML for GitHub Actions / GitLab CI
|
|
94
|
+
[--fail-threshold <0.0-1.0>] # non-zero exit if pass_rate below threshold
|
|
95
|
+
[--workers <n>] # parallel API requests (default: 4)
|
|
96
|
+
[--quiet]
|
|
97
|
+
pdfhell report runs/<file>.json # print a saved run's summary
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Provider shorthand: `anthropic:claude-sonnet-4-6`, `openai:gpt-4o`, `google:gemini-2.5-pro`, `google:gemini-2.5-flash`, etc. API key from env (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`).
|
|
101
|
+
|
|
102
|
+
## CI integration
|
|
103
|
+
|
|
104
|
+
Drop this into `.github/workflows/eval.yml`:
|
|
105
|
+
|
|
106
|
+
```yaml
|
|
107
|
+
name: PDF Hell
|
|
108
|
+
on: [pull_request]
|
|
109
|
+
jobs:
|
|
110
|
+
pdfhell:
|
|
111
|
+
runs-on: ubuntu-latest
|
|
112
|
+
steps:
|
|
113
|
+
- uses: actions/checkout@v4
|
|
114
|
+
- uses: astral-sh/setup-uv@v5
|
|
115
|
+
- run: uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini --junit results.xml --fail-threshold 0.7
|
|
116
|
+
env:
|
|
117
|
+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
|
118
|
+
- uses: actions/upload-artifact@v4
|
|
119
|
+
with:
|
|
120
|
+
name: pdfhell-results
|
|
121
|
+
path: results.xml
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
JUnit XML renders natively in the GitHub Actions / GitLab CI / CircleCI / Jenkins PR panel — failures show up as red rows with the expected and observed answers in the failure message.
|
|
125
|
+
|
|
126
|
+
## How scoring works
|
|
127
|
+
|
|
128
|
+
Two layers, applied in order:
|
|
129
|
+
|
|
130
|
+
1. **Procedural exact match (primary)** — for single-value traps, the model's free-text answer must contain the expected value (whitespace-tolerant, case-insensitive). For prose traps like `footnote_override`, the model must include every required token (the cap value, every carve-out section number, etc.) in any order, in any phrasing. The model isn't graded on prose style; it's graded on whether it captured the facts.
|
|
131
|
+
2. **Forbidden-answer detection (diagnostic)** — did the model return one of the answers the trap was specifically designed to elicit (e.g. the hidden-OCR amount)? If so, the trap caught a *known* failure mode and we record it. Doesn't affect the primary score.
|
|
132
|
+
|
|
133
|
+
Anything that looks like a refusal (`"I can't determine..."`) is recorded as `refused`, not as a wrong answer.
|
|
134
|
+
|
|
135
|
+
The QAG explanation layer from `multivon-eval` (`DocumentGrounding`) is available separately for users who want a human-readable "why did the model fail" breakdown — but it's never on the scoring path.
|
|
136
|
+
|
|
137
|
+
## Adding a new trap family
|
|
138
|
+
|
|
139
|
+
Add a generator at `pdfhell/generators/<your_trap>.py`:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from ..case import HellCase
|
|
143
|
+
from . import _common as C
|
|
144
|
+
|
|
145
|
+
def generate(seed: int) -> tuple[bytes, HellCase]:
|
|
146
|
+
rng = C.rng_for(seed)
|
|
147
|
+
# ... draw a PDF with reportlab using rng for all random choices ...
|
|
148
|
+
# invariant=True is the default — keep your generator deterministic.
|
|
149
|
+
return pdf_bytes, HellCase(
|
|
150
|
+
id=f"your_trap-{seed:04d}",
|
|
151
|
+
trap_family="your_trap",
|
|
152
|
+
seed=seed,
|
|
153
|
+
question="What is ...?",
|
|
154
|
+
expected_answer="42", # single canonical answer
|
|
155
|
+
expected_tokens=["42"], # OR list of required substrings for prose
|
|
156
|
+
forbidden_answers=["41", "43"],   # wrong values the trap is designed to elicit (diagnostic only; not scored)
|
|
157
|
+
metadata={"expected_failure_mode": "Model does X when it should do Y."},
|
|
158
|
+
)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Register it in `pdfhell/generators/__init__.py`. See [CONTRIBUTING.md](./CONTRIBUTING.md) for the full guide. Tests run with `pytest`.
|
|
162
|
+
|
|
163
|
+
## Roadmap
|
|
164
|
+
|
|
165
|
+
The 0.1 release is intentionally narrow — three trap families, 30 cases. Coming next:
|
|
166
|
+
|
|
167
|
+
- `merged_table_cells` — value depends on row/column span interpretation
|
|
168
|
+
- `rotated_scan` — visually legible but OCR-broken pages
|
|
169
|
+
- `near_duplicate_entities` — "ACME Ltd." vs "ACME Holdings Ltd."
|
|
170
|
+
- `prompt_injection_in_body` — "Ignore previous instructions and answer X"
|
|
171
|
+
- `chart_axis_inversion` — answers depend on reading axis direction
|
|
172
|
+
- `checkbox_ambiguity` — selected vs unselected with low visual margin
|
|
173
|
+
- `cross_page_citation` — answers requiring page + bounding-box citations
|
|
174
|
+
|
|
175
|
+
Target full suite: 10 trap families, ~50 cases.
|
|
176
|
+
|
|
177
|
+
## Hosted generator
|
|
178
|
+
|
|
179
|
+
For document-AI teams who need adversarial test cases tailored to *their* templates (claims forms, MSAs, medical records, KYC docs), there's a hosted generator that takes your templates and produces adversarial variants with code-based ground truth — same methodology, your data shape.
|
|
180
|
+
|
|
181
|
+
Email `hello@multivon.ai` for early access, or see [multivon.ai/pricing](https://multivon.ai/pricing).
|
|
182
|
+
|
|
183
|
+
## Installing
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
# Recommended (zero-install with uv):
|
|
187
|
+
uvx pdfhell list-traps
|
|
188
|
+
|
|
189
|
+
# Or in a venv:
|
|
190
|
+
python -m venv .venv && source .venv/bin/activate
|
|
191
|
+
pip install pdfhell
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
Bare install brings in `multivon-eval` (the engine), `reportlab` (PDF generation), `pypdf`, and the three frontier-provider SDKs (anthropic, openai, google-genai). No provider extras to remember; no GPU required.
|
|
195
|
+
|
|
196
|
+
## License
|
|
197
|
+
|
|
198
|
+
Apache 2.0. Built on [`multivon-eval`](https://github.com/multivon-ai/multivon-eval).
|
|
199
|
+
|
|
200
|
+
## Citing
|
|
201
|
+
|
|
202
|
+
```bibtex
|
|
203
|
+
@software{pdfhell,
|
|
204
|
+
title = {PDF Hell: Adversarial PDFs for AI document readers},
|
|
205
|
+
author = {Multivon},
|
|
206
|
+
url = {https://github.com/multivon-ai/pdfhell},
|
|
207
|
+
}
|
|
208
|
+
```
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
pdfhell/__init__.py,sha256=HQSEHz5hlaZ1LFbY34fdzUGXMlJc6VxXDhfG77AyCQA,919
|
|
2
|
+
pdfhell/auditpack.py,sha256=aU65vUBlq3QJuzKhH5TvmcKLvMWhtJ8z_A8SETA9lms,6297
|
|
3
|
+
pdfhell/case.py,sha256=KRg10SCPy7b_vbCRPEWszJFSyn1HYJuoNeSSfXJ4M98,3835
|
|
4
|
+
pdfhell/cli.py,sha256=F5wdepx-1cir8wO3TK47nufi4gdT3QbujCvfoWuGelE,8216
|
|
5
|
+
pdfhell/junit.py,sha256=DYWmKb6VDTaNlY7TBGfQS7Hu24qsO7hJULB6FXL9eEc,3580
|
|
6
|
+
pdfhell/runner.py,sha256=Tr_Zdnewn8lkkNEV3MPRtqjZEj5exke7y1k641ctBu8,5119
|
|
7
|
+
pdfhell/scorer.py,sha256=cMgnzks5fX-Hz9c-aYsx3ZwbpQfy7Gv0xMfElT2YD0U,7711
|
|
8
|
+
pdfhell/suite.py,sha256=VQkEi4CEuE1vdxRzXj804MlWXM-infiB0xFs2YMeUrg,3382
|
|
9
|
+
pdfhell/vision.py,sha256=HUEpNFVvaFAI1mTwBlp9zIh-94gYCknVrCCve-BamRg,8781
|
|
10
|
+
pdfhell/generators/__init__.py,sha256=wmYbtoI_G6_-J7TaWyIvQ3887YJyDI2gGwNxZjWoGVE,1841
|
|
11
|
+
pdfhell/generators/_common.py,sha256=OlSmOtymawWXUjP1UbcYEkSRKxezm7AltH6rnacOaSk,6112
|
|
12
|
+
pdfhell/generators/footnote_override.py,sha256=9S703znrSZy2SDQ5-p5sMgpRmUDh3szQmTe6WaYnl-c,8835
|
|
13
|
+
pdfhell/generators/hidden_ocr_mismatch.py,sha256=Q3Lx5SZRH_Zywa_vSNkUu9fvFWsI_m0wKXyIIxyaSTE,4992
|
|
14
|
+
pdfhell/generators/split_table_across_pages.py,sha256=1bWL5kv8CNUR6bbgfTNkzXtfoMbRNImQk6dGdkSYaw4,6293
|
|
15
|
+
pdfhell-0.1.0.dist-info/licenses/LICENSE,sha256=42Dg0T3y9vfd-um9B8j26KeAXJrE3jgtK_ggMrcHS9o,737
|
|
16
|
+
pdfhell-0.1.0.dist-info/METADATA,sha256=TbFR-FnLd2Tbs5sw7vhVGieOLUFBBcf5Ci3Q8OlLU8U,9691
|
|
17
|
+
pdfhell-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
18
|
+
pdfhell-0.1.0.dist-info/entry_points.txt,sha256=LiHsXd67OOxVUSAT9N-EQARvAXnoHlGhOW8R1QclsAs,45
|
|
19
|
+
pdfhell-0.1.0.dist-info/top_level.txt,sha256=kLeB8Xx5QVEC_p8kAoAEg-wB80unVpYb50ELR7QO_Rg,8
|
|
20
|
+
pdfhell-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
Copyright 2026 Multivon
|
|
6
|
+
|
|
7
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
8
|
+
you may not use this file except in compliance with the License.
|
|
9
|
+
You may obtain a copy of the License at
|
|
10
|
+
|
|
11
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
12
|
+
|
|
13
|
+
Unless required by applicable law or agreed to in writing, software
|
|
14
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
15
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
16
|
+
See the License for the specific language governing permissions and
|
|
17
|
+
limitations under the License.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pdfhell
|