pdfhell 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfhell/__init__.py +34 -0
- pdfhell/auditpack.py +182 -0
- pdfhell/case.py +87 -0
- pdfhell/cli.py +216 -0
- pdfhell/generators/__init__.py +49 -0
- pdfhell/generators/_common.py +183 -0
- pdfhell/generators/footnote_override.py +212 -0
- pdfhell/generators/hidden_ocr_mismatch.py +129 -0
- pdfhell/generators/split_table_across_pages.py +174 -0
- pdfhell/junit.py +94 -0
- pdfhell/runner.py +142 -0
- pdfhell/scorer.py +214 -0
- pdfhell/suite.py +104 -0
- pdfhell/vision.py +231 -0
- pdfhell-0.1.0.dist-info/METADATA +208 -0
- pdfhell-0.1.0.dist-info/RECORD +20 -0
- pdfhell-0.1.0.dist-info/WHEEL +5 -0
- pdfhell-0.1.0.dist-info/entry_points.txt +2 -0
- pdfhell-0.1.0.dist-info/licenses/LICENSE +17 -0
- pdfhell-0.1.0.dist-info/top_level.txt +1 -0
pdfhell/__init__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""PDF Hell — adversarial PDFs that break AI document readers.
|
|
2
|
+
|
|
3
|
+
Procedural ground truth, not LLM-as-judge. Each trap family generates PDFs
|
|
4
|
+
*from code*, so the answer key is exact and reproducible — no circular
|
|
5
|
+
assurance.
|
|
6
|
+
|
|
7
|
+
Quickstart::
|
|
8
|
+
|
|
9
|
+
uvx pdfhell make --trap hidden_ocr_mismatch --seed 42
|
|
10
|
+
uvx pdfhell run --model anthropic:claude-sonnet-4-6 --suite mini --out runs/claude.json
|
|
11
|
+
uvx pdfhell report runs/claude.json
|
|
12
|
+
|
|
13
|
+
Built on top of ``multivon-eval`` (the QAG engine, provider adapters, audit
|
|
14
|
+
packaging, cost tracking). pdfhell is *only* the adversarial generation
|
|
15
|
+
layer; the runtime, scoring, and reporting come from multivon-eval.
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
__version__ = "0.1.0"
|
|
20
|
+
|
|
21
|
+
from .case import HellCase
|
|
22
|
+
from .generators import (
|
|
23
|
+
GENERATORS,
|
|
24
|
+
TRAP_FAMILIES,
|
|
25
|
+
generate_case,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"__version__",
|
|
30
|
+
"HellCase",
|
|
31
|
+
"GENERATORS",
|
|
32
|
+
"TRAP_FAMILIES",
|
|
33
|
+
"generate_case",
|
|
34
|
+
]
|
pdfhell/auditpack.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Build a downloadable, hash-chained audit pack from a pdfhell run.
|
|
2
|
+
|
|
3
|
+
The pack is a ZIP containing:
|
|
4
|
+
|
|
5
|
+
- ``manifest.json`` — pdfhell version, run timestamp, model spec, suite,
|
|
6
|
+
per-trap pass rates, total cost (when known), SHA-256 of every file
|
|
7
|
+
inside the pack.
|
|
8
|
+
- ``run.json`` — the full :class:`SuiteReport` JSON.
|
|
9
|
+
- ``run.xml`` — JUnit XML (same data as ``run.json``, machine-readable
|
|
10
|
+
for CI dashboards).
|
|
11
|
+
- ``cases/<case_id>.pdf`` — every adversarial PDF the model was tested
|
|
12
|
+
against.
|
|
13
|
+
- ``cases/<case_id>.json`` — each case's answer key + metadata.
|
|
14
|
+
- ``README.txt`` — human-readable "what's in this ZIP" + reproduction
|
|
15
|
+
command. Procurement teams open this first.
|
|
16
|
+
|
|
17
|
+
The audit pack is the artifact a buyer's procurement team attaches to
|
|
18
|
+
a diligence appendix. It must be self-describing (no out-of-band
|
|
19
|
+
context required), reproducible (the manifest tells you the exact
|
|
20
|
+
command to regenerate the run), and tamper-evident (the manifest
|
|
21
|
+
includes a SHA-256 for every file in the pack; auditors can verify the
|
|
22
|
+
ZIP wasn't edited after delivery).
|
|
23
|
+
"""
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import hashlib
|
|
27
|
+
import json
|
|
28
|
+
import zipfile
|
|
29
|
+
from datetime import datetime, timezone
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Iterable
|
|
32
|
+
|
|
33
|
+
from . import __version__
|
|
34
|
+
from .case import HellCase
|
|
35
|
+
from .junit import report_to_junit
|
|
36
|
+
from .scorer import SuiteReport
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
_README_TEMPLATE = """\
|
|
40
|
+
# pdfhell audit pack
|
|
41
|
+
|
|
42
|
+
This ZIP is a complete, self-describing record of one PDF Hell run. It
|
|
43
|
+
contains every PDF the model was asked to read, every answer key, the
|
|
44
|
+
raw model output, and a tamper-evident manifest.
|
|
45
|
+
|
|
46
|
+
## What's in this pack
|
|
47
|
+
|
|
48
|
+
- manifest.json — Run metadata + SHA-256 of every file in this ZIP.
|
|
49
|
+
- run.json — Full run report (per-case scores, model outputs).
|
|
50
|
+
- run.xml — JUnit XML (renders in CI dashboards).
|
|
51
|
+
- cases/*.pdf — The adversarial PDFs the model was tested against.
|
|
52
|
+
- cases/*.json — The answer keys + per-case metadata.
|
|
53
|
+
- README.txt — This file.
|
|
54
|
+
|
|
55
|
+
## How to verify
|
|
56
|
+
|
|
57
|
+
The manifest contains a SHA-256 for every file in this ZIP. To verify
|
|
58
|
+
nothing was edited after delivery:
|
|
59
|
+
|
|
60
|
+
unzip -p audit-pack.zip manifest.json | jq .files
|
|
61
|
+
sha256sum cases/*.pdf cases/*.json run.json run.xml README.txt
|
|
62
|
+
|
|
63
|
+
Each hash in the manifest must match the file's actual SHA-256.
|
|
64
|
+
|
|
65
|
+
## How to reproduce
|
|
66
|
+
|
|
67
|
+
The manifest records the exact pdfhell command. To regenerate
|
|
68
|
+
byte-identical PDFs and re-run the same model:
|
|
69
|
+
|
|
70
|
+
{repro_command}
|
|
71
|
+
|
|
72
|
+
pdfhell uses Canvas(invariant=True) on every generator so PDFs are
|
|
73
|
+
byte-identical across runs with the same seed.
|
|
74
|
+
|
|
75
|
+
## Scope
|
|
76
|
+
|
|
77
|
+
pdfhell {pdfhell_version}, suite {suite}, model {model}. Generated
|
|
78
|
+
{timestamp}. {n} cases, {passed}/{n} passed ({pass_rate:.0%}). See
|
|
79
|
+
manifest.json for per-trap breakdown.
|
|
80
|
+
"""
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _sha256(data: bytes) -> str:
    """Return the lowercase hexadecimal SHA-256 digest of ``data``."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _gather_files(report: SuiteReport, cases_dir: Path) -> Iterable[tuple[str, bytes]]:
    """Yield ``(arcname, bytes)`` pairs for the per-case files of ``report``.

    Only case artifacts are produced here: for each entry of
    ``report.cases`` (in report order) the PDF is yielded first, then the
    JSON answer key, both under the ``cases/`` prefix. The README, run
    report and manifest are not produced by this function.

    A case file that is absent from ``cases_dir`` is skipped silently, so
    the result may contain fewer entries than ``2 * len(report.cases)``.
    """
    for case_summary in report.cases:
        case_id = case_summary.case_id
        for suffix in (".pdf", ".json"):
            source = cases_dir / f"{case_id}{suffix}"
            # is_file() rather than exists(): a directory that happens to be
            # named like a case file must be skipped, not passed to read_bytes().
            if source.is_file():
                yield f"cases/{case_id}{suffix}", source.read_bytes()
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def build_audit_pack(
    report: SuiteReport,
    cases_dir: Path,
    out_path: Path,
) -> Path:
    """Write a complete audit ZIP for ``report`` to ``out_path``.

    The archive contains ``manifest.json`` (written first), ``README.txt``,
    ``run.json``, ``run.xml`` and whatever per-case files
    :func:`_gather_files` finds in ``cases_dir``. The manifest records the
    SHA-256 and size of every other file in the pack; it cannot contain its
    own hash.

    The parent directory of ``out_path`` is created if needed.

    Note that the README and the manifest embed the current UTC time, so
    two packs built from the same report are *not* byte-identical even
    though every ZIP entry carries the same fixed modification time.

    Returns the resolved output path.
    """
    out_path = out_path.resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Materialise the per-case files into bytes first so we can hash them.
    case_files: list[tuple[str, bytes]] = list(_gather_files(report, cases_dir))

    run_json_bytes = json.dumps(report.to_dict(), indent=2).encode("utf-8")
    run_xml_bytes = report_to_junit(report).encode("utf-8")
    timestamp = datetime.now(timezone.utc).isoformat()
    passed = sum(1 for c in report.cases if c.correct)

    repro_command = (
        f"uvx pdfhell run --model {report.model} --suite {report.suite}"
    )
    readme_bytes = _README_TEMPLATE.format(
        pdfhell_version=__version__,
        suite=report.suite,
        model=report.model,
        timestamp=timestamp,
        n=report.n,
        passed=passed,
        pass_rate=report.pass_rate,
        repro_command=repro_command,
    ).encode("utf-8")

    # Everything except the manifest; the manifest is built last so it can
    # list the hash of each of these files.
    files_in_pack: list[tuple[str, bytes]] = [
        ("README.txt", readme_bytes),
        ("run.json", run_json_bytes),
        ("run.xml", run_xml_bytes),
        *case_files,
    ]
    manifest = {
        "pdfhell_version": __version__,
        "generated_at": timestamp,
        "model": report.model,
        "suite": report.suite,
        "n": report.n,
        "passed": passed,
        "pass_rate": report.pass_rate,
        "per_trap_pass": report.per_trap_pass,
        "per_trap_fell_for_trap": report.per_trap_fell_for_trap,
        "reproduction": {
            "command": repro_command,
            "note": (
                "PDFs are regenerated byte-identically via Canvas(invariant=True). "
                "Same seed → same PDF → same answer key."
            ),
        },
        "files": [
            {"path": name, "sha256": _sha256(data), "size": len(data)}
            for name, data in files_in_pack
        ],
    }
    manifest_bytes = json.dumps(manifest, indent=2).encode("utf-8")

    with zipfile.ZipFile(out_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for name, data in [("manifest.json", manifest_bytes), *files_in_pack]:
            info = zipfile.ZipInfo(name)
            # Fixed entry mtime (not the run timestamp) so archive metadata
            # does not vary with the wall clock.
            info.date_time = (2026, 1, 1, 0, 0, 0)
            # writestr() takes the compression method from the ZipInfo, which
            # defaults to ZIP_STORED; the ZipFile-level setting is ignored for
            # ZipInfo entries, so it has to be set explicitly.
            info.compress_type = zipfile.ZIP_DEFLATED
            # A hand-built ZipInfo has external_attr == 0, which Unix unzip
            # tools extract as permission-less files. Mark entries rw-r--r--.
            info.external_attr = 0o644 << 16
            zf.writestr(info, data)

    return out_path
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
__all__ = ["build_audit_pack"]
|
pdfhell/case.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Canonical case representation for a generated pdfhell trap.
|
|
2
|
+
|
|
3
|
+
A ``HellCase`` is the serialisable artifact that every trap-family generator
|
|
4
|
+
returns. The runner consumes these; the scoring layer treats
|
|
5
|
+
``expected_answer`` as code-based ground truth and grades the model's
|
|
6
|
+
free-text output against it. QAG (multivon-eval's
|
|
7
|
+
:class:`~multivon_eval.DocumentGrounding`) is layered on as the
|
|
8
|
+
*explanation* of why a particular answer scored a particular way — not as
|
|
9
|
+
the score itself. This is the architectural fix the Round-7 churned design
|
|
10
|
+
partner demanded: no LLM-judges-LLM circular assurance for the primary
|
|
11
|
+
correctness signal.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from dataclasses import asdict, dataclass, field
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(slots=True)
|
|
22
|
+
class HellCase:
|
|
23
|
+
"""One adversarial PDF + its ground truth.
|
|
24
|
+
|
|
25
|
+
Attributes
|
|
26
|
+
----------
|
|
27
|
+
id: stable, deterministic, e.g. ``"hidden_ocr_mismatch-0042"``.
|
|
28
|
+
trap_family: one of :data:`pdfhell.generators.TRAP_FAMILIES`.
|
|
29
|
+
seed: integer seed used to generate the case. Regenerating with the
|
|
30
|
+
same seed produces a byte-identical PDF and identical answer key.
|
|
31
|
+
question: the user-facing question the model must answer.
|
|
32
|
+
expected_answer: a human-readable form of the correct answer used in
|
|
33
|
+
reports + JUnit output. For single-value traps this is also the
|
|
34
|
+
substring the scorer looks for; for prose-style traps (e.g.
|
|
35
|
+
:mod:`pdfhell.generators.footnote_override`) the scorer instead
|
|
36
|
+
uses :attr:`expected_tokens` (see below).
|
|
37
|
+
expected_tokens: optional list of substrings that ALL must appear in
|
|
38
|
+
the model's output for the case to count as correct. Used by
|
|
39
|
+
traps where the right answer can be expressed multiple
|
|
40
|
+
equally-valid ways (e.g. a list of clause carve-outs in any
|
|
41
|
+
order). When empty, the scorer falls back to a contains-match
|
|
42
|
+
against ``expected_answer``.
|
|
43
|
+
forbidden_answers: optional list of plausible-but-wrong answers the
|
|
44
|
+
trap aims to elicit (e.g. the hidden-OCR amount). Used by the
|
|
45
|
+
scorer to detect the specific failure mode the trap was designed
|
|
46
|
+
for.
|
|
47
|
+
pdf_path: location of the generated PDF (relative to the suite root).
|
|
48
|
+
metadata: extra fields — number of pages, font size of footnotes,
|
|
49
|
+
the literal hidden-OCR string, etc. Useful for diagnostics and
|
|
50
|
+
for trap-family-specific scorers.
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
id: str
|
|
54
|
+
trap_family: str
|
|
55
|
+
seed: int
|
|
56
|
+
question: str
|
|
57
|
+
expected_answer: str
|
|
58
|
+
forbidden_answers: list[str] = field(default_factory=list)
|
|
59
|
+
expected_tokens: list[str] = field(default_factory=list)
|
|
60
|
+
pdf_path: str = ""
|
|
61
|
+
metadata: dict[str, Any] = field(default_factory=dict)
|
|
62
|
+
|
|
63
|
+
# ─── serialisation ─────────────────────────────────────────────────────
|
|
64
|
+
|
|
65
|
+
def to_dict(self) -> dict[str, Any]:
|
|
66
|
+
return asdict(self)
|
|
67
|
+
|
|
68
|
+
@classmethod
|
|
69
|
+
def from_dict(cls, raw: dict[str, Any]) -> "HellCase":
|
|
70
|
+
return cls(
|
|
71
|
+
id=raw["id"],
|
|
72
|
+
trap_family=raw["trap_family"],
|
|
73
|
+
seed=int(raw["seed"]),
|
|
74
|
+
question=raw["question"],
|
|
75
|
+
expected_answer=raw["expected_answer"],
|
|
76
|
+
forbidden_answers=list(raw.get("forbidden_answers", [])),
|
|
77
|
+
expected_tokens=list(raw.get("expected_tokens", [])),
|
|
78
|
+
pdf_path=raw.get("pdf_path", ""),
|
|
79
|
+
metadata=dict(raw.get("metadata", {})),
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
def dump_json(self, path: str | Path) -> None:
|
|
83
|
+
Path(path).write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")
|
|
84
|
+
|
|
85
|
+
@classmethod
|
|
86
|
+
def load_json(cls, path: str | Path) -> "HellCase":
|
|
87
|
+
return cls.from_dict(json.loads(Path(path).read_text(encoding="utf-8")))
|
pdfhell/cli.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
"""pdfhell CLI.
|
|
2
|
+
|
|
3
|
+
Five subcommands keep the surface minimal:
|
|
4
|
+
|
|
5
|
+
pdfhell list-traps list available trap families
|
|
6
|
+
pdfhell make --trap X --seed N [--out P] generate one case (pdf + json)
|
|
7
|
+
pdfhell build --suite mini --out cases/ materialise a named suite
|
|
8
|
+
pdfhell run --suite mini --model ... evaluate a model against the suite
|
|
9
|
+
pdfhell report runs/<name>.json print summary + optional share card
|
|
10
|
+
|
|
11
|
+
Everything else (scoring, provider dispatch, audit packaging) is pulled
|
|
12
|
+
from :mod:`multivon_eval`. The CLI is glue.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import json
|
|
18
|
+
import sys
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
from . import __version__
|
|
22
|
+
from .auditpack import build_audit_pack
|
|
23
|
+
from .case import HellCase
|
|
24
|
+
from .generators import TRAP_FAMILIES, generate_case
|
|
25
|
+
from .junit import report_to_junit
|
|
26
|
+
from .runner import parse_model_spec, run_suite
|
|
27
|
+
from .scorer import SuiteReport
|
|
28
|
+
from .suite import SUITES, build_suite
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _cmd_list_traps(args: argparse.Namespace) -> int:
    """Print each registered trap family on its own line; always returns 0."""
    if TRAP_FAMILIES:
        print("\n".join(TRAP_FAMILIES))
    return 0
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _cmd_make(args: argparse.Namespace) -> int:
    """Generate one case and write ``<case.id>.pdf`` and ``<case.id>.json``.

    Reads ``args.trap``, ``args.seed`` and ``args.out``. The output
    directory is created when missing. Returns 0 on success and 2 when
    ``args.trap`` is not a registered trap family.
    """
    # Validate the family up front instead of wrapping the generator call in
    # ``except KeyError``: a KeyError raised *inside* a generator is a bug and
    # must surface, not be reported as an unknown trap. This also avoids
    # printing a KeyError directly, which renders as a quoted repr.
    if args.trap not in TRAP_FAMILIES:
        print(f"unknown trap family {args.trap!r}", file=sys.stderr)
        print(f"available trap families: {', '.join(TRAP_FAMILIES)}", file=sys.stderr)
        return 2
    pdf_bytes, case = generate_case(args.trap, args.seed)

    out_dir = Path(args.out).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)
    pdf_path = out_dir / f"{case.id}.pdf"
    json_path = out_dir / f"{case.id}.json"
    pdf_path.write_bytes(pdf_bytes)
    # Store only the file name so the answer key stays valid if the
    # directory is moved.
    case.pdf_path = pdf_path.name
    case.dump_json(json_path)
    print(f"wrote {pdf_path} ({len(pdf_bytes):,} bytes)")
    print(f"wrote {json_path}")
    print(f"expected answer: {case.expected_answer}")
    return 0
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _cmd_build(args: argparse.Namespace) -> int:
    """Materialise the suite named ``args.suite`` into ``args.out``.

    Returns 0 after building, or 2 (with a message on stderr) when the
    suite name is not a key of ``SUITES``.
    """
    # NOTE(review): args.out is used verbatim; it is not derived from the
    # suite name here, so callers choose the directory per suite themselves.
    try:
        spec = SUITES[args.suite]
    except KeyError:
        print(f"unknown suite {args.suite!r}; available: {', '.join(SUITES)}", file=sys.stderr)
        return 2

    target_dir = Path(args.out).resolve()
    print(f"building suite {spec.name!r} ({spec.total_cases} cases) → {target_dir}")
    built = build_suite(spec, target_dir)
    print(f"wrote {len(built)} cases")
    return 0
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _cmd_run(args: argparse.Namespace) -> int:
    """Evaluate ``args.model`` on a suite and write the requested reports.

    Steps: resolve the cases directory (building the suite into it when the
    directory is missing and the suite is known), run the suite, write the
    run JSON, print the summary, then optionally write JUnit XML and an
    audit pack.

    Returns 0 on success, 1 when ``--fail-threshold`` is set and the pass
    rate is below it, and 2 when the cases directory is missing and the
    suite name is unknown.
    """
    # Without an explicit --cases-dir the directory follows the suite name,
    # so `--suite smoke` never silently evaluates the mini suite.
    if args.cases_dir:
        cases_dir = Path(args.cases_dir).resolve()
    else:
        cases_dir = Path(f"./cases/{args.suite}").resolve()

    if not cases_dir.is_dir():
        if args.suite not in SUITES:
            print(f"cases dir {cases_dir} not found and suite {args.suite!r} is unknown",
                  file=sys.stderr)
            return 2
        print(f"cases dir {cases_dir} not found; building {args.suite} suite first ...")
        cases_dir.mkdir(parents=True, exist_ok=True)
        build_suite(SUITES[args.suite], cases_dir)

    print(f"running {args.model} against {args.suite} suite at {cases_dir}")
    report = run_suite(
        cases_dir=cases_dir,
        model_spec=args.model,
        workers=args.workers,
        progress=not args.quiet,
        suite_name=args.suite,
    )

    if args.out:
        out_path = Path(args.out).resolve()
    else:
        out_path = _default_run_path(args.model, args.suite)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    serialised = json.dumps(report.to_dict(), indent=2)
    out_path.write_text(serialised, encoding="utf-8")

    print()
    _print_report(report)
    print()
    print(f"wrote {out_path}")

    if args.junit:
        junit_path = Path(args.junit).resolve()
        junit_path.parent.mkdir(parents=True, exist_ok=True)
        junit_path.write_text(report_to_junit(report), encoding="utf-8")
        print(f"wrote {junit_path}")

    if args.audit_pack:
        zip_path = Path(args.audit_pack).resolve()
        build_audit_pack(report, cases_dir, zip_path)
        print(f"wrote {zip_path}")

    threshold = args.fail_threshold
    if threshold is not None and report.pass_rate < threshold:
        print(
            f"\nFAIL: pass_rate {report.pass_rate:.1%} below --fail-threshold "
            f"{threshold:.1%}"
        )
        return 1
    return 0
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _cmd_report(args: argparse.Namespace) -> int:
    """Print the summary of a previously saved run JSON (``args.run``).

    Only the aggregate fields are loaded; the report is rebuilt with an
    empty ``cases`` list because the summary does not need per-case data.

    Returns 0 after printing, or 2 (with a message on stderr) when the file
    cannot be read, is not valid JSON, is not a JSON object, or lacks one
    of the summary fields.
    """
    summary_fields = (
        "model",
        "suite",
        "n",
        "pass_rate",
        "per_trap_pass",
        "per_trap_fell_for_trap",
        "refused_rate",
    )
    run_path = Path(args.run)
    try:
        raw = json.loads(run_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"cannot load run file {run_path}: {exc}", file=sys.stderr)
        return 2
    if not isinstance(raw, dict):
        print(f"run file {run_path} does not contain a JSON object", file=sys.stderr)
        return 2
    missing = [key for key in summary_fields if key not in raw]
    if missing:
        print(f"run file {run_path} is missing fields: {', '.join(missing)}", file=sys.stderr)
        return 2

    report = SuiteReport(
        **{key: raw[key] for key in summary_fields},
        cases=[],  # not needed for printing the summary
    )
    _print_report(report)
    return 0
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _print_report(report: SuiteReport) -> None:
    """Print a human-readable summary of ``report`` to stdout.

    The pass count is taken from ``report.cases`` when per-case results are
    present; otherwise it is reconstructed from ``pass_rate * n``. Per-trap
    lines are printed in sorted trap-name order, and a trap with no
    ``per_trap_fell_for_trap`` entry is shown as 0%.
    """
    print(f"PDF Hell {report.suite} suite — n={report.n}")
    print()
    print(f"model: {report.model}")
    if report.cases:
        passed = sum(1 for case in report.cases if case.correct)
    else:
        # round(), not int(): the product carries float error
        # (0.29 * 100 == 28.999999999999996) and truncation would under-count.
        passed = round(report.pass_rate * report.n)
    print(f"pass: {passed}/{report.n} ({report.pass_rate:.1%})")
    print(f"refused: {report.refused_rate:.1%}")
    print()
    print("per-trap pass rate:")
    for trap, rate in sorted(report.per_trap_pass.items()):
        fell = report.per_trap_fell_for_trap.get(trap, 0.0)
        print(f" {trap:30s} pass={rate:.0%} fell-for-trap={fell:.0%}")
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _default_run_path(model_spec: str, suite: str) -> Path:
    """Return the resolved default run-JSON path, ``runs/<suite>-<model>.json``.

    ``/`` and ``:`` in ``model_spec`` are replaced by ``-`` so the model
    spec can be used as part of a file name.
    """
    separators = str.maketrans({"/": "-", ":": "-"})
    sanitised = model_spec.translate(separators)
    return Path(f"runs/{suite}-{sanitised}.json").resolve()
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``pdfhell`` argument parser.

    Registers the ``list-traps``, ``make``, ``build``, ``run`` and
    ``report`` subcommands (a subcommand is required) and attaches each
    one's handler as the ``func`` default, which is how the entry point
    dispatches.

    NOTE(review): the ``build`` subcommand's ``--out`` default is the fixed
    string ``./cases/mini`` whatever ``--suite`` is given, whereas ``run``
    looks in ``./cases/<suite>`` by default — confirm this is intended.
    """
    parser = argparse.ArgumentParser(
        prog="pdfhell",
        description="PDF Hell — adversarial PDFs that break AI document readers.",
    )
    parser.add_argument("--version", action="version", version=f"pdfhell {__version__}")
    commands = parser.add_subparsers(dest="cmd", required=True, metavar="<command>")
    suite_names = tuple(SUITES.keys())

    # list-traps ------------------------------------------------------------
    list_cmd = commands.add_parser("list-traps", help="list available trap families")
    list_cmd.set_defaults(func=_cmd_list_traps)

    # make ------------------------------------------------------------------
    make_cmd = commands.add_parser("make", help="generate one case (pdf + json)")
    make_cmd.add_argument("--trap", required=True, choices=TRAP_FAMILIES)
    make_cmd.add_argument("--seed", required=True, type=int)
    make_cmd.add_argument("--out", default="./cases", help="output directory (default: ./cases)")
    make_cmd.set_defaults(func=_cmd_make)

    # build -----------------------------------------------------------------
    build_cmd = commands.add_parser("build", help="materialise a named suite to disk")
    build_cmd.add_argument("--suite", default="mini", choices=suite_names)
    build_cmd.add_argument("--out", default="./cases/mini")
    build_cmd.set_defaults(func=_cmd_build)

    # run -------------------------------------------------------------------
    run_cmd = commands.add_parser("run", help="evaluate a model against a suite")
    run_cmd.add_argument(
        "--model",
        required=True,
        help="provider:model, e.g. anthropic:claude-sonnet-4-6",
    )
    run_cmd.add_argument("--suite", default="mini", choices=suite_names)
    run_cmd.add_argument(
        "--cases-dir",
        default=None,
        help="dir with materialised cases (default: ./cases/<suite>; built on demand if missing)",
    )
    run_cmd.add_argument("--workers", type=int, default=4)
    run_cmd.add_argument("--quiet", action="store_true")
    run_cmd.add_argument("--out", help="output JSON path (default: runs/<suite>-<model>.json)")
    run_cmd.add_argument(
        "--junit",
        help="also write a JUnit XML report to this path (renders in GitHub Actions / GitLab CI)",
    )
    run_cmd.add_argument(
        "--audit-pack",
        help=(
            "also write a complete, hash-chained audit ZIP to this path "
            "(PDFs + answer keys + run JSON + JUnit XML + SHA-256 manifest)"
        ),
    )
    run_cmd.add_argument(
        "--fail-threshold",
        type=float,
        help="exit nonzero if pass_rate is below this fraction (0.0–1.0); for CI gates",
    )
    run_cmd.set_defaults(func=_cmd_run)

    # report ----------------------------------------------------------------
    report_cmd = commands.add_parser("report", help="print summary from a run JSON")
    report_cmd.add_argument("run", help="path to runs/<suite>-<model>.json")
    report_cmd.set_defaults(func=_cmd_report)

    return parser
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and run the selected subcommand.

    ``argv=None`` lets argparse read ``sys.argv``. The return value is the
    handler's exit code.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# Script entry point: exit the process with the code returned by main().
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Trap-family generators.
|
|
2
|
+
|
|
3
|
+
Each generator is a deterministic ``(seed) -> (pdf_bytes, HellCase)``
|
|
4
|
+
function. The mapping :data:`GENERATORS` powers ``pdfhell make --trap X``
|
|
5
|
+
and the suite builder. Adding a new trap family means registering it
|
|
6
|
+
here.
|
|
7
|
+
|
|
8
|
+
Why a registry and not subclasses? Each trap is fundamentally a *small*
|
|
9
|
+
parameterised generator. The registry keeps generators portable, makes
|
|
10
|
+
the CLI's ``--trap`` flag introspectable, and avoids the class-hierarchy
|
|
11
|
+
sprawl that a 50-trap eventual suite would otherwise turn into.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import Callable
|
|
16
|
+
|
|
17
|
+
from ..case import HellCase
|
|
18
|
+
from .hidden_ocr_mismatch import generate as _hidden_ocr_mismatch
|
|
19
|
+
from .footnote_override import generate as _footnote_override
|
|
20
|
+
from .split_table_across_pages import generate as _split_table_across_pages
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# A generator maps an integer seed to ``(pdf_bytes, case)``. It does not fill
# in ``case.pdf_path``; whoever writes the bytes to disk sets that afterwards.
GeneratorFn = Callable[[int], tuple[bytes, HellCase]]

# Registry of trap-family name -> generator function. Insertion order is the
# order in which families are listed.
GENERATORS: dict[str, GeneratorFn] = {
    "hidden_ocr_mismatch": _hidden_ocr_mismatch,
    "footnote_override": _footnote_override,
    "split_table_across_pages": _split_table_across_pages,
}

# Immutable snapshot of the registered family names, in registry order.
TRAP_FAMILIES: tuple[str, ...] = tuple(GENERATORS)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def generate_case(trap_family: str, seed: int) -> tuple[bytes, HellCase]:
    """Run the generator registered for ``trap_family`` with ``seed``.

    Returns whatever the generator returns: the PDF bytes and the matching
    :class:`HellCase`.

    Raises :class:`KeyError` when ``trap_family`` is not registered; the
    message lists the available families.
    """
    generator = GENERATORS.get(trap_family)
    if generator is None:
        known = ", ".join(TRAP_FAMILIES)
        raise KeyError(f"unknown trap family {trap_family!r}; available: {known}")
    return generator(seed)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
__all__ = ["GENERATORS", "TRAP_FAMILIES", "generate_case", "GeneratorFn"]
|