pdfhell 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfhell/__init__.py +34 -0
- pdfhell/auditpack.py +182 -0
- pdfhell/case.py +87 -0
- pdfhell/cli.py +216 -0
- pdfhell/generators/__init__.py +49 -0
- pdfhell/generators/_common.py +183 -0
- pdfhell/generators/footnote_override.py +212 -0
- pdfhell/generators/hidden_ocr_mismatch.py +129 -0
- pdfhell/generators/split_table_across_pages.py +174 -0
- pdfhell/junit.py +94 -0
- pdfhell/runner.py +142 -0
- pdfhell/scorer.py +214 -0
- pdfhell/suite.py +104 -0
- pdfhell/vision.py +231 -0
- pdfhell-0.1.0.dist-info/METADATA +208 -0
- pdfhell-0.1.0.dist-info/RECORD +20 -0
- pdfhell-0.1.0.dist-info/WHEEL +5 -0
- pdfhell-0.1.0.dist-info/entry_points.txt +2 -0
- pdfhell-0.1.0.dist-info/licenses/LICENSE +17 -0
- pdfhell-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Shared utilities for trap-family generators.
|
|
2
|
+
|
|
3
|
+
Each helper here is a small, well-typed primitive that the per-trap
|
|
4
|
+
generators compose. The aim is that adding a new trap family means
|
|
5
|
+
writing one new file under ``pdfhell/generators/`` and registering it in
|
|
6
|
+
``__init__.py`` — without copy-pasting reportlab boilerplate.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import io
|
|
11
|
+
import random
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from typing import Iterable, Sequence
|
|
14
|
+
|
|
15
|
+
from reportlab.lib.pagesizes import LETTER
|
|
16
|
+
from reportlab.lib.styles import getSampleStyleSheet
|
|
17
|
+
from reportlab.pdfgen import canvas
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Letter portrait. We keep the page size constant across traps so visual
|
|
21
|
+
# scoring across the suite is comparable.
|
|
22
|
+
PAGE_WIDTH, PAGE_HEIGHT = LETTER
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(slots=True)
|
|
26
|
+
class FontSpec:
|
|
27
|
+
"""Font selection + size."""
|
|
28
|
+
|
|
29
|
+
family: str = "Helvetica"
|
|
30
|
+
size: float = 11.0
|
|
31
|
+
bold: bool = False
|
|
32
|
+
|
|
33
|
+
@property
|
|
34
|
+
def name(self) -> str:
|
|
35
|
+
return f"{self.family}-Bold" if self.bold else self.family
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# A reproducible RNG that's seeded per-case. Generators must use this
|
|
39
|
+
# (not the global ``random``) so byte-identical PDFs come out of byte-
|
|
40
|
+
# identical seeds even when many generators run in the same process.
|
|
41
|
+
def rng_for(seed: int) -> random.Random:
    """Build a private ``random.Random`` instance seeded with *seed*.

    Every call returns a new, independent generator, so one generator's
    draws never perturb another's and the module-level ``random`` state
    is left untouched.
    """
    generator = random.Random(seed)
    return generator
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def draw_paragraph(
    c: "canvas.Canvas",
    text: str,
    x: float,
    y: float,
    *,
    width: float = PAGE_WIDTH - 144,
    font: FontSpec = FontSpec(),
    leading: float | None = None,
) -> float:
    """Draw wrapped text with simple word-wrap. Returns the next free y.

    We hand-roll wrapping rather than using Platypus flowables because
    every trap family wants pixel-precise control over where text lands
    (especially for tiny footnotes and split-table headers).

    Args:
        c: Canvas to draw on. Its current font is changed to ``font``.
        text: Text to draw. It is split on whitespace, so runs of spaces
            and newlines collapse to single spaces.
        x: Left edge of every line.
        y: Baseline of the first line.
        width: Maximum line width, measured with ``c.stringWidth``.
        font: Font used for both measuring and drawing.
        leading: Distance between baselines. A falsy value (``None`` or
            ``0``) falls back to ``font.size * 1.25``.

    Returns:
        The baseline one ``leading`` below the last drawn line, or ``y``
        unchanged when ``text`` contains no words.
    """
    line_gap = leading or (font.size * 1.25)
    c.setFont(font.name, font.size)

    cursor_y = y
    pending: list[str] = []
    for word in text.split():
        candidate = " ".join([*pending, word])
        # Only wrap when there is something to flush. A single word wider
        # than ``width`` is kept on its own line; flushing an empty line
        # first would insert a spurious blank line above it.
        if pending and c.stringWidth(candidate, font.name, font.size) > width:
            c.drawString(x, cursor_y, " ".join(pending))
            cursor_y -= line_gap
            pending = [word]
        else:
            pending.append(word)

    if pending:
        c.drawString(x, cursor_y, " ".join(pending))
        cursor_y -= line_gap
    return cursor_y
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def draw_invisible_text(c: "canvas.Canvas", text: str, x: float, y: float, *, size: float = 11.0) -> None:
    """Place a string in the PDF text stream that is invisible to the eye.

    This is the core trick behind :mod:`hidden_ocr_mismatch`. The string
    is emitted with text render mode 3 (neither stroke nor fill), so a
    human reader, or a vision-only model looking at the page's pixels,
    sees nothing, while a text-extraction pipeline that reads the
    underlying text stream sees the string. The two answers diverge.

    This mirrors how scanned-then-re-OCR'd PDFs go wrong in the wild:
    the OCR layer can drift from the rendered page. Constructing the
    mismatch procedurally means both answers are known and either can be
    scored.

    Args:
        c: Canvas to draw on.
        text: The hidden string.
        x: Left edge of the hidden string.
        y: Baseline of the hidden string.
        size: Point size of the (Helvetica) hidden string.
    """
    hidden = c.beginText(x, y)
    hidden.setFont("Helvetica", size)
    # Mode 3 places the glyphs in the content stream without painting
    # them; ordinary visible text uses mode 0.
    hidden.setTextRenderMode(3)
    hidden.textOut(text)
    c.drawText(hidden)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def draw_table(
    c: "canvas.Canvas",
    rows: Sequence[Sequence[str]],
    x: float,
    y: float,
    *,
    col_widths: Sequence[float] | None = None,
    row_height: float = 24,
    font: FontSpec = FontSpec(size=10),
    header_bold: bool = True,
) -> float:
    """Draw a borderless, left-aligned table. Returns the next free y.

    Each generator that needs tables uses this to avoid reportlab's
    Platypus tables (which paginate awkwardly when we explicitly *want*
    to split a row across a page boundary).

    Args:
        c: Canvas to draw on. Its current font is changed.
        rows: Table rows; the first row is treated as the header.
        x: Left edge of the first column.
        y: Baseline of the first row.
        col_widths: Width of each column. When omitted, the usable page
            width (``PAGE_WIDTH - 144``) is divided equally across the
            widest row. Cells beyond ``len(col_widths)`` are not drawn.
        row_height: Distance between row baselines.
        font: Font for body rows, and the family/size for the header.
        header_bold: Draw the first row in the bold face of ``font``.

    Returns:
        The baseline one ``row_height`` below the last row, or ``y``
        unchanged when ``rows`` is empty.
    """
    if not rows:
        return y
    if col_widths is None:
        col_count = max(len(r) for r in rows)
        col_widths = [(PAGE_WIDTH - 144) / col_count] * col_count

    # Derive the header face from the caller's family instead of
    # hard-coding Helvetica, so a Courier table gets a Courier-Bold
    # header. For the default Helvetica font this is "Helvetica-Bold".
    header_font = FontSpec(family=font.family, size=font.size, bold=True).name

    for i, row in enumerate(rows):
        use_header_face = header_bold and i == 0
        c.setFont(header_font if use_header_face else font.name, font.size)
        cur_x = x
        for cell, w in zip(row, col_widths):
            c.drawString(cur_x, y, cell)
            cur_x += w
        y -= row_height
    return y
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def page_break(c: "canvas.Canvas") -> None:
    """End the current page of *c* by calling its ``showPage`` method."""
    finish_page = c.showPage
    finish_page()
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def canvas_to_bytes(make: "Callable[[canvas.Canvas], None]") -> bytes:  # noqa: F821
    """Render *make* onto a fresh Letter-size canvas and return the PDF bytes.

    Centralised so every generator does ``return canvas_to_bytes(draw)``
    rather than duplicating the BytesIO + canvas wiring.

    The canvas is created with ``invariant=True``, which is
    non-negotiable: it asks reportlab for reproducible output
    (deterministic timestamp and document ID), so the same generator +
    seed always yields byte-identical PDFs. Without it the
    reproducibility claim does not hold and a published leaderboard
    cannot be re-derived.

    Args:
        make: Callback that receives the canvas and draws on it. It must
            not call ``save()`` itself; that happens here.

    Returns:
        The complete PDF document as bytes.
    """
    sink = io.BytesIO()
    pdf = canvas.Canvas(sink, pagesize=LETTER, invariant=True)
    make(pdf)
    pdf.save()
    return sink.getvalue()
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def fmt_money(amount: int | float, currency: str = "$") -> str:
    """Format *amount* as ``<currency><thousands-separated, 2-decimal number>``.

    Generators derive the *expected answer* with this exact function so
    that the answer string and the text rendered into the PDF agree to
    the byte. Negative amounts keep the sign after the currency prefix
    (``$-1.00``).
    """
    return "{}{:,.2f}".format(currency, amount)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def pick_from(rng: random.Random, choices: Iterable):
    """Pick one element of *choices* using *rng*.

    ``random.Random.choice`` needs a sequence; this helper materialises
    any iterable (generator, set, ...) first so callers do not have to
    convert up front.

    NOTE(review): the pick depends on iteration order, so passing a set
    of strings is only reproducible if its iteration order is; confirm
    callers pass ordered iterables where determinism matters.
    """
    pool = [*choices]
    return rng.choice(pool)
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""Trap family: footnote override.
|
|
2
|
+
|
|
3
|
+
The body of the document states a contractual position confidently — for
|
|
4
|
+
example, *"Customer's liability shall be capped at twelve (12) months of
|
|
5
|
+
fees paid."* — but a 6pt footnote near the bottom of the page overrides
|
|
6
|
+
it: *"Notwithstanding the foregoing, liability for breaches of Sections
|
|
7
|
+
4.2 (Confidentiality) and 7.1 (Data Protection) shall be uncapped."*
|
|
8
|
+
|
|
9
|
+
A model that summarises the document by reading only the body will state
|
|
10
|
+
the cap is 12 months. The correct answer is "12 months, except for
|
|
11
|
+
breaches of Sections 4.2 and 7.1, which are uncapped." Missing the
|
|
12
|
+
footnote is the single most common failure mode for legal/contract AI
|
|
13
|
+
agents.
|
|
14
|
+
|
|
15
|
+
We procedurally fabricate this so the *exact* set of carve-outs is in
|
|
16
|
+
the answer key.
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import random
|
|
21
|
+
|
|
22
|
+
from reportlab.pdfgen import canvas
|
|
23
|
+
|
|
24
|
+
from ..case import HellCase
|
|
25
|
+
from . import _common as C
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
_CONTRACT_TYPES = [
|
|
29
|
+
"Master Services Agreement",
|
|
30
|
+
"Software License Agreement",
|
|
31
|
+
"Data Processing Addendum",
|
|
32
|
+
"Subscription Order Form",
|
|
33
|
+
"Statement of Work #4",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
_BODY_POSITIONS = [
|
|
37
|
+
    # (label, body_text_template, footnote_template, expected_answer_template, wrong_answer_template)
|
|
38
|
+
(
|
|
39
|
+
"liability_cap",
|
|
40
|
+
"The aggregate liability of either party for any claims arising out of or relating to "
|
|
41
|
+
"this Agreement shall not exceed an amount equal to {months} months of fees paid by Customer "
|
|
42
|
+
"during the twelve (12) month period immediately preceding the event giving rise to such liability.",
|
|
43
|
+
"Notwithstanding Section {section_num}, liability arising from "
|
|
44
|
+
"Sections {carveout_sections} shall be uncapped.",
|
|
45
|
+
"Liability is capped at {months} months of fees paid, EXCEPT that liability arising from "
|
|
46
|
+
"Sections {carveout_sections} is uncapped.",
|
|
47
|
+
"Liability is capped at {months} months of fees paid.", # the wrong/forbidden answer
|
|
48
|
+
),
|
|
49
|
+
(
|
|
50
|
+
"termination_notice",
|
|
51
|
+
"Either party may terminate this Agreement for convenience upon "
|
|
52
|
+
"{notice_days} days written notice to the other party.",
|
|
53
|
+
"However, termination for convenience is not permitted during the "
|
|
54
|
+
"initial twelve (12) month term.",
|
|
55
|
+
"Either party may terminate for convenience on {notice_days} days notice, "
|
|
56
|
+
"BUT NOT during the initial 12-month term.",
|
|
57
|
+
"Either party may terminate for convenience on {notice_days} days notice.",
|
|
58
|
+
),
|
|
59
|
+
(
|
|
60
|
+
"data_residency",
|
|
61
|
+
"Customer Data shall be stored and processed in the {primary_region} region.",
|
|
62
|
+
"Provided that, with Customer's written consent, Customer Data may also "
|
|
63
|
+
"be processed in {fallback_region} for purposes of disaster recovery.",
|
|
64
|
+
"Customer Data is stored in {primary_region}, with disaster-recovery "
|
|
65
|
+
"processing permitted in {fallback_region} ONLY with written consent.",
|
|
66
|
+
"Customer Data is stored in {primary_region}.",
|
|
67
|
+
),
|
|
68
|
+
]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _random_sections(rng: random.Random) -> tuple[str, str]:
|
|
72
|
+
"""Return ``(section_num, carveout_sections)`` for the carve-out clause."""
|
|
73
|
+
sec = f"{rng.randint(8, 14)}.{rng.randint(1, 5)}"
|
|
74
|
+
carve = ", ".join(
|
|
75
|
+
sorted(
|
|
76
|
+
{
|
|
77
|
+
f"{rng.randint(2, 7)}.{rng.randint(1, 4)}"
|
|
78
|
+
for _ in range(rng.randint(2, 3))
|
|
79
|
+
}
|
|
80
|
+
)
|
|
81
|
+
)
|
|
82
|
+
return sec, carve
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def generate(seed: int) -> tuple[bytes, HellCase]:
    """Build one footnote-override case for *seed*.

    A contract type and a clause family are picked with a seed-local
    RNG, the clause is rendered as body text with a ``¹`` marker, and a
    6pt footnote near the bottom of the page carries the overriding
    exception.

    Args:
        seed: Seed for the per-case RNG; also embedded in the case id.

    Returns:
        ``(pdf_bytes, case)``: the rendered single-page PDF and the
        ``HellCase`` holding the question, expected answer, required
        tokens and the forbidden (body-only) answer.
    """
    rng = C.rng_for(seed)
    contract = rng.choice(_CONTRACT_TYPES)
    label, body_tpl, footnote_tpl, expected_tpl, wrong_tpl = rng.choice(_BODY_POSITIONS)

    # Bind the per-template parameters.
    # expected_tokens are the substrings any acceptable prose answer must
    # contain — facts, not phrasing. The scorer requires ALL tokens.
    if label == "liability_cap":
        months = rng.choice([3, 6, 12, 24])
        section_num, carveout_sections = _random_sections(rng)
        ctx = {"months": months, "section_num": section_num, "carveout_sections": carveout_sections}
        question = (
            f"Read the attached {contract}. What is the LIABILITY CAP "
            "and what carve-outs (if any) apply? Be precise about which Sections are uncapped."
        )
        # Acceptable: any prose that includes (1) the cap value, (2) the
        # carve-out section refs, (3) the word "uncapped".
        expected_tokens = [
            f"{months} month",
            "uncapped",
            *carveout_sections.split(", "),
        ]
    elif label == "termination_notice":
        notice_days = rng.choice([30, 60, 90])
        ctx = {"notice_days": notice_days}
        question = (
            f"Read the attached {contract}. Under what conditions can either party "
            "terminate this Agreement for convenience? Be specific about any restrictions."
        )
        # NOTE(review): the expected-answer template spells the restriction
        # "12-month" (hyphen) while this token uses a space — confirm the
        # scorer normalises hyphens, otherwise the reference answer itself
        # would miss this token.
        expected_tokens = [
            f"{notice_days} day",
            "12 month",  # the initial-term restriction
        ]
    else:  # data_residency
        # The two region pools are disjoint, so primary and fallback can
        # never coincide and no re-draw is needed.
        primary_region = rng.choice(["us-east-1", "eu-west-1", "ap-southeast-2"])
        fallback_region = rng.choice(["us-west-2", "eu-central-1", "ap-northeast-1"])
        ctx = {"primary_region": primary_region, "fallback_region": fallback_region}
        question = (
            f"Read the attached {contract}. Where is Customer Data stored, "
            "and under what conditions (if any) may it be processed elsewhere?"
        )
        expected_tokens = [
            primary_region,
            fallback_region,
            "consent",
        ]

    body_text = body_tpl.format(**ctx)
    footnote_text = footnote_tpl.format(**ctx)
    expected_answer = expected_tpl.format(**ctx)
    wrong_answer = wrong_tpl.format(**ctx)

    # Always draw a heading label so the RNG stream (and therefore the
    # effective date below) is the same for every clause family. When the
    # footnote cites a section number ("Notwithstanding Section N.M"),
    # the clause heading must carry that same number, otherwise the
    # footnote points at a section that does not exist in the document.
    drawn_label = f"{rng.randint(8, 14)}.{rng.randint(1, 5)}"
    section_label = ctx.get("section_num", drawn_label)

    case_id = f"footnote_override-{seed:04d}"

    def draw(c: canvas.Canvas) -> None:
        # Heading
        c.setFont("Helvetica-Bold", 16)
        c.drawString(72, 720, contract.upper())
        c.setFont("Helvetica-Oblique", 10)
        c.drawString(72, 700, f"Effective Date: 2026-{rng.randint(1, 12):02d}-{rng.randint(1, 28):02d}")

        # Intro paragraph (filler so the doc looks normal)
        intro = (
            "This Agreement is entered into between the Customer and Vendor (each a "
            '"Party" and collectively the "Parties") and governs the Parties\' '
            "respective rights and obligations with respect to the Services described in the Order Form. "
            "Capitalised terms used but not defined herein have the meanings given in the Order Form."
        )
        y = C.draw_paragraph(c, intro, 72, 670, font=C.FontSpec(size=10))

        # The clause of interest (body)
        c.setFont("Helvetica-Bold", 11)
        c.drawString(72, y - 10, f"{section_label} Limitation.")
        y = C.draw_paragraph(
            c,
            body_text + "¹",  # superscript 1 — the footnote marker
            72, y - 30,
            font=C.FontSpec(size=11),
        )

        # More filler so the footnote isn't suspiciously isolated
        filler = (
            "Each Party shall comply with all applicable laws and regulations in connection with "
            "its performance under this Agreement and shall promptly notify the other Party of any "
            "material non-compliance of which it becomes aware. The provisions of this Section shall "
            "survive termination of this Agreement."
        )
        C.draw_paragraph(c, filler, 72, y - 10, font=C.FontSpec(size=10))

        # The footnote (6pt — the trap)
        # We deliberately place it near the bottom of page 1 to mimic
        # real legal-doc layout where footnotes get visually compressed.
        c.setFont("Helvetica", 6)
        c.drawString(72, 100, f"¹ {footnote_text}")

        # Page number
        c.setFont("Helvetica", 9)
        c.drawCentredString(C.PAGE_WIDTH / 2, 60, "Page 1 of 1")

    pdf_bytes = C.canvas_to_bytes(draw)

    case = HellCase(
        id=case_id,
        trap_family="footnote_override",
        seed=seed,
        question=question,
        expected_answer=expected_answer,
        expected_tokens=expected_tokens,
        forbidden_answers=[wrong_answer],
        metadata={
            "contract_type": contract,
            "clause_label": label,
            "section_label": section_label,
            "params": ctx,
            "footnote_text": footnote_text,
            "expected_failure_mode": (
                "Model reads the body clause and ignores the 6pt footnote, missing the "
                "material carve-out / exception."
            ),
        },
    )
    return pdf_bytes, case
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Trap family: hidden OCR mismatch.
|
|
2
|
+
|
|
3
|
+
The PDF *looks* like an invoice with an amount of ``$X``. But beneath
|
|
4
|
+
that visible glyph is an invisible text layer that says ``$Y`` instead
|
|
5
|
+
(rendered with PDF text render mode 3 — placed in the text content
|
|
6
|
+
stream but never rasterised).
|
|
7
|
+
|
|
8
|
+
A vision-only model reads the rendered pixels and answers ``$X``. A
|
|
9
|
+
text-extraction pipeline (pdfminer, PyMuPDF, pdfplumber, most RAG
|
|
10
|
+
loaders) reads the invisible layer and answers ``$Y``. An agent that
|
|
11
|
+
combines both — without resolving the conflict — answers whichever the
|
|
12
|
+
final stage trusted.
|
|
13
|
+
|
|
14
|
+
This is the most common silent failure mode for "PDF understanding" in
|
|
15
|
+
production: a scanned-then-OCR'd document where the OCR layer disagrees
|
|
16
|
+
with the rendered page. We procedurally fabricate it so we know exactly
|
|
17
|
+
which answer is correct (the visible one) and which answer is the
|
|
18
|
+
specific failure mode the trap was designed to catch (the hidden one).
|
|
19
|
+
"""
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import random
|
|
23
|
+
|
|
24
|
+
from reportlab.pdfgen import canvas
|
|
25
|
+
|
|
26
|
+
from ..case import HellCase
|
|
27
|
+
from . import _common as C
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Visible amounts are sampled from a realistic invoice distribution.
|
|
31
|
+
# The hidden amount is *not* a small perturbation — it's a meaningfully
|
|
32
|
+
# different value, because if the agent's downstream user processes it,
|
|
33
|
+
# the dollar gap should be visible.
|
|
34
|
+
_VENDORS = [
|
|
35
|
+
"Acme Industrial Supplies LLC",
|
|
36
|
+
"Northwind Logistics Inc.",
|
|
37
|
+
"Globex Manufacturing Co.",
|
|
38
|
+
"Initech Software Solutions",
|
|
39
|
+
"Hooli Cloud Services, Inc.",
|
|
40
|
+
"Soylent Foods Corp.",
|
|
41
|
+
"Wayne Enterprises LP",
|
|
42
|
+
"Stark Industries Inc.",
|
|
43
|
+
"Tyrell Robotics Group",
|
|
44
|
+
"Cyberdyne Systems Inc.",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def generate(seed: int) -> tuple[bytes, HellCase]:
    """Build one hidden-OCR-mismatch invoice case for *seed*.

    The page shows ``TOTAL DUE`` with the visible amount, and an
    invisible text-layer string carrying a different amount is placed at
    the same coordinate. The visible amount is the expected answer; the
    hidden one is the forbidden answer.

    Args:
        seed: Seed for the per-case RNG; also embedded in the case id.

    Returns:
        ``(pdf_bytes, case)``: the rendered single-page PDF and the
        ``HellCase`` describing the question and answer key.
    """
    rng = C.rng_for(seed)

    # Two distinct amounts; the visible one is the ground truth.
    visible_amount = rng.choice([1234.56, 2750.00, 4980.40, 7820.99, 12345.67, 18900.25])
    hidden_amount = visible_amount + rng.choice([1000, 2000, 5000, 10000])
    vendor = rng.choice(_VENDORS)
    invoice_no = f"INV-{rng.randint(100000, 999999)}"
    due_day = rng.randint(1, 28)

    visible_str = C.fmt_money(visible_amount)
    hidden_str = C.fmt_money(hidden_amount)

    case_id = f"hidden_ocr_mismatch-{seed:04d}"
    question = (
        f"You are an accounts-payable assistant. The attached PDF is invoice "
        f"{invoice_no} from {vendor}. What is the TOTAL AMOUNT DUE? "
        "Respond with only the amount including currency symbol, no other text."
    )

    # Line items must add up to the visible total. The last row absorbs
    # whatever the fixed rows leave over; fixed rows are dropped from the
    # end until that remainder is positive, so a small total (1234.56)
    # never produces a negative "Annual support" line.
    fixed_items = [
        ("Configuration consulting ", 4, 250.00, 1000.00),
        ("On-site implementation ", 1, 1200.00, 1200.00),
    ]
    while fixed_items and visible_amount - sum(row[3] for row in fixed_items) <= 0:
        fixed_items.pop()
    support_total = visible_amount - sum(row[3] for row in fixed_items)
    items = [
        *fixed_items,
        ("Annual support (Tier 2) ", 1, support_total, support_total),
    ]

    def draw(c: canvas.Canvas) -> None:
        # Header
        c.setFont("Helvetica-Bold", 18)
        c.drawString(72, 720, "INVOICE")
        c.setFont("Helvetica", 11)
        c.drawString(72, 700, vendor)
        c.drawString(72, 685, f"Invoice #: {invoice_no}")
        c.drawString(72, 670, f"Due: 2026-08-{due_day:02d}")

        # Line items (a few plausible rows so the doc doesn't look skeletal).
        # The rows below inherit the bold 10pt font that draw_paragraph
        # sets for the column header.
        C.draw_paragraph(
            c,
            "ITEM QTY UNIT LINE TOTAL",
            72, 620,
            font=C.FontSpec(size=10, bold=True),
        )
        y = 600
        for desc, qty, unit, total in items:
            c.drawString(72, y, f"{desc} {qty:>4} {C.fmt_money(unit):>10} {C.fmt_money(total):>12}")
            y -= 16

        # The "TOTAL DUE" line is what we're asking the model about.
        c.setFont("Helvetica-Bold", 13)
        c.drawString(72, y - 30, "TOTAL DUE:")
        c.drawString(220, y - 30, visible_str)

        # The trap: place the *wrong* amount in an invisible text layer.
        # It lands at the same coordinate as the visible total, so a
        # text-extraction pipeline that reads the underlying text stream
        # by position would see the hidden value, not the visible one.
        C.draw_invisible_text(c, hidden_str, x=220, y=y - 30, size=13)

        # Footer
        c.setFont("Helvetica-Oblique", 9)
        c.drawString(72, 90, "Thank you for your business. Net 30 from invoice date.")

    pdf_bytes = C.canvas_to_bytes(draw)

    case = HellCase(
        id=case_id,
        trap_family="hidden_ocr_mismatch",
        seed=seed,
        question=question,
        expected_answer=visible_str,
        forbidden_answers=[hidden_str],
        metadata={
            "vendor": vendor,
            "invoice_no": invoice_no,
            "visible_amount": visible_amount,
            "hidden_amount": hidden_amount,
            "expected_failure_mode": (
                "Model answers the hidden-OCR amount when it should answer the visible amount; "
                "indicates the model trusted a text-extraction layer over the rendered page."
            ),
        },
    )
    return pdf_bytes, case
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Trap family: split table across pages.
|
|
2
|
+
|
|
3
|
+
A 6-column financial / inventory / pricing table is drawn so that the
|
|
4
|
+
column *header* row appears at the bottom of page 1 and the *body* rows
|
|
5
|
+
appear at the top of page 2. The visual continuity is obvious to a
|
|
6
|
+
human flipping pages but breaks every document-pipeline that processes
|
|
7
|
+
pages independently (most RAG loaders, most OCR pipelines).
|
|
8
|
+
|
|
9
|
+
The trap question asks the model about a specific cell — e.g. "What is
|
|
10
|
+
the Q3 Net Revenue for the Northwest region?". A model that loses the
|
|
11
|
+
header context on page 2 will either confuse columns (returning Gross
|
|
12
|
+
Revenue or Operating Income instead) or refuse to answer. Procedural
|
|
13
|
+
ground truth means we know exactly which column the answer is in.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import random
|
|
18
|
+
|
|
19
|
+
from reportlab.pdfgen import canvas
|
|
20
|
+
|
|
21
|
+
from ..case import HellCase
|
|
22
|
+
from . import _common as C
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
_REGIONS = ["Northwest", "Northeast", "Southwest", "Southeast", "Central"]
|
|
26
|
+
_QUARTERS = ["Q1", "Q2", "Q3", "Q4"]
|
|
27
|
+
_COLUMNS = [
|
|
28
|
+
("Region", "region"),
|
|
29
|
+
("Quarter", "quarter"),
|
|
30
|
+
("Gross Revenue", "gross"),
|
|
31
|
+
("Cost of Goods", "cogs"),
|
|
32
|
+
("Operating Income", "op_income"),
|
|
33
|
+
("Net Revenue", "net"),
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _generate_row(rng: random.Random) -> dict:
    """Draw one table row: a region, a quarter and four money figures.

    Cost of goods and operating income are random fractions of gross
    revenue; net revenue is operating income minus a random deduction.
    All money values are rounded to two decimals. The draw order
    (region, quarter, gross, cogs, op_income, net) is part of the
    seed-to-output contract.
    """
    record: dict = {
        "region": rng.choice(_REGIONS),
        "quarter": rng.choice(_QUARTERS),
    }
    revenue = round(rng.uniform(800_000, 5_000_000), 2)
    record["gross"] = revenue
    record["cogs"] = round(revenue * rng.uniform(0.35, 0.55), 2)
    operating = round(revenue * rng.uniform(0.15, 0.30), 2)
    record["op_income"] = operating
    record["net"] = round(operating - rng.uniform(20_000, 80_000), 2)
    return record
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def generate(seed: int) -> tuple[bytes, HellCase]:
    """Build one split-table case for *seed*.

    Eight rows with unique (region, quarter) keys are generated. The
    column header row is drawn near the bottom of page 1 and the body
    rows at the top of page 2 with no repeated header. The question
    targets one money cell; the forbidden answer is another money
    column's value from the same row.

    Args:
        seed: Seed for the per-case RNG; also embedded in the case id.

    Returns:
        ``(pdf_bytes, case)``: the rendered two-page PDF and the
        ``HellCase`` describing the question and answer key.
    """
    rng = C.rng_for(seed)

    # Collect 8 rows, rejecting any repeated (region, quarter) pair.
    table: list[dict] = []
    used_keys: set[tuple[str, str]] = set()
    while len(table) < 8:
        candidate = _generate_row(rng)
        pair = (candidate["region"], candidate["quarter"])
        if pair not in used_keys:
            used_keys.add(pair)
            table.append(candidate)

    # One target row and one target money column (region/quarter excluded).
    money_columns = _COLUMNS[2:]
    target_row = rng.choice(table)
    target_column_label, target_column_key = rng.choice(money_columns)
    expected_value = target_row[target_column_key]
    expected_str = C.fmt_money(expected_value)

    # The most plausible *wrong* answer: a different money column from
    # the same row (the "column-confusion" failure mode that page-split
    # tables specifically elicit).
    decoy_keys = [key for _, key in money_columns if key != target_column_key]
    decoy_key = rng.choice(decoy_keys)
    wrong_str = C.fmt_money(target_row[decoy_key])

    case_id = f"split_table_across_pages-{seed:04d}"
    question = (
        f"The attached PDF contains a financial-results table. "
        f"What was the {target_column_label} for the {target_row['region']} region in "
        f"{target_row['quarter']} of 2026? Respond with only the dollar amount, no other text."
    )

    # Shared table geometry for both pages.
    col_widths = [80, 60, 100, 100, 100, 100]
    x_start = 72

    def put_cells(c: canvas.Canvas, cells: list[str], baseline: float) -> None:
        # Draw one row of cells left to right at the given baseline.
        left = x_start
        for text, span in zip(cells, col_widths):
            c.drawString(left, baseline, text)
            left += span

    def draw(c: canvas.Canvas) -> None:
        # Page 1 — intro + header row at the bottom (the trap)
        c.setFont("Helvetica-Bold", 16)
        c.drawString(72, 720, "FY2026 REGIONAL FINANCIAL SUMMARY")
        c.setFont("Helvetica", 10)
        C.draw_paragraph(
            c,
            "The following table summarises gross and net revenue, cost of goods sold, and operating "
            "income by region and quarter for fiscal year 2026. All amounts are reported in USD "
            "and exclude inter-regional transfers. See Appendix B for the methodology used to allocate "
            "shared infrastructure costs across regions.",
            72, 690,
            font=C.FontSpec(size=10),
        )

        # Second filler paragraph, above the header row
        C.draw_paragraph(
            c,
            "Note that Q3 results reflect the regional reorganisation announced in our Q2 earnings "
            "call. Comparisons to prior years should account for the boundary shift between the "
            "Northwest and Central regions effective 2026-07-01.",
            72, 620,
            font=C.FontSpec(size=10),
        )

        # Header row at the bottom of page 1
        c.setFont("Helvetica-Bold", 10)
        put_cells(c, [heading for heading, _ in _COLUMNS], 130)

        # Page footer
        c.setFont("Helvetica", 9)
        c.drawCentredString(C.PAGE_WIDTH / 2, 60, "Page 1 of 2")

        # Page break — body rows go on page 2 with no repeated header
        C.page_break(c)

        # Page 2 — the body rows, headerless
        c.setFont("Helvetica", 10)
        baseline = 720
        for entry in table:
            put_cells(
                c,
                [
                    entry["region"],
                    entry["quarter"],
                    C.fmt_money(entry["gross"]),
                    C.fmt_money(entry["cogs"]),
                    C.fmt_money(entry["op_income"]),
                    C.fmt_money(entry["net"]),
                ],
                baseline,
            )
            baseline -= 22

        # Page footer
        c.setFont("Helvetica", 9)
        c.drawCentredString(C.PAGE_WIDTH / 2, 60, "Page 2 of 2")

    pdf_bytes = C.canvas_to_bytes(draw)

    case = HellCase(
        id=case_id,
        trap_family="split_table_across_pages",
        seed=seed,
        question=question,
        expected_answer=expected_str,
        forbidden_answers=[wrong_str],
        metadata={
            "target_region": target_row["region"],
            "target_quarter": target_row["quarter"],
            "target_column": target_column_label,
            "target_column_key": target_column_key,
            "expected_value": expected_value,
            "row_count": len(table),
            "expected_failure_mode": (
                "Model loses column-header context when reading page 2 in isolation; "
                "returns a value from an adjacent column in the same row."
            ),
        },
    )
    return pdf_bytes, case
|