@heylemon/lemonade 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/build-info.json +3 -3
- package/dist/canvas-host/a2ui/.bundle.hash +1 -1
- package/dist/gateway/skills-http.js +74 -19
- package/package.json +1 -1
- package/skills/docx/SKILL.md +25 -30
- package/skills/docx/scripts/accept_changes.py +0 -17
- package/skills/docx/scripts/comment.py +10 -39
- package/skills/docx/scripts/office/helpers/merge_runs.py +1 -33
- package/skills/docx/scripts/office/helpers/simplify_redlines.py +0 -43
- package/skills/docx/scripts/office/pack.py +0 -30
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
- package/skills/docx/scripts/office/soffice.py +0 -55
- package/skills/docx/scripts/office/unpack.py +5 -27
- package/skills/docx/scripts/office/validate.py +19 -14
- package/skills/docx/scripts/office/validators/base.py +48 -224
- package/skills/docx/scripts/office/validators/docx.py +44 -117
- package/skills/docx/scripts/office/validators/pptx.py +2 -42
- package/skills/docx/scripts/office/validators/redlining.py +3 -40
- package/skills/pdf/SKILL.md +22 -15
- package/skills/pdf/{FORMS.md → forms.md} +0 -14
- package/skills/pdf/scripts/check_bounding_boxes.py +0 -5
- package/skills/pdf/scripts/check_fillable_fields.py +0 -1
- package/skills/pdf/scripts/convert_pdf_to_images.py +0 -2
- package/skills/pdf/scripts/create_validation_image.py +0 -4
- package/skills/pdf/scripts/extract_form_field_info.py +1 -31
- package/skills/pdf/scripts/extract_form_structure.py +0 -9
- package/skills/pdf/scripts/fill_fillable_fields.py +0 -23
- package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +3 -38
- package/skills/pptx/SKILL.md +2 -29
- package/skills/pptx/editing.md +2 -2
- package/skills/pptx/pptxgenjs.md +53 -8
- package/skills/pptx/scripts/add_slide.py +0 -30
- package/skills/pptx/scripts/clean.py +0 -23
- package/skills/pptx/scripts/office/helpers/merge_runs.py +1 -33
- package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -43
- package/skills/pptx/scripts/office/pack.py +0 -30
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
- package/skills/pptx/scripts/office/soffice.py +0 -55
- package/skills/pptx/scripts/office/unpack.py +5 -27
- package/skills/pptx/scripts/office/validate.py +19 -14
- package/skills/pptx/scripts/office/validators/base.py +48 -224
- package/skills/pptx/scripts/office/validators/docx.py +44 -117
- package/skills/pptx/scripts/office/validators/pptx.py +2 -42
- package/skills/pptx/scripts/office/validators/redlining.py +3 -40
- package/skills/pptx/scripts/thumbnail.py +0 -31
- package/skills/xlsx/SKILL.md +3 -26
- package/skills/xlsx/scripts/office/helpers/merge_runs.py +1 -33
- package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -43
- package/skills/xlsx/scripts/office/pack.py +0 -30
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
- package/skills/xlsx/scripts/office/soffice.py +0 -55
- package/skills/xlsx/scripts/office/unpack.py +5 -27
- package/skills/xlsx/scripts/office/validate.py +19 -14
- package/skills/xlsx/scripts/office/validators/base.py +48 -224
- package/skills/xlsx/scripts/office/validators/docx.py +44 -117
- package/skills/xlsx/scripts/office/validators/pptx.py +2 -42
- package/skills/xlsx/scripts/office/validators/redlining.py +3 -40
- package/skills/xlsx/scripts/recalc.py +2 -26
- package/skills/docx/scripts/__init__.py +0 -1
- package/skills/docx/scripts/office/helpers/__init__.py +0 -0
- package/skills/docx/scripts/office/validators/__init__.py +0 -15
- package/skills/pptx/scripts/__init__.py +0 -0
- package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
- package/skills/pptx/scripts/office/validators/__init__.py +0 -15
- package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
- package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
- /package/skills/pdf/{REFERENCE.md → reference.md} +0 -0
|
@@ -9,7 +9,6 @@ from pathlib import Path
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class RedliningValidator:
|
|
12
|
-
"""Validator for tracked changes in Word documents."""
|
|
13
12
|
|
|
14
13
|
def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"):
|
|
15
14
|
self.unpacked_dir = Path(unpacked_dir)
|
|
@@ -21,29 +20,23 @@ class RedliningValidator:
|
|
|
21
20
|
}
|
|
22
21
|
|
|
23
22
|
def repair(self) -> int:
|
|
24
|
-
"""No auto-repairs for redlining validation. Returns 0."""
|
|
25
23
|
return 0
|
|
26
24
|
|
|
27
25
|
def validate(self):
|
|
28
|
-
"""Main validation method that returns True if valid, False otherwise."""
|
|
29
|
-
# Verify unpacked directory exists and has correct structure
|
|
30
26
|
modified_file = self.unpacked_dir / "word" / "document.xml"
|
|
31
27
|
if not modified_file.exists():
|
|
32
28
|
print(f"FAILED - Modified document.xml not found at {modified_file}")
|
|
33
29
|
return False
|
|
34
30
|
|
|
35
|
-
# First, check if there are any tracked changes by the author to validate
|
|
36
31
|
try:
|
|
37
32
|
import xml.etree.ElementTree as ET
|
|
38
33
|
|
|
39
34
|
tree = ET.parse(modified_file)
|
|
40
35
|
root = tree.getroot()
|
|
41
36
|
|
|
42
|
-
# Check for w:del or w:ins tags by the specified author
|
|
43
37
|
del_elements = root.findall(".//w:del", self.namespaces)
|
|
44
38
|
ins_elements = root.findall(".//w:ins", self.namespaces)
|
|
45
39
|
|
|
46
|
-
# Filter to only include changes by the specified author
|
|
47
40
|
author_del_elements = [
|
|
48
41
|
elem
|
|
49
42
|
for elem in del_elements
|
|
@@ -55,21 +48,17 @@ class RedliningValidator:
|
|
|
55
48
|
if elem.get(f"{{{self.namespaces['w']}}}author") == self.author
|
|
56
49
|
]
|
|
57
50
|
|
|
58
|
-
# Redlining validation is only needed if tracked changes by the author have been used.
|
|
59
51
|
if not author_del_elements and not author_ins_elements:
|
|
60
52
|
if self.verbose:
|
|
61
53
|
print(f"PASSED - No tracked changes by {self.author} found.")
|
|
62
54
|
return True
|
|
63
55
|
|
|
64
56
|
except Exception:
|
|
65
|
-
# If we can't parse the XML, continue with full validation
|
|
66
57
|
pass
|
|
67
58
|
|
|
68
|
-
# Create temporary directory for unpacking original docx
|
|
69
59
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
70
60
|
temp_path = Path(temp_dir)
|
|
71
61
|
|
|
72
|
-
# Unpack original docx
|
|
73
62
|
try:
|
|
74
63
|
with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
|
|
75
64
|
zip_ref.extractall(temp_path)
|
|
@@ -84,7 +73,6 @@ class RedliningValidator:
|
|
|
84
73
|
)
|
|
85
74
|
return False
|
|
86
75
|
|
|
87
|
-
# Parse both XML files using xml.etree.ElementTree for redlining validation
|
|
88
76
|
try:
|
|
89
77
|
import xml.etree.ElementTree as ET
|
|
90
78
|
|
|
@@ -96,16 +84,13 @@ class RedliningValidator:
|
|
|
96
84
|
print(f"FAILED - Error parsing XML files: {e}")
|
|
97
85
|
return False
|
|
98
86
|
|
|
99
|
-
# Remove the author's tracked changes from both documents
|
|
100
87
|
self._remove_author_tracked_changes(original_root)
|
|
101
88
|
self._remove_author_tracked_changes(modified_root)
|
|
102
89
|
|
|
103
|
-
# Extract and compare text content
|
|
104
90
|
modified_text = self._extract_text_content(modified_root)
|
|
105
91
|
original_text = self._extract_text_content(original_root)
|
|
106
92
|
|
|
107
93
|
if modified_text != original_text:
|
|
108
|
-
# Show detailed character-level differences for each paragraph
|
|
109
94
|
error_message = self._generate_detailed_diff(
|
|
110
95
|
original_text, modified_text
|
|
111
96
|
)
|
|
@@ -117,7 +102,6 @@ class RedliningValidator:
|
|
|
117
102
|
return True
|
|
118
103
|
|
|
119
104
|
def _generate_detailed_diff(self, original_text, modified_text):
|
|
120
|
-
"""Generate detailed word-level differences using git word diff."""
|
|
121
105
|
error_parts = [
|
|
122
106
|
f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes",
|
|
123
107
|
"",
|
|
@@ -132,7 +116,6 @@ class RedliningValidator:
|
|
|
132
116
|
"",
|
|
133
117
|
]
|
|
134
118
|
|
|
135
|
-
# Show git word diff
|
|
136
119
|
git_diff = self._get_git_word_diff(original_text, modified_text)
|
|
137
120
|
if git_diff:
|
|
138
121
|
error_parts.extend(["Differences:", "============", git_diff])
|
|
@@ -142,26 +125,23 @@ class RedliningValidator:
|
|
|
142
125
|
return "\n".join(error_parts)
|
|
143
126
|
|
|
144
127
|
def _get_git_word_diff(self, original_text, modified_text):
|
|
145
|
-
"""Generate word diff using git with character-level precision."""
|
|
146
128
|
try:
|
|
147
129
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
148
130
|
temp_path = Path(temp_dir)
|
|
149
131
|
|
|
150
|
-
# Create two files
|
|
151
132
|
original_file = temp_path / "original.txt"
|
|
152
133
|
modified_file = temp_path / "modified.txt"
|
|
153
134
|
|
|
154
135
|
original_file.write_text(original_text, encoding="utf-8")
|
|
155
136
|
modified_file.write_text(modified_text, encoding="utf-8")
|
|
156
137
|
|
|
157
|
-
# Try character-level diff first for precise differences
|
|
158
138
|
result = subprocess.run(
|
|
159
139
|
[
|
|
160
140
|
"git",
|
|
161
141
|
"diff",
|
|
162
142
|
"--word-diff=plain",
|
|
163
|
-
"--word-diff-regex=.",
|
|
164
|
-
"-U0",
|
|
143
|
+
"--word-diff-regex=.",
|
|
144
|
+
"-U0",
|
|
165
145
|
"--no-index",
|
|
166
146
|
str(original_file),
|
|
167
147
|
str(modified_file),
|
|
@@ -171,9 +151,7 @@ class RedliningValidator:
|
|
|
171
151
|
)
|
|
172
152
|
|
|
173
153
|
if result.stdout.strip():
|
|
174
|
-
# Clean up the output - remove git diff header lines
|
|
175
154
|
lines = result.stdout.split("\n")
|
|
176
|
-
# Skip the header lines (diff --git, index, +++, ---, @@)
|
|
177
155
|
content_lines = []
|
|
178
156
|
in_content = False
|
|
179
157
|
for line in lines:
|
|
@@ -186,13 +164,12 @@ class RedliningValidator:
|
|
|
186
164
|
if content_lines:
|
|
187
165
|
return "\n".join(content_lines)
|
|
188
166
|
|
|
189
|
-
# Fallback to word-level diff if character-level is too verbose
|
|
190
167
|
result = subprocess.run(
|
|
191
168
|
[
|
|
192
169
|
"git",
|
|
193
170
|
"diff",
|
|
194
171
|
"--word-diff=plain",
|
|
195
|
-
"-U0",
|
|
172
|
+
"-U0",
|
|
196
173
|
"--no-index",
|
|
197
174
|
str(original_file),
|
|
198
175
|
str(modified_file),
|
|
@@ -214,18 +191,15 @@ class RedliningValidator:
|
|
|
214
191
|
return "\n".join(content_lines)
|
|
215
192
|
|
|
216
193
|
except (subprocess.CalledProcessError, FileNotFoundError, Exception):
|
|
217
|
-
# Git not available or other error, return None to use fallback
|
|
218
194
|
pass
|
|
219
195
|
|
|
220
196
|
return None
|
|
221
197
|
|
|
222
198
|
def _remove_author_tracked_changes(self, root):
|
|
223
|
-
"""Remove tracked changes authored by the specified author from the XML root."""
|
|
224
199
|
ins_tag = f"{{{self.namespaces['w']}}}ins"
|
|
225
200
|
del_tag = f"{{{self.namespaces['w']}}}del"
|
|
226
201
|
author_attr = f"{{{self.namespaces['w']}}}author"
|
|
227
202
|
|
|
228
|
-
# Remove w:ins elements
|
|
229
203
|
for parent in root.iter():
|
|
230
204
|
to_remove = []
|
|
231
205
|
for child in parent:
|
|
@@ -234,7 +208,6 @@ class RedliningValidator:
|
|
|
234
208
|
for elem in to_remove:
|
|
235
209
|
parent.remove(elem)
|
|
236
210
|
|
|
237
|
-
# Unwrap content in w:del elements where author matches
|
|
238
211
|
deltext_tag = f"{{{self.namespaces['w']}}}delText"
|
|
239
212
|
t_tag = f"{{{self.namespaces['w']}}}t"
|
|
240
213
|
|
|
@@ -244,36 +217,26 @@ class RedliningValidator:
|
|
|
244
217
|
if child.tag == del_tag and child.get(author_attr) == self.author:
|
|
245
218
|
to_process.append((child, list(parent).index(child)))
|
|
246
219
|
|
|
247
|
-
# Process in reverse order to maintain indices
|
|
248
220
|
for del_elem, del_index in reversed(to_process):
|
|
249
|
-
# Convert w:delText to w:t before moving
|
|
250
221
|
for elem in del_elem.iter():
|
|
251
222
|
if elem.tag == deltext_tag:
|
|
252
223
|
elem.tag = t_tag
|
|
253
224
|
|
|
254
|
-
# Move all children of w:del to its parent before removing w:del
|
|
255
225
|
for child in reversed(list(del_elem)):
|
|
256
226
|
parent.insert(del_index, child)
|
|
257
227
|
parent.remove(del_elem)
|
|
258
228
|
|
|
259
229
|
def _extract_text_content(self, root):
|
|
260
|
-
"""Extract text content from Word XML, preserving paragraph structure.
|
|
261
|
-
|
|
262
|
-
Empty paragraphs are skipped to avoid false positives when tracked
|
|
263
|
-
insertions add only structural elements without text content.
|
|
264
|
-
"""
|
|
265
230
|
p_tag = f"{{{self.namespaces['w']}}}p"
|
|
266
231
|
t_tag = f"{{{self.namespaces['w']}}}t"
|
|
267
232
|
|
|
268
233
|
paragraphs = []
|
|
269
234
|
for p_elem in root.findall(f".//{p_tag}"):
|
|
270
|
-
# Get all text elements within this paragraph
|
|
271
235
|
text_parts = []
|
|
272
236
|
for t_elem in p_elem.findall(f".//{t_tag}"):
|
|
273
237
|
if t_elem.text:
|
|
274
238
|
text_parts.append(t_elem.text)
|
|
275
239
|
paragraph_text = "".join(text_parts)
|
|
276
|
-
# Skip empty paragraphs - they don't affect content validation
|
|
277
240
|
if paragraph_text:
|
|
278
241
|
paragraphs.append(paragraph_text)
|
|
279
242
|
|
package/skills/pdf/SKILL.md
CHANGED
|
@@ -1,24 +1,11 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: pdf
|
|
3
|
-
description:
|
|
3
|
+
description: Use this skill whenever the user wants to do anything with PDF files. This includes reading or extracting text/tables from PDFs, combining or merging multiple PDFs into one, splitting PDFs apart, rotating pages, adding watermarks, creating new PDFs, filling PDF forms, encrypting/decrypting PDFs, extracting images, and OCR on scanned PDFs to make them searchable. If the user mentions a .pdf file or asks to produce one, use this skill.
|
|
4
4
|
license: Proprietary. LICENSE.txt has complete terms
|
|
5
5
|
---
|
|
6
6
|
|
|
7
7
|
# PDF Processing Guide
|
|
8
8
|
|
|
9
|
-
## Document Integrity Mode (CRITICAL)
|
|
10
|
-
|
|
11
|
-
When the user asks to **fill an existing PDF** (especially official/government forms), preserve layout exactly:
|
|
12
|
-
|
|
13
|
-
- Never recreate the document from scratch.
|
|
14
|
-
- Never reflow, rewrite, or "clean up" page content.
|
|
15
|
-
- Never convert PDF -> DOCX/Markdown -> PDF for form filling tasks.
|
|
16
|
-
- Never replace or redesign page templates.
|
|
17
|
-
- Always keep the original file unchanged and write to a new output file.
|
|
18
|
-
- Use the workflow in `FORMS.md` exactly (fillable fields first, then fallback path).
|
|
19
|
-
|
|
20
|
-
If the user asks for exact formatting, treat that as strict mode and prioritize minimal-delta edits only.
|
|
21
|
-
|
|
22
9
|
## Overview
|
|
23
10
|
|
|
24
11
|
This guide covers essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see reference.md. If you need to fill out a PDF form, read forms.md and follow its instructions.
|
|
@@ -128,7 +115,7 @@ with pdfplumber.open("document.pdf") as pdf:
|
|
|
128
115
|
# Combine all tables
|
|
129
116
|
if all_tables:
|
|
130
117
|
combined_df = pd.concat(all_tables, ignore_index=True)
|
|
131
|
-
combined_df.to_excel(
|
|
118
|
+
combined_df.to_excel("extracted_tables.xlsx", index=False)
|
|
132
119
|
```
|
|
133
120
|
|
|
134
121
|
### reportlab - Create PDFs
|
|
@@ -179,6 +166,26 @@ story.append(Paragraph("Content for page 2", styles['Normal']))
|
|
|
179
166
|
doc.build(story)
|
|
180
167
|
```
|
|
181
168
|
|
|
169
|
+
#### Subscripts and Superscripts
|
|
170
|
+
|
|
171
|
+
**IMPORTANT**: Never use Unicode subscript/superscript characters (₀₁₂₃₄₅₆₇₈₉, ⁰¹²³⁴⁵⁶⁷⁸⁹) in ReportLab PDFs. The built-in fonts do not include these glyphs, causing them to render as solid black boxes.
|
|
172
|
+
|
|
173
|
+
Instead, use ReportLab's XML markup tags in Paragraph objects:
|
|
174
|
+
```python
|
|
175
|
+
from reportlab.platypus import Paragraph
|
|
176
|
+
from reportlab.lib.styles import getSampleStyleSheet
|
|
177
|
+
|
|
178
|
+
styles = getSampleStyleSheet()
|
|
179
|
+
|
|
180
|
+
# Subscripts: use <sub> tag
|
|
181
|
+
chemical = Paragraph("H<sub>2</sub>O", styles['Normal'])
|
|
182
|
+
|
|
183
|
+
# Superscripts: use <super> tag
|
|
184
|
+
squared = Paragraph("x<super>2</super> + y<super>2</super>", styles['Normal'])
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
For canvas-drawn text (not Paragraph objects), manually adjust the font size and position rather than using Unicode subscripts/superscripts.
|
|
188
|
+
|
|
182
189
|
## Command-Line Tools
|
|
183
190
|
|
|
184
191
|
### pdftotext (poppler-utils)
|
|
@@ -1,17 +1,5 @@
|
|
|
1
1
|
**CRITICAL: You MUST complete these steps in order. Do not skip ahead to writing code.**
|
|
2
2
|
|
|
3
|
-
## Safety Rules (Exact-Formatting Forms)
|
|
4
|
-
|
|
5
|
-
For official/sensitive forms, follow these rules strictly:
|
|
6
|
-
|
|
7
|
-
- Keep the original PDF untouched. Always write to a new file (for example, `original.filled.pdf`).
|
|
8
|
-
- Do not overwrite the input file.
|
|
9
|
-
- Do not regenerate pages or rebuild the PDF from extracted text.
|
|
10
|
-
- Do not use "create PDF" workflows for form-filling requests.
|
|
11
|
-
- Prefer true form-field filling whenever available; this preserves layout best.
|
|
12
|
-
- If the file has no fillable fields, explain that non-fillable fallback uses overlays/annotations and may not be pixel-perfect in every viewer.
|
|
13
|
-
- For non-fillable fallback, ask for a brief confirmation before writing output when exact legal formatting is required.
|
|
14
|
-
|
|
15
3
|
If you need to fill out a PDF form, first check to see if the PDF has fillable form fields. Run this script from this file's directory:
|
|
16
4
|
`python scripts/check_fillable_fields.py <file.pdf>`, and depending on the result go to either the "Fillable fields" or "Non-fillable fields" section and follow those instructions.
|
|
17
5
|
|
|
@@ -86,7 +74,6 @@ Then analyze the images to determine the purpose of each form field (make sure t
|
|
|
86
74
|
- Run the `fill_fillable_fields.py` script from this file's directory to create a filled-in PDF:
|
|
87
75
|
`python scripts/fill_fillable_fields.py <input pdf> <field_values.json> <output pdf>`
|
|
88
76
|
This script will verify that the field IDs and values you provide are valid; if it prints error messages, correct the appropriate fields and try again.
|
|
89
|
-
- Use a new output filename and keep the input unchanged.
|
|
90
77
|
|
|
91
78
|
# Non-fillable fields
|
|
92
79
|
If the PDF doesn't have fillable form fields, you'll add text annotations. First try to extract coordinates from the PDF structure (more accurate), then fall back to visual estimation if needed.
|
|
@@ -295,7 +282,6 @@ Fix any reported errors in fields.json before proceeding.
|
|
|
295
282
|
|
|
296
283
|
The fill script auto-detects the coordinate system and handles conversion:
|
|
297
284
|
`python scripts/fill_pdf_form_with_annotations.py <input.pdf> fields.json <output.pdf>`
|
|
298
|
-
- Use a new output filename and keep the input unchanged.
|
|
299
285
|
|
|
300
286
|
## Step 4: Verify Output
|
|
301
287
|
|
|
@@ -3,8 +3,6 @@ import json
|
|
|
3
3
|
import sys
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
# Script to check that the `fields.json` file that Claude creates when analyzing PDFs
|
|
7
|
-
# does not have overlapping bounding boxes. See FORMS.md.
|
|
8
6
|
|
|
9
7
|
|
|
10
8
|
@dataclass
|
|
@@ -14,7 +12,6 @@ class RectAndField:
|
|
|
14
12
|
field: dict
|
|
15
13
|
|
|
16
14
|
|
|
17
|
-
# Returns a list of messages that are printed to stdout for Claude to read.
|
|
18
15
|
def get_bounding_box_messages(fields_json_stream) -> list[str]:
|
|
19
16
|
messages = []
|
|
20
17
|
fields = json.load(fields_json_stream)
|
|
@@ -32,7 +29,6 @@ def get_bounding_box_messages(fields_json_stream) -> list[str]:
|
|
|
32
29
|
|
|
33
30
|
has_error = False
|
|
34
31
|
for i, ri in enumerate(rects_and_fields):
|
|
35
|
-
# This is O(N^2); we can optimize if it becomes a problem.
|
|
36
32
|
for j in range(i + 1, len(rects_and_fields)):
|
|
37
33
|
rj = rects_and_fields[j]
|
|
38
34
|
if ri.field["page_number"] == rj.field["page_number"] and rects_intersect(ri.rect, rj.rect):
|
|
@@ -63,7 +59,6 @@ if __name__ == "__main__":
|
|
|
63
59
|
if len(sys.argv) != 2:
|
|
64
60
|
print("Usage: check_bounding_boxes.py [fields.json]")
|
|
65
61
|
sys.exit(1)
|
|
66
|
-
# Input file should be in the `fields.json` format described in FORMS.md.
|
|
67
62
|
with open(sys.argv[1]) as f:
|
|
68
63
|
messages = get_bounding_box_messages(f)
|
|
69
64
|
for msg in messages:
|
|
@@ -4,14 +4,12 @@ import sys
|
|
|
4
4
|
from pdf2image import convert_from_path
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
# Converts each page of a PDF to a PNG image.
|
|
8
7
|
|
|
9
8
|
|
|
10
9
|
def convert(pdf_path, output_dir, max_dim=1000):
|
|
11
10
|
images = convert_from_path(pdf_path, dpi=200)
|
|
12
11
|
|
|
13
12
|
for i, image in enumerate(images):
|
|
14
|
-
# Scale image if needed to keep width/height under `max_dim`
|
|
15
13
|
width, height = image.size
|
|
16
14
|
if width > max_dim or height > max_dim:
|
|
17
15
|
scale_factor = min(max_dim / width, max_dim / height)
|
|
@@ -4,12 +4,9 @@ import sys
|
|
|
4
4
|
from PIL import Image, ImageDraw
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
# Creates "validation" images with rectangles for the bounding box information that
|
|
8
|
-
# Claude creates when determining where to add text annotations in PDFs. See FORMS.md.
|
|
9
7
|
|
|
10
8
|
|
|
11
9
|
def create_validation_image(page_number, fields_json_path, input_path, output_path):
|
|
12
|
-
# Input file should be in the `fields.json` format described in FORMS.md.
|
|
13
10
|
with open(fields_json_path, 'r') as f:
|
|
14
11
|
data = json.load(f)
|
|
15
12
|
|
|
@@ -21,7 +18,6 @@ def create_validation_image(page_number, fields_json_path, input_path, output_pa
|
|
|
21
18
|
if field["page_number"] == page_number:
|
|
22
19
|
entry_box = field['entry_bounding_box']
|
|
23
20
|
label_box = field['label_bounding_box']
|
|
24
|
-
# Draw red rectangle over entry bounding box and blue rectangle over the label.
|
|
25
21
|
draw.rectangle(entry_box, outline='red', width=2)
|
|
26
22
|
draw.rectangle(label_box, outline='blue', width=2)
|
|
27
23
|
num_boxes += 2
|
|
@@ -4,11 +4,8 @@ import sys
|
|
|
4
4
|
from pypdf import PdfReader
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
# Extracts data for the fillable form fields in a PDF and outputs JSON that
|
|
8
|
-
# Claude uses to fill the fields. See FORMS.md.
|
|
9
7
|
|
|
10
8
|
|
|
11
|
-
# This matches the format used by PdfReader `get_fields` and `update_page_form_field_values` methods.
|
|
12
9
|
def get_full_annotation_field_id(annotation):
|
|
13
10
|
components = []
|
|
14
11
|
while annotation:
|
|
@@ -25,12 +22,9 @@ def make_field_dict(field, field_id):
|
|
|
25
22
|
if ft == "/Tx":
|
|
26
23
|
field_dict["type"] = "text"
|
|
27
24
|
elif ft == "/Btn":
|
|
28
|
-
field_dict["type"] = "checkbox"
|
|
25
|
+
field_dict["type"] = "checkbox"
|
|
29
26
|
states = field.get("/_States_", [])
|
|
30
27
|
if len(states) == 2:
|
|
31
|
-
# "/Off" seems to always be the unchecked value, as suggested by
|
|
32
|
-
# https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448
|
|
33
|
-
# It can be either first or second in the "/_States_" list.
|
|
34
28
|
if "/Off" in states:
|
|
35
29
|
field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
|
|
36
30
|
field_dict["unchecked_value"] = "/Off"
|
|
@@ -50,15 +44,6 @@ def make_field_dict(field, field_id):
|
|
|
50
44
|
return field_dict
|
|
51
45
|
|
|
52
46
|
|
|
53
|
-
# Returns a list of fillable PDF fields:
|
|
54
|
-
# [
|
|
55
|
-
# {
|
|
56
|
-
# "field_id": "name",
|
|
57
|
-
# "page": 1,
|
|
58
|
-
# "type": ("text", "checkbox", "radio_group", or "choice")
|
|
59
|
-
# // Per-type additional fields described in FORMS.md
|
|
60
|
-
# },
|
|
61
|
-
# ]
|
|
62
47
|
def get_field_info(reader: PdfReader):
|
|
63
48
|
fields = reader.get_fields()
|
|
64
49
|
|
|
@@ -66,19 +51,13 @@ def get_field_info(reader: PdfReader):
|
|
|
66
51
|
possible_radio_names = set()
|
|
67
52
|
|
|
68
53
|
for field_id, field in fields.items():
|
|
69
|
-
# Skip if this is a container field with children, except that it might be
|
|
70
|
-
# a parent group for radio button options.
|
|
71
54
|
if field.get("/Kids"):
|
|
72
55
|
if field.get("/FT") == "/Btn":
|
|
73
56
|
possible_radio_names.add(field_id)
|
|
74
57
|
continue
|
|
75
58
|
field_info_by_id[field_id] = make_field_dict(field, field_id)
|
|
76
59
|
|
|
77
|
-
# Bounding rects are stored in annotations in page objects.
|
|
78
60
|
|
|
79
|
-
# Radio button options have a separate annotation for each choice;
|
|
80
|
-
# all choices have the same field name.
|
|
81
|
-
# See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html
|
|
82
61
|
radio_fields_by_id = {}
|
|
83
62
|
|
|
84
63
|
for page_index, page in enumerate(reader.pages):
|
|
@@ -90,8 +69,6 @@ def get_field_info(reader: PdfReader):
|
|
|
90
69
|
field_info_by_id[field_id]["rect"] = ann.get('/Rect')
|
|
91
70
|
elif field_id in possible_radio_names:
|
|
92
71
|
try:
|
|
93
|
-
# ann['/AP']['/N'] should have two items. One of them is '/Off',
|
|
94
|
-
# the other is the active value.
|
|
95
72
|
on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
|
|
96
73
|
except KeyError:
|
|
97
74
|
continue
|
|
@@ -104,17 +81,11 @@ def get_field_info(reader: PdfReader):
|
|
|
104
81
|
"page": page_index + 1,
|
|
105
82
|
"radio_options": [],
|
|
106
83
|
}
|
|
107
|
-
# Note: at least on macOS 15.7, Preview.app doesn't show selected
|
|
108
|
-
# radio buttons correctly. (It does if you remove the leading slash
|
|
109
|
-
# from the value, but that causes them not to appear correctly in
|
|
110
|
-
# Chrome/Firefox/Acrobat/etc).
|
|
111
84
|
radio_fields_by_id[field_id]["radio_options"].append({
|
|
112
85
|
"value": on_values[0],
|
|
113
86
|
"rect": rect,
|
|
114
87
|
})
|
|
115
88
|
|
|
116
|
-
# Some PDFs have form field definitions without corresponding annotations,
|
|
117
|
-
# so we can't tell where they are. Ignore these fields for now.
|
|
118
89
|
fields_with_location = []
|
|
119
90
|
for field_info in field_info_by_id.values():
|
|
120
91
|
if "page" in field_info:
|
|
@@ -122,7 +93,6 @@ def get_field_info(reader: PdfReader):
|
|
|
122
93
|
else:
|
|
123
94
|
print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")
|
|
124
95
|
|
|
125
|
-
# Sort by page number, then Y position (flipped in PDF coordinate system), then X.
|
|
126
96
|
def sort_key(f):
|
|
127
97
|
if "radio_options" in f:
|
|
128
98
|
rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
1
|
"""
|
|
3
2
|
Extract form structure from a non-fillable PDF.
|
|
4
3
|
|
|
@@ -19,7 +18,6 @@ import pdfplumber
|
|
|
19
18
|
|
|
20
19
|
|
|
21
20
|
def extract_form_structure(pdf_path):
|
|
22
|
-
"""Extract structural elements from a PDF form."""
|
|
23
21
|
structure = {
|
|
24
22
|
"pages": [],
|
|
25
23
|
"labels": [],
|
|
@@ -30,14 +28,12 @@ def extract_form_structure(pdf_path):
|
|
|
30
28
|
|
|
31
29
|
with pdfplumber.open(pdf_path) as pdf:
|
|
32
30
|
for page_num, page in enumerate(pdf.pages, 1):
|
|
33
|
-
# Page info
|
|
34
31
|
structure["pages"].append({
|
|
35
32
|
"page_number": page_num,
|
|
36
33
|
"width": float(page.width),
|
|
37
34
|
"height": float(page.height)
|
|
38
35
|
})
|
|
39
36
|
|
|
40
|
-
# Extract text labels with positions
|
|
41
37
|
words = page.extract_words()
|
|
42
38
|
for word in words:
|
|
43
39
|
structure["labels"].append({
|
|
@@ -49,9 +45,7 @@ def extract_form_structure(pdf_path):
|
|
|
49
45
|
"bottom": round(float(word["bottom"]), 1)
|
|
50
46
|
})
|
|
51
47
|
|
|
52
|
-
# Extract horizontal lines (row separators)
|
|
53
48
|
for line in page.lines:
|
|
54
|
-
# Horizontal lines span most of page width
|
|
55
49
|
if abs(float(line["x1"]) - float(line["x0"])) > page.width * 0.5:
|
|
56
50
|
structure["lines"].append({
|
|
57
51
|
"page": page_num,
|
|
@@ -60,11 +54,9 @@ def extract_form_structure(pdf_path):
|
|
|
60
54
|
"x1": round(float(line["x1"]), 1)
|
|
61
55
|
})
|
|
62
56
|
|
|
63
|
-
# Extract checkboxes (small square rectangles)
|
|
64
57
|
for rect in page.rects:
|
|
65
58
|
width = float(rect["x1"]) - float(rect["x0"])
|
|
66
59
|
height = float(rect["bottom"]) - float(rect["top"])
|
|
67
|
-
# Checkboxes are typically 5-15 points square
|
|
68
60
|
if 5 <= width <= 15 and 5 <= height <= 15 and abs(width - height) < 2:
|
|
69
61
|
structure["checkboxes"].append({
|
|
70
62
|
"page": page_num,
|
|
@@ -76,7 +68,6 @@ def extract_form_structure(pdf_path):
|
|
|
76
68
|
"center_y": round((float(rect["top"]) + float(rect["bottom"])) / 2, 1)
|
|
77
69
|
})
|
|
78
70
|
|
|
79
|
-
# Calculate row boundaries from horizontal lines
|
|
80
71
|
lines_by_page = {}
|
|
81
72
|
for line in structure["lines"]:
|
|
82
73
|
page = line["page"]
|
|
@@ -1,25 +1,16 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import sys
|
|
3
|
-
import os
|
|
4
3
|
|
|
5
4
|
from pypdf import PdfReader, PdfWriter
|
|
6
5
|
|
|
7
6
|
from extract_form_field_info import get_field_info
|
|
8
7
|
|
|
9
8
|
|
|
10
|
-
# Fills fillable form fields in a PDF. See FORMS.md.
|
|
11
9
|
|
|
12
10
|
|
|
13
11
|
def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path: str):
|
|
14
|
-
input_abs = os.path.abspath(input_pdf_path)
|
|
15
|
-
output_abs = os.path.abspath(output_pdf_path)
|
|
16
|
-
if input_abs == output_abs:
|
|
17
|
-
print("ERROR: Refusing to overwrite input PDF. Use a different output path.")
|
|
18
|
-
sys.exit(1)
|
|
19
|
-
|
|
20
12
|
with open(fields_json_path) as f:
|
|
21
13
|
fields = json.load(f)
|
|
22
|
-
# Group by page number.
|
|
23
14
|
fields_by_page = {}
|
|
24
15
|
for field in fields:
|
|
25
16
|
if "value" in field:
|
|
@@ -55,8 +46,6 @@ def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path:
|
|
|
55
46
|
for page, field_values in fields_by_page.items():
|
|
56
47
|
writer.update_page_form_field_values(writer.pages[page - 1], field_values, auto_regenerate=False)
|
|
57
48
|
|
|
58
|
-
# This seems to be necessary for many PDF viewers to format the form values correctly.
|
|
59
|
-
# It may cause the viewer to show a "save changes" dialog even if the user doesn't make any changes.
|
|
60
49
|
writer.set_need_appearances_writer(True)
|
|
61
50
|
|
|
62
51
|
with open(output_pdf_path, "wb") as f:
|
|
@@ -82,18 +71,6 @@ def validation_error_for_field_value(field_info, field_value):
|
|
|
82
71
|
return None
|
|
83
72
|
|
|
84
73
|
|
|
85
|
-
# pypdf (at least version 5.7.0) has a bug when setting the value for a selection list field.
|
|
86
|
-
# In _writer.py around line 966:
|
|
87
|
-
#
|
|
88
|
-
# if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
|
|
89
|
-
# txt = "\n".join(annotation.get_inherited(FA.Opt, []))
|
|
90
|
-
#
|
|
91
|
-
# The problem is that for selection lists, `get_inherited` returns a list of two-element lists like
|
|
92
|
-
# [["value1", "Text 1"], ["value2", "Text 2"], ...]
|
|
93
|
-
# This causes `join` to throw a TypeError because it expects an iterable of strings.
|
|
94
|
-
# The horrible workaround is to patch `get_inherited` to return a list of the value strings.
|
|
95
|
-
# We call the original method and adjust the return value only if the argument to `get_inherited`
|
|
96
|
-
# is `FA.Opt` and if the return value is a list of two-element lists.
|
|
97
74
|
def monkeypatch_pydpf_method():
|
|
98
75
|
from pypdf.generic import DictionaryObject
|
|
99
76
|
from pypdf.constants import FieldDictionaryAttributes
|