@heylemon/lemonade 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/dist/build-info.json +3 -3
  2. package/dist/canvas-host/a2ui/.bundle.hash +1 -1
  3. package/dist/gateway/skills-http.js +74 -19
  4. package/package.json +1 -1
  5. package/skills/docx/SKILL.md +25 -30
  6. package/skills/docx/scripts/accept_changes.py +0 -17
  7. package/skills/docx/scripts/comment.py +10 -39
  8. package/skills/docx/scripts/office/helpers/merge_runs.py +1 -33
  9. package/skills/docx/scripts/office/helpers/simplify_redlines.py +0 -43
  10. package/skills/docx/scripts/office/pack.py +0 -30
  11. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
  12. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
  13. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
  14. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
  15. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
  16. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
  17. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
  18. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
  19. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
  20. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
  21. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
  22. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
  23. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
  24. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
  25. package/skills/docx/scripts/office/soffice.py +0 -55
  26. package/skills/docx/scripts/office/unpack.py +5 -27
  27. package/skills/docx/scripts/office/validate.py +19 -14
  28. package/skills/docx/scripts/office/validators/base.py +48 -224
  29. package/skills/docx/scripts/office/validators/docx.py +44 -117
  30. package/skills/docx/scripts/office/validators/pptx.py +2 -42
  31. package/skills/docx/scripts/office/validators/redlining.py +3 -40
  32. package/skills/pdf/SKILL.md +22 -15
  33. package/skills/pdf/{FORMS.md → forms.md} +0 -14
  34. package/skills/pdf/scripts/check_bounding_boxes.py +0 -5
  35. package/skills/pdf/scripts/check_fillable_fields.py +0 -1
  36. package/skills/pdf/scripts/convert_pdf_to_images.py +0 -2
  37. package/skills/pdf/scripts/create_validation_image.py +0 -4
  38. package/skills/pdf/scripts/extract_form_field_info.py +1 -31
  39. package/skills/pdf/scripts/extract_form_structure.py +0 -9
  40. package/skills/pdf/scripts/fill_fillable_fields.py +0 -23
  41. package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +3 -38
  42. package/skills/pptx/SKILL.md +2 -29
  43. package/skills/pptx/editing.md +2 -2
  44. package/skills/pptx/pptxgenjs.md +53 -8
  45. package/skills/pptx/scripts/add_slide.py +0 -30
  46. package/skills/pptx/scripts/clean.py +0 -23
  47. package/skills/pptx/scripts/office/helpers/merge_runs.py +1 -33
  48. package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -43
  49. package/skills/pptx/scripts/office/pack.py +0 -30
  50. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
  51. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
  52. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
  53. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
  54. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
  55. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
  56. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
  57. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
  58. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
  59. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
  60. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
  61. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
  62. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
  63. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
  64. package/skills/pptx/scripts/office/soffice.py +0 -55
  65. package/skills/pptx/scripts/office/unpack.py +5 -27
  66. package/skills/pptx/scripts/office/validate.py +19 -14
  67. package/skills/pptx/scripts/office/validators/base.py +48 -224
  68. package/skills/pptx/scripts/office/validators/docx.py +44 -117
  69. package/skills/pptx/scripts/office/validators/pptx.py +2 -42
  70. package/skills/pptx/scripts/office/validators/redlining.py +3 -40
  71. package/skills/pptx/scripts/thumbnail.py +0 -31
  72. package/skills/xlsx/SKILL.md +3 -26
  73. package/skills/xlsx/scripts/office/helpers/merge_runs.py +1 -33
  74. package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -43
  75. package/skills/xlsx/scripts/office/pack.py +0 -30
  76. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
  77. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
  78. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
  79. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
  80. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
  81. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
  82. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
  83. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
  84. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
  85. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
  86. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
  87. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
  88. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
  89. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
  90. package/skills/xlsx/scripts/office/soffice.py +0 -55
  91. package/skills/xlsx/scripts/office/unpack.py +5 -27
  92. package/skills/xlsx/scripts/office/validate.py +19 -14
  93. package/skills/xlsx/scripts/office/validators/base.py +48 -224
  94. package/skills/xlsx/scripts/office/validators/docx.py +44 -117
  95. package/skills/xlsx/scripts/office/validators/pptx.py +2 -42
  96. package/skills/xlsx/scripts/office/validators/redlining.py +3 -40
  97. package/skills/xlsx/scripts/recalc.py +2 -26
  98. package/skills/docx/scripts/__init__.py +0 -1
  99. package/skills/docx/scripts/office/helpers/__init__.py +0 -0
  100. package/skills/docx/scripts/office/validators/__init__.py +0 -15
  101. package/skills/pptx/scripts/__init__.py +0 -0
  102. package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  103. package/skills/pptx/scripts/office/validators/__init__.py +0 -15
  104. package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  105. package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
  106. /package/skills/pdf/{REFERENCE.md → reference.md} +0 -0
@@ -9,7 +9,6 @@ from pathlib import Path
9
9
 
10
10
 
11
11
  class RedliningValidator:
12
- """Validator for tracked changes in Word documents."""
13
12
 
14
13
  def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"):
15
14
  self.unpacked_dir = Path(unpacked_dir)
@@ -21,29 +20,23 @@ class RedliningValidator:
21
20
  }
22
21
 
23
22
  def repair(self) -> int:
24
- """No auto-repairs for redlining validation. Returns 0."""
25
23
  return 0
26
24
 
27
25
  def validate(self):
28
- """Main validation method that returns True if valid, False otherwise."""
29
- # Verify unpacked directory exists and has correct structure
30
26
  modified_file = self.unpacked_dir / "word" / "document.xml"
31
27
  if not modified_file.exists():
32
28
  print(f"FAILED - Modified document.xml not found at {modified_file}")
33
29
  return False
34
30
 
35
- # First, check if there are any tracked changes by the author to validate
36
31
  try:
37
32
  import xml.etree.ElementTree as ET
38
33
 
39
34
  tree = ET.parse(modified_file)
40
35
  root = tree.getroot()
41
36
 
42
- # Check for w:del or w:ins tags by the specified author
43
37
  del_elements = root.findall(".//w:del", self.namespaces)
44
38
  ins_elements = root.findall(".//w:ins", self.namespaces)
45
39
 
46
- # Filter to only include changes by the specified author
47
40
  author_del_elements = [
48
41
  elem
49
42
  for elem in del_elements
@@ -55,21 +48,17 @@ class RedliningValidator:
55
48
  if elem.get(f"{{{self.namespaces['w']}}}author") == self.author
56
49
  ]
57
50
 
58
- # Redlining validation is only needed if tracked changes by the author have been used.
59
51
  if not author_del_elements and not author_ins_elements:
60
52
  if self.verbose:
61
53
  print(f"PASSED - No tracked changes by {self.author} found.")
62
54
  return True
63
55
 
64
56
  except Exception:
65
- # If we can't parse the XML, continue with full validation
66
57
  pass
67
58
 
68
- # Create temporary directory for unpacking original docx
69
59
  with tempfile.TemporaryDirectory() as temp_dir:
70
60
  temp_path = Path(temp_dir)
71
61
 
72
- # Unpack original docx
73
62
  try:
74
63
  with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
75
64
  zip_ref.extractall(temp_path)
@@ -84,7 +73,6 @@ class RedliningValidator:
84
73
  )
85
74
  return False
86
75
 
87
- # Parse both XML files using xml.etree.ElementTree for redlining validation
88
76
  try:
89
77
  import xml.etree.ElementTree as ET
90
78
 
@@ -96,16 +84,13 @@ class RedliningValidator:
96
84
  print(f"FAILED - Error parsing XML files: {e}")
97
85
  return False
98
86
 
99
- # Remove the author's tracked changes from both documents
100
87
  self._remove_author_tracked_changes(original_root)
101
88
  self._remove_author_tracked_changes(modified_root)
102
89
 
103
- # Extract and compare text content
104
90
  modified_text = self._extract_text_content(modified_root)
105
91
  original_text = self._extract_text_content(original_root)
106
92
 
107
93
  if modified_text != original_text:
108
- # Show detailed character-level differences for each paragraph
109
94
  error_message = self._generate_detailed_diff(
110
95
  original_text, modified_text
111
96
  )
@@ -117,7 +102,6 @@ class RedliningValidator:
117
102
  return True
118
103
 
119
104
  def _generate_detailed_diff(self, original_text, modified_text):
120
- """Generate detailed word-level differences using git word diff."""
121
105
  error_parts = [
122
106
  f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes",
123
107
  "",
@@ -132,7 +116,6 @@ class RedliningValidator:
132
116
  "",
133
117
  ]
134
118
 
135
- # Show git word diff
136
119
  git_diff = self._get_git_word_diff(original_text, modified_text)
137
120
  if git_diff:
138
121
  error_parts.extend(["Differences:", "============", git_diff])
@@ -142,26 +125,23 @@ class RedliningValidator:
142
125
  return "\n".join(error_parts)
143
126
 
144
127
  def _get_git_word_diff(self, original_text, modified_text):
145
- """Generate word diff using git with character-level precision."""
146
128
  try:
147
129
  with tempfile.TemporaryDirectory() as temp_dir:
148
130
  temp_path = Path(temp_dir)
149
131
 
150
- # Create two files
151
132
  original_file = temp_path / "original.txt"
152
133
  modified_file = temp_path / "modified.txt"
153
134
 
154
135
  original_file.write_text(original_text, encoding="utf-8")
155
136
  modified_file.write_text(modified_text, encoding="utf-8")
156
137
 
157
- # Try character-level diff first for precise differences
158
138
  result = subprocess.run(
159
139
  [
160
140
  "git",
161
141
  "diff",
162
142
  "--word-diff=plain",
163
- "--word-diff-regex=.", # Character-by-character diff
164
- "-U0", # Zero lines of context - show only changed lines
143
+ "--word-diff-regex=.",
144
+ "-U0",
165
145
  "--no-index",
166
146
  str(original_file),
167
147
  str(modified_file),
@@ -171,9 +151,7 @@ class RedliningValidator:
171
151
  )
172
152
 
173
153
  if result.stdout.strip():
174
- # Clean up the output - remove git diff header lines
175
154
  lines = result.stdout.split("\n")
176
- # Skip the header lines (diff --git, index, +++, ---, @@)
177
155
  content_lines = []
178
156
  in_content = False
179
157
  for line in lines:
@@ -186,13 +164,12 @@ class RedliningValidator:
186
164
  if content_lines:
187
165
  return "\n".join(content_lines)
188
166
 
189
- # Fallback to word-level diff if character-level is too verbose
190
167
  result = subprocess.run(
191
168
  [
192
169
  "git",
193
170
  "diff",
194
171
  "--word-diff=plain",
195
- "-U0", # Zero lines of context
172
+ "-U0",
196
173
  "--no-index",
197
174
  str(original_file),
198
175
  str(modified_file),
@@ -214,18 +191,15 @@ class RedliningValidator:
214
191
  return "\n".join(content_lines)
215
192
 
216
193
  except (subprocess.CalledProcessError, FileNotFoundError, Exception):
217
- # Git not available or other error, return None to use fallback
218
194
  pass
219
195
 
220
196
  return None
221
197
 
222
198
  def _remove_author_tracked_changes(self, root):
223
- """Remove tracked changes authored by the specified author from the XML root."""
224
199
  ins_tag = f"{{{self.namespaces['w']}}}ins"
225
200
  del_tag = f"{{{self.namespaces['w']}}}del"
226
201
  author_attr = f"{{{self.namespaces['w']}}}author"
227
202
 
228
- # Remove w:ins elements
229
203
  for parent in root.iter():
230
204
  to_remove = []
231
205
  for child in parent:
@@ -234,7 +208,6 @@ class RedliningValidator:
234
208
  for elem in to_remove:
235
209
  parent.remove(elem)
236
210
 
237
- # Unwrap content in w:del elements where author matches
238
211
  deltext_tag = f"{{{self.namespaces['w']}}}delText"
239
212
  t_tag = f"{{{self.namespaces['w']}}}t"
240
213
 
@@ -244,36 +217,26 @@ class RedliningValidator:
244
217
  if child.tag == del_tag and child.get(author_attr) == self.author:
245
218
  to_process.append((child, list(parent).index(child)))
246
219
 
247
- # Process in reverse order to maintain indices
248
220
  for del_elem, del_index in reversed(to_process):
249
- # Convert w:delText to w:t before moving
250
221
  for elem in del_elem.iter():
251
222
  if elem.tag == deltext_tag:
252
223
  elem.tag = t_tag
253
224
 
254
- # Move all children of w:del to its parent before removing w:del
255
225
  for child in reversed(list(del_elem)):
256
226
  parent.insert(del_index, child)
257
227
  parent.remove(del_elem)
258
228
 
259
229
  def _extract_text_content(self, root):
260
- """Extract text content from Word XML, preserving paragraph structure.
261
-
262
- Empty paragraphs are skipped to avoid false positives when tracked
263
- insertions add only structural elements without text content.
264
- """
265
230
  p_tag = f"{{{self.namespaces['w']}}}p"
266
231
  t_tag = f"{{{self.namespaces['w']}}}t"
267
232
 
268
233
  paragraphs = []
269
234
  for p_elem in root.findall(f".//{p_tag}"):
270
- # Get all text elements within this paragraph
271
235
  text_parts = []
272
236
  for t_elem in p_elem.findall(f".//{t_tag}"):
273
237
  if t_elem.text:
274
238
  text_parts.append(t_elem.text)
275
239
  paragraph_text = "".join(text_parts)
276
- # Skip empty paragraphs - they don't affect content validation
277
240
  if paragraph_text:
278
241
  paragraphs.append(paragraph_text)
279
242
 
@@ -1,24 +1,11 @@
1
1
  ---
2
2
  name: pdf
3
- description: Comprehensive PDF manipulation toolkit for extracting text and tables, creating new PDFs, merging/splitting documents, and handling forms. When Claude needs to fill in a PDF form or programmatically process, generate, or analyze PDF documents at scale.
3
+ description: Use this skill whenever the user wants to do anything with PDF files. This includes reading or extracting text/tables from PDFs, combining or merging multiple PDFs into one, splitting PDFs apart, rotating pages, adding watermarks, creating new PDFs, filling PDF forms, encrypting/decrypting PDFs, extracting images, and OCR on scanned PDFs to make them searchable. If the user mentions a .pdf file or asks to produce one, use this skill.
4
4
  license: Proprietary. LICENSE.txt has complete terms
5
5
  ---
6
6
 
7
7
  # PDF Processing Guide
8
8
 
9
- ## Document Integrity Mode (CRITICAL)
10
-
11
- When the user asks to **fill an existing PDF** (especially official/government forms), preserve layout exactly:
12
-
13
- - Never recreate the document from scratch.
14
- - Never reflow, rewrite, or "clean up" page content.
15
- - Never convert PDF -> DOCX/Markdown -> PDF for form filling tasks.
16
- - Never replace or redesign page templates.
17
- - Always keep the original file unchanged and write to a new output file.
18
- - Use the workflow in `FORMS.md` exactly (fillable fields first, then fallback path).
19
-
20
- If the user asks for exact formatting, treat that as strict mode and prioritize minimal-delta edits only.
21
-
22
9
  ## Overview
23
10
 
24
11
  This guide covers essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see reference.md. If you need to fill out a PDF form, read forms.md and follow its instructions.
@@ -128,7 +115,7 @@ with pdfplumber.open("document.pdf") as pdf:
128
115
  # Combine all tables
129
116
  if all_tables:
130
117
  combined_df = pd.concat(all_tables, ignore_index=True)
131
- combined_df.to_excel(os.path.expanduser("~/Desktop/extracted_tables.xlsx"), index=False)
118
+ combined_df.to_excel("extracted_tables.xlsx", index=False)
132
119
  ```
133
120
 
134
121
  ### reportlab - Create PDFs
@@ -179,6 +166,26 @@ story.append(Paragraph("Content for page 2", styles['Normal']))
179
166
  doc.build(story)
180
167
  ```
181
168
 
169
+ #### Subscripts and Superscripts
170
+
171
+ **IMPORTANT**: Never use Unicode subscript/superscript characters (₀₁₂₃₄₅₆₇₈₉, ⁰¹²³⁴⁵⁶⁷⁸⁹) in ReportLab PDFs. The built-in fonts do not include these glyphs, causing them to render as solid black boxes.
172
+
173
+ Instead, use ReportLab's XML markup tags in Paragraph objects:
174
+ ```python
175
+ from reportlab.platypus import Paragraph
176
+ from reportlab.lib.styles import getSampleStyleSheet
177
+
178
+ styles = getSampleStyleSheet()
179
+
180
+ # Subscripts: use <sub> tag
181
+ chemical = Paragraph("H<sub>2</sub>O", styles['Normal'])
182
+
183
+ # Superscripts: use <super> tag
184
+ squared = Paragraph("x<super>2</super> + y<super>2</super>", styles['Normal'])
185
+ ```
186
+
187
+ For canvas-drawn text (not Paragraph objects), manually adjust the font size and position rather than using Unicode subscripts/superscripts.
188
+
182
189
  ## Command-Line Tools
183
190
 
184
191
  ### pdftotext (poppler-utils)
@@ -1,17 +1,5 @@
1
1
  **CRITICAL: You MUST complete these steps in order. Do not skip ahead to writing code.**
2
2
 
3
- ## Safety Rules (Exact-Formatting Forms)
4
-
5
- For official/sensitive forms, follow these rules strictly:
6
-
7
- - Keep the original PDF untouched. Always write to a new file (for example, `original.filled.pdf`).
8
- - Do not overwrite the input file.
9
- - Do not regenerate pages or rebuild the PDF from extracted text.
10
- - Do not use "create PDF" workflows for form-filling requests.
11
- - Prefer true form-field filling whenever available; this preserves layout best.
12
- - If the file has no fillable fields, explain that non-fillable fallback uses overlays/annotations and may not be pixel-perfect in every viewer.
13
- - For non-fillable fallback, ask for a brief confirmation before writing output when exact legal formatting is required.
14
-
15
3
  If you need to fill out a PDF form, first check to see if the PDF has fillable form fields. Run this script from this file's directory:
16
4
  `python scripts/check_fillable_fields.py <file.pdf>`, and depending on the result go to either the "Fillable fields" or "Non-fillable fields" section and follow those instructions.
17
5
 
@@ -86,7 +74,6 @@ Then analyze the images to determine the purpose of each form field (make sure t
86
74
  - Run the `fill_fillable_fields.py` script from this file's directory to create a filled-in PDF:
87
75
  `python scripts/fill_fillable_fields.py <input pdf> <field_values.json> <output pdf>`
88
76
  This script will verify that the field IDs and values you provide are valid; if it prints error messages, correct the appropriate fields and try again.
89
- - Use a new output filename and keep the input unchanged.
90
77
 
91
78
  # Non-fillable fields
92
79
  If the PDF doesn't have fillable form fields, you'll add text annotations. First try to extract coordinates from the PDF structure (more accurate), then fall back to visual estimation if needed.
@@ -295,7 +282,6 @@ Fix any reported errors in fields.json before proceeding.
295
282
 
296
283
  The fill script auto-detects the coordinate system and handles conversion:
297
284
  `python scripts/fill_pdf_form_with_annotations.py <input.pdf> fields.json <output.pdf>`
298
- - Use a new output filename and keep the input unchanged.
299
285
 
300
286
  ## Step 4: Verify Output
301
287
 
@@ -3,8 +3,6 @@ import json
3
3
  import sys
4
4
 
5
5
 
6
- # Script to check that the `fields.json` file that Claude creates when analyzing PDFs
7
- # does not have overlapping bounding boxes. See FORMS.md.
8
6
 
9
7
 
10
8
  @dataclass
@@ -14,7 +12,6 @@ class RectAndField:
14
12
  field: dict
15
13
 
16
14
 
17
- # Returns a list of messages that are printed to stdout for Claude to read.
18
15
  def get_bounding_box_messages(fields_json_stream) -> list[str]:
19
16
  messages = []
20
17
  fields = json.load(fields_json_stream)
@@ -32,7 +29,6 @@ def get_bounding_box_messages(fields_json_stream) -> list[str]:
32
29
 
33
30
  has_error = False
34
31
  for i, ri in enumerate(rects_and_fields):
35
- # This is O(N^2); we can optimize if it becomes a problem.
36
32
  for j in range(i + 1, len(rects_and_fields)):
37
33
  rj = rects_and_fields[j]
38
34
  if ri.field["page_number"] == rj.field["page_number"] and rects_intersect(ri.rect, rj.rect):
@@ -63,7 +59,6 @@ if __name__ == "__main__":
63
59
  if len(sys.argv) != 2:
64
60
  print("Usage: check_bounding_boxes.py [fields.json]")
65
61
  sys.exit(1)
66
- # Input file should be in the `fields.json` format described in FORMS.md.
67
62
  with open(sys.argv[1]) as f:
68
63
  messages = get_bounding_box_messages(f)
69
64
  for msg in messages:
@@ -2,7 +2,6 @@ import sys
2
2
  from pypdf import PdfReader
3
3
 
4
4
 
5
- # Script for Claude to run to determine whether a PDF has fillable form fields. See FORMS.md.
6
5
 
7
6
 
8
7
  reader = PdfReader(sys.argv[1])
@@ -4,14 +4,12 @@ import sys
4
4
  from pdf2image import convert_from_path
5
5
 
6
6
 
7
- # Converts each page of a PDF to a PNG image.
8
7
 
9
8
 
10
9
  def convert(pdf_path, output_dir, max_dim=1000):
11
10
  images = convert_from_path(pdf_path, dpi=200)
12
11
 
13
12
  for i, image in enumerate(images):
14
- # Scale image if needed to keep width/height under `max_dim`
15
13
  width, height = image.size
16
14
  if width > max_dim or height > max_dim:
17
15
  scale_factor = min(max_dim / width, max_dim / height)
@@ -4,12 +4,9 @@ import sys
4
4
  from PIL import Image, ImageDraw
5
5
 
6
6
 
7
- # Creates "validation" images with rectangles for the bounding box information that
8
- # Claude creates when determining where to add text annotations in PDFs. See FORMS.md.
9
7
 
10
8
 
11
9
  def create_validation_image(page_number, fields_json_path, input_path, output_path):
12
- # Input file should be in the `fields.json` format described in FORMS.md.
13
10
  with open(fields_json_path, 'r') as f:
14
11
  data = json.load(f)
15
12
 
@@ -21,7 +18,6 @@ def create_validation_image(page_number, fields_json_path, input_path, output_pa
21
18
  if field["page_number"] == page_number:
22
19
  entry_box = field['entry_bounding_box']
23
20
  label_box = field['label_bounding_box']
24
- # Draw red rectangle over entry bounding box and blue rectangle over the label.
25
21
  draw.rectangle(entry_box, outline='red', width=2)
26
22
  draw.rectangle(label_box, outline='blue', width=2)
27
23
  num_boxes += 2
@@ -4,11 +4,8 @@ import sys
4
4
  from pypdf import PdfReader
5
5
 
6
6
 
7
- # Extracts data for the fillable form fields in a PDF and outputs JSON that
8
- # Claude uses to fill the fields. See FORMS.md.
9
7
 
10
8
 
11
- # This matches the format used by PdfReader `get_fields` and `update_page_form_field_values` methods.
12
9
  def get_full_annotation_field_id(annotation):
13
10
  components = []
14
11
  while annotation:
@@ -25,12 +22,9 @@ def make_field_dict(field, field_id):
25
22
  if ft == "/Tx":
26
23
  field_dict["type"] = "text"
27
24
  elif ft == "/Btn":
28
- field_dict["type"] = "checkbox" # radio groups handled separately
25
+ field_dict["type"] = "checkbox"
29
26
  states = field.get("/_States_", [])
30
27
  if len(states) == 2:
31
- # "/Off" seems to always be the unchecked value, as suggested by
32
- # https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448
33
- # It can be either first or second in the "/_States_" list.
34
28
  if "/Off" in states:
35
29
  field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
36
30
  field_dict["unchecked_value"] = "/Off"
@@ -50,15 +44,6 @@ def make_field_dict(field, field_id):
50
44
  return field_dict
51
45
 
52
46
 
53
- # Returns a list of fillable PDF fields:
54
- # [
55
- # {
56
- # "field_id": "name",
57
- # "page": 1,
58
- # "type": ("text", "checkbox", "radio_group", or "choice")
59
- # // Per-type additional fields described in FORMS.md
60
- # },
61
- # ]
62
47
  def get_field_info(reader: PdfReader):
63
48
  fields = reader.get_fields()
64
49
 
@@ -66,19 +51,13 @@ def get_field_info(reader: PdfReader):
66
51
  possible_radio_names = set()
67
52
 
68
53
  for field_id, field in fields.items():
69
- # Skip if this is a container field with children, except that it might be
70
- # a parent group for radio button options.
71
54
  if field.get("/Kids"):
72
55
  if field.get("/FT") == "/Btn":
73
56
  possible_radio_names.add(field_id)
74
57
  continue
75
58
  field_info_by_id[field_id] = make_field_dict(field, field_id)
76
59
 
77
- # Bounding rects are stored in annotations in page objects.
78
60
 
79
- # Radio button options have a separate annotation for each choice;
80
- # all choices have the same field name.
81
- # See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html
82
61
  radio_fields_by_id = {}
83
62
 
84
63
  for page_index, page in enumerate(reader.pages):
@@ -90,8 +69,6 @@ def get_field_info(reader: PdfReader):
90
69
  field_info_by_id[field_id]["rect"] = ann.get('/Rect')
91
70
  elif field_id in possible_radio_names:
92
71
  try:
93
- # ann['/AP']['/N'] should have two items. One of them is '/Off',
94
- # the other is the active value.
95
72
  on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
96
73
  except KeyError:
97
74
  continue
@@ -104,17 +81,11 @@ def get_field_info(reader: PdfReader):
104
81
  "page": page_index + 1,
105
82
  "radio_options": [],
106
83
  }
107
- # Note: at least on macOS 15.7, Preview.app doesn't show selected
108
- # radio buttons correctly. (It does if you remove the leading slash
109
- # from the value, but that causes them not to appear correctly in
110
- # Chrome/Firefox/Acrobat/etc).
111
84
  radio_fields_by_id[field_id]["radio_options"].append({
112
85
  "value": on_values[0],
113
86
  "rect": rect,
114
87
  })
115
88
 
116
- # Some PDFs have form field definitions without corresponding annotations,
117
- # so we can't tell where they are. Ignore these fields for now.
118
89
  fields_with_location = []
119
90
  for field_info in field_info_by_id.values():
120
91
  if "page" in field_info:
@@ -122,7 +93,6 @@ def get_field_info(reader: PdfReader):
122
93
  else:
123
94
  print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")
124
95
 
125
- # Sort by page number, then Y position (flipped in PDF coordinate system), then X.
126
96
  def sort_key(f):
127
97
  if "radio_options" in f:
128
98
  rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
@@ -1,4 +1,3 @@
1
- #!/usr/bin/env python3
2
1
  """
3
2
  Extract form structure from a non-fillable PDF.
4
3
 
@@ -19,7 +18,6 @@ import pdfplumber
19
18
 
20
19
 
21
20
  def extract_form_structure(pdf_path):
22
- """Extract structural elements from a PDF form."""
23
21
  structure = {
24
22
  "pages": [],
25
23
  "labels": [],
@@ -30,14 +28,12 @@ def extract_form_structure(pdf_path):
30
28
 
31
29
  with pdfplumber.open(pdf_path) as pdf:
32
30
  for page_num, page in enumerate(pdf.pages, 1):
33
- # Page info
34
31
  structure["pages"].append({
35
32
  "page_number": page_num,
36
33
  "width": float(page.width),
37
34
  "height": float(page.height)
38
35
  })
39
36
 
40
- # Extract text labels with positions
41
37
  words = page.extract_words()
42
38
  for word in words:
43
39
  structure["labels"].append({
@@ -49,9 +45,7 @@ def extract_form_structure(pdf_path):
49
45
  "bottom": round(float(word["bottom"]), 1)
50
46
  })
51
47
 
52
- # Extract horizontal lines (row separators)
53
48
  for line in page.lines:
54
- # Horizontal lines span most of page width
55
49
  if abs(float(line["x1"]) - float(line["x0"])) > page.width * 0.5:
56
50
  structure["lines"].append({
57
51
  "page": page_num,
@@ -60,11 +54,9 @@ def extract_form_structure(pdf_path):
60
54
  "x1": round(float(line["x1"]), 1)
61
55
  })
62
56
 
63
- # Extract checkboxes (small square rectangles)
64
57
  for rect in page.rects:
65
58
  width = float(rect["x1"]) - float(rect["x0"])
66
59
  height = float(rect["bottom"]) - float(rect["top"])
67
- # Checkboxes are typically 5-15 points square
68
60
  if 5 <= width <= 15 and 5 <= height <= 15 and abs(width - height) < 2:
69
61
  structure["checkboxes"].append({
70
62
  "page": page_num,
@@ -76,7 +68,6 @@ def extract_form_structure(pdf_path):
76
68
  "center_y": round((float(rect["top"]) + float(rect["bottom"])) / 2, 1)
77
69
  })
78
70
 
79
- # Calculate row boundaries from horizontal lines
80
71
  lines_by_page = {}
81
72
  for line in structure["lines"]:
82
73
  page = line["page"]
@@ -1,25 +1,16 @@
1
1
  import json
2
2
  import sys
3
- import os
4
3
 
5
4
  from pypdf import PdfReader, PdfWriter
6
5
 
7
6
  from extract_form_field_info import get_field_info
8
7
 
9
8
 
10
- # Fills fillable form fields in a PDF. See FORMS.md.
11
9
 
12
10
 
13
11
  def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path: str):
14
- input_abs = os.path.abspath(input_pdf_path)
15
- output_abs = os.path.abspath(output_pdf_path)
16
- if input_abs == output_abs:
17
- print("ERROR: Refusing to overwrite input PDF. Use a different output path.")
18
- sys.exit(1)
19
-
20
12
  with open(fields_json_path) as f:
21
13
  fields = json.load(f)
22
- # Group by page number.
23
14
  fields_by_page = {}
24
15
  for field in fields:
25
16
  if "value" in field:
@@ -55,8 +46,6 @@ def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path:
55
46
  for page, field_values in fields_by_page.items():
56
47
  writer.update_page_form_field_values(writer.pages[page - 1], field_values, auto_regenerate=False)
57
48
 
58
- # This seems to be necessary for many PDF viewers to format the form values correctly.
59
- # It may cause the viewer to show a "save changes" dialog even if the user doesn't make any changes.
60
49
  writer.set_need_appearances_writer(True)
61
50
 
62
51
  with open(output_pdf_path, "wb") as f:
@@ -82,18 +71,6 @@ def validation_error_for_field_value(field_info, field_value):
82
71
  return None
83
72
 
84
73
 
85
- # pypdf (at least version 5.7.0) has a bug when setting the value for a selection list field.
86
- # In _writer.py around line 966:
87
- #
88
- # if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
89
- # txt = "\n".join(annotation.get_inherited(FA.Opt, []))
90
- #
91
- # The problem is that for selection lists, `get_inherited` returns a list of two-element lists like
92
- # [["value1", "Text 1"], ["value2", "Text 2"], ...]
93
- # This causes `join` to throw a TypeError because it expects an iterable of strings.
94
- # The horrible workaround is to patch `get_inherited` to return a list of the value strings.
95
- # We call the original method and adjust the return value only if the argument to `get_inherited`
96
- # is `FA.Opt` and if the return value is a list of two-element lists.
97
74
  def monkeypatch_pydpf_method():
98
75
  from pypdf.generic import DictionaryObject
99
76
  from pypdf.constants import FieldDictionaryAttributes