@panda-agent/panda-cli 0.1.29 → 0.1.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167) hide show
  1. package/dist/panda-cli-ink.bundle.mjs +258 -247
  2. package/package.json +6 -4
  3. package/skills/.gitkeep +0 -0
  4. package/skills/README.md +13 -0
  5. package/skills/docx/.skill-metadata.yaml +173 -0
  6. package/skills/docx/LICENSE.txt +30 -0
  7. package/skills/docx/SKILL.md +589 -0
  8. package/skills/docx/scripts/__init__.py +1 -0
  9. package/skills/docx/scripts/accept_changes.py +206 -0
  10. package/skills/docx/scripts/comment.py +442 -0
  11. package/skills/docx/scripts/office/helpers/__init__.py +1 -0
  12. package/skills/docx/scripts/office/helpers/merge_runs.py +190 -0
  13. package/skills/docx/scripts/office/helpers/simplify_redlines.py +185 -0
  14. package/skills/docx/scripts/office/pack.py +167 -0
  15. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  16. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  17. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  18. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  19. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  20. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  21. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  22. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  23. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  24. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  25. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  26. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  27. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  28. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  29. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  30. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  31. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  32. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  33. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  34. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  35. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  36. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  37. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  38. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  39. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  40. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  41. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  42. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  43. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  44. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  45. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  46. package/skills/docx/scripts/office/schemas/mce/mc.xsd +75 -0
  47. package/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  48. package/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  49. package/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  50. package/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  51. package/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  52. package/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  53. package/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  54. package/skills/docx/scripts/office/soffice.py +194 -0
  55. package/skills/docx/scripts/office/unpack.py +145 -0
  56. package/skills/docx/scripts/office/validate.py +114 -0
  57. package/skills/docx/scripts/office/validators/__init__.py +16 -0
  58. package/skills/docx/scripts/office/validators/base.py +733 -0
  59. package/skills/docx/scripts/office/validators/docx.py +354 -0
  60. package/skills/docx/scripts/office/validators/pptx.py +230 -0
  61. package/skills/docx/scripts/office/validators/redlining.py +212 -0
  62. package/skills/docx/scripts/templates/comments.xml +3 -0
  63. package/skills/docx/scripts/templates/commentsExtended.xml +3 -0
  64. package/skills/docx/scripts/templates/commentsExtensible.xml +3 -0
  65. package/skills/docx/scripts/templates/commentsIds.xml +3 -0
  66. package/skills/docx/scripts/templates/people.xml +3 -0
  67. package/skills/frontend-design/LICENSE.txt +177 -0
  68. package/skills/frontend-design/SKILL.md +42 -0
  69. package/skills/pdf/.skill-metadata.yaml +273 -0
  70. package/skills/pdf/LICENSE.txt +30 -0
  71. package/skills/pdf/SKILL.md +324 -0
  72. package/skills/pdf/advanced-reference.md +609 -0
  73. package/skills/pdf/form-filling-guide.md +318 -0
  74. package/skills/pdf/forms.md +294 -0
  75. package/skills/pdf/reference.md +612 -0
  76. package/skills/pdf/scripts/check_bounding_boxes.py +198 -0
  77. package/skills/pdf/scripts/check_fillable_fields.py +64 -0
  78. package/skills/pdf/scripts/convert_pdf_to_images.py +102 -0
  79. package/skills/pdf/scripts/create_validation_image.py +125 -0
  80. package/skills/pdf/scripts/extract_form_field_info.py +220 -0
  81. package/skills/pdf/scripts/extract_form_structure.py +202 -0
  82. package/skills/pdf/scripts/fill_fillable_fields.py +205 -0
  83. package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +193 -0
  84. package/skills/pptx-generator/SKILL.md +204 -0
  85. package/skills/pptx-generator/assets/styles/business.json +8 -0
  86. package/skills/pptx-generator/assets/styles/minimal.json +8 -0
  87. package/skills/pptx-generator/assets/styles/modern.json +8 -0
  88. package/skills/pptx-generator/assets/templates/ppt_data_template.json +40 -0
  89. package/skills/pptx-generator/references/collaboration_guide.md +381 -0
  90. package/skills/pptx-generator/references/json_format_spec.md +215 -0
  91. package/skills/pptx-generator/references/layout_guide.md +290 -0
  92. package/skills/pptx-generator/scripts/json_validator.py +194 -0
  93. package/skills/pptx-generator/scripts/pptx_builder.py +340 -0
  94. package/skills/pptx-generator/scripts/pptx_validator.py +162 -0
  95. package/skills/skill-creator/LICENSE.txt +202 -0
  96. package/skills/skill-creator/SKILL.md +479 -0
  97. package/skills/skill-creator/agents/analyzer.md +274 -0
  98. package/skills/skill-creator/agents/comparator.md +202 -0
  99. package/skills/skill-creator/agents/grader.md +223 -0
  100. package/skills/skill-creator/assets/eval_review.html +146 -0
  101. package/skills/skill-creator/eval-viewer/generate_review.py +471 -0
  102. package/skills/skill-creator/eval-viewer/viewer.html +1325 -0
  103. package/skills/skill-creator/references/schemas.md +430 -0
  104. package/skills/skill-creator/scripts/__init__.py +0 -0
  105. package/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  106. package/skills/skill-creator/scripts/generate_report.py +326 -0
  107. package/skills/skill-creator/scripts/improve_description.py +248 -0
  108. package/skills/skill-creator/scripts/package_skill.py +136 -0
  109. package/skills/skill-creator/scripts/quick_validate.py +103 -0
  110. package/skills/skill-creator/scripts/run_eval.py +310 -0
  111. package/skills/skill-creator/scripts/run_loop.py +332 -0
  112. package/skills/skill-creator/scripts/utils.py +47 -0
  113. package/skills/xlsx/.skill-metadata.yaml +185 -0
  114. package/skills/xlsx/LICENSE.txt +30 -0
  115. package/skills/xlsx/SKILL.md +233 -0
  116. package/skills/xlsx/scripts/office/helpers/__init__.py +1 -0
  117. package/skills/xlsx/scripts/office/helpers/merge_runs.py +226 -0
  118. package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +198 -0
  119. package/skills/xlsx/scripts/office/pack.py +162 -0
  120. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
  121. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
  122. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
  123. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
  124. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
  125. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
  126. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
  127. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
  128. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
  129. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
  130. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
  131. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
  132. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
  133. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
  134. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
  135. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
  136. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
  137. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
  138. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
  139. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
  140. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
  141. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
  142. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
  143. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
  144. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
  145. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
  146. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
  147. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
  148. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
  149. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
  150. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
  151. package/skills/xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
  152. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
  153. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
  154. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
  155. package/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
  156. package/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
  157. package/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
  158. package/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
  159. package/skills/xlsx/scripts/office/soffice.py +185 -0
  160. package/skills/xlsx/scripts/office/unpack.py +146 -0
  161. package/skills/xlsx/scripts/office/validate.py +108 -0
  162. package/skills/xlsx/scripts/office/validators/__init__.py +13 -0
  163. package/skills/xlsx/scripts/office/validators/base.py +800 -0
  164. package/skills/xlsx/scripts/office/validators/docx.py +383 -0
  165. package/skills/xlsx/scripts/office/validators/pptx.py +250 -0
  166. package/skills/xlsx/scripts/office/validators/redlining.py +229 -0
  167. package/skills/xlsx/scripts/recalc.py +296 -0
@@ -0,0 +1,220 @@
1
+ """Introspect fillable PDF form fields and serialize their metadata to JSON.
2
+
3
+ Supports text inputs, checkboxes, radio button groups, and dropdown choices.
4
+
5
+ Usage:
6
+ python extract_form_field_info.py <input.pdf> <output.json>
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import sys
12
+ from pathlib import Path
13
+ from typing import Any, Dict, List, Optional, Set
14
+
15
+ import pypdf
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # Constants
19
+ # ---------------------------------------------------------------------------
20
+
21
+ EXIT_SUCCESS: int = 0
22
+ EXIT_FAILURE: int = 1
23
+
24
+ FIELD_TYPE_TEXT: str = "text"
25
+ FIELD_TYPE_CHECKBOX: str = "checkbox"
26
+ FIELD_TYPE_RADIO: str = "radio_group"
27
+ FIELD_TYPE_CHOICE: str = "choice"
28
+
29
+ PDF_FT_TEXT: str = "/Tx"
30
+ PDF_FT_BUTTON: str = "/Btn"
31
+ PDF_FT_CHOICE: str = "/Ch"
32
+
33
+ OFF_STATE: str = "/Off"
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Internal helpers
37
+ # ---------------------------------------------------------------------------
38
+
39
+
40
+ def _resolve_qualified_name(annot_obj: Any) -> Optional[str]:
41
+ """Walk the /Parent chain to assemble a dot-separated field identifier."""
42
+ parts: List[str] = []
43
+ node = annot_obj
44
+ while node is not None:
45
+ name_component = node.get("/T")
46
+ if name_component:
47
+ parts.append(name_component)
48
+ node = node.get("/Parent")
49
+ return ".".join(reversed(parts)) if parts else None
50
+
51
+
52
def _build_field_descriptor(raw_field: Any, identifier: str) -> Dict[str, Any]:
    """Construct a typed descriptor dict from a raw PDF field object.

    Args:
        raw_field: Mapping-like field object; its ``/FT`` and ``/_States_``
            entries are read through ``.get``.
        identifier: Name stored under ``"field_id"`` in the descriptor.

    Returns:
        A dict that always has ``"field_id"`` and ``"type"``. Button fields
        with exactly two states also get ``"checked_value"`` and
        ``"unchecked_value"``; choice fields get ``"choice_options"``.
    """
    descriptor: Dict[str, Any] = {"field_id": identifier}
    field_type_code = raw_field.get("/FT")

    if field_type_code == PDF_FT_TEXT:
        descriptor["type"] = FIELD_TYPE_TEXT

    elif field_type_code == PDF_FT_BUTTON:
        descriptor["type"] = FIELD_TYPE_CHECKBOX
        available_states = raw_field.get("/_States_", [])
        if len(available_states) == 2:
            if OFF_STATE in available_states:
                # Whichever of the two states is not the off state is "on".
                on_val = (
                    available_states[0]
                    if available_states[0] != OFF_STATE
                    else available_states[1]
                )
                descriptor["checked_value"] = on_val
                descriptor["unchecked_value"] = OFF_STATE
            else:
                print(
                    "Unexpected state values for checkbox `%s`. "
                    "Its checked and unchecked values may not be correct; "
                    "if you're trying to check it, visually verify the results."
                    % identifier
                )
                descriptor["checked_value"] = available_states[0]
                descriptor["unchecked_value"] = available_states[1]

    elif field_type_code == PDF_FT_CHOICE:
        descriptor["type"] = FIELD_TYPE_CHOICE
        options: List[Dict[str, Any]] = []
        for opt in raw_field.get("/_States_", []):
            if isinstance(opt, (list, tuple)):
                if not opt:
                    continue
                # Pair form: [export_value, display_text].
                value = opt[0]
                text = opt[1] if len(opt) > 1 else opt[0]
            else:
                # Plain-string form: the same string serves as value and
                # label. Indexing it would pick out single characters.
                value = text = opt
            options.append({"value": value, "text": text})
        descriptor["choice_options"] = options

    else:
        descriptor["type"] = "unknown (%s)" % field_type_code

    return descriptor
94
+
95
+
96
+ def _ordering_key(item: Dict[str, Any]) -> List[Any]:
97
+ """Produce a sort key: page number, then top-to-bottom left-to-right."""
98
+ if "radio_options" in item:
99
+ rect = item["radio_options"][0]["rect"] or [0, 0, 0, 0]
100
+ else:
101
+ rect = item.get("rect") or [0, 0, 0, 0]
102
+ return [item.get("page"), [-rect[1], rect[0]]]
103
+
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # Public API
107
+ # ---------------------------------------------------------------------------
108
+
109
+
110
def get_field_info(pdf_reader: pypdf.PdfReader) -> List[Dict[str, Any]]:
    """Collect descriptors for the document's form fields, with locations.

    Fields without ``/Kids`` become ordinary descriptors; button fields with
    ``/Kids`` are treated as radio-group candidates and assembled from the
    page annotations. Descriptors that never receive a page are reported on
    stdout and dropped. The result is sorted with ``_ordering_key``.
    """
    form_fields = pdf_reader.get_fields()
    if not form_fields:
        return []

    leaf_fields: Dict[str, Dict[str, Any]] = {}
    radio_parents: Set[str] = set()
    for name, field in form_fields.items():
        if not field.get("/Kids"):
            leaf_fields[name] = _build_field_descriptor(field, name)
        elif field.get("/FT") == PDF_FT_BUTTON:
            radio_parents.add(name)

    groups: Dict[str, Dict[str, Any]] = {}
    for page_number, page in enumerate(pdf_reader.pages, start=1):
        for widget in page.get("/Annots", []):
            full_name = _resolve_qualified_name(widget)

            if full_name in leaf_fields:
                entry = leaf_fields[full_name]
                entry["page"] = page_number
                entry["rect"] = widget.get("/Rect")
                continue

            if full_name not in radio_parents:
                continue

            # A radio option widget must expose exactly one non-off
            # appearance state; anything else is skipped.
            try:
                on_states = [
                    state for state in widget["/AP"]["/N"] if state != OFF_STATE
                ]
            except KeyError:
                continue
            if len(on_states) != 1:
                continue

            group = groups.setdefault(
                full_name,
                {
                    "field_id": full_name,
                    "type": FIELD_TYPE_RADIO,
                    "page": page_number,
                    "radio_options": [],
                },
            )
            group["radio_options"].append({
                "value": on_states[0],
                "rect": widget.get("/Rect"),
            })

    # Keep only descriptors that were matched to a page annotation.
    placed: List[Dict[str, Any]] = []
    for entry in leaf_fields.values():
        if "page" in entry:
            placed.append(entry)
        else:
            print(
                "Unable to determine location for field id: %s, ignoring"
                % entry.get("field_id")
            )

    return sorted(placed + list(groups.values()), key=_ordering_key)
172
+
173
+
174
def serialize_field_info(pdf_file: Path, output_json: Path) -> None:
    """Extract field info from *pdf_file* and dump it to *output_json*.

    The JSON is written UTF-8 encoded with a two-space indent, and a one-line
    summary is printed afterwards.
    """
    fields = get_field_info(pypdf.PdfReader(str(pdf_file)))
    with open(output_json, "w", encoding="utf-8") as sink:
        json.dump(fields, sink, indent=2)
    summary = "Wrote %d fields to %s" % (len(fields), output_json)
    print(summary)
181
+
182
+
183
+ # ---------------------------------------------------------------------------
184
+ # CLI
185
+ # ---------------------------------------------------------------------------
186
+
187
+
188
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser: two positional ``Path`` arguments."""
    cli = argparse.ArgumentParser(
        description="Extract fillable form field metadata from a PDF to JSON."
    )
    positionals = (
        ("input_pdf", "Path to the source PDF with form fields."),
        ("output_json", "Destination path for the JSON output."),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, type=Path, help=help_text)
    return cli
204
+
205
+
206
def main() -> None:
    """Run the CLI: check that the input exists, then write the field JSON."""
    options = build_parser().parse_args()
    source: Path = options.input_pdf

    if not source.exists():
        message = "ERROR: File not found: {}".format(source)
        print(message, file=sys.stderr)
        sys.exit(EXIT_FAILURE)

    serialize_field_info(source, options.output_json)
217
+
218
+
219
+ if __name__ == "__main__":
220
+ main()
@@ -0,0 +1,202 @@
1
+ """Analyze non-fillable PDF layout to discover text elements, ruling lines,
2
+ and checkbox-like rectangles.
3
+
4
+ Produces a JSON manifest for downstream coordinate-based form filling.
5
+
6
+ Usage:
7
+ python extract_form_structure.py <input.pdf> <output.json>
8
+ """
9
+
10
+ import argparse
11
+ import json
12
+ import sys
13
+ from pathlib import Path
14
+ from typing import Any, Dict, List
15
+
16
+ import pdfplumber
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # Constants
20
+ # ---------------------------------------------------------------------------
21
+
22
+ EXIT_SUCCESS: int = 0
23
+ EXIT_FAILURE: int = 1
24
+
25
+ # Constraints for identifying checkbox-shaped rectangles
26
+ CHECKBOX_MIN_SIZE: float = 5.0
27
+ CHECKBOX_MAX_SIZE: float = 15.0
28
+ CHECKBOX_ASPECT_TOLERANCE: float = 2.0
29
+
30
+ # Minimum fraction of page width for a line to be considered "spanning"
31
+ SPANNING_LINE_RATIO: float = 0.5
32
+
33
+ COORDINATE_PRECISION: int = 1
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Geometry helpers
37
+ # ---------------------------------------------------------------------------
38
+
39
+
40
def _is_checkbox_rect(rect_obj: Dict[str, Any]) -> bool:
    """Tell whether a rectangle is checkbox-sized and close to square.

    Both sides must lie within the CHECKBOX_MIN_SIZE..CHECKBOX_MAX_SIZE range
    and differ by less than CHECKBOX_ASPECT_TOLERANCE.
    """
    width = float(rect_obj["x1"]) - float(rect_obj["x0"])
    height = float(rect_obj["bottom"]) - float(rect_obj["top"])
    for side in (width, height):
        if not CHECKBOX_MIN_SIZE <= side <= CHECKBOX_MAX_SIZE:
            return False
    return abs(width - height) < CHECKBOX_ASPECT_TOLERANCE
50
+
51
+
52
def _is_spanning_line(line_obj: Dict[str, Any], page_width: float) -> bool:
    """Tell whether the line's x-extent exceeds SPANNING_LINE_RATIO of the page width."""
    right = float(line_obj["x1"])
    left = float(line_obj["x0"])
    threshold = page_width * SPANNING_LINE_RATIO
    return abs(right - left) > threshold
56
+
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Core analysis
60
+ # ---------------------------------------------------------------------------
61
+
62
+
63
def analyze_pdf_layout(pdf_path: Path) -> Dict[str, Any]:
    """Scan every page of a PDF and collect its structural elements.

    Returns a dict with the keys ``pages``, ``labels`` (words), ``lines``
    (lines accepted by ``_is_spanning_line``), ``checkboxes`` (rectangles
    accepted by ``_is_checkbox_rect``) and ``row_boundaries`` (filled in by
    ``_compute_row_boundaries``).
    """

    def _rounded(raw: Any) -> float:
        # All emitted coordinates share the same rounding precision.
        return round(float(raw), COORDINATE_PRECISION)

    pages: List[Dict[str, Any]] = []
    labels: List[Dict[str, Any]] = []
    rules: List[Dict[str, Any]] = []
    boxes: List[Dict[str, Any]] = []
    result: Dict[str, Any] = {
        "pages": pages,
        "labels": labels,
        "lines": rules,
        "checkboxes": boxes,
        "row_boundaries": [],
    }

    with pdfplumber.open(str(pdf_path)) as doc:
        for number, page in enumerate(doc.pages, start=1):
            pages.append({
                "page_number": number,
                "width": float(page.width),
                "height": float(page.height),
            })

            # Word-level text elements.
            labels.extend(
                {
                    "page": number,
                    "text": word["text"],
                    "x0": _rounded(word["x0"]),
                    "top": _rounded(word["top"]),
                    "x1": _rounded(word["x1"]),
                    "bottom": _rounded(word["bottom"]),
                }
                for word in page.extract_words()
            )

            # Lines wide enough to count as spanning the page.
            rules.extend(
                {
                    "page": number,
                    "y": _rounded(rule["top"]),
                    "x0": _rounded(rule["x0"]),
                    "x1": _rounded(rule["x1"]),
                }
                for rule in page.lines
                if _is_spanning_line(rule, page.width)
            )

            # Small, roughly square rectangles.
            for box in page.rects:
                if not _is_checkbox_rect(box):
                    continue
                left = float(box["x0"])
                right = float(box["x1"])
                upper = float(box["top"])
                lower = float(box["bottom"])
                boxes.append({
                    "page": number,
                    "x0": round(left, COORDINATE_PRECISION),
                    "top": round(upper, COORDINATE_PRECISION),
                    "x1": round(right, COORDINATE_PRECISION),
                    "bottom": round(lower, COORDINATE_PRECISION),
                    # Centers are computed from the unrounded edges.
                    "center_x": round((left + right) / 2, COORDINATE_PRECISION),
                    "center_y": round((upper + lower) / 2, COORDINATE_PRECISION),
                })

    # Row boundaries are derived from the collected lines.
    _compute_row_boundaries(result)

    return result
127
+
128
+
129
def _compute_row_boundaries(result: Dict[str, Any]) -> None:
    """Append row intervals to ``result["row_boundaries"]`` in place.

    For each page, the distinct ``y`` values of ``result["lines"]`` are sorted
    and every adjacent pair becomes one row with its top, bottom and height.
    """
    ys_by_page: Dict[int, List[float]] = {}
    for entry in result["lines"]:
        page = entry["page"]
        if page not in ys_by_page:
            ys_by_page[page] = []
        ys_by_page[page].append(entry["y"])

    for page, values in ys_by_page.items():
        ordered = sorted(set(values))
        for upper, lower in zip(ordered, ordered[1:]):
            row = {
                "page": page,
                "row_top": upper,
                "row_bottom": lower,
                "row_height": round(lower - upper, COORDINATE_PRECISION),
            }
            result["row_boundaries"].append(row)
146
+
147
+
148
+ # ---------------------------------------------------------------------------
149
+ # CLI
150
+ # ---------------------------------------------------------------------------
151
+
152
+
153
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser: two positional ``Path`` arguments."""
    summary = (
        "Analyze PDF layout structure: text labels, ruling lines, "
        "and checkbox shapes."
    )
    cli = argparse.ArgumentParser(description=summary)
    positionals = (
        ("input_pdf", "Path to the source PDF to analyze."),
        ("output_json", "Destination path for the JSON structure output."),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, type=Path, help=help_text)
    return cli
172
+
173
+
174
def main() -> None:
    """Run the CLI: analyze the input PDF, save the JSON, print a summary."""
    options = build_parser().parse_args()
    source: Path = options.input_pdf
    destination: Path = options.output_json

    if not source.exists():
        print("ERROR: File not found: {}".format(source), file=sys.stderr)
        sys.exit(EXIT_FAILURE)

    print("Extracting structure from %s..." % source)
    layout = analyze_pdf_layout(source)

    with open(destination, "w", encoding="utf-8") as sink:
        json.dump(layout, sink, indent=2)

    # (label shown to the user, key in the layout dict)
    categories = (
        ("pages", "pages"),
        ("text labels", "labels"),
        ("horizontal lines", "lines"),
        ("checkboxes", "checkboxes"),
        ("row boundaries", "row_boundaries"),
    )
    print("Found:")
    for label, key in categories:
        print(" - %d %s" % (len(layout[key]), label))
    print("Saved to %s" % destination)
199
+
200
+
201
+ if __name__ == "__main__":
202
+ main()
@@ -0,0 +1,205 @@
1
+ """Populate interactive PDF form fields with values specified in a JSON manifest.
2
+
3
+ Validates field IDs, page numbers, and value constraints before writing.
4
+
5
+ Usage:
6
+ python fill_fillable_fields.py <input.pdf> <field_values.json> <output.pdf>
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import sys
12
+ from pathlib import Path
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ import pypdf
16
+
17
+ from extract_form_field_info import get_field_info
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Constants
21
+ # ---------------------------------------------------------------------------
22
+
23
+ EXIT_SUCCESS: int = 0
24
+ EXIT_FAILURE: int = 1
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Validation
28
+ # ---------------------------------------------------------------------------
29
+
30
+
31
+ def _check_value_constraint(descriptor: Dict[str, Any], val: str) -> Optional[str]:
32
+ """Verify that *val* is acceptable for the given field descriptor.
33
+
34
+ Returns an error string or None if valid.
35
+ """
36
+ ftype: str = descriptor["type"]
37
+ fid: str = descriptor["field_id"]
38
+
39
+ if ftype == "checkbox":
40
+ on_val: str = descriptor["checked_value"]
41
+ off_val: str = descriptor["unchecked_value"]
42
+ if val != on_val and val != off_val:
43
+ return (
44
+ 'ERROR: Invalid value "%s" for checkbox field "%s". '
45
+ 'The checked value is "%s" and the unchecked value is "%s"'
46
+ % (val, fid, on_val, off_val)
47
+ )
48
+
49
+ elif ftype == "radio_group":
50
+ allowed: List[str] = [o["value"] for o in descriptor["radio_options"]]
51
+ if val not in allowed:
52
+ return (
53
+ 'ERROR: Invalid value "%s" for radio group field "%s". '
54
+ "Valid values are: %s" % (val, fid, allowed)
55
+ )
56
+
57
+ elif ftype == "choice":
58
+ allowed = [o["value"] for o in descriptor["choice_options"]]
59
+ if val not in allowed:
60
+ return (
61
+ 'ERROR: Invalid value "%s" for choice field "%s". '
62
+ "Valid values are: %s" % (val, fid, allowed)
63
+ )
64
+
65
+ return None
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # pypdf compatibility patch
70
+ # ---------------------------------------------------------------------------
71
+
72
+
73
def _apply_pypdf_option_patch() -> None:
    """Replace ``DictionaryObject.get_inherited`` with a wrapper for options.

    When the requested key is the ``Opt`` attribute and the result is a list
    made up solely of two-element lists, the wrapper returns only the first
    element of each pair. Every other lookup is passed through unchanged.
    """
    from pypdf.generic import DictionaryObject
    from pypdf.constants import FieldDictionaryAttributes

    original_lookup = DictionaryObject.get_inherited

    def _is_pair(entry: Any) -> bool:
        return isinstance(entry, list) and len(entry) == 2

    def _patched(self: Any, key: str, default: Any = None) -> Any:
        found = original_lookup(self, key, default)
        if key != FieldDictionaryAttributes.Opt:
            return found
        if not isinstance(found, list):
            return found
        if not all(_is_pair(entry) for entry in found):
            return found
        # Keep only the first element of each pair.
        return [entry[0] for entry in found]

    DictionaryObject.get_inherited = _patched
93
+
94
+
95
+ # ---------------------------------------------------------------------------
96
+ # Core logic
97
+ # ---------------------------------------------------------------------------
98
+
99
+
100
def populate_fields(src_pdf: Path, values_json: Path, dest_pdf: Path) -> None:
    """Fill the form fields of *src_pdf* and write the result to *dest_pdf*.

    The requested entries are loaded from *values_json* and checked against
    the fields found in the PDF (field id, page number, value). Every problem
    is printed; if any was found the process exits with EXIT_FAILURE before
    anything is written.
    """
    with open(values_json, "r", encoding="utf-8") as handle:
        entries: List[Dict[str, Any]] = json.load(handle)

    # Bucket the requested values per page; entries without a value are skipped.
    values_by_page: Dict[int, Dict[str, str]] = {}
    for entry in entries:
        if "value" in entry:
            bucket = values_by_page.setdefault(entry["page"], {})
            bucket[entry["field_id"]] = entry["value"]

    pdf = pypdf.PdfReader(str(src_pdf))

    # Check every entry against the fields actually present in the PDF.
    catalog: Dict[str, Dict[str, Any]] = {
        info["field_id"]: info for info in get_field_info(pdf)
    }
    failed = False
    for entry in entries:
        field_id: str = entry["field_id"]
        known = catalog.get(field_id)
        if known is None:
            failed = True
            print("ERROR: `%s` is not a valid field ID" % field_id)
            continue
        if entry["page"] != known["page"]:
            failed = True
            print(
                "ERROR: Incorrect page number for `%s` (got %s, expected %s)"
                % (field_id, entry["page"], known["page"])
            )
            continue
        if "value" not in entry:
            continue
        message = _check_value_constraint(known, entry["value"])
        if message:
            print(message)
            failed = True

    if failed:
        sys.exit(EXIT_FAILURE)

    # Apply the values page by page and save the filled document.
    out_doc = pypdf.PdfWriter(clone_from=pdf)
    for page_number, field_values in values_by_page.items():
        target_page = out_doc.pages[page_number - 1]
        out_doc.update_page_form_field_values(
            target_page, field_values, auto_regenerate=False
        )

    out_doc.set_need_appearances_writer(True)

    with open(dest_pdf, "wb") as sink:
        out_doc.write(sink)
153
+
154
+
155
+ # ---------------------------------------------------------------------------
156
+ # CLI
157
+ # ---------------------------------------------------------------------------
158
+
159
+
160
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser: three positional ``Path`` arguments."""
    cli = argparse.ArgumentParser(
        description="Fill interactive PDF form fields using a JSON value manifest."
    )
    positionals = (
        ("input_pdf", "Path to the source PDF with form fields."),
        ("field_values_json", "JSON file specifying field IDs and values."),
        ("output_pdf", "Destination path for the filled PDF."),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, type=Path, help=help_text)
    return cli
181
+
182
+
183
def main() -> None:
    """Run the CLI: check both input files, patch pypdf, fill the form."""
    options = build_parser().parse_args()
    source: Path = options.input_pdf
    manifest: Path = options.field_values_json
    destination: Path = options.output_pdf

    # Both inputs must exist; the first missing one ends the run.
    for required in (source, manifest):
        if not required.exists():
            print("ERROR: File not found: {}".format(required), file=sys.stderr)
            sys.exit(EXIT_FAILURE)

    _apply_pypdf_option_patch()
    populate_fields(source, manifest, destination)
202
+
203
+
204
+ if __name__ == "__main__":
205
+ main()