wormclaude 1.0.119 → 1.0.121

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (232)
  1. package/dist/theme.js +1 -1
  2. package/dist/tui.js +6 -1
  3. package/package.json +1 -1
  4. package/skills/build-mcp-app/SKILL.md +0 -393
  5. package/skills/build-mcp-app/references/abuse-protection.md +0 -60
  6. package/skills/build-mcp-app/references/apps-sdk-messages.md +0 -227
  7. package/skills/build-mcp-app/references/directory-checklist.md +0 -18
  8. package/skills/build-mcp-app/references/iframe-sandbox.md +0 -164
  9. package/skills/build-mcp-app/references/payload-budgeting.md +0 -54
  10. package/skills/build-mcp-app/references/widget-templates.md +0 -249
  11. package/skills/build-mcp-server/SKILL.md +0 -222
  12. package/skills/build-mcp-server/references/auth.md +0 -108
  13. package/skills/build-mcp-server/references/deploy-cloudflare-workers.md +0 -106
  14. package/skills/build-mcp-server/references/elicitation.md +0 -129
  15. package/skills/build-mcp-server/references/remote-http-scaffold.md +0 -211
  16. package/skills/build-mcp-server/references/resources-and-prompts.md +0 -122
  17. package/skills/build-mcp-server/references/server-capabilities.md +0 -164
  18. package/skills/build-mcp-server/references/tool-design.md +0 -189
  19. package/skills/build-mcp-server/references/versions.md +0 -25
  20. package/skills/build-mcpb/SKILL.md +0 -200
  21. package/skills/build-mcpb/references/local-security.md +0 -149
  22. package/skills/build-mcpb/references/manifest-schema.md +0 -156
  23. package/skills/docx/script/__init__.py +0 -1
  24. package/skills/docx/script/accept_chages.py +0 -135
  25. package/skills/docx/script/comment.py +0 -318
  26. package/skills/docx/script/office/helpers/__init__.py +0 -0
  27. package/skills/docx/script/office/helpers/merge_runs.py +0 -199
  28. package/skills/docx/script/office/helpers/simplify_redlines.py +0 -197
  29. package/skills/docx/script/office/pack.py +0 -159
  30. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  31. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  32. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  33. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  34. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  35. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  36. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  37. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  38. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  39. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  40. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  41. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  42. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  43. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  44. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  45. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  46. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  47. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  48. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  49. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  50. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  51. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  52. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  53. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  54. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  55. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  56. package/skills/docx/script/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  57. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  58. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  59. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  60. package/skills/docx/script/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  61. package/skills/docx/script/office/schemas/mce/mc.xsd +0 -75
  62. package/skills/docx/script/office/schemas/microsoft/wml-2010.xsd +0 -560
  63. package/skills/docx/script/office/schemas/microsoft/wml-2012.xsd +0 -67
  64. package/skills/docx/script/office/schemas/microsoft/wml-2018.xsd +0 -14
  65. package/skills/docx/script/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  66. package/skills/docx/script/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  67. package/skills/docx/script/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  68. package/skills/docx/script/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  69. package/skills/docx/script/office/soffice.py +0 -183
  70. package/skills/docx/script/office/unpack.py +0 -132
  71. package/skills/docx/script/office/validate.py +0 -117
  72. package/skills/docx/script/office/validators/__init__.py +0 -15
  73. package/skills/docx/script/office/validators/base.py +0 -851
  74. package/skills/docx/script/office/validators/docx.py +0 -446
  75. package/skills/docx/script/office/validators/pptx.py +0 -275
  76. package/skills/docx/script/office/validators/redlining.py +0 -247
  77. package/skills/docx/script/templates/comments.xml +0 -3
  78. package/skills/docx/script/templates/commentsExtended.xml +0 -3
  79. package/skills/docx/script/templates/commentsExtensible.xml +0 -3
  80. package/skills/docx/script/templates/commentsIds.xml +0 -3
  81. package/skills/docx/script/templates/people.xml +0 -3
  82. package/skills/docx/skill.md +0 -593
  83. package/skills/explain.md +0 -14
  84. package/skills/frontend-design/SKILL.md +0 -42
  85. package/skills/pdf/FORMS.md +0 -294
  86. package/skills/pdf/REFERENCE.md +0 -612
  87. package/skills/pdf/SKILL.md +0 -314
  88. package/skills/pdf/scripts/check_bounding_boxes.py +0 -65
  89. package/skills/pdf/scripts/check_fillable_fields.py +0 -11
  90. package/skills/pdf/scripts/convert_pdf_to_images.py +0 -33
  91. package/skills/pdf/scripts/create_validation_image.py +0 -37
  92. package/skills/pdf/scripts/extract_form_field_info.py +0 -122
  93. package/skills/pdf/scripts/extract_form_structure.py +0 -115
  94. package/skills/pdf/scripts/fill_fillable_fields.py +0 -98
  95. package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +0 -107
  96. package/skills/playground/SKILL.md +0 -77
  97. package/skills/playground/templates/code-map.md +0 -158
  98. package/skills/playground/templates/concept-map.md +0 -73
  99. package/skills/playground/templates/data-explorer.md +0 -67
  100. package/skills/playground/templates/design-playground.md +0 -67
  101. package/skills/playground/templates/diff-review.md +0 -179
  102. package/skills/playground/templates/document-critique.md +0 -171
  103. package/skills/pptx/SKILL.md +0 -230
  104. package/skills/pptx/editing.md +0 -205
  105. package/skills/pptx/pptxgenjs.md +0 -437
  106. package/skills/pptx/scripts/__init__.py +0 -0
  107. package/skills/pptx/scripts/add_slide.py +0 -195
  108. package/skills/pptx/scripts/clean.py +0 -286
  109. package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  110. package/skills/pptx/scripts/office/helpers/merge_runs.py +0 -199
  111. package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -197
  112. package/skills/pptx/scripts/office/pack.py +0 -159
  113. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  114. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  115. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  116. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  117. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  118. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  119. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  120. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  121. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  122. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  123. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  124. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  125. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  126. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  127. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  128. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  129. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  130. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  131. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  132. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  133. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  134. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  135. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  136. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  137. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  138. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  139. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  140. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  141. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  142. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  143. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  144. package/skills/pptx/scripts/office/schemas/mce/mc.xsd +0 -75
  145. package/skills/pptx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
  146. package/skills/pptx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
  147. package/skills/pptx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
  148. package/skills/pptx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  149. package/skills/pptx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  150. package/skills/pptx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  151. package/skills/pptx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  152. package/skills/pptx/scripts/office/soffice.py +0 -183
  153. package/skills/pptx/scripts/office/unpack.py +0 -132
  154. package/skills/pptx/scripts/office/validate.py +0 -117
  155. package/skills/pptx/scripts/office/validators/__init__.py +0 -15
  156. package/skills/pptx/scripts/office/validators/base.py +0 -851
  157. package/skills/pptx/scripts/office/validators/docx.py +0 -446
  158. package/skills/pptx/scripts/office/validators/pptx.py +0 -275
  159. package/skills/pptx/scripts/office/validators/redlining.py +0 -247
  160. package/skills/pptx/scripts/thumbnail.py +0 -289
  161. package/skills/recon.md +0 -16
  162. package/skills/security-audit/SKILL.md +0 -26
  163. package/skills/talent-creator/SKILL.md +0 -486
  164. package/skills/talent-creator/agents/analyzer.md +0 -274
  165. package/skills/talent-creator/agents/comparator.md +0 -202
  166. package/skills/talent-creator/agents/grader.md +0 -223
  167. package/skills/talent-creator/assets/eval_review.html +0 -146
  168. package/skills/talent-creator/eval-viewer/generate_review.py +0 -471
  169. package/skills/talent-creator/eval-viewer/viewer.html +0 -1325
  170. package/skills/talent-creator/references/schemas.md +0 -430
  171. package/skills/talent-creator/scripts/__init__.py +0 -0
  172. package/skills/talent-creator/scripts/aggregate_benchmark.py +0 -401
  173. package/skills/talent-creator/scripts/generate_report.py +0 -326
  174. package/skills/talent-creator/scripts/improve_description.py +0 -247
  175. package/skills/talent-creator/scripts/package_skill.py +0 -136
  176. package/skills/talent-creator/scripts/quick_validate.py +0 -146
  177. package/skills/talent-creator/scripts/run_eval.py +0 -310
  178. package/skills/talent-creator/scripts/run_loop.py +0 -328
  179. package/skills/talent-creator/scripts/utils.py +0 -47
  180. package/skills/xlsx/SKILL.md +0 -300
  181. package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  182. package/skills/xlsx/scripts/office/helpers/merge_runs.py +0 -199
  183. package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -197
  184. package/skills/xlsx/scripts/office/pack.py +0 -159
  185. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  186. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  187. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  188. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  189. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  190. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  191. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  192. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  193. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  194. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  195. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  196. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  197. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  198. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  199. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  200. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  201. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  202. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  203. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  204. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  205. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  206. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  207. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  208. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  209. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  210. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  211. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  212. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  213. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  214. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  215. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  216. package/skills/xlsx/scripts/office/schemas/mce/mc.xsd +0 -75
  217. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +0 -560
  218. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +0 -67
  219. package/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +0 -14
  220. package/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +0 -20
  221. package/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +0 -13
  222. package/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  223. package/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +0 -8
  224. package/skills/xlsx/scripts/office/soffice.py +0 -183
  225. package/skills/xlsx/scripts/office/unpack.py +0 -132
  226. package/skills/xlsx/scripts/office/validate.py +0 -117
  227. package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
  228. package/skills/xlsx/scripts/office/validators/base.py +0 -851
  229. package/skills/xlsx/scripts/office/validators/docx.py +0 -446
  230. package/skills/xlsx/scripts/office/validators/pptx.py +0 -275
  231. package/skills/xlsx/scripts/office/validators/redlining.py +0 -247
  232. package/skills/xlsx/scripts/recalc.py +0 -184
@@ -1,314 +0,0 @@
1
- ---
2
- name: pdf
3
- description: Reach for this skill any time a task touches PDF files. Covers pulling text or tables out of PDFs, joining several PDFs together, breaking one apart, turning pages, stamping watermarks, generating brand-new PDFs, populating PDF forms, locking or unlocking PDFs with passwords, pulling out embedded images, and running OCR over scanned documents so they become searchable. Whenever a .pdf is mentioned or one needs to be produced, this skill applies.
4
- license: WormClaude
5
- ---
6
-
7
- # PDF Processing Guide
8
-
9
- ## Overview
10
-
11
- This guide walks through the core PDF tasks you'll hit day to day, relying on Python packages and shell utilities. When you need deeper functionality, JavaScript options, or fuller worked examples, turn to REFERENCE.md. Whenever the job is to populate a PDF form, open FORMS.md first and do exactly what it says.
12
-
13
- ## Quick Start
14
-
15
- ```python
16
- from pypdf import PdfReader, PdfWriter
17
-
18
- # Read a PDF
19
- reader = PdfReader("document.pdf")
20
- print(f"Pages: {len(reader.pages)}")
21
-
22
- # Extract text
23
- text = ""
24
- for page in reader.pages:
25
- text += page.extract_text()
26
- ```
27
-
28
- ## Python Libraries
29
-
30
- ### pypdf - Basic Operations
31
-
32
- #### Merge PDFs
33
- ```python
34
- from pypdf import PdfWriter, PdfReader
35
-
36
- writer = PdfWriter()
37
- for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
38
- reader = PdfReader(pdf_file)
39
- for page in reader.pages:
40
- writer.add_page(page)
41
-
42
- with open("merged.pdf", "wb") as output:
43
- writer.write(output)
44
- ```
45
-
46
- #### Split PDF
47
- ```python
48
- reader = PdfReader("input.pdf")
49
- for i, page in enumerate(reader.pages):
50
- writer = PdfWriter()
51
- writer.add_page(page)
52
- with open(f"page_{i+1}.pdf", "wb") as output:
53
- writer.write(output)
54
- ```
55
-
56
- #### Extract Metadata
57
- ```python
58
- reader = PdfReader("document.pdf")
59
- meta = reader.metadata
60
- print(f"Title: {meta.title}")
61
- print(f"Author: {meta.author}")
62
- print(f"Subject: {meta.subject}")
63
- print(f"Creator: {meta.creator}")
64
- ```
65
-
66
- #### Rotate Pages
67
- ```python
68
- reader = PdfReader("input.pdf")
69
- writer = PdfWriter()
70
-
71
- page = reader.pages[0]
72
- page.rotate(90) # Rotate 90 degrees clockwise
73
- writer.add_page(page)
74
-
75
- with open("rotated.pdf", "wb") as output:
76
- writer.write(output)
77
- ```
78
-
79
- ### pdfplumber - Text and Table Extraction
80
-
81
- #### Extract Text with Layout
82
- ```python
83
- import pdfplumber
84
-
85
- with pdfplumber.open("document.pdf") as pdf:
86
- for page in pdf.pages:
87
- text = page.extract_text()
88
- print(text)
89
- ```
90
-
91
- #### Extract Tables
92
- ```python
93
- with pdfplumber.open("document.pdf") as pdf:
94
- for i, page in enumerate(pdf.pages):
95
- tables = page.extract_tables()
96
- for j, table in enumerate(tables):
97
- print(f"Table {j+1} on page {i+1}:")
98
- for row in table:
99
- print(row)
100
- ```
101
-
102
- #### Advanced Table Extraction
103
- ```python
104
- import pandas as pd
105
-
106
- with pdfplumber.open("document.pdf") as pdf:
107
- all_tables = []
108
- for page in pdf.pages:
109
- tables = page.extract_tables()
110
- for table in tables:
111
- if table: # Check if table is not empty
112
- df = pd.DataFrame(table[1:], columns=table[0])
113
- all_tables.append(df)
114
-
115
- # Combine all tables
116
- if all_tables:
117
- combined_df = pd.concat(all_tables, ignore_index=True)
118
- combined_df.to_excel("extracted_tables.xlsx", index=False)
119
- ```
120
-
121
- ### reportlab - Create PDFs
122
-
123
- #### Basic PDF Creation
124
- ```python
125
- from reportlab.lib.pagesizes import letter
126
- from reportlab.pdfgen import canvas
127
-
128
- c = canvas.Canvas("hello.pdf", pagesize=letter)
129
- width, height = letter
130
-
131
- # Add text
132
- c.drawString(100, height - 100, "Hello World!")
133
- c.drawString(100, height - 120, "This is a PDF created with reportlab")
134
-
135
- # Add a line
136
- c.line(100, height - 140, 400, height - 140)
137
-
138
- # Save
139
- c.save()
140
- ```
141
-
142
- #### Create PDF with Multiple Pages
143
- ```python
144
- from reportlab.lib.pagesizes import letter
145
- from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
146
- from reportlab.lib.styles import getSampleStyleSheet
147
-
148
- doc = SimpleDocTemplate("report.pdf", pagesize=letter)
149
- styles = getSampleStyleSheet()
150
- story = []
151
-
152
- # Add content
153
- title = Paragraph("Report Title", styles['Title'])
154
- story.append(title)
155
- story.append(Spacer(1, 12))
156
-
157
- body = Paragraph("This is the body of the report. " * 20, styles['Normal'])
158
- story.append(body)
159
- story.append(PageBreak())
160
-
161
- # Page 2
162
- story.append(Paragraph("Page 2", styles['Heading1']))
163
- story.append(Paragraph("Content for page 2", styles['Normal']))
164
-
165
- # Build PDF
166
- doc.build(story)
167
- ```
168
-
169
- #### Subscripts and Superscripts
170
-
171
- **IMPORTANT**: Keep Unicode subscript/superscript characters (₀₁₂₃₄₅₆₇₈₉, ⁰¹²³⁴⁵⁶⁷⁸⁹) out of ReportLab PDFs entirely. Those glyphs are missing from the bundled fonts, so they show up as filled black squares.
172
-
173
- Reach for ReportLab's XML markup tags inside Paragraph objects instead:
174
- ```python
175
- from reportlab.platypus import Paragraph
176
- from reportlab.lib.styles import getSampleStyleSheet
177
-
178
- styles = getSampleStyleSheet()
179
-
180
- # Subscripts: use <sub> tag
181
- chemical = Paragraph("H<sub>2</sub>O", styles['Normal'])
182
-
183
- # Superscripts: use <super> tag
184
- squared = Paragraph("x<super>2</super> + y<super>2</super>", styles['Normal'])
185
- ```
186
-
187
- For canvas-drawn text (not Paragraph objects), manually adjust the font size and position rather than using Unicode subscripts/superscripts.
188
-
189
- ## Command-Line Tools
190
-
191
- ### pdftotext (poppler-utils)
192
- ```bash
193
- # Extract text
194
- pdftotext input.pdf output.txt
195
-
196
- # Extract text preserving layout
197
- pdftotext -layout input.pdf output.txt
198
-
199
- # Extract specific pages
200
- pdftotext -f 1 -l 5 input.pdf output.txt # Pages 1-5
201
- ```
202
-
203
- ### qpdf
204
- ```bash
205
- # Merge PDFs
206
- qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf
207
-
208
- # Split pages
209
- qpdf input.pdf --pages . 1-5 -- pages1-5.pdf
210
- qpdf input.pdf --pages . 6-10 -- pages6-10.pdf
211
-
212
- # Rotate pages
213
- qpdf input.pdf output.pdf --rotate=+90:1 # Rotate page 1 by 90 degrees
214
-
215
- # Remove password
216
- qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf
217
- ```
218
-
219
- ### pdftk (if available)
220
- ```bash
221
- # Merge
222
- pdftk file1.pdf file2.pdf cat output merged.pdf
223
-
224
- # Split
225
- pdftk input.pdf burst
226
-
227
- # Rotate
228
- pdftk input.pdf rotate 1east output rotated.pdf
229
- ```
230
-
231
- ## Common Tasks
232
-
233
- ### Extract Text from Scanned PDFs
234
- ```python
235
- # Requires: pip install pytesseract pdf2image
236
- import pytesseract
237
- from pdf2image import convert_from_path
238
-
239
- # Convert PDF to images
240
- images = convert_from_path('scanned.pdf')
241
-
242
- # OCR each page
243
- text = ""
244
- for i, image in enumerate(images):
245
- text += f"Page {i+1}:\n"
246
- text += pytesseract.image_to_string(image)
247
- text += "\n\n"
248
-
249
- print(text)
250
- ```
251
-
252
- ### Add Watermark
253
- ```python
254
- from pypdf import PdfReader, PdfWriter
255
-
256
- # Create watermark (or load existing)
257
- watermark = PdfReader("watermark.pdf").pages[0]
258
-
259
- # Apply to all pages
260
- reader = PdfReader("document.pdf")
261
- writer = PdfWriter()
262
-
263
- for page in reader.pages:
264
- page.merge_page(watermark)
265
- writer.add_page(page)
266
-
267
- with open("watermarked.pdf", "wb") as output:
268
- writer.write(output)
269
- ```
270
-
271
- ### Extract Images
272
- ```bash
273
- # Using pdfimages (poppler-utils)
274
- pdfimages -j input.pdf output_prefix
275
-
276
- # This extracts all images as output_prefix-000.jpg, output_prefix-001.jpg, etc.
277
- ```
278
-
279
- ### Password Protection
280
- ```python
281
- from pypdf import PdfReader, PdfWriter
282
-
283
- reader = PdfReader("input.pdf")
284
- writer = PdfWriter()
285
-
286
- for page in reader.pages:
287
- writer.add_page(page)
288
-
289
- # Add password
290
- writer.encrypt("userpassword", "ownerpassword")
291
-
292
- with open("encrypted.pdf", "wb") as output:
293
- writer.write(output)
294
- ```
295
-
296
- ## Quick Reference
297
-
298
- | Task | Best Tool | Command/Code |
299
- |------|-----------|--------------|
300
- | Merge PDFs | pypdf | `writer.add_page(page)` |
301
- | Split PDFs | pypdf | One page per file |
302
- | Extract text | pdfplumber | `page.extract_text()` |
303
- | Extract tables | pdfplumber | `page.extract_tables()` |
304
- | Create PDFs | reportlab | Canvas or Platypus |
305
- | Command line merge | qpdf | `qpdf --empty --pages ...` |
306
- | OCR scanned PDFs | pytesseract | Convert to image first |
307
- | Fill PDF forms | pdf-lib or pypdf (see FORMS.md) | See FORMS.md |
308
-
309
- ## Next Steps
310
-
311
- - For advanced pypdfium2 usage, see REFERENCE.md
312
- - For JavaScript libraries (pdf-lib), see REFERENCE.md
313
- - If you need to fill out a PDF form, follow the instructions in FORMS.md
314
- - For troubleshooting guides, see REFERENCE.md
@@ -1,65 +0,0 @@
1
- from dataclasses import dataclass
2
- import json
3
- import sys
4
-
5
-
6
-
7
-
8
- @dataclass
9
- class RectAndField:
10
- rect: list[float]
11
- rect_type: str
12
- field: dict
13
-
14
-
15
- def get_bounding_box_messages(fields_json_stream) -> list[str]:
16
- messages = []
17
- fields = json.load(fields_json_stream)
18
- messages.append(f"Read {len(fields['form_fields'])} fields")
19
-
20
- def rects_intersect(r1, r2):
21
- disjoint_horizontal = r1[0] >= r2[2] or r1[2] <= r2[0]
22
- disjoint_vertical = r1[1] >= r2[3] or r1[3] <= r2[1]
23
- return not (disjoint_horizontal or disjoint_vertical)
24
-
25
- rects_and_fields = []
26
- for f in fields["form_fields"]:
27
- rects_and_fields.append(RectAndField(f["label_bounding_box"], "label", f))
28
- rects_and_fields.append(RectAndField(f["entry_bounding_box"], "entry", f))
29
-
30
- has_error = False
31
- for i, ri in enumerate(rects_and_fields):
32
- for j in range(i + 1, len(rects_and_fields)):
33
- rj = rects_and_fields[j]
34
- if ri.field["page_number"] == rj.field["page_number"] and rects_intersect(ri.rect, rj.rect):
35
- has_error = True
36
- if ri.field is rj.field:
37
- messages.append(f"FAILURE: intersection between label and entry bounding boxes for `{ri.field['description']}` ({ri.rect}, {rj.rect})")
38
- else:
39
- messages.append(f"FAILURE: intersection between {ri.rect_type} bounding box for `{ri.field['description']}` ({ri.rect}) and {rj.rect_type} bounding box for `{rj.field['description']}` ({rj.rect})")
40
- if len(messages) >= 20:
41
- messages.append("Aborting further checks; fix bounding boxes and try again")
42
- return messages
43
- if ri.rect_type == "entry":
44
- if "entry_text" in ri.field:
45
- font_size = ri.field["entry_text"].get("font_size", 14)
46
- entry_height = ri.rect[3] - ri.rect[1]
47
- if entry_height < font_size:
48
- has_error = True
49
- messages.append(f"FAILURE: entry bounding box height ({entry_height}) for `{ri.field['description']}` is too short for the text content (font size: {font_size}). Increase the box height or decrease the font size.")
50
- if len(messages) >= 20:
51
- messages.append("Aborting further checks; fix bounding boxes and try again")
52
- return messages
53
-
54
- if not has_error:
55
- messages.append("SUCCESS: All bounding boxes are valid")
56
- return messages
57
-
58
- if __name__ == "__main__":
59
- if len(sys.argv) != 2:
60
- print("Usage: check_bounding_boxes.py [fields.json]")
61
- sys.exit(1)
62
- with open(sys.argv[1]) as f:
63
- messages = get_bounding_box_messages(f)
64
- for msg in messages:
65
- print(msg)
@@ -1,11 +0,0 @@
1
- import sys
2
- from pypdf import PdfReader
3
-
4
-
5
-
6
-
7
- reader = PdfReader(sys.argv[1])
8
- if (reader.get_fields()):
9
- print("This PDF has fillable form fields")
10
- else:
11
- print("This PDF does not have fillable form fields; you will need to visually determine where to enter data")
@@ -1,33 +0,0 @@
1
- import os
2
- import sys
3
-
4
- from pdf2image import convert_from_path
5
-
6
-
7
-
8
-
9
- def convert(pdf_path, output_dir, max_dim=1000):
10
- images = convert_from_path(pdf_path, dpi=200)
11
-
12
- for i, image in enumerate(images):
13
- width, height = image.size
14
- if width > max_dim or height > max_dim:
15
- scale_factor = min(max_dim / width, max_dim / height)
16
- new_width = int(width * scale_factor)
17
- new_height = int(height * scale_factor)
18
- image = image.resize((new_width, new_height))
19
-
20
- image_path = os.path.join(output_dir, f"page_{i+1}.png")
21
- image.save(image_path)
22
- print(f"Saved page {i+1} as {image_path} (size: {image.size})")
23
-
24
- print(f"Converted {len(images)} pages to PNG images")
25
-
26
-
27
- if __name__ == "__main__":
28
- if len(sys.argv) != 3:
29
- print("Usage: convert_pdf_to_images.py [input pdf] [output directory]")
30
- sys.exit(1)
31
- pdf_path = sys.argv[1]
32
- output_directory = sys.argv[2]
33
- convert(pdf_path, output_directory)
@@ -1,37 +0,0 @@
1
- import json
2
- import sys
3
-
4
- from PIL import Image, ImageDraw
5
-
6
-
7
-
8
-
9
def create_validation_image(page_number, fields_json_path, input_path, output_path):
    """Draw the bounding boxes of one page's form fields onto a page image.

    Reads the ``form_fields`` list from the JSON file, and for every field
    whose ``page_number`` equals ``page_number`` outlines its
    ``entry_bounding_box`` in red and its ``label_bounding_box`` in blue.
    The annotated image is written to ``output_path`` and a summary line is
    printed.

    Args:
        page_number: Page whose fields should be drawn.
        fields_json_path: Path of the JSON file describing the fields.
        input_path: Path of the page image to annotate.
        output_path: Path where the annotated image is saved.
    """
    with open(fields_json_path, 'r') as fields_file:
        layout = json.load(fields_file)

    page_image = Image.open(input_path)
    canvas = ImageDraw.Draw(page_image)

    fields_drawn = 0
    for entry in layout["form_fields"]:
        if entry["page_number"] != page_number:
            continue
        # Look up both boxes before drawing either of them.
        entry_rect = entry['entry_bounding_box']
        label_rect = entry['label_bounding_box']
        canvas.rectangle(entry_rect, outline='red', width=2)
        canvas.rectangle(label_rect, outline='blue', width=2)
        fields_drawn += 1

    # Each drawn field contributes two rectangles (entry + label).
    num_boxes = 2 * fields_drawn

    page_image.save(output_path)
    print(f"Created validation image at {output_path} with {num_boxes} bounding boxes")
27
-
28
-
29
if __name__ == "__main__":
    # Command-line entry point: page number, fields JSON, input image and
    # output image, in that order.
    if len(sys.argv) != 5:
        print("Usage: create_validation_image.py [page number] [fields.json file] [input image path] [output image path]")
        sys.exit(1)
    page_number = int(sys.argv[1])
    fields_json_path, input_image_path, output_image_path = sys.argv[2:5]
    create_validation_image(page_number, fields_json_path, input_image_path, output_image_path)
@@ -1,122 +0,0 @@
1
- import json
2
- import sys
3
-
4
- from pypdf import PdfReader
5
-
6
-
7
-
8
-
9
def get_full_annotation_field_id(annotation):
    """Return the dotted field id of an annotation, or None if it has no name.

    Walks from ``annotation`` up through its ``/Parent`` chain, collecting
    every non-empty ``/T`` value, and joins them with ``.`` so that the
    outermost ancestor comes first.
    """
    names = []
    node = annotation
    while node:
        name = node.get('/T')
        if name:
            # Ancestors are visited last but must appear first in the id.
            names.insert(0, name)
        node = node.get('/Parent')

    if not names:
        return None
    return ".".join(names)
17
-
18
-
19
def make_field_dict(field, field_id):
    """Build a plain-dict description of a single form field.

    The ``/FT`` entry of ``field`` selects the reported ``type``:

    * ``/Tx``  -> ``"text"``
    * ``/Btn`` -> ``"checkbox"``; when ``/_States_`` holds exactly two values,
      ``checked_value`` and ``unchecked_value`` are added as well.
    * ``/Ch``  -> ``"choice"``; ``choice_options`` lists a ``value``/``text``
      pair for every entry of ``/_States_``.
    * anything else -> ``"unknown (<ft>)"``.

    Args:
        field: Mapping describing the field (must support ``.get``).
        field_id: Identifier stored under ``field_id`` in the result.

    Returns:
        A dictionary with ``field_id``, ``type`` and any type-specific keys.
    """
    field_dict = {"field_id": field_id}
    ft = field.get('/FT')
    if ft == "/Tx":
        field_dict["type"] = "text"
    elif ft == "/Btn":
        field_dict["type"] = "checkbox"
        states = field.get("/_States_", [])
        if len(states) == 2:
            if "/Off" in states:
                # Whichever state is not "/Off" is the checked one.
                field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
                field_dict["unchecked_value"] = "/Off"
            else:
                # No "/Off" state: fall back to positional order and warn.
                # (The message previously contained a stray "$" from a
                # JavaScript-style "${...}" placeholder.)
                print(f"Unexpected state values for checkbox `{field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.")
                field_dict["checked_value"] = states[0]
                field_dict["unchecked_value"] = states[1]
    elif ft == "/Ch":
        field_dict["type"] = "choice"
        states = field.get("/_States_", [])
        # Each state is indexed as (value, display text).
        field_dict["choice_options"] = [{
            "value": state[0],
            "text": state[1],
        } for state in states]
    else:
        field_dict["type"] = f"unknown ({ft})"
    return field_dict
45
-
46
-
47
def get_field_info(reader: "PdfReader"):
    """Describe every form field that can be located on a page.

    Args:
        reader: Object exposing ``get_fields()`` and ``pages`` (in this
            script, a pypdf ``PdfReader``).

    Returns:
        A list of field dictionaries. Regular fields are the result of
        ``make_field_dict`` extended with ``page`` (1-based) and ``rect``.
        Radio groups have ``type`` ``"radio_group"``, the ``page`` of the
        first option found, and a ``radio_options`` list of ``value``/``rect``
        entries. The list is sorted by page, then by descending ``rect[1]``,
        then by ascending ``rect[0]``. Fields for which no page annotation is
        found are reported on stdout and left out.
    """
    # pypdf documents get_fields() as returning None when the document has no
    # form fields; treat that as "no fields" instead of failing on .items().
    fields = reader.get_fields() or {}

    field_info_by_id = {}
    possible_radio_names = set()

    for field_id, field in fields.items():
        # Fields with children are containers, not fillable fields themselves.
        # Button containers are remembered as candidate radio groups.
        if field.get("/Kids"):
            if field.get("/FT") == "/Btn":
                possible_radio_names.add(field_id)
            continue
        field_info_by_id[field_id] = make_field_dict(field, field_id)

    radio_fields_by_id = {}

    for page_index, page in enumerate(reader.pages):
        annotations = page.get('/Annots', [])
        for ann in annotations:
            field_id = get_full_annotation_field_id(ann)
            if field_id in field_info_by_id:
                field_info_by_id[field_id]["page"] = page_index + 1
                field_info_by_id[field_id]["rect"] = ann.get('/Rect')
            elif field_id in possible_radio_names:
                # The "on" value of a radio option is the only appearance
                # state other than "/Off"; skip annotations without one.
                try:
                    on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
                except KeyError:
                    continue
                if len(on_values) == 1:
                    rect = ann.get("/Rect")
                    if field_id not in radio_fields_by_id:
                        radio_fields_by_id[field_id] = {
                            "field_id": field_id,
                            "type": "radio_group",
                            "page": page_index + 1,
                            "radio_options": [],
                        }
                    radio_fields_by_id[field_id]["radio_options"].append({
                        "value": on_values[0],
                        "rect": rect,
                    })

    fields_with_location = []
    for field_info in field_info_by_id.values():
        if "page" in field_info:
            fields_with_location.append(field_info)
        else:
            print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")

    def sort_key(f):
        # Radio groups are positioned by their first option; a missing rect
        # sorts as the origin.
        if "radio_options" in f:
            rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
        else:
            rect = f.get("rect") or [0, 0, 0, 0]
        # Negating rect[1] puts larger values of that coordinate first.
        adjusted_position = [-rect[1], rect[0]]
        return [f.get("page"), adjusted_position]

    sorted_fields = fields_with_location + list(radio_fields_by_id.values())
    sorted_fields.sort(key=sort_key)

    return sorted_fields
108
-
109
-
110
def write_field_info(pdf_path: str, json_output_path: str):
    """Extract the form field info of a PDF and save it as indented JSON.

    Args:
        pdf_path: Path of the PDF to read.
        json_output_path: Path of the JSON file to write.
    """
    extracted = get_field_info(PdfReader(pdf_path))
    with open(json_output_path, "w") as out_file:
        json.dump(extracted, out_file, indent=2)
    print(f"Wrote {len(extracted)} fields to {json_output_path}")
116
-
117
-
118
if __name__ == "__main__":
    # Command-line entry point: input PDF path followed by output JSON path.
    if len(sys.argv) == 3:
        write_field_info(sys.argv[1], sys.argv[2])
    else:
        print("Usage: extract_form_field_info.py [input pdf] [output json]")
        sys.exit(1)
@@ -1,115 +0,0 @@
1
- """
2
- Extract form structure from a non-fillable PDF.
3
-
4
- This script analyzes the PDF to find:
5
- - Text labels with their exact coordinates
6
- - Horizontal lines (row boundaries)
7
- - Checkboxes (small rectangles)
8
-
9
- Output: A JSON file with the form structure that can be used to generate
10
- accurate field coordinates for filling.
11
-
12
- Usage: python extract_form_structure.py <input.pdf> <output.json>
13
- """
14
-
15
- import json
16
- import sys
17
- import pdfplumber
18
-
19
-
20
def extract_form_structure(pdf_path):
    """Collect layout information from every page of a PDF.

    Args:
        pdf_path: Path of the PDF, opened with ``pdfplumber``.

    Returns:
        A dictionary with five lists:

        * ``pages``: page number (1-based), width and height of each page.
        * ``labels``: text and rounded bounding box of every extracted word.
        * ``lines``: lines whose x-extent exceeds half the page width.
        * ``checkboxes``: rects between 5 and 15 units on each side whose
          width and height differ by less than 2, with their centers.
        * ``row_boundaries``: for each page, consecutive pairs of the
          distinct, sorted ``y`` values of the collected lines.
    """
    pages, labels, lines, checkboxes, row_boundaries = [], [], [], [], []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            pages.append({
                "page_number": page_num,
                "width": float(page.width),
                "height": float(page.height),
            })

            labels.extend(
                {
                    "page": page_num,
                    "text": word["text"],
                    "x0": round(float(word["x0"]), 1),
                    "top": round(float(word["top"]), 1),
                    "x1": round(float(word["x1"]), 1),
                    "bottom": round(float(word["bottom"]), 1),
                }
                for word in page.extract_words()
            )

            for segment in page.lines:
                # Short segments are ignored; only wide ones are recorded.
                if abs(float(segment["x1"]) - float(segment["x0"])) <= page.width * 0.5:
                    continue
                lines.append({
                    "page": page_num,
                    "y": round(float(segment["top"]), 1),
                    "x0": round(float(segment["x0"]), 1),
                    "x1": round(float(segment["x1"]), 1),
                })

            for box in page.rects:
                left, right = float(box["x0"]), float(box["x1"])
                upper, lower = float(box["top"]), float(box["bottom"])
                box_width = right - left
                box_height = lower - upper
                is_small = 5 <= box_width <= 15 and 5 <= box_height <= 15
                if not (is_small and abs(box_width - box_height) < 2):
                    continue
                checkboxes.append({
                    "page": page_num,
                    "x0": round(left, 1),
                    "top": round(upper, 1),
                    "x1": round(right, 1),
                    "bottom": round(lower, 1),
                    "center_x": round((left + right) / 2, 1),
                    "center_y": round((upper + lower) / 2, 1),
                })

    # Group the recorded line positions by page, preserving first-seen order.
    ys_by_page = {}
    for entry in lines:
        ys_by_page.setdefault(entry["page"], []).append(entry["y"])

    # Each pair of neighbouring distinct y values bounds one row.
    for page_key, ys in ys_by_page.items():
        ordered = sorted(set(ys))
        for row_top, row_bottom in zip(ordered, ordered[1:]):
            row_boundaries.append({
                "page": page_key,
                "row_top": row_top,
                "row_bottom": row_bottom,
                "row_height": round(row_bottom - row_top, 1),
            })

    return {
        "pages": pages,
        "labels": labels,
        "lines": lines,
        "checkboxes": checkboxes,
        "row_boundaries": row_boundaries,
    }
89
-
90
-
91
def main():
    """Command-line entry point: extract a PDF's structure and save it as JSON.

    Expects two arguments, the input PDF path and the output JSON path;
    otherwise prints a usage line and exits with status 1. After writing the
    JSON file, prints how many items of each kind were found.
    """
    if len(sys.argv) != 3:
        print("Usage: extract_form_structure.py <input.pdf> <output.json>")
        sys.exit(1)

    _, pdf_path, output_path = sys.argv

    print(f"Extracting structure from {pdf_path}...")
    structure = extract_form_structure(pdf_path)

    with open(output_path, "w") as out_file:
        json.dump(structure, out_file, indent=2)

    # (key in the structure, noun used in the summary line)
    summary = (
        ("pages", "pages"),
        ("labels", "text labels"),
        ("lines", "horizontal lines"),
        ("checkboxes", "checkboxes"),
        ("row_boundaries", "row boundaries"),
    )
    print("Found:")
    for key, noun in summary:
        print(f" - {len(structure[key])} {noun}")
    print(f"Saved to {output_path}")


if __name__ == "__main__":
    main()