@clawpify/skills 1.0.5 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,308 @@
1
+ # PDF Processing Guide
2
+
3
+ ## Overview
4
+
5
+ This guide covers essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see REFERENCE.md. If you need to fill out a PDF form, read FORMS.md and follow its instructions.
6
+
7
+ ## Quick Start
8
+
9
+ ```python
10
+ from pypdf import PdfReader, PdfWriter
11
+
12
+ # Read a PDF
13
+ reader = PdfReader("document.pdf")
14
+ print(f"Pages: {len(reader.pages)}")
15
+
16
+ # Extract text
17
+ text = ""
18
+ for page in reader.pages:
19
+ text += page.extract_text()
20
+ ```
21
+
22
+ ## Python Libraries
23
+
24
+ ### pypdf - Basic Operations
25
+
26
+ #### Merge PDFs
27
+ ```python
28
+ from pypdf import PdfWriter, PdfReader
29
+
30
+ writer = PdfWriter()
31
+ for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
32
+ reader = PdfReader(pdf_file)
33
+ for page in reader.pages:
34
+ writer.add_page(page)
35
+
36
+ with open("merged.pdf", "wb") as output:
37
+ writer.write(output)
38
+ ```
39
+
40
+ #### Split PDF
41
+ ```python
42
+ reader = PdfReader("input.pdf")
43
+ for i, page in enumerate(reader.pages):
44
+ writer = PdfWriter()
45
+ writer.add_page(page)
46
+ with open(f"page_{i+1}.pdf", "wb") as output:
47
+ writer.write(output)
48
+ ```
49
+
50
+ #### Extract Metadata
51
+ ```python
52
+ reader = PdfReader("document.pdf")
53
+ meta = reader.metadata
54
+ print(f"Title: {meta.title}")
55
+ print(f"Author: {meta.author}")
56
+ print(f"Subject: {meta.subject}")
57
+ print(f"Creator: {meta.creator}")
58
+ ```
59
+
60
+ #### Rotate Pages
61
+ ```python
62
+ reader = PdfReader("input.pdf")
63
+ writer = PdfWriter()
64
+
65
+ page = reader.pages[0]
66
+ page.rotate(90) # Rotate 90 degrees clockwise
67
+ writer.add_page(page)
68
+
69
+ with open("rotated.pdf", "wb") as output:
70
+ writer.write(output)
71
+ ```
72
+
73
+ ### pdfplumber - Text and Table Extraction
74
+
75
+ #### Extract Text with Layout
76
+ ```python
77
+ import pdfplumber
78
+
79
+ with pdfplumber.open("document.pdf") as pdf:
80
+ for page in pdf.pages:
81
+ text = page.extract_text()
82
+ print(text)
83
+ ```
84
+
85
+ #### Extract Tables
86
+ ```python
87
+ with pdfplumber.open("document.pdf") as pdf:
88
+ for i, page in enumerate(pdf.pages):
89
+ tables = page.extract_tables()
90
+ for j, table in enumerate(tables):
91
+ print(f"Table {j+1} on page {i+1}:")
92
+ for row in table:
93
+ print(row)
94
+ ```
95
+
96
+ #### Advanced Table Extraction
97
+ ```python
98
+ import pandas as pd
99
+
100
+ with pdfplumber.open("document.pdf") as pdf:
101
+ all_tables = []
102
+ for page in pdf.pages:
103
+ tables = page.extract_tables()
104
+ for table in tables:
105
+ if table: # Check if table is not empty
106
+ df = pd.DataFrame(table[1:], columns=table[0])
107
+ all_tables.append(df)
108
+
109
+ # Combine all tables
110
+ if all_tables:
111
+ combined_df = pd.concat(all_tables, ignore_index=True)
112
+ combined_df.to_excel("extracted_tables.xlsx", index=False)
113
+ ```
114
+
115
+ ### reportlab - Create PDFs
116
+
117
+ #### Basic PDF Creation
118
+ ```python
119
+ from reportlab.lib.pagesizes import letter
120
+ from reportlab.pdfgen import canvas
121
+
122
+ c = canvas.Canvas("hello.pdf", pagesize=letter)
123
+ width, height = letter
124
+
125
+ # Add text
126
+ c.drawString(100, height - 100, "Hello World!")
127
+ c.drawString(100, height - 120, "This is a PDF created with reportlab")
128
+
129
+ # Add a line
130
+ c.line(100, height - 140, 400, height - 140)
131
+
132
+ # Save
133
+ c.save()
134
+ ```
135
+
136
+ #### Create PDF with Multiple Pages
137
+ ```python
138
+ from reportlab.lib.pagesizes import letter
139
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
140
+ from reportlab.lib.styles import getSampleStyleSheet
141
+
142
+ doc = SimpleDocTemplate("report.pdf", pagesize=letter)
143
+ styles = getSampleStyleSheet()
144
+ story = []
145
+
146
+ # Add content
147
+ title = Paragraph("Report Title", styles['Title'])
148
+ story.append(title)
149
+ story.append(Spacer(1, 12))
150
+
151
+ body = Paragraph("This is the body of the report. " * 20, styles['Normal'])
152
+ story.append(body)
153
+ story.append(PageBreak())
154
+
155
+ # Page 2
156
+ story.append(Paragraph("Page 2", styles['Heading1']))
157
+ story.append(Paragraph("Content for page 2", styles['Normal']))
158
+
159
+ # Build PDF
160
+ doc.build(story)
161
+ ```
162
+
163
+ #### Subscripts and Superscripts
164
+
165
+ **IMPORTANT**: Never use Unicode subscript/superscript characters (₀₁₂₃₄₅₆₇₈₉, ⁰¹²³⁴⁵⁶⁷⁸⁹) in ReportLab PDFs. The built-in fonts do not include these glyphs, causing them to render as solid black boxes.
166
+
167
+ Instead, use ReportLab's XML markup tags in Paragraph objects:
168
+ ```python
169
+ from reportlab.platypus import Paragraph
170
+ from reportlab.lib.styles import getSampleStyleSheet
171
+
172
+ styles = getSampleStyleSheet()
173
+
174
+ # Subscripts: use <sub> tag
175
+ chemical = Paragraph("H<sub>2</sub>O", styles['Normal'])
176
+
177
+ # Superscripts: use <super> tag
178
+ squared = Paragraph("x<super>2</super> + y<super>2</super>", styles['Normal'])
179
+ ```
180
+
181
+ For canvas-drawn text (not Paragraph objects), manually adjust the font size and position rather than using Unicode subscripts/superscripts.
182
+
183
+ ## Command-Line Tools
184
+
185
+ ### pdftotext (poppler-utils)
186
+ ```bash
187
+ # Extract text
188
+ pdftotext input.pdf output.txt
189
+
190
+ # Extract text preserving layout
191
+ pdftotext -layout input.pdf output.txt
192
+
193
+ # Extract specific pages
194
+ pdftotext -f 1 -l 5 input.pdf output.txt # Pages 1-5
195
+ ```
196
+
197
+ ### qpdf
198
+ ```bash
199
+ # Merge PDFs
200
+ qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf
201
+
202
+ # Split pages
203
+ qpdf input.pdf --pages . 1-5 -- pages1-5.pdf
204
+ qpdf input.pdf --pages . 6-10 -- pages6-10.pdf
205
+
206
+ # Rotate pages
207
+ qpdf input.pdf output.pdf --rotate=+90:1 # Rotate page 1 by 90 degrees
208
+
209
+ # Remove password
210
+ qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf
211
+ ```
212
+
213
+ ### pdftk (if available)
214
+ ```bash
215
+ # Merge
216
+ pdftk file1.pdf file2.pdf cat output merged.pdf
217
+
218
+ # Split
219
+ pdftk input.pdf burst
220
+
221
+ # Rotate
222
+ pdftk input.pdf rotate 1east output rotated.pdf
223
+ ```
224
+
225
+ ## Common Tasks
226
+
227
+ ### Extract Text from Scanned PDFs
228
+ ```python
229
+ # Requires: pip install pytesseract pdf2image
230
+ import pytesseract
231
+ from pdf2image import convert_from_path
232
+
233
+ # Convert PDF to images
234
+ images = convert_from_path('scanned.pdf')
235
+
236
+ # OCR each page
237
+ text = ""
238
+ for i, image in enumerate(images):
239
+ text += f"Page {i+1}:\n"
240
+ text += pytesseract.image_to_string(image)
241
+ text += "\n\n"
242
+
243
+ print(text)
244
+ ```
245
+
246
+ ### Add Watermark
247
+ ```python
248
+ from pypdf import PdfReader, PdfWriter
249
+
250
+ # Load the watermark page from an existing PDF (create watermark.pdf beforehand if needed)
251
+ watermark = PdfReader("watermark.pdf").pages[0]
252
+
253
+ # Apply to all pages
254
+ reader = PdfReader("document.pdf")
255
+ writer = PdfWriter()
256
+
257
+ for page in reader.pages:
258
+ page.merge_page(watermark)
259
+ writer.add_page(page)
260
+
261
+ with open("watermarked.pdf", "wb") as output:
262
+ writer.write(output)
263
+ ```
264
+
265
+ ### Extract Images
266
+ ```bash
267
+ # Using pdfimages (poppler-utils)
268
+ pdfimages -j input.pdf output_prefix
269
+
270
+ # This extracts all images as output_prefix-000.jpg, output_prefix-001.jpg, etc.
271
+ ```
272
+
273
+ ### Password Protection
274
+ ```python
275
+ from pypdf import PdfReader, PdfWriter
276
+
277
+ reader = PdfReader("input.pdf")
278
+ writer = PdfWriter()
279
+
280
+ for page in reader.pages:
281
+ writer.add_page(page)
282
+
283
+ # Add password
284
+ writer.encrypt("userpassword", "ownerpassword")
285
+
286
+ with open("encrypted.pdf", "wb") as output:
287
+ writer.write(output)
288
+ ```
289
+
290
+ ## Quick Reference
291
+
292
+ | Task | Best Tool | Command/Code |
293
+ |------|-----------|--------------|
294
+ | Merge PDFs | pypdf | `writer.add_page(page)` |
295
+ | Split PDFs | pypdf | One page per file |
296
+ | Extract text | pdfplumber | `page.extract_text()` |
297
+ | Extract tables | pdfplumber | `page.extract_tables()` |
298
+ | Create PDFs | reportlab | Canvas or Platypus |
299
+ | Command line merge | qpdf | `qpdf --empty --pages ...` |
300
+ | OCR scanned PDFs | pytesseract | Convert to image first |
301
+ | Fill PDF forms | pdf-lib or pypdf (see FORMS.md) | See FORMS.md |
302
+
303
+ ## Next Steps
304
+
305
+ - For advanced pypdfium2 usage, see REFERENCE.md
306
+ - For JavaScript libraries (pdf-lib), see REFERENCE.md
307
+ - If you need to fill out a PDF form, follow the instructions in FORMS.md
308
+ - For troubleshooting guides, see REFERENCE.md