@nano-step/skill-manager 5.2.1 → 5.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/utils.d.ts +1 -1
- package/dist/utils.js +1 -1
- package/package.json +1 -1
- package/skills/pdf/SKILL.md +303 -0
- package/skills/pdf/skill.json +17 -0
package/dist/utils.d.ts
CHANGED
package/dist/utils.js
CHANGED
|
@@ -13,7 +13,7 @@ exports.writeText = writeText;
|
|
|
13
13
|
const path_1 = __importDefault(require("path"));
|
|
14
14
|
const os_1 = __importDefault(require("os"));
|
|
15
15
|
const fs_extra_1 = __importDefault(require("fs-extra"));
|
|
16
|
-
exports.MANAGER_VERSION = "5.2.1";
|
|
16
|
+
exports.MANAGER_VERSION = "5.2.2";
|
|
17
17
|
async function detectOpenCodePaths() {
|
|
18
18
|
const homeConfig = path_1.default.join(os_1.default.homedir(), ".config", "opencode");
|
|
19
19
|
const cwd = process.cwd();
|
package/package.json
CHANGED
package/skills/pdf/SKILL.md
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: pdf
|
|
3
|
+
description: "Comprehensive PDF manipulation toolkit for extracting text and tables, creating new PDFs, merging/splitting documents, and handling forms. Use when filling PDF forms or programmatically processing, generating, or analyzing PDF documents."
|
|
4
|
+
compatibility: "OpenCode"
|
|
5
|
+
metadata:
|
|
6
|
+
author: openclaw/skillmd
|
|
7
|
+
version: "1.0.0"
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# PDF Processing Guide
|
|
11
|
+
|
|
12
|
+
## When This Skill Activates
|
|
13
|
+
|
|
14
|
+
Activate when the user asks to:
|
|
15
|
+
- Extract text or tables from PDFs
|
|
16
|
+
- Create, merge, split, or rotate PDFs
|
|
17
|
+
- Add watermarks or password protection
|
|
18
|
+
- OCR scanned PDFs
|
|
19
|
+
- Fill PDF forms
|
|
20
|
+
- Convert PDFs to text
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
```python
|
|
24
|
+
from pypdf import PdfReader, PdfWriter
|
|
25
|
+
|
|
26
|
+
# Read a PDF
|
|
27
|
+
reader = PdfReader("document.pdf")
|
|
28
|
+
print(f"Pages: {len(reader.pages)}")
|
|
29
|
+
|
|
30
|
+
# Extract text
|
|
31
|
+
text = ""
|
|
32
|
+
for page in reader.pages:
|
|
33
|
+
text += page.extract_text()
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Python Libraries
|
|
37
|
+
|
|
38
|
+
### pypdf - Basic Operations
|
|
39
|
+
|
|
40
|
+
#### Merge PDFs
|
|
41
|
+
```python
|
|
42
|
+
from pypdf import PdfWriter, PdfReader
|
|
43
|
+
|
|
44
|
+
writer = PdfWriter()
|
|
45
|
+
for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
|
|
46
|
+
reader = PdfReader(pdf_file)
|
|
47
|
+
for page in reader.pages:
|
|
48
|
+
writer.add_page(page)
|
|
49
|
+
|
|
50
|
+
with open("merged.pdf", "wb") as output:
|
|
51
|
+
writer.write(output)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
#### Split PDF
|
|
55
|
+
```python
|
|
56
|
+
reader = PdfReader("input.pdf")
|
|
57
|
+
for i, page in enumerate(reader.pages):
|
|
58
|
+
writer = PdfWriter()
|
|
59
|
+
writer.add_page(page)
|
|
60
|
+
with open(f"page_{i+1}.pdf", "wb") as output:
|
|
61
|
+
writer.write(output)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
#### Rotate Pages
|
|
65
|
+
```python
|
|
66
|
+
reader = PdfReader("input.pdf")
|
|
67
|
+
writer = PdfWriter()
|
|
68
|
+
|
|
69
|
+
page = reader.pages[0]
|
|
70
|
+
page.rotate(90) # Rotate 90 degrees clockwise
|
|
71
|
+
writer.add_page(page)
|
|
72
|
+
|
|
73
|
+
with open("rotated.pdf", "wb") as output:
|
|
74
|
+
writer.write(output)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
#### Extract Metadata
|
|
78
|
+
```python
|
|
79
|
+
reader = PdfReader("document.pdf")
|
|
80
|
+
meta = reader.metadata
|
|
81
|
+
print(f"Author: {meta.author}")
|
|
82
|
+
print(f"Title: {meta.title}")
|
|
83
|
+
print(f"Subject: {meta.subject}")
|
|
84
|
+
print(f"Creator: {meta.creator}")
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### pdfplumber - Text and Table Extraction
|
|
88
|
+
|
|
89
|
+
#### Extract Text
|
|
90
|
+
```python
|
|
91
|
+
import pdfplumber
|
|
92
|
+
|
|
93
|
+
with pdfplumber.open("document.pdf") as pdf:
|
|
94
|
+
for page in pdf.pages:
|
|
95
|
+
text = page.extract_text()
|
|
96
|
+
print(text)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
#### Extract Tables
|
|
100
|
+
```python
|
|
101
|
+
with pdfplumber.open("document.pdf") as pdf:
|
|
102
|
+
for i, page in enumerate(pdf.pages):
|
|
103
|
+
tables = page.extract_tables()
|
|
104
|
+
for j, table in enumerate(tables):
|
|
105
|
+
print(f"Table {j+1} on page {i+1}:")
|
|
106
|
+
for row in table:
|
|
107
|
+
print(row)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
#### Extract Tables to DataFrame
|
|
111
|
+
```python
|
|
112
|
+
import pdfplumber
|
|
113
|
+
import pandas as pd
|
|
114
|
+
|
|
115
|
+
with pdfplumber.open("document.pdf") as pdf:
|
|
116
|
+
page = pdf.pages[0]
|
|
117
|
+
table = page.extract_table()
|
|
118
|
+
df = pd.DataFrame(table[1:], columns=table[0])
|
|
119
|
+
print(df)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### reportlab - Create PDFs
|
|
123
|
+
|
|
124
|
+
#### Simple PDF
|
|
125
|
+
```python
|
|
126
|
+
from reportlab.lib.pagesizes import letter
|
|
127
|
+
from reportlab.pdfgen import canvas
|
|
128
|
+
|
|
129
|
+
c = canvas.Canvas("hello.pdf", pagesize=letter)
|
|
130
|
+
width, height = letter
|
|
131
|
+
|
|
132
|
+
c.drawString(100, height - 100, "Hello World!")
|
|
133
|
+
c.line(100, height - 140, 400, height - 140)
|
|
134
|
+
c.save()
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
#### Multi-page with Platypus
|
|
138
|
+
```python
|
|
139
|
+
from reportlab.lib.pagesizes import letter
|
|
140
|
+
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
|
141
|
+
from reportlab.lib.styles import getSampleStyleSheet
|
|
142
|
+
|
|
143
|
+
doc = SimpleDocTemplate("report.pdf", pagesize=letter)
|
|
144
|
+
styles = getSampleStyleSheet()
|
|
145
|
+
story = []
|
|
146
|
+
|
|
147
|
+
story.append(Paragraph("Report Title", styles['Title']))
|
|
148
|
+
story.append(Spacer(1, 12))
|
|
149
|
+
story.append(Paragraph("This is the body text.", styles['Normal']))
|
|
150
|
+
|
|
151
|
+
doc.build(story)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Command-Line Tools
|
|
155
|
+
|
|
156
|
+
### pdftotext (poppler-utils)
|
|
157
|
+
```bash
|
|
158
|
+
# Extract text
|
|
159
|
+
pdftotext input.pdf output.txt
|
|
160
|
+
|
|
161
|
+
# Preserve layout
|
|
162
|
+
pdftotext -layout input.pdf output.txt
|
|
163
|
+
|
|
164
|
+
# Specific pages
|
|
165
|
+
pdftotext -f 1 -l 5 input.pdf output.txt
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### qpdf
|
|
169
|
+
```bash
|
|
170
|
+
# Merge PDFs
|
|
171
|
+
qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf
|
|
172
|
+
|
|
173
|
+
# Split pages
|
|
174
|
+
qpdf input.pdf --pages . 1-5 -- pages1-5.pdf
|
|
175
|
+
|
|
176
|
+
# Rotate pages
|
|
177
|
+
qpdf input.pdf output.pdf --rotate=+90:1
|
|
178
|
+
|
|
179
|
+
# Remove password
|
|
180
|
+
qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf
|
|
181
|
+
|
|
182
|
+
# Linearize (optimize for web)
|
|
183
|
+
qpdf --linearize input.pdf output.pdf
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Common Tasks
|
|
187
|
+
|
|
188
|
+
### OCR Scanned PDFs
|
|
189
|
+
```python
|
|
190
|
+
import pytesseract
|
|
191
|
+
from pdf2image import convert_from_path
|
|
192
|
+
|
|
193
|
+
images = convert_from_path('scanned.pdf')
|
|
194
|
+
text = ""
|
|
195
|
+
for i, image in enumerate(images):
|
|
196
|
+
text += f"Page {i+1}:\n"
|
|
197
|
+
text += pytesseract.image_to_string(image)
|
|
198
|
+
text += "\n\n"
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Add Watermark
|
|
202
|
+
```python
|
|
203
|
+
from pypdf import PdfReader, PdfWriter
|
|
204
|
+
|
|
205
|
+
watermark = PdfReader("watermark.pdf").pages[0]
|
|
206
|
+
reader = PdfReader("document.pdf")
|
|
207
|
+
writer = PdfWriter()
|
|
208
|
+
|
|
209
|
+
for page in reader.pages:
|
|
210
|
+
page.merge_page(watermark)
|
|
211
|
+
writer.add_page(page)
|
|
212
|
+
|
|
213
|
+
with open("watermarked.pdf", "wb") as output:
|
|
214
|
+
writer.write(output)
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### Password Protection
|
|
218
|
+
```python
|
|
219
|
+
from pypdf import PdfReader, PdfWriter
|
|
220
|
+
|
|
221
|
+
reader = PdfReader("input.pdf")
|
|
222
|
+
writer = PdfWriter()
|
|
223
|
+
|
|
224
|
+
for page in reader.pages:
|
|
225
|
+
writer.add_page(page)
|
|
226
|
+
|
|
227
|
+
writer.encrypt("userpassword", "ownerpassword")
|
|
228
|
+
|
|
229
|
+
with open("encrypted.pdf", "wb") as output:
|
|
230
|
+
writer.write(output)
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
### Fill PDF Forms
|
|
234
|
+
```python
|
|
235
|
+
from pypdf import PdfReader, PdfWriter
|
|
236
|
+
|
|
237
|
+
reader = PdfReader("form.pdf")
|
|
238
|
+
writer = PdfWriter()
|
|
239
|
+
writer.append(reader)
|
|
240
|
+
|
|
241
|
+
# Get form field names
|
|
242
|
+
fields = reader.get_fields()
|
|
243
|
+
for name, field in fields.items():
|
|
244
|
+
print(f"Field: {name}, Type: {field.get('/FT')}")
|
|
245
|
+
|
|
246
|
+
# Fill fields
|
|
247
|
+
writer.update_page_form_field_values(
|
|
248
|
+
writer.pages[0],
|
|
249
|
+
{"field_name": "value", "another_field": "another_value"}
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
with open("filled_form.pdf", "wb") as output:
|
|
253
|
+
writer.write(output)
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### PDF to Images
|
|
257
|
+
```python
|
|
258
|
+
from pdf2image import convert_from_path
|
|
259
|
+
|
|
260
|
+
# Convert all pages
|
|
261
|
+
images = convert_from_path('document.pdf', dpi=300)
|
|
262
|
+
for i, image in enumerate(images):
|
|
263
|
+
image.save(f'page_{i+1}.png', 'PNG')
|
|
264
|
+
|
|
265
|
+
# Convert specific pages
|
|
266
|
+
images = convert_from_path('document.pdf', first_page=1, last_page=3)
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
## Installation Commands
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
# Core libraries
|
|
273
|
+
pip install pypdf pdfplumber reportlab
|
|
274
|
+
|
|
275
|
+
# OCR support
|
|
276
|
+
pip install pytesseract pdf2image
|
|
277
|
+
# Also needs: apt-get install tesseract-ocr poppler-utils
|
|
278
|
+
|
|
279
|
+
# CLI tools
|
|
280
|
+
apt-get install poppler-utils qpdf
|
|
281
|
+
|
|
282
|
+
# All at once
|
|
283
|
+
pip install pypdf pdfplumber reportlab pytesseract pdf2image
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
## Quick Reference
|
|
287
|
+
|
|
288
|
+
| Task | Best Tool | Command/Code |
|
|
289
|
+
|------|-----------|--------------|
|
|
290
|
+
| Read/extract text | pdfplumber | `page.extract_text()` |
|
|
291
|
+
| Extract tables | pdfplumber | `page.extract_tables()` |
|
|
292
|
+
| Merge PDFs | pypdf | `writer.add_page(page)` |
|
|
293
|
+
| Split PDFs | pypdf | One page per PdfWriter |
|
|
294
|
+
| Rotate pages | pypdf | `page.rotate(90)` |
|
|
295
|
+
| Create PDFs | reportlab | Canvas or Platypus |
|
|
296
|
+
| Fill forms | pypdf | `update_page_form_field_values()` |
|
|
297
|
+
| Add watermark | pypdf | `page.merge_page(watermark)` |
|
|
298
|
+
| Password protect | pypdf | `writer.encrypt()` |
|
|
299
|
+
| OCR scanned PDFs | pytesseract + pdf2image | Convert to image first |
|
|
300
|
+
| CLI text extract | poppler-utils | `pdftotext input.pdf` |
|
|
301
|
+
| CLI merge/split | qpdf | `qpdf --empty --pages ...` |
|
|
302
|
+
| PDF to images | pdf2image | `convert_from_path()` |
|
|
303
|
+
| Extract metadata | pypdf | `reader.metadata` |
|
|
package/skills/pdf/skill.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "pdf",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "PDF manipulation toolkit \u2014 extract text/tables, create, merge, split, rotate, OCR, fill forms, watermark, and password protect",
|
|
5
|
+
"compatibility": "OpenCode",
|
|
6
|
+
"agent": null,
|
|
7
|
+
"commands": [],
|
|
8
|
+
"tags": [
|
|
9
|
+
"pdf",
|
|
10
|
+
"extract",
|
|
11
|
+
"merge",
|
|
12
|
+
"ocr",
|
|
13
|
+
"forms",
|
|
14
|
+
"reportlab",
|
|
15
|
+
"pypdf"
|
|
16
|
+
]
|
|
17
|
+
}
|