@panda-agent/panda-cli 0.1.29 → 0.1.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/panda-cli-ink.bundle.mjs +258 -247
- package/package.json +6 -4
- package/skills/.gitkeep +0 -0
- package/skills/README.md +13 -0
- package/skills/docx/.skill-metadata.yaml +173 -0
- package/skills/docx/LICENSE.txt +30 -0
- package/skills/docx/SKILL.md +589 -0
- package/skills/docx/scripts/__init__.py +1 -0
- package/skills/docx/scripts/accept_changes.py +206 -0
- package/skills/docx/scripts/comment.py +442 -0
- package/skills/docx/scripts/office/helpers/__init__.py +1 -0
- package/skills/docx/scripts/office/helpers/merge_runs.py +190 -0
- package/skills/docx/scripts/office/helpers/simplify_redlines.py +185 -0
- package/skills/docx/scripts/office/pack.py +167 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/skills/docx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/skills/docx/scripts/office/soffice.py +194 -0
- package/skills/docx/scripts/office/unpack.py +145 -0
- package/skills/docx/scripts/office/validate.py +114 -0
- package/skills/docx/scripts/office/validators/__init__.py +16 -0
- package/skills/docx/scripts/office/validators/base.py +733 -0
- package/skills/docx/scripts/office/validators/docx.py +354 -0
- package/skills/docx/scripts/office/validators/pptx.py +230 -0
- package/skills/docx/scripts/office/validators/redlining.py +212 -0
- package/skills/docx/scripts/templates/comments.xml +3 -0
- package/skills/docx/scripts/templates/commentsExtended.xml +3 -0
- package/skills/docx/scripts/templates/commentsExtensible.xml +3 -0
- package/skills/docx/scripts/templates/commentsIds.xml +3 -0
- package/skills/docx/scripts/templates/people.xml +3 -0
- package/skills/frontend-design/LICENSE.txt +177 -0
- package/skills/frontend-design/SKILL.md +42 -0
- package/skills/pdf/.skill-metadata.yaml +273 -0
- package/skills/pdf/LICENSE.txt +30 -0
- package/skills/pdf/SKILL.md +324 -0
- package/skills/pdf/advanced-reference.md +609 -0
- package/skills/pdf/form-filling-guide.md +318 -0
- package/skills/pdf/forms.md +294 -0
- package/skills/pdf/reference.md +612 -0
- package/skills/pdf/scripts/check_bounding_boxes.py +198 -0
- package/skills/pdf/scripts/check_fillable_fields.py +64 -0
- package/skills/pdf/scripts/convert_pdf_to_images.py +102 -0
- package/skills/pdf/scripts/create_validation_image.py +125 -0
- package/skills/pdf/scripts/extract_form_field_info.py +220 -0
- package/skills/pdf/scripts/extract_form_structure.py +202 -0
- package/skills/pdf/scripts/fill_fillable_fields.py +205 -0
- package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +193 -0
- package/skills/pptx-generator/SKILL.md +204 -0
- package/skills/pptx-generator/assets/styles/business.json +8 -0
- package/skills/pptx-generator/assets/styles/minimal.json +8 -0
- package/skills/pptx-generator/assets/styles/modern.json +8 -0
- package/skills/pptx-generator/assets/templates/ppt_data_template.json +40 -0
- package/skills/pptx-generator/references/collaboration_guide.md +381 -0
- package/skills/pptx-generator/references/json_format_spec.md +215 -0
- package/skills/pptx-generator/references/layout_guide.md +290 -0
- package/skills/pptx-generator/scripts/json_validator.py +194 -0
- package/skills/pptx-generator/scripts/pptx_builder.py +340 -0
- package/skills/pptx-generator/scripts/pptx_validator.py +162 -0
- package/skills/skill-creator/LICENSE.txt +202 -0
- package/skills/skill-creator/SKILL.md +479 -0
- package/skills/skill-creator/agents/analyzer.md +274 -0
- package/skills/skill-creator/agents/comparator.md +202 -0
- package/skills/skill-creator/agents/grader.md +223 -0
- package/skills/skill-creator/assets/eval_review.html +146 -0
- package/skills/skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/skill-creator/references/schemas.md +430 -0
- package/skills/skill-creator/scripts/__init__.py +0 -0
- package/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/skill-creator/scripts/generate_report.py +326 -0
- package/skills/skill-creator/scripts/improve_description.py +248 -0
- package/skills/skill-creator/scripts/package_skill.py +136 -0
- package/skills/skill-creator/scripts/quick_validate.py +103 -0
- package/skills/skill-creator/scripts/run_eval.py +310 -0
- package/skills/skill-creator/scripts/run_loop.py +332 -0
- package/skills/skill-creator/scripts/utils.py +47 -0
- package/skills/xlsx/.skill-metadata.yaml +185 -0
- package/skills/xlsx/LICENSE.txt +30 -0
- package/skills/xlsx/SKILL.md +233 -0
- package/skills/xlsx/scripts/office/helpers/__init__.py +1 -0
- package/skills/xlsx/scripts/office/helpers/merge_runs.py +226 -0
- package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +198 -0
- package/skills/xlsx/scripts/office/pack.py +162 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/skills/xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/skills/xlsx/scripts/office/soffice.py +185 -0
- package/skills/xlsx/scripts/office/unpack.py +146 -0
- package/skills/xlsx/scripts/office/validate.py +108 -0
- package/skills/xlsx/scripts/office/validators/__init__.py +13 -0
- package/skills/xlsx/scripts/office/validators/base.py +800 -0
- package/skills/xlsx/scripts/office/validators/docx.py +383 -0
- package/skills/xlsx/scripts/office/validators/pptx.py +250 -0
- package/skills/xlsx/scripts/office/validators/redlining.py +229 -0
- package/skills/xlsx/scripts/recalc.py +296 -0
|
@@ -0,0 +1,609 @@
|
|
|
1
|
+
# Advanced PDF Toolkit Reference
|
|
2
|
+
|
|
3
|
+
This companion document covers advanced capabilities, auxiliary libraries, and specialized workflows beyond the essentials in the primary skill guide.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## pypdfium2 (Apache/BSD Licensed)
|
|
8
|
+
|
|
9
|
+
### About the Library
|
|
10
|
+
|
|
11
|
+
A Python wrapper around PDFium (the PDF rendering engine inside Chromium). It is well suited to high-fidelity page rasterization and can serve as an alternative to PyMuPDF.
|
|
12
|
+
|
|
13
|
+
### Page Rasterization
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
import pypdfium2 as pdfium
|
|
17
|
+
from PIL import Image
|
|
18
|
+
|
|
19
|
+
doc = pdfium.PdfDocument("document.pdf")
|
|
20
|
+
|
|
21
|
+
# Render the first page at double resolution
|
|
22
|
+
first_page = doc[0]
|
|
23
|
+
bmp = first_page.render(scale=2.0, rotation=0)
|
|
24
|
+
|
|
25
|
+
# Export as PIL image
|
|
26
|
+
pil_img = bmp.to_pil()
|
|
27
|
+
pil_img.save("page_1.png", "PNG")
|
|
28
|
+
|
|
29
|
+
# Batch-render every page
|
|
30
|
+
for n, pg in enumerate(doc):
|
|
31
|
+
bmp = pg.render(scale=1.5)
|
|
32
|
+
pil_img = bmp.to_pil()
|
|
33
|
+
pil_img.save("page_{}.jpg".format(n + 1), "JPEG", quality=90)
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### Textual Content Retrieval
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import pypdfium2 as pdfium
|
|
40
|
+
|
|
41
|
+
doc = pdfium.PdfDocument("document.pdf")
|
|
42
|
+
for n, pg in enumerate(doc):
|
|
43
|
+
raw = pg.get_textpage().get_text_range()
|
|
44
|
+
print("Page {} text length: {} chars".format(n + 1, len(raw)))
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## JavaScript Ecosystem
|
|
50
|
+
|
|
51
|
+
### pdf-lib (MIT Licensed)
|
|
52
|
+
|
|
53
|
+
A versatile JS library for constructing and editing PDF documents across all JavaScript runtimes.
|
|
54
|
+
|
|
55
|
+
#### Modifying an Existing Document
|
|
56
|
+
|
|
57
|
+
```javascript
|
|
58
|
+
import { PDFDocument } from 'pdf-lib';
|
|
59
|
+
import fs from 'fs';
|
|
60
|
+
|
|
61
|
+
async function manipulatePDF() {
|
|
62
|
+
const rawBytes = fs.readFileSync('input.pdf');
|
|
63
|
+
const doc = await PDFDocument.load(rawBytes);
|
|
64
|
+
|
|
65
|
+
const numPages = doc.getPageCount();
|
|
66
|
+
console.log(`Document has ${numPages} pages`);
|
|
67
|
+
|
|
68
|
+
const fresh = doc.addPage([600, 400]);
|
|
69
|
+
fresh.drawText('Added by pdf-lib', {
|
|
70
|
+
x: 100,
|
|
71
|
+
y: 300,
|
|
72
|
+
size: 16
|
|
73
|
+
});
|
|
74
|
+
|
|
75
|
+
const output = await doc.save();
|
|
76
|
+
fs.writeFileSync('modified.pdf', output);
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
#### Building a Document From Scratch
|
|
81
|
+
|
|
82
|
+
```javascript
|
|
83
|
+
import { PDFDocument, rgb, StandardFonts } from 'pdf-lib';
|
|
84
|
+
import fs from 'fs';
|
|
85
|
+
|
|
86
|
+
async function createPDF() {
|
|
87
|
+
const doc = await PDFDocument.create();
|
|
88
|
+
|
|
89
|
+
const regular = await doc.embedFont(StandardFonts.Helvetica);
|
|
90
|
+
const bold = await doc.embedFont(StandardFonts.HelveticaBold);
|
|
91
|
+
|
|
92
|
+
const sheet = doc.addPage([595, 842]); // A4 dimensions
|
|
93
|
+
const { width, height } = sheet.getSize();
|
|
94
|
+
|
|
95
|
+
sheet.drawText('Invoice #12345', {
|
|
96
|
+
x: 50,
|
|
97
|
+
y: height - 50,
|
|
98
|
+
size: 18,
|
|
99
|
+
font: bold,
|
|
100
|
+
color: rgb(0.2, 0.2, 0.8)
|
|
101
|
+
});
|
|
102
|
+
|
|
103
|
+
sheet.drawRectangle({
|
|
104
|
+
x: 40,
|
|
105
|
+
y: height - 100,
|
|
106
|
+
width: width - 80,
|
|
107
|
+
height: 30,
|
|
108
|
+
color: rgb(0.9, 0.9, 0.9)
|
|
109
|
+
});
|
|
110
|
+
|
|
111
|
+
const rows = [
|
|
112
|
+
['Item', 'Qty', 'Price', 'Total'],
|
|
113
|
+
['Widget', '2', '$50', '$100'],
|
|
114
|
+
['Gadget', '1', '$75', '$75']
|
|
115
|
+
];
|
|
116
|
+
|
|
117
|
+
let cursorY = height - 150;
|
|
118
|
+
rows.forEach(cells => {
|
|
119
|
+
let cursorX = 50;
|
|
120
|
+
cells.forEach(val => {
|
|
121
|
+
sheet.drawText(val, {
|
|
122
|
+
x: cursorX,
|
|
123
|
+
y: cursorY,
|
|
124
|
+
size: 12,
|
|
125
|
+
font: regular
|
|
126
|
+
});
|
|
127
|
+
cursorX += 120;
|
|
128
|
+
});
|
|
129
|
+
cursorY -= 25;
|
|
130
|
+
});
|
|
131
|
+
|
|
132
|
+
const bytes = await doc.save();
|
|
133
|
+
fs.writeFileSync('created.pdf', bytes);
|
|
134
|
+
}
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
#### Page Selection and Merging
|
|
138
|
+
|
|
139
|
+
```javascript
|
|
140
|
+
import { PDFDocument } from 'pdf-lib';
|
|
141
|
+
import fs from 'fs';
|
|
142
|
+
|
|
143
|
+
async function mergePDFs() {
|
|
144
|
+
const combined = await PDFDocument.create();
|
|
145
|
+
|
|
146
|
+
const src1 = await PDFDocument.load(fs.readFileSync('doc1.pdf'));
|
|
147
|
+
const src2 = await PDFDocument.load(fs.readFileSync('doc2.pdf'));
|
|
148
|
+
|
|
149
|
+
const allFromFirst = await combined.copyPages(src1, src1.getPageIndices());
|
|
150
|
+
allFromFirst.forEach(p => combined.addPage(p));
|
|
151
|
+
|
|
152
|
+
const selectedFromSecond = await combined.copyPages(src2, [0, 2, 4]);
|
|
153
|
+
selectedFromSecond.forEach(p => combined.addPage(p));
|
|
154
|
+
|
|
155
|
+
const result = await combined.save();
|
|
156
|
+
fs.writeFileSync('merged.pdf', result);
|
|
157
|
+
}
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### pdfjs-dist (Apache Licensed)
|
|
161
|
+
|
|
162
|
+
Mozilla's client-side PDF rendering engine.
|
|
163
|
+
|
|
164
|
+
#### Loading and Rendering
|
|
165
|
+
|
|
166
|
+
```javascript
|
|
167
|
+
import * as pdfjsLib from 'pdfjs-dist';
|
|
168
|
+
|
|
169
|
+
pdfjsLib.GlobalWorkerOptions.workerSrc = './pdf.worker.js';
|
|
170
|
+
|
|
171
|
+
async function renderPDF() {
|
|
172
|
+
const task = pdfjsLib.getDocument('document.pdf');
|
|
173
|
+
const doc = await task.promise;
|
|
174
|
+
|
|
175
|
+
console.log(`Loaded PDF with ${doc.numPages} pages`);
|
|
176
|
+
|
|
177
|
+
const pg = await doc.getPage(1);
|
|
178
|
+
const vp = pg.getViewport({ scale: 1.5 });
|
|
179
|
+
|
|
180
|
+
const cvs = document.createElement('canvas');
|
|
181
|
+
const ctx = cvs.getContext('2d');
|
|
182
|
+
cvs.height = vp.height;
|
|
183
|
+
cvs.width = vp.width;
|
|
184
|
+
|
|
185
|
+
await pg.render({ canvasContext: ctx, viewport: vp }).promise;
|
|
186
|
+
document.body.appendChild(cvs);
|
|
187
|
+
}
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
#### Positioned Text Extraction
|
|
191
|
+
|
|
192
|
+
```javascript
|
|
193
|
+
import * as pdfjsLib from 'pdfjs-dist';
|
|
194
|
+
|
|
195
|
+
async function extractText() {
|
|
196
|
+
const task = pdfjsLib.getDocument('document.pdf');
|
|
197
|
+
const doc = await task.promise;
|
|
198
|
+
|
|
199
|
+
let accumulated = '';
|
|
200
|
+
|
|
201
|
+
for (let n = 1; n <= doc.numPages; n++) {
|
|
202
|
+
const pg = await doc.getPage(n);
|
|
203
|
+
const tc = await pg.getTextContent();
|
|
204
|
+
|
|
205
|
+
const pageStr = tc.items.map(el => el.str).join(' ');
|
|
206
|
+
accumulated += `\n--- Page ${n} ---\n${pageStr}`;
|
|
207
|
+
|
|
208
|
+
const positioned = tc.items.map(el => ({
|
|
209
|
+
text: el.str,
|
|
210
|
+
x: el.transform[4],
|
|
211
|
+
y: el.transform[5],
|
|
212
|
+
width: el.width,
|
|
213
|
+
height: el.height
|
|
214
|
+
}));
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
console.log(accumulated);
|
|
218
|
+
return accumulated;
|
|
219
|
+
}
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
#### Annotation and Form Field Discovery
|
|
223
|
+
|
|
224
|
+
```javascript
|
|
225
|
+
import * as pdfjsLib from 'pdfjs-dist';
|
|
226
|
+
|
|
227
|
+
async function extractAnnotations() {
|
|
228
|
+
const task = pdfjsLib.getDocument('annotated.pdf');
|
|
229
|
+
const doc = await task.promise;
|
|
230
|
+
|
|
231
|
+
for (let n = 1; n <= doc.numPages; n++) {
|
|
232
|
+
const pg = await doc.getPage(n);
|
|
233
|
+
const notes = await pg.getAnnotations();
|
|
234
|
+
|
|
235
|
+
notes.forEach(note => {
|
|
236
|
+
console.log(`Annotation type: ${note.subtype}`);
|
|
237
|
+
console.log(`Content: ${note.contents}`);
|
|
238
|
+
console.log(`Coordinates: ${JSON.stringify(note.rect)}`);
|
|
239
|
+
});
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## Advanced Shell Operations
|
|
247
|
+
|
|
248
|
+
### poppler-utils: Deep Features
|
|
249
|
+
|
|
250
|
+
#### Coordinate-Tagged Text Export
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
# Produce XHTML (XML) with bounding boxes for every block, line, and word
|
|
254
|
+
pdftotext -bbox-layout document.pdf output.xml
|
|
255
|
+
|
|
256
|
+
# The XML contains exact spatial data for every text fragment
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
#### High-Fidelity Image Conversion
|
|
260
|
+
|
|
261
|
+
```bash
|
|
262
|
+
# PNG output at 300 DPI
|
|
263
|
+
pdftoppm -png -r 300 document.pdf output_prefix
|
|
264
|
+
|
|
265
|
+
# Selective pages at maximum quality
|
|
266
|
+
pdftoppm -png -r 600 -f 1 -l 3 document.pdf high_res_pages
|
|
267
|
+
|
|
268
|
+
# JPEG with compression control
|
|
269
|
+
pdftoppm -jpeg -jpegopt quality=85 -r 200 document.pdf jpeg_output
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
#### Embedded Image Retrieval
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
# Extract all images (JPEG-encoded ones saved as .jpg, others as PPM/PBM), with page numbers in the file names
|
|
276
|
+
pdfimages -j -p document.pdf page_images
|
|
277
|
+
|
|
278
|
+
# Catalogue images without extraction
|
|
279
|
+
pdfimages -list document.pdf
|
|
280
|
+
|
|
281
|
+
# Native-format extraction
|
|
282
|
+
pdfimages -all document.pdf images/img
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
### qpdf: Power Features
|
|
286
|
+
|
|
287
|
+
#### Sophisticated Page Operations
|
|
288
|
+
|
|
289
|
+
```bash
|
|
290
|
+
# Chunk-split every 3 pages
|
|
291
|
+
qpdf --split-pages=3 input.pdf output_group_%d.pdf
|
|
292
|
+
|
|
293
|
+
# Complex range expressions
|
|
294
|
+
qpdf input.pdf --pages input.pdf 1,3-5,8,10-end -- extracted.pdf
|
|
295
|
+
|
|
296
|
+
# Cross-document page assembly
|
|
297
|
+
qpdf --empty --pages doc1.pdf 1-3 doc2.pdf 5-7 doc3.pdf 2,4 -- combined.pdf
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
#### Optimization and Recovery
|
|
301
|
+
|
|
302
|
+
```bash
|
|
303
|
+
# Web-optimized streaming layout
|
|
304
|
+
qpdf --linearize input.pdf optimized.pdf
|
|
305
|
+
|
|
306
|
+
# Aggressive size reduction
|
|
307
|
+
qpdf --object-streams=generate --recompress-flate --compression-level=9 input.pdf compressed.pdf
|
|
308
|
+
|
|
309
|
+
# Structural integrity check
|
|
310
|
+
qpdf --check input.pdf
|
|
311
|
+
qpdf damaged.pdf repaired.pdf  # rewriting the file lets qpdf recover what it can
|
|
312
|
+
|
|
313
|
+
# Dump internal structure
|
|
314
|
+
qpdf --show-pages input.pdf > structure.txt
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
#### Encryption Management
|
|
318
|
+
|
|
319
|
+
```bash
|
|
320
|
+
# Apply 256-bit encryption with restricted permissions
|
|
321
|
+
qpdf --encrypt user_pass owner_pass 256 --print=none --modify=none -- input.pdf encrypted.pdf
|
|
322
|
+
|
|
323
|
+
# Inspect protection status
|
|
324
|
+
qpdf --show-encryption encrypted.pdf
|
|
325
|
+
|
|
326
|
+
# Strip encryption (password required)
|
|
327
|
+
qpdf --password=secret123 --decrypt encrypted.pdf decrypted.pdf
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
---
|
|
331
|
+
|
|
332
|
+
## Advanced Python Patterns
|
|
333
|
+
|
|
334
|
+
### pdfplumber: Precision Features
|
|
335
|
+
|
|
336
|
+
#### Character-Level Coordinate Access
|
|
337
|
+
|
|
338
|
+
```python
|
|
339
|
+
import pdfplumber
|
|
340
|
+
|
|
341
|
+
with pdfplumber.open("document.pdf") as doc:
|
|
342
|
+
pg = doc.pages[0]
|
|
343
|
+
|
|
344
|
+
# Individual character positions
|
|
345
|
+
for ch in pg.chars[:10]:
|
|
346
|
+
print("Char: '{}' at x:{:.1f} y:{:.1f}".format(ch['text'], ch['x0'], ch['y0']))
|
|
347
|
+
|
|
348
|
+
# Region-bounded text extraction (left, top, right, bottom)
|
|
349
|
+
region_text = pg.within_bbox((100, 100, 400, 200)).extract_text()
|
|
350
|
+
```
|
|
351
|
+
|
|
352
|
+
#### Custom Table Detection Parameters
|
|
353
|
+
|
|
354
|
+
```python
|
|
355
|
+
import pdfplumber
|
|
356
|
+
import pandas as pd
|
|
357
|
+
|
|
358
|
+
with pdfplumber.open("complex_table.pdf") as doc:
|
|
359
|
+
pg = doc.pages[0]
|
|
360
|
+
|
|
361
|
+
config = {
|
|
362
|
+
"vertical_strategy": "lines",
|
|
363
|
+
"horizontal_strategy": "lines",
|
|
364
|
+
"snap_tolerance": 3,
|
|
365
|
+
"intersection_tolerance": 15
|
|
366
|
+
}
|
|
367
|
+
found = pg.extract_tables(config)
|
|
368
|
+
|
|
369
|
+
# Debug visualization
|
|
370
|
+
debug_img = pg.to_image(resolution=150)
|
|
371
|
+
debug_img.save("debug_layout.png")
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
### reportlab: Professional Output
|
|
375
|
+
|
|
376
|
+
#### Styled Tabular Reports
|
|
377
|
+
|
|
378
|
+
```python
|
|
379
|
+
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph
|
|
380
|
+
from reportlab.lib.styles import getSampleStyleSheet
|
|
381
|
+
from reportlab.lib import colors
|
|
382
|
+
|
|
383
|
+
records = [
|
|
384
|
+
['Product', 'Q1', 'Q2', 'Q3', 'Q4'],
|
|
385
|
+
['Widgets', '120', '135', '142', '158'],
|
|
386
|
+
['Gadgets', '85', '92', '98', '105']
|
|
387
|
+
]
|
|
388
|
+
|
|
389
|
+
template = SimpleDocTemplate("report.pdf")
|
|
390
|
+
parts = []
|
|
391
|
+
|
|
392
|
+
styles = getSampleStyleSheet()
|
|
393
|
+
parts.append(Paragraph("Quarterly Sales Report", styles['Title']))
|
|
394
|
+
|
|
395
|
+
grid = Table(records)
|
|
396
|
+
grid.setStyle(TableStyle([
|
|
397
|
+
('BACKGROUND', (0, 0), (-1, 0), colors.grey),
|
|
398
|
+
('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
|
|
399
|
+
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
|
|
400
|
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
|
401
|
+
('FONTSIZE', (0, 0), (-1, 0), 14),
|
|
402
|
+
('BOTTOMPADDING', (0, 0), (-1, 0), 12),
|
|
403
|
+
('BACKGROUND', (0, 1), (-1, -1), colors.beige),
|
|
404
|
+
('GRID', (0, 0), (-1, -1), 1, colors.black)
|
|
405
|
+
]))
|
|
406
|
+
parts.append(grid)
|
|
407
|
+
|
|
408
|
+
template.build(parts)
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
---
|
|
412
|
+
|
|
413
|
+
## Composite Workflows
|
|
414
|
+
|
|
415
|
+
### Extracting Visual Assets
|
|
416
|
+
|
|
417
|
+
#### Approach 1: Shell-Based (Fastest)
|
|
418
|
+
|
|
419
|
+
```bash
|
|
420
|
+
pdfimages -all document.pdf images/img
|
|
421
|
+
```
|
|
422
|
+
|
|
423
|
+
#### Approach 2: Programmatic with pypdfium2
|
|
424
|
+
|
|
425
|
+
```python
|
|
426
|
+
import pypdfium2 as pdfium
|
|
427
|
+
from PIL import Image
|
|
428
|
+
import numpy as np
|
|
429
|
+
|
|
430
|
+
def extract_figures(pdf_path, output_dir):
|
|
431
|
+
doc = pdfium.PdfDocument(pdf_path)
|
|
432
|
+
|
|
433
|
+
for pg_idx, pg in enumerate(doc):
|
|
434
|
+
bmp = pg.render(scale=3.0)
|
|
435
|
+
frame = bmp.to_pil()
|
|
436
|
+
|
|
437
|
+
arr = np.array(frame)
|
|
438
|
+
|
|
439
|
+
# Naive figure detection: non-white pixel regions
|
|
440
|
+
non_white = np.any(arr != [255, 255, 255], axis=2)
|
|
441
|
+
|
|
442
|
+
# Contour analysis and bounding box extraction
|
|
443
|
+
# (Simplified — production use needs more robust detection)
|
|
444
|
+
|
|
445
|
+
# Save discovered regions
|
|
446
|
+
# ... details depend on requirements
|
|
447
|
+
```
|
|
448
|
+
|
|
449
|
+
### Batch Operations with Resilience
|
|
450
|
+
|
|
451
|
+
```python
|
|
452
|
+
import os
|
|
453
|
+
import glob
|
|
454
|
+
import logging
|
|
455
|
+
|
|
456
|
+
import pypdf
|
|
457
|
+
|
|
458
|
+
logging.basicConfig(level=logging.INFO)
|
|
459
|
+
log = logging.getLogger(__name__)
|
|
460
|
+
|
|
461
|
+
def batch_process(source_dir, mode='merge'):
|
|
462
|
+
targets = glob.glob(os.path.join(source_dir, "*.pdf"))
|
|
463
|
+
|
|
464
|
+
if mode == 'merge':
|
|
465
|
+
output = pypdf.PdfWriter()
|
|
466
|
+
for path in targets:
|
|
467
|
+
try:
|
|
468
|
+
rdr = pypdf.PdfReader(path)
|
|
469
|
+
for pg in rdr.pages:
|
|
470
|
+
output.add_page(pg)
|
|
471
|
+
log.info("Processed: %s", path)
|
|
472
|
+
except Exception as exc:
|
|
473
|
+
log.error("Failed to process %s: %s", path, exc)
|
|
474
|
+
continue
|
|
475
|
+
|
|
476
|
+
with open("batch_merged.pdf", "wb") as dest:
|
|
477
|
+
output.write(dest)
|
|
478
|
+
|
|
479
|
+
elif mode == 'extract_text':
|
|
480
|
+
for path in targets:
|
|
481
|
+
try:
|
|
482
|
+
rdr = pypdf.PdfReader(path)
|
|
483
|
+
content = "\n".join(pg.extract_text() or "" for pg in rdr.pages)
|
|
484
|
+
|
|
485
|
+
txt_path = os.path.splitext(path)[0] + '.txt'
|
|
486
|
+
with open(txt_path, 'w', encoding='utf-8') as out:
|
|
487
|
+
out.write(content)
|
|
488
|
+
log.info("Extracted text from: %s", path)
|
|
489
|
+
|
|
490
|
+
except Exception as exc:
|
|
491
|
+
log.error("Failed to extract text from %s: %s", path, exc)
|
|
492
|
+
continue
|
|
493
|
+
```
|
|
494
|
+
|
|
495
|
+
### Region Cropping
|
|
496
|
+
|
|
497
|
+
```python
|
|
498
|
+
import pypdf
|
|
499
|
+
|
|
500
|
+
source = pypdf.PdfReader("input.pdf")
|
|
501
|
+
output = pypdf.PdfWriter()
|
|
502
|
+
|
|
503
|
+
# Define visible area (left, bottom, right, top in points)
|
|
504
|
+
pg = source.pages[0]
|
|
505
|
+
pg.mediabox.left = 50
|
|
506
|
+
pg.mediabox.bottom = 50
|
|
507
|
+
pg.mediabox.right = 550
|
|
508
|
+
pg.mediabox.top = 750
|
|
509
|
+
|
|
510
|
+
output.add_page(pg)
|
|
511
|
+
with open("cropped.pdf", "wb") as dest:
|
|
512
|
+
output.write(dest)
|
|
513
|
+
```
|
|
514
|
+
|
|
515
|
+
---
|
|
516
|
+
|
|
517
|
+
## Performance Guidelines
|
|
518
|
+
|
|
519
|
+
### 1. Handling Large Documents
|
|
520
|
+
- Process pages individually rather than loading entire files into memory
|
|
521
|
+
- Leverage `qpdf --split-pages` for breaking apart large PDFs
|
|
522
|
+
- Use pypdfium2 for per-page rendering without full document buffering
|
|
523
|
+
|
|
524
|
+
### 2. Text Extraction Speed
|
|
525
|
+
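- `pdftotext` (add `-layout` to preserve columns) provides the fastest plain-text pipeline; use `-bbox-layout` only when you need coordinates, since it emits XHTML rather than plain text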
|
|
526
|
+
- pdfplumber excels at structured/tabular content
|
|
527
|
+
- Avoid calling pypdf's `page.extract_text()` across every page of very large files
|
|
528
|
+
|
|
529
|
+
### 3. Image Extraction Efficiency
|
|
530
|
+
- `pdfimages` significantly outperforms page rendering for embedded assets
|
|
531
|
+
- Use low DPI for thumbnails, high DPI for production output
|
|
532
|
+
|
|
533
|
+
### 4. Form Processing
|
|
534
|
+
- pdf-lib preserves form structure more reliably than most alternatives
|
|
535
|
+
- Always validate field specifications before bulk processing
|
|
536
|
+
|
|
537
|
+
### 5. Memory-Conscious Processing
|
|
538
|
+
|
|
539
|
+
```python
|
|
540
|
+
import pypdf
|
|
541
|
+
|
|
542
|
+
def chunked_processing(pdf_path, pages_per_chunk=10):
|
|
543
|
+
source = pypdf.PdfReader(pdf_path)
|
|
544
|
+
n_pages = len(source.pages)
|
|
545
|
+
|
|
546
|
+
for offset in range(0, n_pages, pages_per_chunk):
|
|
547
|
+
limit = min(offset + pages_per_chunk, n_pages)
|
|
548
|
+
chunk = pypdf.PdfWriter()
|
|
549
|
+
|
|
550
|
+
for k in range(offset, limit):
|
|
551
|
+
chunk.add_page(source.pages[k])
|
|
552
|
+
|
|
553
|
+
with open("chunk_{}.pdf".format(offset // pages_per_chunk), "wb") as dest:
|
|
554
|
+
chunk.write(dest)
|
|
555
|
+
```
|
|
556
|
+
|
|
557
|
+
---
|
|
558
|
+
|
|
559
|
+
## Diagnosing Common Problems
|
|
560
|
+
|
|
561
|
+
### Encrypted Documents
|
|
562
|
+
|
|
563
|
+
```python
|
|
564
|
+
import pypdf
|
|
565
|
+
|
|
566
|
+
try:
|
|
567
|
+
doc = pypdf.PdfReader("encrypted.pdf")
|
|
568
|
+
if doc.is_encrypted:
|
|
569
|
+
doc.decrypt("password")
|
|
570
|
+
except Exception as exc:
|
|
571
|
+
print("Failed to decrypt: {}".format(exc))
|
|
572
|
+
```
|
|
573
|
+
|
|
574
|
+
### Damaged Files
|
|
575
|
+
|
|
576
|
+
```bash
|
|
577
|
+
# Verify structural integrity
|
|
578
|
+
qpdf --check corrupted.pdf
|
|
579
|
+
qpdf --replace-input corrupted.pdf  # rewrites the file in place (repairing what qpdf can) - back up the original first
|
|
580
|
+
```
|
|
581
|
+
|
|
582
|
+
### Unreadable Scanned Content
|
|
583
|
+
|
|
584
|
+
```python
|
|
585
|
+
import pytesseract
|
|
586
|
+
from pdf2image import convert_from_path
|
|
587
|
+
|
|
588
|
+
def ocr_fallback(pdf_path):
|
|
589
|
+
frames = convert_from_path(pdf_path)
|
|
590
|
+
content = ""
|
|
591
|
+
for frame in frames:
|
|
592
|
+
content += pytesseract.image_to_string(frame)
|
|
593
|
+
return content
|
|
594
|
+
```
|
|
595
|
+
|
|
596
|
+
---
|
|
597
|
+
|
|
598
|
+
## Licensing Summary
|
|
599
|
+
|
|
600
|
+
| Library | License |
|
|
601
|
+
|---------|---------|
|
|
602
|
+
| pypdf | BSD |
|
|
603
|
+
| pdfplumber | MIT |
|
|
604
|
+
| pypdfium2 | Apache/BSD |
|
|
605
|
+
| reportlab | BSD |
|
|
606
|
+
| poppler-utils | GPL-2 |
|
|
607
|
+
| qpdf | Apache |
|
|
608
|
+
| pdf-lib | MIT |
|
|
609
|
+
| pdfjs-dist | Apache |
|