opencodekit 0.16.18 → 0.16.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +8 -18
- package/dist/template/.opencode/opencode.json +160 -59
- package/dist/template/.opencode/skill/context-management/SKILL.md +47 -60
- package/dist/template/.opencode/skill/pdf-extract/SKILL.md +428 -0
- package/dist/template/.opencode/skill/playwright/SKILL.md +263 -65
- package/package.json +3 -13
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: pdf-extract
|
|
3
|
+
description: Extract text, images, tables, and metadata from PDF files. Choose the right library based on extraction needs - text only, structured data, or complex layouts.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# PDF Content Extraction
|
|
7
|
+
|
|
8
|
+
Extract content from PDF files using the best library for your specific use case.
|
|
9
|
+
|
|
10
|
+
## Quick Decision Guide
|
|
11
|
+
|
|
12
|
+
| Use Case | Recommended Library | Why |
|
|
13
|
+
| --------------------------- | ------------------- | ---------------------------------- |
|
|
14
|
+
| Simple text extraction | `pdf-parse` (v2+) | Fast, lightweight, pure TypeScript |
|
|
15
|
+
| Complex layouts/coordinates | `pdfjs-dist` | Full control, precise positioning |
|
|
16
|
+
| Tables/tabular data | `pdf-data-parser` | Built for grid-based content |
|
|
17
|
+
| Forms (XFA) | `pdf-lib` + custom | Form field extraction |
|
|
18
|
+
| Browser + Node.js | `pdf-parse` v2 | Cross-platform, works everywhere |
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Library 1: pdf-parse (Recommended for Text)
|
|
23
|
+
|
|
24
|
+
**Best for:** Simple text extraction, metadata, fast processing
|
|
25
|
+
|
|
26
|
+
### Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
npm install pdf-parse
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Basic Text Extraction
|
|
33
|
+
|
|
34
|
+
```typescript
|
|
35
|
+
import { PDFParse } from "pdf-parse";
|
|
36
|
+
import { readFile } from "fs/promises";
|
|
37
|
+
|
|
38
|
+
async function extractText(filePath: string): Promise<string> {
|
|
39
|
+
const parser = new PDFParse();
|
|
40
|
+
const buffer = await readFile(filePath);
|
|
41
|
+
|
|
42
|
+
const result = await parser.parse(buffer);
|
|
43
|
+
return result.text;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
// Usage
|
|
47
|
+
const text = await extractText("./document.pdf");
|
|
48
|
+
console.log(text);
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Extract with Metadata
|
|
52
|
+
|
|
53
|
+
```typescript
|
|
54
|
+
import { PDFParse } from "pdf-parse";
import { readFile } from "fs/promises";
|
|
55
|
+
|
|
56
|
+
async function extractWithMetadata(filePath: string) {
|
|
57
|
+
const parser = new PDFParse();
|
|
58
|
+
const buffer = await readFile(filePath);
|
|
59
|
+
|
|
60
|
+
const result = await parser.parse(buffer);
|
|
61
|
+
|
|
62
|
+
return {
|
|
63
|
+
text: result.text,
|
|
64
|
+
info: result.info, // Document metadata
|
|
65
|
+
numpages: result.numpages,
|
|
66
|
+
version: result.version,
|
|
67
|
+
};
|
|
68
|
+
}
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Extract Specific Pages
|
|
72
|
+
|
|
73
|
+
```typescript
|
|
74
|
+
import { PDFParse } from "pdf-parse";
import { readFile } from "fs/promises";
|
|
75
|
+
|
|
76
|
+
async function extractPage(filePath: string, pageNum: number) {
|
|
77
|
+
const parser = new PDFParse();
|
|
78
|
+
const buffer = await readFile(filePath);
|
|
79
|
+
|
|
80
|
+
const result = await parser.parse(buffer, {
|
|
81
|
+
max: pageNum,
|
|
82
|
+
min: pageNum,
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
return result.text;
|
|
86
|
+
}
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### URL-based Extraction (without downloading full file)
|
|
90
|
+
|
|
91
|
+
```typescript
|
|
92
|
+
import { getHeader } from "pdf-parse/node";
|
|
93
|
+
|
|
94
|
+
async function checkPDFHeaders(url: string) {
|
|
95
|
+
// Check file size and headers before downloading
|
|
96
|
+
const headers = await getHeader(url, true);
|
|
97
|
+
console.log(`File size: ${headers.size} bytes`);
|
|
98
|
+
|
|
99
|
+
if (headers.size > 10 * 1024 * 1024) {
|
|
100
|
+
console.warn("Large PDF - consider streaming");
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## Library 2: pdfjs-dist (Mozilla PDF.js)
|
|
108
|
+
|
|
109
|
+
**Best for:** Complex layouts, coordinates, images, page-by-page control
|
|
110
|
+
|
|
111
|
+
### Installation
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
npm install pdfjs-dist
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Basic Text Extraction with Coordinates
|
|
118
|
+
|
|
119
|
+
```typescript
|
|
120
|
+
import * as pdfjsLib from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
121
|
+
import { readFile } from "fs/promises";
|
|
122
|
+
import path from "path";
|
|
123
|
+
|
|
124
|
+
async function extractWithCoordinates(pdfPath: string) {
|
|
125
|
+
const data = await readFile(pdfPath);
|
|
126
|
+
const dataArray = new Uint8Array(data);
|
|
127
|
+
|
|
128
|
+
const pdfDocument = await pdfjsLib.getDocument({
|
|
129
|
+
data: dataArray,
|
|
130
|
+
standardFontDataUrl: path.join(process.cwd(), "node_modules/pdfjs-dist/standard_fonts/"),
|
|
131
|
+
}).promise;
|
|
132
|
+
|
|
133
|
+
const numPages = pdfDocument.numPages;
|
|
134
|
+
const results = [];
|
|
135
|
+
|
|
136
|
+
for (let pageNum = 1; pageNum <= numPages; pageNum++) {
|
|
137
|
+
const page = await pdfDocument.getPage(pageNum);
|
|
138
|
+
const textContent = await page.getTextContent();
|
|
139
|
+
|
|
140
|
+
const pageText = textContent.items.map((item: any) => ({
|
|
141
|
+
text: item.str,
|
|
142
|
+
x: item.transform[4],
|
|
143
|
+
y: item.transform[5],
|
|
144
|
+
font: item.fontName,
|
|
145
|
+
width: item.width,
|
|
146
|
+
height: item.height,
|
|
147
|
+
}));
|
|
148
|
+
|
|
149
|
+
results.push({
|
|
150
|
+
page: pageNum,
|
|
151
|
+
items: pageText,
|
|
152
|
+
});
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
return results;
|
|
156
|
+
}
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Extract Images from PDF
|
|
160
|
+
|
|
161
|
+
```typescript
|
|
162
|
+
import * as pdfjsLib from "pdfjs-dist/legacy/build/pdf.mjs";
import { readFile } from "fs/promises";
|
|
163
|
+
|
|
164
|
+
async function extractImages(pdfPath: string) {
|
|
165
|
+
const data = await readFile(pdfPath);
|
|
166
|
+
const pdfDocument = await pdfjsLib.getDocument({ data: new Uint8Array(data) }).promise;
|
|
167
|
+
|
|
168
|
+
const images = [];
|
|
169
|
+
|
|
170
|
+
for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) {
|
|
171
|
+
const page = await pdfDocument.getPage(pageNum);
|
|
172
|
+
const ops = await page.getOperatorList();
|
|
173
|
+
|
|
174
|
+
for (let i = 0; i < ops.fnArray.length; i++) {
|
|
175
|
+
if (ops.fnArray[i] === pdfjsLib.OPS.paintImageXObject) {
|
|
176
|
+
const imageName = ops.argsArray[i][0];
|
|
177
|
+
const image = await page.objs.get(imageName);
|
|
178
|
+
|
|
179
|
+
images.push({
|
|
180
|
+
page: pageNum,
|
|
181
|
+
name: imageName,
|
|
182
|
+
width: image.width,
|
|
183
|
+
height: image.height,
|
|
184
|
+
data: image.data, // Raw image data
|
|
185
|
+
});
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
return images;
|
|
191
|
+
}
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### Render Page to Image
|
|
195
|
+
|
|
196
|
+
```typescript
|
|
197
|
+
import * as pdfjsLib from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
198
|
+
import { createCanvas } from "canvas";
|
|
199
|
+
import { readFile, writeFile } from "fs/promises";
|
|
200
|
+
|
|
201
|
+
async function renderPageToImage(pdfPath: string, pageNum: number, outputPath: string) {
|
|
202
|
+
const data = await readFile(pdfPath);
|
|
203
|
+
const pdfDocument = await pdfjsLib.getDocument({ data: new Uint8Array(data) }).promise;
|
|
204
|
+
|
|
205
|
+
const page = await pdfDocument.getPage(pageNum);
|
|
206
|
+
const viewport = page.getViewport({ scale: 2.0 }); // Higher scale = better quality
|
|
207
|
+
|
|
208
|
+
const canvas = createCanvas(viewport.width, viewport.height);
|
|
209
|
+
const context = canvas.getContext("2d");
|
|
210
|
+
|
|
211
|
+
await page.render({
|
|
212
|
+
canvasContext: context,
|
|
213
|
+
viewport: viewport,
|
|
214
|
+
}).promise;
|
|
215
|
+
|
|
216
|
+
const buffer = canvas.toBuffer("image/png");
|
|
217
|
+
await writeFile(outputPath, buffer);
|
|
218
|
+
|
|
219
|
+
console.log(`Page ${pageNum} saved to ${outputPath}`);
|
|
220
|
+
}
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
---
|
|
224
|
+
|
|
225
|
+
## Library 3: pdf-data-parser (Tables)
|
|
226
|
+
|
|
227
|
+
**Best for:** Tabular data, structured grid content
|
|
228
|
+
|
|
229
|
+
### Installation
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
npm install pdf-data-parser
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### Extract Tables
|
|
236
|
+
|
|
237
|
+
```typescript
|
|
238
|
+
import { PdfDataParser } from "pdf-data-parser";
|
|
239
|
+
|
|
240
|
+
async function extractTables(pdfPath: string) {
|
|
241
|
+
const parser = new PdfDataParser({
|
|
242
|
+
url: pdfPath,
|
|
243
|
+
// Options
|
|
244
|
+
heading: "Table Title", // Filter to specific table
|
|
245
|
+
cells: 3, // Minimum cells per row
|
|
246
|
+
headers: ["Name", "Amount"], // Expected headers
|
|
247
|
+
repeating: false, // Handle repeating headers
|
|
248
|
+
});
|
|
249
|
+
|
|
250
|
+
const rows = await parser.parse();
|
|
251
|
+
return rows; // Array of arrays
|
|
252
|
+
}
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
### Stream Large PDFs
|
|
256
|
+
|
|
257
|
+
```typescript
|
|
258
|
+
import { PdfDataReader } from "pdf-data-parser";
|
|
259
|
+
import { createWriteStream } from "fs";
|
|
260
|
+
|
|
261
|
+
async function streamToCSV(pdfPath: string, outputPath: string) {
|
|
262
|
+
const reader = new PdfDataReader({
|
|
263
|
+
url: pdfPath,
|
|
264
|
+
cells: 2,
|
|
265
|
+
});
|
|
266
|
+
|
|
267
|
+
const output = createWriteStream(outputPath);
|
|
268
|
+
|
|
269
|
+
reader.on("data", (row: string[]) => {
|
|
270
|
+
output.write(row.join(",") + "\n");
|
|
271
|
+
});
|
|
272
|
+
|
|
273
|
+
reader.on("end", () => {
|
|
274
|
+
output.end();
|
|
275
|
+
console.log("CSV created");
|
|
276
|
+
});
|
|
277
|
+
}
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## Best Practices
|
|
283
|
+
|
|
284
|
+
### 1. Error Handling
|
|
285
|
+
|
|
286
|
+
```typescript
|
|
287
|
+
async function safeExtract(filePath: string) {
|
|
288
|
+
try {
|
|
289
|
+
const buffer = await readFile(filePath);
|
|
290
|
+
|
|
291
|
+
// Validate PDF header
|
|
292
|
+
const header = buffer.slice(0, 5).toString();
|
|
293
|
+
if (header !== "%PDF-") {
|
|
294
|
+
throw new Error("Invalid PDF file");
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
const parser = new PDFParse();
const result = await parser.parse(buffer);
|
|
298
|
+
return result;
|
|
299
|
+
} catch (error) {
if (!(error instanceof Error)) throw error;
|
|
300
|
+
if (error.message.includes("password")) {
|
|
301
|
+
throw new Error("PDF is password protected");
|
|
302
|
+
}
|
|
303
|
+
if (error.message.includes("damaged")) {
|
|
304
|
+
throw new Error("PDF is corrupted");
|
|
305
|
+
}
|
|
306
|
+
throw error;
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### 2. Memory Management (Large Files)
|
|
312
|
+
|
|
313
|
+
```typescript
|
|
314
|
+
// For large PDFs, process page by page
|
|
315
|
+
async function extractLargePDF(pdfPath: string) {
|
|
316
|
+
const data = await readFile(pdfPath);
|
|
317
|
+
const pdfDocument = await pdfjsLib.getDocument({ data: new Uint8Array(data) }).promise;
|
|
318
|
+
|
|
319
|
+
// Don't load all pages at once
|
|
320
|
+
for (let i = 1; i <= pdfDocument.numPages; i++) {
|
|
321
|
+
const page = await pdfDocument.getPage(i);
|
|
322
|
+
const text = await page.getTextContent();
|
|
323
|
+
|
|
324
|
+
// Process immediately, don't accumulate
|
|
325
|
+
await processPageText(text);
|
|
326
|
+
|
|
327
|
+
// Clean up
|
|
328
|
+
page.cleanup();
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
### 3. Text Cleaning
|
|
334
|
+
|
|
335
|
+
```typescript
|
|
336
|
+
function cleanExtractedText(text: string): string {
|
|
337
|
+
return text
|
|
338
|
+
.replace(/\s+/g, " ") // Normalize whitespace
|
|
339
|
+
.replace(/[^\x20-\x7E\n]/g, "") // Remove non-printable chars
|
|
340
|
+
.trim();
|
|
341
|
+
}
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
### 4. Performance Tips
|
|
345
|
+
|
|
346
|
+
```typescript
|
|
347
|
+
// Parallel extraction for multiple files
|
|
348
|
+
async function extractMultiple(files: string[]) {
|
|
349
|
+
const results = await Promise.all(
|
|
350
|
+
files.map((file) => extractText(file).catch((err) => ({ file, error: err }))),
|
|
351
|
+
);
|
|
352
|
+
return results;
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
// Use streams for very large files
|
|
356
|
+
import { createReadStream } from "fs";
|
|
357
|
+
import { PdfDataReader } from "pdf-data-parser";
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
---
|
|
361
|
+
|
|
362
|
+
## Common Issues & Solutions
|
|
363
|
+
|
|
364
|
+
| Issue | Cause | Solution |
|
|
365
|
+
| -------------------- | --------------------- | ------------------------------------- |
|
|
366
|
+
| Text appears garbled | Encoding issue | Use pdfjs-dist with explicit encoding |
|
|
367
|
+
| Missing text | Scanned image PDF | Use OCR (Tesseract) before extraction |
|
|
368
|
+
| Out of memory | Large PDF | Stream processing, page-by-page |
|
|
369
|
+
| Password error | Encrypted PDF | Use `pdf-lib` to decrypt first |
|
|
370
|
+
| Missing coordinates | Wrong library | Use pdfjs-dist for positioning |
|
|
371
|
+
| Table structure lost | Plain text extraction | Use pdf-data-parser |
|
|
372
|
+
| Font warnings | Missing fonts | Set `standardFontDataUrl` option |
|
|
373
|
+
|
|
374
|
+
---
|
|
375
|
+
|
|
376
|
+
## Complete Example: Document Processor
|
|
377
|
+
|
|
378
|
+
```typescript
|
|
379
|
+
import { PDFParse } from "pdf-parse";
|
|
380
|
+
import { readFile } from "fs/promises";
|
|
381
|
+
|
|
382
|
+
interface DocumentResult {
|
|
383
|
+
text: string;
|
|
384
|
+
metadata: {
|
|
385
|
+
title?: string;
|
|
386
|
+
author?: string;
|
|
387
|
+
pages: number;
|
|
388
|
+
creationDate?: Date;
|
|
389
|
+
};
|
|
390
|
+
summary: string;
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
async function processDocument(filePath: string): Promise<DocumentResult> {
|
|
394
|
+
const parser = new PDFParse();
|
|
395
|
+
const buffer = await readFile(filePath);
|
|
396
|
+
|
|
397
|
+
const result = await parser.parse(buffer);
|
|
398
|
+
|
|
399
|
+
// Generate summary (first 500 chars)
|
|
400
|
+
const summary = result.text.replace(/\s+/g, " ").slice(0, 500).trim() + "...";
|
|
401
|
+
|
|
402
|
+
return {
|
|
403
|
+
text: result.text,
|
|
404
|
+
metadata: {
|
|
405
|
+
title: result.info?.Title,
|
|
406
|
+
author: result.info?.Author,
|
|
407
|
+
pages: result.numpages,
|
|
408
|
+
creationDate: result.info?.CreationDate ? new Date(result.info.CreationDate) : undefined,
|
|
409
|
+
},
|
|
410
|
+
summary,
|
|
411
|
+
};
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
// Usage
|
|
415
|
+
const doc = await processDocument("./report.pdf");
|
|
416
|
+
console.log(`Document: ${doc.metadata.title}`);
|
|
417
|
+
console.log(`Pages: ${doc.metadata.pages}`);
|
|
418
|
+
console.log(`Summary: ${doc.summary}`);
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
---
|
|
422
|
+
|
|
423
|
+
## References
|
|
424
|
+
|
|
425
|
+
- [pdf-parse npm](https://www.npmjs.com/package/pdf-parse)
|
|
426
|
+
- [pdfjs-dist docs](https://mozilla.github.io/pdf.js/)
|
|
427
|
+
- [pdf-data-parser GitHub](https://github.com/drewletcher/pdf-data-parser)
|
|
428
|
+
- [pdf-lib GitHub](https://github.com/Hopding/pdf-lib)
|