opencodekit 0.16.18 → 0.16.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,428 @@
1
+ ---
2
+ name: pdf-extract
3
+ description: Extract text, images, tables, and metadata from PDF files. Choose the right library based on extraction needs - text only, structured data, or complex layouts.
4
+ ---
5
+
6
+ # PDF Content Extraction
7
+
8
+ Extract content from PDF files using the best library for your specific use case.
9
+
10
+ ## Quick Decision Guide
11
+
12
+ | Use Case | Recommended Library | Why |
13
+ | --------------------------- | ------------------- | ---------------------------------- |
14
+ | Simple text extraction | `pdf-parse` (v2+) | Fast, lightweight, pure TypeScript |
15
+ | Complex layouts/coordinates | `pdfjs-dist` | Full control, precise positioning |
16
+ | Tables/tabular data | `pdf-data-parser` | Built for grid-based content |
17
+ | Forms (AcroForm) | `pdf-lib` | Form field extraction (XFA forms are not supported) |
18
+ | Browser + Node.js | `pdf-parse` v2 | Cross-platform, works everywhere |
19
+
20
+ ---
21
+
22
+ ## Library 1: pdf-parse (Recommended for Text)
23
+
24
+ **Best for:** Simple text extraction, metadata, fast processing
25
+
26
+ ### Installation
27
+
28
+ ```bash
29
+ npm install pdf-parse
30
+ ```
31
+
32
+ ### Basic Text Extraction
33
+
34
+ ```typescript
35
+ import { PDFParse } from "pdf-parse";
36
+ import { readFile } from "fs/promises";
37
+
38
/**
 * Reads a PDF from disk and returns all of its text as a single string.
 *
 * pdf-parse v2 receives the document through the constructor (`data` for in-memory bytes)
 * and exposes extraction through `getText()`; there is no `parse()` method on the class.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns The extracted text of the whole document.
 */
async function extractText(filePath: string): Promise<string> {
  const buffer = await readFile(filePath);
  const parser = new PDFParse({ data: buffer });

  try {
    const result = await parser.getText();
    return result.text;
  } finally {
    // Always release the parser's resources, even when extraction throws.
    await parser.destroy();
  }
}

// Usage
const text = await extractText("./document.pdf");
console.log(text);
49
+ ```
50
+
51
+ ### Extract with Metadata
52
+
53
+ ```typescript
54
+ import { PDFParse } from "pdf-parse";
55
+
56
/**
 * Extracts the text of a PDF together with its document-level metadata.
 *
 * Text and metadata come from two separate pdf-parse v2 calls (`getText()` and `getInfo()`)
 * on the same parser instance.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns An object with `text`, `info`, `numpages`, and `version`.
 */
async function extractWithMetadata(filePath: string) {
  const buffer = await readFile(filePath);
  const parser = new PDFParse({ data: buffer });

  try {
    const textResult = await parser.getText();
    const infoResult = await parser.getInfo();

    return {
      text: textResult.text,
      info: infoResult.info, // Document metadata
      numpages: infoResult.total,
      // NOTE(review): assumes the info dictionary carries `PDFFormatVersion` — confirm
      // this is the value callers expect under `version`.
      version: infoResult.info?.PDFFormatVersion,
    };
  } finally {
    // Always release the parser's resources, even when extraction throws.
    await parser.destroy();
  }
}
69
+ ```
70
+
71
+ ### Extract Specific Pages
72
+
73
+ ```typescript
74
+ import { PDFParse } from "pdf-parse";
75
+
76
/**
 * Extracts the text of one page of a PDF.
 *
 * Uses the `partial` option of pdf-parse v2 `getText()` to restrict extraction to the
 * requested page instead of parsing the whole document.
 *
 * @param filePath - Path of the PDF file to read.
 * @param pageNum - Number of the page to extract (presumably 1-based — confirm against
 *   the pdf-parse documentation).
 * @returns The text of the requested page.
 */
async function extractPage(filePath: string, pageNum: number) {
  const buffer = await readFile(filePath);
  const parser = new PDFParse({ data: buffer });

  try {
    const result = await parser.getText({ partial: [pageNum] });
    return result.text;
  } finally {
    // Always release the parser's resources, even when extraction throws.
    await parser.destroy();
  }
}
87
+ ```
88
+
89
+ ### Check Remote PDF Headers (before downloading the full file)
90
+
91
+ ```typescript
92
+ import { getHeader } from "pdf-parse/node";
93
+
94
/**
 * Looks up header information for a remote PDF, logs the reported size, and warns when
 * the size is above 10 MiB.
 *
 * @param url - Location of the PDF to inspect.
 */
async function checkPDFHeaders(url: string) {
  const largePdfBytes = 10 * 1024 * 1024;

  // Check file size and headers before downloading
  const headers = await getHeader(url, true);
  const reportedSize = headers.size;
  console.log(`File size: ${reportedSize} bytes`);

  const isLarge = reportedSize > largePdfBytes;
  if (isLarge) {
    console.warn("Large PDF - consider streaming");
  }
}
103
+ ```
104
+
105
+ ---
106
+
107
+ ## Library 2: pdfjs-dist (Mozilla PDF.js)
108
+
109
+ **Best for:** Complex layouts, coordinates, images, page-by-page control
110
+
111
+ ### Installation
112
+
113
+ ```bash
114
+ npm install pdfjs-dist
115
+ ```
116
+
117
+ ### Basic Text Extraction with Coordinates
118
+
119
+ ```typescript
120
+ import * as pdfjsLib from "pdfjs-dist/legacy/build/pdf.mjs";
121
+ import { readFile } from "fs/promises";
122
+ import path from "path";
123
+
124
/**
 * Loads a PDF from disk and collects every text item of every page with its position.
 *
 * @param pdfPath - Path of the PDF file to read.
 * @returns One entry per page (`page` counts from 1). Each item holds the string, the
 *   transform components at index 4 and 5 as `x` / `y`, the font name, and width/height.
 */
async function extractWithCoordinates(pdfPath: string) {
  const bytes = new Uint8Array(await readFile(pdfPath));

  const loadingTask = pdfjsLib.getDocument({
    data: bytes,
    standardFontDataUrl: path.join(process.cwd(), "node_modules/pdfjs-dist/standard_fonts/"),
  });
  const pdfDocument = await loadingTask.promise;

  const pageCount = pdfDocument.numPages;
  const results = [];

  let pageNum = 1;
  while (pageNum <= pageCount) {
    const page = await pdfDocument.getPage(pageNum);
    const { items } = await page.getTextContent();

    const positioned = items.map(({ str, transform, fontName, width, height }: any) => ({
      text: str,
      x: transform[4],
      y: transform[5],
      font: fontName,
      width,
      height,
    }));

    results.push({ page: pageNum, items: positioned });
    pageNum += 1;
  }

  return results;
}
157
+ ```
158
+
159
+ ### Extract Images from PDF
160
+
161
+ ```typescript
162
+ import * as pdfjsLib from "pdfjs-dist/legacy/build/pdf.mjs";
163
+
164
/**
 * Walks the operator list of every page and collects the image object referenced by each
 * `paintImageXObject` operation.
 *
 * @param pdfPath - Path of the PDF file to read.
 * @returns One record per image with its page number, object name, dimensions, and data.
 */
async function extractImages(pdfPath: string) {
  const bytes = new Uint8Array(await readFile(pdfPath));
  const pdfDocument = await pdfjsLib.getDocument({ data: bytes }).promise;

  const images = [];

  for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) {
    const page = await pdfDocument.getPage(pageNum);
    const operatorList = await page.getOperatorList();
    const { fnArray, argsArray } = operatorList;

    for (let opIndex = 0; opIndex < fnArray.length; opIndex++) {
      // Skip every operator that is not an image paint.
      if (fnArray[opIndex] !== pdfjsLib.OPS.paintImageXObject) {
        continue;
      }

      const imageName = argsArray[opIndex][0];
      const image = await page.objs.get(imageName);
      const { width, height, data } = image;

      images.push({
        page: pageNum,
        name: imageName,
        width,
        height,
        data, // Raw image data
      });
    }
  }

  return images;
}
192
+ ```
193
+
194
+ ### Render Page to Image
195
+
196
+ ```typescript
197
+ import * as pdfjsLib from "pdfjs-dist/legacy/build/pdf.mjs";
198
+ import { createCanvas } from "canvas";
199
+ import { writeFile } from "fs/promises";
200
+
201
/**
 * Renders one page of a PDF onto a canvas at scale 2.0 and writes it out as a PNG file.
 *
 * @param pdfPath - Path of the PDF file to read.
 * @param pageNum - Page number handed to `getPage`.
 * @param outputPath - Destination path of the PNG file.
 */
async function renderPageToImage(pdfPath: string, pageNum: number, outputPath: string) {
  const bytes = new Uint8Array(await readFile(pdfPath));
  const loadingTask = pdfjsLib.getDocument({ data: bytes });
  const pdfDocument = await loadingTask.promise;
  const page = await pdfDocument.getPage(pageNum);

  // Higher scale = better quality
  const viewport = page.getViewport({ scale: 2.0 });

  const canvas = createCanvas(viewport.width, viewport.height);
  const context = canvas.getContext("2d");

  const renderTask = page.render({ canvasContext: context, viewport });
  await renderTask.promise;

  const png = canvas.toBuffer("image/png");
  await writeFile(outputPath, png);

  console.log(`Page ${pageNum} saved to ${outputPath}`);
}
221
+ ```
222
+
223
+ ---
224
+
225
+ ## Library 3: pdf-data-parser (Tables)
226
+
227
+ **Best for:** Tabular data, structured grid content
228
+
229
+ ### Installation
230
+
231
+ ```bash
232
+ npm install pdf-data-parser
233
+ ```
234
+
235
+ ### Extract Tables
236
+
237
+ ```typescript
238
+ import { PdfDataParser } from "pdf-data-parser";
239
+
240
/**
 * Parses tabular rows out of a PDF with `PdfDataParser`.
 *
 * @param pdfPath - Location of the PDF, passed to the parser as `url`.
 * @returns The rows produced by `parser.parse()` (array of arrays).
 */
async function extractTables(pdfPath: string) {
  // Options
  const options = {
    url: pdfPath,
    heading: "Table Title", // Filter to specific table
    cells: 3, // Minimum cells per row
    headers: ["Name", "Amount"], // Expected headers
    repeating: false, // Handle repeating headers
  };

  const parser = new PdfDataParser(options);
  return await parser.parse(); // Array of arrays
}
253
+ ```
254
+
255
+ ### Stream Large PDFs
256
+
257
+ ```typescript
258
+ import { PdfDataReader } from "pdf-data-parser";
259
+ import { createWriteStream } from "fs";
260
+
261
/**
 * Streams the rows of a PDF table into a CSV file.
 *
 * The returned promise settles only after the output file has been fully flushed, or
 * rejects on the first error from either stream. Cells containing a comma, double quote,
 * or line break are quoted so the row structure survives.
 *
 * @param pdfPath - Location of the PDF, passed to the reader as `url`.
 * @param outputPath - Destination path of the CSV file.
 * @returns A promise that resolves once the CSV has been written.
 */
async function streamToCSV(pdfPath: string, outputPath: string) {
  const reader = new PdfDataReader({
    url: pdfPath,
    cells: 2,
  });

  const output = createWriteStream(outputPath);

  // Quote a cell when it contains a delimiter, quote, or line break; double embedded quotes.
  const toCsvField = (cell: unknown): string => {
    const value = cell == null ? "" : String(cell);
    return /[",\r\n]/.test(value) ? `"${value.replace(/"/g, '""')}"` : value;
  };

  await new Promise<void>((resolve, reject) => {
    const fail = (err: unknown) => {
      // Stop writing so a partial file is not left with an open handle.
      output.destroy();
      reject(err);
    };

    // NOTE(review): assumes PdfDataReader emits "error" like a Node stream — confirm.
    reader.on("error", fail);
    output.on("error", fail);
    // "finish" fires after end() has flushed every buffered row to disk.
    output.on("finish", () => resolve());

    reader.on("data", (row: string[]) => {
      output.write(row.map(toCsvField).join(",") + "\n");
    });

    reader.on("end", () => {
      output.end();
    });
  });

  console.log("CSV created");
}
278
+ ```
279
+
280
+ ---
281
+
282
+ ## Best Practices
283
+
284
+ ### 1. Error Handling
285
+
286
+ ```typescript
287
/**
 * Extracts text from a PDF and turns the most common failures into clearer errors.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns The result of the parser's `getText()` call.
 * @throws Error "Invalid PDF file" when the file does not start with `%PDF-`.
 * @throws Error "PDF is password protected" / "PDF is corrupted" when the underlying
 *   error message mentions "password" / "damaged"; any other error is rethrown unchanged.
 */
async function safeExtract(filePath: string) {
  try {
    const buffer = await readFile(filePath);

    // Validate PDF header; latin1 decodes the first five bytes one-to-one.
    const header = buffer.subarray(0, 5).toString("latin1");
    if (header !== "%PDF-") {
      throw new Error("Invalid PDF file");
    }

    // Create the parser here so this sample does not depend on an outer `parser` variable.
    const parser = new PDFParse({ data: buffer });
    try {
      return await parser.getText();
    } finally {
      // Always release the parser's resources, even when extraction throws.
      await parser.destroy();
    }
  } catch (error) {
    // Catch variables are `unknown` under strict mode; narrow before reading `.message`.
    const message = (error instanceof Error ? error.message : String(error)).toLowerCase();

    if (message.includes("password")) {
      throw new Error("PDF is password protected");
    }
    if (message.includes("damaged")) {
      throw new Error("PDF is corrupted");
    }
    throw error;
  }
}
309
+ ```
310
+
311
+ ### 2. Memory Management (Large Files)
312
+
313
+ ```typescript
314
+ // For large PDFs, process page by page
315
/**
 * Processes a PDF one page at a time: each page's text content is handed to
 * `processPageText` and the page is cleaned up before the next one is fetched.
 *
 * @param pdfPath - Path of the PDF file to read.
 */
async function extractLargePDF(pdfPath: string) {
  const bytes = new Uint8Array(await readFile(pdfPath));
  const pdfDocument = await pdfjsLib.getDocument({ data: bytes }).promise;

  // Don't load all pages at once
  let pageNumber = 1;
  while (pageNumber <= pdfDocument.numPages) {
    const page = await pdfDocument.getPage(pageNumber);
    const textContent = await page.getTextContent();

    // Process immediately, don't accumulate
    await processPageText(textContent);

    // Clean up
    page.cleanup();
    pageNumber += 1;
  }
}
331
+ ```
332
+
333
+ ### 3. Text Cleaning
334
+
335
+ ```typescript
336
/**
 * Normalizes text extracted from a PDF.
 *
 * Control characters are removed and every run of whitespace (including line breaks)
 * becomes a single space. Printable non-ASCII text such as accented letters, CJK, and
 * currency symbols is preserved.
 *
 * @param text - Raw extracted text.
 * @returns The cleaned, trimmed text.
 */
function cleanExtractedText(text: string): string {
  return (
    text
      // Strip C0/C1 control characters first, leaving tab, newline, and other whitespace
      // controls (\u0009-\u000D) for the whitespace pass so no double spaces remain.
      .replace(/[\u0000-\u0008\u000E-\u001F\u007F-\u009F]/g, "")
      .replace(/\s+/g, " ") // Normalize whitespace
      .trim()
  );
}
342
+ ```
343
+
344
+ ### 4. Performance Tips
345
+
346
+ ```typescript
347
+ // Parallel extraction for multiple files
348
/**
 * Extracts text from several files concurrently.
 *
 * A failure for one file does not reject the whole batch: that slot holds
 * `{ file, error }` instead of the extracted text.
 *
 * @param files - Paths of the PDF files to process.
 * @returns One entry per input file, in input order.
 */
async function extractMultiple(files: string[]) {
  const pending = files.map(async (file) => {
    try {
      return await extractText(file);
    } catch (err) {
      return { file, error: err };
    }
  });

  return await Promise.all(pending);
}
354
+
355
+ // Use streams for very large files
356
+ import { createReadStream } from "fs";
357
+ import { PdfDataReader } from "pdf-data-parser";
358
+ ```
359
+
360
+ ---
361
+
362
+ ## Common Issues & Solutions
363
+
364
+ | Issue | Cause | Solution |
365
+ | -------------------- | --------------------- | ------------------------------------- |
366
+ | Text appears garbled | Encoding issue | Use pdfjs-dist with `cMapUrl` / `cMapPacked` set so character maps can be loaded |
367
+ | Missing text | Scanned image PDF | Use OCR (Tesseract) before extraction |
368
+ | Out of memory | Large PDF | Stream processing, page-by-page |
369
+ | Password error | Encrypted PDF | Pass the `password` option to pdfjs-dist `getDocument()` |
370
+ | Missing coordinates | Wrong library | Use pdfjs-dist for positioning |
371
+ | Table structure lost | Plain text extraction | Use pdf-data-parser |
372
+ | Font warnings | Missing fonts | Set `standardFontDataUrl` option |
373
+
374
+ ---
375
+
376
+ ## Complete Example: Document Processor
377
+
378
+ ```typescript
379
+ import { PDFParse } from "pdf-parse";
380
+ import { readFile } from "fs/promises";
381
+
382
/** Text, key metadata, and a short summary extracted from one PDF. */
interface DocumentResult {
  text: string;
  metadata: {
    title?: string;
    author?: string;
    pages: number;
    creationDate?: Date;
  };
  summary: string;
}

/** Maximum number of characters kept in `DocumentResult.summary` before the ellipsis. */
const SUMMARY_LENGTH = 500;

/**
 * Converts a PDF date string (`D:YYYYMMDDHHmmSS+HH'mm'`, trailing fields optional) to a Date.
 * `new Date()` cannot parse this format. Values without a timezone are treated as UTC.
 * Returns `undefined` for anything that cannot be interpreted as a date.
 */
function parsePdfDate(raw: unknown): Date | undefined {
  if (raw instanceof Date) {
    return Number.isNaN(raw.getTime()) ? undefined : raw;
  }
  if (typeof raw !== "string") {
    return undefined;
  }

  const match =
    /^D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?(?:([Zz+-])(?:(\d{2})'?(\d{2})?'?)?)?/.exec(
      raw.trim(),
    );
  if (!match) {
    // Not in PDF date syntax; fall back to the built-in parser (e.g. ISO 8601 strings).
    const fallback = new Date(raw);
    return Number.isNaN(fallback.getTime()) ? undefined : fallback;
  }

  const [, year, month = "01", day = "01", hour = "00", minute = "00", second = "00"] = match;
  const [sign, offsetHours = "00", offsetMinutes = "00"] = match.slice(7);

  const utcMillis = Date.UTC(
    Number(year),
    Number(month) - 1,
    Number(day),
    Number(hour),
    Number(minute),
    Number(second),
  );

  // "+HH'mm'" means local time is ahead of UTC, so subtract the offset to get UTC.
  const direction = sign === "+" ? 1 : sign === "-" ? -1 : 0;
  const offsetMillis = direction * (Number(offsetHours) * 60 + Number(offsetMinutes)) * 60000;

  return new Date(utcMillis - offsetMillis);
}

/** Collapses whitespace and truncates to SUMMARY_LENGTH, adding "..." only when text was cut. */
function summarize(text: string): string {
  const normalized = text.replace(/\s+/g, " ").trim();
  if (normalized.length <= SUMMARY_LENGTH) {
    return normalized;
  }
  return normalized.slice(0, SUMMARY_LENGTH).trimEnd() + "...";
}

/**
 * Extracts text, metadata, and a short summary from a PDF file.
 *
 * Uses the pdf-parse v2 API: the document is passed to the constructor and text and
 * metadata are read with `getText()` and `getInfo()`.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns The document text, selected metadata, and a summary of the first 500 characters.
 */
async function processDocument(filePath: string): Promise<DocumentResult> {
  const buffer = await readFile(filePath);
  const parser = new PDFParse({ data: buffer });

  try {
    const textResult = await parser.getText();
    const infoResult = await parser.getInfo();
    const info = infoResult.info;

    return {
      text: textResult.text,
      metadata: {
        title: info?.Title,
        author: info?.Author,
        pages: infoResult.total,
        creationDate: parsePdfDate(info?.CreationDate),
      },
      summary: summarize(textResult.text),
    };
  } finally {
    // Always release the parser's resources, even when extraction throws.
    await parser.destroy();
  }
}

// Usage
const doc = await processDocument("./report.pdf");
console.log(`Document: ${doc.metadata.title}`);
console.log(`Pages: ${doc.metadata.pages}`);
console.log(`Summary: ${doc.summary}`);
419
+ ```
420
+
421
+ ---
422
+
423
+ ## References
424
+
425
+ - [pdf-parse npm](https://www.npmjs.com/package/pdf-parse)
426
+ - [pdfjs-dist docs](https://mozilla.github.io/pdf.js/)
427
+ - [pdf-data-parser GitHub](https://github.com/drewletcher/pdf-data-parser)
428
+ - [pdf-lib GitHub](https://github.com/Hopding/pdf-lib)