afpp 2.0.0-beta.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -8,57 +8,101 @@
8
8
  ![Repo Size](https://img.shields.io/github/repo-size/l2ysho/afpp)
9
9
  ![Last Commit](https://img.shields.io/github/last-commit/l2ysho/afpp.svg)
10
10
 
11
- Another f\*cking pdf parser. (alpha)
11
+ Another f\*cking PDF parser. Because parsing PDFs in Node.js should be easy. Live long and parse PDFs. 🖖
12
12
 
13
13
  ## Why?
14
14
 
15
- If you are parsing pdf files in nodejs and you are satisfied with your actual solution, good for you, you don't need this.
15
+ There are plenty of PDF-related packages for Node.js. They work… until they don’t.
16
+
17
+ Afpp was built to solve the headaches I ran into while trying to parse PDFs in Node.js:
18
+
19
+ - 📦 Do I need a package with 30+ MB just to read a PDF?
20
+ - 🧵 Why is the event loop blocked?
21
+ - 🐏 Is that a memory leak I smell?
22
+ - 🐌 Should reading a PDF really be this performance-heavy?
23
+ - 🐞 Why is everything so buggy?
24
+ - 🎨 Why does it complain about the lack of a canvas in Node.js?
25
+ - 🧱 Why does canvas require native C++/Python dependencies to build?
26
+ - 🪟 Why does it complain about the missing window object?
27
+ - 🪄 Why do I need ImageMagick for this?!
28
+ - 👻 What the hell is Ghostscript, and why does it keep failing?
29
+ - ❌ Where’s the TypeScript support?
30
+ - 🧓 Why are the dependencies older than my dev career?
31
+ - 🔐 Why does everything work… until I try an encrypted PDF?
32
+ - 🕯️ Why does every OS need its own special setup ritual?
16
33
 
17
- But if you’ve encountered one or more of these issues:
34
+ ## Prerequisites
18
35
 
19
- - package size (+30mb)
20
- - blocking event loop
21
- - performance issues
22
- - buggy as shit
23
- - not working in esm/commonjs
24
- - old pdfjs-dist as peer dependency
25
- - no typescript support
26
- - parsing of encrypted pdf files (password needed)
36
+ - Node.js >= v22.14.0
27
37
 
28
- then you might find this package useful.
38
+ ## 📦 Installation
29
39
 
30
- ## Prerequisites
40
+ You can install `afpp` via npm, Yarn, or pnpm.
41
+
42
+ ### npm
43
+
44
+ ```bash
45
+ npm install afpp
46
+ ```
31
47
 
32
- - Node.js v22.14.0
48
+ ### Yarn
49
+
50
+ ```bash
51
+ yarn add afpp
52
+ ```
53
+
54
+ ### pnpm
55
+
56
+ ```bash
57
+ pnpm add afpp
58
+ ```
33
59
 
34
60
  ## Getting started
35
61
 
36
- `npm install afpp`
62
+ The `afpp` library makes it simple to extract text or images from PDF files in Node.js. Whether your PDF is stored locally, hosted online, or encrypted, `afpp` provides an easy-to-use API to handle it all. All functions have common parameters and accepts string path, buffer, or URL object.
37
63
 
38
- **commonjs**:
64
+ ### Get text from path
39
65
 
40
- ```js
41
- const { pdf2string } = require('afpp');
42
- const path = require('node:path');
66
+ ```ts
67
+ import { readFile } from 'fs/promises';
68
+ import path from 'path';
43
69
 
44
- const pathToFile = path.join('example.pdf');
70
+ import { pdf2string } from 'afpp';
71
+
72
+ (async function main() {
73
+ const pathToFile = path.join('..', 'test', 'example.pdf');
74
+ const input = await readFile(pathToFile);
75
+ const data = await pdf2string(input);
45
76
 
46
- (async function start() {
47
- const pdfString = await pdf2string(pathToFile);
48
- console.log(pdfString);
77
+ console.log('Extracted text:', data); // ['page 1 content', 'page 2 content', ...]
49
78
  })();
50
79
  ```
51
80
 
52
- **esm**:
81
+ ### Get image from URL
53
82
 
54
- ```js
55
- import { pdf2string } from 'afpp';
56
- import path from 'node:path';
83
+ ```ts
84
+ import { pdf2image } from 'afpp';
85
+
86
+ (async function main() {
87
+ const url = new URL('https://pdfobject.com/pdf/sample.pdf');
88
+ const arrayOfImages = await pdf2image(url);
89
+
90
+ console.log(arrayOfImages); // [imageBuffer, imageBuffer, ...]
91
+ })();
92
+ ```
93
+
94
+ ### Parse pdf buffer
95
+
96
+ ```ts
97
+ import { parsePdf } from 'afpp';
57
98
 
58
- const pathToFile = path.join('example.pdf');
99
+ (async function main() {
100
+ // Download PDF from URL
101
+ const response = await fetch('https://pdfobject.com/pdf/sample.pdf');
102
+ const buffer = Buffer.from(await response.arrayBuffer());
59
103
 
60
- (async function start() {
61
- const pdfString = await pdf2string(pathToFile);
62
- console.log(pdfString);
104
+ // Parse the PDF buffer
105
+ const result = await parsePdf(buffer, {}, (content) => content);
106
+ console.log('Parsed PDF:', result);
63
107
  })();
64
108
  ```
package/dist/core.d.ts ADDED
@@ -0,0 +1,38 @@
1
+ export declare enum PROCESSING_TYPE {
2
+ IMAGE = "IMAGE",
3
+ MIXED = "MIXED",
4
+ TEXT = "TEXT"
5
+ }
6
+ import { Canvas, CanvasRenderingContext2D } from '@napi-rs/canvas';
7
+ export interface AfppParseOptions {
8
+ /**
9
+ * Concurrency level for page processing.
10
+ */
11
+ concurrency?: number;
12
+ /**
13
+ * Image encoding format when rendering non-text pages. Defaults to 'png'.
14
+ */
15
+ imageEncoding?: ImageEncoding;
16
+ /**
17
+ * Password for encrypted pdf files.
18
+ */
19
+ password?: string;
20
+ /**
21
+ * Scale of a page if content is not text.
22
+ */
23
+ scale?: number;
24
+ }
25
+ export interface CanvasAndContext {
26
+ canvas: Canvas;
27
+ context: CanvasRenderingContext2D;
28
+ }
29
+ export type ImageEncoding = 'avif' | 'jpeg' | 'png' | 'webp';
30
+ export type PageProcessor<T> = (content: Buffer | string, pageNumber: number, pageCount: number) => Promise<T> | T;
31
+ export interface PdfCanvasFactory {
32
+ create(width: number, height: number): CanvasAndContext;
33
+ destroy(canvasAndContext: CanvasAndContext): void;
34
+ reset(canvasAndContext: CanvasAndContext, width: number, height: number): void;
35
+ }
36
+ export declare function parsePdfFile(type: PROCESSING_TYPE.IMAGE, input: Buffer | string | Uint8Array | URL, options?: AfppParseOptions, callback?: undefined): Promise<Buffer[]>;
37
+ export declare function parsePdfFile(type: PROCESSING_TYPE.TEXT, input: Buffer | string | Uint8Array | URL, options?: AfppParseOptions, callback?: undefined): Promise<string[]>;
38
+ export declare function parsePdfFile<T>(type: PROCESSING_TYPE.MIXED, input: Buffer | string | Uint8Array | URL, options: AfppParseOptions, callback: PageProcessor<T>): Promise<T[]>;
package/dist/core.js ADDED
@@ -0,0 +1,135 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.PROCESSING_TYPE = void 0;
7
+ exports.parsePdfFile = parsePdfFile;
8
+ var PROCESSING_TYPE;
9
+ (function (PROCESSING_TYPE) {
10
+ PROCESSING_TYPE["IMAGE"] = "IMAGE";
11
+ PROCESSING_TYPE["MIXED"] = "MIXED";
12
+ PROCESSING_TYPE["TEXT"] = "TEXT";
13
+ })(PROCESSING_TYPE || (exports.PROCESSING_TYPE = PROCESSING_TYPE = {}));
14
+ const promises_1 = require("node:fs/promises");
15
+ const p_limit_1 = __importDefault(require("p-limit"));
16
+ const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs");
17
+ const processPdfPageTypeMixed = async (page, canvasFactory, pageNumber, pageCount, scale, encoding, callback) => {
18
+ const textContent = await page.getTextContent({
19
+ includeMarkedContent: false,
20
+ });
21
+ const items = textContent.items;
22
+ if (items.length === 0) {
23
+ const viewport = page.getViewport({ scale });
24
+ const canvasAndContext = canvasFactory.create(viewport.width, viewport.height);
25
+ await page.render({ canvasContext: canvasAndContext.context, viewport })
26
+ .promise;
27
+ //@ts-expect-error this should be fixed in release
28
+ const imageBuffer = await canvasAndContext.canvas.encode(encoding);
29
+ canvasFactory.destroy(canvasAndContext);
30
+ return callback(imageBuffer, pageNumber, pageCount);
31
+ }
32
+ const pageText = items.map((item) => item.str || '').join(' ');
33
+ return callback(pageText, pageNumber, pageCount);
34
+ };
35
+ const processPdfPageTypeText = async (page) => {
36
+ const textContent = await page.getTextContent({
37
+ includeMarkedContent: false,
38
+ });
39
+ const items = textContent.items;
40
+ if (items.length === 0) {
41
+ return '';
42
+ }
43
+ else {
44
+ return items.map((item) => item.str || '').join(' ');
45
+ }
46
+ };
47
+ const processPdfPageTypeImage = async (page, canvasFactory, pageNumber, pageCount, scale, encoding) => {
48
+ const viewport = page.getViewport({ scale });
49
+ const canvasAndContext = canvasFactory.create(viewport.width, viewport.height);
50
+ await page.render({ canvasContext: canvasAndContext.context, viewport })
51
+ .promise;
52
+ //@ts-expect-error this should be fixed in release
53
+ const imageBuffer = await canvasAndContext.canvas.encode(encoding);
54
+ canvasFactory.destroy(canvasAndContext);
55
+ return imageBuffer;
56
+ };
57
+ const validateParameters = async (input, options) => {
58
+ const documentInitParameters = {};
59
+ switch (true) {
60
+ case typeof input === 'string':
61
+ documentInitParameters.data = new Uint8Array(await (0, promises_1.readFile)(input));
62
+ break;
63
+ case Buffer.isBuffer(input):
64
+ documentInitParameters.data = new Uint8Array(input);
65
+ break;
66
+ case input instanceof Uint8Array:
67
+ documentInitParameters.data = input;
68
+ break;
69
+ case input instanceof URL:
70
+ documentInitParameters.url = input;
71
+ break;
72
+ default:
73
+ throw new Error(`Invalid source type: ${typeof input}`);
74
+ }
75
+ documentInitParameters.password = options?.password;
76
+ documentInitParameters.verbosity = pdf_mjs_1.VerbosityLevel.ERRORS;
77
+ const scale = options?.scale ?? 2.0;
78
+ const concurrency = options?.concurrency ?? 1;
79
+ const encoding = options?.imageEncoding ?? 'png';
80
+ if (!['avif', 'jpeg', 'png', 'webp'].includes(encoding)) {
81
+ throw new Error(`Unsupported image encoding format: '${encoding}'`);
82
+ }
83
+ return { concurrency, documentInitParameters, encoding, scale };
84
+ };
85
+ async function parsePdfFile(type, input, options, callback) {
86
+ const { concurrency, documentInitParameters, encoding, scale } = await validateParameters(input, options);
87
+ const limit = (0, p_limit_1.default)(concurrency);
88
+ const loadingTask = (0, pdf_mjs_1.getDocument)(documentInitParameters);
89
+ const pdfDocument = await loadingTask.promise;
90
+ const { numPages } = pdfDocument;
91
+ if (type === PROCESSING_TYPE.MIXED) {
92
+ if (!callback || typeof callback !== 'function') {
93
+ throw new Error(`Invalid callback type: ${typeof callback}`);
94
+ }
95
+ const results = new Array(numPages);
96
+ const pageTasks = Array.from({ length: numPages }, (_, i) => {
97
+ const pageNum = i + 1;
98
+ return limit(async () => {
99
+ const page = await pdfDocument.getPage(pageNum);
100
+ const canvasFactory = pdfDocument.canvasFactory;
101
+ const result = await processPdfPageTypeMixed(page, canvasFactory, pageNum, numPages, scale, encoding, callback);
102
+ results[i] = result;
103
+ });
104
+ });
105
+ await Promise.all(pageTasks);
106
+ return results;
107
+ }
108
+ if (type === PROCESSING_TYPE.TEXT) {
109
+ const results = new Array(numPages);
110
+ const pageTasks = Array.from({ length: numPages }, (_, i) => {
111
+ const pageNum = i + 1;
112
+ return limit(async () => {
113
+ const page = await pdfDocument.getPage(pageNum);
114
+ results[i] = await processPdfPageTypeText(page);
115
+ });
116
+ });
117
+ await Promise.all(pageTasks);
118
+ return results;
119
+ }
120
+ if (type === PROCESSING_TYPE.IMAGE) {
121
+ const results = new Array(numPages);
122
+ const pageTasks = Array.from({ length: numPages }, (_, i) => {
123
+ const pageNum = i + 1;
124
+ return limit(async () => {
125
+ const page = await pdfDocument.getPage(pageNum);
126
+ const canvasFactory = pdfDocument.canvasFactory;
127
+ results[i] = await processPdfPageTypeImage(page, canvasFactory, pageNum, numPages, scale, encoding);
128
+ });
129
+ });
130
+ await Promise.all(pageTasks);
131
+ return results;
132
+ }
133
+ throw new Error('Invalid PROCESSING_TYPE');
134
+ }
135
+ //# sourceMappingURL=core.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"core.js","sourceRoot":"","sources":["../src/core.ts"],"names":[],"mappings":";;;;;;AA6LA,oCAgFC;AA7QD,IAAY,eAIX;AAJD,WAAY,eAAe;IACzB,kCAAe,CAAA;IACf,kCAAe,CAAA;IACf,gCAAa,CAAA;AACf,CAAC,EAJW,eAAe,+BAAf,eAAe,QAI1B;AAED,+CAA4C;AAG5C,sDAA6B;AAC7B,6DAA8E;AAmD9E,MAAM,uBAAuB,GAAG,KAAK,EACnC,IAAkB,EAClB,aAA+B,EAC/B,UAAkB,EAClB,SAAiB,EACjB,KAAa,EACb,QAAuB,EACvB,QAA0B,EACd,EAAE;IACd,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;QAC5C,oBAAoB,EAAE,KAAK;KAC5B,CAAC,CAAC;IACH,MAAM,KAAK,GAAG,WAAW,CAAC,KAAmB,CAAC;IAE9C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QAE7C,MAAM,gBAAgB,GAAG,aAAa,CAAC,MAAM,CAC3C,QAAQ,CAAC,KAAK,EACd,QAAQ,CAAC,MAAM,CAChB,CAAC;QAEF,MAAM,IAAI,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,gBAAgB,CAAC,OAAO,EAAE,QAAQ,EAAE,CAAC;aACrE,OAAO,CAAC;QACX,kDAAkD;QAClD,MAAM,WAAW,GAAG,MAAM,gBAAgB,CAAC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QACnE,aAAa,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC;QACxC,OAAO,QAAQ,CAAC,WAAW,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;IACtD,CAAC;IAED,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC/D,OAAO,QAAQ,CAAC,QAAQ,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;AACnD,CAAC,CAAC;AAEF,MAAM,sBAAsB,GAAG,KAAK,EAAE,IAAkB,EAAE,EAAE;IAC1D,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;QAC5C,oBAAoB,EAAE,KAAK;KAC5B,CAAC,CAAC;IACH,MAAM,KAAK,GAAG,WAAW,CAAC,KAAmB,CAAC;IAE9C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,EAAE,CAAC;IACZ,CAAC;SAAM,CAAC;QACN,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACvD,CAAC;AACH,CAAC,CAAC;AAEF,MAAM,uBAAuB,GAAG,KAAK,EACnC,IAAkB,EAClB,aAA+B,EAC/B,UAAkB,EAClB,SAAiB,EACjB,KAAa,EACb,QAAuB,EACvB,EAAE;IACF,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;IAE7C,MAAM,gBAAgB,GAAG,aAAa,CAAC,MAAM,CAC3C,QAAQ,CAAC,KAAK,EACd,QAAQ,CAAC,MAAM,CAChB,CAAC;IAEF,MAAM,IAAI,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,gBAAgB,CAAC,OAAO,EAAE,QAAQ,EAAE,CAAC;SACrE,OAAO,CAAC;IACX,kDAAkD;IAClD,MAAM,WA
AW,GAAG,MAAM,gBAAgB,CAAC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;IACnE,aAAa,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC;IACxC,OAAO,WAAW,CAAC;AACrB,CAAC,CAAC;AAEF,MAAM,kBAAkB,GAAG,KAAK,EAC9B,KAAyC,EACzC,OAA0B,EAC1B,EAAE;IACF,MAAM,sBAAsB,GAA2B,EAAE,CAAC;IAE1D,QAAQ,IAAI,EAAE,CAAC;QACb,KAAK,OAAO,KAAK,KAAK,QAAQ;YAC5B,sBAAsB,CAAC,IAAI,GAAG,IAAI,UAAU,CAAC,MAAM,IAAA,mBAAQ,EAAC,KAAK,CAAC,CAAC,CAAC;YACpE,MAAM;QACR,KAAK,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;YACzB,sBAAsB,CAAC,IAAI,GAAG,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC;YACpD,MAAM;QACR,KAAK,KAAK,YAAY,UAAU;YAC9B,sBAAsB,CAAC,IAAI,GAAG,KAAK,CAAC;YACpC,MAAM;QACR,KAAK,KAAK,YAAY,GAAG;YACvB,sBAAsB,CAAC,GAAG,GAAG,KAAK,CAAC;YACnC,MAAM;QACR;YACE,MAAM,IAAI,KAAK,CAAC,wBAAwB,OAAO,KAAK,EAAE,CAAC,CAAC;IAC5D,CAAC;IAED,sBAAsB,CAAC,QAAQ,GAAG,OAAO,EAAE,QAAQ,CAAC;IACpD,sBAAsB,CAAC,SAAS,GAAG,wBAAc,CAAC,MAAM,CAAC;IAEzD,MAAM,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,GAAG,CAAC;IACpC,MAAM,WAAW,GAAG,OAAO,EAAE,WAAW,IAAI,CAAC,CAAC;IAC9C,MAAM,QAAQ,GAAG,OAAO,EAAE,aAAa,IAAI,KAAK,CAAC;IAEjD,IAAI,CAAC,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;QACxD,MAAM,IAAI,KAAK,CAAC,uCAAuC,QAAQ,GAAG,CAAC,CAAC;IACtE,CAAC;IAED,OAAO,EAAE,WAAW,EAAE,sBAAsB,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC;AAClE,CAAC,CAAC;AAuBK,KAAK,UAAU,YAAY,CAChC,IAAqB,EACrB,KAAyC,EACzC,OAA0B,EAC1B,QAA2B;IAE3B,MAAM,EAAE,WAAW,EAAE,sBAAsB,EAAE,QAAQ,EAAE,KAAK,EAAE,GAC5D,MAAM,kBAAkB,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAE3C,MAAM,KAAK,GAAG,IAAA,iBAAM,EAAC,WAAW,CAAC,CAAC;IAClC,MAAM,WAAW,GAAG,IAAA,qBAAW,EAAC,sBAAsB,CAAC,CAAC;IACxD,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IAC9C,MAAM,EAAE,QAAQ,EAAE,GAAG,WAAW,CAAC;IAEjC,IAAI,IAAI,KAAK,eAAe,CAAC,KAAK,EAAE,CAAC;QACnC,IAAI,CAAC,QAAQ,IAAI,OAAO,QAAQ,KAAK,UAAU,EAAE,CAAC;YAChD,MAAM,IAAI,KAAK,CAAC,0BAA0B,OAAO,QAAQ,EAAE,CAAC,CAAC;QAC/D,CAAC;QACD,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,QAAQ,CAAC,CAAC;QAEzC,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAC1D,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC;YACtB,OAAO,KAAK,CAAC,KAAK,IAAI,EAAE;gBACtB,MAAM,IAAI,GAAG,MAAM,WAA
W,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;gBAChD,MAAM,aAAa,GAAG,WAAW,CAAC,aAAiC,CAAC;gBAEpE,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAC1C,IAAI,EACJ,aAAa,EACb,OAAO,EACP,QAAQ,EACR,KAAK,EACL,QAAQ,EACR,QAAQ,CACT,CAAC;gBACF,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC;YACtB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,MAAM,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QAC7B,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,IAAI,IAAI,KAAK,eAAe,CAAC,IAAI,EAAE,CAAC;QAClC,MAAM,OAAO,GAAa,IAAI,KAAK,CAAC,QAAQ,CAAC,CAAC;QAC9C,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAC1D,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC;YACtB,OAAO,KAAK,CAAC,KAAK,IAAI,EAAE;gBACtB,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;gBAChD,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,sBAAsB,CAAC,IAAI,CAAC,CAAC;YAClD,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,MAAM,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QAC7B,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,IAAI,IAAI,KAAK,eAAe,CAAC,KAAK,EAAE,CAAC;QACnC,MAAM,OAAO,GAAa,IAAI,KAAK,CAAC,QAAQ,CAAC,CAAC;QAC9C,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAC1D,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC;YACtB,OAAO,KAAK,CAAC,KAAK,IAAI,EAAE;gBACtB,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;gBAChD,MAAM,aAAa,GAAG,WAAW,CAAC,aAAiC,CAAC;gBACpE,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,uBAAuB,CACxC,IAAI,EACJ,aAAa,EACb,OAAO,EACP,QAAQ,EACR,KAAK,EACL,QAAQ,CACT,CAAC;YACJ,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,MAAM,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QAC7B,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;AAC7C,CAAC"}
@@ -1,29 +1,4 @@
1
- import { Canvas } from '@napi-rs/canvas';
2
- export interface CanvasFactory {
3
- createCanvas: (width: number, height: number) => Canvas;
4
- destroyCanvas: (canvas: Canvas) => void;
5
- resetCanvas: (canvas: Canvas, width: number, height: number) => void;
6
- }
7
- export type PageProcessor<T> = (content: Buffer | string, pageNumber: number, pageCount: number) => Promise<T> | T;
8
- type ImageEncoding = 'avif' | 'jpeg' | 'png' | 'webp';
9
- interface ParseOptions {
10
- /**
11
- * Concurrency level for page processing.
12
- */
13
- concurrency?: number;
14
- /**
15
- * Image encoding format when rendering non-text pages. Defaults to 'png'.
16
- */
17
- imageEncoding?: ImageEncoding;
18
- /**
19
- * Password for encrypted pdf files.
20
- */
21
- password?: string;
22
- /**
23
- * Scale of a page if content is not text.
24
- */
25
- scale: number;
26
- }
1
+ import { AfppParseOptions, PageProcessor } from './core';
27
2
  /**
28
3
  * Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL). Pages are returned in mixed array of strings (text content) and buffers (image content) with in callback function.
29
4
  *
@@ -44,5 +19,4 @@ interface ParseOptions {
44
19
  *
45
20
  * @throws {Error} Throws an error if the input type is invalid.
46
21
  */
47
- export declare const parsePdf: <T>(input: Buffer | string | Uint8Array | URL, options: ParseOptions, callback: PageProcessor<T>) => Promise<T[]>;
48
- export {};
22
+ export declare const parsePdf: <T>(input: Buffer | string | Uint8Array | URL, options: AfppParseOptions, callback: PageProcessor<T>) => Promise<T[]>;
package/dist/parsePdf.js CHANGED
@@ -1,49 +1,7 @@
1
1
  "use strict";
2
- var __importDefault = (this && this.__importDefault) || function (mod) {
3
- return (mod && mod.__esModule) ? mod : { "default": mod };
4
- };
5
2
  Object.defineProperty(exports, "__esModule", { value: true });
6
3
  exports.parsePdf = void 0;
7
- const promises_1 = require("node:fs/promises");
8
- const p_limit_1 = __importDefault(require("p-limit"));
9
- const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs");
10
- const processPdfPage = async (page, canvasFactory, pageNumber, pageCount, scale, encoding, callback) => {
11
- const textContent = await page.getTextContent({
12
- includeMarkedContent: false,
13
- });
14
- const items = textContent.items;
15
- if (items.length === 0) {
16
- const viewport = page.getViewport({ scale });
17
- const canvas = canvasFactory.createCanvas(viewport.width, viewport.height);
18
- // console.log(createCanvas, canvas);
19
- const context = canvas.getContext('2d');
20
- await page.render({ canvasContext: context, viewport }).promise;
21
- //@ts-expect-error this should be fixed in release
22
- const imageBuffer = await canvas.encode(encoding);
23
- canvasFactory.destroyCanvas(canvas);
24
- return callback(imageBuffer, pageNumber, pageCount);
25
- }
26
- const pageText = items.map((item) => item.str || '').join(' ');
27
- return callback(pageText, pageNumber, pageCount);
28
- };
29
- const parsePdfFileBuffer = async (options, scale, concurrency, encoding, callback) => {
30
- const limit = (0, p_limit_1.default)(concurrency);
31
- const loadingTask = (0, pdf_mjs_1.getDocument)({ ...options, verbosity: 0 });
32
- const pdfDocument = await loadingTask.promise;
33
- const { numPages } = pdfDocument;
34
- const results = new Array(numPages);
35
- const pageTasks = Array.from({ length: numPages }, (_, i) => {
36
- const pageNum = i + 1;
37
- return limit(async () => {
38
- const page = await pdfDocument.getPage(pageNum);
39
- const canvasFactory = pdfDocument.canvasFactory;
40
- const result = await processPdfPage(page, canvasFactory, pageNum, numPages, scale, encoding, callback);
41
- results[i] = result;
42
- });
43
- });
44
- await Promise.all(pageTasks);
45
- return results;
46
- };
4
+ const core_1 = require("./core");
47
5
  /**
48
6
  * Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL). Pages are returned in mixed array of strings (text content) and buffers (image content) with in callback function.
49
7
  *
@@ -64,31 +22,6 @@ const parsePdfFileBuffer = async (options, scale, concurrency, encoding, callbac
64
22
  *
65
23
  * @throws {Error} Throws an error if the input type is invalid.
66
24
  */
67
- const parsePdf = async (input, options, callback) => {
68
- if (typeof callback !== 'function') {
69
- throw new Error(`Invalid callback type: ${typeof callback}`);
70
- }
71
- const scale = options.scale ?? 2.0;
72
- const concurrency = options.concurrency ?? 1;
73
- const encoding = options.imageEncoding ?? 'png';
74
- if (!['avif', 'jpeg', 'png', 'webp'].includes(encoding)) {
75
- throw new Error(`Unsupported image encoding format: '${encoding}'`);
76
- }
77
- const baseOptions = { ...options };
78
- if (typeof input === 'string') {
79
- const fileBuffer = await (0, promises_1.readFile)(input);
80
- return parsePdfFileBuffer({ data: new Uint8Array(fileBuffer), ...baseOptions }, scale, concurrency, encoding, callback);
81
- }
82
- if (Buffer.isBuffer(input)) {
83
- return parsePdfFileBuffer({ data: new Uint8Array(input), ...baseOptions }, scale, concurrency, encoding, callback);
84
- }
85
- if (input instanceof Uint8Array) {
86
- return parsePdfFileBuffer({ data: input, ...baseOptions }, scale, concurrency, encoding, callback);
87
- }
88
- if (input instanceof URL) {
89
- return parsePdfFileBuffer({ url: input, ...baseOptions }, scale, concurrency, encoding, callback);
90
- }
91
- throw new Error(`Invalid source type: ${typeof input}`);
92
- };
25
+ const parsePdf = async (input, options, callback) => (0, core_1.parsePdfFile)(core_1.PROCESSING_TYPE.MIXED, input, options, callback);
93
26
  exports.parsePdf = parsePdf;
94
27
  //# sourceMappingURL=parsePdf.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"parsePdf.js","sourceRoot":"","sources":["../src/parsePdf.ts"],"names":[],"mappings":";;;;;;AAAA,+CAA4C;AAG5C,sDAA6B;AAC7B,6DAA8D;AA0C9D,MAAM,cAAc,GAAG,KAAK,EAC1B,IAAkB,EAClB,aAA4B,EAC5B,UAAkB,EAClB,SAAiB,EACjB,KAAa,EACb,QAAuB,EACvB,QAA0B,EACd,EAAE;IACd,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;QAC5C,oBAAoB,EAAE,KAAK;KAC5B,CAAC,CAAC;IACH,MAAM,KAAK,GAAG,WAAW,CAAC,KAAmB,CAAC;IAE9C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QAE7C,MAAM,MAAM,GAAG,aAAa,CAAC,YAAY,CAAC,QAAQ,CAAC,KAAK,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;QAC3E,qCAAqC;QACrC,MAAM,OAAO,GAAG,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;QACxC,MAAM,IAAI,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,OAAO,CAAC;QAChE,kDAAkD;QAClD,MAAM,WAAW,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QAClD,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;QACpC,OAAO,QAAQ,CAAC,WAAW,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;IACtD,CAAC;IAED,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC/D,OAAO,QAAQ,CAAC,QAAQ,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;AACnD,CAAC,CAAC;AAEF,MAAM,kBAAkB,GAAG,KAAK,EAC9B,OAA+B,EAC/B,KAAa,EACb,WAAmB,EACnB,QAAuB,EACvB,QAA0B,EACZ,EAAE;IAChB,MAAM,KAAK,GAAG,IAAA,iBAAM,EAAC,WAAW,CAAC,CAAC;IAClC,MAAM,WAAW,GAAG,IAAA,qBAAW,EAAC,EAAE,GAAG,OAAO,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC;IAC9D,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IAC9C,MAAM,EAAE,QAAQ,EAAE,GAAG,WAAW,CAAC;IACjC,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,QAAQ,CAAC,CAAC;IAEzC,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAC1D,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC;QACtB,OAAO,KAAK,CAAC,KAAK,IAAI,EAAE;YACtB,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YAChD,MAAM,aAAa,GAAG,WAAW,CAAC,aAA8B,CAAC;YAEjE,MAAM,MAAM,GAAG,MAAM,cAAc,CACjC,IAAI,EACJ,aAAa,EACb,OAAO,EACP,QAAQ,EACR,KAAK,EACL,QAAQ,EACR,QAAQ,CACT,CAAC;YACF,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC;QACtB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,MAAM,OAAO,CAAC,GAAG,C
AAC,SAAS,CAAC,CAAC;IAC7B,OAAO,OAAO,CAAC;AACjB,CAAC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;GAmBG;AAEI,MAAM,QAAQ,GAAG,KAAK,EAC3B,KAAyC,EACzC,OAAqB,EACrB,QAA0B,EACZ,EAAE;IAChB,IAAI,OAAO,QAAQ,KAAK,UAAU,EAAE,CAAC;QACnC,MAAM,IAAI,KAAK,CAAC,0BAA0B,OAAO,QAAQ,EAAE,CAAC,CAAC;IAC/D,CAAC;IAED,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC;IACnC,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;IAC7C,MAAM,QAAQ,GAAG,OAAO,CAAC,aAAa,IAAI,KAAK,CAAC;IAEhD,IAAI,CAAC,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;QACxD,MAAM,IAAI,KAAK,CAAC,uCAAuC,QAAQ,GAAG,CAAC,CAAC;IACtE,CAAC;IAED,MAAM,WAAW,GAAG,EAAE,GAAG,OAAO,EAAE,CAAC;IAEnC,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,MAAM,IAAA,mBAAQ,EAAC,KAAK,CAAC,CAAC;QACzC,OAAO,kBAAkB,CACvB,EAAE,IAAI,EAAE,IAAI,UAAU,CAAC,UAAU,CAAC,EAAE,GAAG,WAAW,EAAE,EACpD,KAAK,EACL,WAAW,EACX,QAAQ,EACR,QAAQ,CACT,CAAC;IACJ,CAAC;IAED,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,kBAAkB,CACvB,EAAE,IAAI,EAAE,IAAI,UAAU,CAAC,KAAK,CAAC,EAAE,GAAG,WAAW,EAAE,EAC/C,KAAK,EACL,WAAW,EACX,QAAQ,EACR,QAAQ,CACT,CAAC;IACJ,CAAC;IAED,IAAI,KAAK,YAAY,UAAU,EAAE,CAAC;QAChC,OAAO,kBAAkB,CACvB,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,WAAW,EAAE,EAC/B,KAAK,EACL,WAAW,EACX,QAAQ,EACR,QAAQ,CACT,CAAC;IACJ,CAAC;IAED,IAAI,KAAK,YAAY,GAAG,EAAE,CAAC;QACzB,OAAO,kBAAkB,CACvB,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,WAAW,EAAE,EAC9B,KAAK,EACL,WAAW,EACX,QAAQ,EACR,QAAQ,CACT,CAAC;IACJ,CAAC;IAED,MAAM,IAAI,KAAK,CAAC,wBAAwB,OAAO,KAAK,EAAE,CAAC,CAAC;AAC1D,CAAC,CAAC;AA7DW,QAAA,QAAQ,YA6DnB"}
1
+ {"version":3,"file":"parsePdf.js","sourceRoot":"","sources":["../src/parsePdf.ts"],"names":[],"mappings":";;;AAAA,yCAKwB;AAExB;;;;;;;;;;;;;;;;;;;GAmBG;AAEI,MAAM,QAAQ,GAAG,KAAK,EAC3B,KAAyC,EACzC,OAAyB,EACzB,QAA0B,EACZ,EAAE,CAChB,IAAA,mBAAY,EAAC,sBAAe,CAAC,KAAK,EAAE,KAAK,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC;AALnD,QAAA,QAAQ,YAK2C"}
@@ -1,6 +1,4 @@
1
- interface ParseOptions {
2
- password?: string;
3
- }
1
+ import { AfppParseOptions } from './core';
4
2
  /**
5
3
  * Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to an array of image buffers.
6
4
  *
@@ -17,5 +15,4 @@ interface ParseOptions {
17
15
  *
18
16
  * @throws {Error} Throws an error if the input type is invalid.
19
17
  */
20
- export declare const pdf2image: (input: Buffer | string | Uint8Array | URL, options?: ParseOptions) => Promise<Buffer<ArrayBufferLike>[]>;
21
- export {};
18
+ export declare const pdf2image: (input: Buffer | string | Uint8Array | URL, options?: AfppParseOptions) => Promise<Buffer<ArrayBufferLike>[]>;
package/dist/pdf2image.js CHANGED
@@ -1,37 +1,7 @@
1
1
  "use strict";
2
- var __importDefault = (this && this.__importDefault) || function (mod) {
3
- return (mod && mod.__esModule) ? mod : { "default": mod };
4
- };
5
2
  Object.defineProperty(exports, "__esModule", { value: true });
6
3
  exports.pdf2image = void 0;
7
- const promises_1 = require("node:fs/promises");
8
- const canvas_1 = require("@napi-rs/canvas");
9
- const p_limit_1 = __importDefault(require("p-limit"));
10
- const promiseLimit = (0, p_limit_1.default)(1);
11
- const parsePdfFileBuffer = async (options) => import('pdfjs-dist/legacy/build/pdf.mjs').then(async (pdfjsLib) => {
12
- const loadingTask = pdfjsLib.getDocument({
13
- ...options,
14
- verbosity: 0, // TODO enable for debug
15
- });
16
- const pdfDocument = await loadingTask.promise;
17
- const { numPages } = pdfDocument;
18
- const pageContents = new Array(numPages).fill(Buffer.from(''));
19
- // eslint-disable-next-line @typescript-eslint/no-invalid-void-type
20
- const pagePromises = [];
21
- for (let pageNum = 1; pageNum <= numPages; pageNum += 1) {
22
- pagePromises.push(promiseLimit(() => pdfDocument.getPage(pageNum).then(async (page) => {
23
- const viewport = page.getViewport({ scale: 2.0 });
24
- const canvas = (0, canvas_1.createCanvas)(viewport.width, viewport.height);
25
- const context = canvas.getContext('2d');
26
- await page.render({ canvasContext: context, viewport }).promise;
27
- const imageBuffer = await canvas.encode('png');
28
- pageContents[pageNum - 1] = imageBuffer;
29
- return;
30
- })));
31
- }
32
- await Promise.all(pagePromises);
33
- return pageContents;
34
- });
4
+ const core_1 = require("./core");
35
5
  /**
36
6
  * Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to an array of image buffers.
37
7
  *
@@ -48,23 +18,6 @@ const parsePdfFileBuffer = async (options) => import('pdfjs-dist/legacy/build/pd
48
18
  *
49
19
  * @throws {Error} Throws an error if the input type is invalid.
50
20
  */
51
- const pdf2image = async (input, options) => {
52
- if (typeof input === 'string') {
53
- const fileBuffer = await (0, promises_1.readFile)(input, {});
54
- const data = new Uint8Array(fileBuffer);
55
- return parsePdfFileBuffer({ data, ...options });
56
- }
57
- if (Buffer.isBuffer(input)) {
58
- const data = new Uint8Array(input);
59
- return parsePdfFileBuffer({ data, ...options });
60
- }
61
- if (input instanceof Uint8Array) {
62
- return parsePdfFileBuffer({ data: input, ...options });
63
- }
64
- if (input instanceof URL) {
65
- return parsePdfFileBuffer({ url: input, ...options });
66
- }
67
- throw new Error(`Invalid source type: ${typeof input}`);
68
- };
21
+ const pdf2image = async (input, options) => (0, core_1.parsePdfFile)(core_1.PROCESSING_TYPE.IMAGE, input, options);
69
22
  exports.pdf2image = pdf2image;
70
23
  //# sourceMappingURL=pdf2image.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"pdf2image.js","sourceRoot":"","sources":["../src/pdf2image.ts"],"names":[],"mappings":";;;;;;AAAA,+CAA4C;AAE5C,4CAA+C;AAC/C,sDAA6B;AAI7B,MAAM,YAAY,GAAG,IAAA,iBAAM,EAAC,CAAC,CAAC,CAAC;AAE/B,MAAM,kBAAkB,GAAG,KAAK,EAAE,OAA+B,EAAE,EAAE,CACnE,MAAM,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,EAAE,EAAE;IAChE,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACvC,GAAG,OAAO;QACV,SAAS,EAAE,CAAC,EAAE,wBAAwB;KACvC,CAAC,CAAC;IAEH,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IAE9C,MAAM,EAAE,QAAQ,EAAE,GAAG,WAAW,CAAC;IACjC,MAAM,YAAY,GAAa,IAAI,KAAK,CAAS,QAAQ,CAAC,CAAC,IAAI,CAC7D,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAChB,CAAC;IACF,mEAAmE;IACnE,MAAM,YAAY,GAAmC,EAAE,CAAC;IAExD,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,QAAQ,EAAE,OAAO,IAAI,CAAC,EAAE,CAAC;QACxD,YAAY,CAAC,IAAI,CACf,YAAY,CAAC,GAAG,EAAE,CAChB,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YAC/C,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC;YAClD,MAAM,MAAM,GAAG,IAAA,qBAAY,EAAC,QAAQ,CAAC,KAAK,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;YAC7D,MAAM,OAAO,GAAG,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;YAExC,MAAM,IAAI,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,OAAO,CAAC;YAEhE,MAAM,WAAW,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC/C,YAAY,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,WAAW,CAAC;YACxC,OAAO;QACT,CAAC,CAAC,CACH,CACF,CAAC;IACJ,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IAChC,OAAO,YAAY,CAAC;AACtB,CAAC,CAAC,CAAC;AAML;;;;;;;;;;;;;;;GAeG;AACI,MAAM,SAAS,GAAG,KAAK,EAC5B,KAAyC,EACzC,OAAsB,EACtB,EAAE;IACF,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,MAAM,IAAA,mBAAQ,EAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QAC7C,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,UAAU,CAAC,CAAC;QACxC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IAClD,CAAC;IACD,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC;QACnC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IAClD,CAAC;IACD,IAAI,KAAK,YAAY,UAAU,EAAE,CAAC;QAChC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,OAAO,EA
AE,CAAC,CAAC;IACzD,CAAC;IACD,IAAI,KAAK,YAAY,GAAG,EAAE,CAAC;QACzB,OAAO,kBAAkB,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IACxD,CAAC;IACD,MAAM,IAAI,KAAK,CAAC,wBAAwB,OAAO,KAAK,EAAE,CAAC,CAAC;AAC1D,CAAC,CAAC;AApBW,QAAA,SAAS,aAoBpB"}
1
+ {"version":3,"file":"pdf2image.js","sourceRoot":"","sources":["../src/pdf2image.ts"],"names":[],"mappings":";;;AAAA,yCAIwB;AAExB;;;;;;;;;;;;;;;GAeG;AACI,MAAM,SAAS,GAAG,KAAK,EAC5B,KAAyC,EACzC,OAA0B,EAC1B,EAAE,CAAC,IAAA,mBAAY,EAAC,sBAAe,CAAC,KAAK,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;AAH5C,QAAA,SAAS,aAGmC"}
@@ -1,6 +1,4 @@
1
- interface ParseOptions {
2
- password?: string;
3
- }
1
+ import { AfppParseOptions } from './core';
4
2
  /**
5
3
  * Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to a string.
6
4
  *
@@ -17,5 +15,4 @@ interface ParseOptions {
17
15
  *
18
16
  * @throws {Error} Throws an error if the input type is invalid.
19
17
  */
20
- export declare const pdf2string: (input: Buffer | string | Uint8Array | URL, options?: ParseOptions) => Promise<string[]>;
21
- export {};
18
+ export declare const pdf2string: (input: Buffer | string | Uint8Array | URL, options?: AfppParseOptions) => Promise<string[]>;
@@ -1,37 +1,7 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.pdf2string = void 0;
4
- const promises_1 = require("node:fs/promises");
5
- const parsePdfFileBuffer = async (options) => import('pdfjs-dist/legacy/build/pdf.mjs').then(async (pdfjsLib) => {
6
- const loadingTask = pdfjsLib.getDocument({
7
- ...options,
8
- verbosity: 0, // TODO enable for debug
9
- });
10
- const pdfDocument = await loadingTask.promise;
11
- const { numPages } = pdfDocument;
12
- const pageContents = new Array(numPages).fill('');
13
- // eslint-disable-next-line @typescript-eslint/no-invalid-void-type
14
- const pagePromises = [];
15
- for (let pageNum = 1; pageNum <= numPages; pageNum += 1) {
16
- pagePromises.push(pdfDocument.getPage(pageNum).then(async (page) => {
17
- const textContent = await page.getTextContent({
18
- includeMarkedContent: false,
19
- });
20
- // ? Type assertion of items to TextItem[] should be safe because {includeMarkedContent: false}
21
- const items = textContent.items;
22
- if (items.length === 0) {
23
- pageContents[pageNum - 1] = '';
24
- }
25
- else {
26
- const pageText = items.map((item) => item.str || '').join(' ');
27
- pageContents[pageNum - 1] = pageText;
28
- }
29
- return;
30
- }));
31
- }
32
- await Promise.all(pagePromises);
33
- return pageContents;
34
- });
4
+ const core_1 = require("./core");
35
5
  /**
36
6
  * Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to a string.
37
7
  *
@@ -48,23 +18,6 @@ const parsePdfFileBuffer = async (options) => import('pdfjs-dist/legacy/build/pd
48
18
  *
49
19
  * @throws {Error} Throws an error if the input type is invalid.
50
20
  */
51
- const pdf2string = async (input, options) => {
52
- if (typeof input === 'string') {
53
- const fileBuffer = await (0, promises_1.readFile)(input, {});
54
- const data = new Uint8Array(fileBuffer);
55
- return parsePdfFileBuffer({ data, ...options });
56
- }
57
- if (Buffer.isBuffer(input)) {
58
- const data = new Uint8Array(input);
59
- return parsePdfFileBuffer({ data, ...options });
60
- }
61
- if (input instanceof Uint8Array) {
62
- return parsePdfFileBuffer({ data: input, ...options });
63
- }
64
- if (input instanceof URL) {
65
- return parsePdfFileBuffer({ url: input, ...options });
66
- }
67
- throw new Error(`Invalid source type: ${typeof input}`);
68
- };
21
+ const pdf2string = async (input, options) => (0, core_1.parsePdfFile)(core_1.PROCESSING_TYPE.TEXT, input, options);
69
22
  exports.pdf2string = pdf2string;
70
23
  //# sourceMappingURL=pdf2string.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"pdf2string.js","sourceRoot":"","sources":["../src/pdf2string.ts"],"names":[],"mappings":";;;AAAA,+CAA4C;AAQ5C,MAAM,kBAAkB,GAAG,KAAK,EAAE,OAA+B,EAAE,EAAE,CACnE,MAAM,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,EAAE,EAAE;IAChE,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACvC,GAAG,OAAO;QACV,SAAS,EAAE,CAAC,EAAE,wBAAwB;KACvC,CAAC,CAAC;IACH,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IAE9C,MAAM,EAAE,QAAQ,EAAE,GAAG,WAAW,CAAC;IACjC,MAAM,YAAY,GAAa,IAAI,KAAK,CAAS,QAAQ,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEpE,mEAAmE;IACnE,MAAM,YAAY,GAAmC,EAAE,CAAC;IAExD,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,QAAQ,EAAE,OAAO,IAAI,CAAC,EAAE,CAAC;QACxD,YAAY,CAAC,IAAI,CACf,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YAC/C,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;gBAC5C,oBAAoB,EAAE,KAAK;aAC5B,CAAC,CAAC;YACH,+FAA+F;YAC/F,MAAM,KAAK,GAAG,WAAW,CAAC,KAAmB,CAAC;YAC9C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACvB,YAAY,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC;YACjC,CAAC;iBAAM,CAAC;gBACN,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBAC/D,YAAY,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC;YACvC,CAAC;YACD,OAAO;QACT,CAAC,CAAC,CACH,CAAC;IACJ,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IAChC,OAAO,YAAY,CAAC;AACtB,CAAC,CAAC,CAAC;AAML;;;;;;;;;;;;;;;GAeG;AACI,MAAM,UAAU,GAAG,KAAK,EAC7B,KAAyC,EACzC,OAAsB,EACtB,EAAE;IACF,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,MAAM,IAAA,mBAAQ,EAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QAC7C,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,UAAU,CAAC,CAAC;QACxC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IAClD,CAAC;IACD,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC;QACnC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IAClD,CAAC;IACD,IAAI,KAAK,YAAY,UAAU,EAAE,CAAC;QAChC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IACzD,CAAC;IACD,IAAI,KAAK,YAAY,GAAG,EAAE,CAAC;QACzB,OAAO,kBAAkB,CAAC,EAAE,GAAG,EAAE,KAA
K,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IACxD,CAAC;IACD,MAAM,IAAI,KAAK,CAAC,wBAAwB,OAAO,KAAK,EAAE,CAAC,CAAC;AAC1D,CAAC,CAAC;AApBW,QAAA,UAAU,cAoBrB"}
1
+ {"version":3,"file":"pdf2string.js","sourceRoot":"","sources":["../src/pdf2string.ts"],"names":[],"mappings":";;;AAAA,yCAIwB;AACxB;;;;;;;;;;;;;;;GAeG;AAEI,MAAM,UAAU,GAAG,KAAK,EAC7B,KAAyC,EACzC,OAA0B,EAC1B,EAAE,CAAC,IAAA,mBAAY,EAAC,sBAAe,CAAC,IAAI,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;AAH3C,QAAA,UAAU,cAGiC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "afpp",
3
- "version": "2.0.0-beta.1",
3
+ "version": "2.0.0",
4
4
  "description": "another f*cking pdf parser",
5
5
  "types": "./dist/index.d.ts",
6
6
  "main": "./dist/index.js",
@@ -25,8 +25,8 @@
25
25
  "url": "git+ssh://git@github.com/l2ysho/afpp.git"
26
26
  },
27
27
  "engines": {
28
- "node": "v22.14.0",
29
- "npm": "10.9.2"
28
+ "node": ">=22.14.0",
29
+ "npm": ">=10.9.2"
30
30
  },
31
31
  "keywords": [
32
32
  "pdf",
@@ -71,8 +71,5 @@
71
71
  "dependencies": {
72
72
  "p-limit": "6.2.0",
73
73
  "pdfjs-dist": "5.1.91"
74
- },
75
- "peerDependencies": {
76
- "@napi-rs/canvas": "0.1.69"
77
74
  }
78
75
  }