afpp 1.8.0-beta.2 → 1.8.0-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/parsePdf.d.ts +14 -2
- package/dist/parsePdf.js +47 -42
- package/dist/parsePdf.js.map +1 -1
- package/package.json +1 -1
package/dist/parsePdf.d.ts
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
1
|
-
export type
|
|
1
|
+
export type PageProcessor<T> = (content: Buffer | string, pageNumber: number, pageCount: number) => Promise<T> | T;
|
|
2
|
+
type ImageEncoding = 'avif' | 'jpeg' | 'png' | 'webp';
|
|
2
3
|
interface ParseOptions {
|
|
4
|
+
/**
|
|
5
|
+
* Concurrency level for page processing.
|
|
6
|
+
*/
|
|
7
|
+
concurrency?: number;
|
|
8
|
+
/**
|
|
9
|
+
* Image encoding format when rendering non-text pages. Defaults to 'png'.
|
|
10
|
+
*/
|
|
11
|
+
imageEncoding?: ImageEncoding;
|
|
3
12
|
/**
|
|
4
13
|
* Password for encrypted pdf files.
|
|
5
14
|
*/
|
|
@@ -18,6 +27,9 @@ interface ParseOptions {
|
|
|
18
27
|
* @param {Buffer|Uint8Array|string|URL} input - The PDF source, which can be a file path, URL, Buffer, or Uint8Array.
|
|
19
28
|
* @param {Object} [options] - Optional parsing options for customizing the PDF parsing process.
|
|
20
29
|
* @param {string} [options.password] - The password for encrypted PDF files, if required.
|
|
30
|
+
* @param {number} [options.scale=2.0] - Scale factor for rendering pages (affects image resolution).
|
|
31
|
+
* @param {number} [options.concurrency=1] - Number of pages to process in parallel.
|
|
32
|
+
* @param {'png' | 'jpeg' | 'webp' | 'avif'} [options.imageEncoding='png'] - Image format for rendered PDF pages.
|
|
21
33
|
* @param {function} callback - Callback function to add another layer of processing; the default callback returns the content of the page without any added processing.
|
|
22
34
|
*
|
|
23
35
|
* @since — v1.0.0
|
|
@@ -26,5 +38,5 @@ interface ParseOptions {
|
|
|
26
38
|
*
|
|
27
39
|
* @throws {Error} Throws an error if the input type is invalid.
|
|
28
40
|
*/
|
|
29
|
-
export declare const parsePdf: <T>(input: Buffer | string | Uint8Array | URL, options: ParseOptions, callback:
|
|
41
|
+
export declare const parsePdf: <T>(input: Buffer | string | Uint8Array | URL, options: ParseOptions, callback: PageProcessor<T>) => Promise<T[]>;
|
|
30
42
|
export {};
|
package/dist/parsePdf.js
CHANGED
|
@@ -7,44 +7,41 @@ exports.parsePdf = void 0;
|
|
|
7
7
|
const promises_1 = require("node:fs/promises");
|
|
8
8
|
const canvas_1 = require("@napi-rs/canvas");
|
|
9
9
|
const p_limit_1 = __importDefault(require("p-limit"));
|
|
10
|
-
const
|
|
11
|
-
const
|
|
12
|
-
const
|
|
13
|
-
|
|
14
|
-
verbosity: 0,
|
|
10
|
+
const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
11
|
+
const processPdfPage = async (page, pageNumber, pageCount, scale, encoding, callback) => {
|
|
12
|
+
const textContent = await page.getTextContent({
|
|
13
|
+
includeMarkedContent: false,
|
|
15
14
|
});
|
|
15
|
+
const items = textContent.items;
|
|
16
|
+
if (items.length === 0) {
|
|
17
|
+
const viewport = page.getViewport({ scale });
|
|
18
|
+
const canvas = (0, canvas_1.createCanvas)(viewport.width, viewport.height);
|
|
19
|
+
const context = canvas.getContext('2d');
|
|
20
|
+
await page.render({ canvasContext: context, viewport }).promise;
|
|
21
|
+
//@ts-expect-error this should be fixed in release
|
|
22
|
+
const imageBuffer = await canvas.encode(encoding);
|
|
23
|
+
return callback(imageBuffer, pageNumber, pageCount);
|
|
24
|
+
}
|
|
25
|
+
const pageText = items.map((item) => item.str || '').join(' ');
|
|
26
|
+
return callback(pageText, pageNumber, pageCount);
|
|
27
|
+
};
|
|
28
|
+
const parsePdfFileBuffer = async (options, scale, concurrency, encoding, callback) => {
|
|
29
|
+
const limit = (0, p_limit_1.default)(concurrency);
|
|
30
|
+
const loadingTask = (0, pdf_mjs_1.getDocument)({ ...options, verbosity: 0 });
|
|
16
31
|
const pdfDocument = await loadingTask.promise;
|
|
17
32
|
const { numPages } = pdfDocument;
|
|
18
|
-
const
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
const
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
const context = canvas.getContext('2d');
|
|
31
|
-
await page.render({ canvasContext: context, viewport }).promise;
|
|
32
|
-
const imageBuffer = await canvas.encode('png');
|
|
33
|
-
// eslint-disable-next-line promise/no-callback-in-promise
|
|
34
|
-
pageContents[pageNum - 1] = await callback(imageBuffer, pageNum, numPages);
|
|
35
|
-
return page;
|
|
36
|
-
}
|
|
37
|
-
else {
|
|
38
|
-
const pageText = items.map((item) => item.str || '').join(' ');
|
|
39
|
-
// eslint-disable-next-line promise/no-callback-in-promise
|
|
40
|
-
pageContents[pageNum - 1] = await callback(pageText, pageNum, numPages);
|
|
41
|
-
return page;
|
|
42
|
-
}
|
|
43
|
-
})));
|
|
44
|
-
}
|
|
45
|
-
await Promise.all(pagePromises);
|
|
46
|
-
return pageContents;
|
|
47
|
-
});
|
|
33
|
+
const results = new Array(numPages);
|
|
34
|
+
const pageTasks = Array.from({ length: numPages }, (_, i) => {
|
|
35
|
+
const pageNum = i + 1;
|
|
36
|
+
return limit(async () => {
|
|
37
|
+
const page = await pdfDocument.getPage(pageNum);
|
|
38
|
+
const result = await processPdfPage(page, pageNum, numPages, scale, encoding, callback);
|
|
39
|
+
results[i] = result;
|
|
40
|
+
});
|
|
41
|
+
});
|
|
42
|
+
await Promise.all(pageTasks);
|
|
43
|
+
return results;
|
|
44
|
+
};
|
|
48
45
|
/**
|
|
49
46
|
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL). Each page is passed to the callback function as either a string (text content) or a buffer (image content), and the callback results are returned as an array in page order.
|
|
50
47
|
*
|
|
@@ -54,6 +51,9 @@ const parsePdfFileBuffer = async (options, callback) => import('pdfjs-dist/legac
|
|
|
54
51
|
* @param {Buffer|Uint8Array|string|URL} input - The PDF source, which can be a file path, URL, Buffer, or Uint8Array.
|
|
55
52
|
* @param {Object} [options] - Optional parsing options for customizing the PDF parsing process.
|
|
56
53
|
* @param {string} [options.password] - The password for encrypted PDF files, if required.
|
|
54
|
+
* @param {number} [options.scale=2.0] - Scale factor for rendering pages (affects image resolution).
|
|
55
|
+
* @param {number} [options.concurrency=1] - Number of pages to process in parallel.
|
|
56
|
+
* @param {'png' | 'jpeg' | 'webp' | 'avif'} [options.imageEncoding='png'] - Image format for rendered PDF pages.
|
|
57
57
|
* @param {function} callback - Callback function to add another layer of processing; the default callback returns the content of the page without any added processing.
|
|
58
58
|
*
|
|
59
59
|
* @since — v1.0.0
|
|
@@ -66,20 +66,25 @@ const parsePdf = async (input, options, callback) => {
|
|
|
66
66
|
if (typeof callback !== 'function') {
|
|
67
67
|
throw new Error(`Invalid callback type: ${typeof callback}`);
|
|
68
68
|
}
|
|
69
|
+
const scale = options.scale ?? 2.0;
|
|
70
|
+
const concurrency = options.concurrency ?? 1;
|
|
71
|
+
const encoding = options.imageEncoding ?? 'png';
|
|
72
|
+
if (!['avif', 'jpeg', 'png', 'webp'].includes(encoding)) {
|
|
73
|
+
throw new Error(`Unsupported image encoding format: '${encoding}'`);
|
|
74
|
+
}
|
|
75
|
+
const baseOptions = { ...options };
|
|
69
76
|
if (typeof input === 'string') {
|
|
70
|
-
const fileBuffer = await (0, promises_1.readFile)(input
|
|
71
|
-
|
|
72
|
-
return parsePdfFileBuffer({ data, ...options }, callback);
|
|
77
|
+
const fileBuffer = await (0, promises_1.readFile)(input);
|
|
78
|
+
return parsePdfFileBuffer({ data: new Uint8Array(fileBuffer), ...baseOptions }, scale, concurrency, encoding, callback);
|
|
73
79
|
}
|
|
74
80
|
if (Buffer.isBuffer(input)) {
|
|
75
|
-
|
|
76
|
-
return parsePdfFileBuffer({ data, ...options }, callback);
|
|
81
|
+
return parsePdfFileBuffer({ data: new Uint8Array(input), ...baseOptions }, scale, concurrency, encoding, callback);
|
|
77
82
|
}
|
|
78
83
|
if (input instanceof Uint8Array) {
|
|
79
|
-
return parsePdfFileBuffer({ data: input, ...
|
|
84
|
+
return parsePdfFileBuffer({ data: input, ...baseOptions }, scale, concurrency, encoding, callback);
|
|
80
85
|
}
|
|
81
86
|
if (input instanceof URL) {
|
|
82
|
-
return parsePdfFileBuffer({ url: input, ...
|
|
87
|
+
return parsePdfFileBuffer({ url: input, ...baseOptions }, scale, concurrency, encoding, callback);
|
|
83
88
|
}
|
|
84
89
|
throw new Error(`Invalid source type: ${typeof input}`);
|
|
85
90
|
};
|
package/dist/parsePdf.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"parsePdf.js","sourceRoot":"","sources":["../src/parsePdf.ts"],"names":[],"mappings":";;;;;;AAAA,+CAA4C;AAE5C,4CAA+C;AAC/C,sDAA6B;
|
|
1
|
+
{"version":3,"file":"parsePdf.js","sourceRoot":"","sources":["../src/parsePdf.ts"],"names":[],"mappings":";;;;;;AAAA,+CAA4C;AAE5C,4CAA+C;AAC/C,sDAA6B;AAC7B,6DAA8D;AAqC9D,MAAM,cAAc,GAAG,KAAK,EAC1B,IAAkB,EAClB,UAAkB,EAClB,SAAiB,EACjB,KAAa,EACb,QAAuB,EACvB,QAA0B,EACd,EAAE;IACd,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;QAC5C,oBAAoB,EAAE,KAAK;KAC5B,CAAC,CAAC;IACH,MAAM,KAAK,GAAG,WAAW,CAAC,KAAmB,CAAC;IAE9C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QAC7C,MAAM,MAAM,GAAG,IAAA,qBAAY,EAAC,QAAQ,CAAC,KAAK,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;QAC7D,MAAM,OAAO,GAAG,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;QACxC,MAAM,IAAI,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,OAAO,CAAC;QAChE,kDAAkD;QAClD,MAAM,WAAW,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QAClD,OAAO,QAAQ,CAAC,WAAW,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;IACtD,CAAC;IAED,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC/D,OAAO,QAAQ,CAAC,QAAQ,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;AACnD,CAAC,CAAC;AAEF,MAAM,kBAAkB,GAAG,KAAK,EAC9B,OAA+B,EAC/B,KAAa,EACb,WAAmB,EACnB,QAAuB,EACvB,QAA0B,EACZ,EAAE;IAChB,MAAM,KAAK,GAAG,IAAA,iBAAM,EAAC,WAAW,CAAC,CAAC;IAClC,MAAM,WAAW,GAAG,IAAA,qBAAW,EAAC,EAAE,GAAG,OAAO,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC;IAC9D,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IAC9C,MAAM,EAAE,QAAQ,EAAE,GAAG,WAAW,CAAC;IACjC,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,QAAQ,CAAC,CAAC;IAEzC,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAC1D,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC;QACtB,OAAO,KAAK,CAAC,KAAK,IAAI,EAAE;YACtB,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,MAAM,GAAG,MAAM,cAAc,CACjC,IAAI,EACJ,OAAO,EACP,QAAQ,EACR,KAAK,EACL,QAAQ,EACR,QAAQ,CACT,CAAC;YACF,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC;QACtB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,MAAM,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;IAC7B,OAAO,OAAO,CAAC;AACjB,CAAC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;GAmBG;AAEI,MAAM,QAAQ,GAAG,KAAK,E
AC3B,KAAyC,EACzC,OAAqB,EACrB,QAA0B,EACZ,EAAE;IAChB,IAAI,OAAO,QAAQ,KAAK,UAAU,EAAE,CAAC;QACnC,MAAM,IAAI,KAAK,CAAC,0BAA0B,OAAO,QAAQ,EAAE,CAAC,CAAC;IAC/D,CAAC;IAED,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC;IACnC,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;IAC7C,MAAM,QAAQ,GAAG,OAAO,CAAC,aAAa,IAAI,KAAK,CAAC;IAEhD,IAAI,CAAC,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;QACxD,MAAM,IAAI,KAAK,CAAC,uCAAuC,QAAQ,GAAG,CAAC,CAAC;IACtE,CAAC;IAED,MAAM,WAAW,GAAG,EAAE,GAAG,OAAO,EAAE,CAAC;IAEnC,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,MAAM,IAAA,mBAAQ,EAAC,KAAK,CAAC,CAAC;QACzC,OAAO,kBAAkB,CACvB,EAAE,IAAI,EAAE,IAAI,UAAU,CAAC,UAAU,CAAC,EAAE,GAAG,WAAW,EAAE,EACpD,KAAK,EACL,WAAW,EACX,QAAQ,EACR,QAAQ,CACT,CAAC;IACJ,CAAC;IAED,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,kBAAkB,CACvB,EAAE,IAAI,EAAE,IAAI,UAAU,CAAC,KAAK,CAAC,EAAE,GAAG,WAAW,EAAE,EAC/C,KAAK,EACL,WAAW,EACX,QAAQ,EACR,QAAQ,CACT,CAAC;IACJ,CAAC;IAED,IAAI,KAAK,YAAY,UAAU,EAAE,CAAC;QAChC,OAAO,kBAAkB,CACvB,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,WAAW,EAAE,EAC/B,KAAK,EACL,WAAW,EACX,QAAQ,EACR,QAAQ,CACT,CAAC;IACJ,CAAC;IAED,IAAI,KAAK,YAAY,GAAG,EAAE,CAAC;QACzB,OAAO,kBAAkB,CACvB,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,WAAW,EAAE,EAC9B,KAAK,EACL,WAAW,EACX,QAAQ,EACR,QAAQ,CACT,CAAC;IACJ,CAAC;IAED,MAAM,IAAI,KAAK,CAAC,wBAAwB,OAAO,KAAK,EAAE,CAAC,CAAC;AAC1D,CAAC,CAAC;AA7DW,QAAA,QAAQ,YA6DnB"}
|