afpp 1.8.0-beta.2 → 1.8.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,14 @@
1
- export type ParsePdfCallback<T> = (content: Buffer | string, pageNumber: number, pageCount: number) => Promise<T> | T;
1
+ export type PageProcessor<T> = (content: Buffer | string, pageNumber: number, pageCount: number) => Promise<T> | T;
2
+ type ImageEncoding = 'avif' | 'jpeg' | 'png' | 'webp';
2
3
  interface ParseOptions {
4
+ /**
5
+ * Maximum number of pages processed in parallel. Defaults to 1.
6
+ */
7
+ concurrency?: number;
8
+ /**
9
+ * Image encoding format when rendering non-text pages. Defaults to 'png'.
10
+ */
11
+ imageEncoding?: ImageEncoding;
3
12
  /**
4
13
  * Password for encrypted pdf files.
5
14
  */
@@ -18,6 +27,9 @@ interface ParseOptions {
18
27
  * @param {Buffer|Uint8Array|string|URL} input - The PDF source, which can be a file path, URL, Buffer, or Uint8Array.
19
28
  * @param {Object} [options] - Optional parsing options for customizing the PDF parsing process.
20
29
  * @param {string} [options.password] - The password for encrypted PDF files, if required.
30
+ * @param {number} [options.scale=2.0] - Scale factor for rendering pages (affects image resolution).
31
+ * @param {number} [options.concurrency=1] - Number of pages to process in parallel.
32
+ * @param {'png' | 'jpeg' | 'webp' | 'avif'} [options.imageEncoding='png'] - Image format for rendered PDF pages.
21
33
  * @param {function} callback - callback function to add another layer of processing, default callback returns content of page without any added processing.
22
34
  *
23
35
  * @since — v1.0.0
@@ -26,5 +38,5 @@ interface ParseOptions {
26
38
  *
27
39
  * @throws {Error} Throws an error if the input type is invalid.
28
40
  */
29
- export declare const parsePdf: <T>(input: Buffer | string | Uint8Array | URL, options: ParseOptions, callback: ParsePdfCallback<T>) => Promise<T[]>;
41
+ export declare const parsePdf: <T>(input: Buffer | string | Uint8Array | URL, options: ParseOptions, callback: PageProcessor<T>) => Promise<T[]>;
30
42
  export {};
package/dist/parsePdf.js CHANGED
@@ -7,44 +7,41 @@ exports.parsePdf = void 0;
7
7
  const promises_1 = require("node:fs/promises");
8
8
  const canvas_1 = require("@napi-rs/canvas");
9
9
  const p_limit_1 = __importDefault(require("p-limit"));
10
- const promiseLimit = (0, p_limit_1.default)(1);
11
- const parsePdfFileBuffer = async (options, callback) => import('pdfjs-dist/legacy/build/pdf.mjs').then(async (pdfjsLib) => {
12
- const loadingTask = pdfjsLib.getDocument({
13
- ...options,
14
- verbosity: 0,
10
+ const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs");
11
+ const processPdfPage = async (page, pageNumber, pageCount, scale, encoding, callback) => {
12
+ const textContent = await page.getTextContent({
13
+ includeMarkedContent: false,
15
14
  });
15
+ const items = textContent.items;
16
+ if (items.length === 0) {
17
+ const viewport = page.getViewport({ scale });
18
+ const canvas = (0, canvas_1.createCanvas)(viewport.width, viewport.height);
19
+ const context = canvas.getContext('2d');
20
+ await page.render({ canvasContext: context, viewport }).promise;
21
+ //@ts-expect-error this should be fixed in release
22
+ const imageBuffer = await canvas.encode(encoding);
23
+ return callback(imageBuffer, pageNumber, pageCount);
24
+ }
25
+ const pageText = items.map((item) => item.str || '').join(' ');
26
+ return callback(pageText, pageNumber, pageCount);
27
+ };
28
+ const parsePdfFileBuffer = async (options, scale, concurrency, encoding, callback) => {
29
+ const limit = (0, p_limit_1.default)(concurrency);
30
+ const loadingTask = (0, pdf_mjs_1.getDocument)({ ...options, verbosity: 0 });
16
31
  const pdfDocument = await loadingTask.promise;
17
32
  const { numPages } = pdfDocument;
18
- const pageContents = Array.from({ length: numPages }, () => null);
19
- // eslint-disable-next-line @typescript-eslint/no-invalid-void-type
20
- const pagePromises = [];
21
- for (let pageNum = 1; pageNum <= numPages; pageNum += 1) {
22
- pagePromises.push(promiseLimit(() => pdfDocument.getPage(pageNum).then(async (page) => {
23
- const textContent = await page.getTextContent({
24
- includeMarkedContent: false,
25
- });
26
- const items = textContent.items;
27
- if (items.length === 0) {
28
- const viewport = page.getViewport({ scale: 2.0 });
29
- const canvas = (0, canvas_1.createCanvas)(viewport.width, viewport.height);
30
- const context = canvas.getContext('2d');
31
- await page.render({ canvasContext: context, viewport }).promise;
32
- const imageBuffer = await canvas.encode('png');
33
- // eslint-disable-next-line promise/no-callback-in-promise
34
- pageContents[pageNum - 1] = await callback(imageBuffer, pageNum, numPages);
35
- return page;
36
- }
37
- else {
38
- const pageText = items.map((item) => item.str || '').join(' ');
39
- // eslint-disable-next-line promise/no-callback-in-promise
40
- pageContents[pageNum - 1] = await callback(pageText, pageNum, numPages);
41
- return page;
42
- }
43
- })));
44
- }
45
- await Promise.all(pagePromises);
46
- return pageContents;
47
- });
33
+ const results = new Array(numPages);
34
+ const pageTasks = Array.from({ length: numPages }, (_, i) => {
35
+ const pageNum = i + 1;
36
+ return limit(async () => {
37
+ const page = await pdfDocument.getPage(pageNum);
38
+ const result = await processPdfPage(page, pageNum, numPages, scale, encoding, callback);
39
+ results[i] = result;
40
+ });
41
+ });
42
+ await Promise.all(pageTasks);
43
+ return results;
44
+ };
48
45
  /**
49
46
  * Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL). Each page is passed to the callback function as either a string (text content) or a Buffer (image content), and the callback results are returned as an array in page order.
50
47
  *
@@ -54,6 +51,9 @@ const parsePdfFileBuffer = async (options, callback) => import('pdfjs-dist/legac
54
51
  * @param {Buffer|Uint8Array|string|URL} input - The PDF source, which can be a file path, URL, Buffer, or Uint8Array.
55
52
  * @param {Object} [options] - Optional parsing options for customizing the PDF parsing process.
56
53
  * @param {string} [options.password] - The password for encrypted PDF files, if required.
54
+ * @param {number} [options.scale=2.0] - Scale factor for rendering pages (affects image resolution).
55
+ * @param {number} [options.concurrency=1] - Number of pages to process in parallel.
56
+ * @param {'png' | 'jpeg' | 'webp' | 'avif'} [options.imageEncoding='png'] - Image format for rendered PDF pages.
57
57
  * @param {function} callback - callback function to add another layer of processing, default callback returns content of page without any added processing.
58
58
  *
59
59
  * @since — v1.0.0
@@ -66,20 +66,25 @@ const parsePdf = async (input, options, callback) => {
66
66
  if (typeof callback !== 'function') {
67
67
  throw new Error(`Invalid callback type: ${typeof callback}`);
68
68
  }
69
+ const scale = options.scale ?? 2.0;
70
+ const concurrency = options.concurrency ?? 1;
71
+ const encoding = options.imageEncoding ?? 'png';
72
+ if (!['avif', 'jpeg', 'png', 'webp'].includes(encoding)) {
73
+ throw new Error(`Unsupported image encoding format: '${encoding}'`);
74
+ }
75
+ const baseOptions = { ...options };
69
76
  if (typeof input === 'string') {
70
- const fileBuffer = await (0, promises_1.readFile)(input, {});
71
- const data = new Uint8Array(fileBuffer);
72
- return parsePdfFileBuffer({ data, ...options }, callback);
77
+ const fileBuffer = await (0, promises_1.readFile)(input);
78
+ return parsePdfFileBuffer({ data: new Uint8Array(fileBuffer), ...baseOptions }, scale, concurrency, encoding, callback);
73
79
  }
74
80
  if (Buffer.isBuffer(input)) {
75
- const data = new Uint8Array(input);
76
- return parsePdfFileBuffer({ data, ...options }, callback);
81
+ return parsePdfFileBuffer({ data: new Uint8Array(input), ...baseOptions }, scale, concurrency, encoding, callback);
77
82
  }
78
83
  if (input instanceof Uint8Array) {
79
- return parsePdfFileBuffer({ data: input, ...options }, callback);
84
+ return parsePdfFileBuffer({ data: input, ...baseOptions }, scale, concurrency, encoding, callback);
80
85
  }
81
86
  if (input instanceof URL) {
82
- return parsePdfFileBuffer({ url: input, ...options }, callback);
87
+ return parsePdfFileBuffer({ url: input, ...baseOptions }, scale, concurrency, encoding, callback);
83
88
  }
84
89
  throw new Error(`Invalid source type: ${typeof input}`);
85
90
  };
@@ -1 +1 @@
1
- {"version":3,"file":"parsePdf.js","sourceRoot":"","sources":["../src/parsePdf.ts"],"names":[],"mappings":";;;;;;AAAA,+CAA4C;AAE5C,4CAA+C;AAC/C,sDAA6B;AAO7B,MAAM,YAAY,GAAG,IAAA,iBAAM,EAAC,CAAC,CAAC,CAAC;AAQ/B,MAAM,kBAAkB,GAAG,KAAK,EAC9B,OAA+B,EAC/B,QAA6B,EAC7B,EAAE,CACF,MAAM,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,EAAE,EAAE;IAChE,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACvC,GAAG,OAAO;QACV,SAAS,EAAE,CAAC;KACb,CAAC,CAAC;IAEH,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IAE9C,MAAM,EAAE,QAAQ,EAAE,GAAG,WAAW,CAAC;IACjC,MAAM,YAAY,GAAQ,KAAK,CAAC,IAAI,CAClC,EAAE,MAAM,EAAE,QAAQ,EAAE,EACpB,GAAG,EAAE,CAAC,IAAoB,CAC3B,CAAC;IACF,mEAAmE;IACnE,MAAM,YAAY,GAAmC,EAAE,CAAC;IAExD,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,QAAQ,EAAE,OAAO,IAAI,CAAC,EAAE,CAAC;QACxD,YAAY,CAAC,IAAI,CACf,YAAY,CAAC,GAAG,EAAE,CAChB,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YAC/C,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;gBAC5C,oBAAoB,EAAE,KAAK;aAC5B,CAAC,CAAC;YACH,MAAM,KAAK,GAAG,WAAW,CAAC,KAAmB,CAAC;YAC9C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACvB,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC;gBAClD,MAAM,MAAM,GAAG,IAAA,qBAAY,EAAC,QAAQ,CAAC,KAAK,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;gBAC7D,MAAM,OAAO,GAAG,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;gBAExC,MAAM,IAAI,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,OAAO,CAAC;gBAEhE,MAAM,WAAW,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBAC/C,0DAA0D;gBAC1D,YAAY,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,MAAM,QAAQ,CACxC,WAAW,EACX,OAAO,EACP,QAAQ,CACT,CAAC;gBACF,OAAO,IAAI,CAAC;YACd,CAAC;iBAAM,CAAC;gBACN,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBAC/D,0DAA0D;gBAC1D,YAAY,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,MAAM,QAAQ,CACxC,QAAQ,EACR,OAAO,EACP,QAAQ,CACT,CAAC;gBACF,OAAO,IAAI,CAAC;YACd,CAAC;QACH,CAAC,CAAC,CACH,CACF,CAAC;IACJ,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IAChC,OAAO,YAAY,CAAC;AACtB,CAAC,CAAC,CAAC;AAaL;;;;;;;;;;;;;;;;GAgBG;AAEI,MAAM,QAAQ,GAAG,KAAK,EAC3B,
KAAyC,EACzC,OAAqB,EACrB,QAA6B,EAC7B,EAAE;IACF,IAAI,OAAO,QAAQ,KAAK,UAAU,EAAE,CAAC;QACnC,MAAM,IAAI,KAAK,CAAC,0BAA0B,OAAO,QAAQ,EAAE,CAAC,CAAC;IAC/D,CAAC;IACD,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,MAAM,IAAA,mBAAQ,EAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QAC7C,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,UAAU,CAAC,CAAC;QACxC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,EAAE,QAAQ,CAAC,CAAC;IAC5D,CAAC;IACD,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC;QACnC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,EAAE,QAAQ,CAAC,CAAC;IAC5D,CAAC;IACD,IAAI,KAAK,YAAY,UAAU,EAAE,CAAC;QAChC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,OAAO,EAAE,EAAE,QAAQ,CAAC,CAAC;IACnE,CAAC;IACD,IAAI,KAAK,YAAY,GAAG,EAAE,CAAC;QACzB,OAAO,kBAAkB,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,OAAO,EAAE,EAAE,QAAQ,CAAC,CAAC;IAClE,CAAC;IACD,MAAM,IAAI,KAAK,CAAC,wBAAwB,OAAO,KAAK,EAAE,CAAC,CAAC;AAC1D,CAAC,CAAC;AAxBW,QAAA,QAAQ,YAwBnB"}
1
+ {"version":3,"file":"parsePdf.js","sourceRoot":"","sources":["../src/parsePdf.ts"],"names":[],"mappings":";;;;;;AAAA,+CAA4C;AAE5C,4CAA+C;AAC/C,sDAA6B;AAC7B,6DAA8D;AAqC9D,MAAM,cAAc,GAAG,KAAK,EAC1B,IAAkB,EAClB,UAAkB,EAClB,SAAiB,EACjB,KAAa,EACb,QAAuB,EACvB,QAA0B,EACd,EAAE;IACd,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;QAC5C,oBAAoB,EAAE,KAAK;KAC5B,CAAC,CAAC;IACH,MAAM,KAAK,GAAG,WAAW,CAAC,KAAmB,CAAC;IAE9C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QAC7C,MAAM,MAAM,GAAG,IAAA,qBAAY,EAAC,QAAQ,CAAC,KAAK,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;QAC7D,MAAM,OAAO,GAAG,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;QACxC,MAAM,IAAI,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,OAAO,CAAC;QAChE,kDAAkD;QAClD,MAAM,WAAW,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QAClD,OAAO,QAAQ,CAAC,WAAW,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;IACtD,CAAC;IAED,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC/D,OAAO,QAAQ,CAAC,QAAQ,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;AACnD,CAAC,CAAC;AAEF,MAAM,kBAAkB,GAAG,KAAK,EAC9B,OAA+B,EAC/B,KAAa,EACb,WAAmB,EACnB,QAAuB,EACvB,QAA0B,EACZ,EAAE;IAChB,MAAM,KAAK,GAAG,IAAA,iBAAM,EAAC,WAAW,CAAC,CAAC;IAClC,MAAM,WAAW,GAAG,IAAA,qBAAW,EAAC,EAAE,GAAG,OAAO,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC;IAC9D,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IAC9C,MAAM,EAAE,QAAQ,EAAE,GAAG,WAAW,CAAC;IACjC,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,QAAQ,CAAC,CAAC;IAEzC,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAC1D,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC;QACtB,OAAO,KAAK,CAAC,KAAK,IAAI,EAAE;YACtB,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,MAAM,GAAG,MAAM,cAAc,CACjC,IAAI,EACJ,OAAO,EACP,QAAQ,EACR,KAAK,EACL,QAAQ,EACR,QAAQ,CACT,CAAC;YACF,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC;QACtB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,MAAM,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;IAC7B,OAAO,OAAO,CAAC;AACjB,CAAC,CAAC;AAEF;;;;;;;;;;;;;;;;;;;GAmBG;AAEI,MAAM,QAAQ,GAAG,KAAK
,EAC3B,KAAyC,EACzC,OAAqB,EACrB,QAA0B,EACZ,EAAE;IAChB,IAAI,OAAO,QAAQ,KAAK,UAAU,EAAE,CAAC;QACnC,MAAM,IAAI,KAAK,CAAC,0BAA0B,OAAO,QAAQ,EAAE,CAAC,CAAC;IAC/D,CAAC;IAED,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,GAAG,CAAC;IACnC,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;IAC7C,MAAM,QAAQ,GAAG,OAAO,CAAC,aAAa,IAAI,KAAK,CAAC;IAEhD,IAAI,CAAC,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;QACxD,MAAM,IAAI,KAAK,CAAC,uCAAuC,QAAQ,GAAG,CAAC,CAAC;IACtE,CAAC;IAED,MAAM,WAAW,GAAG,EAAE,GAAG,OAAO,EAAE,CAAC;IAEnC,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,MAAM,IAAA,mBAAQ,EAAC,KAAK,CAAC,CAAC;QACzC,OAAO,kBAAkB,CACvB,EAAE,IAAI,EAAE,IAAI,UAAU,CAAC,UAAU,CAAC,EAAE,GAAG,WAAW,EAAE,EACpD,KAAK,EACL,WAAW,EACX,QAAQ,EACR,QAAQ,CACT,CAAC;IACJ,CAAC;IAED,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,kBAAkB,CACvB,EAAE,IAAI,EAAE,IAAI,UAAU,CAAC,KAAK,CAAC,EAAE,GAAG,WAAW,EAAE,EAC/C,KAAK,EACL,WAAW,EACX,QAAQ,EACR,QAAQ,CACT,CAAC;IACJ,CAAC;IAED,IAAI,KAAK,YAAY,UAAU,EAAE,CAAC;QAChC,OAAO,kBAAkB,CACvB,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,WAAW,EAAE,EAC/B,KAAK,EACL,WAAW,EACX,QAAQ,EACR,QAAQ,CACT,CAAC;IACJ,CAAC;IAED,IAAI,KAAK,YAAY,GAAG,EAAE,CAAC;QACzB,OAAO,kBAAkB,CACvB,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,WAAW,EAAE,EAC9B,KAAK,EACL,WAAW,EACX,QAAQ,EACR,QAAQ,CACT,CAAC;IACJ,CAAC;IAED,MAAM,IAAI,KAAK,CAAC,wBAAwB,OAAO,KAAK,EAAE,CAAC,CAAC;AAC1D,CAAC,CAAC;AA7DW,QAAA,QAAQ,YA6DnB"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "afpp",
3
- "version": "1.8.0-beta.2",
3
+ "version": "1.8.0-beta.3",
4
4
  "description": "another f*cking pdf parser",
5
5
  "types": "./dist/index.d.ts",
6
6
  "main": "./dist/index.js",