afpp 1.8.0-beta.3 → 2.0.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/core.d.ts +38 -0
- package/dist/core.js +135 -0
- package/dist/core.js.map +1 -0
- package/dist/parsePdf.d.ts +2 -22
- package/dist/parsePdf.js +2 -67
- package/dist/parsePdf.js.map +1 -1
- package/dist/pdf2image.d.ts +2 -5
- package/dist/pdf2image.js +2 -49
- package/dist/pdf2image.js.map +1 -1
- package/dist/pdf2string.d.ts +2 -5
- package/dist/pdf2string.js +2 -49
- package/dist/pdf2string.js.map +1 -1
- package/package.json +2 -3
package/README.md
CHANGED
package/dist/core.d.ts
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
 * Selects how `parsePdfFile` turns each page into a result entry.
 */
export declare enum PROCESSING_TYPE {
    /** Every page is rendered and returned as an encoded image buffer. */
    IMAGE = "IMAGE",
    /**
     * Pages with text items are handed to the callback as a string; pages without
     * text items are rendered and handed to the callback as an image buffer.
     */
    MIXED = "MIXED",
    /** Every page is returned as its extracted text ('' when the page has no text items). */
    TEXT = "TEXT"
}
|
|
6
|
+
import { Canvas, CanvasRenderingContext2D } from '@napi-rs/canvas';
|
|
7
|
+
/**
 * Options accepted by `parsePdfFile`. Every field is optional.
 */
export interface AfppParseOptions {
    /**
     * Concurrency level for page processing. Defaults to 1.
     */
    concurrency?: number;
    /**
     * Image encoding format when rendering non-text pages. Defaults to 'png'.
     */
    imageEncoding?: ImageEncoding;
    /**
     * Password for encrypted pdf files.
     */
    password?: string;
    /**
     * Scale of a page if content is not text. Defaults to 2.0.
     */
    scale?: number;
}
|
|
25
|
+
/**
 * A canvas paired with its 2D rendering context, as returned by
 * `PdfCanvasFactory.create`.
 */
export interface CanvasAndContext {
    /** Canvas the page is drawn onto and later encoded from. */
    canvas: Canvas;
    /** 2D context passed to the page renderer as the drawing target. */
    context: CanvasRenderingContext2D;
}
|
|
29
|
+
/** Image formats accepted for the `imageEncoding` option. */
export type ImageEncoding = 'avif' | 'jpeg' | 'png' | 'webp';
/**
 * Per-page callback used in MIXED mode.
 *
 * @param content - Page text (string) or the encoded page image (Buffer).
 * @param pageNumber - 1-based number of the page being processed.
 * @param pageCount - Total number of pages in the document.
 * @returns The value stored for this page; may be returned directly or as a promise.
 */
export type PageProcessor<T> = (content: Buffer | string, pageNumber: number, pageCount: number) => Promise<T> | T;
|
|
31
|
+
/**
 * Factory for the canvases that pages are rendered onto.
 */
export interface PdfCanvasFactory {
    /** Creates a canvas of the given size together with its 2D context. */
    create(width: number, height: number): CanvasAndContext;
    /** Disposes of a canvas previously returned by `create` (NOTE(review): exact cleanup is implementation-defined — confirm). */
    destroy(canvasAndContext: CanvasAndContext): void;
    /** Presumably re-initialises an existing canvas to a new size — TODO confirm against the implementation. */
    reset(canvasAndContext: CanvasAndContext, width: number, height: number): void;
}
|
|
36
|
+
/**
 * IMAGE mode: resolves to one encoded image buffer per page, in page order.
 */
export declare function parsePdfFile(type: PROCESSING_TYPE.IMAGE, input: Buffer | string | Uint8Array | URL, options?: AfppParseOptions, callback?: undefined): Promise<Buffer[]>;
/**
 * TEXT mode: resolves to one text string per page, in page order.
 */
export declare function parsePdfFile(type: PROCESSING_TYPE.TEXT, input: Buffer | string | Uint8Array | URL, options?: AfppParseOptions, callback?: undefined): Promise<string[]>;
/**
 * MIXED mode: resolves to the callback's result for each page, in page order.
 * `options` and `callback` are both required in this mode.
 */
export declare function parsePdfFile<T>(type: PROCESSING_TYPE.MIXED, input: Buffer | string | Uint8Array | URL, options: AfppParseOptions, callback: PageProcessor<T>): Promise<T[]>;
|
package/dist/core.js
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"use strict";
// Default-import interop helper: modules flagged `__esModule` pass through unchanged,
// anything else is wrapped as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder export; the real enum object is assigned by the IIFE below.
exports.PROCESSING_TYPE = void 0;
// `parsePdfFile` is a hoisted function declaration, so it can be exported before its definition.
exports.parsePdfFile = parsePdfFile;
// String enum of processing modes; each member maps its name to the identical string value.
var PROCESSING_TYPE;
(function (PROCESSING_TYPE) {
    PROCESSING_TYPE["IMAGE"] = "IMAGE";
    PROCESSING_TYPE["MIXED"] = "MIXED";
    PROCESSING_TYPE["TEXT"] = "TEXT";
})(PROCESSING_TYPE || (exports.PROCESSING_TYPE = PROCESSING_TYPE = {}));
|
|
14
|
+
const promises_1 = require("node:fs/promises");
|
|
15
|
+
const p_limit_1 = __importDefault(require("p-limit"));
|
|
16
|
+
const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
17
|
+
const processPdfPageTypeMixed = async (page, canvasFactory, pageNumber, pageCount, scale, encoding, callback) => {
|
|
18
|
+
const textContent = await page.getTextContent({
|
|
19
|
+
includeMarkedContent: false,
|
|
20
|
+
});
|
|
21
|
+
const items = textContent.items;
|
|
22
|
+
if (items.length === 0) {
|
|
23
|
+
const viewport = page.getViewport({ scale });
|
|
24
|
+
const canvasAndContext = canvasFactory.create(viewport.width, viewport.height);
|
|
25
|
+
await page.render({ canvasContext: canvasAndContext.context, viewport })
|
|
26
|
+
.promise;
|
|
27
|
+
//@ts-expect-error this should be fixed in release
|
|
28
|
+
const imageBuffer = await canvasAndContext.canvas.encode(encoding);
|
|
29
|
+
canvasFactory.destroy(canvasAndContext);
|
|
30
|
+
return callback(imageBuffer, pageNumber, pageCount);
|
|
31
|
+
}
|
|
32
|
+
const pageText = items.map((item) => item.str || '').join(' ');
|
|
33
|
+
return callback(pageText, pageNumber, pageCount);
|
|
34
|
+
};
|
|
35
|
+
const processPdfPageTypeText = async (page) => {
|
|
36
|
+
const textContent = await page.getTextContent({
|
|
37
|
+
includeMarkedContent: false,
|
|
38
|
+
});
|
|
39
|
+
const items = textContent.items;
|
|
40
|
+
if (items.length === 0) {
|
|
41
|
+
return '';
|
|
42
|
+
}
|
|
43
|
+
else {
|
|
44
|
+
return items.map((item) => item.str || '').join(' ');
|
|
45
|
+
}
|
|
46
|
+
};
|
|
47
|
+
const processPdfPageTypeImage = async (page, canvasFactory, pageNumber, pageCount, scale, encoding) => {
|
|
48
|
+
const viewport = page.getViewport({ scale });
|
|
49
|
+
const canvasAndContext = canvasFactory.create(viewport.width, viewport.height);
|
|
50
|
+
await page.render({ canvasContext: canvasAndContext.context, viewport })
|
|
51
|
+
.promise;
|
|
52
|
+
//@ts-expect-error this should be fixed in release
|
|
53
|
+
const imageBuffer = await canvasAndContext.canvas.encode(encoding);
|
|
54
|
+
canvasFactory.destroy(canvasAndContext);
|
|
55
|
+
return imageBuffer;
|
|
56
|
+
};
|
|
57
|
+
/**
 * Normalises the caller's input and options into what the parser needs.
 *
 * The input becomes document init parameters: a string is read from disk as a
 * file path, a Buffer is copied into a new Uint8Array, a Uint8Array is used
 * as-is, and a URL is passed through as `url`. Option defaults are then
 * applied: scale 2.0, concurrency 1, encoding 'png'.
 *
 * @param input - File path string, Buffer, Uint8Array or URL.
 * @param options - Optional parse options (`password`, `scale`, `concurrency`, `imageEncoding`).
 * @returns `{ concurrency, documentInitParameters, encoding, scale }`.
 * @throws {Error} When the input is none of the supported types, or the encoding is not supported.
 */
const validateParameters = async (input, options) => {
    const documentInitParameters = {};
    if (typeof input === 'string') {
        const fileContents = await (0, promises_1.readFile)(input);
        documentInitParameters.data = new Uint8Array(fileContents);
    }
    else if (Buffer.isBuffer(input)) {
        // Checked before the generic Uint8Array case: a Buffer is copied rather than reused.
        documentInitParameters.data = new Uint8Array(input);
    }
    else if (input instanceof Uint8Array) {
        documentInitParameters.data = input;
    }
    else if (input instanceof URL) {
        documentInitParameters.url = input;
    }
    else {
        throw new Error(`Invalid source type: ${typeof input}`);
    }
    documentInitParameters.password = options?.password;
    documentInitParameters.verbosity = pdf_mjs_1.VerbosityLevel.ERRORS;
    const encoding = options?.imageEncoding ?? 'png';
    const supportedEncodings = ['avif', 'jpeg', 'png', 'webp'];
    const isSupported = supportedEncodings.includes(encoding);
    if (!isSupported) {
        throw new Error(`Unsupported image encoding format: '${encoding}'`);
    }
    return {
        concurrency: options?.concurrency ?? 1,
        documentInitParameters,
        encoding,
        scale: options?.scale ?? 2.0,
    };
};
|
|
85
|
+
/**
 * Parses a PDF and produces one result per page, in page order.
 *
 * - `PROCESSING_TYPE.TEXT`: each entry is the page's extracted text.
 * - `PROCESSING_TYPE.IMAGE`: each entry is the page rendered and encoded as an image.
 * - `PROCESSING_TYPE.MIXED`: each entry is what `callback` returned for the page
 *   (the callback receives text for pages with text items, an image otherwise).
 *
 * Pages are processed through a concurrency limiter sized by `options.concurrency`.
 *
 * @param type - One of the `PROCESSING_TYPE` values.
 * @param input - File path string, Buffer, Uint8Array or URL.
 * @param options - Optional parse options (`password`, `scale`, `concurrency`, `imageEncoding`).
 * @param callback - Per-page callback; required (and only used) in MIXED mode.
 * @returns Array with one entry per page.
 * @throws {Error} `Invalid PROCESSING_TYPE` for an unknown type, `Invalid callback type: …`
 *   when MIXED mode is used without a function callback, plus any input/option validation
 *   error. These argument errors are now raised before the file is read or the document
 *   is loaded.
 */
async function parsePdfFile(type, input, options, callback) {
    // Fail fast on unusable arguments instead of reading and parsing the PDF first.
    const isKnownType = type === PROCESSING_TYPE.MIXED ||
        type === PROCESSING_TYPE.TEXT ||
        type === PROCESSING_TYPE.IMAGE;
    if (!isKnownType) {
        throw new Error('Invalid PROCESSING_TYPE');
    }
    if (type === PROCESSING_TYPE.MIXED && typeof callback !== 'function') {
        throw new Error(`Invalid callback type: ${typeof callback}`);
    }
    const { concurrency, documentInitParameters, encoding, scale } = await validateParameters(input, options);
    const limit = (0, p_limit_1.default)(concurrency);
    const loadingTask = (0, pdf_mjs_1.getDocument)(documentInitParameters);
    try {
        const pdfDocument = await loadingTask.promise;
        const { numPages } = pdfDocument;
        // Single per-page dispatcher shared by all three modes.
        const processPage = (page, pageNum) => {
            switch (type) {
                case PROCESSING_TYPE.MIXED:
                    return processPdfPageTypeMixed(page, pdfDocument.canvasFactory, pageNum, numPages, scale, encoding, callback);
                case PROCESSING_TYPE.TEXT:
                    return processPdfPageTypeText(page);
                default:
                    return processPdfPageTypeImage(page, pdfDocument.canvasFactory, pageNum, numPages, scale, encoding);
            }
        };
        // Results are written by index so the output stays in page order
        // regardless of the order in which limited tasks finish.
        const results = new Array(numPages);
        const pageTasks = Array.from({ length: numPages }, (_, i) => {
            const pageNum = i + 1;
            return limit(async () => {
                const page = await pdfDocument.getPage(pageNum);
                results[i] = await processPage(page, pageNum);
            });
        });
        await Promise.all(pageTasks);
        return results;
    }
    finally {
        // Release the loading task and its document whether parsing succeeded or failed;
        // previously they were never destroyed.
        await loadingTask.destroy();
    }
}
|
|
135
|
+
//# sourceMappingURL=core.js.map
|
package/dist/core.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"core.js","sourceRoot":"","sources":["../src/core.ts"],"names":[],"mappings":";;;;;;AA6LA,oCAgFC;AA7QD,IAAY,eAIX;AAJD,WAAY,eAAe;IACzB,kCAAe,CAAA;IACf,kCAAe,CAAA;IACf,gCAAa,CAAA;AACf,CAAC,EAJW,eAAe,+BAAf,eAAe,QAI1B;AAED,+CAA4C;AAG5C,sDAA6B;AAC7B,6DAA8E;AAmD9E,MAAM,uBAAuB,GAAG,KAAK,EACnC,IAAkB,EAClB,aAA+B,EAC/B,UAAkB,EAClB,SAAiB,EACjB,KAAa,EACb,QAAuB,EACvB,QAA0B,EACd,EAAE;IACd,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;QAC5C,oBAAoB,EAAE,KAAK;KAC5B,CAAC,CAAC;IACH,MAAM,KAAK,GAAG,WAAW,CAAC,KAAmB,CAAC;IAE9C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;QAE7C,MAAM,gBAAgB,GAAG,aAAa,CAAC,MAAM,CAC3C,QAAQ,CAAC,KAAK,EACd,QAAQ,CAAC,MAAM,CAChB,CAAC;QAEF,MAAM,IAAI,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,gBAAgB,CAAC,OAAO,EAAE,QAAQ,EAAE,CAAC;aACrE,OAAO,CAAC;QACX,kDAAkD;QAClD,MAAM,WAAW,GAAG,MAAM,gBAAgB,CAAC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QACnE,aAAa,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC;QACxC,OAAO,QAAQ,CAAC,WAAW,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;IACtD,CAAC;IAED,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC/D,OAAO,QAAQ,CAAC,QAAQ,EAAE,UAAU,EAAE,SAAS,CAAC,CAAC;AACnD,CAAC,CAAC;AAEF,MAAM,sBAAsB,GAAG,KAAK,EAAE,IAAkB,EAAE,EAAE;IAC1D,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;QAC5C,oBAAoB,EAAE,KAAK;KAC5B,CAAC,CAAC;IACH,MAAM,KAAK,GAAG,WAAW,CAAC,KAAmB,CAAC;IAE9C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,EAAE,CAAC;IACZ,CAAC;SAAM,CAAC;QACN,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACvD,CAAC;AACH,CAAC,CAAC;AAEF,MAAM,uBAAuB,GAAG,KAAK,EACnC,IAAkB,EAClB,aAA+B,EAC/B,UAAkB,EAClB,SAAiB,EACjB,KAAa,EACb,QAAuB,EACvB,EAAE;IACF,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC;IAE7C,MAAM,gBAAgB,GAAG,aAAa,CAAC,MAAM,CAC3C,QAAQ,CAAC,KAAK,EACd,QAAQ,CAAC,MAAM,CAChB,CAAC;IAEF,MAAM,IAAI,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,gBAAgB,CAAC,OAAO,EAAE,QAAQ,EAAE,CAAC;SACrE,OAAO,CAAC;IACX,kDAAkD;IAClD,MAAM,WAAW
,GAAG,MAAM,gBAAgB,CAAC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;IACnE,aAAa,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC;IACxC,OAAO,WAAW,CAAC;AACrB,CAAC,CAAC;AAEF,MAAM,kBAAkB,GAAG,KAAK,EAC9B,KAAyC,EACzC,OAA0B,EAC1B,EAAE;IACF,MAAM,sBAAsB,GAA2B,EAAE,CAAC;IAE1D,QAAQ,IAAI,EAAE,CAAC;QACb,KAAK,OAAO,KAAK,KAAK,QAAQ;YAC5B,sBAAsB,CAAC,IAAI,GAAG,IAAI,UAAU,CAAC,MAAM,IAAA,mBAAQ,EAAC,KAAK,CAAC,CAAC,CAAC;YACpE,MAAM;QACR,KAAK,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;YACzB,sBAAsB,CAAC,IAAI,GAAG,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC;YACpD,MAAM;QACR,KAAK,KAAK,YAAY,UAAU;YAC9B,sBAAsB,CAAC,IAAI,GAAG,KAAK,CAAC;YACpC,MAAM;QACR,KAAK,KAAK,YAAY,GAAG;YACvB,sBAAsB,CAAC,GAAG,GAAG,KAAK,CAAC;YACnC,MAAM;QACR;YACE,MAAM,IAAI,KAAK,CAAC,wBAAwB,OAAO,KAAK,EAAE,CAAC,CAAC;IAC5D,CAAC;IAED,sBAAsB,CAAC,QAAQ,GAAG,OAAO,EAAE,QAAQ,CAAC;IACpD,sBAAsB,CAAC,SAAS,GAAG,wBAAc,CAAC,MAAM,CAAC;IAEzD,MAAM,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,GAAG,CAAC;IACpC,MAAM,WAAW,GAAG,OAAO,EAAE,WAAW,IAAI,CAAC,CAAC;IAC9C,MAAM,QAAQ,GAAG,OAAO,EAAE,aAAa,IAAI,KAAK,CAAC;IAEjD,IAAI,CAAC,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;QACxD,MAAM,IAAI,KAAK,CAAC,uCAAuC,QAAQ,GAAG,CAAC,CAAC;IACtE,CAAC;IAED,OAAO,EAAE,WAAW,EAAE,sBAAsB,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC;AAClE,CAAC,CAAC;AAuBK,KAAK,UAAU,YAAY,CAChC,IAAqB,EACrB,KAAyC,EACzC,OAA0B,EAC1B,QAA2B;IAE3B,MAAM,EAAE,WAAW,EAAE,sBAAsB,EAAE,QAAQ,EAAE,KAAK,EAAE,GAC5D,MAAM,kBAAkB,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAE3C,MAAM,KAAK,GAAG,IAAA,iBAAM,EAAC,WAAW,CAAC,CAAC;IAClC,MAAM,WAAW,GAAG,IAAA,qBAAW,EAAC,sBAAsB,CAAC,CAAC;IACxD,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IAC9C,MAAM,EAAE,QAAQ,EAAE,GAAG,WAAW,CAAC;IAEjC,IAAI,IAAI,KAAK,eAAe,CAAC,KAAK,EAAE,CAAC;QACnC,IAAI,CAAC,QAAQ,IAAI,OAAO,QAAQ,KAAK,UAAU,EAAE,CAAC;YAChD,MAAM,IAAI,KAAK,CAAC,0BAA0B,OAAO,QAAQ,EAAE,CAAC,CAAC;QAC/D,CAAC;QACD,MAAM,OAAO,GAAQ,IAAI,KAAK,CAAC,QAAQ,CAAC,CAAC;QAEzC,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAC1D,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC;YACtB,OAAO,KAAK,CAAC,KAAK,IAAI,EAAE;gBACtB,MAAM,IAAI,GAAG,MAAM,WAAW,
CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;gBAChD,MAAM,aAAa,GAAG,WAAW,CAAC,aAAiC,CAAC;gBAEpE,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAC1C,IAAI,EACJ,aAAa,EACb,OAAO,EACP,QAAQ,EACR,KAAK,EACL,QAAQ,EACR,QAAQ,CACT,CAAC;gBACF,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC;YACtB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,MAAM,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QAC7B,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,IAAI,IAAI,KAAK,eAAe,CAAC,IAAI,EAAE,CAAC;QAClC,MAAM,OAAO,GAAa,IAAI,KAAK,CAAC,QAAQ,CAAC,CAAC;QAC9C,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAC1D,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC;YACtB,OAAO,KAAK,CAAC,KAAK,IAAI,EAAE;gBACtB,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;gBAChD,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,sBAAsB,CAAC,IAAI,CAAC,CAAC;YAClD,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,MAAM,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QAC7B,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,IAAI,IAAI,KAAK,eAAe,CAAC,KAAK,EAAE,CAAC;QACnC,MAAM,OAAO,GAAa,IAAI,KAAK,CAAC,QAAQ,CAAC,CAAC;QAC9C,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,QAAQ,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAC1D,MAAM,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC;YACtB,OAAO,KAAK,CAAC,KAAK,IAAI,EAAE;gBACtB,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;gBAChD,MAAM,aAAa,GAAG,WAAW,CAAC,aAAiC,CAAC;gBACpE,OAAO,CAAC,CAAC,CAAC,GAAG,MAAM,uBAAuB,CACxC,IAAI,EACJ,aAAa,EACb,OAAO,EACP,QAAQ,EACR,KAAK,EACL,QAAQ,CACT,CAAC;YACJ,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,MAAM,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QAC7B,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;AAC7C,CAAC"}
|
package/dist/parsePdf.d.ts
CHANGED
|
@@ -1,23 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
type ImageEncoding = 'avif' | 'jpeg' | 'png' | 'webp';
|
|
3
|
-
interface ParseOptions {
|
|
4
|
-
/**
|
|
5
|
-
* Concurrency level for page processing.
|
|
6
|
-
*/
|
|
7
|
-
concurrency?: number;
|
|
8
|
-
/**
|
|
9
|
-
* Image encoding format when rendering non-text pages. Defaults to 'png'.
|
|
10
|
-
*/
|
|
11
|
-
imageEncoding?: ImageEncoding;
|
|
12
|
-
/**
|
|
13
|
-
* Password for encrypted pdf files.
|
|
14
|
-
*/
|
|
15
|
-
password?: string;
|
|
16
|
-
/**
|
|
17
|
-
* Scale of a page if content is not text.
|
|
18
|
-
*/
|
|
19
|
-
scale: number;
|
|
20
|
-
}
|
|
1
|
+
import { AfppParseOptions, PageProcessor } from './core';
|
|
21
2
|
/**
|
|
22
3
|
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL). Pages are returned in mixed array of strings (text content) and buffers (image content) with in callback function.
|
|
23
4
|
*
|
|
@@ -38,5 +19,4 @@ interface ParseOptions {
|
|
|
38
19
|
*
|
|
39
20
|
* @throws {Error} Throws an error if the input type is invalid.
|
|
40
21
|
*/
|
|
41
|
-
export declare const parsePdf: <T>(input: Buffer | string | Uint8Array | URL, options:
|
|
42
|
-
export {};
|
|
22
|
+
export declare const parsePdf: <T>(input: Buffer | string | Uint8Array | URL, options: AfppParseOptions, callback: PageProcessor<T>) => Promise<T[]>;
|
package/dist/parsePdf.js
CHANGED
|
@@ -1,47 +1,7 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
-
};
|
|
5
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
3
|
exports.parsePdf = void 0;
|
|
7
|
-
const
|
|
8
|
-
const canvas_1 = require("@napi-rs/canvas");
|
|
9
|
-
const p_limit_1 = __importDefault(require("p-limit"));
|
|
10
|
-
const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
11
|
-
const processPdfPage = async (page, pageNumber, pageCount, scale, encoding, callback) => {
|
|
12
|
-
const textContent = await page.getTextContent({
|
|
13
|
-
includeMarkedContent: false,
|
|
14
|
-
});
|
|
15
|
-
const items = textContent.items;
|
|
16
|
-
if (items.length === 0) {
|
|
17
|
-
const viewport = page.getViewport({ scale });
|
|
18
|
-
const canvas = (0, canvas_1.createCanvas)(viewport.width, viewport.height);
|
|
19
|
-
const context = canvas.getContext('2d');
|
|
20
|
-
await page.render({ canvasContext: context, viewport }).promise;
|
|
21
|
-
//@ts-expect-error this should be fixed in release
|
|
22
|
-
const imageBuffer = await canvas.encode(encoding);
|
|
23
|
-
return callback(imageBuffer, pageNumber, pageCount);
|
|
24
|
-
}
|
|
25
|
-
const pageText = items.map((item) => item.str || '').join(' ');
|
|
26
|
-
return callback(pageText, pageNumber, pageCount);
|
|
27
|
-
};
|
|
28
|
-
const parsePdfFileBuffer = async (options, scale, concurrency, encoding, callback) => {
|
|
29
|
-
const limit = (0, p_limit_1.default)(concurrency);
|
|
30
|
-
const loadingTask = (0, pdf_mjs_1.getDocument)({ ...options, verbosity: 0 });
|
|
31
|
-
const pdfDocument = await loadingTask.promise;
|
|
32
|
-
const { numPages } = pdfDocument;
|
|
33
|
-
const results = new Array(numPages);
|
|
34
|
-
const pageTasks = Array.from({ length: numPages }, (_, i) => {
|
|
35
|
-
const pageNum = i + 1;
|
|
36
|
-
return limit(async () => {
|
|
37
|
-
const page = await pdfDocument.getPage(pageNum);
|
|
38
|
-
const result = await processPdfPage(page, pageNum, numPages, scale, encoding, callback);
|
|
39
|
-
results[i] = result;
|
|
40
|
-
});
|
|
41
|
-
});
|
|
42
|
-
await Promise.all(pageTasks);
|
|
43
|
-
return results;
|
|
44
|
-
};
|
|
4
|
+
const core_1 = require("./core");
|
|
45
5
|
/**
|
|
46
6
|
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL). Pages are returned in mixed array of strings (text content) and buffers (image content) with in callback function.
|
|
47
7
|
*
|
|
@@ -62,31 +22,6 @@ const parsePdfFileBuffer = async (options, scale, concurrency, encoding, callbac
|
|
|
62
22
|
*
|
|
63
23
|
* @throws {Error} Throws an error if the input type is invalid.
|
|
64
24
|
*/
|
|
65
|
-
const parsePdf = async (input, options, callback) =>
|
|
66
|
-
if (typeof callback !== 'function') {
|
|
67
|
-
throw new Error(`Invalid callback type: ${typeof callback}`);
|
|
68
|
-
}
|
|
69
|
-
const scale = options.scale ?? 2.0;
|
|
70
|
-
const concurrency = options.concurrency ?? 1;
|
|
71
|
-
const encoding = options.imageEncoding ?? 'png';
|
|
72
|
-
if (!['avif', 'jpeg', 'png', 'webp'].includes(encoding)) {
|
|
73
|
-
throw new Error(`Unsupported image encoding format: '${encoding}'`);
|
|
74
|
-
}
|
|
75
|
-
const baseOptions = { ...options };
|
|
76
|
-
if (typeof input === 'string') {
|
|
77
|
-
const fileBuffer = await (0, promises_1.readFile)(input);
|
|
78
|
-
return parsePdfFileBuffer({ data: new Uint8Array(fileBuffer), ...baseOptions }, scale, concurrency, encoding, callback);
|
|
79
|
-
}
|
|
80
|
-
if (Buffer.isBuffer(input)) {
|
|
81
|
-
return parsePdfFileBuffer({ data: new Uint8Array(input), ...baseOptions }, scale, concurrency, encoding, callback);
|
|
82
|
-
}
|
|
83
|
-
if (input instanceof Uint8Array) {
|
|
84
|
-
return parsePdfFileBuffer({ data: input, ...baseOptions }, scale, concurrency, encoding, callback);
|
|
85
|
-
}
|
|
86
|
-
if (input instanceof URL) {
|
|
87
|
-
return parsePdfFileBuffer({ url: input, ...baseOptions }, scale, concurrency, encoding, callback);
|
|
88
|
-
}
|
|
89
|
-
throw new Error(`Invalid source type: ${typeof input}`);
|
|
90
|
-
};
|
|
25
|
+
const parsePdf = async (input, options, callback) => (0, core_1.parsePdfFile)(core_1.PROCESSING_TYPE.MIXED, input, options, callback);
|
|
91
26
|
exports.parsePdf = parsePdf;
|
|
92
27
|
//# sourceMappingURL=parsePdf.js.map
|
package/dist/parsePdf.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"parsePdf.js","sourceRoot":"","sources":["../src/parsePdf.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"parsePdf.js","sourceRoot":"","sources":["../src/parsePdf.ts"],"names":[],"mappings":";;;AAAA,yCAKwB;AAExB;;;;;;;;;;;;;;;;;;;GAmBG;AAEI,MAAM,QAAQ,GAAG,KAAK,EAC3B,KAAyC,EACzC,OAAyB,EACzB,QAA0B,EACZ,EAAE,CAChB,IAAA,mBAAY,EAAC,sBAAe,CAAC,KAAK,EAAE,KAAK,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC;AALnD,QAAA,QAAQ,YAK2C"}
|
package/dist/pdf2image.d.ts
CHANGED
|
@@ -1,6 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
password?: string;
|
|
3
|
-
}
|
|
1
|
+
import { AfppParseOptions } from './core';
|
|
4
2
|
/**
|
|
5
3
|
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to an array of image buffers.
|
|
6
4
|
*
|
|
@@ -17,5 +15,4 @@ interface ParseOptions {
|
|
|
17
15
|
*
|
|
18
16
|
* @throws {Error} Throws an error if the input type is invalid.
|
|
19
17
|
*/
|
|
20
|
-
export declare const pdf2image: (input: Buffer | string | Uint8Array | URL, options?:
|
|
21
|
-
export {};
|
|
18
|
+
export declare const pdf2image: (input: Buffer | string | Uint8Array | URL, options?: AfppParseOptions) => Promise<Buffer<ArrayBufferLike>[]>;
|
package/dist/pdf2image.js
CHANGED
|
@@ -1,37 +1,7 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
-
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
-
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
-
};
|
|
5
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
3
|
exports.pdf2image = void 0;
|
|
7
|
-
const
|
|
8
|
-
const canvas_1 = require("@napi-rs/canvas");
|
|
9
|
-
const p_limit_1 = __importDefault(require("p-limit"));
|
|
10
|
-
const promiseLimit = (0, p_limit_1.default)(1);
|
|
11
|
-
const parsePdfFileBuffer = async (options) => import('pdfjs-dist/legacy/build/pdf.mjs').then(async (pdfjsLib) => {
|
|
12
|
-
const loadingTask = pdfjsLib.getDocument({
|
|
13
|
-
...options,
|
|
14
|
-
verbosity: 0, // TODO enable for debug
|
|
15
|
-
});
|
|
16
|
-
const pdfDocument = await loadingTask.promise;
|
|
17
|
-
const { numPages } = pdfDocument;
|
|
18
|
-
const pageContents = new Array(numPages).fill(Buffer.from(''));
|
|
19
|
-
// eslint-disable-next-line @typescript-eslint/no-invalid-void-type
|
|
20
|
-
const pagePromises = [];
|
|
21
|
-
for (let pageNum = 1; pageNum <= numPages; pageNum += 1) {
|
|
22
|
-
pagePromises.push(promiseLimit(() => pdfDocument.getPage(pageNum).then(async (page) => {
|
|
23
|
-
const viewport = page.getViewport({ scale: 2.0 });
|
|
24
|
-
const canvas = (0, canvas_1.createCanvas)(viewport.width, viewport.height);
|
|
25
|
-
const context = canvas.getContext('2d');
|
|
26
|
-
await page.render({ canvasContext: context, viewport }).promise;
|
|
27
|
-
const imageBuffer = await canvas.encode('png');
|
|
28
|
-
pageContents[pageNum - 1] = imageBuffer;
|
|
29
|
-
return;
|
|
30
|
-
})));
|
|
31
|
-
}
|
|
32
|
-
await Promise.all(pagePromises);
|
|
33
|
-
return pageContents;
|
|
34
|
-
});
|
|
4
|
+
const core_1 = require("./core");
|
|
35
5
|
/**
|
|
36
6
|
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to an array of image buffers.
|
|
37
7
|
*
|
|
@@ -48,23 +18,6 @@ const parsePdfFileBuffer = async (options) => import('pdfjs-dist/legacy/build/pd
|
|
|
48
18
|
*
|
|
49
19
|
* @throws {Error} Throws an error if the input type is invalid.
|
|
50
20
|
*/
|
|
51
|
-
const pdf2image = async (input, options) =>
|
|
52
|
-
if (typeof input === 'string') {
|
|
53
|
-
const fileBuffer = await (0, promises_1.readFile)(input, {});
|
|
54
|
-
const data = new Uint8Array(fileBuffer);
|
|
55
|
-
return parsePdfFileBuffer({ data, ...options });
|
|
56
|
-
}
|
|
57
|
-
if (Buffer.isBuffer(input)) {
|
|
58
|
-
const data = new Uint8Array(input);
|
|
59
|
-
return parsePdfFileBuffer({ data, ...options });
|
|
60
|
-
}
|
|
61
|
-
if (input instanceof Uint8Array) {
|
|
62
|
-
return parsePdfFileBuffer({ data: input, ...options });
|
|
63
|
-
}
|
|
64
|
-
if (input instanceof URL) {
|
|
65
|
-
return parsePdfFileBuffer({ url: input, ...options });
|
|
66
|
-
}
|
|
67
|
-
throw new Error(`Invalid source type: ${typeof input}`);
|
|
68
|
-
};
|
|
21
|
+
const pdf2image = async (input, options) => (0, core_1.parsePdfFile)(core_1.PROCESSING_TYPE.IMAGE, input, options);
|
|
69
22
|
exports.pdf2image = pdf2image;
|
|
70
23
|
//# sourceMappingURL=pdf2image.js.map
|
package/dist/pdf2image.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pdf2image.js","sourceRoot":"","sources":["../src/pdf2image.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"pdf2image.js","sourceRoot":"","sources":["../src/pdf2image.ts"],"names":[],"mappings":";;;AAAA,yCAIwB;AAExB;;;;;;;;;;;;;;;GAeG;AACI,MAAM,SAAS,GAAG,KAAK,EAC5B,KAAyC,EACzC,OAA0B,EAC1B,EAAE,CAAC,IAAA,mBAAY,EAAC,sBAAe,CAAC,KAAK,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;AAH5C,QAAA,SAAS,aAGmC"}
|
package/dist/pdf2string.d.ts
CHANGED
|
@@ -1,6 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
password?: string;
|
|
3
|
-
}
|
|
1
|
+
import { AfppParseOptions } from './core';
|
|
4
2
|
/**
|
|
5
3
|
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to a string.
|
|
6
4
|
*
|
|
@@ -17,5 +15,4 @@ interface ParseOptions {
|
|
|
17
15
|
*
|
|
18
16
|
* @throws {Error} Throws an error if the input type is invalid.
|
|
19
17
|
*/
|
|
20
|
-
export declare const pdf2string: (input: Buffer | string | Uint8Array | URL, options?:
|
|
21
|
-
export {};
|
|
18
|
+
export declare const pdf2string: (input: Buffer | string | Uint8Array | URL, options?: AfppParseOptions) => Promise<string[]>;
|
package/dist/pdf2string.js
CHANGED
|
@@ -1,37 +1,7 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.pdf2string = void 0;
|
|
4
|
-
const
|
|
5
|
-
const parsePdfFileBuffer = async (options) => import('pdfjs-dist/legacy/build/pdf.mjs').then(async (pdfjsLib) => {
|
|
6
|
-
const loadingTask = pdfjsLib.getDocument({
|
|
7
|
-
...options,
|
|
8
|
-
verbosity: 0, // TODO enable for debug
|
|
9
|
-
});
|
|
10
|
-
const pdfDocument = await loadingTask.promise;
|
|
11
|
-
const { numPages } = pdfDocument;
|
|
12
|
-
const pageContents = new Array(numPages).fill('');
|
|
13
|
-
// eslint-disable-next-line @typescript-eslint/no-invalid-void-type
|
|
14
|
-
const pagePromises = [];
|
|
15
|
-
for (let pageNum = 1; pageNum <= numPages; pageNum += 1) {
|
|
16
|
-
pagePromises.push(pdfDocument.getPage(pageNum).then(async (page) => {
|
|
17
|
-
const textContent = await page.getTextContent({
|
|
18
|
-
includeMarkedContent: false,
|
|
19
|
-
});
|
|
20
|
-
// ? Type assertion of items to TextItem[] should be safe because {includeMarkedContent: false}
|
|
21
|
-
const items = textContent.items;
|
|
22
|
-
if (items.length === 0) {
|
|
23
|
-
pageContents[pageNum - 1] = '';
|
|
24
|
-
}
|
|
25
|
-
else {
|
|
26
|
-
const pageText = items.map((item) => item.str || '').join(' ');
|
|
27
|
-
pageContents[pageNum - 1] = pageText;
|
|
28
|
-
}
|
|
29
|
-
return;
|
|
30
|
-
}));
|
|
31
|
-
}
|
|
32
|
-
await Promise.all(pagePromises);
|
|
33
|
-
return pageContents;
|
|
34
|
-
});
|
|
4
|
+
const core_1 = require("./core");
|
|
35
5
|
/**
|
|
36
6
|
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to a string.
|
|
37
7
|
*
|
|
@@ -48,23 +18,6 @@ const parsePdfFileBuffer = async (options) => import('pdfjs-dist/legacy/build/pd
|
|
|
48
18
|
*
|
|
49
19
|
* @throws {Error} Throws an error if the input type is invalid.
|
|
50
20
|
*/
|
|
51
|
-
const pdf2string = async (input, options) =>
|
|
52
|
-
if (typeof input === 'string') {
|
|
53
|
-
const fileBuffer = await (0, promises_1.readFile)(input, {});
|
|
54
|
-
const data = new Uint8Array(fileBuffer);
|
|
55
|
-
return parsePdfFileBuffer({ data, ...options });
|
|
56
|
-
}
|
|
57
|
-
if (Buffer.isBuffer(input)) {
|
|
58
|
-
const data = new Uint8Array(input);
|
|
59
|
-
return parsePdfFileBuffer({ data, ...options });
|
|
60
|
-
}
|
|
61
|
-
if (input instanceof Uint8Array) {
|
|
62
|
-
return parsePdfFileBuffer({ data: input, ...options });
|
|
63
|
-
}
|
|
64
|
-
if (input instanceof URL) {
|
|
65
|
-
return parsePdfFileBuffer({ url: input, ...options });
|
|
66
|
-
}
|
|
67
|
-
throw new Error(`Invalid source type: ${typeof input}`);
|
|
68
|
-
};
|
|
21
|
+
const pdf2string = async (input, options) => (0, core_1.parsePdfFile)(core_1.PROCESSING_TYPE.TEXT, input, options);
|
|
69
22
|
exports.pdf2string = pdf2string;
|
|
70
23
|
//# sourceMappingURL=pdf2string.js.map
|
package/dist/pdf2string.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"pdf2string.js","sourceRoot":"","sources":["../src/pdf2string.ts"],"names":[],"mappings":";;;AAAA
|
|
1
|
+
{"version":3,"file":"pdf2string.js","sourceRoot":"","sources":["../src/pdf2string.ts"],"names":[],"mappings":";;;AAAA,yCAIwB;AACxB;;;;;;;;;;;;;;;GAeG;AAEI,MAAM,UAAU,GAAG,KAAK,EAC7B,KAAyC,EACzC,OAA0B,EAC1B,EAAE,CAAC,IAAA,mBAAY,EAAC,sBAAe,CAAC,IAAI,EAAE,KAAK,EAAE,OAAO,CAAC,CAAC;AAH3C,QAAA,UAAU,cAGiC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "afpp",
|
|
3
|
-
"version": "1.8.0-beta.3",
|
|
3
|
+
"version": "2.0.0-beta.2",
|
|
4
4
|
"description": "another f*cking pdf parser",
|
|
5
5
|
"types": "./dist/index.d.ts",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
"url": "git+ssh://git@github.com/l2ysho/afpp.git"
|
|
26
26
|
},
|
|
27
27
|
"engines": {
|
|
28
|
-
"node": "v22.
|
|
28
|
+
"node": "v22.15.0",
|
|
29
29
|
"npm": "10.9.2"
|
|
30
30
|
},
|
|
31
31
|
"keywords": [
|
|
@@ -69,7 +69,6 @@
|
|
|
69
69
|
"typescript-eslint": "8.29.0"
|
|
70
70
|
},
|
|
71
71
|
"dependencies": {
|
|
72
|
-
"@napi-rs/canvas": "0.1.69",
|
|
73
72
|
"p-limit": "6.2.0",
|
|
74
73
|
"pdfjs-dist": "5.1.91"
|
|
75
74
|
}
|