afpp 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -0
- package/dist/index.js +7 -65
- package/dist/index.js.map +1 -1
- package/dist/parsePdf.js +79 -0
- package/dist/parsePdf.js.map +1 -0
- package/dist/pdf2image.js +64 -0
- package/dist/pdf2image.js.map +1 -0
- package/dist/pdf2string.js +68 -0
- package/dist/pdf2string.js.map +1 -0
- package/dist/types/index.d.ts +3 -21
- package/dist/types/parsePdf.d.ts +23 -0
- package/dist/types/pdf2image.d.ts +21 -0
- package/dist/types/pdf2string.d.ts +21 -0
- package/package.json +11 -8
package/README.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# afpp
|
|
2
2
|
|
|
3
|
+

|
|
4
|
+
[](https://codecov.io/github/l2ysho/afpp)
|
|
5
|
+

|
|
6
|
+

|
|
7
|
+

|
|
8
|
+

|
|
9
|
+
|
|
3
10
|
Another f\*cking pdf parser. (alpha)
|
|
4
11
|
|
|
5
12
|
## Why?
|
package/dist/index.js
CHANGED
|
@@ -1,68 +1,10 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.pdf2string = void 0;
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
const pdfDocument = await loadingTask.promise;
|
|
11
|
-
const { numPages } = pdfDocument;
|
|
12
|
-
const pageContents = new Array(numPages).fill('');
|
|
13
|
-
const pagePromises = [];
|
|
14
|
-
for (let pageNum = 1; pageNum <= numPages; pageNum += 1) {
|
|
15
|
-
pagePromises.push(pdfDocument.getPage(pageNum).then(async (page) => {
|
|
16
|
-
const textContent = await page.getTextContent({
|
|
17
|
-
includeMarkedContent: false,
|
|
18
|
-
});
|
|
19
|
-
// ? Type assertion of items to TextItem[] should be safe because {includeMarkedContent: false}
|
|
20
|
-
const items = textContent.items;
|
|
21
|
-
if (items.length === 0) {
|
|
22
|
-
pageContents[pageNum - 1] = '';
|
|
23
|
-
}
|
|
24
|
-
else {
|
|
25
|
-
const pageText = items.map((item) => item.str || '').join(' ');
|
|
26
|
-
pageContents[pageNum - 1] = pageText;
|
|
27
|
-
}
|
|
28
|
-
}));
|
|
29
|
-
}
|
|
30
|
-
await Promise.all(pagePromises);
|
|
31
|
-
return pageContents;
|
|
32
|
-
});
|
|
33
|
-
/**
|
|
34
|
-
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to a string.
|
|
35
|
-
*
|
|
36
|
-
* @async
|
|
37
|
-
* @function pdf2string
|
|
38
|
-
*
|
|
39
|
-
* @param {Buffer|Uint8Array|string|URL} input - The PDF source, which can be a file path, URL, Buffer, or Uint8Array.
|
|
40
|
-
* @param {Object} [options] - Optional parsing options for customizing the PDF parsing process.
|
|
41
|
-
* @param {string} [options.password] - The password for encrypted PDF files, if required.
|
|
42
|
-
*
|
|
43
|
-
* @since — v1.0.0
|
|
44
|
-
*
|
|
45
|
-
* @returns {Promise<string>} - A promise that resolves to the string representation of the PDF content.
|
|
46
|
-
*
|
|
47
|
-
* @throws {Error} Throws an error if the input type is invalid.
|
|
48
|
-
*/
|
|
49
|
-
const pdf2string = async (input, options) => {
|
|
50
|
-
if (typeof input === 'string') {
|
|
51
|
-
const fileBuffer = await (0, promises_1.readFile)(input, {});
|
|
52
|
-
const data = new Uint8Array(fileBuffer);
|
|
53
|
-
return parsePdfFileBuffer({ data, ...options });
|
|
54
|
-
}
|
|
55
|
-
if (Buffer.isBuffer(input)) {
|
|
56
|
-
const data = new Uint8Array(input);
|
|
57
|
-
return parsePdfFileBuffer({ data, ...options });
|
|
58
|
-
}
|
|
59
|
-
if (input instanceof Uint8Array) {
|
|
60
|
-
return parsePdfFileBuffer({ data: input, ...options });
|
|
61
|
-
}
|
|
62
|
-
if (input instanceof URL) {
|
|
63
|
-
return parsePdfFileBuffer({ url: input, ...options });
|
|
64
|
-
}
|
|
65
|
-
throw new Error(`Invalid source type: ${typeof input}`);
|
|
66
|
-
};
|
|
67
|
-
exports.pdf2string = pdf2string;
|
|
3
|
+
exports.pdf2string = exports.pdf2image = exports.parsePdf = void 0;
|
|
4
|
+
var parsePdf_1 = require("#afpp/src/parsePdf");
|
|
5
|
+
Object.defineProperty(exports, "parsePdf", { enumerable: true, get: function () { return parsePdf_1.parsePdf; } });
|
|
6
|
+
var pdf2image_1 = require("#afpp/src/pdf2image");
|
|
7
|
+
Object.defineProperty(exports, "pdf2image", { enumerable: true, get: function () { return pdf2image_1.pdf2image; } });
|
|
8
|
+
var pdf2string_1 = require("#afpp/src/pdf2string");
|
|
9
|
+
Object.defineProperty(exports, "pdf2string", { enumerable: true, get: function () { return pdf2string_1.pdf2string; } });
|
|
68
10
|
//# sourceMappingURL=index.js.map
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,+
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,+CAA8C;AAArC,oGAAA,QAAQ,OAAA;AACjB,iDAAgD;AAAvC,sGAAA,SAAS,OAAA;AAClB,mDAAkD;AAAzC,wGAAA,UAAU,OAAA"}
|
package/dist/parsePdf.js
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.parsePdf = void 0;
|
|
4
|
+
/* eslint-disable no-underscore-dangle */
|
|
5
|
+
const promises_1 = require("node:fs/promises");
|
|
6
|
+
const canvas_1 = require("canvas");
|
|
7
|
+
const defaultParsePdfCallback = (content) => content;
|
|
8
|
+
const parsePdfFileBuffer = async (options, callback = defaultParsePdfCallback) => import('pdfjs-dist/legacy/build/pdf.mjs').then(async (pdfjsLib) => {
|
|
9
|
+
const loadingTask = pdfjsLib.getDocument({
|
|
10
|
+
...options,
|
|
11
|
+
verbosity: 0,
|
|
12
|
+
});
|
|
13
|
+
const pdfDocument = await loadingTask.promise;
|
|
14
|
+
const { numPages } = pdfDocument;
|
|
15
|
+
const pageContents = Array.from({ length: numPages }, () => null);
|
|
16
|
+
const pagePromises = [];
|
|
17
|
+
for (let pageNum = 1; pageNum <= numPages; pageNum += 1) {
|
|
18
|
+
pagePromises.push(pdfDocument.getPage(pageNum).then(async (page) => {
|
|
19
|
+
const textContent = await page.getTextContent({
|
|
20
|
+
includeMarkedContent: false,
|
|
21
|
+
});
|
|
22
|
+
const items = textContent.items;
|
|
23
|
+
if (items.length === 0) {
|
|
24
|
+
const viewport = page.getViewport({ scale: 1.0 });
|
|
25
|
+
const canvas = (0, canvas_1.createCanvas)(viewport.width, viewport.height);
|
|
26
|
+
const context = canvas.getContext('2d');
|
|
27
|
+
await page.render({ canvasContext: context, viewport }).promise;
|
|
28
|
+
const imageBuffer = canvas.toBuffer();
|
|
29
|
+
pageContents[pageNum - 1] = callback(imageBuffer);
|
|
30
|
+
}
|
|
31
|
+
else {
|
|
32
|
+
const pageText = items.map((item) => item.str || '').join(' ');
|
|
33
|
+
pageContents[pageNum - 1] = callback(pageText);
|
|
34
|
+
}
|
|
35
|
+
}));
|
|
36
|
+
}
|
|
37
|
+
await Promise.all(pagePromises);
|
|
38
|
+
return pageContents;
|
|
39
|
+
});
|
|
40
|
+
/**
|
|
41
|
+
* Parses a PDF file from various input formats (Buffer, Uint8Array, string path, or URL). Each page is passed to the callback as a string (text content) or, when the page has no text items, as a Buffer (the page rendered to an image); the callback results are returned as an array in page order.
|
|
42
|
+
*
|
|
43
|
+
* @async
|
|
44
|
+
* @function parsePdf
|
|
45
|
+
*
|
|
46
|
+
* @param {Buffer|Uint8Array|string|URL} input - The PDF source, which can be a file path, URL, Buffer, or Uint8Array.
|
|
47
|
+
* @param {Object} [options] - Optional parsing options for customizing the PDF parsing process.
|
|
48
|
+
* @param {string} [options.password] - The password for encrypted PDF files, if required.
|
|
49
|
+
* @param {function} [callback] - Callback applied to each page's content to add another layer of processing; the default callback returns the page content without any added processing.
|
|
50
|
+
*
|
|
51
|
+
* @since — v1.0.0
|
|
52
|
+
*
|
|
53
|
+
* @returns {Promise<Array>} - A promise that resolves to an array holding the callback result for each page, in page order.
|
|
54
|
+
*
|
|
55
|
+
* @throws {Error} Throws an error if the input type is invalid.
|
|
56
|
+
*/
|
|
57
|
+
const parsePdf = async (input, options, callback = defaultParsePdfCallback) => {
|
|
58
|
+
if (typeof callback !== 'function') {
|
|
59
|
+
throw new Error(`Invalid callback type: ${typeof callback}`);
|
|
60
|
+
}
|
|
61
|
+
if (typeof input === 'string') {
|
|
62
|
+
const fileBuffer = await (0, promises_1.readFile)(input, {});
|
|
63
|
+
const data = new Uint8Array(fileBuffer);
|
|
64
|
+
return parsePdfFileBuffer({ data, ...options }, callback);
|
|
65
|
+
}
|
|
66
|
+
if (Buffer.isBuffer(input)) {
|
|
67
|
+
const data = new Uint8Array(input);
|
|
68
|
+
return parsePdfFileBuffer({ data, ...options }, callback);
|
|
69
|
+
}
|
|
70
|
+
if (input instanceof Uint8Array) {
|
|
71
|
+
return parsePdfFileBuffer({ data: input, ...options }, callback);
|
|
72
|
+
}
|
|
73
|
+
if (input instanceof URL) {
|
|
74
|
+
return parsePdfFileBuffer({ url: input, ...options }, callback);
|
|
75
|
+
}
|
|
76
|
+
throw new Error(`Invalid source type: ${typeof input}`);
|
|
77
|
+
};
|
|
78
|
+
exports.parsePdf = parsePdf;
|
|
79
|
+
//# sourceMappingURL=parsePdf.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parsePdf.js","sourceRoot":"","sources":["../src/parsePdf.ts"],"names":[],"mappings":";;;AAAA,yCAAyC;AACzC,+CAA4C;AAE5C,mCAAsC;AAStC,MAAM,uBAAuB,GAAsC,CAAC,OAAO,EAAE,EAAE,CAC7E,OAAO,CAAC;AAEV,MAAM,kBAAkB,GAAG,KAAK,EAC9B,OAA+B,EAC/B,WAAgC,uBAA8C,EAC9E,EAAE,CACF,MAAM,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,EAAE,EAAE;IAChE,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACvC,GAAG,OAAO;QACV,SAAS,EAAE,CAAC;KACb,CAAC,CAAC;IAEH,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IAE9C,MAAM,EAAE,QAAQ,EAAE,GAAG,WAAW,CAAC;IACjC,MAAM,YAAY,GAAQ,KAAK,CAAC,IAAI,CAClC,EAAE,MAAM,EAAE,QAAQ,EAAE,EACpB,GAAG,EAAE,CAAC,IAAoB,CAC3B,CAAC;IACF,MAAM,YAAY,GAAmC,EAAE,CAAC;IAExD,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,QAAQ,EAAE,OAAO,IAAI,CAAC,EAAE,CAAC;QACxD,YAAY,CAAC,IAAI,CACf,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YAC/C,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;gBAC5C,oBAAoB,EAAE,KAAK;aAC5B,CAAC,CAAC;YACH,MAAM,KAAK,GAAG,WAAW,CAAC,KAAmB,CAAC;YAC9C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACvB,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC;gBAClD,MAAM,MAAM,GAAG,IAAA,qBAAY,EAAC,QAAQ,CAAC,KAAK,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;gBAC7D,MAAM,OAAO,GAAG,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;gBAExC,MAAM,IAAI,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,OAAO,CAAC;gBAChE,MAAM,WAAW,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAC;gBACtC,YAAY,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,WAAW,CAAC,CAAC;YACpD,CAAC;iBAAM,CAAC;gBACN,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBAC/D,YAAY,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,QAAQ,CAAC,CAAC;YACjD,CAAC;QACH,CAAC,CAAC,CACH,CAAC;IACJ,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IAChC,OAAO,YAAY,CAAC;AACtB,CAAC,CAAC,CAAC;AAML;;;;;;;;;;;;;;;;GAgBG;AAEI,MAAM,QAAQ,GAAG,KAAK,EAC3B,KAAyC,EACzC,OAAsB,EACtB,WAAgC,uBAA8C,EAC9E,EAAE;IACF,IAAI,OAAO,QAAQ,KAAK,UAAU,EAAE,CAAC;QACnC,MAAM,IAAI,KAAK,CAAC,0BAA0B,OAAO,QAAQ,EAAE,CAAC,CAAC;IAC/D,CAAC;IACD,IAAI,OAA
O,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,MAAM,IAAA,mBAAQ,EAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QAC7C,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,UAAU,CAAC,CAAC;QACxC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,EAAE,QAAQ,CAAC,CAAC;IAC5D,CAAC;IACD,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC;QACnC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,EAAE,QAAQ,CAAC,CAAC;IAC5D,CAAC;IACD,IAAI,KAAK,YAAY,UAAU,EAAE,CAAC;QAChC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,OAAO,EAAE,EAAE,QAAQ,CAAC,CAAC;IACnE,CAAC;IACD,IAAI,KAAK,YAAY,GAAG,EAAE,CAAC;QACzB,OAAO,kBAAkB,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,OAAO,EAAE,EAAE,QAAQ,CAAC,CAAC;IAClE,CAAC;IACD,MAAM,IAAI,KAAK,CAAC,wBAAwB,OAAO,KAAK,EAAE,CAAC,CAAC;AAC1D,CAAC,CAAC;AAxBW,QAAA,QAAQ,YAwBnB"}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.pdf2image = void 0;
|
|
4
|
+
/* eslint-disable no-underscore-dangle */
|
|
5
|
+
const promises_1 = require("node:fs/promises");
|
|
6
|
+
const canvas_1 = require("canvas");
|
|
7
|
+
const parsePdfFileBuffer = async (options) => import('pdfjs-dist/legacy/build/pdf.mjs').then(async (pdfjsLib) => {
|
|
8
|
+
const loadingTask = pdfjsLib.getDocument({
|
|
9
|
+
...options,
|
|
10
|
+
verbosity: 0, // TODO enable for debug
|
|
11
|
+
});
|
|
12
|
+
const pdfDocument = await loadingTask.promise;
|
|
13
|
+
const { numPages } = pdfDocument;
|
|
14
|
+
const pageContents = new Array(numPages).fill(Buffer.from(''));
|
|
15
|
+
const pagePromises = [];
|
|
16
|
+
for (let pageNum = 1; pageNum <= numPages; pageNum += 1) {
|
|
17
|
+
pagePromises.push(pdfDocument.getPage(pageNum).then(async (page) => {
|
|
18
|
+
const viewport = page.getViewport({ scale: 1.0 });
|
|
19
|
+
const canvas = (0, canvas_1.createCanvas)(viewport.width, viewport.height);
|
|
20
|
+
const context = canvas.getContext('2d');
|
|
21
|
+
await page.render({ canvasContext: context, viewport }).promise;
|
|
22
|
+
const imageBuffer = canvas.toBuffer();
|
|
23
|
+
pageContents[pageNum - 1] = imageBuffer;
|
|
24
|
+
}));
|
|
25
|
+
}
|
|
26
|
+
await Promise.all(pagePromises);
|
|
27
|
+
return pageContents;
|
|
28
|
+
});
|
|
29
|
+
/**
|
|
30
|
+
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to an array of image buffers.
|
|
31
|
+
*
|
|
32
|
+
* @async
|
|
33
|
+
* @function pdf2image
|
|
34
|
+
*
|
|
35
|
+
* @param {Buffer|Uint8Array|string|URL} input - The PDF source, which can be a file path, URL, Buffer, or Uint8Array.
|
|
36
|
+
* @param {Object} [options] - Optional parsing options for customizing the PDF parsing process.
|
|
37
|
+
* @param {string} [options.password] - The password for encrypted PDF files, if required.
|
|
38
|
+
*
|
|
39
|
+
* @since — v1.0.0
|
|
40
|
+
*
|
|
41
|
+
* @returns {Promise<Buffer[]>} - A promise that resolves to an array of image buffers, one per page of the PDF.
|
|
42
|
+
*
|
|
43
|
+
* @throws {Error} Throws an error if the input type is invalid.
|
|
44
|
+
*/
|
|
45
|
+
const pdf2image = async (input, options) => {
|
|
46
|
+
if (typeof input === 'string') {
|
|
47
|
+
const fileBuffer = await (0, promises_1.readFile)(input, {});
|
|
48
|
+
const data = new Uint8Array(fileBuffer);
|
|
49
|
+
return parsePdfFileBuffer({ data, ...options });
|
|
50
|
+
}
|
|
51
|
+
if (Buffer.isBuffer(input)) {
|
|
52
|
+
const data = new Uint8Array(input);
|
|
53
|
+
return parsePdfFileBuffer({ data, ...options });
|
|
54
|
+
}
|
|
55
|
+
if (input instanceof Uint8Array) {
|
|
56
|
+
return parsePdfFileBuffer({ data: input, ...options });
|
|
57
|
+
}
|
|
58
|
+
if (input instanceof URL) {
|
|
59
|
+
return parsePdfFileBuffer({ url: input, ...options });
|
|
60
|
+
}
|
|
61
|
+
throw new Error(`Invalid source type: ${typeof input}`);
|
|
62
|
+
};
|
|
63
|
+
exports.pdf2image = pdf2image;
|
|
64
|
+
//# sourceMappingURL=pdf2image.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf2image.js","sourceRoot":"","sources":["../src/pdf2image.ts"],"names":[],"mappings":";;;AAAA,yCAAyC;AACzC,+CAA4C;AAE5C,mCAAsC;AAItC,MAAM,kBAAkB,GAAG,KAAK,EAAE,OAA+B,EAAE,EAAE,CACnE,MAAM,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,EAAE,EAAE;IAChE,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACvC,GAAG,OAAO;QACV,SAAS,EAAE,CAAC,EAAE,wBAAwB;KACvC,CAAC,CAAC;IAEH,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IAE9C,MAAM,EAAE,QAAQ,EAAE,GAAG,WAAW,CAAC;IACjC,MAAM,YAAY,GAAa,IAAI,KAAK,CAAS,QAAQ,CAAC,CAAC,IAAI,CAC7D,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAChB,CAAC;IACF,MAAM,YAAY,GAAmC,EAAE,CAAC;IAExD,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,QAAQ,EAAE,OAAO,IAAI,CAAC,EAAE,CAAC;QACxD,YAAY,CAAC,IAAI,CACf,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YAC/C,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC;YAClD,MAAM,MAAM,GAAG,IAAA,qBAAY,EAAC,QAAQ,CAAC,KAAK,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;YAC7D,MAAM,OAAO,GAAG,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;YAExC,MAAM,IAAI,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,OAAO,CAAC;YAChE,MAAM,WAAW,GAAG,MAAM,CAAC,QAAQ,EAAE,CAAC;YACtC,YAAY,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,WAAW,CAAC;QAC1C,CAAC,CAAC,CACH,CAAC;IACJ,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IAChC,OAAO,YAAY,CAAC;AACtB,CAAC,CAAC,CAAC;AAML;;;;;;;;;;;;;;;GAeG;AACI,MAAM,SAAS,GAAG,KAAK,EAC5B,KAAyC,EACzC,OAAsB,EACtB,EAAE;IACF,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,MAAM,IAAA,mBAAQ,EAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QAC7C,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,UAAU,CAAC,CAAC;QACxC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IAClD,CAAC;IACD,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC;QACnC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IAClD,CAAC;IACD,IAAI,KAAK,YAAY,UAAU,EAAE,CAAC;QAChC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IACzD,CAAC;IACD,IAAI,KAAK,YAAY,GAAG,EAAE,CAAC;QACzB,OAAO,kBAAkB,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,OAAO,EAAE,CAAC
,CAAC;IACxD,CAAC;IACD,MAAM,IAAI,KAAK,CAAC,wBAAwB,OAAO,KAAK,EAAE,CAAC,CAAC;AAC1D,CAAC,CAAC;AApBW,QAAA,SAAS,aAoBpB"}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.pdf2string = void 0;
|
|
4
|
+
const promises_1 = require("node:fs/promises");
|
|
5
|
+
const parsePdfFileBuffer = async (options) => import('pdfjs-dist/legacy/build/pdf.mjs').then(async (pdfjsLib) => {
|
|
6
|
+
const loadingTask = pdfjsLib.getDocument({
|
|
7
|
+
...options,
|
|
8
|
+
verbosity: 0, // TODO enable for debug
|
|
9
|
+
});
|
|
10
|
+
const pdfDocument = await loadingTask.promise;
|
|
11
|
+
const { numPages } = pdfDocument;
|
|
12
|
+
const pageContents = new Array(numPages).fill('');
|
|
13
|
+
const pagePromises = [];
|
|
14
|
+
for (let pageNum = 1; pageNum <= numPages; pageNum += 1) {
|
|
15
|
+
pagePromises.push(pdfDocument.getPage(pageNum).then(async (page) => {
|
|
16
|
+
const textContent = await page.getTextContent({
|
|
17
|
+
includeMarkedContent: false,
|
|
18
|
+
});
|
|
19
|
+
// ? Type assertion of items to TextItem[] should be safe because {includeMarkedContent: false}
|
|
20
|
+
const items = textContent.items;
|
|
21
|
+
if (items.length === 0) {
|
|
22
|
+
pageContents[pageNum - 1] = '';
|
|
23
|
+
}
|
|
24
|
+
else {
|
|
25
|
+
const pageText = items.map((item) => item.str || '').join(' ');
|
|
26
|
+
pageContents[pageNum - 1] = pageText;
|
|
27
|
+
}
|
|
28
|
+
}));
|
|
29
|
+
}
|
|
30
|
+
await Promise.all(pagePromises);
|
|
31
|
+
return pageContents;
|
|
32
|
+
});
|
|
33
|
+
/**
|
|
34
|
+
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to a string.
|
|
35
|
+
*
|
|
36
|
+
* @async
|
|
37
|
+
* @function pdf2string
|
|
38
|
+
*
|
|
39
|
+
* @param {Buffer|Uint8Array|string|URL} input - The PDF source, which can be a file path, URL, Buffer, or Uint8Array.
|
|
40
|
+
* @param {Object} [options] - Optional parsing options for customizing the PDF parsing process.
|
|
41
|
+
* @param {string} [options.password] - The password for encrypted PDF files, if required.
|
|
42
|
+
*
|
|
43
|
+
* @since — v1.0.0
|
|
44
|
+
*
|
|
45
|
+
* @returns {Promise<string[]>} - A promise that resolves to an array of strings, one per page of the PDF.
|
|
46
|
+
*
|
|
47
|
+
* @throws {Error} Throws an error if the input type is invalid.
|
|
48
|
+
*/
|
|
49
|
+
const pdf2string = async (input, options) => {
|
|
50
|
+
if (typeof input === 'string') {
|
|
51
|
+
const fileBuffer = await (0, promises_1.readFile)(input, {});
|
|
52
|
+
const data = new Uint8Array(fileBuffer);
|
|
53
|
+
return parsePdfFileBuffer({ data, ...options });
|
|
54
|
+
}
|
|
55
|
+
if (Buffer.isBuffer(input)) {
|
|
56
|
+
const data = new Uint8Array(input);
|
|
57
|
+
return parsePdfFileBuffer({ data, ...options });
|
|
58
|
+
}
|
|
59
|
+
if (input instanceof Uint8Array) {
|
|
60
|
+
return parsePdfFileBuffer({ data: input, ...options });
|
|
61
|
+
}
|
|
62
|
+
if (input instanceof URL) {
|
|
63
|
+
return parsePdfFileBuffer({ url: input, ...options });
|
|
64
|
+
}
|
|
65
|
+
throw new Error(`Invalid source type: ${typeof input}`);
|
|
66
|
+
};
|
|
67
|
+
exports.pdf2string = pdf2string;
|
|
68
|
+
//# sourceMappingURL=pdf2string.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf2string.js","sourceRoot":"","sources":["../src/pdf2string.ts"],"names":[],"mappings":";;;AAAA,+CAA4C;AAQ5C,MAAM,kBAAkB,GAAG,KAAK,EAAE,OAA+B,EAAE,EAAE,CACnE,MAAM,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,EAAE,EAAE;IAChE,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACvC,GAAG,OAAO;QACV,SAAS,EAAE,CAAC,EAAE,wBAAwB;KACvC,CAAC,CAAC;IACH,MAAM,WAAW,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC;IAE9C,MAAM,EAAE,QAAQ,EAAE,GAAG,WAAW,CAAC;IACjC,MAAM,YAAY,GAAa,IAAI,KAAK,CAAS,QAAQ,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACpE,MAAM,YAAY,GAAmC,EAAE,CAAC;IAExD,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,QAAQ,EAAE,OAAO,IAAI,CAAC,EAAE,CAAC;QACxD,YAAY,CAAC,IAAI,CACf,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,EAAE,EAAE;YAC/C,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,cAAc,CAAC;gBAC5C,oBAAoB,EAAE,KAAK;aAC5B,CAAC,CAAC;YACH,+FAA+F;YAC/F,MAAM,KAAK,GAAG,WAAW,CAAC,KAAmB,CAAC;YAC9C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACvB,YAAY,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC;YACjC,CAAC;iBAAM,CAAC;gBACN,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBAC/D,YAAY,CAAC,OAAO,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC;YACvC,CAAC;QACH,CAAC,CAAC,CACH,CAAC;IACJ,CAAC;IACD,MAAM,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IAChC,OAAO,YAAY,CAAC;AACtB,CAAC,CAAC,CAAC;AAML;;;;;;;;;;;;;;;GAeG;AACI,MAAM,UAAU,GAAG,KAAK,EAC7B,KAAyC,EACzC,OAAsB,EACtB,EAAE;IACF,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,MAAM,UAAU,GAAG,MAAM,IAAA,mBAAQ,EAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QAC7C,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,UAAU,CAAC,CAAC;QACxC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IAClD,CAAC;IACD,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,IAAI,UAAU,CAAC,KAAK,CAAC,CAAC;QACnC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IAClD,CAAC;IACD,IAAI,KAAK,YAAY,UAAU,EAAE,CAAC;QAChC,OAAO,kBAAkB,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,OAAO,EAAE,CAAC,CAAC;IACzD,CAAC;IACD,IAAI,KAAK,YAAY,GAAG,EAAE,CAAC;QACzB,OAAO,kBAAkB,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,OAAO,EAAE,CAA
C,CAAC;IACxD,CAAC;IACD,MAAM,IAAI,KAAK,CAAC,wBAAwB,OAAO,KAAK,EAAE,CAAC,CAAC;AAC1D,CAAC,CAAC;AApBW,QAAA,UAAU,cAoBrB"}
|
package/dist/types/index.d.ts
CHANGED
|
@@ -1,21 +1,3 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
};
|
|
4
|
-
/**
|
|
5
|
-
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to a string.
|
|
6
|
-
*
|
|
7
|
-
* @async
|
|
8
|
-
* @function pdf2string
|
|
9
|
-
*
|
|
10
|
-
* @param {Buffer|Uint8Array|string|URL} input - The PDF source, which can be a file path, URL, Buffer, or Uint8Array.
|
|
11
|
-
* @param {Object} [options] - Optional parsing options for customizing the PDF parsing process.
|
|
12
|
-
* @param {string} [options.password] - The password for encrypted PDF files, if required.
|
|
13
|
-
*
|
|
14
|
-
* @since — v1.0.0
|
|
15
|
-
*
|
|
16
|
-
* @returns {Promise<string>} - A promise that resolves to the string representation of the PDF content.
|
|
17
|
-
*
|
|
18
|
-
* @throws {Error} Throws an error if the input type is invalid.
|
|
19
|
-
*/
|
|
20
|
-
declare const pdf2string: (input: Buffer | URL | Uint8Array | string, options?: ParseOptions) => Promise<string[]>;
|
|
21
|
-
export { pdf2string };
|
|
1
|
+
export { parsePdf } from '#afpp/src/parsePdf';
|
|
2
|
+
export { pdf2image } from '#afpp/src/pdf2image';
|
|
3
|
+
export { pdf2string } from '#afpp/src/pdf2string';
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
type ParsePdfCallback<T> = (content: Buffer | string) => T;
|
|
2
|
+
type ParseOptions = {
|
|
3
|
+
password?: string;
|
|
4
|
+
};
|
|
5
|
+
/**
|
|
6
|
+
* Parses a PDF file from various input formats (Buffer, Uint8Array, string path, or URL). Each page is passed to the callback as a string (text content) or, when the page has no text items, as a Buffer (the page rendered to an image); the callback results are returned as an array in page order.
|
|
7
|
+
*
|
|
8
|
+
* @async
|
|
9
|
+
* @function parsePdf
|
|
10
|
+
*
|
|
11
|
+
* @param {Buffer|Uint8Array|string|URL} input - The PDF source, which can be a file path, URL, Buffer, or Uint8Array.
|
|
12
|
+
* @param {Object} [options] - Optional parsing options for customizing the PDF parsing process.
|
|
13
|
+
* @param {string} [options.password] - The password for encrypted PDF files, if required.
|
|
14
|
+
* @param {function} [callback] - Callback applied to each page's content to add another layer of processing; the default callback returns the page content without any added processing.
|
|
15
|
+
*
|
|
16
|
+
* @since — v1.0.0
|
|
17
|
+
*
|
|
18
|
+
* @returns {Promise<T[]>} - A promise that resolves to an array holding the callback result for each page, in page order.
|
|
19
|
+
*
|
|
20
|
+
* @throws {Error} Throws an error if the input type is invalid.
|
|
21
|
+
*/
|
|
22
|
+
export declare const parsePdf: <T>(input: Buffer | URL | Uint8Array | string, options?: ParseOptions, callback?: ParsePdfCallback<T>) => Promise<T[]>;
|
|
23
|
+
export {};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
type ParseOptions = {
|
|
2
|
+
password?: string;
|
|
3
|
+
};
|
|
4
|
+
/**
|
|
5
|
+
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to an array of image buffers.
|
|
6
|
+
*
|
|
7
|
+
* @async
|
|
8
|
+
* @function pdf2image
|
|
9
|
+
*
|
|
10
|
+
* @param {Buffer|Uint8Array|string|URL} input - The PDF source, which can be a file path, URL, Buffer, or Uint8Array.
|
|
11
|
+
* @param {Object} [options] - Optional parsing options for customizing the PDF parsing process.
|
|
12
|
+
* @param {string} [options.password] - The password for encrypted PDF files, if required.
|
|
13
|
+
*
|
|
14
|
+
* @since — v1.0.0
|
|
15
|
+
*
|
|
16
|
+
* @returns {Promise<Buffer[]>} - A promise that resolves to an array of image buffers, one per page of the PDF.
|
|
17
|
+
*
|
|
18
|
+
* @throws {Error} Throws an error if the input type is invalid.
|
|
19
|
+
*/
|
|
20
|
+
export declare const pdf2image: (input: Buffer | URL | Uint8Array | string, options?: ParseOptions) => Promise<Buffer[]>;
|
|
21
|
+
export {};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
type ParseOptions = {
|
|
2
|
+
password?: string;
|
|
3
|
+
};
|
|
4
|
+
/**
|
|
5
|
+
* Converts a PDF file from various input formats (Buffer, Uint8Array, string path, or URL) to a string.
|
|
6
|
+
*
|
|
7
|
+
* @async
|
|
8
|
+
* @function pdf2string
|
|
9
|
+
*
|
|
10
|
+
* @param {Buffer|Uint8Array|string|URL} input - The PDF source, which can be a file path, URL, Buffer, or Uint8Array.
|
|
11
|
+
* @param {Object} [options] - Optional parsing options for customizing the PDF parsing process.
|
|
12
|
+
* @param {string} [options.password] - The password for encrypted PDF files, if required.
|
|
13
|
+
*
|
|
14
|
+
* @since — v1.0.0
|
|
15
|
+
*
|
|
16
|
+
* @returns {Promise<string[]>} - A promise that resolves to an array of strings, one per page of the PDF.
|
|
17
|
+
*
|
|
18
|
+
* @throws {Error} Throws an error if the input type is invalid.
|
|
19
|
+
*/
|
|
20
|
+
export declare const pdf2string: (input: Buffer | URL | Uint8Array | string, options?: ParseOptions) => Promise<string[]>;
|
|
21
|
+
export {};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "afpp",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.4.0",
|
|
4
4
|
"description": "another f*cking pdf parser",
|
|
5
5
|
"types": "./dist/types/index.d.ts",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -13,8 +13,9 @@
|
|
|
13
13
|
"lint": "eslint .",
|
|
14
14
|
"prebuild": "rm -rf dist",
|
|
15
15
|
"prepare": "husky",
|
|
16
|
-
"test": "NODE_ENV=test npx tsx --test --test-reporter
|
|
17
|
-
"
|
|
16
|
+
"test": "NODE_ENV=test npx tsx --test --test-reporter @voxpelli/node-test-pretty-reporter test/*.test.ts",
|
|
17
|
+
"pretest:coverage": "rm -rf coverage",
|
|
18
|
+
"test:coverage": "c8 --reporter=lcov npm test",
|
|
18
19
|
"typecheck": "tsc -p tsconfig.json --noEmit"
|
|
19
20
|
},
|
|
20
21
|
"repository": {
|
|
@@ -38,10 +39,6 @@
|
|
|
38
39
|
"url": "https://github.com/l2ysho/afpp/issues"
|
|
39
40
|
},
|
|
40
41
|
"homepage": "https://github.com/l2ysho/afpp#readme",
|
|
41
|
-
"dependencies": {
|
|
42
|
-
"pdfjs-dist": "4.6.82",
|
|
43
|
-
"typescript": "5.6.2"
|
|
44
|
-
},
|
|
45
42
|
"devDependencies": {
|
|
46
43
|
"@commitlint/cli": "19.5.0",
|
|
47
44
|
"@commitlint/config-conventional": "19.5.0",
|
|
@@ -49,6 +46,7 @@
|
|
|
49
46
|
"@typescript-eslint/eslint-plugin": "7.18.0",
|
|
50
47
|
"@typescript-eslint/parser": "7.18.0",
|
|
51
48
|
"@voxpelli/node-test-pretty-reporter": "1.1.2",
|
|
49
|
+
"c8": "10.1.2",
|
|
52
50
|
"commitizen": "4.3.0",
|
|
53
51
|
"cz-conventional-changelog": "3.3.0",
|
|
54
52
|
"eslint": "8.56.0",
|
|
@@ -64,6 +62,11 @@
|
|
|
64
62
|
"husky": "9.1.6",
|
|
65
63
|
"lint-staged": "15.2.10",
|
|
66
64
|
"semantic-release": "24.1.1",
|
|
67
|
-
"tsx": "4.19.1"
|
|
65
|
+
"tsx": "4.19.1",
|
|
66
|
+
"typescript": "5.6.2"
|
|
67
|
+
},
|
|
68
|
+
"dependencies": {
|
|
69
|
+
"canvas": "2.11.2",
|
|
70
|
+
"pdfjs-dist": "4.6.82"
|
|
68
71
|
}
|
|
69
72
|
}
|