@vertesia/converters 0.50.1 → 0.52.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -5
- package/lib/esm/mutool2.js +0 -16
- package/lib/esm/mutool2.js.map +0 -1
- package/lib/esm/pdf-test.js +0 -15
- package/lib/esm/pdf-test.js.map +0 -1
- package/lib/esm/pdf.js +0 -75
- package/lib/esm/pdf.js.map +0 -1
- package/lib/types/mutool2.d.ts +0 -2
- package/lib/types/mutool2.d.ts.map +0 -1
- package/lib/types/pdf-test.d.ts +0 -5
- package/lib/types/pdf-test.d.ts.map +0 -1
- package/lib/types/pdf.d.ts +0 -7
- package/lib/types/pdf.d.ts.map +0 -1
- package/src/mutool2.ts +0 -19
- package/src/pdf-test.ts +0 -21
- package/src/pdf.test.ts +0 -25
- package/src/pdf.ts +0 -93
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vertesia/converters",
|
|
3
|
-
"version": "0.50.1",
|
|
3
|
+
"version": "0.52.0",
|
|
4
4
|
"description": "Image and content converters",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"types": "./lib/types/index.d.ts",
|
|
@@ -18,12 +18,9 @@
|
|
|
18
18
|
"@types/tmp": "^0.2.6",
|
|
19
19
|
"ts-dual-module": "^0.6.3",
|
|
20
20
|
"typescript": "^5.0.2",
|
|
21
|
-
"vitest": "^
|
|
21
|
+
"vitest": "^3.0.9"
|
|
22
22
|
},
|
|
23
23
|
"dependencies": {
|
|
24
|
-
"@opendocsg/pdf2md": "0.2.0",
|
|
25
|
-
"@pdftron/pdfnet-node": "^10.11.0",
|
|
26
|
-
"mupdf": "^0.3.0",
|
|
27
24
|
"sharp": "^0.33.5",
|
|
28
25
|
"tmp": "^0.2.3"
|
|
29
26
|
},
|
package/lib/esm/mutool2.js
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
import * as mupdf from "mupdf";
|
|
2
|
-
import fs from "fs";
|
|
3
|
-
async function test() {
|
|
4
|
-
const doc = mupdf.Document.openDocument(fs.readFileSync("./fixtures/test-pdf1.pdf"), "application/pdf");
|
|
5
|
-
//const count = doc.countPages();
|
|
6
|
-
for (let i = 0; i < 5; i++) {
|
|
7
|
-
const page = doc.loadPage(i);
|
|
8
|
-
const stext = page.toStructuredText();
|
|
9
|
-
console.log("Page ================= ", i);
|
|
10
|
-
console.log("=================!!!!!!", stext.asText());
|
|
11
|
-
//console.log(JSON.stringify(JSON.parse(stext.asJSON()), undefined, 2));
|
|
12
|
-
//console.log("=================!!!!!!", stext);
|
|
13
|
-
}
|
|
14
|
-
}
|
|
15
|
-
test();
|
|
16
|
-
//# sourceMappingURL=mutool2.js.map
|
package/lib/esm/mutool2.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"mutool2.js","sourceRoot":"","sources":["../../src/mutool2.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,KAAK,MAAM,OAAO,CAAC;AAC/B,OAAO,EAAE,MAAM,IAAI,CAAC;AAEpB,KAAK,UAAU,IAAI;IAEf,MAAM,GAAG,GAAG,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC,YAAY,CAAC,0BAA0B,CAAC,EAAE,iBAAiB,CAAC,CAAC;IAExG,iCAAiC;IACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;QAC7B,MAAM,KAAK,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,yBAAyB,EAAE,CAAC,CAAC,CAAC;QAC1C,OAAO,CAAC,GAAG,CAAC,yBAAyB,EAAE,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;QACvD,wEAAwE;QACxE,gDAAgD;IACpD,CAAC;AACL,CAAC;AAED,IAAI,EAAE,CAAC"}
|
package/lib/esm/pdf-test.js
DELETED
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* IMPORTANT: DO NOT RUN IN VITEST, VITEST DOESN'T WORK WITH APRYSE
|
|
3
|
-
*/
|
|
4
|
-
import fs from 'fs';
|
|
5
|
-
import path from 'path';
|
|
6
|
-
import { extractImagesFromPdfWithApryse } from "./pdf.js";
|
|
7
|
-
const main = async () => {
|
|
8
|
-
const pdfPath = path.resolve(__dirname, '../../../fixtures', 'test-pdf2.pdf');
|
|
9
|
-
const pdfBuffer = fs.readFileSync(pdfPath);
|
|
10
|
-
console.log('start extracting images from pdf');
|
|
11
|
-
const result = await extractImagesFromPdfWithApryse(pdfBuffer);
|
|
12
|
-
console.log('result: ', result);
|
|
13
|
-
};
|
|
14
|
-
main();
|
|
15
|
-
//# sourceMappingURL=pdf-test.js.map
|
package/lib/esm/pdf-test.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"pdf-test.js","sourceRoot":"","sources":["../../src/pdf-test.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,MAAM,IAAI,CAAC;AACpB,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,8BAA8B,EAAE,MAAM,UAAU,CAAC;AAE1D,MAAM,IAAI,GAAG,KAAK,IAAI,EAAE;IAEpB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,mBAAmB,EAAE,eAAe,CAAC,CAAC;IAC9E,MAAM,SAAS,GAAG,EAAE,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC;IAE3C,OAAO,CAAC,GAAG,CAAC,kCAAkC,CAAC,CAAC;IAChD,MAAM,MAAM,GAAQ,MAAM,8BAA8B,CAAC,SAAS,CAAC,CAAC;IAEpE,OAAO,CAAC,GAAG,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;AAEpC,CAAC,CAAA;AAED,IAAI,EAAE,CAAC"}
|
package/lib/esm/pdf.js
DELETED
|
@@ -1,75 +0,0 @@
|
|
|
1
|
-
import pdf2md from "@opendocsg/pdf2md";
|
|
2
|
-
import fs from 'fs';
|
|
3
|
-
import os from 'os';
|
|
4
|
-
import pkg from '@pdftron/pdfnet-node';
|
|
5
|
-
const { PDFNet } = pkg;
|
|
6
|
-
const pdf2mdFn = pdf2md;
|
|
7
|
-
export function trasformPdfToMarkdown(buffer) {
|
|
8
|
-
const arr = new Uint8Array(buffer);
|
|
9
|
-
return pdf2mdFn(arr);
|
|
10
|
-
}
|
|
11
|
-
async function extractImages(buffer, minHw = 300) {
|
|
12
|
-
const doc = await PDFNet.PDFDoc.createFromBuffer(buffer);
|
|
13
|
-
const reader = await PDFNet.ElementReader.create();
|
|
14
|
-
const tmpDir = os.tmpdir();
|
|
15
|
-
const workingDir = fs.mkdtempSync(`${tmpDir}/pdfextract_`);
|
|
16
|
-
// Read page content on every page in the document
|
|
17
|
-
const itr = await doc.getPageIterator();
|
|
18
|
-
for (itr; await itr.hasNext(); itr.next()) {
|
|
19
|
-
// Read the page
|
|
20
|
-
const page = await itr.current();
|
|
21
|
-
const pageNumber = await page.getIndex();
|
|
22
|
-
reader.beginOnPage(page);
|
|
23
|
-
await ProcessElements(reader, pageNumber);
|
|
24
|
-
reader.end();
|
|
25
|
-
}
|
|
26
|
-
return { workingDir };
|
|
27
|
-
async function ProcessElements(reader, pageNumber) {
|
|
28
|
-
// Traverse the page display list
|
|
29
|
-
let imgCount = 1;
|
|
30
|
-
for (let element = await reader.next(); element !== null; element = await reader.next()) {
|
|
31
|
-
const elementType = await element.getType();
|
|
32
|
-
switch (elementType) {
|
|
33
|
-
case PDFNet.Element.Type.e_image:
|
|
34
|
-
{
|
|
35
|
-
const image = await PDFNet.Image.createFromObj(await element.getXObject());
|
|
36
|
-
const h = await image.getImageHeight();
|
|
37
|
-
const w = await image.getImageWidth();
|
|
38
|
-
//console.log(`Image: width=${w}, height=${h}`);
|
|
39
|
-
//do not extract if image is too small, likely not relevant
|
|
40
|
-
//TODO: use LLM to decide if it matters?
|
|
41
|
-
if (w < minHw && h < minHw) {
|
|
42
|
-
break;
|
|
43
|
-
}
|
|
44
|
-
const imgName = `${workingDir}/img_${pageNumber}_${imgCount++}.png`;
|
|
45
|
-
image.exportAsPng(imgName);
|
|
46
|
-
break;
|
|
47
|
-
}
|
|
48
|
-
case PDFNet.Element.Type.e_form:
|
|
49
|
-
{
|
|
50
|
-
reader.formBegin();
|
|
51
|
-
ProcessElements(reader, pageNumber);
|
|
52
|
-
reader.end();
|
|
53
|
-
break;
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
|
-
export async function extractImagesFromPdfWithApryse(buffer, minHw = 300) {
|
|
60
|
-
const APRYSE_KEY = process.env.APRYSE_KEY;
|
|
61
|
-
const extractImagesWrapper = async () => {
|
|
62
|
-
return await extractImages(buffer, minHw);
|
|
63
|
-
};
|
|
64
|
-
const res = await PDFNet.runWithCleanup(extractImagesWrapper, APRYSE_KEY).then((res) => {
|
|
65
|
-
return res;
|
|
66
|
-
}).finally(() => PDFNet.shutdown());
|
|
67
|
-
//read all images in the directory
|
|
68
|
-
const files = fs.readdirSync(res.workingDir);
|
|
69
|
-
const images = files.map((file) => {
|
|
70
|
-
const [pageNumber, imgCount] = file.split('.')[0].split('_').slice(1);
|
|
71
|
-
return { page: parseInt(pageNumber), imgCount: parseInt(imgCount), path: file };
|
|
72
|
-
});
|
|
73
|
-
return images;
|
|
74
|
-
}
|
|
75
|
-
//# sourceMappingURL=pdf.js.map
|
package/lib/esm/pdf.js.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"pdf.js","sourceRoot":"","sources":["../../src/pdf.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,mBAAmB,CAAC;AAEvC,OAAO,EAAE,MAAM,IAAI,CAAC;AACpB,OAAO,EAAE,MAAM,IAAI,CAAC;AAEpB,OAAO,GAAG,MAAM,sBAAsB,CAAC;AACvC,MAAM,EAAE,MAAM,EAAE,GAAG,GAAG,CAAC;AAGvB,MAAM,QAAQ,GAAG,MAA4D,CAAC;AAE9E,MAAM,UAAU,qBAAqB,CAAC,MAAc;IAChD,MAAM,GAAG,GAAG,IAAI,UAAU,CAAC,MAAM,CAAC,CAAC;IACnC,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC;AAID,KAAK,UAAU,aAAa,CAAC,MAAc,EAAE,QAAgB,GAAG;IAC5D,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;IACzD,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,MAAM,EAAE,CAAC;IACnD,MAAM,MAAM,GAAG,EAAE,CAAC,MAAM,EAAE,CAAA;IAC1B,MAAM,UAAU,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,MAAM,cAAc,CAAC,CAAC;IAE3D,kDAAkD;IAClD,MAAM,GAAG,GAAG,MAAM,GAAG,CAAC,eAAe,EAAE,CAAC;IACxC,KAAK,GAAG,EAAE,MAAM,GAAG,CAAC,OAAO,EAAE,EAAE,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC;QACxC,gBAAgB;QAChB,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,EAAE,CAAC;QACjC,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,QAAQ,EAAE,CAAC;QACzC,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACzB,MAAM,eAAe,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;QAC1C,MAAM,CAAC,GAAG,EAAE,CAAC;IACjB,CAAC;IAED,OAAO,EAAE,UAAU,EAAE,CAAC;IAEtB,KAAK,UAAU,eAAe,CAAC,MAA6B,EAAE,UAAkB;QAC5E,iCAAiC;QACjC,IAAI,QAAQ,GAAG,CAAC,CAAC;QAEjB,KAAK,IAAI,OAAO,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,EAAE,OAAO,KAAK,IAAI,EAAE,OAAO,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC;YACtF,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;YAC5C,QAAQ,WAAW,EAAE,CAAC;gBAClB,KAAK,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO;oBAC5B,CAAC;wBACG,MAAM,KAAK,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,aAAa,CAAC,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC;wBAC3E,MAAM,CAAC,GAAG,MAAM,KAAK,CAAC,cAAc,EAAE,CAAC;wBACvC,MAAM,CAAC,GAAG,MAAM,KAAK,CAAC,aAAa,EAAE,CAAC;wBACtC,gDAAgD;wBAChD,2DAA2D;wBAC3D,wCAAwC;wBACxC,IAAI,CAAC,GAAG,KAAK,IAAI,CAAC,GAAG,KAAK,EAAE,CAAC;4BACzB,MAAM;wBACV,CAAC;wBACD,MAAM,OAAO,GAAG,GAAG,UAAU,QAAQ,UAAU,IAAI,QAAQ,EAAE,MAAM,CAAC;wBACpE,KAAK,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;wBAC3B,MAAM;oBACV,CAAC;gBACL,KAAK,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM;oBAC3B,CAAC;wBACG,MAA
M,CAAC,SAAS,EAAE,CAAC;wBACnB,eAAe,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;wBACpC,MAAM,CAAC,GAAG,EAAE,CAAC;wBACb,MAAM;oBACV,CAAC;YACT,CAAC;QACL,CAAC;IACL,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,8BAA8B,CAAC,MAAc,EAAE,QAAgB,GAAG;IACpF,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;IAE1C,MAAM,oBAAoB,GAAG,KAAK,IAAI,EAAE;QACpC,OAAO,MAAM,aAAa,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;IAC9C,CAAC,CAAC;IAEF,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,oBAAoB,EAAE,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,EAAE;QACnF,OAAO,GAAG,CAAC;IACf,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;IAGpC,kCAAkC;IAClC,MAAM,KAAK,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IAE7C,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QAC9B,MAAM,CAAC,UAAU,EAAE,QAAQ,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACtE,OAAO,EAAE,IAAI,EAAE,QAAQ,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,QAAQ,CAAC,QAAQ,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IACpF,CAAC,CAAC,CAAC;IAEH,OAAO,MAAM,CAAC;AAClB,CAAC"}
|
package/lib/types/mutool2.d.ts
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"mutool2.d.ts","sourceRoot":"","sources":["../../src/mutool2.ts"],"names":[],"mappings":""}
|
package/lib/types/pdf-test.d.ts
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"pdf-test.d.ts","sourceRoot":"","sources":["../../src/pdf-test.ts"],"names":[],"mappings":"AAAA;;GAEG"}
|
package/lib/types/pdf.d.ts
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
export declare function trasformPdfToMarkdown(buffer: Buffer): Promise<string>;
|
|
2
|
-
export declare function extractImagesFromPdfWithApryse(buffer: Buffer, minHw?: number): Promise<{
|
|
3
|
-
page: number;
|
|
4
|
-
imgCount: number;
|
|
5
|
-
path: string;
|
|
6
|
-
}[]>;
|
|
7
|
-
//# sourceMappingURL=pdf.d.ts.map
|
package/lib/types/pdf.d.ts.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"pdf.d.ts","sourceRoot":"","sources":["../../src/pdf.ts"],"names":[],"mappings":"AAWA,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,MAAM,mBAGnD;AAyDD,wBAAsB,8BAA8B,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,GAAE,MAAY;;;;KAqBvF"}
|
package/src/mutool2.ts
DELETED
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
import * as mupdf from "mupdf";
|
|
2
|
-
import fs from "fs";
|
|
3
|
-
|
|
4
|
-
async function test() {
|
|
5
|
-
|
|
6
|
-
const doc = mupdf.Document.openDocument(fs.readFileSync("./fixtures/test-pdf1.pdf"), "application/pdf");
|
|
7
|
-
|
|
8
|
-
//const count = doc.countPages();
|
|
9
|
-
for (let i = 0; i < 5; i++) {
|
|
10
|
-
const page = doc.loadPage(i);
|
|
11
|
-
const stext = page.toStructuredText();
|
|
12
|
-
console.log("Page ================= ", i);
|
|
13
|
-
console.log("=================!!!!!!", stext.asText());
|
|
14
|
-
//console.log(JSON.stringify(JSON.parse(stext.asJSON()), undefined, 2));
|
|
15
|
-
//console.log("=================!!!!!!", stext);
|
|
16
|
-
}
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
test();
|
package/src/pdf-test.ts
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* IMPORTANT: DO NOT RUN IN VITEST, VITEST DOESN'T WORK WITH APRYSE
|
|
3
|
-
*/
|
|
4
|
-
|
|
5
|
-
import fs from 'fs';
|
|
6
|
-
import path from 'path';
|
|
7
|
-
import { extractImagesFromPdfWithApryse } from "./pdf.js";
|
|
8
|
-
|
|
9
|
-
const main = async () => {
|
|
10
|
-
|
|
11
|
-
const pdfPath = path.resolve(__dirname, '../../../fixtures', 'test-pdf2.pdf');
|
|
12
|
-
const pdfBuffer = fs.readFileSync(pdfPath);
|
|
13
|
-
|
|
14
|
-
console.log('start extracting images from pdf');
|
|
15
|
-
const result: any = await extractImagesFromPdfWithApryse(pdfBuffer);
|
|
16
|
-
|
|
17
|
-
console.log('result: ', result);
|
|
18
|
-
|
|
19
|
-
}
|
|
20
|
-
|
|
21
|
-
main();
|
package/src/pdf.test.ts
DELETED
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
2
|
-
import path from 'path';
|
|
3
|
-
import { expect, test } from 'vitest';
|
|
4
|
-
import { trasformPdfToMarkdown } from './pdf';
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
test('Converts a PDF to markdown', async () => {
|
|
9
|
-
const pdfPath = path.resolve(__dirname, '../fixtures', 'test-pdf1.pdf');
|
|
10
|
-
const pdfBuffer = fs.readFileSync(pdfPath);
|
|
11
|
-
const result = await trasformPdfToMarkdown(pdfBuffer);
|
|
12
|
-
|
|
13
|
-
expect(result).toContain('America');
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
});
|
|
17
|
-
|
|
18
|
-
test('Converts another PDF to markdown', async () => {
|
|
19
|
-
const pdfPath = path.resolve(__dirname, '../fixtures', 'test-pdf2.pdf');
|
|
20
|
-
const pdfBuffer = fs.readFileSync(pdfPath);
|
|
21
|
-
const result = await trasformPdfToMarkdown(pdfBuffer);
|
|
22
|
-
|
|
23
|
-
expect(result).toContain('America');
|
|
24
|
-
|
|
25
|
-
});
|
package/src/pdf.ts
DELETED
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
import pdf2md from "@opendocsg/pdf2md";
|
|
2
|
-
import type { PDFNet as PDFTron } from '@pdftron/pdfnet-node';
|
|
3
|
-
import fs from 'fs';
|
|
4
|
-
import os from 'os';
|
|
5
|
-
|
|
6
|
-
import pkg from '@pdftron/pdfnet-node';
|
|
7
|
-
const { PDFNet } = pkg;
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
const pdf2mdFn = pdf2md as unknown as (buffer: Uint8Array) => Promise<string>;
|
|
11
|
-
|
|
12
|
-
export function trasformPdfToMarkdown(buffer: Buffer) {
|
|
13
|
-
const arr = new Uint8Array(buffer);
|
|
14
|
-
return pdf2mdFn(arr);
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
async function extractImages(buffer: Buffer, minHw: number = 300) {
|
|
20
|
-
const doc = await PDFNet.PDFDoc.createFromBuffer(buffer);
|
|
21
|
-
const reader = await PDFNet.ElementReader.create();
|
|
22
|
-
const tmpDir = os.tmpdir()
|
|
23
|
-
const workingDir = fs.mkdtempSync(`${tmpDir}/pdfextract_`);
|
|
24
|
-
|
|
25
|
-
// Read page content on every page in the document
|
|
26
|
-
const itr = await doc.getPageIterator();
|
|
27
|
-
for (itr; await itr.hasNext(); itr.next()) {
|
|
28
|
-
// Read the page
|
|
29
|
-
const page = await itr.current();
|
|
30
|
-
const pageNumber = await page.getIndex();
|
|
31
|
-
reader.beginOnPage(page);
|
|
32
|
-
await ProcessElements(reader, pageNumber);
|
|
33
|
-
reader.end();
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
return { workingDir };
|
|
37
|
-
|
|
38
|
-
async function ProcessElements(reader: PDFTron.ElementReader, pageNumber: number) {
|
|
39
|
-
// Traverse the page display list
|
|
40
|
-
let imgCount = 1;
|
|
41
|
-
|
|
42
|
-
for (let element = await reader.next(); element !== null; element = await reader.next()) {
|
|
43
|
-
const elementType = await element.getType();
|
|
44
|
-
switch (elementType) {
|
|
45
|
-
case PDFNet.Element.Type.e_image:
|
|
46
|
-
{
|
|
47
|
-
const image = await PDFNet.Image.createFromObj(await element.getXObject());
|
|
48
|
-
const h = await image.getImageHeight();
|
|
49
|
-
const w = await image.getImageWidth();
|
|
50
|
-
//console.log(`Image: width=${w}, height=${h}`);
|
|
51
|
-
//do not extract if image is too small, likely not relevant
|
|
52
|
-
//TODO: use LLM to decide if it matters?
|
|
53
|
-
if (w < minHw && h < minHw) {
|
|
54
|
-
break;
|
|
55
|
-
}
|
|
56
|
-
const imgName = `${workingDir}/img_${pageNumber}_${imgCount++}.png`;
|
|
57
|
-
image.exportAsPng(imgName);
|
|
58
|
-
break;
|
|
59
|
-
}
|
|
60
|
-
case PDFNet.Element.Type.e_form:
|
|
61
|
-
{
|
|
62
|
-
reader.formBegin();
|
|
63
|
-
ProcessElements(reader, pageNumber);
|
|
64
|
-
reader.end();
|
|
65
|
-
break;
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
}
|
|
69
|
-
}
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
export async function extractImagesFromPdfWithApryse(buffer: Buffer, minHw: number = 300) {
|
|
73
|
-
const APRYSE_KEY = process.env.APRYSE_KEY;
|
|
74
|
-
|
|
75
|
-
const extractImagesWrapper = async () => {
|
|
76
|
-
return await extractImages(buffer, minHw);
|
|
77
|
-
};
|
|
78
|
-
|
|
79
|
-
const res = await PDFNet.runWithCleanup(extractImagesWrapper, APRYSE_KEY).then((res) => {
|
|
80
|
-
return res;
|
|
81
|
-
}).finally(() => PDFNet.shutdown());
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
//read all images in the directory
|
|
85
|
-
const files = fs.readdirSync(res.workingDir);
|
|
86
|
-
|
|
87
|
-
const images = files.map((file) => {
|
|
88
|
-
const [pageNumber, imgCount] = file.split('.')[0].split('_').slice(1);
|
|
89
|
-
return { page: parseInt(pageNumber), imgCount: parseInt(imgCount), path: file };
|
|
90
|
-
});
|
|
91
|
-
|
|
92
|
-
return images;
|
|
93
|
-
}
|