@vertesia/converters 0.50.0 → 0.51.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vertesia/converters",
3
- "version": "0.50.0",
3
+ "version": "0.51.0",
4
4
  "description": "Image and content converters",
5
5
  "type": "module",
6
6
  "types": "./lib/types/index.d.ts",
@@ -18,12 +18,9 @@
18
18
  "@types/tmp": "^0.2.6",
19
19
  "ts-dual-module": "^0.6.3",
20
20
  "typescript": "^5.0.2",
21
- "vitest": "^2.1.6"
21
+ "vitest": "^2.1.9"
22
22
  },
23
23
  "dependencies": {
24
- "@opendocsg/pdf2md": "0.2.0",
25
- "@pdftron/pdfnet-node": "^10.11.0",
26
- "mupdf": "^0.3.0",
27
24
  "sharp": "^0.33.5",
28
25
  "tmp": "^0.2.3"
29
26
  },
@@ -1,16 +0,0 @@
1
- import * as mupdf from "mupdf";
2
- import fs from "fs";
3
- async function test() {
4
- const doc = mupdf.Document.openDocument(fs.readFileSync("./fixtures/test-pdf1.pdf"), "application/pdf");
5
- //const count = doc.countPages();
6
- for (let i = 0; i < 5; i++) {
7
- const page = doc.loadPage(i);
8
- const stext = page.toStructuredText();
9
- console.log("Page ================= ", i);
10
- console.log("=================!!!!!!", stext.asText());
11
- //console.log(JSON.stringify(JSON.parse(stext.asJSON()), undefined, 2));
12
- //console.log("=================!!!!!!", stext);
13
- }
14
- }
15
- test();
16
- //# sourceMappingURL=mutool2.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"mutool2.js","sourceRoot":"","sources":["../../src/mutool2.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,KAAK,MAAM,OAAO,CAAC;AAC/B,OAAO,EAAE,MAAM,IAAI,CAAC;AAEpB,KAAK,UAAU,IAAI;IAEf,MAAM,GAAG,GAAG,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC,YAAY,CAAC,0BAA0B,CAAC,EAAE,iBAAiB,CAAC,CAAC;IAExG,iCAAiC;IACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;QAC7B,MAAM,KAAK,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,yBAAyB,EAAE,CAAC,CAAC,CAAC;QAC1C,OAAO,CAAC,GAAG,CAAC,yBAAyB,EAAE,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;QACvD,wEAAwE;QACxE,gDAAgD;IACpD,CAAC;AACL,CAAC;AAED,IAAI,EAAE,CAAC"}
@@ -1,15 +0,0 @@
1
- /**
2
- * IMPORTANT: DO NOT RUN IN VITEST, VITEST DOESN'T WORK WITH APRYSE
3
- */
4
- import fs from 'fs';
5
- import path from 'path';
6
- import { extractImagesFromPdfWithApryse } from "./pdf.js";
7
- const main = async () => {
8
- const pdfPath = path.resolve(__dirname, '../../../fixtures', 'test-pdf2.pdf');
9
- const pdfBuffer = fs.readFileSync(pdfPath);
10
- console.log('start extracting images from pdf');
11
- const result = await extractImagesFromPdfWithApryse(pdfBuffer);
12
- console.log('result: ', result);
13
- };
14
- main();
15
- //# sourceMappingURL=pdf-test.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"pdf-test.js","sourceRoot":"","sources":["../../src/pdf-test.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,MAAM,IAAI,CAAC;AACpB,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,8BAA8B,EAAE,MAAM,UAAU,CAAC;AAE1D,MAAM,IAAI,GAAG,KAAK,IAAI,EAAE;IAEpB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,mBAAmB,EAAE,eAAe,CAAC,CAAC;IAC9E,MAAM,SAAS,GAAG,EAAE,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC;IAE3C,OAAO,CAAC,GAAG,CAAC,kCAAkC,CAAC,CAAC;IAChD,MAAM,MAAM,GAAQ,MAAM,8BAA8B,CAAC,SAAS,CAAC,CAAC;IAEpE,OAAO,CAAC,GAAG,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;AAEpC,CAAC,CAAA;AAED,IAAI,EAAE,CAAC"}
package/lib/esm/pdf.js DELETED
@@ -1,75 +0,0 @@
1
- import pdf2md from "@opendocsg/pdf2md";
2
- import fs from 'fs';
3
- import os from 'os';
4
- import pkg from '@pdftron/pdfnet-node';
5
- const { PDFNet } = pkg;
6
- const pdf2mdFn = pdf2md;
7
- export function trasformPdfToMarkdown(buffer) {
8
- const arr = new Uint8Array(buffer);
9
- return pdf2mdFn(arr);
10
- }
11
- async function extractImages(buffer, minHw = 300) {
12
- const doc = await PDFNet.PDFDoc.createFromBuffer(buffer);
13
- const reader = await PDFNet.ElementReader.create();
14
- const tmpDir = os.tmpdir();
15
- const workingDir = fs.mkdtempSync(`${tmpDir}/pdfextract_`);
16
- // Read page content on every page in the document
17
- const itr = await doc.getPageIterator();
18
- for (itr; await itr.hasNext(); itr.next()) {
19
- // Read the page
20
- const page = await itr.current();
21
- const pageNumber = await page.getIndex();
22
- reader.beginOnPage(page);
23
- await ProcessElements(reader, pageNumber);
24
- reader.end();
25
- }
26
- return { workingDir };
27
- async function ProcessElements(reader, pageNumber) {
28
- // Traverse the page display list
29
- let imgCount = 1;
30
- for (let element = await reader.next(); element !== null; element = await reader.next()) {
31
- const elementType = await element.getType();
32
- switch (elementType) {
33
- case PDFNet.Element.Type.e_image:
34
- {
35
- const image = await PDFNet.Image.createFromObj(await element.getXObject());
36
- const h = await image.getImageHeight();
37
- const w = await image.getImageWidth();
38
- //console.log(`Image: width=${w}, height=${h}`);
39
- //do not extract if image is too small, likely not relevant
40
- //TODO: use LLM to decide if it matters?
41
- if (w < minHw && h < minHw) {
42
- break;
43
- }
44
- const imgName = `${workingDir}/img_${pageNumber}_${imgCount++}.png`;
45
- image.exportAsPng(imgName);
46
- break;
47
- }
48
- case PDFNet.Element.Type.e_form:
49
- {
50
- reader.formBegin();
51
- ProcessElements(reader, pageNumber);
52
- reader.end();
53
- break;
54
- }
55
- }
56
- }
57
- }
58
- }
59
- export async function extractImagesFromPdfWithApryse(buffer, minHw = 300) {
60
- const APRYSE_KEY = process.env.APRYSE_KEY;
61
- const extractImagesWrapper = async () => {
62
- return await extractImages(buffer, minHw);
63
- };
64
- const res = await PDFNet.runWithCleanup(extractImagesWrapper, APRYSE_KEY).then((res) => {
65
- return res;
66
- }).finally(() => PDFNet.shutdown());
67
- //read all images in the directory
68
- const files = fs.readdirSync(res.workingDir);
69
- const images = files.map((file) => {
70
- const [pageNumber, imgCount] = file.split('.')[0].split('_').slice(1);
71
- return { page: parseInt(pageNumber), imgCount: parseInt(imgCount), path: file };
72
- });
73
- return images;
74
- }
75
- //# sourceMappingURL=pdf.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"pdf.js","sourceRoot":"","sources":["../../src/pdf.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,mBAAmB,CAAC;AAEvC,OAAO,EAAE,MAAM,IAAI,CAAC;AACpB,OAAO,EAAE,MAAM,IAAI,CAAC;AAEpB,OAAO,GAAG,MAAM,sBAAsB,CAAC;AACvC,MAAM,EAAE,MAAM,EAAE,GAAG,GAAG,CAAC;AAGvB,MAAM,QAAQ,GAAG,MAA4D,CAAC;AAE9E,MAAM,UAAU,qBAAqB,CAAC,MAAc;IAChD,MAAM,GAAG,GAAG,IAAI,UAAU,CAAC,MAAM,CAAC,CAAC;IACnC,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC;AAID,KAAK,UAAU,aAAa,CAAC,MAAc,EAAE,QAAgB,GAAG;IAC5D,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;IACzD,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,MAAM,EAAE,CAAC;IACnD,MAAM,MAAM,GAAG,EAAE,CAAC,MAAM,EAAE,CAAA;IAC1B,MAAM,UAAU,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,MAAM,cAAc,CAAC,CAAC;IAE3D,kDAAkD;IAClD,MAAM,GAAG,GAAG,MAAM,GAAG,CAAC,eAAe,EAAE,CAAC;IACxC,KAAK,GAAG,EAAE,MAAM,GAAG,CAAC,OAAO,EAAE,EAAE,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC;QACxC,gBAAgB;QAChB,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,EAAE,CAAC;QACjC,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,QAAQ,EAAE,CAAC;QACzC,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACzB,MAAM,eAAe,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;QAC1C,MAAM,CAAC,GAAG,EAAE,CAAC;IACjB,CAAC;IAED,OAAO,EAAE,UAAU,EAAE,CAAC;IAEtB,KAAK,UAAU,eAAe,CAAC,MAA6B,EAAE,UAAkB;QAC5E,iCAAiC;QACjC,IAAI,QAAQ,GAAG,CAAC,CAAC;QAEjB,KAAK,IAAI,OAAO,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,EAAE,OAAO,KAAK,IAAI,EAAE,OAAO,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC;YACtF,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;YAC5C,QAAQ,WAAW,EAAE,CAAC;gBAClB,KAAK,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO;oBAC5B,CAAC;wBACG,MAAM,KAAK,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,aAAa,CAAC,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC;wBAC3E,MAAM,CAAC,GAAG,MAAM,KAAK,CAAC,cAAc,EAAE,CAAC;wBACvC,MAAM,CAAC,GAAG,MAAM,KAAK,CAAC,aAAa,EAAE,CAAC;wBACtC,gDAAgD;wBAChD,2DAA2D;wBAC3D,wCAAwC;wBACxC,IAAI,CAAC,GAAG,KAAK,IAAI,CAAC,GAAG,KAAK,EAAE,CAAC;4BACzB,MAAM;wBACV,CAAC;wBACD,MAAM,OAAO,GAAG,GAAG,UAAU,QAAQ,UAAU,IAAI,QAAQ,EAAE,MAAM,CAAC;wBACpE,KAAK,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;wBAC3B,MAAM;oBACV,CAAC;gBACL,KAAK,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM;oBAC3B,CAAC;wBACG,MAAM,CAAC,SAAS,EAAE,CAAC;wBACnB,eAAe,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;wBACpC,MAAM,CAAC,GAAG,EAAE,CAAC;wBACb,MAAM;oBACV,CAAC;YACT,CAAC;QACL,CAAC;IACL,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,8BAA8B,CAAC,MAAc,EAAE,QAAgB,GAAG;IACpF,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;IAE1C,MAAM,oBAAoB,GAAG,KAAK,IAAI,EAAE;QACpC,OAAO,MAAM,aAAa,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;IAC9C,CAAC,CAAC;IAEF,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,oBAAoB,EAAE,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,EAAE;QACnF,OAAO,GAAG,CAAC;IACf,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;IAGpC,kCAAkC;IAClC,MAAM,KAAK,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IAE7C,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QAC9B,MAAM,CAAC,UAAU,EAAE,QAAQ,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACtE,OAAO,EAAE,IAAI,EAAE,QAAQ,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,QAAQ,CAAC,QAAQ,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IACpF,CAAC,CAAC,CAAC;IAEH,OAAO,MAAM,CAAC;AAClB,CAAC"}
@@ -1,2 +0,0 @@
1
- export {};
2
- //# sourceMappingURL=mutool2.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"mutool2.d.ts","sourceRoot":"","sources":["../../src/mutool2.ts"],"names":[],"mappings":""}
@@ -1,5 +0,0 @@
1
- /**
2
- * IMPORTANT: DO NOT RUN IN VITEST, VITEST DOESN'T WORK WITH APRYSE
3
- */
4
- export {};
5
- //# sourceMappingURL=pdf-test.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"pdf-test.d.ts","sourceRoot":"","sources":["../../src/pdf-test.ts"],"names":[],"mappings":"AAAA;;GAEG"}
@@ -1,7 +0,0 @@
1
- export declare function trasformPdfToMarkdown(buffer: Buffer): Promise<string>;
2
- export declare function extractImagesFromPdfWithApryse(buffer: Buffer, minHw?: number): Promise<{
3
- page: number;
4
- imgCount: number;
5
- path: string;
6
- }[]>;
7
- //# sourceMappingURL=pdf.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"pdf.d.ts","sourceRoot":"","sources":["../../src/pdf.ts"],"names":[],"mappings":"AAWA,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,MAAM,mBAGnD;AAyDD,wBAAsB,8BAA8B,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,GAAE,MAAY;;;;KAqBvF"}
package/src/mutool2.ts DELETED
@@ -1,19 +0,0 @@
1
- import * as mupdf from "mupdf";
2
- import fs from "fs";
3
-
4
- async function test() {
5
-
6
- const doc = mupdf.Document.openDocument(fs.readFileSync("./fixtures/test-pdf1.pdf"), "application/pdf");
7
-
8
- //const count = doc.countPages();
9
- for (let i = 0; i < 5; i++) {
10
- const page = doc.loadPage(i);
11
- const stext = page.toStructuredText();
12
- console.log("Page ================= ", i);
13
- console.log("=================!!!!!!", stext.asText());
14
- //console.log(JSON.stringify(JSON.parse(stext.asJSON()), undefined, 2));
15
- //console.log("=================!!!!!!", stext);
16
- }
17
- }
18
-
19
- test();
package/src/pdf-test.ts DELETED
@@ -1,21 +0,0 @@
1
- /**
2
- * IMPORTANT: DO NOT RUN IN VITEST, VITEST DOESN'T WORK WITH APRYSE
3
- */
4
-
5
- import fs from 'fs';
6
- import path from 'path';
7
- import { extractImagesFromPdfWithApryse } from "./pdf.js";
8
-
9
- const main = async () => {
10
-
11
- const pdfPath = path.resolve(__dirname, '../../../fixtures', 'test-pdf2.pdf');
12
- const pdfBuffer = fs.readFileSync(pdfPath);
13
-
14
- console.log('start extracting images from pdf');
15
- const result: any = await extractImagesFromPdfWithApryse(pdfBuffer);
16
-
17
- console.log('result: ', result);
18
-
19
- }
20
-
21
- main();
package/src/pdf.test.ts DELETED
@@ -1,25 +0,0 @@
1
- import fs from 'fs';
2
- import path from 'path';
3
- import { expect, test } from 'vitest';
4
- import { trasformPdfToMarkdown } from './pdf';
5
-
6
-
7
-
8
- test('Converts a PDF to markdown', async () => {
9
- const pdfPath = path.resolve(__dirname, '../fixtures', 'test-pdf1.pdf');
10
- const pdfBuffer = fs.readFileSync(pdfPath);
11
- const result = await trasformPdfToMarkdown(pdfBuffer);
12
-
13
- expect(result).toContain('America');
14
-
15
-
16
- });
17
-
18
- test('Converts another PDF to markdown', async () => {
19
- const pdfPath = path.resolve(__dirname, '../fixtures', 'test-pdf2.pdf');
20
- const pdfBuffer = fs.readFileSync(pdfPath);
21
- const result = await trasformPdfToMarkdown(pdfBuffer);
22
-
23
- expect(result).toContain('America');
24
-
25
- });
package/src/pdf.ts DELETED
@@ -1,93 +0,0 @@
1
- import pdf2md from "@opendocsg/pdf2md";
2
- import type { PDFNet as PDFTron } from '@pdftron/pdfnet-node';
3
- import fs from 'fs';
4
- import os from 'os';
5
-
6
- import pkg from '@pdftron/pdfnet-node';
7
- const { PDFNet } = pkg;
8
-
9
-
10
- const pdf2mdFn = pdf2md as unknown as (buffer: Uint8Array) => Promise<string>;
11
-
12
- export function trasformPdfToMarkdown(buffer: Buffer) {
13
- const arr = new Uint8Array(buffer);
14
- return pdf2mdFn(arr);
15
- }
16
-
17
-
18
-
19
- async function extractImages(buffer: Buffer, minHw: number = 300) {
20
- const doc = await PDFNet.PDFDoc.createFromBuffer(buffer);
21
- const reader = await PDFNet.ElementReader.create();
22
- const tmpDir = os.tmpdir()
23
- const workingDir = fs.mkdtempSync(`${tmpDir}/pdfextract_`);
24
-
25
- // Read page content on every page in the document
26
- const itr = await doc.getPageIterator();
27
- for (itr; await itr.hasNext(); itr.next()) {
28
- // Read the page
29
- const page = await itr.current();
30
- const pageNumber = await page.getIndex();
31
- reader.beginOnPage(page);
32
- await ProcessElements(reader, pageNumber);
33
- reader.end();
34
- }
35
-
36
- return { workingDir };
37
-
38
- async function ProcessElements(reader: PDFTron.ElementReader, pageNumber: number) {
39
- // Traverse the page display list
40
- let imgCount = 1;
41
-
42
- for (let element = await reader.next(); element !== null; element = await reader.next()) {
43
- const elementType = await element.getType();
44
- switch (elementType) {
45
- case PDFNet.Element.Type.e_image:
46
- {
47
- const image = await PDFNet.Image.createFromObj(await element.getXObject());
48
- const h = await image.getImageHeight();
49
- const w = await image.getImageWidth();
50
- //console.log(`Image: width=${w}, height=${h}`);
51
- //do not extract if image is too small, likely not relevant
52
- //TODO: use LLM to decide if it matters?
53
- if (w < minHw && h < minHw) {
54
- break;
55
- }
56
- const imgName = `${workingDir}/img_${pageNumber}_${imgCount++}.png`;
57
- image.exportAsPng(imgName);
58
- break;
59
- }
60
- case PDFNet.Element.Type.e_form:
61
- {
62
- reader.formBegin();
63
- ProcessElements(reader, pageNumber);
64
- reader.end();
65
- break;
66
- }
67
- }
68
- }
69
- }
70
- }
71
-
72
- export async function extractImagesFromPdfWithApryse(buffer: Buffer, minHw: number = 300) {
73
- const APRYSE_KEY = process.env.APRYSE_KEY;
74
-
75
- const extractImagesWrapper = async () => {
76
- return await extractImages(buffer, minHw);
77
- };
78
-
79
- const res = await PDFNet.runWithCleanup(extractImagesWrapper, APRYSE_KEY).then((res) => {
80
- return res;
81
- }).finally(() => PDFNet.shutdown());
82
-
83
-
84
- //read all images in the directory
85
- const files = fs.readdirSync(res.workingDir);
86
-
87
- const images = files.map((file) => {
88
- const [pageNumber, imgCount] = file.split('.')[0].split('_').slice(1);
89
- return { page: parseInt(pageNumber), imgCount: parseInt(imgCount), path: file };
90
- });
91
-
92
- return images;
93
- }