@vertesia/converters 0.42.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +13 -0
- package/lib/esm/image.js +57 -0
- package/lib/esm/image.js.map +1 -0
- package/lib/esm/index.js +5 -0
- package/lib/esm/index.js.map +1 -0
- package/lib/esm/mutool.js +39 -0
- package/lib/esm/mutool.js.map +1 -0
- package/lib/esm/mutool2.js +16 -0
- package/lib/esm/mutool2.js.map +1 -0
- package/lib/esm/pandoc.js +40 -0
- package/lib/esm/pandoc.js.map +1 -0
- package/lib/esm/pdf-test.js +15 -0
- package/lib/esm/pdf-test.js.map +1 -0
- package/lib/esm/pdf.js +75 -0
- package/lib/esm/pdf.js.map +1 -0
- package/lib/types/image.d.ts +17 -0
- package/lib/types/image.d.ts.map +1 -0
- package/lib/types/index.d.ts +5 -0
- package/lib/types/index.d.ts.map +1 -0
- package/lib/types/mutool.d.ts +4 -0
- package/lib/types/mutool.d.ts.map +1 -0
- package/lib/types/mutool2.d.ts +2 -0
- package/lib/types/mutool2.d.ts.map +1 -0
- package/lib/types/pandoc.d.ts +7 -0
- package/lib/types/pandoc.d.ts.map +1 -0
- package/lib/types/pdf-test.d.ts +5 -0
- package/lib/types/pdf-test.d.ts.map +1 -0
- package/lib/types/pdf.d.ts +7 -0
- package/lib/types/pdf.d.ts.map +1 -0
- package/package.json +38 -0
- package/src/image.test.ts +25 -0
- package/src/image.ts +78 -0
- package/src/index.ts +12 -0
- package/src/mutool.test.ts +14 -0
- package/src/mutool.ts +45 -0
- package/src/mutool2.ts +19 -0
- package/src/pandoc.test.ts +14 -0
- package/src/pandoc.ts +46 -0
- package/src/pdf-test.ts +21 -0
- package/src/pdf.test.ts +25 -0
- package/src/pdf.ts +93 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Copyright 2024 Composable
|
|
2
|
+
|
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
you may not use this file except in compliance with the License.
|
|
5
|
+
You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
See the License for the specific language governing permissions and
|
|
13
|
+
limitations under the License.
|
package/lib/esm/image.js
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import sharp from "sharp";
|
|
2
|
+
/**
 * Builds the sharp pipeline for `input`, applying the optional bounding-box
 * resize and output-format conversion requested in `opts`.
 *
 * @param input image data handed to sharp, or a readable stream (any value exposing `pipe`)
 * @param opts `max_hw` caps both width and height; `format` selects the output format
 * @returns the configured sharp instance
 */
export function createImageTransformer(input, opts) {
    // Streams are piped into a fresh sharp instance; anything else goes to sharp directly.
    let pipeline = input.pipe ? input.pipe(sharp()) : sharp(input);
    const { max_hw: maxSide, format } = opts;
    if (maxSide) {
        // Fit inside a maxSide x maxSide box without ever upscaling.
        const bounds = {
            width: maxSide,
            height: maxSide,
            fit: sharp.fit.inside,
            withoutEnlargement: true,
        };
        pipeline = pipeline.resize(bounds);
    }
    if (format) {
        pipeline = pipeline.toFormat(format);
    }
    return pipeline;
}
|
|
18
|
+
/**
 * Runs `input` through the sharp pipeline described by `opts` and pipes the
 * transformed image into `output`.
 *
 * @param input image data handed to sharp, or a readable stream
 * @param output writable stream receiving the transformed image
 * @param opts resize / format options forwarded to `createImageTransformer`
 * @returns a promise resolved with the sharp instance once `output` emits
 *          "finish"; rejected with the first error emitted by the input
 *          stream, the sharp pipeline or the output stream
 */
export async function transformImage(input, output, opts) {
    const sh = createImageTransformer(input, opts);
    return new Promise((resolve, reject) => {
        let settled = false;
        const handleError = (err) => {
            // Destroying the streams below can surface follow-up errors; report only the first.
            if (settled) {
                return;
            }
            settled = true;
            console.error('Failed to transform', err);
            try {
                if (input.pipe && input.destroy) {
                    input.destroy();
                }
                if (output.destroy) {
                    output.destroy();
                }
                sh.destroy();
            }
            finally {
                reject(err);
            }
        };
        output.on('error', handleError);
        // pipe() does not forward errors, so failures inside sharp (e.g. undecodable
        // input) must be observed on the sharp stream itself.
        sh.on('error', handleError);
        if (input.pipe && input.on) {
            input.on('error', handleError);
        }
        output.on("finish", () => {
            settled = true;
            resolve(sh);
        });
        // Start the flow only after every listener is attached.
        sh.pipe(output);
    });
}
|
|
49
|
+
/**
 * Transforms `input` according to `opts` and returns the result of the
 * transformer's `toBuffer()` call.
 */
export function transformImageToBuffer(input, opts) {
    return createImageTransformer(input, opts).toBuffer();
}
|
|
53
|
+
/**
 * Transforms `input` according to `opts` and writes the result to the file
 * path `output`. Resolves with no value once the write has completed.
 */
export async function transformImageToFile(input, output, opts) {
    const transformer = createImageTransformer(input, opts);
    await transformer.toFile(output);
}
|
|
57
|
+
//# sourceMappingURL=image.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"image.js","sourceRoot":"","sources":["../../src/image.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,OAAO,CAAC;AAoB1B,MAAM,UAAU,sBAAsB,CAAC,KAAqB,EAAE,IAAsB;IAChF,MAAM,aAAa,GAAG,CAAC,CAAE,KAA+B,CAAC,IAAI,CAAC;IAC9D,IAAI,EAAE,GAAG,aAAa,CAAC,CAAC,CAAE,KAA+B,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAY,CAAC,CAAC;IAC9F,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;QACd,EAAE,GAAG,EAAE,CAAC,MAAM,CAAC;YACX,KAAK,EAAE,IAAI,CAAC,MAAM;YAClB,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,GAAG,EAAE,KAAK,CAAC,GAAG,CAAC,MAAM;YACrB,kBAAkB,EAAE,IAAI;SAC3B,CAAC,CAAC;IACP,CAAC;IACD,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;QACd,EAAE,GAAG,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAClC,CAAC;IACD,OAAO,EAAE,CAAC;AACd,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,KAAqB,EAAE,MAA6B,EAAE,IAAsB;IAC7G,MAAM,EAAE,GAAG,sBAAsB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IAC/C,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAEhB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACnC,MAAM,WAAW,GAAG,CAAC,GAAQ,EAAE,EAAE;YAC7B,OAAO,CAAC,KAAK,CAAC,qBAAqB,EAAE,GAAG,CAAC,CAAC;YAC1C,IAAI,CAAC;gBACD,IAAK,KAAa,CAAC,IAAI,IAAK,KAAa,CAAC,OAAO,EAAE,CAAC;oBAC/C,KAAa,CAAC,OAAO,EAAE,CAAC;gBAC7B,CAAC;gBACD,IAAK,MAAc,CAAC,OAAO,EAAE,CAAC;oBACzB,MAAc,CAAC,OAAO,EAAE,CAAC;gBAC9B,CAAC;gBACD,EAAE,CAAC,OAAO,EAAE,CAAC;YACjB,CAAC;oBAAS,CAAC;gBACP,MAAM,CAAC,GAAG,CAAC,CAAC;YAChB,CAAC;QACL,CAAC,CAAA;QACD,MAAM,CAAC,EAAE,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;QAC/B,KAAa,CAAC,IAAI,IAAK,KAAa,CAAC,EAAE,IAAK,KAAa,CAAC,EAAE,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;QACpF,MAAM,CAAC,EAAE,CAAC,QAAQ,EAAE,GAAG,EAAE;YACrB,OAAO,CAAC,EAAE,CAAC,CAAC;QAChB,CAAC,CAAC,CAAC;IACP,CAAC,CAAC,CAAC;AACP,CAAC;AAED,MAAM,UAAU,sBAAsB,CAAC,KAAqB,EAAE,IAAsB;IAChF,MAAM,EAAE,GAAG,sBAAsB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IAC/C,OAAO,EAAE,CAAC,QAAQ,EAAE,CAAC;AACzB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CAAC,KAAqB,EAAE,MAAc,EAAE,IAAsB;IACpG,MAAM,EAAE,GAAG,sBAAsB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IAC/C,MAAM,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;AAC5B,CAAC"}
|
package/lib/esm/index.js
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import { transformImage, transformImageToBuffer, transformImageToFile } from './image.js';
|
|
2
|
+
import { pdfFileToText, pdfToText, pdfToTextBuffer } from './mutool.js';
|
|
3
|
+
import { manyToMarkdown } from './pandoc.js';
|
|
4
|
+
export { manyToMarkdown, pdfFileToText, pdfToText, pdfToTextBuffer, transformImage, transformImageToBuffer, transformImageToFile };
|
|
5
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,sBAAsB,EAAE,oBAAoB,EAAE,MAAM,YAAY,CAAC;AAC1F,OAAO,EAAE,aAAa,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAE7C,OAAO,EACH,cAAc,EACd,aAAa,EAAE,SAAS,EACxB,eAAe,EACf,cAAc,EACd,sBAAsB,EACtB,oBAAoB,EACvB,CAAC"}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { spawn } from 'child_process';
|
|
2
|
+
import fs from 'fs';
|
|
3
|
+
import { readFile } from "fs/promises";
|
|
4
|
+
import tmp from 'tmp';
|
|
5
|
+
tmp.setGracefulCleanup();
|
|
6
|
+
/**
 * Converts the document at `input` into the file `output` by running
 * `mutool convert -o <output> <input>`. Requires the `mutool` binary.
 *
 * @param input path of the source file
 * @param output path of the file mutool should write
 * @returns a promise resolved with `output` when mutool exits with code 0;
 *          rejected when the process cannot be spawned, exits with a non-zero
 *          code, or is terminated by a signal
 */
export function pdfFileToText(input, output) {
    return new Promise((resolve, reject) => {
        // stdin/stdout are not needed; stderr is drained so a chatty mutool can
        // never stall on a full pipe, and its text is kept for the error message.
        const command = spawn("mutool", ["convert", "-o", output, input], {
            stdio: ['ignore', 'ignore', 'pipe'],
        });
        const stderrChunks = [];
        command.stderr.on('data', (chunk) => {
            stderrChunks.push(chunk);
        });
        command.on('error', (err) => {
            reject(err);
        });
        // 'close' fires once, after the process ended and its stdio streams closed.
        command.on('close', (code, signal) => {
            if (code === 0) {
                resolve(output);
                return;
            }
            // code is null when the process was terminated by a signal.
            const reason = code === null
                ? `mutool was terminated by signal ${signal}`
                : `mutool exited with code ${code}`;
            const details = Buffer.concat(stderrChunks).toString('utf-8').trim();
            reject(new Error(details ? `${reason}: ${details}` : reason));
        });
    });
}
|
|
28
|
+
/**
 * Converts an in-memory PDF via `pdfToTextBuffer` and decodes the resulting
 * bytes as UTF-8.
 */
export function pdfToText(buffer) {
    const decodeUtf8 = (converted) => converted.toString('utf-8');
    return pdfToTextBuffer(buffer).then(decodeUtf8);
}
|
|
31
|
+
/**
 * Converts an in-memory PDF by writing it to a temporary `.pdf` file, running
 * `pdfFileToText` into a temporary `.txt` file and reading that file back.
 * Both temporary files are removed once the conversion settled, whether it
 * succeeded or failed.
 *
 * @param buffer PDF content
 * @returns a promise resolved with the content of the generated file
 */
export function pdfToTextBuffer(buffer) {
    const inputFile = tmp.fileSync({ postfix: '.pdf' });
    // Best-effort cleanup: a failure to delete must not mask the conversion result.
    const cleanup = (targetFileName) => {
        try {
            // Closes the descriptor and unlinks the temporary PDF.
            inputFile.removeCallback();
        }
        catch {
            // ignored on purpose
        }
        if (targetFileName) {
            try {
                fs.rmSync(targetFileName, { force: true });
            }
            catch {
                // ignored on purpose
            }
        }
    };
    let targetFileName;
    try {
        targetFileName = tmp.tmpNameSync({ postfix: '.txt' });
        // writeFileSync keeps writing until the whole buffer is on disk.
        fs.writeFileSync(inputFile.fd, buffer);
    }
    catch (err) {
        cleanup(targetFileName);
        throw err;
    }
    return pdfFileToText(inputFile.name, targetFileName)
        .then(() => readFile(targetFileName))
        .finally(() => cleanup(targetFileName));
}
|
|
39
|
+
//# sourceMappingURL=mutool.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mutool.js","sourceRoot":"","sources":["../../src/mutool.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,eAAe,CAAC;AACtC,OAAO,EAAE,MAAM,IAAI,CAAC;AACpB,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AACvC,OAAO,GAAG,MAAM,KAAK,CAAC;AACtB,GAAG,CAAC,kBAAkB,EAAE,CAAC;AAEzB,MAAM,UAAU,aAAa,CAAC,KAAa,EAAE,MAAc;IACvD,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QAEnC,MAAM,OAAO,GAAG,KAAK,CAAC,QAAQ,EAAE,CAAC,SAAS,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC;QAElE,OAAO,CAAC,EAAE,CAAC,MAAM,EAAE,UAAU,IAAI;YAC7B,IAAI,IAAI,EAAE,CAAC;gBACP,MAAM,CAAC,IAAI,KAAK,CAAC,2BAA2B,IAAI,EAAE,CAAC,CAAC,CAAC;YACzD,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,UAAU,IAAI;YAC9B,IAAI,IAAI,EAAE,CAAC;gBACP,MAAM,CAAC,IAAI,KAAK,CAAC,2BAA2B,IAAI,EAAE,CAAC,CAAC,CAAC;YACzD,CAAC;iBAAM,CAAC;gBACJ,OAAO,OAAO,CAAC,MAAM,CAAC,CAAC;YAC3B,CAAC;YAAA,CAAC;QACN,CAAC,CAAC,CAAC;QAEH,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,GAAG,EAAE,EAAE;YACxB,MAAM,CAAC,GAAG,CAAC,CAAC;QAChB,CAAC,CAAC,CAAC;IAEP,CAAC,CAAC,CAAC;AAEP,CAAC;AACD,MAAM,UAAU,SAAS,CAAC,MAAc;IACpC,OAAO,eAAe,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;AAC9E,CAAC;AACD,MAAM,UAAU,eAAe,CAAC,MAAc;IAC1C,MAAM,SAAS,GAAG,GAAG,CAAC,QAAQ,CAAC,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC;IACpD,MAAM,cAAc,GAAG,GAAG,CAAC,WAAW,CAAC,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC;IAE5D,EAAE,CAAC,SAAS,CAAC,SAAS,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC;IAEnC,OAAO,aAAa,CAAC,SAAS,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE;QAC3D,OAAO,QAAQ,CAAC,cAAc,CAAC,CAAC;IACpC,CAAC,CAAC,CAAC;AACP,CAAC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import * as mupdf from "mupdf";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
/**
 * Manual smoke check: opens ./fixtures/test-pdf1.pdf with mupdf and prints the
 * structured text of its first pages (at most five).
 */
async function test() {
    const doc = mupdf.Document.openDocument(fs.readFileSync("./fixtures/test-pdf1.pdf"), "application/pdf");
    // Never ask for a page the document does not have.
    const pageLimit = Math.min(5, doc.countPages());
    for (let i = 0; i < pageLimit; i++) {
        const page = doc.loadPage(i);
        const stext = page.toStructuredText();
        console.log("Page ================= ", i);
        console.log("=================!!!!!!", stext.asText());
        //console.log(JSON.stringify(JSON.parse(stext.asJSON()), undefined, 2));
    }
}
|
|
15
|
+
test();
|
|
16
|
+
//# sourceMappingURL=mutool2.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mutool2.js","sourceRoot":"","sources":["../../src/mutool2.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,KAAK,MAAM,OAAO,CAAC;AAC/B,OAAO,EAAE,MAAM,IAAI,CAAC;AAEpB,KAAK,UAAU,IAAI;IAEf,MAAM,GAAG,GAAG,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC,YAAY,CAAC,0BAA0B,CAAC,EAAE,iBAAiB,CAAC,CAAC;IAExG,iCAAiC;IACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QACzB,MAAM,IAAI,GAAG,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;QAC7B,MAAM,KAAK,GAAG,IAAI,CAAC,gBAAgB,EAAE,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,yBAAyB,EAAE,CAAC,CAAC,CAAC;QAC1C,OAAO,CAAC,GAAG,CAAC,yBAAyB,EAAE,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;QACvD,wEAAwE;QACxE,gDAAgD;IACpD,CAAC;AACL,CAAC;AAED,IAAI,EAAE,CAAC"}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { spawn } from 'child_process';
|
|
2
|
+
import { PassThrough } from 'stream';
|
|
3
|
+
/**
 * Buffer convenience wrapper around `manyToMarkdown`: the buffer is written to
 * an already-ended PassThrough stream which is then converted.
 */
export function manyToMarkdownFromBuffer(buffer, fromFormat) {
    const bufferStream = new PassThrough();
    bufferStream.end(buffer);
    const markdown = manyToMarkdown(bufferStream, fromFormat);
    return markdown;
}
|
|
8
|
+
/**
 * Converts a document to markdown by streaming it through
 * `pandoc -t markdown -f <fromFormat>`.
 * Requires pandoc to be installed on the system.
 *
 * @param input readable stream with the document content
 * @param fromFormat is the format of the input, passed to pandoc's `-f` option
 * @returns a promise resolved with pandoc's standard output decoded as UTF-8;
 *          rejected when pandoc cannot be spawned, exits with a non-zero code,
 *          is terminated by a signal, or when `input` emits an error
 */
export function manyToMarkdown(input, fromFormat) {
    return new Promise((resolve, reject) => {
        const command = spawn("pandoc", ["-t", "markdown", '-f', fromFormat], {
            stdio: 'pipe',
        });
        const stdoutChunks = [];
        const stderrChunks = [];
        // Keep raw buffers and decode once at the end: decoding chunk by chunk
        // corrupts multi-byte characters that straddle a chunk boundary.
        command.stdout.on('data', (chunk) => {
            stdoutChunks.push(chunk);
        });
        // Drain stderr so pandoc cannot stall on a full pipe; keep it for diagnostics.
        command.stderr.on('data', (chunk) => {
            stderrChunks.push(chunk);
        });
        // A write error here (e.g. EPIPE) means pandoc stopped reading early;
        // the outcome is reported from the exit status in the 'close' handler.
        command.stdin.on('error', () => { });
        // pipe() neither forwards source errors nor ends stdin on failure, so
        // stop pandoc explicitly instead of leaving it waiting for more input.
        input.on('error', (err) => {
            command.kill();
            reject(err);
        });
        command.on('error', (err) => {
            reject(err);
        });
        // 'close' fires once, after the process ended and its stdio streams closed.
        command.on('close', (code, signal) => {
            if (code === 0) {
                resolve(Buffer.concat(stdoutChunks).toString('utf-8'));
                return;
            }
            // code is null when the process was terminated by a signal.
            const reason = code === null
                ? `pandoc was terminated by signal ${signal}`
                : `pandoc exited with code ${code}`;
            const details = Buffer.concat(stderrChunks).toString('utf-8').trim();
            reject(new Error(details ? `${reason}: ${details}` : reason));
        });
        input.pipe(command.stdin);
    });
}
|
|
40
|
+
//# sourceMappingURL=pandoc.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pandoc.js","sourceRoot":"","sources":["../../src/pandoc.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,eAAe,CAAC;AACtC,OAAO,EAAE,WAAW,EAAE,MAAM,QAAQ,CAAC;AAErC,MAAM,UAAU,wBAAwB,CAAC,MAAc,EAAE,UAAkB;IACzE,MAAM,KAAK,GAAG,IAAI,WAAW,EAAE,CAAC;IAChC,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;IAClB,OAAO,cAAc,CAAC,KAAK,EAAE,UAAU,CAAC,CAAC;AAE3C,CAAC;AACD;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,KAA4B,EAAE,UAAkB;IAE7E,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,IAAI,MAAM,GAAa,EAAE,CAAC;QAE1B,MAAM,OAAO,GAAG,KAAK,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,UAAU,CAAC,EAAE;YACpE,KAAK,EAAE,MAAM;SACd,CAAC,CAAC;QACH,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;QAE1B,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,UAAU,IAAY;YAC9C,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC;QAC/B,CAAC,CAAC,CAAC;QACH,OAAO,CAAC,EAAE,CAAC,MAAM,EAAE,UAAU,IAAI;YAC/B,IAAI,IAAI,EAAE,CAAC;gBACT,MAAM,CAAC,IAAI,KAAK,CAAC,2BAA2B,IAAI,EAAE,CAAC,CAAC,CAAC;YACvD,CAAC;QACH,CAAC,CAAC,CAAC;QACH,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,UAAU,IAAI;YAChC,IAAI,IAAI,EAAE,CAAC;gBACT,MAAM,CAAC,IAAI,KAAK,CAAC,2BAA2B,IAAI,EAAE,CAAC,CAAC,CAAC;YACvD,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAA;YAC1B,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,GAAG,EAAE,EAAE;YAC1B,MAAM,CAAC,GAAG,CAAC,CAAC;QACd,CAAC,CAAC,CAAC;IAEL,CAAC,CAAC,CAAC;AAEL,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* IMPORTANT: DO NOT RUN IN VITEST, VITEST DOESN'T WORK WITH APRYSE
|
|
3
|
+
*/
|
|
4
|
+
import fs from 'fs';
|
|
5
|
+
import path from 'path';
|
|
6
|
+
import { extractImagesFromPdfWithApryse } from "./pdf.js";
|
|
7
|
+
/**
 * Manual check: reads the test-pdf2.pdf fixture, extracts its images with
 * Apryse and logs the result.
 */
const main = async () => {
    // This file is an ES module, where `__dirname` is not defined; resolve the
    // fixture relative to this module's own URL instead (same relative location).
    const pdfUrl = new URL('../../../fixtures/test-pdf2.pdf', import.meta.url);
    const pdfBuffer = fs.readFileSync(pdfUrl);
    console.log('start extracting images from pdf');
    const result = await extractImagesFromPdfWithApryse(pdfBuffer);
    console.log('result: ', result);
};
|
|
14
|
+
main();
|
|
15
|
+
//# sourceMappingURL=pdf-test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf-test.js","sourceRoot":"","sources":["../../src/pdf-test.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,MAAM,IAAI,CAAC;AACpB,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,8BAA8B,EAAE,MAAM,UAAU,CAAC;AAE1D,MAAM,IAAI,GAAG,KAAK,IAAI,EAAE;IAEpB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,mBAAmB,EAAE,eAAe,CAAC,CAAC;IAC9E,MAAM,SAAS,GAAG,EAAE,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC;IAE3C,OAAO,CAAC,GAAG,CAAC,kCAAkC,CAAC,CAAC;IAChD,MAAM,MAAM,GAAQ,MAAM,8BAA8B,CAAC,SAAS,CAAC,CAAC;IAEpE,OAAO,CAAC,GAAG,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;AAEpC,CAAC,CAAA;AAED,IAAI,EAAE,CAAC"}
|
package/lib/esm/pdf.js
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import pdf2md from "@opendocsg/pdf2md";
|
|
2
|
+
import fs from 'fs';
|
|
3
|
+
import os from 'os';
|
|
4
|
+
import pkg from '@pdftron/pdfnet-node';
|
|
5
|
+
const { PDFNet } = pkg;
|
|
6
|
+
const pdf2mdFn = pdf2md;
|
|
7
|
+
/**
 * Converts a PDF to markdown: the buffer is copied into a `Uint8Array` and
 * handed to the pdf2md converter, whose result is returned as is.
 */
export function trasformPdfToMarkdown(buffer) {
    return pdf2mdFn(new Uint8Array(buffer));
}
|
|
11
|
+
/**
 * Exports the images of a PDF as PNG files named
 * `img_<page>_<n>.png` into a freshly created temporary directory.
 * Must run inside an initialised PDFNet session.
 *
 * @param buffer PDF content
 * @param minHw an image is skipped only when both its width and its height
 *              are below this value
 * @returns `{ workingDir }`, the directory holding the exported PNG files
 */
async function extractImages(buffer, minHw = 300) {
    const doc = await PDFNet.PDFDoc.createFromBuffer(buffer);
    const reader = await PDFNet.ElementReader.create();
    const workingDir = fs.mkdtempSync(`${os.tmpdir()}/pdfextract_`);
    // Read page content on every page in the document
    const itr = await doc.getPageIterator();
    for (itr; await itr.hasNext(); await itr.next()) {
        const page = await itr.current();
        const pageNumber = await page.getIndex();
        // One counter per page, shared with nested forms so that images found
        // inside a form never reuse a file name already taken on that page.
        const counter = { next: 1 };
        await reader.beginOnPage(page);
        await processElements(pageNumber, counter);
        await reader.end();
    }
    return { workingDir };

    // Walks the current display list of `reader`, recursing into forms.
    async function processElements(pageNumber, counter) {
        for (let element = await reader.next(); element !== null; element = await reader.next()) {
            const elementType = await element.getType();
            switch (elementType) {
                case PDFNet.Element.Type.e_image: {
                    const image = await PDFNet.Image.createFromObj(await element.getXObject());
                    const h = await image.getImageHeight();
                    const w = await image.getImageWidth();
                    //do not extract if image is too small, likely not relevant
                    //TODO: use LLM to decide if it matters?
                    if (w < minHw && h < minHw) {
                        break;
                    }
                    const imgName = `${workingDir}/img_${pageNumber}_${counter.next++}.png`;
                    // Wait for the file to be written before moving on, so every
                    // export has completed by the time this function returns.
                    await image.exportAsPng(imgName);
                    break;
                }
                case PDFNet.Element.Type.e_form: {
                    await reader.formBegin();
                    // The nested walk must finish before the form is closed.
                    await processElements(pageNumber, counter);
                    await reader.end();
                    break;
                }
            }
        }
    }
}
|
|
59
|
+
/**
 * Extracts the images of a PDF with Apryse (PDFNet), using the license key
 * read from the `APRYSE_KEY` environment variable. PDFNet is shut down after
 * the extraction, whether it succeeded or failed.
 *
 * @param buffer PDF content
 * @param minHw size threshold forwarded to the extraction (default 300)
 * @returns one entry per exported file: `page` and `imgCount` are parsed from
 *          the `img_<page>_<imgCount>.png` file name, and `path` is the full
 *          path of the file inside the temporary extraction directory
 */
export async function extractImagesFromPdfWithApryse(buffer, minHw = 300) {
    const APRYSE_KEY = process.env.APRYSE_KEY;
    let workingDir;
    try {
        ({ workingDir } = await PDFNet.runWithCleanup(() => extractImages(buffer, minHw), APRYSE_KEY));
    }
    finally {
        await PDFNet.shutdown();
    }
    //read all images in the directory
    return fs.readdirSync(workingDir).map((file) => {
        const [pageNumber, imgCount] = file.split('.')[0].split('_').slice(1);
        return {
            page: parseInt(pageNumber, 10),
            imgCount: parseInt(imgCount, 10),
            // The directory is random and not returned separately, so a bare
            // file name would leave callers unable to locate the image.
            path: `${workingDir}/${file}`,
        };
    });
}
|
|
75
|
+
//# sourceMappingURL=pdf.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf.js","sourceRoot":"","sources":["../../src/pdf.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,mBAAmB,CAAC;AAEvC,OAAO,EAAE,MAAM,IAAI,CAAC;AACpB,OAAO,EAAE,MAAM,IAAI,CAAC;AAEpB,OAAO,GAAG,MAAM,sBAAsB,CAAC;AACvC,MAAM,EAAE,MAAM,EAAE,GAAG,GAAG,CAAC;AAGvB,MAAM,QAAQ,GAAG,MAA4D,CAAC;AAE9E,MAAM,UAAU,qBAAqB,CAAC,MAAc;IAChD,MAAM,GAAG,GAAG,IAAI,UAAU,CAAC,MAAM,CAAC,CAAC;IACnC,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC;AACzB,CAAC;AAID,KAAK,UAAU,aAAa,CAAC,MAAc,EAAE,QAAgB,GAAG;IAC5D,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;IACzD,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,MAAM,EAAE,CAAC;IACnD,MAAM,MAAM,GAAG,EAAE,CAAC,MAAM,EAAE,CAAA;IAC1B,MAAM,UAAU,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,MAAM,cAAc,CAAC,CAAC;IAE3D,kDAAkD;IAClD,MAAM,GAAG,GAAG,MAAM,GAAG,CAAC,eAAe,EAAE,CAAC;IACxC,KAAK,GAAG,EAAE,MAAM,GAAG,CAAC,OAAO,EAAE,EAAE,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC;QACxC,gBAAgB;QAChB,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,EAAE,CAAC;QACjC,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,QAAQ,EAAE,CAAC;QACzC,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACzB,MAAM,eAAe,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;QAC1C,MAAM,CAAC,GAAG,EAAE,CAAC;IACjB,CAAC;IAED,OAAO,EAAE,UAAU,EAAE,CAAC;IAEtB,KAAK,UAAU,eAAe,CAAC,MAA6B,EAAE,UAAkB;QAC5E,iCAAiC;QACjC,IAAI,QAAQ,GAAG,CAAC,CAAC;QAEjB,KAAK,IAAI,OAAO,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,EAAE,OAAO,KAAK,IAAI,EAAE,OAAO,GAAG,MAAM,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC;YACtF,MAAM,WAAW,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;YAC5C,QAAQ,WAAW,EAAE,CAAC;gBAClB,KAAK,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,OAAO;oBAC5B,CAAC;wBACG,MAAM,KAAK,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,aAAa,CAAC,MAAM,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC;wBAC3E,MAAM,CAAC,GAAG,MAAM,KAAK,CAAC,cAAc,EAAE,CAAC;wBACvC,MAAM,CAAC,GAAG,MAAM,KAAK,CAAC,aAAa,EAAE,CAAC;wBACtC,gDAAgD;wBAChD,2DAA2D;wBAC3D,wCAAwC;wBACxC,IAAI,CAAC,GAAG,KAAK,IAAI,CAAC,GAAG,KAAK,EAAE,CAAC;4BACzB,MAAM;wBACV,CAAC;wBACD,MAAM,OAAO,GAAG,GAAG,UAAU,QAAQ,UAAU,IAAI,QAAQ,EAAE,MAAM,CAAC;wBACpE,KAAK,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;wBAC3B,MAAM;oBACV,CAAC;gBACL,KAAK,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM;oBAC3B,CAAC;wBACG,MAA
M,CAAC,SAAS,EAAE,CAAC;wBACnB,eAAe,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;wBACpC,MAAM,CAAC,GAAG,EAAE,CAAC;wBACb,MAAM;oBACV,CAAC;YACT,CAAC;QACL,CAAC;IACL,CAAC;AACL,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,8BAA8B,CAAC,MAAc,EAAE,QAAgB,GAAG;IACpF,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;IAE1C,MAAM,oBAAoB,GAAG,KAAK,IAAI,EAAE;QACpC,OAAO,MAAM,aAAa,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;IAC9C,CAAC,CAAC;IAEF,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,oBAAoB,EAAE,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,EAAE;QACnF,OAAO,GAAG,CAAC;IACf,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;IAGpC,kCAAkC;IAClC,MAAM,KAAK,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IAE7C,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QAC9B,MAAM,CAAC,UAAU,EAAE,QAAQ,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACtE,OAAO,EAAE,IAAI,EAAE,QAAQ,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,QAAQ,CAAC,QAAQ,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IACpF,CAAC,CAAC,CAAC;IAEH,OAAO,MAAM,CAAC;AAClB,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import sharp from "sharp";
|
|
2
|
+
/**
 * Options for the image transformers.
 * `max_hw`: when set, width and height are both limited to this value.
 * `format`: when set, the output is converted to this sharp format.
 */
export interface TransformOptions {
|
|
3
|
+
max_hw?: number;
|
|
4
|
+
format?: keyof sharp.FormatEnum;
|
|
5
|
+
}
|
|
6
|
+
/** Inputs accepted by the image transformers: binary data, a string, or a readable stream. */
type SharpInputType = Buffer | ArrayBuffer | Uint8Array | Uint8ClampedArray | Int8Array | Uint16Array | Int16Array | Uint32Array | Int32Array | Float32Array | Float64Array | string | NodeJS.ReadableStream;
|
|
7
|
+
/** Builds the sharp instance for `input`, applying the resize (`opts.max_hw`) and format (`opts.format`) options when set. */
export declare function createImageTransformer(input: SharpInputType, opts: TransformOptions): sharp.Sharp;
|
|
8
|
+
/**
|
|
9
|
+
* Transforms `input` according to `opts` and pipes the result into `output`.
|
|
10
|
+
* @param input image data or readable stream; @param output destination stream; @param opts resize/format options
|
|
11
|
+
* @returns promise resolved with the sharp instance once `output` emits "finish"; rejected when the input or output stream emits an error
|
|
12
|
+
*/
|
|
13
|
+
export declare function transformImage(input: SharpInputType, output: NodeJS.WritableStream, opts: TransformOptions): Promise<sharp.Sharp>;
|
|
14
|
+
/** Transforms `input` according to `opts` and resolves with the resulting image as a Buffer. */
export declare function transformImageToBuffer(input: SharpInputType, opts: TransformOptions): Promise<Buffer>;
|
|
15
|
+
/** Transforms `input` according to `opts` and writes the resulting image to the file path `output`. */
export declare function transformImageToFile(input: SharpInputType, output: string, opts: TransformOptions): Promise<void>;
|
|
16
|
+
export {};
|
|
17
|
+
//# sourceMappingURL=image.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"image.d.ts","sourceRoot":"","sources":["../../src/image.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,OAAO,CAAC;AAE1B,MAAM,WAAW,gBAAgB;IAC7B,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,KAAK,CAAC,UAAU,CAAA;CAClC;AAED,KAAK,cAAc,GAAG,MAAM,GACtB,WAAW,GACX,UAAU,GACV,iBAAiB,GACjB,SAAS,GACT,WAAW,GACX,UAAU,GACV,WAAW,GACX,UAAU,GACV,YAAY,GACZ,YAAY,GACZ,MAAM,GACN,MAAM,CAAC,cAAc,CAAA;AAC3B,wBAAgB,sBAAsB,CAAC,KAAK,EAAE,cAAc,EAAE,IAAI,EAAE,gBAAgB,eAenF;AAED;;;;GAIG;AACH,wBAAsB,cAAc,CAAC,KAAK,EAAE,cAAc,EAAE,MAAM,EAAE,MAAM,CAAC,cAAc,EAAE,IAAI,EAAE,gBAAgB,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAyBvI;AAED,wBAAgB,sBAAsB,CAAC,KAAK,EAAE,cAAc,EAAE,IAAI,EAAE,gBAAgB,GAAG,OAAO,CAAC,MAAM,CAAC,CAGrG;AAED,wBAAsB,oBAAoB,CAAC,KAAK,EAAE,cAAc,EAAE,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC,CAGvH"}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import { transformImage, transformImageToBuffer, transformImageToFile } from './image.js';
|
|
2
|
+
import { pdfFileToText, pdfToText, pdfToTextBuffer } from './mutool.js';
|
|
3
|
+
import { manyToMarkdown } from './pandoc.js';
|
|
4
|
+
export { manyToMarkdown, pdfFileToText, pdfToText, pdfToTextBuffer, transformImage, transformImageToBuffer, transformImageToFile };
|
|
5
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,sBAAsB,EAAE,oBAAoB,EAAE,MAAM,YAAY,CAAC;AAC1F,OAAO,EAAE,aAAa,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAE7C,OAAO,EACH,cAAc,EACd,aAAa,EAAE,SAAS,EACxB,eAAe,EACf,cAAc,EACd,sBAAsB,EACtB,oBAAoB,EACvB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mutool.d.ts","sourceRoot":"","sources":["../../src/mutool.ts"],"names":[],"mappings":"AAMA,wBAAgB,aAAa,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,oBAyB1D;AACD,wBAAgB,SAAS,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAEzD;AACD,wBAAgB,eAAe,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAS/D"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"mutool2.d.ts","sourceRoot":"","sources":["../../src/mutool2.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/** Buffer variant of `manyToMarkdown`: wraps `buffer` in a stream and converts it from `fromFormat` to markdown. */
export declare function manyToMarkdownFromBuffer(buffer: Buffer, fromFormat: string): Promise<string>;
|
|
2
|
+
/**
|
|
3
|
+
* Converts `input` to markdown by piping it through pandoc. Requires pandoc to be installed on the system.
|
|
4
|
+
* @param input stream with the document content; @param fromFormat is the format of the input, passed to pandoc's `-f` option.
|
|
5
|
+
* @returns promise resolved with pandoc's output; rejected when pandoc cannot be started or exits with a non-zero code
*/
|
|
6
|
+
export declare function manyToMarkdown(input: NodeJS.ReadableStream, fromFormat: string): Promise<string>;
|
|
7
|
+
//# sourceMappingURL=pandoc.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pandoc.d.ts","sourceRoot":"","sources":["../../src/pandoc.ts"],"names":[],"mappings":"AAGA,wBAAgB,wBAAwB,CAAC,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAK5F;AACD;;;GAGG;AACH,wBAAgB,cAAc,CAAC,KAAK,EAAE,MAAM,CAAC,cAAc,EAAE,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAgChG"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf-test.d.ts","sourceRoot":"","sources":["../../src/pdf-test.ts"],"names":[],"mappings":"AAAA;;GAEG"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/** Converts the PDF held in `buffer` to markdown using the pdf2md converter. */
export declare function trasformPdfToMarkdown(buffer: Buffer): Promise<string>;
|
|
2
|
+
/**
 * Extracts the images of a PDF with Apryse, using the license key from the
 * `APRYSE_KEY` environment variable. Resolves with one entry per exported
 * PNG; `page` and `imgCount` are parsed from its `img_<page>_<imgCount>.png`
 * file name. `minHw` defaults to 300.
 */
export declare function extractImagesFromPdfWithApryse(buffer: Buffer, minHw?: number): Promise<{
|
|
3
|
+
page: number;
|
|
4
|
+
imgCount: number;
|
|
5
|
+
path: string;
|
|
6
|
+
}[]>;
|
|
7
|
+
//# sourceMappingURL=pdf.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdf.d.ts","sourceRoot":"","sources":["../../src/pdf.ts"],"names":[],"mappings":"AAWA,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,MAAM,mBAGnD;AAyDD,wBAAsB,8BAA8B,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,GAAE,MAAY;;;;KAqBvF"}
|
package/package.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@vertesia/converters",
|
|
3
|
+
"version": "0.42.2",
|
|
4
|
+
"description": "Image and content converters",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"types": "./lib/types/index.d.ts",
|
|
7
|
+
"files": [
|
|
8
|
+
"lib",
|
|
9
|
+
"src"
|
|
10
|
+
],
|
|
11
|
+
"license": "Apache-2.0",
|
|
12
|
+
"exports": {
|
|
13
|
+
"types": "./lib/types/index.d.ts",
|
|
14
|
+
"import": "./lib/esm/index.js",
|
|
15
|
+
"require": "./lib/cjs/index.js"
|
|
16
|
+
},
|
|
17
|
+
"devDependencies": {
|
|
18
|
+
"@types/tmp": "^0.2.6",
|
|
19
|
+
"ts-dual-module": "^0.6.3",
|
|
20
|
+
"typescript": "^5.0.2",
|
|
21
|
+
"vitest": "^2.1.6"
|
|
22
|
+
},
|
|
23
|
+
"dependencies": {
|
|
24
|
+
"@opendocsg/pdf2md": "0.2.0",
|
|
25
|
+
"@pdftron/pdfnet-node": "^10.11.0",
|
|
26
|
+
"mupdf": "^0.3.0",
|
|
27
|
+
"sharp": "^0.33.5",
|
|
28
|
+
"tmp": "^0.2.3"
|
|
29
|
+
},
|
|
30
|
+
"ts_dual_module": {
|
|
31
|
+
"outDir": "lib"
|
|
32
|
+
},
|
|
33
|
+
"scripts": {
|
|
34
|
+
"test": "vitest run",
|
|
35
|
+
"build": "pnpm exec tsmod build --esm",
|
|
36
|
+
"clean": "rimraf ./node_modules ./lib ./tsconfig.tsbuildinfo"
|
|
37
|
+
}
|
|
38
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import fs from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import sharp from 'sharp';
|
|
4
|
+
import { expect, test } from 'vitest';
|
|
5
|
+
import { createImageTransformer } from './image';
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
// Resizes the cat fixture into a 1024px box as JPEG and checks the output metadata.
test('should resize an image to a maximum height or width', async () => {
    const opts: { max_hw: number; format: keyof sharp.FormatEnum } = { max_hw: 1024, format: 'jpeg' };
    const fixturePath = path.join(__dirname, '../fixtures', 'cat-picture.jpg');

    const transformer = createImageTransformer(fs.readFileSync(fixturePath), opts);
    const resized = await transformer.toBuffer();
    const metadata = await sharp(resized).metadata();

    console.log(metadata);

    expect(metadata.width).to.be.lessThanOrEqual(opts.max_hw);
    expect(metadata.height).to.be.lessThanOrEqual(opts.max_hw);
    expect(metadata.format).to.equal(opts.format);
});
|
package/src/image.ts
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import sharp from "sharp";
|
|
2
|
+
|
|
3
|
+
/** Options shared by the image transformers in this module. */
export interface TransformOptions {
    /** When set, width and height are both limited to this value. */
    max_hw?: number;
    /** When set, the output is converted to this sharp format. */
    format?: keyof sharp.FormatEnum;
}
|
|
7
|
+
|
|
8
|
+
/** Inputs accepted by the image transformers: binary data, a string, or a readable stream. */
type SharpInputType =
    | Buffer
    | ArrayBuffer
    | Uint8Array
    | Uint8ClampedArray
    | Int8Array
    | Uint16Array
    | Int16Array
    | Uint32Array
    | Int32Array
    | Float32Array
    | Float64Array
    | string
    | NodeJS.ReadableStream;
|
|
21
|
+
/**
 * Builds the sharp pipeline for `input`, applying the optional bounding-box
 * resize and output-format conversion requested in `opts`.
 *
 * @param input image data handed to sharp, or a readable stream (any value exposing `pipe`)
 * @param opts `max_hw` caps both width and height; `format` selects the output format
 * @returns the configured sharp instance
 */
export function createImageTransformer(input: SharpInputType, opts: TransformOptions) {
    const maybeStream = input as NodeJS.ReadableStream;
    // Streams are piped into a fresh sharp instance; anything else goes to sharp directly.
    let pipeline = maybeStream.pipe ? maybeStream.pipe(sharp()) : sharp(input as any);
    const { max_hw: maxSide, format } = opts;
    if (maxSide) {
        // Fit inside a maxSide x maxSide box without ever upscaling.
        pipeline = pipeline.resize({
            width: maxSide,
            height: maxSide,
            fit: sharp.fit.inside,
            withoutEnlargement: true,
        });
    }
    if (format) {
        pipeline = pipeline.toFormat(format);
    }
    return pipeline;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Runs `input` through the sharp pipeline described by `opts` and pipes the
 * transformed image into `output`.
 *
 * @param input image data handed to sharp, or a readable stream
 * @param output writable stream receiving the transformed image
 * @param opts resize / format options forwarded to `createImageTransformer`
 * @returns a promise resolved with the sharp instance once `output` emits
 *          "finish"; rejected with the first error emitted by the input
 *          stream, the sharp pipeline or the output stream
 */
export async function transformImage(input: SharpInputType, output: NodeJS.WritableStream, opts: TransformOptions): Promise<sharp.Sharp> {
    const sh = createImageTransformer(input, opts);
    // Loose views: non-stream inputs simply lack pipe/on/destroy, and
    // NodeJS.WritableStream does not declare destroy().
    const source = input as any;
    const sink = output as any;

    return new Promise((resolve, reject) => {
        let settled = false;
        const handleError = (err: unknown) => {
            // Destroying the streams below can surface follow-up errors; report only the first.
            if (settled) {
                return;
            }
            settled = true;
            console.error('Failed to transform', err);
            try {
                if (source.pipe && source.destroy) {
                    source.destroy();
                }
                if (sink.destroy) {
                    sink.destroy();
                }
                sh.destroy();
            } finally {
                reject(err);
            }
        };
        output.on('error', handleError);
        // pipe() does not forward errors, so failures inside sharp (e.g. undecodable
        // input) must be observed on the sharp stream itself.
        sh.on('error', handleError);
        if (source.pipe && source.on) {
            source.on('error', handleError);
        }
        output.on("finish", () => {
            settled = true;
            resolve(sh);
        });
        // Start the flow only after every listener is attached.
        sh.pipe(output);
    });
}
|
|
69
|
+
|
|
70
|
+
/**
 * Transforms `input` according to `opts` and resolves with the resulting
 * image as a Buffer.
 */
export function transformImageToBuffer(input: SharpInputType, opts: TransformOptions): Promise<Buffer> {
    return createImageTransformer(input, opts).toBuffer();
}
|
|
74
|
+
|
|
75
|
+
/**
 * Runs the transform pipeline for `input` and writes the result to a file.
 *
 * @param output Path of the file to write.
 */
export async function transformImageToFile(input: SharpInputType, output: string, opts: TransformOptions): Promise<void> {
    await createImageTransformer(input, opts).toFile(output);
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { transformImage, transformImageToBuffer, transformImageToFile } from './image.js';
|
|
2
|
+
import { pdfFileToText, pdfToText, pdfToTextBuffer } from './mutool.js';
|
|
3
|
+
import { manyToMarkdown } from './pandoc.js';
|
|
4
|
+
|
|
5
|
+
export {
|
|
6
|
+
manyToMarkdown,
|
|
7
|
+
pdfFileToText, pdfToText,
|
|
8
|
+
pdfToTextBuffer,
|
|
9
|
+
transformImage,
|
|
10
|
+
transformImageToBuffer,
|
|
11
|
+
transformImageToFile
|
|
12
|
+
};
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import fs from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import { expect, test } from 'vitest';
|
|
4
|
+
import { pdfToText } from './mutool.js';
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
// Feeds the test-pdf2 fixture through pdfToText and checks for a known phrase.
test('[mutool] should convert pdf to text', async () => {
    const fixturePath = path.join(__dirname, '../fixtures', 'test-pdf2.pdf');
    const buf = Buffer.from(fs.readFileSync(fixturePath));
    console.log("Running mutoolPdfToText")
    const result: string = await pdfToText(buf);
    expect(result).toContain('its attentive Ambassadors');
});
|
package/src/mutool.ts
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { spawn } from 'child_process';
|
|
2
|
+
import fs from 'fs';
|
|
3
|
+
import { readFile } from "fs/promises";
|
|
4
|
+
import tmp from 'tmp';
|
|
5
|
+
tmp.setGracefulCleanup();
|
|
6
|
+
|
|
7
|
+
/**
 * Converts a PDF file by running `mutool convert -o <output> <input>`.
 *
 * Requires the `mutool` executable to be reachable through PATH.
 *
 * @param input Path of the PDF file to convert.
 * @param output Path of the file mutool should write.
 * @returns A promise resolved with `output` once mutool exits with code 0.
 *   It rejects when the process cannot be spawned, exits with a non-zero
 *   code, or is terminated by a signal.
 */
export function pdfFileToText(input: string, output: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
        const command = spawn("mutool", ["convert", "-o", output, input]);

        // Drain both pipes so the child never blocks on a full pipe buffer.
        const stderrChunks: Buffer[] = [];
        command.stdout.resume();
        command.stderr.on('data', (chunk: Buffer) => {
            stderrChunks.push(chunk);
        });

        command.on('error', (err) => {
            reject(err);
        });

        // 'close' fires after the process ended and its stdio streams closed,
        // so a single handler is enough to decide the outcome.
        command.on('close', (code, signal) => {
            if (code === 0) {
                resolve(output);
                return;
            }
            // A null code means the process was killed by a signal; that is a failure too.
            const reason = signal ? `mutool was terminated by signal ${signal}` : `mutool exited with code ${code}`;
            const details = Buffer.concat(stderrChunks).toString('utf-8').trim();
            reject(new Error(details ? `${reason}: ${details}` : reason));
        });
    });
}
|
|
33
|
+
/**
 * Extracts the text of an in-memory PDF and decodes it as UTF-8.
 *
 * @param buffer Raw PDF bytes.
 * @returns A promise for the extracted text.
 */
export function pdfToText(buffer: Buffer): Promise<string> {
    const decodeUtf8 = (text: Buffer) => text.toString('utf-8');
    return pdfToTextBuffer(buffer).then(decodeUtf8);
}
|
|
36
|
+
/**
 * Extracts the text of an in-memory PDF.
 *
 * The PDF is written to a temporary `.pdf` file, converted with
 * `pdfFileToText` into a temporary `.txt` file, and that file is read back.
 * Both temporary files are removed before the promise settles.
 *
 * @param buffer Raw PDF bytes.
 * @returns A promise for the raw bytes of the generated text file.
 */
export async function pdfToTextBuffer(buffer: Buffer): Promise<Buffer> {
    const inputFile = tmp.fileSync({ postfix: '.pdf' });
    // Only a name is reserved here; the file itself is created by the conversion.
    const targetFileName = tmp.tmpNameSync({ postfix: '.txt' });

    try {
        // writeFileSync keeps writing until the whole buffer is on disk.
        fs.writeFileSync(inputFile.fd, buffer);

        await pdfFileToText(inputFile.name, targetFileName);
        return await readFile(targetFileName);
    } finally {
        // Release the input file now instead of waiting for process exit.
        inputFile.removeCallback();
        // Best effort: the output file does not exist when the conversion failed.
        await fs.promises.unlink(targetFileName).catch(() => undefined);
    }
}
|
package/src/mutool2.ts
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import * as mupdf from "mupdf";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
|
|
4
|
+
/**
 * Scratch script: prints the plain text of the first pages (five at most) of
 * the `test-pdf1.pdf` fixture using mupdf.
 */
async function test() {

    const doc = mupdf.Document.openDocument(fs.readFileSync("./fixtures/test-pdf1.pdf"), "application/pdf");

    // Never request a page past the end of the document.
    const pageLimit = Math.min(doc.countPages(), 5);
    for (let i = 0; i < pageLimit; i++) {
        const page = doc.loadPage(i);
        const stext = page.toStructuredText();
        console.log("Page ================= ", i);
        console.log("=================!!!!!!", stext.asText());
        //console.log(JSON.stringify(JSON.parse(stext.asJSON()), undefined, 2));
        //console.log("=================!!!!!!", stext);
    }
}

test();
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import fs from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import { Readable } from 'stream';
|
|
4
|
+
import { expect, test } from 'vitest';
|
|
5
|
+
import { manyToMarkdown } from './pandoc';
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
// Add more test cases for other file types (ODT, DOCX) if needed
|
|
10
|
+
// Streams the us-ciia.docx fixture through manyToMarkdown and checks for a known word.
test('should convert docx to markdown', async () => {
    const fixturePath = path.join(__dirname, '../fixtures', 'us-ciia.docx');
    const docx: Buffer = fs.readFileSync(fixturePath);
    const markdown = await manyToMarkdown(Readable.from(docx), 'docx');
    expect(markdown).to.include('confidential');
});
|
package/src/pandoc.ts
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { spawn } from 'child_process';
|
|
2
|
+
import { PassThrough } from 'stream';
|
|
3
|
+
|
|
4
|
+
/**
 * Converts an in-memory document to Markdown.
 *
 * The buffer is wrapped in a PassThrough stream and handed to `manyToMarkdown`.
 *
 * @param buffer Document bytes.
 * @param fromFormat Format of the document, forwarded to `manyToMarkdown`.
 */
export function manyToMarkdownFromBuffer(buffer: Buffer, fromFormat: string): Promise<string> {
    const source = new PassThrough();
    source.end(buffer);

    return manyToMarkdown(source, fromFormat);
}
|
|
10
|
+
/**
 * Converts a document stream to Markdown by piping it through
 * `pandoc -t markdown -f <fromFormat>`.
 *
 * Requires pandoc to be installed on the system.
 *
 * @param input Stream with the document content; it is piped to pandoc's stdin.
 * @param fromFormat Format of the input, passed to pandoc's `-f` option.
 * @returns A promise resolved with pandoc's stdout decoded as UTF-8. It
 *   rejects when pandoc cannot be spawned, exits with a non-zero code, is
 *   terminated by a signal, or when the input stream fails.
 */
export function manyToMarkdown(input: NodeJS.ReadableStream, fromFormat: string): Promise<string> {

    return new Promise<string>((resolve, reject) => {
        const command = spawn("pandoc", ["-t", "markdown", '-f', fromFormat], {
            stdio: 'pipe',
        });

        // Keep raw bytes and decode once at the end: decoding chunk by chunk
        // breaks multi-byte UTF-8 characters that straddle a chunk boundary.
        const stdoutChunks: Buffer[] = [];
        const stderrChunks: Buffer[] = [];
        command.stdout.on('data', (chunk: Buffer) => {
            stdoutChunks.push(chunk);
        });
        // Draining stderr keeps pandoc from blocking on a full pipe and gives
        // the rejection a useful message.
        command.stderr.on('data', (chunk: Buffer) => {
            stderrChunks.push(chunk);
        });

        command.on('error', (err) => {
            reject(err);
        });

        command.stdin.on('error', (err: NodeJS.ErrnoException) => {
            // EPIPE only means pandoc stopped reading (usually because it
            // already exited); the 'close' handler reports the real outcome.
            if (err.code !== 'EPIPE') {
                reject(err);
            }
        });

        input.on('error', (err: Error) => {
            command.kill();
            reject(err);
        });

        command.on('close', (code, signal) => {
            if (code === 0) {
                resolve(Buffer.concat(stdoutChunks).toString('utf-8'));
                return;
            }
            // A null code means pandoc was killed by a signal; its output is incomplete.
            const reason = signal ? `pandoc was terminated by signal ${signal}` : `pandoc exited with code ${code}`;
            const details = Buffer.concat(stderrChunks).toString('utf-8').trim();
            reject(new Error(details ? `${reason}: ${details}` : reason));
        });

        input.pipe(command.stdin);
    });

}
|
package/src/pdf-test.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* IMPORTANT: DO NOT RUN IN VITEST, VITEST DOESN'T WORK WITH APRYSE
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import fs from 'fs';
|
|
6
|
+
import path from 'path';
|
|
7
|
+
import { extractImagesFromPdfWithApryse } from "./pdf.js";
|
|
8
|
+
|
|
9
|
+
// Manual check: extracts the images of the test-pdf2 fixture and logs the result.
async function main() {
    const pdfBuffer = fs.readFileSync(path.resolve(__dirname, '../../../fixtures', 'test-pdf2.pdf'));

    console.log('start extracting images from pdf');
    const result: any = await extractImagesFromPdfWithApryse(pdfBuffer);
    console.log('result: ', result);
}

main();
|
package/src/pdf.test.ts
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import fs from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import { expect, test } from 'vitest';
|
|
4
|
+
import { trasformPdfToMarkdown } from './pdf';
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
// Converts the test-pdf1 fixture and checks the markdown for a known word.
test('Converts a PDF to markdown', async () => {
    const fixture = fs.readFileSync(path.resolve(__dirname, '../fixtures', 'test-pdf1.pdf'));
    const markdown = await trasformPdfToMarkdown(fixture);

    expect(markdown).toContain('America');
});
|
|
17
|
+
|
|
18
|
+
// Converts the test-pdf2 fixture and checks the markdown for a known word.
test('Converts another PDF to markdown', async () => {
    const fixture = fs.readFileSync(path.resolve(__dirname, '../fixtures', 'test-pdf2.pdf'));
    const markdown = await trasformPdfToMarkdown(fixture);

    expect(markdown).toContain('America');
});
|
package/src/pdf.ts
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import pdf2md from "@opendocsg/pdf2md";
|
|
2
|
+
import type { PDFNet as PDFTron } from '@pdftron/pdfnet-node';
|
|
3
|
+
import fs from 'fs';
|
|
4
|
+
import os from 'os';
|
|
5
|
+
|
|
6
|
+
import pkg from '@pdftron/pdfnet-node';
|
|
7
|
+
const { PDFNet } = pkg;
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
// View of the pdf2md import as a function from raw PDF bytes to a markdown string.
const pdf2mdFn = pdf2md as unknown as (buffer: Uint8Array) => Promise<string>;

/**
 * Converts an in-memory PDF to Markdown with pdf2md.
 *
 * @param buffer Raw PDF bytes; they are copied into a new Uint8Array first.
 * @returns A promise for the generated Markdown.
 */
export function trasformPdfToMarkdown(buffer: Buffer) {
    return pdf2mdFn(new Uint8Array(buffer));
}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
/**
 * Exports the images found in a PDF as PNG files into a new temporary directory.
 *
 * Files are named `img_<page>_<n>.png`, where `<page>` is the value returned
 * by `page.getIndex()` and `<n>` counts the images exported for that page,
 * starting at 1. Images nested in form XObjects share the page's counter, so
 * names never collide.
 *
 * @param buffer Raw PDF bytes.
 * @param minHw Images whose width and height are both below this value are skipped.
 * @returns An object holding the directory that contains the exported files.
 */
async function extractImages(buffer: Buffer, minHw: number = 300) {
    const doc = await PDFNet.PDFDoc.createFromBuffer(buffer);
    const reader = await PDFNet.ElementReader.create();
    const tmpDir = os.tmpdir()
    const workingDir = fs.mkdtempSync(`${tmpDir}/pdfextract_`);

    // Read page content on every page in the document
    const itr = await doc.getPageIterator();
    for (; await itr.hasNext(); await itr.next()) {
        const page = await itr.current();
        const pageNumber = await page.getIndex();
        // One counter per page, shared with nested form XObjects.
        const counter = { next: 1 };
        await reader.beginOnPage(page);
        await ProcessElements(reader, pageNumber, counter);
        await reader.end();
    }

    return { workingDir };

    /** Walks the reader's current display list and exports every image that is large enough. */
    async function ProcessElements(reader: PDFTron.ElementReader, pageNumber: number, counter: { next: number }) {
        for (let element = await reader.next(); element !== null; element = await reader.next()) {
            const elementType = await element.getType();
            switch (elementType) {
                case PDFNet.Element.Type.e_image:
                    {
                        const image = await PDFNet.Image.createFromObj(await element.getXObject());
                        const h = await image.getImageHeight();
                        const w = await image.getImageWidth();
                        //do not extract if image is too small, likely not relevant
                        //TODO: use LLM to decide if it matters?
                        if (w < minHw && h < minHw) {
                            break;
                        }
                        const imgName = `${workingDir}/img_${pageNumber}_${counter.next++}.png`;
                        // Wait for the export so the file exists, and failures surface, before returning.
                        await image.exportAsPng(imgName);
                        break;
                    }
                case PDFNet.Element.Type.e_form:
                    {
                        await reader.formBegin();
                        // The nested walk must finish before the form is closed;
                        // otherwise it races with reader.end() and the outer loop.
                        await ProcessElements(reader, pageNumber, counter);
                        await reader.end();
                        break;
                    }
            }
        }
    }
}
|
|
71
|
+
|
|
72
|
+
/**
 * Extracts the images of a PDF with the Apryse (PDFNet) SDK.
 *
 * The license key is read from the `APRYSE_KEY` environment variable. PDFNet
 * is shut down once the extraction has finished, whether it succeeded or not.
 *
 * @param buffer Raw PDF bytes.
 * @param minHw Images whose width and height are both below this value are skipped.
 * @returns One entry per exported image, ordered by page and then by image
 *   number: `page` and `imgCount` are parsed from the file name, `path` is
 *   the file name inside the extraction directory, and `fullPath` is the
 *   location of the file including that directory.
 */
export async function extractImagesFromPdfWithApryse(buffer: Buffer, minHw: number = 300) {
    const APRYSE_KEY = process.env.APRYSE_KEY;

    const res = await PDFNet.runWithCleanup(() => extractImages(buffer, minHw), APRYSE_KEY)
        .finally(() => PDFNet.shutdown());
    const workingDir: string = res.workingDir;

    // Files are named img_<page>_<n>.png; anything else in the directory is ignored.
    const namePattern = /^img_(\d+)_(\d+)\.png$/;
    const images = fs.readdirSync(workingDir).flatMap((file) => {
        const match = namePattern.exec(file);
        if (!match) {
            return [];
        }
        return [{
            page: Number(match[1]),
            imgCount: Number(match[2]),
            path: file,
            // The directory is created inside this call, so callers need the full location.
            fullPath: `${workingDir}/${file}`,
        }];
    });

    // Directory listing order is not numeric (img_10 would sort before img_2).
    images.sort((a, b) => a.page - b.page || a.imgCount - b.imgCount);

    return images;
}
|