scribe.js-ocr 0.7.4 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build-deno-compile.sh +30 -0
- package/cli/cli.js +46 -18
- package/cli/detectPDFType.js +1 -2
- package/cli/extract.js +14 -7
- package/cli/main.js +39 -39
- package/cli/require.js +1 -1
- package/cli/scribe.js +12 -11
- package/fonts/Dingbats.woff +0 -0
- package/fonts/all/URWGothicBook-Bold.woff +0 -0
- package/fonts/all/URWGothicBook-BoldItalic.woff +0 -0
- package/fonts/all/URWGothicBook-Italic.woff +0 -0
- package/fonts/all/URWGothicBook-Regular.woff +0 -0
- package/fonts/latin/URWGothicBook-Bold.woff +0 -0
- package/fonts/latin/URWGothicBook-BoldItalic.woff +0 -0
- package/fonts/latin/URWGothicBook-Italic.woff +0 -0
- package/fonts/latin/URWGothicBook-Regular.woff +0 -0
- package/js/canvasAdapter.js +4 -1
- package/js/clear.js +7 -8
- package/js/containers/app.js +2 -0
- package/js/containers/dataContainer.js +1 -4
- package/js/containers/fontContainer.js +59 -44
- package/js/containers/imageContainer.js +13 -35
- package/js/coordinates.js +3 -3
- package/js/debug.js +2 -2
- package/js/export/export.js +103 -18
- package/js/export/exportDebugCsv.js +4 -3
- package/js/export/pdf/writePdf.js +389 -0
- package/js/export/{writePdfFonts.js → pdf/writePdfFonts.js} +16 -12
- package/js/export/pdf/writePdfImages.js +218 -0
- package/js/export/{writePdf.js → pdf/writePdfText.js} +28 -315
- package/js/export/writeDocx.js +12 -5
- package/js/export/writeHocr.js +11 -10
- package/js/export/writeHtml.js +208 -48
- package/js/export/writeTabular.js +31 -20
- package/js/export/writeText.js +12 -10
- package/js/fontContainerMain.js +101 -50
- package/js/fontEval.js +18 -14
- package/js/fontStatistics.js +90 -90
- package/js/generalWorkerMain.js +52 -6
- package/js/global.d.ts +178 -6
- package/js/import/convertDocTextract.js +447 -0
- package/js/import/convertPageAbbyy.js +10 -4
- package/js/import/convertPageBlocks.js +4 -4
- package/js/import/convertPageGoogleVision.js +204 -0
- package/js/import/convertPageHocr.js +3 -3
- package/js/import/convertPageShared.js +1 -0
- package/js/import/convertPageStext.js +18 -10
- package/js/import/convertPageText.js +289 -0
- package/js/import/import.js +133 -125
- package/js/import/importOCR.js +98 -46
- package/js/import/nodeAdapter.js +2 -2
- package/js/modifyOCR.js +6 -5
- package/js/nudge.js +3 -3
- package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
- package/js/objects/imageObjects.js +3 -2
- package/js/objects/layoutObjects.js +37 -0
- package/js/objects/ocrObjects.js +51 -3
- package/js/recognizeConvert.js +74 -23
- package/js/utils/fontUtils.js +32 -1
- package/js/utils/imageUtils.js +99 -0
- package/js/utils/miscUtils.js +158 -9
- package/js/utils/reflowPars.js +4 -0
- package/js/worker/compareOCRModule.js +20 -18
- package/js/worker/generalWorker.js +12 -6
- package/js/worker/optimizeFontModule.js +19 -19
- package/mupdf/libmupdf.js +3 -3
- package/mupdf/libmupdf.wasm +0 -0
- package/mupdf/mupdf-async.js +1 -1
- package/mupdf/mupdf-worker.js +9 -4
- package/package.json +7 -4
- package/scribe.js +5 -5
- package/tess/tesseract.esm.min.js +1 -1
- package/tess/tesseract.min.js +1 -1
- package/tess/worker.min.js +1 -1
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
|
|
3
|
+
## Note:
|
|
4
|
+
## The Windows build does not work and the Mac build has never been tested.
|
|
5
|
+
## The Linux build should work.
|
|
6
|
+
|
|
7
|
+
# Extract version from package.json using grep and sed
|
|
8
|
+
VERSION=$(grep '"version"' package.json | sed -E 's/.*"version": *"([^"]+)".*/\1/')
|
|
9
|
+
if [ -z "$VERSION" ]; then
|
|
10
|
+
# Diagnostics go to stderr so they are not mixed into captured stdout.
echo "Failed to extract version from package.json" >&2
|
|
11
|
+
exit 1
|
|
12
|
+
fi
|
|
13
|
+
|
|
14
|
+
# Create build directory
|
|
15
|
+
mkdir -p build
|
|
16
|
+
|
|
17
|
+
# Build for different platforms
|
|
18
|
+
echo "Building for Linux x64..."
|
|
19
|
+
deno compile --allow-sys --allow-read --allow-net --allow-write --target x86_64-unknown-linux-gnu --output build/scribe-linux-x64 --include mupdf --include fonts --include js/worker cli/scribe.js
|
|
20
|
+
|
|
21
|
+
echo "Building for macOS x64..."
|
|
22
|
+
deno compile --allow-sys --allow-read --allow-net --allow-write --target x86_64-apple-darwin --output build/scribe-macos-x64 --include mupdf --include fonts --include js/worker cli/scribe.js
|
|
23
|
+
|
|
24
|
+
echo "Building for Windows x64..."
|
|
25
|
+
deno compile --allow-sys --allow-read --allow-net --allow-write --target x86_64-pc-windows-msvc --output build/scribe-windows-x64.exe --include mupdf --include fonts --include js/worker cli/scribe.js
|
|
26
|
+
|
|
27
|
+
# Create checksums
|
|
28
|
+
cd build
|
|
29
|
+
# Hash only the build artifacts. A bare `*` is expanded before the redirection
# truncates checksums.txt, so on re-runs the stale checksums.txt would be
# matched and listed with a bogus hash, breaking `sha256sum -c checksums.txt`.
sha256sum scribe-* > checksums.txt
|
|
30
|
+
cd ..
|
package/cli/cli.js
CHANGED
|
@@ -7,18 +7,36 @@ import {
|
|
|
7
7
|
evalInternal, overlay, recognize,
|
|
8
8
|
} from './main.js';
|
|
9
9
|
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
/**
|
|
11
|
+
* Print confidence of Abbyy .xml file.
|
|
12
|
+
*
|
|
13
|
+
* @param {string[]} files - Paths to input files.
|
|
14
|
+
*/
|
|
15
|
+
export const confCLI = async (files) => {
|
|
16
|
+
await conf(files);
|
|
12
17
|
process.exitCode = 0;
|
|
13
18
|
};
|
|
14
19
|
|
|
15
|
-
|
|
16
|
-
|
|
20
|
+
/**
|
|
21
|
+
*
|
|
22
|
+
* @param {string[]} files - Paths to input files.
|
|
23
|
+
* @param {Object} options
|
|
24
|
+
* @param {number} [options.workers]
|
|
25
|
+
*/
|
|
26
|
+
export const checkCLI = async (files, options) => {
|
|
27
|
+
await check(files, options);
|
|
17
28
|
process.exitCode = 0;
|
|
18
29
|
};
|
|
19
30
|
|
|
20
|
-
|
|
21
|
-
|
|
31
|
+
/**
|
|
32
|
+
* Evaluate internal OCR engine.
|
|
33
|
+
*
|
|
34
|
+
* @param {string[]} files - Paths to input files.
|
|
35
|
+
* @param {Object} options
|
|
36
|
+
* @param {number} [options.workers]
|
|
37
|
+
*/
|
|
38
|
+
export const evalInternalCLI = async (files, options) => {
|
|
39
|
+
const { evalMetrics } = await evalInternal(files, options);
|
|
22
40
|
|
|
23
41
|
const ignoreExtra = true;
|
|
24
42
|
let metricWER;
|
|
@@ -34,14 +52,14 @@ export const evalInternalCLI = async (pdfFile, ocrFile, options) => {
|
|
|
34
52
|
|
|
35
53
|
/**
|
|
36
54
|
*
|
|
37
|
-
* @param {string}
|
|
55
|
+
* @param {string} inputFile - Path to PDF file.
|
|
38
56
|
* @param {?string} [outputDir='.'] - Output directory.
|
|
39
57
|
* @param {Object} [options]
|
|
40
58
|
* @param {"pdf" | "hocr" | "docx" | "xlsx" | "txt" | "text" | "html"} [options.format]
|
|
41
59
|
* @param {boolean} [options.reflow]
|
|
42
60
|
*/
|
|
43
|
-
export const extractCLI = async (
|
|
44
|
-
await extract(
|
|
61
|
+
export const extractCLI = async (inputFile, outputDir, options) => {
|
|
62
|
+
await extract(inputFile, outputDir, options);
|
|
45
63
|
process.exitCode = 0;
|
|
46
64
|
};
|
|
47
65
|
|
|
@@ -57,28 +75,38 @@ export const detectPDFTypeCLI = async (pdfFile, outputPath) => {
|
|
|
57
75
|
|
|
58
76
|
/**
|
|
59
77
|
*
|
|
60
|
-
* @param {string}
|
|
61
|
-
* @param {*} ocrFile
|
|
62
|
-
* @param {*} outputDir
|
|
78
|
+
* @param {string[]} files - Paths to input files.
|
|
63
79
|
* @param {Object} options
|
|
80
|
+
* @param {string} [options.output] - Output directory for the resulting PDF.
|
|
64
81
|
* @param {boolean} [options.robust]
|
|
65
82
|
* @param {boolean} [options.conf]
|
|
66
83
|
* @param {boolean} [options.vis]
|
|
67
84
|
* @param {number} [options.workers]
|
|
68
85
|
*/
|
|
69
|
-
export const overlayCLI = async (
|
|
86
|
+
export const overlayCLI = async (files, options) => {
|
|
70
87
|
options.overlayMode = options.vis ? 'proof' : 'invis';
|
|
71
|
-
await overlay(
|
|
88
|
+
await overlay(files, options.output, options);
|
|
72
89
|
process.exitCode = 0;
|
|
73
90
|
};
|
|
74
91
|
|
|
75
|
-
|
|
92
|
+
/**
|
|
93
|
+
*
|
|
94
|
+
* @param {string[]} files - Paths to input files.
|
|
95
|
+
* @param {*} options
|
|
96
|
+
*/
|
|
97
|
+
export const recognizeCLI = async (files, options) => {
|
|
76
98
|
options.overlayMode = options.vis ? 'proof' : 'invis';
|
|
77
|
-
await recognize(
|
|
99
|
+
await recognize(files, options);
|
|
78
100
|
process.exitCode = 0;
|
|
79
101
|
};
|
|
80
102
|
|
|
81
|
-
|
|
82
|
-
|
|
103
|
+
/**
|
|
104
|
+
*
|
|
105
|
+
* @param {string[]} files - Paths to input files.
|
|
106
|
+
* @param {*} outputDir
|
|
107
|
+
* @param {*} options
|
|
108
|
+
*/
|
|
109
|
+
export const debugCLI = async (files, outputDir, options) => {
|
|
110
|
+
await debug(files, outputDir, options);
|
|
83
111
|
process.exitCode = 0;
|
|
84
112
|
};
|
package/cli/detectPDFType.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
1
|
+
import fs from 'node:fs';
|
|
2
2
|
import scribe from '../scribe.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
@@ -30,5 +30,4 @@ export const detectPDFType = async (pdfFile, outputPath) => {
|
|
|
30
30
|
console.log('PDF Type:', type);
|
|
31
31
|
|
|
32
32
|
mupdfScheduler.scheduler.terminate();
|
|
33
|
-
|
|
34
33
|
};
|
package/cli/extract.js
CHANGED
|
@@ -1,28 +1,35 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
2
|
-
import path from 'path';
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
3
|
import scribe from '../scribe.js';
|
|
4
4
|
|
|
5
|
+
// TODO: Consider whether this should exist and whether it should be combined into a larger CLI utility.
|
|
6
|
+
// This was originally created to provide a simple interface to extract existing text from a PDF file,
|
|
7
|
+
// however it now does other things, and this should likely be part of a larger `convert` utility.
|
|
8
|
+
|
|
5
9
|
/**
|
|
6
10
|
*
|
|
7
|
-
* @param {string}
|
|
11
|
+
* @param {string} inputFile - Path to input file.
|
|
8
12
|
* @param {?string} [output='.'] - Output file or directory.
|
|
9
13
|
* @param {Object} [options]
|
|
10
14
|
* @param {Parameters<typeof scribe.download>[0]} [options.format]
|
|
11
15
|
* @param {boolean} [options.reflow]
|
|
12
16
|
*/
|
|
13
|
-
export const extract = async (
|
|
17
|
+
export const extract = async (inputFile, output, options) => {
|
|
14
18
|
const format = options?.format || 'txt';
|
|
15
19
|
|
|
16
20
|
output = output || '.';
|
|
17
21
|
const outputDir = path.dirname(output);
|
|
18
|
-
const outputFile = outputDir === output ? `${path.basename(
|
|
22
|
+
const outputFile = outputDir === output ? `${path.basename(inputFile).replace(/\.\w{1,6}$/i, `.${format}`)}` : path.basename(output);
|
|
19
23
|
const outputPath = `${outputDir}/${outputFile}`;
|
|
20
24
|
|
|
21
25
|
scribe.opt.reflow = true;
|
|
22
26
|
scribe.opt.extractText = true;
|
|
27
|
+
scribe.opt.displayMode = 'ebook';
|
|
23
28
|
|
|
24
|
-
|
|
25
|
-
|
|
29
|
+
// TODO: Fonts do not need to be loaded for .txt output, but are needed for .pdf output,
|
|
30
|
+
// so a more robust implementation would consider the arguments and only load fonts if necessary.
|
|
31
|
+
await scribe.init({ font: true });
|
|
32
|
+
await scribe.importFiles([inputFile]);
|
|
26
33
|
|
|
27
34
|
if (outputDir) fs.mkdirSync(outputDir, { recursive: true });
|
|
28
35
|
|
package/cli/main.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
// Code for adding visualization to OCR output
|
|
2
2
|
// Use: `node addOverlay.js [PDF file] [OCR data file] [output directory]`
|
|
3
3
|
|
|
4
|
-
import fs from 'fs';
|
|
5
|
-
import path from 'path';
|
|
4
|
+
import fs from 'node:fs';
|
|
5
|
+
import path from 'node:path';
|
|
6
6
|
|
|
7
7
|
import scribe from '../scribe.js';
|
|
8
8
|
|
|
@@ -13,8 +13,7 @@ scribe.opt.saveDebugImages = debugMode;
|
|
|
13
13
|
/**
|
|
14
14
|
* @param {string} func
|
|
15
15
|
* @param {Object} params
|
|
16
|
-
* @param {string} [params.
|
|
17
|
-
* @param {string} [params.ocrFile]
|
|
16
|
+
* @param {string[]} [params.files]
|
|
18
17
|
* @param {string} [params.outputDir]
|
|
19
18
|
* @param {Array<string>} [params.list]
|
|
20
19
|
* @param {boolean} [params.robustConfMode]
|
|
@@ -26,6 +25,10 @@ scribe.opt.saveDebugImages = debugMode;
|
|
|
26
25
|
async function main(func, params) {
|
|
27
26
|
scribe.opt.workerN = params.workerN || null;
|
|
28
27
|
|
|
28
|
+
if (!params.files || params.files.length === 0) {
|
|
29
|
+
throw new Error('No input files provided.');
|
|
30
|
+
}
|
|
31
|
+
|
|
29
32
|
await scribe.init({
|
|
30
33
|
pdf: true,
|
|
31
34
|
ocr: true,
|
|
@@ -39,15 +42,9 @@ async function main(func, params) {
|
|
|
39
42
|
|
|
40
43
|
const output = {};
|
|
41
44
|
|
|
42
|
-
|
|
43
|
-
if (params.pdfFile) files.push(params.pdfFile);
|
|
44
|
-
if (params.ocrFile) files.push(params.ocrFile);
|
|
45
|
-
await scribe.importFiles(files);
|
|
45
|
+
await scribe.importFiles(params.files);
|
|
46
46
|
|
|
47
|
-
const
|
|
48
|
-
const backgroundStem = backgroundArg ? path.basename(backgroundArg).replace(/\.\w{1,5}$/i, '') : undefined;
|
|
49
|
-
const ocrStem = params.ocrFile ? path.basename(params.ocrFile).replace(/\.\w{1,5}$/i, '') : undefined;
|
|
50
|
-
const outputStem = backgroundStem || ocrStem || 'output';
|
|
47
|
+
const outputStem = scribe.inputData.defaultDownloadFileName.replace(/\.\w{1,6}$/i, '') || 'output';
|
|
51
48
|
|
|
52
49
|
const outputDir = params.outputDir || '.';
|
|
53
50
|
|
|
@@ -75,23 +72,29 @@ async function main(func, params) {
|
|
|
75
72
|
}
|
|
76
73
|
}
|
|
77
74
|
|
|
78
|
-
if (['overlay', 'recognize'].includes(func) &&
|
|
75
|
+
if (['overlay', 'recognize'].includes(func) && (scribe.inputData.pdfMode || scribe.inputData.imageMode)) {
|
|
79
76
|
let outputSuffix = '';
|
|
80
77
|
if (scribe.opt.displayMode === 'proof') {
|
|
81
78
|
outputSuffix = '_vis';
|
|
82
79
|
} else if (scribe.opt.displayMode === 'invis') {
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
80
|
+
// Check if output file would overwrite any input file, and if so, add a suffix to avoid overwriting.
|
|
81
|
+
// This software is still in development--nobody should be overwriting input files.
|
|
82
|
+
const resolvedOutputFileTmp = path.resolve(`${outputDir}/${outputStem}.pdf`);
|
|
83
|
+
for (let i = 0; i < params.files.length; i++) {
|
|
84
|
+
const resolvedInputFile = path.resolve(params.files[i]);
|
|
85
|
+
if (resolvedInputFile === resolvedOutputFileTmp) {
|
|
86
|
+
outputSuffix = '_ocr';
|
|
87
|
+
console.log(`Saving output with ${outputSuffix} suffix to avoid overwriting input: ${resolvedInputFile}`);
|
|
88
|
+
break;
|
|
89
|
+
}
|
|
87
90
|
}
|
|
88
91
|
}
|
|
89
92
|
|
|
90
|
-
const outputPath = `${outputDir}/${
|
|
93
|
+
const outputPath = path.resolve(`${outputDir}/${outputStem}${outputSuffix}.pdf`);
|
|
91
94
|
await scribe.download('pdf', outputPath);
|
|
92
95
|
|
|
93
96
|
if (params.hocr) {
|
|
94
|
-
const outputPathHocr = `${outputDir}/${
|
|
97
|
+
const outputPathHocr = path.resolve(`${outputDir}/${outputStem}.hocr`);
|
|
95
98
|
await scribe.download('hocr', outputPathHocr);
|
|
96
99
|
}
|
|
97
100
|
}
|
|
@@ -100,7 +103,7 @@ async function main(func, params) {
|
|
|
100
103
|
const debugDir = `${outputDir}/${outputStem}_debug`;
|
|
101
104
|
fs.mkdirSync(debugDir, { recursive: true });
|
|
102
105
|
const outputPathCsv = `${debugDir}/_debug.csv`;
|
|
103
|
-
scribe.utils.writeDebugCsv(scribe.data.ocr.active, outputPathCsv);
|
|
106
|
+
scribe.utils.writeDebugCsv({ pages: scribe.data.ocr.active, fileName: outputPathCsv });
|
|
104
107
|
|
|
105
108
|
scribe.utils.dumpDebugImages(debugDir);
|
|
106
109
|
scribe.utils.dumpHOCR(debugDir);
|
|
@@ -123,33 +126,30 @@ async function main(func, params) {
|
|
|
123
126
|
/**
|
|
124
127
|
* Print confidence of Abbyy .xml file.
|
|
125
128
|
*
|
|
126
|
-
* @param {string}
|
|
129
|
+
* @param {string[]} files - Paths to input files.
|
|
127
130
|
*/
|
|
128
|
-
export const conf = async (
|
|
131
|
+
export const conf = async (files) => (main('conf', { files }));
|
|
129
132
|
|
|
130
133
|
/**
|
|
131
134
|
*
|
|
132
|
-
* @param {string}
|
|
133
|
-
* @param {string} ocrFile
|
|
135
|
+
* @param {string[]} files - Paths to input files.
|
|
134
136
|
* @param {Object} options
|
|
135
137
|
* @param {number} [options.workers]
|
|
136
138
|
*/
|
|
137
|
-
export const check = async (
|
|
139
|
+
export const check = async (files, options) => (main('check', { files, workerN: options?.workers }));
|
|
138
140
|
|
|
139
141
|
/**
|
|
140
142
|
* Evaluate internal OCR engine.
|
|
141
143
|
*
|
|
142
|
-
* @param {string}
|
|
143
|
-
* @param {string} ocrFile - Path to OCR file containing ground truth.
|
|
144
|
+
* @param {string[]} files - Paths to input files.
|
|
144
145
|
* @param {Object} options
|
|
145
146
|
* @param {number} [options.workers]
|
|
146
147
|
*/
|
|
147
|
-
export const evalInternal = async (
|
|
148
|
+
export const evalInternal = async (files, options) => (main('eval', { files, workerN: options?.workers }));
|
|
148
149
|
|
|
149
150
|
/**
|
|
150
151
|
*
|
|
151
|
-
* @param {string}
|
|
152
|
-
* @param {*} ocrFile
|
|
152
|
+
* @param {string[]} files - Paths to input files.
|
|
153
153
|
* @param {*} outputDir
|
|
154
154
|
* @param {Object} options
|
|
155
155
|
* @param {boolean} [options.robust]
|
|
@@ -157,29 +157,29 @@ export const evalInternal = async (pdfFile, ocrFile, options) => (main('eval', {
|
|
|
157
157
|
* @param {"eval" | "ebook" | "proof" | "invis"} [options.overlayMode]
|
|
158
158
|
* @param {number} [options.workers]
|
|
159
159
|
*/
|
|
160
|
-
export const overlay = async (
|
|
161
|
-
|
|
160
|
+
export const overlay = async (files, outputDir, options) => (main('overlay', {
|
|
161
|
+
files, outputDir, robustConfMode: options?.robust || false, printConf: options?.conf || false, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers,
|
|
162
162
|
}));
|
|
163
163
|
|
|
164
164
|
/**
|
|
165
165
|
*
|
|
166
|
-
* @param {string}
|
|
166
|
+
* @param {string[]} files - Paths to input files.
|
|
167
167
|
* @param {Object} options
|
|
168
|
+
* @param {string} [options.output]
|
|
168
169
|
* @param {"eval" | "ebook" | "proof" | "invis"} [options.overlayMode]
|
|
169
170
|
* @param {boolean} [options.hocr]
|
|
170
171
|
* @param {number} [options.workers]
|
|
171
172
|
*/
|
|
172
|
-
export const recognize = async (
|
|
173
|
-
|
|
173
|
+
export const recognize = async (files, options) => (main('recognize', {
|
|
174
|
+
files, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers, hocr: options?.hocr, outputDir: options?.output || '.',
|
|
174
175
|
}));
|
|
175
176
|
|
|
176
177
|
/**
|
|
177
178
|
*
|
|
178
|
-
* @param {string}
|
|
179
|
+
* @param {string[]} files - Paths to input files.
|
|
179
180
|
* @param {*} outputDir
|
|
180
181
|
* @param {*} options
|
|
181
|
-
* @returns
|
|
182
182
|
*/
|
|
183
|
-
export const debug = async (
|
|
184
|
-
|
|
183
|
+
export const debug = async (files, outputDir, options) => (main('debug', {
|
|
184
|
+
files, outputDir, list: options?.list,
|
|
185
185
|
}));
|
package/cli/require.js
CHANGED
|
@@ -2,6 +2,6 @@
|
|
|
2
2
|
// While we will switch everything to worker modules eventually, Firefox still does not support them:
|
|
3
3
|
// https://developer.mozilla.org/en-US/docs/Web/API/Worker#browser_compatibility
|
|
4
4
|
// Therefore, we use a dynamic import statement to run the following code only in the Node.js version.
|
|
5
|
-
const { createRequire } = await import('module');
|
|
5
|
+
const { createRequire } = await import('node:module');
|
|
6
6
|
globalThis.require = createRequire(import.meta.url);
|
|
7
7
|
globalThis.__dirname = import.meta.url;
|
package/cli/scribe.js
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
1
3
|
import { Command, Option } from 'commander';
|
|
2
4
|
|
|
3
5
|
import {
|
|
@@ -18,23 +20,21 @@ program
|
|
|
18
20
|
|
|
19
21
|
program
|
|
20
22
|
.command('check')
|
|
21
|
-
.argument('<pdf_file>', 'Input PDF file.')
|
|
22
|
-
.argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
|
|
23
23
|
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
|
|
24
|
+
.argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
|
|
24
25
|
.description('Calculate confidence metric for OCR data by running Tesseract OCR and comparing results.')
|
|
25
26
|
.action(checkCLI);
|
|
26
27
|
|
|
27
28
|
program
|
|
28
29
|
.command('eval')
|
|
29
|
-
.argument('<pdf_file>', 'Input PDF file.')
|
|
30
|
-
.argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
|
|
31
30
|
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
|
|
31
|
+
.argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
|
|
32
32
|
.description('Evaluate internal OCR engine by recognizing document (provided PDF file), and comparing to ground truth (provided OCR file).')
|
|
33
33
|
.action(evalInternalCLI);
|
|
34
34
|
|
|
35
35
|
program
|
|
36
36
|
.command('extract')
|
|
37
|
-
.argument('<
|
|
37
|
+
.argument('<input_file>', 'Input PDF file.')
|
|
38
38
|
.argument('[output]', 'Output directory or file to save results.', '.')
|
|
39
39
|
.addOption(new Option('-f, --format <ext>', 'Output format.').choices(['pdf', 'hocr', 'docx', 'xlsx', 'txt', 'text', 'html']).default('txt'))
|
|
40
40
|
.option('-r, --reflow', 'Reflow text by combining lines into paragraphs.')
|
|
@@ -43,23 +43,23 @@ program
|
|
|
43
43
|
|
|
44
44
|
program
|
|
45
45
|
.command('overlay')
|
|
46
|
-
.
|
|
47
|
-
.argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
|
|
48
|
-
.argument('[output_dir]', 'Directory for output file(s).', '.')
|
|
46
|
+
.option('-o, --output <directory>', 'Directory for output file(s). Default is current directory.')
|
|
49
47
|
.option('-v, --vis', 'Print OCR text visibly over provided PDF file with colors coded by confidence.')
|
|
50
48
|
.option('-c, --conf', 'Print average confidence metric for document.')
|
|
51
49
|
.option('-r, --robust', 'Generate confidence metrics by running Tesseract OCR and comparing, rather than using confidence info in provided data.')
|
|
52
50
|
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
|
|
51
|
+
.argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
|
|
53
52
|
.description('Add OCR data to provided PDF file and save result as PDF.')
|
|
54
53
|
.action(overlayCLI);
|
|
55
54
|
|
|
56
55
|
program
|
|
57
56
|
.command('recognize')
|
|
58
|
-
.
|
|
59
|
-
.description('Recognize text in PDF file using internal OCR engine.')
|
|
57
|
+
.option('-o, --output <directory>', 'Directory for output file(s). Default is current directory.')
|
|
60
58
|
.option('-v, --vis', 'Print OCR text visibly over provided PDF file with colors coded by confidence.')
|
|
61
59
|
.option('-h, --hocr', 'Output .hocr intermediate data in addition to .pdf.')
|
|
62
60
|
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
|
|
61
|
+
.argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
|
|
62
|
+
.description('Recognize text in PDF file using internal OCR engine.')
|
|
63
63
|
.action(recognizeCLI);
|
|
64
64
|
|
|
65
65
|
program
|
|
@@ -71,9 +71,10 @@ program
|
|
|
71
71
|
|
|
72
72
|
program
|
|
73
73
|
.command('debug')
|
|
74
|
-
.
|
|
74
|
+
.option('-o, --output <directory>', 'Directory for output file(s). Default is current directory.')
|
|
75
75
|
.argument('[output_dir]', 'Directory for output file(s).', '.')
|
|
76
76
|
.option('--list <items>', 'Comma separated list of visualizations to include.', (value) => value.split(','))
|
|
77
|
+
.argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
|
|
77
78
|
.description('Generate and write Tesseract debugging images.')
|
|
78
79
|
.action(debugCLI);
|
|
79
80
|
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/js/canvasAdapter.js
CHANGED
|
@@ -28,6 +28,9 @@ export class ca {
|
|
|
28
28
|
if (typeof process === 'undefined') {
|
|
29
29
|
return new OffscreenCanvas(width, height);
|
|
30
30
|
}
|
|
31
|
+
if (!width || !height || width <= 0 || height <= 0) {
|
|
32
|
+
throw new Error(`Invalid canvas size: ${width}x${height}`);
|
|
33
|
+
}
|
|
31
34
|
const canvasKit = await ca.getCanvasKit();
|
|
32
35
|
return canvasKit.MakeCanvas(width, height);
|
|
33
36
|
};
|
|
@@ -80,7 +83,7 @@ export class ca {
|
|
|
80
83
|
} else {
|
|
81
84
|
const dummyCanvas = await ca.dummyCanvasPromise;
|
|
82
85
|
|
|
83
|
-
const fs = await import('fs');
|
|
86
|
+
const fs = await import('node:fs');
|
|
84
87
|
const fontBuffer = typeof fontObj.src === 'string' ? fs.readFileSync(fontObj.src) : fontObj.src;
|
|
85
88
|
|
|
86
89
|
dummyCanvas.loadFont(fontBuffer, {
|
package/js/clear.js
CHANGED
|
@@ -1,27 +1,26 @@
|
|
|
1
1
|
import { inputData } from './containers/app.js';
|
|
2
2
|
import {
|
|
3
3
|
convertPageWarn,
|
|
4
|
-
fontMetricsObj,
|
|
5
4
|
layoutDataTables,
|
|
6
5
|
layoutRegions,
|
|
7
6
|
ocrAll,
|
|
8
7
|
ocrAllRaw,
|
|
9
|
-
|
|
8
|
+
pageMetricsAll,
|
|
10
9
|
} from './containers/dataContainer.js';
|
|
11
10
|
import { FontCont } from './containers/fontContainer.js';
|
|
12
11
|
import { ImageCache } from './containers/imageContainer.js';
|
|
13
|
-
import {
|
|
12
|
+
import { clearObjectProperties } from './utils/miscUtils.js';
|
|
14
13
|
|
|
15
14
|
export function clearData() {
|
|
16
15
|
inputData.clear();
|
|
17
|
-
|
|
18
|
-
|
|
16
|
+
clearObjectProperties(ocrAll);
|
|
17
|
+
ocrAll.active = [];
|
|
18
|
+
clearObjectProperties(ocrAllRaw);
|
|
19
|
+
ocrAllRaw.active = [];
|
|
19
20
|
layoutRegions.pages.length = 0;
|
|
20
21
|
layoutDataTables.pages.length = 0;
|
|
21
|
-
|
|
22
|
+
pageMetricsAll.length = 0;
|
|
22
23
|
convertPageWarn.length = 0;
|
|
23
24
|
ImageCache.clear();
|
|
24
|
-
// Clear optimized font data and reset fontAll to raw data.
|
|
25
|
-
replaceObjectProperties(fontMetricsObj);
|
|
26
25
|
FontCont.clear();
|
|
27
26
|
}
|
package/js/containers/app.js
CHANGED
|
@@ -1,9 +1,6 @@
|
|
|
1
1
|
// This file contains various objects that are imported by other modules.
|
|
2
2
|
// Everything here is essentially a global variable; none of them are technically "containers".
|
|
3
3
|
|
|
4
|
-
/** @type {Object.<string, FontMetricsFamily>} */
|
|
5
|
-
export const fontMetricsObj = {};
|
|
6
|
-
|
|
7
4
|
export class layoutRegions {
|
|
8
5
|
/** @type {Array<LayoutPage>} */
|
|
9
6
|
static pages = [];
|
|
@@ -66,7 +63,7 @@ export const ocrAll = { active: [] };
|
|
|
66
63
|
export const ocrAllRaw = { active: [] };
|
|
67
64
|
|
|
68
65
|
/** @type {Array<PageMetrics>} */
|
|
69
|
-
export const
|
|
66
|
+
export const pageMetricsAll = [];
|
|
70
67
|
|
|
71
68
|
/**
|
|
72
69
|
* Class that stores various debug data.
|