scribe.js-ocr 0.7.4 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/build-deno-compile.sh +30 -0
  2. package/cli/cli.js +46 -18
  3. package/cli/detectPDFType.js +1 -2
  4. package/cli/extract.js +14 -7
  5. package/cli/main.js +39 -39
  6. package/cli/require.js +1 -1
  7. package/cli/scribe.js +12 -11
  8. package/fonts/Dingbats.woff +0 -0
  9. package/fonts/all/URWGothicBook-Bold.woff +0 -0
  10. package/fonts/all/URWGothicBook-BoldItalic.woff +0 -0
  11. package/fonts/all/URWGothicBook-Italic.woff +0 -0
  12. package/fonts/all/URWGothicBook-Regular.woff +0 -0
  13. package/fonts/latin/URWGothicBook-Bold.woff +0 -0
  14. package/fonts/latin/URWGothicBook-BoldItalic.woff +0 -0
  15. package/fonts/latin/URWGothicBook-Italic.woff +0 -0
  16. package/fonts/latin/URWGothicBook-Regular.woff +0 -0
  17. package/js/canvasAdapter.js +4 -1
  18. package/js/clear.js +7 -8
  19. package/js/containers/app.js +2 -0
  20. package/js/containers/dataContainer.js +1 -4
  21. package/js/containers/fontContainer.js +59 -44
  22. package/js/containers/imageContainer.js +13 -35
  23. package/js/coordinates.js +3 -3
  24. package/js/debug.js +2 -2
  25. package/js/export/export.js +103 -18
  26. package/js/export/exportDebugCsv.js +4 -3
  27. package/js/export/pdf/writePdf.js +389 -0
  28. package/js/export/{writePdfFonts.js → pdf/writePdfFonts.js} +16 -12
  29. package/js/export/pdf/writePdfImages.js +218 -0
  30. package/js/export/{writePdf.js → pdf/writePdfText.js} +28 -315
  31. package/js/export/writeDocx.js +12 -5
  32. package/js/export/writeHocr.js +11 -10
  33. package/js/export/writeHtml.js +208 -48
  34. package/js/export/writeTabular.js +31 -20
  35. package/js/export/writeText.js +12 -10
  36. package/js/fontContainerMain.js +101 -50
  37. package/js/fontEval.js +18 -14
  38. package/js/fontStatistics.js +90 -90
  39. package/js/generalWorkerMain.js +52 -6
  40. package/js/global.d.ts +178 -6
  41. package/js/import/convertDocTextract.js +447 -0
  42. package/js/import/convertPageAbbyy.js +10 -4
  43. package/js/import/convertPageBlocks.js +4 -4
  44. package/js/import/convertPageGoogleVision.js +204 -0
  45. package/js/import/convertPageHocr.js +3 -3
  46. package/js/import/convertPageShared.js +1 -0
  47. package/js/import/convertPageStext.js +18 -10
  48. package/js/import/convertPageText.js +289 -0
  49. package/js/import/import.js +133 -125
  50. package/js/import/importOCR.js +98 -46
  51. package/js/import/nodeAdapter.js +2 -2
  52. package/js/modifyOCR.js +6 -5
  53. package/js/nudge.js +3 -3
  54. package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
  55. package/js/objects/imageObjects.js +3 -2
  56. package/js/objects/layoutObjects.js +37 -0
  57. package/js/objects/ocrObjects.js +51 -3
  58. package/js/recognizeConvert.js +74 -23
  59. package/js/utils/fontUtils.js +32 -1
  60. package/js/utils/imageUtils.js +99 -0
  61. package/js/utils/miscUtils.js +158 -9
  62. package/js/utils/reflowPars.js +4 -0
  63. package/js/worker/compareOCRModule.js +20 -18
  64. package/js/worker/generalWorker.js +12 -6
  65. package/js/worker/optimizeFontModule.js +19 -19
  66. package/mupdf/libmupdf.js +3 -3
  67. package/mupdf/libmupdf.wasm +0 -0
  68. package/mupdf/mupdf-async.js +1 -1
  69. package/mupdf/mupdf-worker.js +9 -4
  70. package/package.json +7 -4
  71. package/scribe.js +5 -5
  72. package/tess/tesseract.esm.min.js +1 -1
  73. package/tess/tesseract.min.js +1 -1
  74. package/tess/worker.min.js +1 -1
@@ -0,0 +1,30 @@
1
+ #!/bin/bash
2
+
3
+ ## Note:
4
+ ## The Windows build does not work and the Mac build has never been tested.
5
+ ## The Linux build should work.
6
+
7
+ # Extract version from package.json using grep and sed
8
+ VERSION=$(grep '"version"' package.json | sed -E 's/.*"version": *"([^"]+)".*/\1/')
9
+ if [ -z "$VERSION" ]; then
10
+ echo "Failed to extract version from package.json"
11
+ exit 1
12
+ fi
13
+
14
+ # Create build directory
15
+ mkdir -p build
16
+
17
+ # Build for different platforms
18
+ echo "Building for Linux x64..."
19
+ deno compile --allow-sys --allow-read --allow-net --allow-write --target x86_64-unknown-linux-gnu --output build/scribe-linux-x64 --include mupdf --include fonts --include js/worker cli/scribe.js
20
+
21
+ echo "Building for macOS x64..."
22
+ deno compile --allow-sys --allow-read --allow-net --allow-write --target x86_64-apple-darwin --output build/scribe-macos-x64 --include mupdf --include fonts --include js/worker cli/scribe.js
23
+
24
+ echo "Building for Windows x64..."
25
+ deno compile --allow-sys --allow-read --allow-net --allow-write --target x86_64-pc-windows-msvc --output build/scribe-windows-x64.exe --include mupdf --include fonts --include js/worker cli/scribe.js
26
+
27
+ # Create checksums
28
+ cd build
29
+ sha256sum * > checksums.txt
30
+ cd ..
package/cli/cli.js CHANGED
@@ -7,18 +7,36 @@ import {
7
7
  evalInternal, overlay, recognize,
8
8
  } from './main.js';
9
9
 
10
- export const confCLI = async (ocrFile) => {
11
- await conf(ocrFile);
10
+ /**
11
+ * Print confidence of Abbyy .xml file.
12
+ *
13
+ * @param {string[]} files - Paths to input files.
14
+ */
15
+ export const confCLI = async (files) => {
16
+ await conf(files);
12
17
  process.exitCode = 0;
13
18
  };
14
19
 
15
- export const checkCLI = async (pdfFile, ocrFile, options) => {
16
- await check(pdfFile, ocrFile, options);
20
+ /**
21
+ *
22
+ * @param {string[]} files - Paths to input files.
23
+ * @param {Object} options
24
+ * @param {number} [options.workers]
25
+ */
26
+ export const checkCLI = async (files, options) => {
27
+ await check(files, options);
17
28
  process.exitCode = 0;
18
29
  };
19
30
 
20
- export const evalInternalCLI = async (pdfFile, ocrFile, options) => {
21
- const { evalMetrics } = await evalInternal(pdfFile, ocrFile, options);
31
+ /**
32
+ * Evaluate internal OCR engine.
33
+ *
34
+ * @param {string[]} files - Paths to input files.
35
+ * @param {Object} options
36
+ * @param {number} [options.workers]
37
+ */
38
+ export const evalInternalCLI = async (files, options) => {
39
+ const { evalMetrics } = await evalInternal(files, options);
22
40
 
23
41
  const ignoreExtra = true;
24
42
  let metricWER;
@@ -34,14 +52,14 @@ export const evalInternalCLI = async (pdfFile, ocrFile, options) => {
34
52
 
35
53
  /**
36
54
  *
37
- * @param {string} pdfFile - Path to PDF file.
55
+ * @param {string} inputFile - Path to PDF file.
38
56
  * @param {?string} [outputDir='.'] - Output directory.
39
57
  * @param {Object} [options]
40
58
  * @param {"pdf" | "hocr" | "docx" | "xlsx" | "txt" | "text" | "html"} [options.format]
41
59
  * @param {boolean} [options.reflow]
42
60
  */
43
- export const extractCLI = async (pdfFile, outputDir, options) => {
44
- await extract(pdfFile, outputDir, options);
61
+ export const extractCLI = async (inputFile, outputDir, options) => {
62
+ await extract(inputFile, outputDir, options);
45
63
  process.exitCode = 0;
46
64
  };
47
65
 
@@ -57,28 +75,38 @@ export const detectPDFTypeCLI = async (pdfFile, outputPath) => {
57
75
 
58
76
  /**
59
77
  *
60
- * @param {string} pdfFile - Path to PDF file.
61
- * @param {*} ocrFile
62
- * @param {*} outputDir
78
+ * @param {string[]} files - Paths to input files.
63
79
  * @param {Object} options
80
+ * @param {string} [options.output] - Output directory for the resulting PDF.
64
81
  * @param {boolean} [options.robust]
65
82
  * @param {boolean} [options.conf]
66
83
  * @param {boolean} [options.vis]
67
84
  * @param {number} [options.workers]
68
85
  */
69
- export const overlayCLI = async (pdfFile, ocrFile, outputDir, options) => {
86
+ export const overlayCLI = async (files, options) => {
70
87
  options.overlayMode = options.vis ? 'proof' : 'invis';
71
- await overlay(pdfFile, ocrFile, outputDir, options);
88
+ await overlay(files, options.output, options);
72
89
  process.exitCode = 0;
73
90
  };
74
91
 
75
- export const recognizeCLI = async (pdfFile, options) => {
92
+ /**
93
+ *
94
+ * @param {string[]} files - Paths to input files.
95
+ * @param {*} options
96
+ */
97
+ export const recognizeCLI = async (files, options) => {
76
98
  options.overlayMode = options.vis ? 'proof' : 'invis';
77
- await recognize(pdfFile, options);
99
+ await recognize(files, options);
78
100
  process.exitCode = 0;
79
101
  };
80
102
 
81
- export const debugCLI = async (pdfFile, outputDir, options) => {
82
- await debug(pdfFile, outputDir, options);
103
+ /**
104
+ *
105
+ * @param {string[]} files - Paths to input files.
106
+ * @param {*} outputDir
107
+ * @param {*} options
108
+ */
109
+ export const debugCLI = async (files, outputDir, options) => {
110
+ await debug(files, outputDir, options);
83
111
  process.exitCode = 0;
84
112
  };
@@ -1,4 +1,4 @@
1
- import fs from 'fs';
1
+ import fs from 'node:fs';
2
2
  import scribe from '../scribe.js';
3
3
 
4
4
  /**
@@ -30,5 +30,4 @@ export const detectPDFType = async (pdfFile, outputPath) => {
30
30
  console.log('PDF Type:', type);
31
31
 
32
32
  mupdfScheduler.scheduler.terminate();
33
-
34
33
  };
package/cli/extract.js CHANGED
@@ -1,28 +1,35 @@
1
- import fs from 'fs';
2
- import path from 'path';
1
+ import fs from 'node:fs';
2
+ import path from 'node:path';
3
3
  import scribe from '../scribe.js';
4
4
 
5
+ // TODO: Consider whether this should exist and whether it should be combined into a larger CLI utility.
6
+ // This was originally created to provide a simple interface to extract existing text from a PDF file,
7
+ // however it now does other things, and this should likely be part of a larger `convert` utility.
8
+
5
9
  /**
6
10
  *
7
- * @param {string} pdfFile - Path to PDF file.
11
+ * @param {string} inputFile - Path to input file.
8
12
  * @param {?string} [output='.'] - Output file or directory.
9
13
  * @param {Object} [options]
10
14
  * @param {Parameters<typeof scribe.download>[0]} [options.format]
11
15
  * @param {boolean} [options.reflow]
12
16
  */
13
- export const extract = async (pdfFile, output, options) => {
17
+ export const extract = async (inputFile, output, options) => {
14
18
  const format = options?.format || 'txt';
15
19
 
16
20
  output = output || '.';
17
21
  const outputDir = path.dirname(output);
18
- const outputFile = outputDir === output ? `${path.basename(pdfFile).replace(/\.\w{1,5}$/i, `.${format}`)}` : path.basename(output);
22
+ const outputFile = outputDir === output ? `${path.basename(inputFile).replace(/\.\w{1,6}$/i, `.${format}`)}` : path.basename(output);
19
23
  const outputPath = `${outputDir}/${outputFile}`;
20
24
 
21
25
  scribe.opt.reflow = true;
22
26
  scribe.opt.extractText = true;
27
+ scribe.opt.displayMode = 'ebook';
23
28
 
24
- await scribe.init();
25
- await scribe.importFiles([pdfFile]);
29
+ // TODO: Fonts do not need to be loaded for .txt output, but are needed for .pdf output.
30
+ // so a more robust implementation would consider the arguments and only load fonts if necessary.
31
+ await scribe.init({ font: true });
32
+ await scribe.importFiles([inputFile]);
26
33
 
27
34
  if (outputDir) fs.mkdirSync(outputDir, { recursive: true });
28
35
 
package/cli/main.js CHANGED
@@ -1,8 +1,8 @@
1
1
  // Code for adding visualization to OCR output
2
2
  // Use: `node addOverlay.js [PDF file] [OCR data file] [output directory]`
3
3
 
4
- import fs from 'fs';
5
- import path from 'path';
4
+ import fs from 'node:fs';
5
+ import path from 'node:path';
6
6
 
7
7
  import scribe from '../scribe.js';
8
8
 
@@ -13,8 +13,7 @@ scribe.opt.saveDebugImages = debugMode;
13
13
  /**
14
14
  * @param {string} func
15
15
  * @param {Object} params
16
- * @param {string} [params.pdfFile]
17
- * @param {string} [params.ocrFile]
16
+ * @param {string[]} [params.files]
18
17
  * @param {string} [params.outputDir]
19
18
  * @param {Array<string>} [params.list]
20
19
  * @param {boolean} [params.robustConfMode]
@@ -26,6 +25,10 @@ scribe.opt.saveDebugImages = debugMode;
26
25
  async function main(func, params) {
27
26
  scribe.opt.workerN = params.workerN || null;
28
27
 
28
+ if (!params.files || params.files.length === 0) {
29
+ throw new Error('No input files provided.');
30
+ }
31
+
29
32
  await scribe.init({
30
33
  pdf: true,
31
34
  ocr: true,
@@ -39,15 +42,9 @@ async function main(func, params) {
39
42
 
40
43
  const output = {};
41
44
 
42
- const files = [];
43
- if (params.pdfFile) files.push(params.pdfFile);
44
- if (params.ocrFile) files.push(params.ocrFile);
45
- await scribe.importFiles(files);
45
+ await scribe.importFiles(params.files);
46
46
 
47
- const backgroundArg = params.pdfFile;
48
- const backgroundStem = backgroundArg ? path.basename(backgroundArg).replace(/\.\w{1,5}$/i, '') : undefined;
49
- const ocrStem = params.ocrFile ? path.basename(params.ocrFile).replace(/\.\w{1,5}$/i, '') : undefined;
50
- const outputStem = backgroundStem || ocrStem || 'output';
47
+ const outputStem = scribe.inputData.defaultDownloadFileName.replace(/\.\w{1,6}$/i, '') || 'output';
51
48
 
52
49
  const outputDir = params.outputDir || '.';
53
50
 
@@ -75,23 +72,29 @@ async function main(func, params) {
75
72
  }
76
73
  }
77
74
 
78
- if (['overlay', 'recognize'].includes(func) && backgroundArg) {
75
+ if (['overlay', 'recognize'].includes(func) && (scribe.inputData.pdfMode || scribe.inputData.imageMode)) {
79
76
  let outputSuffix = '';
80
77
  if (scribe.opt.displayMode === 'proof') {
81
78
  outputSuffix = '_vis';
82
79
  } else if (scribe.opt.displayMode === 'invis') {
83
- const resolvedInputFile = path.dirname(path.resolve(backgroundArg));
84
- const resolvedOutputDir = path.resolve(outputDir);
85
- if (resolvedInputFile === resolvedOutputDir) {
86
- outputSuffix = '_ocr';
80
+ // Check if output file would overwrite any input file, and if so, add a suffix to avoid overwriting.
81
+ // This software is still in development--nobody should be overwriting input files.
82
+ const resolvedOutputFileTmp = path.resolve(`${outputDir}/${outputStem}.pdf`);
83
+ for (let i = 0; i < params.files.length; i++) {
84
+ const resolvedInputFile = path.resolve(params.files[i]);
85
+ if (resolvedInputFile === resolvedOutputFileTmp) {
86
+ outputSuffix = '_ocr';
87
+ console.log(`Saving output with ${outputSuffix} suffix to avoid overwriting input: ${resolvedInputFile}`);
88
+ break;
89
+ }
87
90
  }
88
91
  }
89
92
 
90
- const outputPath = `${outputDir}/${path.basename(backgroundArg).replace(/\.\w{1,5}$/i, `${outputSuffix}.pdf`)}`;
93
+ const outputPath = path.resolve(`${outputDir}/${outputStem}${outputSuffix}.pdf`);
91
94
  await scribe.download('pdf', outputPath);
92
95
 
93
96
  if (params.hocr) {
94
- const outputPathHocr = `${outputDir}/${path.basename(backgroundArg).replace(/\.\w{1,5}$/i, '.hocr')}`;
97
+ const outputPathHocr = path.resolve(`${outputDir}/${outputStem}.hocr`);
95
98
  await scribe.download('hocr', outputPathHocr);
96
99
  }
97
100
  }
@@ -100,7 +103,7 @@ async function main(func, params) {
100
103
  const debugDir = `${outputDir}/${outputStem}_debug`;
101
104
  fs.mkdirSync(debugDir, { recursive: true });
102
105
  const outputPathCsv = `${debugDir}/_debug.csv`;
103
- scribe.utils.writeDebugCsv(scribe.data.ocr.active, outputPathCsv);
106
+ scribe.utils.writeDebugCsv({ pages: scribe.data.ocr.active, fileName: outputPathCsv });
104
107
 
105
108
  scribe.utils.dumpDebugImages(debugDir);
106
109
  scribe.utils.dumpHOCR(debugDir);
@@ -123,33 +126,30 @@ async function main(func, params) {
123
126
  /**
124
127
  * Print confidence of Abbyy .xml file.
125
128
  *
126
- * @param {string} ocrFile
129
+ * @param {string[]} files - Paths to input files.
127
130
  */
128
- export const conf = async (ocrFile) => (main('conf', { ocrFile }));
131
+ export const conf = async (files) => (main('conf', { files }));
129
132
 
130
133
  /**
131
134
  *
132
- * @param {string} pdfFile - Path to PDF file.
133
- * @param {string} ocrFile
135
+ * @param {string[]} files - Paths to input files.
134
136
  * @param {Object} options
135
137
  * @param {number} [options.workers]
136
138
  */
137
- export const check = async (pdfFile, ocrFile, options) => (main('check', { pdfFile, ocrFile, workerN: options?.workers }));
139
+ export const check = async (files, options) => (main('check', { files, workerN: options?.workers }));
138
140
 
139
141
  /**
140
142
  * Evaluate internal OCR engine.
141
143
  *
142
- * @param {string} pdfFile - Path to PDF file.
143
- * @param {string} ocrFile - Path to OCR file containing ground truth.
144
+ * @param {string[]} files - Paths to input files.
144
145
  * @param {Object} options
145
146
  * @param {number} [options.workers]
146
147
  */
147
- export const evalInternal = async (pdfFile, ocrFile, options) => (main('eval', { pdfFile, ocrFile, workerN: options?.workers }));
148
+ export const evalInternal = async (files, options) => (main('eval', { files, workerN: options?.workers }));
148
149
 
149
150
  /**
150
151
  *
151
- * @param {string} pdfFile - Path to PDF file.
152
- * @param {*} ocrFile
152
+ * @param {string[]} files - Paths to input files.
153
153
  * @param {*} outputDir
154
154
  * @param {Object} options
155
155
  * @param {boolean} [options.robust]
@@ -157,29 +157,29 @@ export const evalInternal = async (pdfFile, ocrFile, options) => (main('eval', {
157
157
  * @param {"eval" | "ebook" | "proof" | "invis"} [options.overlayMode]
158
158
  * @param {number} [options.workers]
159
159
  */
160
- export const overlay = async (pdfFile, ocrFile, outputDir, options) => (main('overlay', {
161
- pdfFile, ocrFile, outputDir, robustConfMode: options?.robust || false, printConf: options?.conf || false, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers,
160
+ export const overlay = async (files, outputDir, options) => (main('overlay', {
161
+ files, outputDir, robustConfMode: options?.robust || false, printConf: options?.conf || false, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers,
162
162
  }));
163
163
 
164
164
  /**
165
165
  *
166
- * @param {string} pdfFile - Path to PDF file.
166
+ * @param {string[]} files - Paths to input files.
167
167
  * @param {Object} options
168
+ * @param {string} [options.output]
168
169
  * @param {"eval" | "ebook" | "proof" | "invis"} [options.overlayMode]
169
170
  * @param {boolean} [options.hocr]
170
171
  * @param {number} [options.workers]
171
172
  */
172
- export const recognize = async (pdfFile, options) => (main('recognize', {
173
- pdfFile, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers, hocr: options?.hocr,
173
+ export const recognize = async (files, options) => (main('recognize', {
174
+ files, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers, hocr: options?.hocr, outputDir: options?.output || '.',
174
175
  }));
175
176
 
176
177
  /**
177
178
  *
178
- * @param {string} pdfFile - Path to PDF file.
179
+ * @param {string[]} files - Paths to input files.
179
180
  * @param {*} outputDir
180
181
  * @param {*} options
181
- * @returns
182
182
  */
183
- export const debug = async (pdfFile, outputDir, options) => (main('debug', {
184
- pdfFile, outputDir, list: options?.list,
183
+ export const debug = async (files, outputDir, options) => (main('debug', {
184
+ files, outputDir, list: options?.list,
185
185
  }));
package/cli/require.js CHANGED
@@ -2,6 +2,6 @@
2
2
  // While we will switch everything to worker modules eventually, Firefox still does not support them:
3
3
  // https://developer.mozilla.org/en-US/docs/Web/API/Worker#browser_compatibility
4
4
  // Therefore, we use a dynamic import statement to run the following code only in the Node.js version.
5
- const { createRequire } = await import('module');
5
+ const { createRequire } = await import('node:module');
6
6
  globalThis.require = createRequire(import.meta.url);
7
7
  globalThis.__dirname = import.meta.url;
package/cli/scribe.js CHANGED
@@ -1,3 +1,5 @@
1
+ #!/usr/bin/env node
2
+
1
3
  import { Command, Option } from 'commander';
2
4
 
3
5
  import {
@@ -18,23 +20,21 @@ program
18
20
 
19
21
  program
20
22
  .command('check')
21
- .argument('<pdf_file>', 'Input PDF file.')
22
- .argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
23
23
  .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
24
+ .argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
24
25
  .description('Calculate confidence metric for OCR data by running Tesseract OCR and comparing results.')
25
26
  .action(checkCLI);
26
27
 
27
28
  program
28
29
  .command('eval')
29
- .argument('<pdf_file>', 'Input PDF file.')
30
- .argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
31
30
  .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
31
+ .argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
32
32
  .description('Evaluate internal OCR engine by recognizing document (provided PDF file), and comparing to ground truth (provided OCR file).')
33
33
  .action(evalInternalCLI);
34
34
 
35
35
  program
36
36
  .command('extract')
37
- .argument('<pdf_file>', 'Input PDF file.')
37
+ .argument('<input_file>', 'Input PDF file.')
38
38
  .argument('[output]', 'Output directory or file to save results.', '.')
39
39
  .addOption(new Option('-f, --format <ext>', 'Output format.').choices(['pdf', 'hocr', 'docx', 'xlsx', 'txt', 'text', 'html']).default('txt'))
40
40
  .option('-r, --reflow', 'Reflow text by combining lines into paragraphs.')
@@ -43,23 +43,23 @@ program
43
43
 
44
44
  program
45
45
  .command('overlay')
46
- .argument('<pdf_file>', 'Input PDF file.')
47
- .argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
48
- .argument('[output_dir]', 'Directory for output file(s).', '.')
46
+ .option('-o, --output <directory>', 'Directory for output file(s). Default is current directory.')
49
47
  .option('-v, --vis', 'Print OCR text visibly over provided PDF file with colors coded by confidence.')
50
48
  .option('-c, --conf', 'Print average confidence metric for document.')
51
49
  .option('-r, --robust', 'Generate confidence metrics by running Tesseract OCR and comparing, rather than using confidence info in provided data.')
52
50
  .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
51
+ .argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
53
52
  .description('Add OCR data to provided PDF file and save result as PDF.')
54
53
  .action(overlayCLI);
55
54
 
56
55
  program
57
56
  .command('recognize')
58
- .argument('<pdf_file>', 'Input PDF file.')
59
- .description('Recognize text in PDF file using internal OCR engine.')
57
+ .option('-o, --output <directory>', 'Directory for output file(s). Default is current directory.')
60
58
  .option('-v, --vis', 'Print OCR text visibly over provided PDF file with colors coded by confidence.')
61
59
  .option('-h, --hocr', 'Output .hocr intermediate data in addition to .pdf.')
62
60
  .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
61
+ .argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
62
+ .description('Recognize text in PDF file using internal OCR engine.')
63
63
  .action(recognizeCLI);
64
64
 
65
65
  program
@@ -71,9 +71,10 @@ program
71
71
 
72
72
  program
73
73
  .command('debug')
74
- .argument('<pdf_file>', 'Input PDF file.')
74
+ .option('-o, --output <directory>', 'Directory for output file(s). Default is current directory.')
75
75
  .argument('[output_dir]', 'Directory for output file(s).', '.')
76
76
  .option('--list <items>', 'Comma separated list of visualizations to include.', (value) => value.split(','))
77
+ .argument('<files...>', 'Input PDF file and OCR file(s). Accepts .hocr and Abbyy .xml (with character-level data enabled).')
77
78
  .description('Generate and write Tesseract debugging images.')
78
79
  .action(debugCLI);
79
80
 
Binary file
@@ -28,6 +28,9 @@ export class ca {
28
28
  if (typeof process === 'undefined') {
29
29
  return new OffscreenCanvas(width, height);
30
30
  }
31
+ if (!width || !height || width <= 0 || height <= 0) {
32
+ throw new Error(`Invalid canvas size: ${width}x${height}`);
33
+ }
31
34
  const canvasKit = await ca.getCanvasKit();
32
35
  return canvasKit.MakeCanvas(width, height);
33
36
  };
@@ -80,7 +83,7 @@ export class ca {
80
83
  } else {
81
84
  const dummyCanvas = await ca.dummyCanvasPromise;
82
85
 
83
- const fs = await import('fs');
86
+ const fs = await import('node:fs');
84
87
  const fontBuffer = typeof fontObj.src === 'string' ? fs.readFileSync(fontObj.src) : fontObj.src;
85
88
 
86
89
  dummyCanvas.loadFont(fontBuffer, {
package/js/clear.js CHANGED
@@ -1,27 +1,26 @@
1
1
  import { inputData } from './containers/app.js';
2
2
  import {
3
3
  convertPageWarn,
4
- fontMetricsObj,
5
4
  layoutDataTables,
6
5
  layoutRegions,
7
6
  ocrAll,
8
7
  ocrAllRaw,
9
- pageMetricsArr,
8
+ pageMetricsAll,
10
9
  } from './containers/dataContainer.js';
11
10
  import { FontCont } from './containers/fontContainer.js';
12
11
  import { ImageCache } from './containers/imageContainer.js';
13
- import { replaceObjectProperties } from './utils/miscUtils.js';
12
+ import { clearObjectProperties } from './utils/miscUtils.js';
14
13
 
15
14
  export function clearData() {
16
15
  inputData.clear();
17
- replaceObjectProperties(ocrAll, { active: [] });
18
- replaceObjectProperties(ocrAllRaw, { active: [] });
16
+ clearObjectProperties(ocrAll);
17
+ ocrAll.active = [];
18
+ clearObjectProperties(ocrAllRaw);
19
+ ocrAllRaw.active = [];
19
20
  layoutRegions.pages.length = 0;
20
21
  layoutDataTables.pages.length = 0;
21
- pageMetricsArr.length = 0;
22
+ pageMetricsAll.length = 0;
22
23
  convertPageWarn.length = 0;
23
24
  ImageCache.clear();
24
- // Clear optimized font data and reset fontAll to raw data.
25
- replaceObjectProperties(fontMetricsObj);
26
25
  FontCont.clear();
27
26
  }
@@ -31,6 +31,8 @@ export class opt {
31
31
 
32
32
  static removeMargins = false;
33
33
 
34
+ static includeImages = false;
35
+
34
36
  static pageBreaks = true;
35
37
 
36
38
  /** @type {("invis"|"ebook"|"eval"|"proof")} */
@@ -1,9 +1,6 @@
1
1
  // This file contains various objects that are imported by other modules.
2
2
  // Everything here is essentially a global variable; none of them are technically "containers".
3
3
 
4
- /** @type {Object.<string, FontMetricsFamily>} */
5
- export const fontMetricsObj = {};
6
-
7
4
  export class layoutRegions {
8
5
  /** @type {Array<LayoutPage>} */
9
6
  static pages = [];
@@ -66,7 +63,7 @@ export const ocrAll = { active: [] };
66
63
  export const ocrAllRaw = { active: [] };
67
64
 
68
65
  /** @type {Array<PageMetrics>} */
69
- export const pageMetricsArr = [];
66
+ export const pageMetricsAll = [];
70
67
 
71
68
  /**
72
69
  * Class that stores various debug data.