scribe.js-ocr 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/cli.js +5 -4
- package/cli/main.js +13 -7
- package/cli/scribe.js +4 -0
- package/js/containers/app.js +15 -0
- package/js/containers/fontContainer.js +11 -0
- package/js/containers/imageContainer.js +3 -3
- package/js/export/export.js +4 -0
- package/js/export/writePdf.js +3 -2
- package/js/extractPDFText.js +3 -2
- package/js/fontContainerMain.js +17 -2
- package/js/fontEval.js +8 -3
- package/js/fontSupp.js +1 -1
- package/js/generalWorkerMain.js +9 -7
- package/js/global.d.ts +1 -0
- package/js/import/convertPageStext.js +124 -38
- package/js/import/import.js +5 -2
- package/js/objects/ocrObjects.js +32 -0
- package/js/recognizeConvert.js +19 -5
- package/js/utils/fontUtils.js +10 -2
- package/js/utils/miscUtils.js +1 -1
- package/js/worker/compareOCRModule.js +21 -4
- package/js/worker/optimizeFontModule.js +6 -3
- package/mupdf/libmupdf.wasm +0 -0
- package/mupdf/mupdf-async.js +1 -1
- package/mupdf/mupdf-worker.js +3 -1
- package/package.json +1 -1
- package/scribe.js +1 -1
package/cli/cli.js
CHANGED
|
@@ -11,13 +11,13 @@ export const confCLI = async (ocrFile) => {
|
|
|
11
11
|
process.exitCode = 0;
|
|
12
12
|
};
|
|
13
13
|
|
|
14
|
-
export const checkCLI = async (pdfFile, ocrFile) => {
|
|
15
|
-
await check(pdfFile, ocrFile);
|
|
14
|
+
export const checkCLI = async (pdfFile, ocrFile, options) => {
|
|
15
|
+
await check(pdfFile, ocrFile, options);
|
|
16
16
|
process.exitCode = 0;
|
|
17
17
|
};
|
|
18
18
|
|
|
19
|
-
export const evalInternalCLI = async (pdfFile, ocrFile) => {
|
|
20
|
-
const { evalMetrics } = await evalInternal(pdfFile, ocrFile);
|
|
19
|
+
export const evalInternalCLI = async (pdfFile, ocrFile, options) => {
|
|
20
|
+
const { evalMetrics } = await evalInternal(pdfFile, ocrFile, options);
|
|
21
21
|
|
|
22
22
|
const ignoreExtra = true;
|
|
23
23
|
let metricWER;
|
|
@@ -53,6 +53,7 @@ export const extractCLI = async (pdfFile, outputDir, options) => {
|
|
|
53
53
|
* @param {boolean} [options.robust]
|
|
54
54
|
* @param {boolean} [options.conf]
|
|
55
55
|
* @param {boolean} [options.vis]
|
|
56
|
+
* @param {number} [options.workers]
|
|
56
57
|
*/
|
|
57
58
|
export const overlayCLI = async (pdfFile, ocrFile, outputDir, options) => {
|
|
58
59
|
options.overlayMode = options.vis ? 'proof' : 'invis';
|
package/cli/main.js
CHANGED
|
@@ -21,9 +21,11 @@ scribe.opt.saveDebugImages = debugMode;
|
|
|
21
21
|
* @param {boolean} [params.robustConfMode]
|
|
22
22
|
* @param {boolean} [params.printConf]
|
|
23
23
|
* @param {"eval" | "ebook" | "proof" | "invis"} [params.overlayMode]
|
|
24
|
-
*
|
|
24
|
+
* @param {number} [params.workerN]
|
|
25
25
|
*/
|
|
26
26
|
async function main(func, params) {
|
|
27
|
+
scribe.opt.workerN = params.workerN || null;
|
|
28
|
+
|
|
27
29
|
await scribe.init({
|
|
28
30
|
pdf: true,
|
|
29
31
|
ocr: true,
|
|
@@ -118,16 +120,20 @@ export const conf = async (ocrFile) => (main('conf', { ocrFile }));
|
|
|
118
120
|
*
|
|
119
121
|
* @param {string} pdfFile - Path to PDF file.
|
|
120
122
|
* @param {string} ocrFile
|
|
123
|
+
* @param {Object} options
|
|
124
|
+
* @param {number} [options.workers]
|
|
121
125
|
*/
|
|
122
|
-
export const check = async (pdfFile, ocrFile) => (main('check', { pdfFile, ocrFile }));
|
|
126
|
+
export const check = async (pdfFile, ocrFile, options) => (main('check', { pdfFile, ocrFile, workerN: options?.workers }));
|
|
123
127
|
|
|
124
128
|
/**
|
|
125
129
|
* Evaluate internal OCR engine.
|
|
126
130
|
*
|
|
127
131
|
* @param {string} pdfFile - Path to PDF file.
|
|
128
132
|
* @param {string} ocrFile - Path to OCR file containing ground truth.
|
|
133
|
+
* @param {Object} options
|
|
134
|
+
* @param {number} [options.workers]
|
|
129
135
|
*/
|
|
130
|
-
export const evalInternal = async (pdfFile, ocrFile) => (main('eval', { pdfFile, ocrFile }));
|
|
136
|
+
export const evalInternal = async (pdfFile, ocrFile, options) => (main('eval', { pdfFile, ocrFile, workerN: options?.workers }));
|
|
131
137
|
|
|
132
138
|
/**
|
|
133
139
|
*
|
|
@@ -138,10 +144,10 @@ export const evalInternal = async (pdfFile, ocrFile) => (main('eval', { pdfFile,
|
|
|
138
144
|
* @param {boolean} [options.robust]
|
|
139
145
|
* @param {boolean} [options.conf]
|
|
140
146
|
* @param {"eval" | "ebook" | "proof" | "invis"} [options.overlayMode]
|
|
141
|
-
* @
|
|
147
|
+
* @param {number} [options.workers]
|
|
142
148
|
*/
|
|
143
149
|
export const overlay = async (pdfFile, ocrFile, outputDir, options) => (main('overlay', {
|
|
144
|
-
pdfFile, ocrFile, outputDir, robustConfMode: options?.robust || false, printConf: options?.conf || false, overlayMode: options?.overlayMode || 'invis',
|
|
150
|
+
pdfFile, ocrFile, outputDir, robustConfMode: options?.robust || false, printConf: options?.conf || false, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers,
|
|
145
151
|
}));
|
|
146
152
|
|
|
147
153
|
/**
|
|
@@ -149,9 +155,9 @@ export const overlay = async (pdfFile, ocrFile, outputDir, options) => (main('ov
|
|
|
149
155
|
* @param {string} pdfFile - Path to PDF file.
|
|
150
156
|
* @param {Object} options
|
|
151
157
|
* @param {"eval" | "ebook" | "proof" | "invis"} [options.overlayMode]
|
|
152
|
-
* @
|
|
158
|
+
* @param {number} [options.workers]
|
|
153
159
|
*/
|
|
154
|
-
export const recognize = async (pdfFile, options) => (main('recognize', { pdfFile, overlayMode: options?.overlayMode || 'invis' }));
|
|
160
|
+
export const recognize = async (pdfFile, options) => (main('recognize', { pdfFile, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers }));
|
|
155
161
|
|
|
156
162
|
/**
|
|
157
163
|
*
|
package/cli/scribe.js
CHANGED
|
@@ -19,6 +19,7 @@ program
|
|
|
19
19
|
.command('check')
|
|
20
20
|
.argument('<pdf_file>', 'Input PDF file.')
|
|
21
21
|
.argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
|
|
22
|
+
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
|
|
22
23
|
.description('Calculate confidence metric for OCR data by running Tesseract OCR and comparing results.')
|
|
23
24
|
.action(checkCLI);
|
|
24
25
|
|
|
@@ -26,6 +27,7 @@ program
|
|
|
26
27
|
.command('eval')
|
|
27
28
|
.argument('<pdf_file>', 'Input PDF file.')
|
|
28
29
|
.argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
|
|
30
|
+
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
|
|
29
31
|
.description('Evaluate internal OCR engine by recognizing document (provided PDF file), and comparing to ground truth (provided OCR file).')
|
|
30
32
|
.action(evalInternalCLI);
|
|
31
33
|
|
|
@@ -46,6 +48,7 @@ program
|
|
|
46
48
|
.option('-v, --vis', 'Print OCR text visibly over provided PDF file with colors coded by confidence.')
|
|
47
49
|
.option('-c, --conf', 'Print average confidence metric for document.')
|
|
48
50
|
.option('-r, --robust', 'Generate confidence metrics by running Tesseract OCR and comparing, rather than using confidence info in provided data.')
|
|
51
|
+
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
|
|
49
52
|
.description('Add OCR data to provided PDF file and save result as PDF.')
|
|
50
53
|
.action(overlayCLI);
|
|
51
54
|
|
|
@@ -54,6 +57,7 @@ program
|
|
|
54
57
|
.argument('<pdf_file>', 'Input PDF file.')
|
|
55
58
|
.description('Recognize text in PDF file using internal OCR engine.')
|
|
56
59
|
.option('-v, --vis', 'Print OCR text visibly over provided PDF file with colors coded by confidence.')
|
|
60
|
+
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
|
|
57
61
|
.action(recognizeCLI);
|
|
58
62
|
|
|
59
63
|
program
|
package/js/containers/app.js
CHANGED
|
@@ -61,6 +61,18 @@ export class opt {
|
|
|
61
61
|
static extractPDFFonts = false;
|
|
62
62
|
|
|
63
63
|
static calcSuppFontInfo = false;
|
|
64
|
+
|
|
65
|
+
static usePDFTextSupp = true;
|
|
66
|
+
|
|
67
|
+
static usePDFTextMain = true;
|
|
68
|
+
|
|
69
|
+
/**
|
|
70
|
+
* Number of workers to use. Must be set prior to initialization.
|
|
71
|
+
* If set to `null` (default), the number of workers will be set up to 6 (browser) or 8 (node),
|
|
72
|
+
* if the system has enough resources.
|
|
73
|
+
* @type {?number}
|
|
74
|
+
*/
|
|
75
|
+
static workerN = null;
|
|
64
76
|
}
|
|
65
77
|
|
|
66
78
|
export class inputData {
|
|
@@ -70,6 +82,9 @@ export class inputData {
|
|
|
70
82
|
/** `true` if user uploaded pdf */
|
|
71
83
|
static pdfMode = false;
|
|
72
84
|
|
|
85
|
+
/** @type {?('text'|'ocr'|'image')} */
|
|
86
|
+
static pdfType = null;
|
|
87
|
+
|
|
73
88
|
/** `true` if user uploaded image files (.png, .jpeg) */
|
|
74
89
|
static imageMode = false;
|
|
75
90
|
|
|
package/js/containers/fontContainer.js
CHANGED
|
@@ -263,6 +263,13 @@ export class FontCont {
|
|
|
263
263
|
|
|
264
264
|
static sansDefaultName = 'NimbusSans';
|
|
265
265
|
|
|
266
|
+
/**
|
|
267
|
+
* If `false`, 'Courier' will not be cleaned to Nimbus Mono.
|
|
268
|
+
* This setting is useful because Tesseract sometimes misidentifies fonts as Courier, and when not the document default, Nimbus Mono is almost always incorrect.
|
|
269
|
+
* Even with this setting `false`, Nimbus Mono will still be used when the font is exactly 'NimbusMono' and Nimbus Mono can still be the document default font.
|
|
270
|
+
*/
|
|
271
|
+
static enableCleanToNimbusMono = false;
|
|
272
|
+
|
|
266
273
|
/** @type {?('latin'|'all')} */
|
|
267
274
|
static glyphSet = null;
|
|
268
275
|
|
|
@@ -337,6 +344,8 @@ export class FontCont {
|
|
|
337
344
|
family = 'Carlito';
|
|
338
345
|
} else if (/Calibri/i.test(family)) {
|
|
339
346
|
family = 'Carlito';
|
|
347
|
+
} else if (/Courier/i.test(family) && FontCont.enableCleanToNimbusMono) {
|
|
348
|
+
family = 'NimbusMono';
|
|
340
349
|
}
|
|
341
350
|
}
|
|
342
351
|
|
|
@@ -379,6 +388,8 @@ export class FontCont {
|
|
|
379
388
|
FontCont.rawMetrics = null;
|
|
380
389
|
FontCont.optMetrics = null;
|
|
381
390
|
|
|
391
|
+
FontCont.enableCleanToNimbusMono = false;
|
|
392
|
+
|
|
382
393
|
FontCont.defaultFontName = 'SerifDefault';
|
|
383
394
|
FontCont.serifDefaultName = 'NimbusRomNo9L';
|
|
384
395
|
FontCont.sansDefaultName = 'NimbusSans';
|
|
package/js/containers/imageContainer.js
CHANGED
|
@@ -159,9 +159,6 @@ export class ImageCache {
|
|
|
159
159
|
image: false,
|
|
160
160
|
};
|
|
161
161
|
|
|
162
|
-
/** @type {?('text'|'ocr'|'image')} */
|
|
163
|
-
static pdfType = null;
|
|
164
|
-
|
|
165
162
|
static colorModeDefault = 'gray';
|
|
166
163
|
|
|
167
164
|
/**
|
|
@@ -196,6 +193,9 @@ export class ImageCache {
|
|
|
196
193
|
|
|
197
194
|
const workersPromiseArr = range(0, scheduler.workers.length - 1).map(async (x) => {
|
|
198
195
|
const w = scheduler.workers[x];
|
|
196
|
+
|
|
197
|
+
if (w.pdfDoc) await w.freeDocument(w.pdfDoc);
|
|
198
|
+
|
|
199
199
|
// The ArrayBuffer is transferred to the worker, so a new one must be created for each worker.
|
|
200
200
|
// const fileData = await file.arrayBuffer();
|
|
201
201
|
const fileDataCopy = fileData.slice(0);
|
package/js/export/export.js
CHANGED
|
@@ -143,6 +143,8 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
|
|
|
143
143
|
doc1: pdfOverlay, minpage: minValue, maxpage: maxValue, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, humanReadable: opt.humanReadablePDF,
|
|
144
144
|
});
|
|
145
145
|
}
|
|
146
|
+
|
|
147
|
+
w.freeDocument(pdfOverlay);
|
|
146
148
|
} else {
|
|
147
149
|
const pdfStr = await writePdf(ocrDownload, minValue, maxValue, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
|
|
148
150
|
opt.overlayOpacity / 100);
|
|
@@ -169,6 +171,8 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
|
|
|
169
171
|
content = await w.write({
|
|
170
172
|
doc1: pdf, minpage: minValue, maxpage: maxValue, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, humanReadable: opt.humanReadablePDF,
|
|
171
173
|
});
|
|
174
|
+
|
|
175
|
+
w.freeDocument(pdf);
|
|
172
176
|
}
|
|
173
177
|
} else if (format === 'hocr') {
|
|
174
178
|
content = writeHocr(ocrAll.active, minValue, maxValue);
|
package/js/export/writePdf.js
CHANGED
|
@@ -534,10 +534,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
534
534
|
const wordSpaceNextAdj = (wordNext.bbox.left - wordJ.bbox.right) / cosAngle;
|
|
535
535
|
// const wordSpaceNextAdj = wordNext.bbox.left - wordBox.right;
|
|
536
536
|
|
|
537
|
-
const
|
|
537
|
+
const wordGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
|
|
538
|
+
const wordGlyphMetrics = wordGlyph.getMetrics();
|
|
538
539
|
const wordNextGlyphMetrics = wordFontOpentype.charToGlyph(wordNext.text.substr(0, 1)).getMetrics();
|
|
539
540
|
|
|
540
|
-
const wordRightBearing = wordJ.visualCoords ? wordGlyphMetrics.
|
|
541
|
+
const wordRightBearing = wordJ.visualCoords ? (wordGlyph.advanceWidth - wordGlyphMetrics.xMax) * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
|
|
541
542
|
|
|
542
543
|
const wordNextLeftBearing = wordNext.visualCoords ? wordNextGlyphMetrics.xMin * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
|
|
543
544
|
|
package/js/extractPDFText.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { inputData } from './containers/app.js';
|
|
1
2
|
import { ocrAll, ocrAllRaw } from './containers/dataContainer.js';
|
|
2
3
|
import { ImageCache } from './containers/imageContainer.js';
|
|
3
4
|
import { convertOCR } from './recognizeConvert.js';
|
|
@@ -83,7 +84,7 @@ export const extractInternalPDFText = async (options = {}) => {
|
|
|
83
84
|
|
|
84
85
|
const res = await extractInternalPDFTextRaw();
|
|
85
86
|
|
|
86
|
-
|
|
87
|
+
inputData.pdfType = res.type;
|
|
87
88
|
ocrAllRaw.pdf = res.contentRaw;
|
|
88
89
|
|
|
89
90
|
if (!extractPDFTextImage && res.type === 'image') return res;
|
|
@@ -102,7 +103,7 @@ export const extractInternalPDFText = async (options = {}) => {
|
|
|
102
103
|
const format = 'stext';
|
|
103
104
|
|
|
104
105
|
// Process HOCR using web worker, reading from file first if that has not been done already
|
|
105
|
-
await convertOCR(ocrAllRaw.
|
|
106
|
+
await convertOCR(ocrAllRaw.pdf, true, format, 'pdf', false);
|
|
106
107
|
|
|
107
108
|
res.content = ocrAll.pdf;
|
|
108
109
|
|
package/js/fontContainerMain.js
CHANGED
|
@@ -39,6 +39,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
|
|
|
39
39
|
let /** @type {Promise<ArrayBuffer>} */nimbusSansNormal;
|
|
40
40
|
let /** @type {Promise<ArrayBuffer>} */nimbusSansItalic;
|
|
41
41
|
let /** @type {Promise<ArrayBuffer>} */nimbusSansBold;
|
|
42
|
+
let /** @type {Promise<ArrayBuffer>} */nimbusMonoNormal;
|
|
43
|
+
let /** @type {Promise<ArrayBuffer>} */nimbusMonoItalic;
|
|
44
|
+
let /** @type {Promise<ArrayBuffer>} */nimbusMonoBold;
|
|
42
45
|
if (typeof process === 'undefined') {
|
|
43
46
|
if (glyphSet === 'latin') {
|
|
44
47
|
carlitoNormal = fetch(new URL('../fonts/latin/Carlito-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
@@ -59,6 +62,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
|
|
|
59
62
|
nimbusSansNormal = fetch(new URL('../fonts/latin/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
60
63
|
nimbusSansItalic = fetch(new URL('../fonts/latin/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
61
64
|
nimbusSansBold = fetch(new URL('../fonts/latin/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
65
|
+
nimbusMonoNormal = fetch(new URL('../fonts/latin/NimbusMonoPS-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
66
|
+
nimbusMonoItalic = fetch(new URL('../fonts/latin/NimbusMonoPS-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
67
|
+
nimbusMonoBold = fetch(new URL('../fonts/latin/NimbusMonoPS-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
62
68
|
} else {
|
|
63
69
|
carlitoNormal = fetch(new URL('../fonts/all/Carlito-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
64
70
|
carlitoItalic = fetch(new URL('../fonts/all/Carlito-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
@@ -78,6 +84,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
|
|
|
78
84
|
nimbusSansNormal = fetch(new URL('../fonts/all/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
79
85
|
nimbusSansItalic = fetch(new URL('../fonts/all/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
80
86
|
nimbusSansBold = fetch(new URL('../fonts/all/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
87
|
+
nimbusMonoNormal = fetch(new URL('../fonts/all/NimbusMonoPS-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
88
|
+
nimbusMonoItalic = fetch(new URL('../fonts/all/NimbusMonoPS-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
89
|
+
nimbusMonoBold = fetch(new URL('../fonts/all/NimbusMonoPS-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
81
90
|
}
|
|
82
91
|
} else {
|
|
83
92
|
const { readFile } = await import('fs/promises');
|
|
@@ -99,6 +108,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
|
|
|
99
108
|
nimbusSansNormal = readFile(new URL('../fonts/all_ttf/NimbusSans-Regular.ttf', import.meta.url)).then((res) => res.buffer);
|
|
100
109
|
nimbusSansItalic = readFile(new URL('../fonts/all_ttf/NimbusSans-Italic.ttf', import.meta.url)).then((res) => res.buffer);
|
|
101
110
|
nimbusSansBold = readFile(new URL('../fonts/all_ttf/NimbusSans-Bold.ttf', import.meta.url)).then((res) => res.buffer);
|
|
111
|
+
nimbusMonoNormal = readFile(new URL('../fonts/all_ttf/NimbusMonoPS-Regular.ttf', import.meta.url)).then((res) => res.buffer);
|
|
112
|
+
nimbusMonoItalic = readFile(new URL('../fonts/all_ttf/NimbusMonoPS-Italic.ttf', import.meta.url)).then((res) => res.buffer);
|
|
113
|
+
nimbusMonoBold = readFile(new URL('../fonts/all_ttf/NimbusMonoPS-Bold.ttf', import.meta.url)).then((res) => res.buffer);
|
|
102
114
|
}
|
|
103
115
|
|
|
104
116
|
const srcObj = {
|
|
@@ -108,6 +120,7 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
|
|
|
108
120
|
Palatino: { normal: await palatinoNormal, italic: await palatinoItalic, bold: await palatinoBold },
|
|
109
121
|
NimbusRomNo9L: { normal: await nimbusRomNo9LNormal, italic: await nimbusRomNo9LItalic, bold: await nimbusRomNo9LBold },
|
|
110
122
|
NimbusSans: { normal: await nimbusSansNormal, italic: await nimbusSansItalic, bold: await nimbusSansBold },
|
|
123
|
+
NimbusMono: { normal: await nimbusMonoNormal, italic: await nimbusMonoItalic, bold: await nimbusMonoBold },
|
|
111
124
|
};
|
|
112
125
|
|
|
113
126
|
FontCont.raw = await /** @type {FontContainer} */(/** @type {any} */(loadFontsFromSource(srcObj)));
|
|
@@ -256,7 +269,7 @@ export async function setUploadFontsWorker(scheduler) {
|
|
|
256
269
|
/** @type {Object<string, fontSrcBuiltIn|fontSrcUpload>} */
|
|
257
270
|
const fontsUpload = {};
|
|
258
271
|
for (const [key, value] of Object.entries(FontCont.active)) {
|
|
259
|
-
if (!['Carlito', 'Century', 'Garamond', 'Palatino', 'NimbusRomNo9L', 'NimbusSans'].includes(key)) {
|
|
272
|
+
if (!['Carlito', 'Century', 'Garamond', 'Palatino', 'NimbusRomNo9L', 'NimbusSans', 'NimbusMono'].includes(key)) {
|
|
260
273
|
fontsUpload[key] = {
|
|
261
274
|
normal: value?.normal?.src, italic: value?.italic?.src, bold: value?.bold?.src,
|
|
262
275
|
};
|
|
@@ -381,8 +394,9 @@ export async function optimizeFontContainerAll(fontPrivate, fontMetricsObj) {
|
|
|
381
394
|
const palatinoPromise = optimizeFontContainerFamily(fontPrivate.Palatino, fontMetricsObj);
|
|
382
395
|
const nimbusRomNo9LPromise = optimizeFontContainerFamily(fontPrivate.NimbusRomNo9L, fontMetricsObj);
|
|
383
396
|
const nimbusSansPromise = optimizeFontContainerFamily(fontPrivate.NimbusSans, fontMetricsObj);
|
|
397
|
+
const nimbusMonoPromise = optimizeFontContainerFamily(fontPrivate.NimbusMono, fontMetricsObj);
|
|
384
398
|
|
|
385
|
-
const results = await Promise.all([carlitoPromise, centuryPromise, garamondPromise, palatinoPromise, nimbusRomNo9LPromise, nimbusSansPromise]);
|
|
399
|
+
const results = await Promise.all([carlitoPromise, centuryPromise, garamondPromise, palatinoPromise, nimbusRomNo9LPromise, nimbusSansPromise, nimbusMonoPromise]);
|
|
386
400
|
|
|
387
401
|
if (results.every((x) => x === null)) return null;
|
|
388
402
|
|
|
@@ -393,5 +407,6 @@ export async function optimizeFontContainerAll(fontPrivate, fontMetricsObj) {
|
|
|
393
407
|
Palatino: results[3],
|
|
394
408
|
NimbusRomNo9L: results[4],
|
|
395
409
|
NimbusSans: results[5],
|
|
410
|
+
NimbusMono: results[6],
|
|
396
411
|
};
|
|
397
412
|
}
|
package/js/fontEval.js
CHANGED
|
@@ -50,6 +50,7 @@ export async function evaluateFonts(pageArr, opt) {
|
|
|
50
50
|
const evalPalatino = !!(opt ? FontCont.opt?.Palatino : FontCont.raw?.Palatino);
|
|
51
51
|
const evalGaramond = !!(opt ? FontCont.opt?.Garamond : FontCont.raw?.Garamond);
|
|
52
52
|
const evalNimbusRomNo9L = !!(opt ? FontCont.opt?.NimbusRomNo9L : FontCont.raw?.NimbusRomNo9L);
|
|
53
|
+
const evalNimbusMono = !!(opt ? FontCont.opt?.NimbusMono : FontCont.raw?.NimbusMono);
|
|
53
54
|
|
|
54
55
|
// The browser version runs in parallel using workers, however the Node.js version runs sequentially,
|
|
55
56
|
// as the canvas package does not support workers, and trying to run in parallel causes problems.
|
|
@@ -63,6 +64,7 @@ export async function evaluateFonts(pageArr, opt) {
|
|
|
63
64
|
palatino: evalPalatino ? evalPagesFont('Palatino', pageArr, opt) : null,
|
|
64
65
|
garamond: evalGaramond ? evalPagesFont('Garamond', pageArr, opt) : null,
|
|
65
66
|
nimbusRomNo9L: evalNimbusRomNo9L ? evalPagesFont('NimbusRomNo9L', pageArr, opt) : null,
|
|
67
|
+
nimbusMono: evalNimbusMono ? evalPagesFont('NimbusMono', pageArr, opt) : null,
|
|
66
68
|
};
|
|
67
69
|
|
|
68
70
|
fontMetricsTmp = {
|
|
@@ -72,6 +74,7 @@ export async function evaluateFonts(pageArr, opt) {
|
|
|
72
74
|
palatino: await fontMetricsPromises.palatino,
|
|
73
75
|
garamond: await fontMetricsPromises.garamond,
|
|
74
76
|
nimbusRomNo9L: await fontMetricsPromises.nimbusRomNo9L,
|
|
77
|
+
nimbusMono: await fontMetricsPromises.nimbusMono,
|
|
75
78
|
};
|
|
76
79
|
} else {
|
|
77
80
|
fontMetricsTmp = {
|
|
@@ -81,6 +84,7 @@ export async function evaluateFonts(pageArr, opt) {
|
|
|
81
84
|
palatino: evalPalatino ? await evalPagesFont('Palatino', pageArr, opt) : null,
|
|
82
85
|
garamond: evalGaramond ? await evalPagesFont('Garamond', pageArr, opt) : null,
|
|
83
86
|
nimbusRomNo9L: evalNimbusRomNo9L ? await evalPagesFont('NimbusRomNo9L', pageArr, opt) : null,
|
|
87
|
+
nimbusMono: evalNimbusMono ? await evalPagesFont('NimbusMono', pageArr, opt) : null,
|
|
84
88
|
};
|
|
85
89
|
}
|
|
86
90
|
|
|
@@ -91,6 +95,7 @@ export async function evaluateFonts(pageArr, opt) {
|
|
|
91
95
|
Palatino: fontMetricsTmp.palatino ? fontMetricsTmp.palatino.metricTotal / fontMetricsTmp.palatino.wordsTotal : null,
|
|
92
96
|
Garamond: fontMetricsTmp.garamond ? fontMetricsTmp.garamond.metricTotal / fontMetricsTmp.garamond.wordsTotal : null,
|
|
93
97
|
NimbusRomNo9L: fontMetricsTmp.nimbusRomNo9L ? fontMetricsTmp.nimbusRomNo9L.metricTotal / fontMetricsTmp.nimbusRomNo9L.wordsTotal : null,
|
|
98
|
+
NimbusMono: fontMetricsTmp.nimbusMono ? fontMetricsTmp.nimbusMono.metricTotal / fontMetricsTmp.nimbusMono.wordsTotal : null,
|
|
94
99
|
};
|
|
95
100
|
|
|
96
101
|
return fontMetrics;
|
|
@@ -106,7 +111,7 @@ const calcBestFonts = (fontMetrics) => {
|
|
|
106
111
|
|
|
107
112
|
for (const [key, value] of Object.entries(fontMetrics)) {
|
|
108
113
|
if (!['Carlito', 'NimbusSans'].includes(key)) continue;
|
|
109
|
-
if (value < minValueSans) {
|
|
114
|
+
if (value && value < minValueSans) {
|
|
110
115
|
minValueSans = value;
|
|
111
116
|
minKeySans = key;
|
|
112
117
|
}
|
|
@@ -116,8 +121,8 @@ const calcBestFonts = (fontMetrics) => {
|
|
|
116
121
|
let minValueSerif = Number.MAX_VALUE;
|
|
117
122
|
|
|
118
123
|
for (const [key, value] of Object.entries(fontMetrics)) {
|
|
119
|
-
if (!['Century', 'Palatino', 'Garamond', 'NimbusRomNo9L'].includes(key)) continue;
|
|
120
|
-
if (value < minValueSerif) {
|
|
124
|
+
if (!['Century', 'Palatino', 'Garamond', 'NimbusRomNo9L', 'NimbusMono'].includes(key)) continue;
|
|
125
|
+
if (value && value < minValueSerif) {
|
|
121
126
|
minValueSerif = value;
|
|
122
127
|
minKeySerif = key;
|
|
123
128
|
}
|
package/js/fontSupp.js
CHANGED
|
@@ -159,7 +159,7 @@ export const calcSuppFontInfo = async (ocrArr) => {
|
|
|
159
159
|
for (const line of page.lines) {
|
|
160
160
|
for (const word of line.words) {
|
|
161
161
|
if (word.font && word.size && FontProps.sizeMult[word.font]) {
|
|
162
|
-
word.size
|
|
162
|
+
word.size = Math.round(word.size * FontProps.sizeMult[word.font] * 1000) / 1000;
|
|
163
163
|
}
|
|
164
164
|
}
|
|
165
165
|
}
|
package/js/generalWorkerMain.js
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { opt } from './containers/app.js';
|
|
2
|
+
|
|
1
3
|
/**
|
|
2
4
|
* Initializes a general worker and returns an object with methods controlled by the worker.
|
|
3
5
|
* @returns {Promise} A promise that resolves to an object with control methods.
|
|
@@ -265,14 +267,14 @@ export class gs {
|
|
|
265
267
|
gs.#resReady = resolve;
|
|
266
268
|
});
|
|
267
269
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
// Node.js version only uses 1 worker.
|
|
273
|
-
let workerN = 1;
|
|
274
|
-
if (typeof process === 'undefined') {
|
|
270
|
+
let workerN;
|
|
271
|
+
if (opt.workerN) {
|
|
272
|
+
workerN = opt.workerN;
|
|
273
|
+
} else if (typeof process === 'undefined') {
|
|
275
274
|
workerN = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 6);
|
|
275
|
+
} else {
|
|
276
|
+
const cpuN = Math.floor((await import('os')).cpus().length / 2);
|
|
277
|
+
workerN = Math.min(cpuN - 1, 8);
|
|
276
278
|
}
|
|
277
279
|
|
|
278
280
|
const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
|
package/js/global.d.ts
CHANGED
|
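(The +1 -0 hunk for package/js/global.d.ts is missing from this extract.)
package/js/import/convertPageStext.js
CHANGED
|
@@ -50,6 +50,10 @@ export async function convertPageStext({ ocrStr, n }) {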
|
|
|
50
50
|
const xmlLinePreChar = xmlLine.match(/^[\s\S]*?(?=<char)/)?.[0];
|
|
51
51
|
if (!xmlLinePreChar) return;
|
|
52
52
|
|
|
53
|
+
const dirStr = xmlLinePreChar.match(/dir=['"]([^'"]*)/)?.[1];
|
|
54
|
+
const dirSlopeStr = dirStr?.match(/[-\d.]+$/)?.[0];
|
|
55
|
+
const dirSlope = dirSlopeStr ? parseFloat(dirSlopeStr) : null;
|
|
56
|
+
|
|
53
57
|
const xmlLineFormatting = xmlLinePreChar?.match(/<font[^>]+/)?.[0];
|
|
54
58
|
const fontName = xmlLineFormatting?.match(/name=['"]([^'"]*)/)?.[1];
|
|
55
59
|
const fontSizeStr = xmlLineFormatting?.match(/size=['"]([^'"]*)/)?.[1];
|
|
@@ -81,7 +85,7 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
81
85
|
/** @type {Array<Array<{left: number, top: number, right: number, bottom: number}>>} */
|
|
82
86
|
const bboxes = [];
|
|
83
87
|
|
|
84
|
-
|
|
88
|
+
let baselineFirstDone = false;
|
|
85
89
|
const baselineFirst = /** @type {Array<Number>} */ ([]);
|
|
86
90
|
|
|
87
91
|
let baselineCurrent = 0;
|
|
@@ -114,17 +118,72 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
114
118
|
/** @type {Array<boolean>} */
|
|
115
119
|
const superArr = [];
|
|
116
120
|
|
|
117
|
-
|
|
121
|
+
/**
|
|
122
|
+
* @typedef {Object} Point
|
|
123
|
+
* @property {number} x - The x coordinate.
|
|
124
|
+
* @property {number} y - The y coordinate.
|
|
125
|
+
*/
|
|
126
|
+
|
|
127
|
+
/**
|
|
128
|
+
* @typedef {Object} Quad
|
|
129
|
+
* @property {Point} ul - Upper left corner.
|
|
130
|
+
* @property {Point} ur - Upper right corner.
|
|
131
|
+
* @property {Point} ll - Lower left corner.
|
|
132
|
+
* @property {Point} lr - Lower right corner.
|
|
133
|
+
*/
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* @typedef {Object} StextChar
|
|
137
|
+
* @property {Quad} quad
|
|
138
|
+
* @property {Point} origin
|
|
139
|
+
* @property {string} text
|
|
140
|
+
*/
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* @typedef {Object} StextFont
|
|
144
|
+
* @property {string} name
|
|
145
|
+
* @property {number} size
|
|
146
|
+
*/
|
|
147
|
+
|
|
148
|
+
const wordCharOrFontArr = /** @type {Array<Array<StextChar|StextFont>>} */([]);
|
|
118
149
|
for (let i = 0; i < wordStrArr.length; i++) {
|
|
119
150
|
// Fonts can be changed at any point in the word string.
|
|
120
151
|
// Sometimes the font is changed before a space character, and othertimes it is changed after the space character.
|
|
121
152
|
// This regex splits the string into elements that contain either (1) a font change or (2) a character.
|
|
122
153
|
// The "quad" attribute includes 8 numbers (x and y coordinates for all 4 corners) however we only use capturing groups for 4
|
|
123
|
-
const stextCharRegex = /(<font[^>]+>\s*)|<char quad=['"](\s*[\d.-]+)(\s*[\d.-]+)(
|
|
124
|
-
|
|
154
|
+
const stextCharRegex = /(<font[^>]+>\s*)|<char quad=['"](\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)[^>]*?x=['"]([\d.-]+)[^>]*?y=['"]([\d.-]+)['"][^>]*?c=['"]([^'"]+)['"]\s*\/>/ig;
|
|
155
|
+
|
|
156
|
+
const stextMatches = [...wordStrArr[i].matchAll(stextCharRegex)];
|
|
157
|
+
|
|
158
|
+
wordCharOrFontArr[i] = [];
|
|
159
|
+
for (let j = 0; j < stextMatches.length; j++) {
|
|
160
|
+
const fontStr = stextMatches[j][1];
|
|
161
|
+
const fontNameStrI = fontStr?.match(/name=['"]([^'"]*)/)?.[1];
|
|
162
|
+
const fontSizeStrI = fontStr?.match(/size=['"]([^'"]*)/)?.[1];
|
|
163
|
+
if (fontNameStrI && fontSizeStrI) {
|
|
164
|
+
wordCharOrFontArr[i][j] = {
|
|
165
|
+
name: fontNameStrI,
|
|
166
|
+
size: parseFloat(fontSizeStrI),
|
|
167
|
+
};
|
|
168
|
+
continue;
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
const quad = {
|
|
172
|
+
ul: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) },
|
|
173
|
+
ur: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) },
|
|
174
|
+
ll: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) },
|
|
175
|
+
lr: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) },
|
|
176
|
+
};
|
|
177
|
+
|
|
178
|
+
wordCharOrFontArr[i][j] = {
|
|
179
|
+
quad,
|
|
180
|
+
origin: { x: parseFloat(stextMatches[j][10]), y: parseFloat(stextMatches[j][11]) },
|
|
181
|
+
text: stextMatches[j][12],
|
|
182
|
+
};
|
|
183
|
+
}
|
|
125
184
|
}
|
|
126
185
|
|
|
127
|
-
for (let i = 0; i <
|
|
186
|
+
for (let i = 0; i < wordCharOrFontArr.length; i++) {
|
|
128
187
|
let textWordArr = [];
|
|
129
188
|
let bboxesWordArr = [];
|
|
130
189
|
let fontFamily = familyCurrent || fontFamilyLine || 'Default';
|
|
@@ -137,28 +196,38 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
137
196
|
let smallCapsWordAltTitleCaseAdj = false;
|
|
138
197
|
let styleWord = 'normal';
|
|
139
198
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
if (letterOrFontArr.length === 0) continue;
|
|
199
|
+
if (wordCharOrFontArr[i].length === 0) continue;
|
|
143
200
|
|
|
144
201
|
let wordInit = false;
|
|
145
202
|
|
|
146
|
-
for (let j = 0; j <
|
|
147
|
-
const
|
|
148
|
-
|
|
149
|
-
const fontSizeStrI = fontStr?.match(/size=['"]([^'"]*)/)?.[1];
|
|
150
|
-
const baseline = parseFloat(letterOrFontArr[j][6]);
|
|
151
|
-
if (fontNameStrI && fontSizeStrI) {
|
|
203
|
+
for (let j = 0; j < wordCharOrFontArr[i].length; j++) {
|
|
204
|
+
const charOrFont = wordCharOrFontArr[i][j];
|
|
205
|
+
if ('name' in charOrFont) {
|
|
152
206
|
// While small caps can be printed using special "small caps" fonts, they can also be printed using a regular font with a size change.
|
|
153
207
|
// This block of code detects small caps printed in title case by checking for a decrease in font size after the first letter.
|
|
154
208
|
// TODO: This logic currently fails when:
|
|
155
209
|
// (1) Runs of small caps include punctuation, which is printed at the full size (and therefore is counted as a size increase ending small caps).
|
|
156
210
|
// (2) Runs of small caps that start with lower-case letters, which do not conform to the expectation that runs of small caps start with a capital letter.
|
|
157
211
|
const sizePrevRaw = sizeCurrentRaw;
|
|
158
|
-
sizeCurrentRaw =
|
|
212
|
+
sizeCurrentRaw = charOrFont.size;
|
|
159
213
|
const secondLetter = wordInit && textWordArr.length === 1 && /[A-Z]/.test(textWordArr[0]);
|
|
160
|
-
|
|
161
|
-
|
|
214
|
+
|
|
215
|
+
let baselineNextLetter;
|
|
216
|
+
const possibleNextLetter1 = wordCharOrFontArr[i][j + 1];
|
|
217
|
+
const possibleNextLetter2 = wordCharOrFontArr[i + 1]?.[0];
|
|
218
|
+
const possibleNextLetter3 = wordCharOrFontArr[i + 1]?.[1];
|
|
219
|
+
const possibleNextLetter4 = wordCharOrFontArr[i + 1]?.[2];
|
|
220
|
+
|
|
221
|
+
if (possibleNextLetter1 && 'origin' in possibleNextLetter1) {
|
|
222
|
+
baselineNextLetter = possibleNextLetter1.origin.y;
|
|
223
|
+
} else if (possibleNextLetter2 && 'origin' in possibleNextLetter2) {
|
|
224
|
+
baselineNextLetter = possibleNextLetter2.origin.y;
|
|
225
|
+
} else if (possibleNextLetter3 && 'origin' in possibleNextLetter3) {
|
|
226
|
+
baselineNextLetter = possibleNextLetter3.origin.y;
|
|
227
|
+
} else if (possibleNextLetter4 && 'origin' in possibleNextLetter4) {
|
|
228
|
+
baselineNextLetter = possibleNextLetter4.origin.y;
|
|
229
|
+
}
|
|
230
|
+
|
|
162
231
|
const fontSizeMin = Math.min(sizeCurrentRaw, sizePrevRaw);
|
|
163
232
|
const baselineDelta = (baselineNextLetter - baselineCurrent) / fontSizeMin;
|
|
164
233
|
const sizeDelta = (sizeCurrentRaw - sizePrevRaw) / fontSizeMin;
|
|
@@ -177,7 +246,13 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
177
246
|
bboxes.push(bboxesWordArr);
|
|
178
247
|
styleArr.push(styleWord);
|
|
179
248
|
fontFamilyArr.push(fontFamily);
|
|
180
|
-
|
|
249
|
+
|
|
250
|
+
if (sizeDelta > 0) {
|
|
251
|
+
fontSizeArr.push(sizePrevRaw);
|
|
252
|
+
} else {
|
|
253
|
+
fontSizeArr.push(fontSizeWord);
|
|
254
|
+
}
|
|
255
|
+
|
|
181
256
|
smallCapsArr.push(smallCapsWord);
|
|
182
257
|
smallCapsAltArr.push(smallCapsWordAlt);
|
|
183
258
|
smallCapsAltTitleCaseArr.push(smallCapsWordAltTitleCaseAdj);
|
|
@@ -187,21 +262,25 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
187
262
|
bboxesWordArr = [];
|
|
188
263
|
}
|
|
189
264
|
|
|
190
|
-
// If the first word was determined to be a superscript, reset `baselineFirst` to avoid skewing the slope calculation.
|
|
191
265
|
if (sizeDelta > 0) {
|
|
192
|
-
baselineFirst
|
|
193
|
-
|
|
266
|
+
// If the first word was determined to be a superscript, reset `baselineFirst` to avoid skewing the slope calculation.
|
|
267
|
+
if (!baselineFirstDone) baselineFirst.length = 0;
|
|
268
|
+
familyCurrent = charOrFont.name || familyCurrent;
|
|
194
269
|
sizeCurrent = sizeCurrentRaw || sizeCurrent;
|
|
195
270
|
fontSizeWord = sizeCurrent;
|
|
196
271
|
fontFamily = familyCurrent;
|
|
197
272
|
superArr[superArr.length - 1] = true;
|
|
198
|
-
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
// If `baselineFirst` was set using a non-superscript word, mark it as done.
|
|
276
|
+
if (superArr.length > 0 && !superArr[superArr.length - 1] && baselineFirst.length > 0) {
|
|
277
|
+
baselineFirstDone = true;
|
|
199
278
|
}
|
|
200
279
|
|
|
201
280
|
superCurrent = sizeDelta < 0;
|
|
202
281
|
} else {
|
|
203
282
|
sizeCurrent = sizeCurrentRaw || sizeCurrent;
|
|
204
|
-
familyCurrent =
|
|
283
|
+
familyCurrent = charOrFont.name || familyCurrent;
|
|
205
284
|
// Update current word only if this is before every letter in the word.
|
|
206
285
|
if (textWordArr.length === 0) {
|
|
207
286
|
fontSizeWord = sizeCurrent;
|
|
@@ -210,7 +289,7 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
210
289
|
// An increase in font size ends any small caps sequence.
|
|
211
290
|
// A threshold is necessary because stext data has been observed to have small variations without a clear reason.
|
|
212
291
|
// eslint-disable-next-line no-lonely-if
|
|
213
|
-
if (Math.abs(sizeDelta) > 0.05) {
|
|
292
|
+
if (Number.isFinite(sizeDelta) && Math.abs(sizeDelta) > 0.05) {
|
|
214
293
|
smallCapsCurrentAlt = false;
|
|
215
294
|
if (textWordArr.length === 0) {
|
|
216
295
|
superCurrent = false;
|
|
@@ -222,14 +301,14 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
222
301
|
|
|
223
302
|
// Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
|
|
224
303
|
smallCapsCurrentAlt = smallCapsCurrentAlt ?? smallCapsAltArr[smallCapsAltArr.length - 1];
|
|
225
|
-
smallCapsCurrent = /(small\W?cap)|(sc$)|(caps$)/i.test(
|
|
304
|
+
smallCapsCurrent = /(small\W?cap)|(sc$)|(caps$)/i.test(charOrFont.name);
|
|
226
305
|
smallCapsWord = smallCapsCurrent;
|
|
227
306
|
|
|
228
|
-
if (/italic/i.test(
|
|
307
|
+
if (/italic/i.test(charOrFont.name) || /-\w*ital/i.test(charOrFont.name)) {
|
|
229
308
|
// The word is already initialized, so we need to change the last element of the style array.
|
|
230
309
|
// Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
|
|
231
310
|
styleCurrent = 'italic';
|
|
232
|
-
} else if (/bold|black/i.test(
|
|
311
|
+
} else if (/bold|black/i.test(charOrFont.name)) {
|
|
233
312
|
styleCurrent = 'bold';
|
|
234
313
|
} else {
|
|
235
314
|
styleCurrent = 'normal';
|
|
@@ -237,7 +316,7 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
237
316
|
|
|
238
317
|
continue;
|
|
239
318
|
} else {
|
|
240
|
-
baselineCurrent =
|
|
319
|
+
baselineCurrent = charOrFont.origin.y;
|
|
241
320
|
}
|
|
242
321
|
|
|
243
322
|
if (!wordInit) {
|
|
@@ -246,24 +325,22 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
246
325
|
}
|
|
247
326
|
|
|
248
327
|
const bbox = {
|
|
249
|
-
left: Math.round(
|
|
250
|
-
top: Math.round(
|
|
251
|
-
right: Math.round(
|
|
252
|
-
bottom: Math.round(
|
|
328
|
+
left: Math.round(charOrFont.origin.x),
|
|
329
|
+
top: Math.round(Math.min(charOrFont.quad.ul.y, charOrFont.quad.ur.y)),
|
|
330
|
+
right: Math.round(charOrFont.origin.x + (charOrFont.quad.ur.x - charOrFont.quad.ul.x)),
|
|
331
|
+
bottom: Math.round(Math.max(charOrFont.quad.ll.y, charOrFont.quad.lr.y)),
|
|
253
332
|
};
|
|
254
333
|
|
|
255
334
|
if (!superCurrent) {
|
|
256
335
|
if (baselineFirst.length === 0) {
|
|
257
|
-
baselineFirst.push(bbox.left,
|
|
258
|
-
} else {
|
|
259
|
-
baselineSlopeArr.push((baseline - baselineFirst[1]) / (bbox.left - baselineFirst[0]));
|
|
336
|
+
baselineFirst.push(bbox.left, charOrFont.origin.y);
|
|
260
337
|
}
|
|
261
338
|
}
|
|
262
339
|
|
|
263
340
|
// Small caps created by reducing font size can carry forward across multiple words.
|
|
264
341
|
smallCapsCurrentAlt = smallCapsCurrentAlt ?? smallCapsAltArr[smallCapsAltArr.length - 1];
|
|
265
342
|
|
|
266
|
-
textWordArr.push(
|
|
343
|
+
textWordArr.push(charOrFont.text);
|
|
267
344
|
|
|
268
345
|
bboxesWordArr.push(bbox);
|
|
269
346
|
}
|
|
@@ -288,13 +365,19 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
288
365
|
}
|
|
289
366
|
|
|
290
367
|
superArr.push(superCurrent);
|
|
368
|
+
if (superCurrent) fontSizeArr[fontSizeArr.length - 1] = sizeCurrentRaw;
|
|
291
369
|
}
|
|
292
370
|
|
|
293
371
|
// Return if there are no letters in the line.
|
|
294
372
|
// This commonly happens for "lines" that contain only space characters.
|
|
295
373
|
if (bboxes.length === 0) return;
|
|
296
374
|
|
|
297
|
-
|
|
375
|
+
let baselineSlope = 0;
|
|
376
|
+
if (dirSlope !== null) {
|
|
377
|
+
baselineSlope = dirSlope;
|
|
378
|
+
} else {
|
|
379
|
+
console.log('Unable to parse slope.');
|
|
380
|
+
}
|
|
298
381
|
|
|
299
382
|
const lineBbox = {
|
|
300
383
|
left: lineBoxArr[0], top: lineBoxArr[1], right: lineBoxArr[2], bottom: lineBoxArr[3],
|
|
@@ -427,7 +510,10 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
427
510
|
|
|
428
511
|
for (let i = 0; i < lineStrArr.length; i++) {
|
|
429
512
|
const angle = convertLineStext(lineStrArr[i]);
|
|
430
|
-
|
|
513
|
+
// The `Math.abs(angle) < 0.3` condition avoids vertical text impacting the angle calculation.
|
|
514
|
+
// The page angle is intended to account for page skew, not different orientations (90/180/270 degrees).
|
|
515
|
+
// TODO: Eventually different orientations should be supported.
|
|
516
|
+
if (typeof angle === 'number' && !Number.isNaN(angle) && Math.abs(angle) < 0.3) angleRisePage.push(angle);
|
|
431
517
|
}
|
|
432
518
|
|
|
433
519
|
if (parLineArr.length === 0) return;
|
package/js/import/import.js
CHANGED
|
@@ -447,9 +447,12 @@ export async function importFiles(files, options = {}) {
|
|
|
447
447
|
});
|
|
448
448
|
} else if (inputData.pdfMode && (extractPDFTextNative || extractPDFTextOCR)) {
|
|
449
449
|
await extractInternalPDFText({
|
|
450
|
-
setActive:
|
|
450
|
+
setActive: opt.usePDFTextMain, extractPDFTextNative, extractPDFTextOCR, extractPDFTextImage,
|
|
451
451
|
});
|
|
452
|
-
if (opt.
|
|
452
|
+
if (opt.usePDFTextMain) {
|
|
453
|
+
if (inputData.pdfType === 'text') FontCont.enableCleanToNimbusMono = true;
|
|
454
|
+
if (opt.calcSuppFontInfo) await calcSuppFontInfo(ocrAll.pdf);
|
|
455
|
+
}
|
|
453
456
|
}
|
|
454
457
|
}
|
|
455
458
|
|
package/js/objects/ocrObjects.js
CHANGED
|
@@ -644,6 +644,37 @@ function cloneChar(char) {
|
|
|
644
644
|
return charNew;
|
|
645
645
|
}
|
|
646
646
|
|
|
647
|
+
/**
 * Gets words that match the provided text.
 *
 * The query is trimmed, lower-cased, and split on runs of whitespace. A run of
 * consecutive page words is a match when the words, joined by single spaces and
 * lower-cased, contain the normalized query as a substring. Partial-word matches
 * are therefore allowed (e.g. "ick bro" matches the words "quick brown").
 *
 * Each matching word is returned at most once, in page order, even when several
 * overlapping runs match (e.g. the query "a a" against the words "a a a").
 *
 * @param {string} text - Text to search for. Whitespace-only text yields no matches.
 * @param {OcrPage} ocrPage - Page whose words (from `ocr.getPageWords`) are searched.
 * @returns {Array} The word objects belonging to at least one matching run.
 */
function getMatchingWords(text, ocrPage) {
  // Split on any whitespace run so that repeated spaces/tabs in the query do not
  // produce empty tokens, which would inflate the window size and prevent a match.
  const queryTokens = text.trim().toLowerCase().split(/\s+/).filter(Boolean);

  if (queryTokens.length === 0) return [];

  // Normalized query: tokens separated by exactly one space, mirroring how the
  // candidate text is assembled below.
  const query = queryTokens.join(' ');

  const wordArr = ocr.getPageWords(ocrPage);

  // A Set keeps insertion (page) order while dropping words shared by overlapping runs.
  const matchSet = new Set();

  const lastStart = wordArr.length - queryTokens.length;
  for (let i = 0; i <= lastStart; i++) {
    // Cheap pre-filter: the first word of the run must contain the first query token.
    if (!wordArr[i].text.toLowerCase().includes(queryTokens[0])) continue;

    const candArr = wordArr.slice(i, i + queryTokens.length);
    const candText = candArr.map((x) => x.text).join(' ').toLowerCase();

    if (candText.includes(query)) {
      for (const cand of candArr) matchSet.add(cand);
    }
  }

  return [...matchSet];
}
|
|
677
|
+
|
|
647
678
|
/**
|
|
648
679
|
* Gets word IDs that match the provided text.
|
|
649
680
|
* @param {string} text
|
|
@@ -729,6 +760,7 @@ const ocr = {
|
|
|
729
760
|
getPageWord,
|
|
730
761
|
getPageWords,
|
|
731
762
|
getDistinctChars,
|
|
763
|
+
getMatchingWords,
|
|
732
764
|
getMatchingWordIds,
|
|
733
765
|
getPageText,
|
|
734
766
|
getParText,
|
package/js/recognizeConvert.js
CHANGED
|
@@ -529,14 +529,21 @@ export async function recognize(options = {}) {
|
|
|
529
529
|
if (langs.includes('rus') || langs.includes('ukr') || langs.includes('ell')) fontPromiseArr.push(loadBuiltInFontsRaw('all'));
|
|
530
530
|
await Promise.all(fontPromiseArr);
|
|
531
531
|
|
|
532
|
-
|
|
533
|
-
|
|
532
|
+
let forceMainData = false;
|
|
533
|
+
let existingOCR;
|
|
534
|
+
if (ocrAll['User Upload']) {
|
|
535
|
+
existingOCR = ocrAll['User Upload'];
|
|
536
|
+
} else if (opt.usePDFTextSupp && ocrAll.pdf) {
|
|
537
|
+
existingOCR = ocrAll.pdf;
|
|
538
|
+
// If the PDF text is not the active data, it is assumed to be for supplemental purposes only.
|
|
539
|
+
forceMainData = ocrAll.pdf !== ocrAll.active;
|
|
540
|
+
}
|
|
534
541
|
|
|
535
542
|
// A single Tesseract engine can be used (Legacy or LSTM) or the results from both can be used and combined.
|
|
536
543
|
if (oemMode === 'legacy' || oemMode === 'lstm') {
|
|
537
544
|
// Tesseract is used as the "main" data unless user-uploaded data exists and only the LSTM model is being run.
|
|
538
545
|
// This is because Tesseract Legacy provides very strong metrics, and Abbyy often does not.
|
|
539
|
-
await recognizeAllPages(oemMode === 'legacy', oemMode === 'lstm', !
|
|
546
|
+
await recognizeAllPages(oemMode === 'legacy', oemMode === 'lstm', !existingOCR, langs, vanillaMode);
|
|
540
547
|
|
|
541
548
|
// Metrics from the LSTM model are so inaccurate they are not worth using.
|
|
542
549
|
if (oemMode === 'legacy') {
|
|
@@ -544,7 +551,7 @@ export async function recognize(options = {}) {
|
|
|
544
551
|
await runFontOptimization(ocrAll['Tesseract Legacy']);
|
|
545
552
|
}
|
|
546
553
|
} else if (oemMode === 'combined') {
|
|
547
|
-
await recognizeAllPages(true, true,
|
|
554
|
+
await recognizeAllPages(true, true, !existingOCR, langs, vanillaMode);
|
|
548
555
|
|
|
549
556
|
if (opt.saveDebugImages) {
|
|
550
557
|
DebugData.debugImg.Combined = new Array(ImageCache.pageCount);
|
|
@@ -653,9 +660,16 @@ export async function recognize(options = {}) {
|
|
|
653
660
|
ignorePunct: opt.ignorePunct,
|
|
654
661
|
confThreshHigh: opt.confThreshHigh,
|
|
655
662
|
confThreshMed: opt.confThreshMed,
|
|
663
|
+
// If the existing data was invisible OCR text extracted from a PDF, it is assumed to not have accurate bounding boxes.
|
|
664
|
+
useBboxB: !forceMainData && existingOCR === ocrAll.pdf && inputData.pdfMode && !!inputData.pdfType && ['image', 'ocr'].includes(inputData.pdfType),
|
|
656
665
|
};
|
|
657
666
|
|
|
658
|
-
|
|
667
|
+
let res;
|
|
668
|
+
if (forceMainData) {
|
|
669
|
+
res = await compareOCR(ocrAll['Tesseract Combined'], existingOCR, compOptions);
|
|
670
|
+
} else {
|
|
671
|
+
res = await compareOCR(existingOCR, ocrAll['Tesseract Combined'], compOptions);
|
|
672
|
+
}
|
|
659
673
|
|
|
660
674
|
if (DebugData.debugImg.Combined) DebugData.debugImg.Combined = res.debug;
|
|
661
675
|
|
package/js/utils/fontUtils.js
CHANGED
|
@@ -237,8 +237,11 @@ export function calcWordMetrics(word, angle = 0) {
|
|
|
237
237
|
const wordLastGlyphMetrics = fontOpentype.charToGlyph(charArr2.at(-1)).getMetrics();
|
|
238
238
|
const wordFirstGlyphMetrics = fontOpentype.charToGlyph(charArr2[0]).getMetrics();
|
|
239
239
|
|
|
240
|
-
|
|
241
|
-
let
|
|
240
|
+
// The `leftSideBearing`/`rightSideBearing` numbers reported by Opentype.js are not accurate for mono-spaced fonts, so `xMin`/`xMax` are used instead.
|
|
241
|
+
let wordLeftBearing = wordFirstGlyphMetrics.xMin || 0;
|
|
242
|
+
let lastGlyphMax = wordLastGlyphMetrics.xMax || 0;
|
|
243
|
+
if (word.smallCaps && charArr2[charArr2.length - 1] !== charArr[charArr2.length - 1]) lastGlyphMax *= fontI.smallCapsMult;
|
|
244
|
+
let wordRightBearing = advanceArr[advanceArr.length - 1] - lastGlyphMax;
|
|
242
245
|
if (word.smallCaps && charArr2[0] !== charArr[0]) wordLeftBearing *= fontI.smallCapsMult;
|
|
243
246
|
if (word.smallCaps && charArr2[charArr2.length - 1] !== charArr[charArr2.length - 1]) wordRightBearing *= fontI.smallCapsMult;
|
|
244
247
|
|
|
@@ -290,6 +293,11 @@ export const calcWordFontSize = (word) => {
|
|
|
290
293
|
if (word.visualCoords) {
|
|
291
294
|
return getFontSize(fontOpentype, word.bbox.bottom - word.bbox.top, word.text);
|
|
292
295
|
}
|
|
296
|
+
if (word.size) {
|
|
297
|
+
const mult = FontProps.sizeMult[font.family] || 1;
|
|
298
|
+
return word.size / mult;
|
|
299
|
+
}
|
|
300
|
+
|
|
293
301
|
return (word.bbox.bottom - word.bbox.top) * (fontOpentype.unitsPerEm / (fontOpentype.ascender - fontOpentype.descender));
|
|
294
302
|
}
|
|
295
303
|
|
package/js/utils/miscUtils.js
CHANGED
|
@@ -379,7 +379,7 @@ export function replaceObjectProperties(obj, obj2 = {}) {
|
|
|
379
379
|
// Fonts that should not be added (both Sans and Serif variants):
|
|
380
380
|
// DejaVu
|
|
381
381
|
const serifFonts = ['SerifDefault', 'Baskerville', 'Bookman', 'C059', 'Calibri', 'Cambria', 'Century', 'Courier', 'Garamond', 'Georgia',
|
|
382
|
-
'LucidaBright', 'Minion', 'Optima', 'P052', 'Palatino', 'Times'];
|
|
382
|
+
'LucidaBright', 'Minion', 'NimbusMono', 'Optima', 'P052', 'Palatino', 'Times'];
|
|
383
383
|
const sansFonts = ['SansDefault', 'Avenir', 'Arial', 'Calibri', 'Candara', 'Carlito', 'Comic', 'Franklin', 'Futura', 'Gotham',
|
|
384
384
|
'Helvetica', 'Impact', 'Interstate', 'Myriad', 'Tahoma', 'Trebuchet', 'Univers', 'Verdana'];
|
|
385
385
|
|
|
@@ -463,6 +463,7 @@ async function penalizeWord(wordObjs) {
|
|
|
463
463
|
* rather than simply setting `compTruth`/`matchTruth`. Enabled when using recognition to update confidence metrics, but not when comparing to ground truth.
|
|
464
464
|
* @param {boolean} [params.options.legacyLSTMComb] - Whether Tesseract Legacy and Tesseract LSTM are being combined, when `mode = 'comb'`.
|
|
465
465
|
* When `legacyLSTMComb` is enabled, additional heuristics are applied that are based on specific behaviors of the Tesseract Legacy engine.
|
|
466
|
+
* @param {boolean} [params.options.useBboxB] - Use bounding boxes from `pageB` in combined output.
|
|
466
467
|
* @param {string} [params.options.debugLabel]
|
|
467
468
|
* @param {boolean} [params.options.evalConflicts] - Whether to evaluate word quality on conflicts. If `false` the text from `pageB` is always assumed correct.
|
|
468
469
|
* This option is useful for combining the style from Tesseract Legacy with the text from Tesseract LSTM.
|
|
@@ -494,6 +495,7 @@ export async function compareOCRPageImp({
|
|
|
494
495
|
const mode = options?.mode === undefined ? 'stats' : options?.mode;
|
|
495
496
|
const editConf = options?.editConf === undefined ? false : options?.editConf;
|
|
496
497
|
const legacyLSTMComb = options?.legacyLSTMComb === undefined ? false : options?.legacyLSTMComb;
|
|
498
|
+
const useBboxB = options?.useBboxB === undefined ? false : options?.useBboxB;
|
|
497
499
|
const debugLabel = options?.debugLabel === undefined ? '' : options?.debugLabel;
|
|
498
500
|
const evalConflicts = options?.evalConflicts === undefined ? true : options?.evalConflicts;
|
|
499
501
|
const supplementComp = options?.supplementComp === undefined ? false : options?.supplementComp;
|
|
@@ -597,8 +599,13 @@ export async function compareOCRPageImp({
|
|
|
597
599
|
|
|
598
600
|
const wordBoxACore = JSON.parse(JSON.stringify(wordBoxA));
|
|
599
601
|
|
|
600
|
-
|
|
601
|
-
|
|
602
|
+
if (wordA.visualCoords) {
|
|
603
|
+
wordBoxACore.top = wordBoxA.top + Math.round(wordBoxAHeight * 0.1);
|
|
604
|
+
wordBoxACore.bottom = wordBoxA.bottom - Math.round(wordBoxAHeight * 0.1);
|
|
605
|
+
} else {
|
|
606
|
+
wordBoxACore.top = wordBoxA.top + Math.round(wordBoxAHeight * 0.25);
|
|
607
|
+
wordBoxACore.bottom = wordBoxA.bottom - Math.round(wordBoxAHeight * 0.25);
|
|
608
|
+
}
|
|
602
609
|
|
|
603
610
|
for (let l = minWordB; l < lineB.words.length; l++) {
|
|
604
611
|
const wordB = lineB.words[l];
|
|
@@ -612,8 +619,13 @@ export async function compareOCRPageImp({
|
|
|
612
619
|
|
|
613
620
|
const wordBoxBCore = JSON.parse(JSON.stringify(wordBoxB));
|
|
614
621
|
|
|
615
|
-
|
|
616
|
-
|
|
622
|
+
if (wordB.visualCoords) {
|
|
623
|
+
wordBoxBCore.top = wordBoxB.top + Math.round(wordBoxBHeight * 0.1);
|
|
624
|
+
wordBoxBCore.bottom = wordBoxB.bottom - Math.round(wordBoxBHeight * 0.1);
|
|
625
|
+
} else {
|
|
626
|
+
wordBoxBCore.top = wordBoxB.top + Math.round(wordBoxBHeight * 0.25);
|
|
627
|
+
wordBoxBCore.bottom = wordBoxB.bottom - Math.round(wordBoxBHeight * 0.25);
|
|
628
|
+
}
|
|
617
629
|
|
|
618
630
|
// If left of word A is past right of word B, move to next word B
|
|
619
631
|
if (wordBoxACore.left > wordBoxBCore.right) {
|
|
@@ -660,6 +672,11 @@ export async function compareOCRPageImp({
|
|
|
660
672
|
if (mode === 'comb') wordA.conf = 100;
|
|
661
673
|
hocrACorrect[wordA.id] = 1;
|
|
662
674
|
hocrBCorrect[wordB.id] = 1;
|
|
675
|
+
if (mode === 'comb' && useBboxB) {
|
|
676
|
+
wordA.bbox = structuredClone(wordB.bbox);
|
|
677
|
+
wordA.visualCoords = true;
|
|
678
|
+
wordA.chars = structuredClone(wordB.chars);
|
|
679
|
+
}
|
|
663
680
|
} else if (mode === 'comb') {
|
|
664
681
|
wordA.conf = 0;
|
|
665
682
|
wordA.matchTruth = false;
|
|
@@ -101,8 +101,11 @@ const calculateKerningPairs = (font, fontMetricsObj, xHeight, style) => {
|
|
|
101
101
|
const indexFirst = font.charToGlyphIndex(charFirst);
|
|
102
102
|
const indexSecond = font.charToGlyphIndex(charSecond);
|
|
103
103
|
|
|
104
|
-
const
|
|
105
|
-
const
|
|
104
|
+
const glyphFirst = font.glyphs.glyphs[indexFirst];
|
|
105
|
+
const glyphSecond = font.glyphs.glyphs[indexSecond];
|
|
106
|
+
|
|
107
|
+
const metricsFirst = glyphFirst.getMetrics();
|
|
108
|
+
const metricsSecond = glyphSecond.getMetrics();
|
|
106
109
|
|
|
107
110
|
const fontKern1 = Math.round(value * xHeight);
|
|
108
111
|
let spaceTarget = fontKern1;
|
|
@@ -119,7 +122,7 @@ const calculateKerningPairs = (font, fontMetricsObj, xHeight, style) => {
|
|
|
119
122
|
}
|
|
120
123
|
|
|
121
124
|
// Calculate current space between these 2 glyphs (without kerning adjustments)
|
|
122
|
-
const spaceCurrent = metricsFirst.
|
|
125
|
+
const spaceCurrent = (glyphFirst.advanceWidth - metricsFirst.xMax) + metricsSecond.xMin;
|
|
123
126
|
|
|
124
127
|
// Calculate kerning adjustment needed
|
|
125
128
|
let fontKern = spaceTarget - spaceCurrent;
|
package/mupdf/libmupdf.wasm
CHANGED
|
Binary file
|
package/mupdf/mupdf-async.js
CHANGED
|
@@ -90,7 +90,7 @@ export async function initMuPDFWorker() {
|
|
|
90
90
|
return function (...args) {
|
|
91
91
|
return new Promise((resolve, reject) => {
|
|
92
92
|
// Add the PDF as the first argument for most functions
|
|
93
|
-
if (!['openDocument', 'cleanFile'].includes(func)) {
|
|
93
|
+
if (!['openDocument', 'cleanFile', 'freeDocument'].includes(func)) {
|
|
94
94
|
// Remove job number (appended by Tesseract scheduler function)
|
|
95
95
|
// args = args.slice(0,-1)
|
|
96
96
|
|
package/mupdf/mupdf-worker.js
CHANGED
|
@@ -165,6 +165,8 @@ mupdf.pageText = function (doc, {
|
|
|
165
165
|
|
|
166
166
|
const content = Module.UTF8ToString(dataPtr);
|
|
167
167
|
|
|
168
|
+
Module._free(dataPtr);
|
|
169
|
+
|
|
168
170
|
return {
|
|
169
171
|
letterCountTotal,
|
|
170
172
|
letterCountVis,
|
|
@@ -464,7 +466,7 @@ const handleMessage = (data) => {
|
|
|
464
466
|
} catch (error) {
|
|
465
467
|
parentPort.postMessage(['ERROR', id, { name: error.name, message: error.message }]);
|
|
466
468
|
}
|
|
467
|
-
}
|
|
469
|
+
};
|
|
468
470
|
|
|
469
471
|
if (typeof process === 'undefined') {
|
|
470
472
|
onmessage = (event) => handleMessage(event.data);
|
package/package.json
CHANGED
package/scribe.js
CHANGED
|
@@ -94,7 +94,7 @@ const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options
|
|
|
94
94
|
init({ ocr: true, font: true });
|
|
95
95
|
await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });
|
|
96
96
|
if (!inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode) throw new Error('No relevant files to process.');
|
|
97
|
-
const skipRecPDF = inputData.pdfMode && (
|
|
97
|
+
const skipRecPDF = inputData.pdfMode && (inputData.pdfType === 'text' && skipRecPDFTextNative || inputData.pdfType === 'ocr' && skipRecPDFTextOCR);
|
|
98
98
|
const skipRecOCR = inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode;
|
|
99
99
|
if (!skipRecPDF && !skipRecOCR) await recognize({ langs });
|
|
100
100
|
return exportData(outputFormat);
|