scribe.js-ocr 0.7.4 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build-deno-compile.sh +30 -0
- package/cli/cli.js +46 -18
- package/cli/detectPDFType.js +1 -2
- package/cli/extract.js +14 -7
- package/cli/main.js +39 -39
- package/cli/require.js +1 -1
- package/cli/scribe.js +12 -11
- package/fonts/Dingbats.woff +0 -0
- package/fonts/all/URWGothicBook-Bold.woff +0 -0
- package/fonts/all/URWGothicBook-BoldItalic.woff +0 -0
- package/fonts/all/URWGothicBook-Italic.woff +0 -0
- package/fonts/all/URWGothicBook-Regular.woff +0 -0
- package/fonts/latin/URWGothicBook-Bold.woff +0 -0
- package/fonts/latin/URWGothicBook-BoldItalic.woff +0 -0
- package/fonts/latin/URWGothicBook-Italic.woff +0 -0
- package/fonts/latin/URWGothicBook-Regular.woff +0 -0
- package/js/canvasAdapter.js +4 -1
- package/js/clear.js +7 -8
- package/js/containers/app.js +2 -0
- package/js/containers/dataContainer.js +1 -4
- package/js/containers/fontContainer.js +59 -44
- package/js/containers/imageContainer.js +13 -35
- package/js/coordinates.js +3 -3
- package/js/debug.js +2 -2
- package/js/export/export.js +103 -18
- package/js/export/exportDebugCsv.js +4 -3
- package/js/export/pdf/writePdf.js +389 -0
- package/js/export/{writePdfFonts.js → pdf/writePdfFonts.js} +16 -12
- package/js/export/pdf/writePdfImages.js +218 -0
- package/js/export/{writePdf.js → pdf/writePdfText.js} +28 -315
- package/js/export/writeDocx.js +12 -5
- package/js/export/writeHocr.js +11 -10
- package/js/export/writeHtml.js +208 -48
- package/js/export/writeTabular.js +31 -20
- package/js/export/writeText.js +12 -10
- package/js/fontContainerMain.js +101 -50
- package/js/fontEval.js +18 -14
- package/js/fontStatistics.js +90 -90
- package/js/generalWorkerMain.js +52 -6
- package/js/global.d.ts +178 -6
- package/js/import/convertDocTextract.js +447 -0
- package/js/import/convertPageAbbyy.js +10 -4
- package/js/import/convertPageBlocks.js +4 -4
- package/js/import/convertPageGoogleVision.js +204 -0
- package/js/import/convertPageHocr.js +3 -3
- package/js/import/convertPageShared.js +1 -0
- package/js/import/convertPageStext.js +18 -10
- package/js/import/convertPageText.js +289 -0
- package/js/import/import.js +133 -125
- package/js/import/importOCR.js +98 -46
- package/js/import/nodeAdapter.js +2 -2
- package/js/modifyOCR.js +6 -5
- package/js/nudge.js +3 -3
- package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
- package/js/objects/imageObjects.js +3 -2
- package/js/objects/layoutObjects.js +37 -0
- package/js/objects/ocrObjects.js +51 -3
- package/js/recognizeConvert.js +74 -23
- package/js/utils/fontUtils.js +32 -1
- package/js/utils/imageUtils.js +99 -0
- package/js/utils/miscUtils.js +158 -9
- package/js/utils/reflowPars.js +4 -0
- package/js/worker/compareOCRModule.js +20 -18
- package/js/worker/generalWorker.js +12 -6
- package/js/worker/optimizeFontModule.js +19 -19
- package/mupdf/libmupdf.js +3 -3
- package/mupdf/libmupdf.wasm +0 -0
- package/mupdf/mupdf-async.js +1 -1
- package/mupdf/mupdf-worker.js +9 -4
- package/package.json +7 -4
- package/scribe.js +5 -5
- package/tess/tesseract.esm.min.js +1 -1
- package/tess/tesseract.min.js +1 -1
- package/tess/worker.min.js +1 -1
|
@@ -5,23 +5,23 @@
|
|
|
5
5
|
|
|
6
6
|
// Node.js case
|
|
7
7
|
import opentype from '../../lib/opentype.module.js';
|
|
8
|
-
import { determineSansSerif, getStyleLookup } from '../utils/miscUtils.js';
|
|
8
|
+
import { determineSansSerif, getStyleLookup, clearObjectProperties } from '../utils/miscUtils.js';
|
|
9
9
|
import { ca } from '../canvasAdapter.js';
|
|
10
10
|
|
|
11
11
|
if (typeof process === 'object') {
|
|
12
12
|
// @ts-ignore
|
|
13
13
|
globalThis.self = globalThis;
|
|
14
14
|
// @ts-ignore
|
|
15
|
-
const { createRequire } = await import('module');
|
|
15
|
+
const { createRequire } = await import('node:module');
|
|
16
16
|
globalThis.require = createRequire(import.meta.url);
|
|
17
|
-
const { fileURLToPath } = await import('url');
|
|
18
|
-
const { dirname } = await import('path');
|
|
17
|
+
const { fileURLToPath } = await import('node:url');
|
|
18
|
+
const { dirname } = await import('node:path');
|
|
19
19
|
globalThis.__dirname = dirname(fileURLToPath(import.meta.url));
|
|
20
20
|
}
|
|
21
21
|
|
|
22
22
|
/**
|
|
23
23
|
* Checks whether `multiFontMode` should be enabled or disabled.
|
|
24
|
-
* @param {Object.<string,
|
|
24
|
+
* @param {Object.<string, CharMetricsFamily>} charMetricsObj
|
|
25
25
|
*
|
|
26
26
|
* Usually (including when the built-in OCR engine is used) we will have metrics for individual font families,
|
|
27
27
|
* which are used to optimize the appropriate fonts ("multiFontMode" is `true` in this case).
|
|
@@ -29,12 +29,12 @@ if (typeof process === 'object') {
|
|
|
29
29
|
* but no font identification information for most or all words.
|
|
30
30
|
* If this is encountered the "default" metric is applied to the default font ("multiFontMode" is `false` in this case).
|
|
31
31
|
*/
|
|
32
|
-
export function checkMultiFontMode(
|
|
32
|
+
export function checkMultiFontMode(charMetricsObj) {
|
|
33
33
|
let defaultFontObs = 0;
|
|
34
34
|
let namedFontObs = 0;
|
|
35
|
-
if (
|
|
36
|
-
if (
|
|
37
|
-
if (
|
|
35
|
+
if (charMetricsObj.Default?.obs) { defaultFontObs += (charMetricsObj.Default?.obs || 0); }
|
|
36
|
+
if (charMetricsObj.SerifDefault?.obs) { namedFontObs += (charMetricsObj.SerifDefault?.obs || 0); }
|
|
37
|
+
if (charMetricsObj.SansDefault?.obs) { namedFontObs += (charMetricsObj.SansDefault?.obs || 0); }
|
|
38
38
|
|
|
39
39
|
return namedFontObs > defaultFontObs;
|
|
40
40
|
}
|
|
@@ -191,7 +191,6 @@ export async function loadFontContainerFamily(family, src, opt = false) {
|
|
|
191
191
|
/**
|
|
192
192
|
*
|
|
193
193
|
* @param {StyleLookup} styleLookup
|
|
194
|
-
* @returns
|
|
195
194
|
*/
|
|
196
195
|
const loadType = (styleLookup) => new Promise((resolve) => {
|
|
197
196
|
const srcType = (src[styleLookup]);
|
|
@@ -250,33 +249,43 @@ export class FontCont {
|
|
|
250
249
|
chi_sim: null,
|
|
251
250
|
};
|
|
252
251
|
|
|
253
|
-
/**
|
|
254
|
-
|
|
252
|
+
/**
|
|
253
|
+
* This object contains all data that is saved and restored from intermediate .scribe files.
|
|
254
|
+
* Anything outside of this object is not saved or restored.
|
|
255
|
+
* @type {FontState}
|
|
256
|
+
*/
|
|
257
|
+
static state = {
|
|
258
|
+
/** Optimized fonts will be used when believed to improve quality. */
|
|
259
|
+
enableOpt: false,
|
|
255
260
|
|
|
256
|
-
|
|
257
|
-
|
|
261
|
+
/** Optimized fonts will always be used when they exist, even if believed to reduce quality. */
|
|
262
|
+
forceOpt: false,
|
|
258
263
|
|
|
259
|
-
|
|
260
|
-
|
|
264
|
+
/**
|
|
265
|
+
* If `false`, 'Courier' will not be cleaned to Nimbus Mono.
|
|
266
|
+
* This setting is useful because Tesseract sometimes misidentifies fonts as Courier, and when not the document default, Nimbus Mono is almost always incorrect.
|
|
267
|
+
* Even with this setting `false`, Nimbus Mono will still be used when the font is exactly 'NimbusMono' and Nimbus Mono can still be the document default font.
|
|
268
|
+
*/
|
|
269
|
+
enableCleanToNimbusMono: false,
|
|
261
270
|
|
|
262
|
-
|
|
263
|
-
static optMetrics = null;
|
|
271
|
+
defaultFontName: 'SerifDefault',
|
|
264
272
|
|
|
265
|
-
|
|
273
|
+
serifDefaultName: 'NimbusRoman',
|
|
266
274
|
|
|
267
|
-
|
|
275
|
+
sansDefaultName: 'NimbusSans',
|
|
268
276
|
|
|
269
|
-
|
|
277
|
+
glyphSet: null,
|
|
270
278
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
* This setting is useful because Tesseract sometimes misidentifies fonts as Courier, and when not the document default, Nimbus Mono is almost always incorrect.
|
|
274
|
-
* Even with this setting `false`, Nimbus Mono will still be used when the font is exactly 'NimbusMono' and Nimbus Mono can still be the document default font.
|
|
275
|
-
*/
|
|
276
|
-
static enableCleanToNimbusMono = false;
|
|
279
|
+
/** @type {Object.<string, CharMetricsFamily>} */
|
|
280
|
+
charMetrics: {},
|
|
277
281
|
|
|
278
|
-
|
|
279
|
-
|
|
282
|
+
};
|
|
283
|
+
|
|
284
|
+
/** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
|
|
285
|
+
static rawMetrics = null;
|
|
286
|
+
|
|
287
|
+
/** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
|
|
288
|
+
static optMetrics = null;
|
|
280
289
|
|
|
281
290
|
/**
|
|
282
291
|
* Load fonts from an ArrayBuffer containing arbitrary font data.
|
|
@@ -344,15 +353,15 @@ export class FontCont {
|
|
|
344
353
|
const raw = FontCont.raw?.[family]?.normal;
|
|
345
354
|
if (!raw) return false;
|
|
346
355
|
const opt = FontCont.opt?.[family]?.normal;
|
|
347
|
-
if (opt && FontCont.forceOpt) {
|
|
356
|
+
if (opt && FontCont.state.forceOpt) {
|
|
348
357
|
return true;
|
|
349
358
|
// If optimized fonts are enabled (but not forced), the optimized version of a font will be used if:
|
|
350
359
|
// (1) The optimized version exists
|
|
351
360
|
// (2) The optimized version has a better metric (so quality should improve).
|
|
352
361
|
// (3) The optimized version of the default sans/serif font also has a better metric.
|
|
353
362
|
// This last condition avoids font optimization being enabled in the UI when it only improves an unused font.
|
|
354
|
-
} if (opt && FontCont.enableOpt) {
|
|
355
|
-
const defaultFamily = raw.type === 'serif' ? FontCont.serifDefaultName : FontCont.sansDefaultName;
|
|
363
|
+
} if (opt && FontCont.state.enableOpt) {
|
|
364
|
+
const defaultFamily = raw.type === 'serif' ? FontCont.state.serifDefaultName : FontCont.state.sansDefaultName;
|
|
356
365
|
|
|
357
366
|
const rawMetricDefault = FontCont.rawMetrics?.[defaultFamily];
|
|
358
367
|
const optMetricDefault = FontCont.optMetrics?.[defaultFamily];
|
|
@@ -375,7 +384,7 @@ export class FontCont {
|
|
|
375
384
|
* @returns {FontContainerFont}
|
|
376
385
|
*/
|
|
377
386
|
static getFont = (style, lang = 'eng') => {
|
|
378
|
-
let family = style.font || FontCont.defaultFontName;
|
|
387
|
+
let family = style.font || FontCont.state.defaultFontName;
|
|
379
388
|
|
|
380
389
|
const styleLookup = getStyleLookup(style);
|
|
381
390
|
|
|
@@ -403,19 +412,23 @@ export class FontCont {
|
|
|
403
412
|
family = 'NimbusSans';
|
|
404
413
|
} else if (/Arial/i.test(family)) {
|
|
405
414
|
family = 'NimbusSans';
|
|
406
|
-
} else if (/
|
|
415
|
+
} else if (/CenturySch/i.test(family)) {
|
|
407
416
|
family = 'Century';
|
|
408
417
|
} else if (/Palatino/i.test(family)) {
|
|
409
418
|
family = 'Palatino';
|
|
410
419
|
} else if (/Garamond/i.test(family)) {
|
|
411
420
|
family = 'Garamond';
|
|
421
|
+
} else if (/CenturyGothic/i.test(family)) {
|
|
422
|
+
family = 'Gothic';
|
|
423
|
+
} else if (/AvantGarde/i.test(family)) {
|
|
424
|
+
family = 'Gothic';
|
|
412
425
|
} else if (/Carlito/i.test(family)) {
|
|
413
426
|
family = 'Carlito';
|
|
414
427
|
} else if (/Calibri/i.test(family)) {
|
|
415
428
|
family = 'Carlito';
|
|
416
|
-
} else if (/Courier/i.test(family) && FontCont.enableCleanToNimbusMono) {
|
|
429
|
+
} else if (/Courier/i.test(family) && FontCont.state.enableCleanToNimbusMono) {
|
|
417
430
|
family = 'NimbusMono';
|
|
418
|
-
} else if (/NimbusMono/i.test(family) && FontCont.enableCleanToNimbusMono) {
|
|
431
|
+
} else if (/NimbusMono/i.test(family) && FontCont.state.enableCleanToNimbusMono) {
|
|
419
432
|
family = 'NimbusMono';
|
|
420
433
|
}
|
|
421
434
|
}
|
|
@@ -426,10 +439,10 @@ export class FontCont {
|
|
|
426
439
|
}
|
|
427
440
|
|
|
428
441
|
// This needs to come first as `defaultFontName` maps to either 'SerifDefault' or 'SansDefault'.
|
|
429
|
-
if (family === 'Default') family = FontCont.defaultFontName;
|
|
442
|
+
if (family === 'Default') family = FontCont.state.defaultFontName;
|
|
430
443
|
|
|
431
|
-
if (family === 'SerifDefault') family = FontCont.serifDefaultName;
|
|
432
|
-
if (family === 'SansDefault') family = FontCont.sansDefaultName;
|
|
444
|
+
if (family === 'SerifDefault') family = FontCont.state.serifDefaultName;
|
|
445
|
+
if (family === 'SansDefault') family = FontCont.state.sansDefaultName;
|
|
433
446
|
|
|
434
447
|
/** @type {FontContainerFont} */
|
|
435
448
|
let fontRes = FontCont.raw?.[family]?.[styleLookup];
|
|
@@ -456,16 +469,18 @@ export class FontCont {
|
|
|
456
469
|
FontCont.rawMetrics = null;
|
|
457
470
|
FontCont.optMetrics = null;
|
|
458
471
|
|
|
459
|
-
FontCont.enableCleanToNimbusMono = false;
|
|
472
|
+
FontCont.state.enableCleanToNimbusMono = false;
|
|
473
|
+
|
|
474
|
+
FontCont.state.defaultFontName = 'SerifDefault';
|
|
475
|
+
FontCont.state.serifDefaultName = 'NimbusRoman';
|
|
476
|
+
FontCont.state.sansDefaultName = 'NimbusSans';
|
|
460
477
|
|
|
461
|
-
FontCont.
|
|
462
|
-
FontCont.serifDefaultName = 'NimbusRoman';
|
|
463
|
-
FontCont.sansDefaultName = 'NimbusSans';
|
|
478
|
+
clearObjectProperties(FontCont.state.charMetrics);
|
|
464
479
|
};
|
|
465
480
|
|
|
466
481
|
static terminate = () => {
|
|
467
482
|
FontCont.clear();
|
|
468
483
|
FontCont.raw = null;
|
|
469
|
-
FontCont.glyphSet = null;
|
|
484
|
+
FontCont.state.glyphSet = null;
|
|
470
485
|
};
|
|
471
486
|
}
|
|
@@ -5,7 +5,7 @@ import {
|
|
|
5
5
|
import { initMuPDFWorker } from '../../mupdf/mupdf-async.js';
|
|
6
6
|
|
|
7
7
|
import { updateFontContWorkerMain } from '../fontContainerMain.js';
|
|
8
|
-
import {
|
|
8
|
+
import { pageMetricsAll } from './dataContainer.js';
|
|
9
9
|
import {
|
|
10
10
|
FontCont,
|
|
11
11
|
FontContainerFont,
|
|
@@ -13,7 +13,7 @@ import {
|
|
|
13
13
|
} from './fontContainer.js';
|
|
14
14
|
|
|
15
15
|
import { gs } from '../generalWorkerMain.js';
|
|
16
|
-
import { imageUtils } from '../objects/imageObjects.js';
|
|
16
|
+
import { imageUtils, ImageWrapper } from '../objects/imageObjects.js';
|
|
17
17
|
import { range } from '../utils/miscUtils.js';
|
|
18
18
|
import { opt } from './app.js';
|
|
19
19
|
|
|
@@ -42,32 +42,6 @@ export class MuPDFScheduler {
|
|
|
42
42
|
}
|
|
43
43
|
}
|
|
44
44
|
|
|
45
|
-
export class ImageWrapper {
|
|
46
|
-
/**
|
|
47
|
-
* @param {number} n - Page number
|
|
48
|
-
* @param {string} imageStr - Base-64 encoded image string. Should start with "data:image/png" or "data:image/jpeg".
|
|
49
|
-
* @param {string} colorMode - Color mode ("color", "gray", or "binary").
|
|
50
|
-
* @param {boolean} rotated - Whether image has been rotated.
|
|
51
|
-
* @param {boolean} upscaled - Whether image has been upscaled.
|
|
52
|
-
*
|
|
53
|
-
* All properties of this object must be serializable, as ImageWrapper objects are sent between threads.
|
|
54
|
-
* This means that no promises can be used.
|
|
55
|
-
*/
|
|
56
|
-
constructor(n, imageStr, colorMode, rotated = false, upscaled = false) {
|
|
57
|
-
this.n = n;
|
|
58
|
-
this.src = imageStr;
|
|
59
|
-
const format0 = imageStr.match(/^data:image\/(png|jpeg)/)?.[1];
|
|
60
|
-
if (!format0 || !['png', 'jpeg'].includes(format0)) throw new Error(`Invalid image format: ${format0}`);
|
|
61
|
-
this.format = format0;
|
|
62
|
-
this._dims = null;
|
|
63
|
-
this.rotated = rotated;
|
|
64
|
-
this.upscaled = upscaled;
|
|
65
|
-
this.colorMode = colorMode;
|
|
66
|
-
/** @type {?ImageBitmap} */
|
|
67
|
-
this.imageBitmap = null;
|
|
68
|
-
}
|
|
69
|
-
}
|
|
70
|
-
|
|
71
45
|
/**
|
|
72
46
|
* @typedef {Object} ImageProperties
|
|
73
47
|
* @property {boolean} [rotated]
|
|
@@ -126,7 +100,7 @@ export class ImageCache {
|
|
|
126
100
|
colorMode = color ? 'color' : 'gray';
|
|
127
101
|
}
|
|
128
102
|
|
|
129
|
-
let pageAngle =
|
|
103
|
+
let pageAngle = pageMetricsAll[n].angle || 0;
|
|
130
104
|
if (Math.abs(pageAngle) < 0.05) pageAngle = 0;
|
|
131
105
|
|
|
132
106
|
// If no preference is specified for rotation, default to true.
|
|
@@ -213,7 +187,7 @@ export class ImageCache {
|
|
|
213
187
|
if (ImageCache.inputModes.image) {
|
|
214
188
|
return ImageCache.nativeSrc[n];
|
|
215
189
|
} if (ImageCache.inputModes.pdf) {
|
|
216
|
-
const pageMetrics =
|
|
190
|
+
const pageMetrics = pageMetricsAll[n];
|
|
217
191
|
const targetWidth = pageMetrics.dims.width;
|
|
218
192
|
const dpi = 300 * (targetWidth / ImageCache.pdfDims300[n].width);
|
|
219
193
|
const muPDFScheduler = await ImageCache.getMuPDFScheduler();
|
|
@@ -232,7 +206,7 @@ export class ImageCache {
|
|
|
232
206
|
* @param {boolean} [saveNativeImage=true] - Whether the native image should be saved.
|
|
233
207
|
*/
|
|
234
208
|
static transformImage = async (inputImage, n, props, saveNativeImage = true) => {
|
|
235
|
-
let pageAngle =
|
|
209
|
+
let pageAngle = pageMetricsAll[n].angle || 0;
|
|
236
210
|
if (Math.abs(pageAngle) < 0.05) pageAngle = 0;
|
|
237
211
|
|
|
238
212
|
// If no preference is specified for rotation, default to true.
|
|
@@ -245,8 +219,8 @@ export class ImageCache {
|
|
|
245
219
|
await gs.getGeneralScheduler();
|
|
246
220
|
|
|
247
221
|
const resPromise = (async () => {
|
|
248
|
-
|
|
249
|
-
|
|
222
|
+
// Wait for non-rotated version before replacing with promise
|
|
223
|
+
await gs.initTesseract({ anyOk: true });
|
|
250
224
|
return gs.recognize({
|
|
251
225
|
image: inputImage.src,
|
|
252
226
|
options: { rotateRadians: angleArg, upscale: upscaleArg },
|
|
@@ -276,7 +250,11 @@ export class ImageCache {
|
|
|
276
250
|
* @param {boolean} [nativeOnly=true]
|
|
277
251
|
*/
|
|
278
252
|
static getImages = (n, props, nativeOnly = true) => {
|
|
279
|
-
|
|
253
|
+
if (!ImageCache.inputModes.image && !ImageCache.inputModes.pdf) {
|
|
254
|
+
return { native: undefined, binary: undefined };
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
const significantRotation = Math.abs(pageMetricsAll[n].angle || 0) > 0.05;
|
|
280
258
|
|
|
281
259
|
const newNative = !ImageCache.native[n] || !imageUtils.compatible(ImageCache.nativeProps[n], props, significantRotation);
|
|
282
260
|
const newBinary = !nativeOnly && (!ImageCache.binary[n] || !imageUtils.compatible(ImageCache.binaryProps[n], props, significantRotation));
|
|
@@ -422,7 +400,7 @@ export class ImageCache {
|
|
|
422
400
|
|
|
423
401
|
ImageCache.pdfDims300.forEach((x, i) => {
|
|
424
402
|
const pageDims = { width: Math.round(x.width * pageDPI[i] / 300), height: Math.round(x.height * pageDPI[i] / 300) };
|
|
425
|
-
|
|
403
|
+
pageMetricsAll[i] = new PageMetrics(pageDims);
|
|
426
404
|
});
|
|
427
405
|
|
|
428
406
|
// WIP: Extract fonts embedded in PDFs.
|
package/js/coordinates.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
// Image Coordinate Space: coordinate space of a particular image
|
|
4
4
|
// Canvas Coordinate Space: coordinate space of canvas, used for user interactions
|
|
5
5
|
|
|
6
|
-
import {
|
|
6
|
+
import { pageMetricsAll } from './containers/dataContainer.js';
|
|
7
7
|
import { ImageCache } from './containers/imageContainer.js';
|
|
8
8
|
|
|
9
9
|
/**
|
|
@@ -27,7 +27,7 @@ function rotateBoundingBox(boundingBox, rotateAngle, n) {
|
|
|
27
27
|
let angleAdjXRect = 0;
|
|
28
28
|
let angleAdjYRect = 0;
|
|
29
29
|
|
|
30
|
-
const pageDims =
|
|
30
|
+
const pageDims = pageMetricsAll[n].dims;
|
|
31
31
|
|
|
32
32
|
const sinAngle = Math.sin(rotateAngle * (Math.PI / 180));
|
|
33
33
|
const cosAngle = Math.cos(rotateAngle * (Math.PI / 180));
|
|
@@ -103,7 +103,7 @@ async function ocrToImage(ocrCoords, n, binary = false) {
|
|
|
103
103
|
|
|
104
104
|
if (imageN.rotated) {
|
|
105
105
|
// Otherwise, we must also account for rotation applied by the canvas
|
|
106
|
-
const rotateAngle = (
|
|
106
|
+
const rotateAngle = (pageMetricsAll[n].angle || 0) * -1;
|
|
107
107
|
|
|
108
108
|
rotateBoundingBox(ocrCoords, rotateAngle, n);
|
|
109
109
|
}
|
package/js/debug.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { opt } from './containers/app.js';
|
|
2
|
-
import {
|
|
2
|
+
import { pageMetricsAll } from './containers/dataContainer.js';
|
|
3
3
|
import { ImageCache } from './containers/imageContainer.js';
|
|
4
4
|
import { gs } from './generalWorkerMain.js';
|
|
5
5
|
import { loadImageElem } from './utils/imageUtils.js';
|
|
@@ -125,7 +125,7 @@ export async function renderPageStatic(page) {
|
|
|
125
125
|
const res = gs.renderPageStaticImp({
|
|
126
126
|
page,
|
|
127
127
|
image,
|
|
128
|
-
angle:
|
|
128
|
+
angle: pageMetricsAll[page.n].angle,
|
|
129
129
|
});
|
|
130
130
|
|
|
131
131
|
return res;
|
package/js/export/export.js
CHANGED
|
@@ -1,19 +1,22 @@
|
|
|
1
1
|
import { inputData, opt } from '../containers/app.js';
|
|
2
2
|
import {
|
|
3
|
-
layoutDataTables, layoutRegions, ocrAll,
|
|
3
|
+
layoutDataTables, layoutRegions, ocrAll, pageMetricsAll,
|
|
4
4
|
} from '../containers/dataContainer.js';
|
|
5
5
|
import { ImageCache } from '../containers/imageContainer.js';
|
|
6
6
|
import { reorderOcrPage } from '../modifyOCR.js';
|
|
7
7
|
import { saveAs } from '../utils/miscUtils.js';
|
|
8
|
-
import { writePdf } from './writePdf.js';
|
|
8
|
+
import { writePdf } from './pdf/writePdf.js';
|
|
9
9
|
import { writeHocr } from './writeHocr.js';
|
|
10
10
|
import { writeText } from './writeText.js';
|
|
11
11
|
import { writeHtml } from './writeHtml.js';
|
|
12
|
+
import { removeCircularRefsOcr } from '../objects/ocrObjects.js';
|
|
13
|
+
import { removeCircularRefsDataTables } from '../objects/layoutObjects.js';
|
|
14
|
+
import { FontCont } from '../containers/fontContainer.js';
|
|
12
15
|
|
|
13
16
|
/**
|
|
14
17
|
* Export active OCR data to specified format.
|
|
15
18
|
* @public
|
|
16
|
-
* @param {'pdf'|'hocr'|'docx'|'html'|'xlsx'|'txt'|'text'} [format='txt']
|
|
19
|
+
* @param {'pdf'|'hocr'|'docx'|'html'|'xlsx'|'txt'|'text'|'scribe'} [format='txt']
|
|
17
20
|
* @param {number} [minPage=0] - First page to export.
|
|
18
21
|
* @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page.
|
|
19
22
|
* @returns {Promise<string|ArrayBuffer>}
|
|
@@ -42,8 +45,8 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
42
45
|
const dimsLimit = { width: -1, height: -1 };
|
|
43
46
|
if (opt.standardizePageSize) {
|
|
44
47
|
for (let i = minPage; i <= maxPage; i++) {
|
|
45
|
-
dimsLimit.height = Math.max(dimsLimit.height,
|
|
46
|
-
dimsLimit.width = Math.max(dimsLimit.width,
|
|
48
|
+
dimsLimit.height = Math.max(dimsLimit.height, pageMetricsAll[i].dims.height);
|
|
49
|
+
dimsLimit.width = Math.max(dimsLimit.width, pageMetricsAll[i].dims.width);
|
|
47
50
|
}
|
|
48
51
|
}
|
|
49
52
|
|
|
@@ -55,10 +58,30 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
55
58
|
|
|
56
59
|
const rotateText = !rotateBackground;
|
|
57
60
|
|
|
61
|
+
const includeImages = false;
|
|
62
|
+
/** @type {ImageWrapper[]} */
|
|
63
|
+
let images = [];
|
|
64
|
+
if (includeImages) {
|
|
65
|
+
images = await Promise.all(ImageCache.nativeSrc);
|
|
66
|
+
}
|
|
67
|
+
|
|
58
68
|
// Page sizes should not be standardized at this step, as the overlayText/overlayTextImage functions will perform this,
|
|
59
69
|
// and assume that the overlay PDF is the same size as the input images.
|
|
60
|
-
const pdfStr = await writePdf(
|
|
61
|
-
|
|
70
|
+
const pdfStr = await writePdf({
|
|
71
|
+
ocrArr: ocrDownload,
|
|
72
|
+
pageMetricsArr: pageMetricsAll,
|
|
73
|
+
minpage: minPage,
|
|
74
|
+
maxpage: maxPage,
|
|
75
|
+
textMode: opt.displayMode,
|
|
76
|
+
rotateText,
|
|
77
|
+
rotateBackground,
|
|
78
|
+
dimsLimit: { width: -1, height: -1 },
|
|
79
|
+
confThreshHigh: opt.confThreshHigh,
|
|
80
|
+
confThreshMed: opt.confThreshMed,
|
|
81
|
+
proofOpacity: opt.overlayOpacity / 100,
|
|
82
|
+
images,
|
|
83
|
+
includeImages,
|
|
84
|
+
});
|
|
62
85
|
|
|
63
86
|
const enc = new TextEncoder();
|
|
64
87
|
const pdfEnc = enc.encode(pdfStr);
|
|
@@ -118,7 +141,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
118
141
|
|
|
119
142
|
await w.convertImageStart({ humanReadable: opt.humanReadablePDF });
|
|
120
143
|
for (let i = minPage; i < maxPage + 1; i++) {
|
|
121
|
-
/** @type {
|
|
144
|
+
/** @type {ImageWrapper} */
|
|
122
145
|
let image;
|
|
123
146
|
if (binary) {
|
|
124
147
|
image = await ImageCache.getBinary(i, props);
|
|
@@ -131,7 +154,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
131
154
|
// Angle the PDF viewer is instructed to rotate the image by.
|
|
132
155
|
// This method is currently only used when rotation is needed but the user's (unrotated) source images are being used.
|
|
133
156
|
// If the images are being rendered, then rotation is expected to be applied within the rendering process.
|
|
134
|
-
const angleImagePdf = rotateBackground && !renderImage ? (
|
|
157
|
+
const angleImagePdf = rotateBackground && !renderImage ? (pageMetricsAll[i].angle || 0) * -1 : 0;
|
|
135
158
|
|
|
136
159
|
await w.convertImageAddPage({
|
|
137
160
|
image: image.src, i, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, angle: angleImagePdf,
|
|
@@ -154,8 +177,19 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
154
177
|
|
|
155
178
|
w.freeDocument(pdfOverlay);
|
|
156
179
|
} else {
|
|
157
|
-
const pdfStr = await writePdf(
|
|
158
|
-
|
|
180
|
+
const pdfStr = await writePdf({
|
|
181
|
+
ocrArr: ocrDownload,
|
|
182
|
+
pageMetricsArr: pageMetricsAll,
|
|
183
|
+
minpage: minPage,
|
|
184
|
+
maxpage: maxPage,
|
|
185
|
+
textMode: opt.displayMode,
|
|
186
|
+
rotateText: false,
|
|
187
|
+
rotateBackground: true,
|
|
188
|
+
dimsLimit,
|
|
189
|
+
confThreshHigh: opt.confThreshHigh,
|
|
190
|
+
confThreshMed: opt.confThreshMed,
|
|
191
|
+
proofOpacity: opt.overlayOpacity / 100,
|
|
192
|
+
});
|
|
159
193
|
|
|
160
194
|
// The PDF is still run through muPDF, even though in eBook mode no background layer is added.
|
|
161
195
|
// This is because muPDF cleans up the PDF we made in the previous step, including:
|
|
@@ -183,22 +217,73 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
183
217
|
w.freeDocument(pdf);
|
|
184
218
|
}
|
|
185
219
|
} else if (format === 'hocr') {
|
|
186
|
-
content = writeHocr(
|
|
220
|
+
content = writeHocr({ ocrData: ocrDownload, minValue: minPage, maxValue: maxPage });
|
|
187
221
|
} else if (format === 'html') {
|
|
188
|
-
|
|
222
|
+
const images = /** @type {Array<ImageWrapper>} */ ([]);
|
|
223
|
+
if (opt.includeImages) {
|
|
224
|
+
const props = { rotated: opt.autoRotate, upscaled: false, colorMode: opt.colorMode };
|
|
225
|
+
const binary = opt.colorMode === 'binary';
|
|
226
|
+
|
|
227
|
+
// An image could be rendered if either (1) binary is selected or (2) the input data is a PDF.
|
|
228
|
+
// Otherwise, the images uploaded by the user are used.
|
|
229
|
+
const renderImage = binary || inputData.pdfMode;
|
|
230
|
+
|
|
231
|
+
// Pre-render to benefit from parallel processing, since the loop below is synchronous.
|
|
232
|
+
if (renderImage) await ImageCache.preRenderRange(minPage, maxPage, binary, props);
|
|
233
|
+
|
|
234
|
+
for (let i = minPage; i < maxPage + 1; i++) {
|
|
235
|
+
/** @type {ImageWrapper} */
|
|
236
|
+
let image;
|
|
237
|
+
if (binary) {
|
|
238
|
+
image = await ImageCache.getBinary(i, props);
|
|
239
|
+
} else if (inputData.pdfMode) {
|
|
240
|
+
image = await ImageCache.getNative(i, props);
|
|
241
|
+
} else {
|
|
242
|
+
image = await ImageCache.nativeSrc[i];
|
|
243
|
+
}
|
|
244
|
+
images.push(image);
|
|
245
|
+
}
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
content = writeHtml({
|
|
249
|
+
ocrPages: ocrDownload, images, minpage: minPage, maxpage: maxPage, reflowText: opt.reflow, removeMargins: opt.removeMargins,
|
|
250
|
+
});
|
|
189
251
|
} else if (format === 'txt') {
|
|
190
|
-
content = writeText(
|
|
252
|
+
content = writeText({
|
|
253
|
+
ocrCurrent: ocrDownload,
|
|
254
|
+
minpage: minPage,
|
|
255
|
+
maxpage: maxPage,
|
|
256
|
+
reflowText: opt.reflow,
|
|
257
|
+
docxMode: false,
|
|
258
|
+
});
|
|
191
259
|
// Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools.
|
|
192
260
|
// @ts-ignore
|
|
193
261
|
} else if (typeof DISABLE_DOCX_XLSX === 'undefined' && format === 'docx') {
|
|
194
262
|
// Less common export formats are loaded dynamically to reduce initial load time.
|
|
195
263
|
const writeDocx = (await import('./writeDocx.js')).writeDocx;
|
|
196
|
-
content = await writeDocx(ocrDownload, minPage, maxPage);
|
|
264
|
+
content = await writeDocx({ hocrCurrent: ocrDownload, minpage: minPage, maxpage: maxPage });
|
|
197
265
|
// @ts-ignore
|
|
198
266
|
} else if (typeof DISABLE_DOCX_XLSX === 'undefined' && format === 'xlsx') {
|
|
199
267
|
// Less common export formats are loaded dynamically to reduce initial load time.
|
|
200
268
|
const writeXlsx = (await import('./writeTabular.js')).writeXlsx;
|
|
201
|
-
content = await writeXlsx(
|
|
269
|
+
content = await writeXlsx({
|
|
270
|
+
ocrPageArr: ocrDownload,
|
|
271
|
+
layoutPageArr: layoutDataTables.pages,
|
|
272
|
+
minpage: minPage,
|
|
273
|
+
maxpage: maxPage,
|
|
274
|
+
});
|
|
275
|
+
} else if (format === 'scribe') {
|
|
276
|
+
const data = {
|
|
277
|
+
ocr: removeCircularRefsOcr(ocrDownload),
|
|
278
|
+
fontState: FontCont.state,
|
|
279
|
+
layoutRegions: layoutRegions.pages,
|
|
280
|
+
layoutDataTables: removeCircularRefsDataTables(layoutDataTables.pages),
|
|
281
|
+
};
|
|
282
|
+
const contentStr = JSON.stringify(data);
|
|
283
|
+
|
|
284
|
+
const pako = await import('../../lib/pako.esm.mjs');
|
|
285
|
+
const enc = new TextEncoder();
|
|
286
|
+
content = pako.gzip(enc.encode(contentStr))?.buffer;
|
|
202
287
|
}
|
|
203
288
|
|
|
204
289
|
return content;
|
|
@@ -207,14 +292,14 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
207
292
|
/**
|
|
208
293
|
* Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
|
|
209
294
|
* @public
|
|
210
|
-
* @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'|'html'} format
|
|
295
|
+
* @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'|'html'|'scribe'} format
|
|
211
296
|
* @param {string} fileName
|
|
212
297
|
* @param {number} [minPage=0] - First page to export.
|
|
213
298
|
* @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page.
|
|
214
299
|
*/
|
|
215
300
|
export async function download(format, fileName, minPage = 0, maxPage = -1) {
|
|
216
301
|
if (format === 'text') format = 'txt';
|
|
217
|
-
fileName = fileName.replace(/\.\w{1,
|
|
302
|
+
fileName = fileName.replace(/\.\w{1,6}$/, `.${format}`);
|
|
218
303
|
const content = await exportData(format, minPage, maxPage);
|
|
219
304
|
await saveAs(content, fileName);
|
|
220
305
|
}
|
|
@@ -39,11 +39,12 @@ export const convertToCsv = (data) => {
|
|
|
39
39
|
|
|
40
40
|
/**
|
|
41
41
|
*
|
|
42
|
-
* @param {
|
|
43
|
-
* @param {
|
|
42
|
+
* @param {Object} params
|
|
43
|
+
* @param {Array<OcrPage>} params.pages
|
|
44
|
+
* @param {string} params.fileName
|
|
44
45
|
* @returns
|
|
45
46
|
*/
|
|
46
|
-
export const writeDebugCsv = (pages, fileName) => {
|
|
47
|
+
export const writeDebugCsv = ({ pages, fileName }) => {
|
|
47
48
|
let csvStr = '';
|
|
48
49
|
|
|
49
50
|
for (let i = 0; i < pages.length; i++) {
|