scribe.js-ocr 0.7.4 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/build-deno-compile.sh +30 -0
  2. package/cli/cli.js +46 -18
  3. package/cli/detectPDFType.js +1 -2
  4. package/cli/extract.js +14 -7
  5. package/cli/main.js +39 -39
  6. package/cli/require.js +1 -1
  7. package/cli/scribe.js +12 -11
  8. package/fonts/Dingbats.woff +0 -0
  9. package/fonts/all/URWGothicBook-Bold.woff +0 -0
  10. package/fonts/all/URWGothicBook-BoldItalic.woff +0 -0
  11. package/fonts/all/URWGothicBook-Italic.woff +0 -0
  12. package/fonts/all/URWGothicBook-Regular.woff +0 -0
  13. package/fonts/latin/URWGothicBook-Bold.woff +0 -0
  14. package/fonts/latin/URWGothicBook-BoldItalic.woff +0 -0
  15. package/fonts/latin/URWGothicBook-Italic.woff +0 -0
  16. package/fonts/latin/URWGothicBook-Regular.woff +0 -0
  17. package/js/canvasAdapter.js +4 -1
  18. package/js/clear.js +7 -8
  19. package/js/containers/app.js +2 -0
  20. package/js/containers/dataContainer.js +1 -4
  21. package/js/containers/fontContainer.js +59 -44
  22. package/js/containers/imageContainer.js +13 -35
  23. package/js/coordinates.js +3 -3
  24. package/js/debug.js +2 -2
  25. package/js/export/export.js +103 -18
  26. package/js/export/exportDebugCsv.js +4 -3
  27. package/js/export/pdf/writePdf.js +389 -0
  28. package/js/export/{writePdfFonts.js → pdf/writePdfFonts.js} +16 -12
  29. package/js/export/pdf/writePdfImages.js +218 -0
  30. package/js/export/{writePdf.js → pdf/writePdfText.js} +28 -315
  31. package/js/export/writeDocx.js +12 -5
  32. package/js/export/writeHocr.js +11 -10
  33. package/js/export/writeHtml.js +208 -48
  34. package/js/export/writeTabular.js +31 -20
  35. package/js/export/writeText.js +12 -10
  36. package/js/fontContainerMain.js +101 -50
  37. package/js/fontEval.js +18 -14
  38. package/js/fontStatistics.js +90 -90
  39. package/js/generalWorkerMain.js +52 -6
  40. package/js/global.d.ts +178 -6
  41. package/js/import/convertDocTextract.js +447 -0
  42. package/js/import/convertPageAbbyy.js +10 -4
  43. package/js/import/convertPageBlocks.js +4 -4
  44. package/js/import/convertPageGoogleVision.js +204 -0
  45. package/js/import/convertPageHocr.js +3 -3
  46. package/js/import/convertPageShared.js +1 -0
  47. package/js/import/convertPageStext.js +18 -10
  48. package/js/import/convertPageText.js +289 -0
  49. package/js/import/import.js +133 -125
  50. package/js/import/importOCR.js +98 -46
  51. package/js/import/nodeAdapter.js +2 -2
  52. package/js/modifyOCR.js +6 -5
  53. package/js/nudge.js +3 -3
  54. package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
  55. package/js/objects/imageObjects.js +3 -2
  56. package/js/objects/layoutObjects.js +37 -0
  57. package/js/objects/ocrObjects.js +51 -3
  58. package/js/recognizeConvert.js +74 -23
  59. package/js/utils/fontUtils.js +32 -1
  60. package/js/utils/imageUtils.js +99 -0
  61. package/js/utils/miscUtils.js +158 -9
  62. package/js/utils/reflowPars.js +4 -0
  63. package/js/worker/compareOCRModule.js +20 -18
  64. package/js/worker/generalWorker.js +12 -6
  65. package/js/worker/optimizeFontModule.js +19 -19
  66. package/mupdf/libmupdf.js +3 -3
  67. package/mupdf/libmupdf.wasm +0 -0
  68. package/mupdf/mupdf-async.js +1 -1
  69. package/mupdf/mupdf-worker.js +9 -4
  70. package/package.json +7 -4
  71. package/scribe.js +5 -5
  72. package/tess/tesseract.esm.min.js +1 -1
  73. package/tess/tesseract.min.js +1 -1
  74. package/tess/worker.min.js +1 -1
@@ -5,23 +5,23 @@
5
5
 
6
6
  // Node.js case
7
7
  import opentype from '../../lib/opentype.module.js';
8
- import { determineSansSerif, getStyleLookup } from '../utils/miscUtils.js';
8
+ import { determineSansSerif, getStyleLookup, clearObjectProperties } from '../utils/miscUtils.js';
9
9
  import { ca } from '../canvasAdapter.js';
10
10
 
11
11
  if (typeof process === 'object') {
12
12
  // @ts-ignore
13
13
  globalThis.self = globalThis;
14
14
  // @ts-ignore
15
- const { createRequire } = await import('module');
15
+ const { createRequire } = await import('node:module');
16
16
  globalThis.require = createRequire(import.meta.url);
17
- const { fileURLToPath } = await import('url');
18
- const { dirname } = await import('path');
17
+ const { fileURLToPath } = await import('node:url');
18
+ const { dirname } = await import('node:path');
19
19
  globalThis.__dirname = dirname(fileURLToPath(import.meta.url));
20
20
  }
21
21
 
22
22
  /**
23
23
  * Checks whether `multiFontMode` should be enabled or disabled.
24
- * @param {Object.<string, FontMetricsFamily>} fontMetricsObj
24
+ * @param {Object.<string, CharMetricsFamily>} charMetricsObj
25
25
  *
26
26
  * Usually (including when the built-in OCR engine is used) we will have metrics for individual font families,
27
27
  * which are used to optimize the appropriate fonts ("multiFontMode" is `true` in this case).
@@ -29,12 +29,12 @@ if (typeof process === 'object') {
29
29
  * but no font identification information for most or all words.
30
30
  * If this is encountered the "default" metric is applied to the default font ("multiFontMode" is `false` in this case).
31
31
  */
32
- export function checkMultiFontMode(fontMetricsObj) {
32
+ export function checkMultiFontMode(charMetricsObj) {
33
33
  let defaultFontObs = 0;
34
34
  let namedFontObs = 0;
35
- if (fontMetricsObj.Default?.obs) { defaultFontObs += (fontMetricsObj.Default?.obs || 0); }
36
- if (fontMetricsObj.SerifDefault?.obs) { namedFontObs += (fontMetricsObj.SerifDefault?.obs || 0); }
37
- if (fontMetricsObj.SansDefault?.obs) { namedFontObs += (fontMetricsObj.SansDefault?.obs || 0); }
35
+ if (charMetricsObj.Default?.obs) { defaultFontObs += (charMetricsObj.Default?.obs || 0); }
36
+ if (charMetricsObj.SerifDefault?.obs) { namedFontObs += (charMetricsObj.SerifDefault?.obs || 0); }
37
+ if (charMetricsObj.SansDefault?.obs) { namedFontObs += (charMetricsObj.SansDefault?.obs || 0); }
38
38
 
39
39
  return namedFontObs > defaultFontObs;
40
40
  }
@@ -191,7 +191,6 @@ export async function loadFontContainerFamily(family, src, opt = false) {
191
191
  /**
192
192
  *
193
193
  * @param {StyleLookup} styleLookup
194
- * @returns
195
194
  */
196
195
  const loadType = (styleLookup) => new Promise((resolve) => {
197
196
  const srcType = (src[styleLookup]);
@@ -250,33 +249,43 @@ export class FontCont {
250
249
  chi_sim: null,
251
250
  };
252
251
 
253
- /** Optimized fonts will be used when believed to improve quality. */
254
- static enableOpt = false;
252
+ /**
253
+ * This object contains all data that is saved and restored from intermediate .scribe files.
254
+ * Anything outside of this object is not saved or restored.
255
+ * @type {FontState}
256
+ */
257
+ static state = {
258
+ /** Optimized fonts will be used when believed to improve quality. */
259
+ enableOpt: false,
255
260
 
256
- /** Optimized fonts will always be used when they exist, even if believed to reduce quality. */
257
- static forceOpt = false;
261
+ /** Optimized fonts will always be used when they exist, even if believed to reduce quality. */
262
+ forceOpt: false,
258
263
 
259
- /** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
260
- static rawMetrics = null;
264
+ /**
265
+ * If `false`, 'Courier' will not be cleaned to Nimbus Mono.
266
+ * This setting is useful because Tesseract sometimes misidentifies fonts as Courier, and when not the document default, Nimbus Mono is almost always incorrect.
267
+ * Even with this setting `false`, Nimbus Mono will still be used when the font is exactly 'NimbusMono' and Nimbus Mono can still be the document default font.
268
+ */
269
+ enableCleanToNimbusMono: false,
261
270
 
262
- /** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
263
- static optMetrics = null;
271
+ defaultFontName: 'SerifDefault',
264
272
 
265
- static defaultFontName = 'SerifDefault';
273
+ serifDefaultName: 'NimbusRoman',
266
274
 
267
- static serifDefaultName = 'NimbusRoman';
275
+ sansDefaultName: 'NimbusSans',
268
276
 
269
- static sansDefaultName = 'NimbusSans';
277
+ glyphSet: null,
270
278
 
271
- /**
272
- * If `false`, 'Courier' will not be cleaned to Nimbus Mono.
273
- * This setting is useful because Tesseract sometimes misidentifies fonts as Courier, and when not the document default, Nimbus Mono is almost always incorrect.
274
- * Even with this setting `false`, Nimbus Mono will still be used when the font is exactly 'NimbusMono' and Nimbus Mono can still be the document default font.
275
- */
276
- static enableCleanToNimbusMono = false;
279
+ /** @type {Object.<string, CharMetricsFamily>} */
280
+ charMetrics: {},
277
281
 
278
- /** @type {?('latin'|'all')} */
279
- static glyphSet = null;
282
+ };
283
+
284
+ /** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
285
+ static rawMetrics = null;
286
+
287
+ /** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
288
+ static optMetrics = null;
280
289
 
281
290
  /**
282
291
  * Load fonts from an ArrayBuffer containing arbitrary font data.
@@ -344,15 +353,15 @@ export class FontCont {
344
353
  const raw = FontCont.raw?.[family]?.normal;
345
354
  if (!raw) return false;
346
355
  const opt = FontCont.opt?.[family]?.normal;
347
- if (opt && FontCont.forceOpt) {
356
+ if (opt && FontCont.state.forceOpt) {
348
357
  return true;
349
358
  // If optimized fonts are enabled (but not forced), the optimized version of a font will be used if:
350
359
  // (1) The optimized version exists
351
360
  // (2) The optimized version has a better metric (so quality should improve).
352
361
  // (3) The optimized version of the default sans/serif font also has a better metric.
353
362
  // This last condition avoids font optimization being enabled in the UI when it only improves an unused font.
354
- } if (opt && FontCont.enableOpt) {
355
- const defaultFamily = raw.type === 'serif' ? FontCont.serifDefaultName : FontCont.sansDefaultName;
363
+ } if (opt && FontCont.state.enableOpt) {
364
+ const defaultFamily = raw.type === 'serif' ? FontCont.state.serifDefaultName : FontCont.state.sansDefaultName;
356
365
 
357
366
  const rawMetricDefault = FontCont.rawMetrics?.[defaultFamily];
358
367
  const optMetricDefault = FontCont.optMetrics?.[defaultFamily];
@@ -375,7 +384,7 @@ export class FontCont {
375
384
  * @returns {FontContainerFont}
376
385
  */
377
386
  static getFont = (style, lang = 'eng') => {
378
- let family = style.font || FontCont.defaultFontName;
387
+ let family = style.font || FontCont.state.defaultFontName;
379
388
 
380
389
  const styleLookup = getStyleLookup(style);
381
390
 
@@ -403,19 +412,23 @@ export class FontCont {
403
412
  family = 'NimbusSans';
404
413
  } else if (/Arial/i.test(family)) {
405
414
  family = 'NimbusSans';
406
- } else if (/Century/i.test(family)) {
415
+ } else if (/CenturySch/i.test(family)) {
407
416
  family = 'Century';
408
417
  } else if (/Palatino/i.test(family)) {
409
418
  family = 'Palatino';
410
419
  } else if (/Garamond/i.test(family)) {
411
420
  family = 'Garamond';
421
+ } else if (/CenturyGothic/i.test(family)) {
422
+ family = 'Gothic';
423
+ } else if (/AvantGarde/i.test(family)) {
424
+ family = 'Gothic';
412
425
  } else if (/Carlito/i.test(family)) {
413
426
  family = 'Carlito';
414
427
  } else if (/Calibri/i.test(family)) {
415
428
  family = 'Carlito';
416
- } else if (/Courier/i.test(family) && FontCont.enableCleanToNimbusMono) {
429
+ } else if (/Courier/i.test(family) && FontCont.state.enableCleanToNimbusMono) {
417
430
  family = 'NimbusMono';
418
- } else if (/NimbusMono/i.test(family) && FontCont.enableCleanToNimbusMono) {
431
+ } else if (/NimbusMono/i.test(family) && FontCont.state.enableCleanToNimbusMono) {
419
432
  family = 'NimbusMono';
420
433
  }
421
434
  }
@@ -426,10 +439,10 @@ export class FontCont {
426
439
  }
427
440
 
428
441
  // This needs to come first as `defaultFontName` maps to either 'SerifDefault' or 'SansDefault'.
429
- if (family === 'Default') family = FontCont.defaultFontName;
442
+ if (family === 'Default') family = FontCont.state.defaultFontName;
430
443
 
431
- if (family === 'SerifDefault') family = FontCont.serifDefaultName;
432
- if (family === 'SansDefault') family = FontCont.sansDefaultName;
444
+ if (family === 'SerifDefault') family = FontCont.state.serifDefaultName;
445
+ if (family === 'SansDefault') family = FontCont.state.sansDefaultName;
433
446
 
434
447
  /** @type {FontContainerFont} */
435
448
  let fontRes = FontCont.raw?.[family]?.[styleLookup];
@@ -456,16 +469,18 @@ export class FontCont {
456
469
  FontCont.rawMetrics = null;
457
470
  FontCont.optMetrics = null;
458
471
 
459
- FontCont.enableCleanToNimbusMono = false;
472
+ FontCont.state.enableCleanToNimbusMono = false;
473
+
474
+ FontCont.state.defaultFontName = 'SerifDefault';
475
+ FontCont.state.serifDefaultName = 'NimbusRoman';
476
+ FontCont.state.sansDefaultName = 'NimbusSans';
460
477
 
461
- FontCont.defaultFontName = 'SerifDefault';
462
- FontCont.serifDefaultName = 'NimbusRoman';
463
- FontCont.sansDefaultName = 'NimbusSans';
478
+ clearObjectProperties(FontCont.state.charMetrics);
464
479
  };
465
480
 
466
481
  static terminate = () => {
467
482
  FontCont.clear();
468
483
  FontCont.raw = null;
469
- FontCont.glyphSet = null;
484
+ FontCont.state.glyphSet = null;
470
485
  };
471
486
  }
@@ -5,7 +5,7 @@ import {
5
5
  import { initMuPDFWorker } from '../../mupdf/mupdf-async.js';
6
6
 
7
7
  import { updateFontContWorkerMain } from '../fontContainerMain.js';
8
- import { pageMetricsArr } from './dataContainer.js';
8
+ import { pageMetricsAll } from './dataContainer.js';
9
9
  import {
10
10
  FontCont,
11
11
  FontContainerFont,
@@ -13,7 +13,7 @@ import {
13
13
  } from './fontContainer.js';
14
14
 
15
15
  import { gs } from '../generalWorkerMain.js';
16
- import { imageUtils } from '../objects/imageObjects.js';
16
+ import { imageUtils, ImageWrapper } from '../objects/imageObjects.js';
17
17
  import { range } from '../utils/miscUtils.js';
18
18
  import { opt } from './app.js';
19
19
 
@@ -42,32 +42,6 @@ export class MuPDFScheduler {
42
42
  }
43
43
  }
44
44
 
45
- export class ImageWrapper {
46
- /**
47
- * @param {number} n - Page number
48
- * @param {string} imageStr - Base-64 encoded image string. Should start with "data:image/png" or "data:image/jpeg".
49
- * @param {string} colorMode - Color mode ("color", "gray", or "binary").
50
- * @param {boolean} rotated - Whether image has been rotated.
51
- * @param {boolean} upscaled - Whether image has been upscaled.
52
- *
53
- * All properties of this object must be serializable, as ImageWrapper objects are sent between threads.
54
- * This means that no promises can be used.
55
- */
56
- constructor(n, imageStr, colorMode, rotated = false, upscaled = false) {
57
- this.n = n;
58
- this.src = imageStr;
59
- const format0 = imageStr.match(/^data:image\/(png|jpeg)/)?.[1];
60
- if (!format0 || !['png', 'jpeg'].includes(format0)) throw new Error(`Invalid image format: ${format0}`);
61
- this.format = format0;
62
- this._dims = null;
63
- this.rotated = rotated;
64
- this.upscaled = upscaled;
65
- this.colorMode = colorMode;
66
- /** @type {?ImageBitmap} */
67
- this.imageBitmap = null;
68
- }
69
- }
70
-
71
45
  /**
72
46
  * @typedef {Object} ImageProperties
73
47
  * @property {boolean} [rotated]
@@ -126,7 +100,7 @@ export class ImageCache {
126
100
  colorMode = color ? 'color' : 'gray';
127
101
  }
128
102
 
129
- let pageAngle = pageMetricsArr[n].angle || 0;
103
+ let pageAngle = pageMetricsAll[n].angle || 0;
130
104
  if (Math.abs(pageAngle) < 0.05) pageAngle = 0;
131
105
 
132
106
  // If no preference is specified for rotation, default to true.
@@ -213,7 +187,7 @@ export class ImageCache {
213
187
  if (ImageCache.inputModes.image) {
214
188
  return ImageCache.nativeSrc[n];
215
189
  } if (ImageCache.inputModes.pdf) {
216
- const pageMetrics = pageMetricsArr[n];
190
+ const pageMetrics = pageMetricsAll[n];
217
191
  const targetWidth = pageMetrics.dims.width;
218
192
  const dpi = 300 * (targetWidth / ImageCache.pdfDims300[n].width);
219
193
  const muPDFScheduler = await ImageCache.getMuPDFScheduler();
@@ -232,7 +206,7 @@ export class ImageCache {
232
206
  * @param {boolean} [saveNativeImage=true] - Whether the native image should be saved.
233
207
  */
234
208
  static transformImage = async (inputImage, n, props, saveNativeImage = true) => {
235
- let pageAngle = pageMetricsArr[n].angle || 0;
209
+ let pageAngle = pageMetricsAll[n].angle || 0;
236
210
  if (Math.abs(pageAngle) < 0.05) pageAngle = 0;
237
211
 
238
212
  // If no preference is specified for rotation, default to true.
@@ -245,8 +219,8 @@ export class ImageCache {
245
219
  await gs.getGeneralScheduler();
246
220
 
247
221
  const resPromise = (async () => {
248
- // Wait for non-rotated version before replacing with promise
249
- if (typeof process === 'undefined') await gs.initTesseract({ anyOk: true });
222
+ // Wait for non-rotated version before replacing with promise
223
+ await gs.initTesseract({ anyOk: true });
250
224
  return gs.recognize({
251
225
  image: inputImage.src,
252
226
  options: { rotateRadians: angleArg, upscale: upscaleArg },
@@ -276,7 +250,11 @@ export class ImageCache {
276
250
  * @param {boolean} [nativeOnly=true]
277
251
  */
278
252
  static getImages = (n, props, nativeOnly = true) => {
279
- const significantRotation = Math.abs(pageMetricsArr[n].angle || 0) > 0.05;
253
+ if (!ImageCache.inputModes.image && !ImageCache.inputModes.pdf) {
254
+ return { native: undefined, binary: undefined };
255
+ }
256
+
257
+ const significantRotation = Math.abs(pageMetricsAll[n].angle || 0) > 0.05;
280
258
 
281
259
  const newNative = !ImageCache.native[n] || !imageUtils.compatible(ImageCache.nativeProps[n], props, significantRotation);
282
260
  const newBinary = !nativeOnly && (!ImageCache.binary[n] || !imageUtils.compatible(ImageCache.binaryProps[n], props, significantRotation));
@@ -422,7 +400,7 @@ export class ImageCache {
422
400
 
423
401
  ImageCache.pdfDims300.forEach((x, i) => {
424
402
  const pageDims = { width: Math.round(x.width * pageDPI[i] / 300), height: Math.round(x.height * pageDPI[i] / 300) };
425
- pageMetricsArr[i] = new PageMetrics(pageDims);
403
+ pageMetricsAll[i] = new PageMetrics(pageDims);
426
404
  });
427
405
 
428
406
  // WIP: Extract fonts embedded in PDFs.
package/js/coordinates.js CHANGED
@@ -3,7 +3,7 @@
3
3
  // Image Coordinate Space: coordinate space of a particular image
4
4
  // Canvas Coordinate Space: coordinate space of canvas, used for user interactions
5
5
 
6
- import { pageMetricsArr } from './containers/dataContainer.js';
6
+ import { pageMetricsAll } from './containers/dataContainer.js';
7
7
  import { ImageCache } from './containers/imageContainer.js';
8
8
 
9
9
  /**
@@ -27,7 +27,7 @@ function rotateBoundingBox(boundingBox, rotateAngle, n) {
27
27
  let angleAdjXRect = 0;
28
28
  let angleAdjYRect = 0;
29
29
 
30
- const pageDims = pageMetricsArr[n].dims;
30
+ const pageDims = pageMetricsAll[n].dims;
31
31
 
32
32
  const sinAngle = Math.sin(rotateAngle * (Math.PI / 180));
33
33
  const cosAngle = Math.cos(rotateAngle * (Math.PI / 180));
@@ -103,7 +103,7 @@ async function ocrToImage(ocrCoords, n, binary = false) {
103
103
 
104
104
  if (imageN.rotated) {
105
105
  // Otherwise, we must also account for rotation applied by the canvas
106
- const rotateAngle = (pageMetricsArr[n].angle || 0) * -1;
106
+ const rotateAngle = (pageMetricsAll[n].angle || 0) * -1;
107
107
 
108
108
  rotateBoundingBox(ocrCoords, rotateAngle, n);
109
109
  }
package/js/debug.js CHANGED
@@ -1,5 +1,5 @@
1
1
  import { opt } from './containers/app.js';
2
- import { pageMetricsArr } from './containers/dataContainer.js';
2
+ import { pageMetricsAll } from './containers/dataContainer.js';
3
3
  import { ImageCache } from './containers/imageContainer.js';
4
4
  import { gs } from './generalWorkerMain.js';
5
5
  import { loadImageElem } from './utils/imageUtils.js';
@@ -125,7 +125,7 @@ export async function renderPageStatic(page) {
125
125
  const res = gs.renderPageStaticImp({
126
126
  page,
127
127
  image,
128
- angle: pageMetricsArr[page.n].angle,
128
+ angle: pageMetricsAll[page.n].angle,
129
129
  });
130
130
 
131
131
  return res;
@@ -1,19 +1,22 @@
1
1
  import { inputData, opt } from '../containers/app.js';
2
2
  import {
3
- layoutDataTables, layoutRegions, ocrAll, pageMetricsArr,
3
+ layoutDataTables, layoutRegions, ocrAll, pageMetricsAll,
4
4
  } from '../containers/dataContainer.js';
5
5
  import { ImageCache } from '../containers/imageContainer.js';
6
6
  import { reorderOcrPage } from '../modifyOCR.js';
7
7
  import { saveAs } from '../utils/miscUtils.js';
8
- import { writePdf } from './writePdf.js';
8
+ import { writePdf } from './pdf/writePdf.js';
9
9
  import { writeHocr } from './writeHocr.js';
10
10
  import { writeText } from './writeText.js';
11
11
  import { writeHtml } from './writeHtml.js';
12
+ import { removeCircularRefsOcr } from '../objects/ocrObjects.js';
13
+ import { removeCircularRefsDataTables } from '../objects/layoutObjects.js';
14
+ import { FontCont } from '../containers/fontContainer.js';
12
15
 
13
16
  /**
14
17
  * Export active OCR data to specified format.
15
18
  * @public
16
- * @param {'pdf'|'hocr'|'docx'|'html'|'xlsx'|'txt'|'text'} [format='txt']
19
+ * @param {'pdf'|'hocr'|'docx'|'html'|'xlsx'|'txt'|'text'|'scribe'} [format='txt']
17
20
  * @param {number} [minPage=0] - First page to export.
18
21
  * @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page.
19
22
  * @returns {Promise<string|ArrayBuffer>}
@@ -42,8 +45,8 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
42
45
  const dimsLimit = { width: -1, height: -1 };
43
46
  if (opt.standardizePageSize) {
44
47
  for (let i = minPage; i <= maxPage; i++) {
45
- dimsLimit.height = Math.max(dimsLimit.height, pageMetricsArr[i].dims.height);
46
- dimsLimit.width = Math.max(dimsLimit.width, pageMetricsArr[i].dims.width);
48
+ dimsLimit.height = Math.max(dimsLimit.height, pageMetricsAll[i].dims.height);
49
+ dimsLimit.width = Math.max(dimsLimit.width, pageMetricsAll[i].dims.width);
47
50
  }
48
51
  }
49
52
 
@@ -55,10 +58,30 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
55
58
 
56
59
  const rotateText = !rotateBackground;
57
60
 
61
+ const includeImages = false;
62
+ /** @type {ImageWrapper[]} */
63
+ let images = [];
64
+ if (includeImages) {
65
+ images = await Promise.all(ImageCache.nativeSrc);
66
+ }
67
+
58
68
  // Page sizes should not be standardized at this step, as the overlayText/overlayTextImage functions will perform this,
59
69
  // and assume that the overlay PDF is the same size as the input images.
60
- const pdfStr = await writePdf(ocrDownload, minPage, maxPage, opt.displayMode, rotateText, rotateBackground,
61
- { width: -1, height: -1 }, opt.confThreshHigh, opt.confThreshMed, opt.overlayOpacity / 100);
70
+ const pdfStr = await writePdf({
71
+ ocrArr: ocrDownload,
72
+ pageMetricsArr: pageMetricsAll,
73
+ minpage: minPage,
74
+ maxpage: maxPage,
75
+ textMode: opt.displayMode,
76
+ rotateText,
77
+ rotateBackground,
78
+ dimsLimit: { width: -1, height: -1 },
79
+ confThreshHigh: opt.confThreshHigh,
80
+ confThreshMed: opt.confThreshMed,
81
+ proofOpacity: opt.overlayOpacity / 100,
82
+ images,
83
+ includeImages,
84
+ });
62
85
 
63
86
  const enc = new TextEncoder();
64
87
  const pdfEnc = enc.encode(pdfStr);
@@ -118,7 +141,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
118
141
 
119
142
  await w.convertImageStart({ humanReadable: opt.humanReadablePDF });
120
143
  for (let i = minPage; i < maxPage + 1; i++) {
121
- /** @type {import('../containers/imageContainer.js').ImageWrapper} */
144
+ /** @type {ImageWrapper} */
122
145
  let image;
123
146
  if (binary) {
124
147
  image = await ImageCache.getBinary(i, props);
@@ -131,7 +154,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
131
154
  // Angle the PDF viewer is instructed to rotate the image by.
132
155
  // This method is currently only used when rotation is needed but the user's (unrotated) source images are being used.
133
156
  // If the images are being rendered, then rotation is expected to be applied within the rendering process.
134
- const angleImagePdf = rotateBackground && !renderImage ? (pageMetricsArr[i].angle || 0) * -1 : 0;
157
+ const angleImagePdf = rotateBackground && !renderImage ? (pageMetricsAll[i].angle || 0) * -1 : 0;
135
158
 
136
159
  await w.convertImageAddPage({
137
160
  image: image.src, i, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, angle: angleImagePdf,
@@ -154,8 +177,19 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
154
177
 
155
178
  w.freeDocument(pdfOverlay);
156
179
  } else {
157
- const pdfStr = await writePdf(ocrDownload, minPage, maxPage, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
158
- opt.overlayOpacity / 100);
180
+ const pdfStr = await writePdf({
181
+ ocrArr: ocrDownload,
182
+ pageMetricsArr: pageMetricsAll,
183
+ minpage: minPage,
184
+ maxpage: maxPage,
185
+ textMode: opt.displayMode,
186
+ rotateText: false,
187
+ rotateBackground: true,
188
+ dimsLimit,
189
+ confThreshHigh: opt.confThreshHigh,
190
+ confThreshMed: opt.confThreshMed,
191
+ proofOpacity: opt.overlayOpacity / 100,
192
+ });
159
193
 
160
194
  // The PDF is still run through muPDF, even though in eBook mode no background layer is added.
161
195
  // This is because muPDF cleans up the PDF we made in the previous step, including:
@@ -183,22 +217,73 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
183
217
  w.freeDocument(pdf);
184
218
  }
185
219
  } else if (format === 'hocr') {
186
- content = writeHocr(ocrAll.active, minPage, maxPage);
220
+ content = writeHocr({ ocrData: ocrDownload, minValue: minPage, maxValue: maxPage });
187
221
  } else if (format === 'html') {
188
- content = writeHtml(ocrAll.active, minPage, maxPage, opt.reflow, opt.removeMargins);
222
+ const images = /** @type {Array<ImageWrapper>} */ ([]);
223
+ if (opt.includeImages) {
224
+ const props = { rotated: opt.autoRotate, upscaled: false, colorMode: opt.colorMode };
225
+ const binary = opt.colorMode === 'binary';
226
+
227
+ // An image could be rendered if either (1) binary is selected or (2) the input data is a PDF.
228
+ // Otherwise, the images uploaded by the user are used.
229
+ const renderImage = binary || inputData.pdfMode;
230
+
231
+ // Pre-render to benefit from parallel processing, since the loop below is synchronous.
232
+ if (renderImage) await ImageCache.preRenderRange(minPage, maxPage, binary, props);
233
+
234
+ for (let i = minPage; i < maxPage + 1; i++) {
235
+ /** @type {ImageWrapper} */
236
+ let image;
237
+ if (binary) {
238
+ image = await ImageCache.getBinary(i, props);
239
+ } else if (inputData.pdfMode) {
240
+ image = await ImageCache.getNative(i, props);
241
+ } else {
242
+ image = await ImageCache.nativeSrc[i];
243
+ }
244
+ images.push(image);
245
+ }
246
+ }
247
+
248
+ content = writeHtml({
249
+ ocrPages: ocrDownload, images, minpage: minPage, maxpage: maxPage, reflowText: opt.reflow, removeMargins: opt.removeMargins,
250
+ });
189
251
  } else if (format === 'txt') {
190
- content = writeText(ocrDownload, minPage, maxPage, opt.reflow, false);
252
+ content = writeText({
253
+ ocrCurrent: ocrDownload,
254
+ minpage: minPage,
255
+ maxpage: maxPage,
256
+ reflowText: opt.reflow,
257
+ docxMode: false,
258
+ });
191
259
  // Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools.
192
260
  // @ts-ignore
193
261
  } else if (typeof DISABLE_DOCX_XLSX === 'undefined' && format === 'docx') {
194
262
  // Less common export formats are loaded dynamically to reduce initial load time.
195
263
  const writeDocx = (await import('./writeDocx.js')).writeDocx;
196
- content = await writeDocx(ocrDownload, minPage, maxPage);
264
+ content = await writeDocx({ hocrCurrent: ocrDownload, minpage: minPage, maxpage: maxPage });
197
265
  // @ts-ignore
198
266
  } else if (typeof DISABLE_DOCX_XLSX === 'undefined' && format === 'xlsx') {
199
267
  // Less common export formats are loaded dynamically to reduce initial load time.
200
268
  const writeXlsx = (await import('./writeTabular.js')).writeXlsx;
201
- content = await writeXlsx(ocrDownload, layoutDataTables.pages, minPage, maxPage);
269
+ content = await writeXlsx({
270
+ ocrPageArr: ocrDownload,
271
+ layoutPageArr: layoutDataTables.pages,
272
+ minpage: minPage,
273
+ maxpage: maxPage,
274
+ });
275
+ } else if (format === 'scribe') {
276
+ const data = {
277
+ ocr: removeCircularRefsOcr(ocrDownload),
278
+ fontState: FontCont.state,
279
+ layoutRegions: layoutRegions.pages,
280
+ layoutDataTables: removeCircularRefsDataTables(layoutDataTables.pages),
281
+ };
282
+ const contentStr = JSON.stringify(data);
283
+
284
+ const pako = await import('../../lib/pako.esm.mjs');
285
+ const enc = new TextEncoder();
286
+ content = pako.gzip(enc.encode(contentStr))?.buffer;
202
287
  }
203
288
 
204
289
  return content;
@@ -207,14 +292,14 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
207
292
  /**
208
293
  * Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
209
294
  * @public
210
- * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'|'html'} format
295
+ * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'|'html'|'scribe'} format
211
296
  * @param {string} fileName
212
297
  * @param {number} [minPage=0] - First page to export.
213
298
  * @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page.
214
299
  */
215
300
  export async function download(format, fileName, minPage = 0, maxPage = -1) {
216
301
  if (format === 'text') format = 'txt';
217
- fileName = fileName.replace(/\.\w{1,4}$/, `.${format}`);
302
+ fileName = fileName.replace(/\.\w{1,6}$/, `.${format}`);
218
303
  const content = await exportData(format, minPage, maxPage);
219
304
  await saveAs(content, fileName);
220
305
  }
@@ -39,11 +39,12 @@ export const convertToCsv = (data) => {
39
39
 
40
40
  /**
41
41
  *
42
- * @param {Array<OcrPage>} pages
43
- * @param {string} fileName
42
+ * @param {Object} params
43
+ * @param {Array<OcrPage>} params.pages
44
+ * @param {string} params.fileName
44
45
  * @returns
45
46
  */
46
- export const writeDebugCsv = (pages, fileName) => {
47
+ export const writeDebugCsv = ({ pages, fileName }) => {
47
48
  let csvStr = '';
48
49
 
49
50
  for (let i = 0; i < pages.length; i++) {