scribe.js-ocr 0.7.3 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/cli/scribe.js +2 -0
  2. package/fonts/all/Carlito-BoldItalic.woff +0 -0
  3. package/fonts/all/Century-BoldItalic.woff +0 -0
  4. package/fonts/all/Garamond-BoldItalic.woff +0 -0
  5. package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
  6. package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
  7. package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
  8. package/fonts/all/Palatino-BoldItalic.woff +0 -0
  9. package/fonts/latin/Carlito-BoldItalic.woff +0 -0
  10. package/fonts/latin/Century-BoldItalic.woff +0 -0
  11. package/fonts/latin/Garamond-BoldItalic.woff +0 -0
  12. package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
  13. package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
  14. package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
  15. package/fonts/latin/Palatino-BoldItalic.woff +0 -0
  16. package/js/clear.js +5 -6
  17. package/js/containers/app.js +1 -1
  18. package/js/containers/dataContainer.js +0 -3
  19. package/js/containers/fontContainer.js +91 -77
  20. package/js/export/export.js +20 -5
  21. package/js/export/writeHocr.js +20 -18
  22. package/js/export/writeHtml.js +1 -1
  23. package/js/export/writePdf.js +52 -14
  24. package/js/export/writePdfFonts.js +11 -9
  25. package/js/export/writeTabular.js +2 -2
  26. package/js/export/writeText.js +10 -6
  27. package/js/extractTables.js +5 -5
  28. package/js/fontContainerMain.js +92 -49
  29. package/js/fontEval.js +12 -12
  30. package/js/fontStatistics.js +93 -92
  31. package/js/fontSupp.js +20 -20
  32. package/js/generalWorkerMain.js +4 -0
  33. package/js/global.d.ts +39 -4
  34. package/js/import/convertPageAbbyy.js +55 -26
  35. package/js/import/convertPageBlocks.js +2 -2
  36. package/js/import/convertPageHocr.js +10 -20
  37. package/js/import/convertPageShared.js +13 -9
  38. package/js/import/convertPageStext.js +67 -32
  39. package/js/import/import.js +89 -45
  40. package/js/import/importOCR.js +27 -33
  41. package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
  42. package/js/objects/layoutObjects.js +37 -0
  43. package/js/objects/ocrObjects.js +55 -19
  44. package/js/recognizeConvert.js +21 -8
  45. package/js/utils/fontUtils.js +11 -11
  46. package/js/utils/miscUtils.js +43 -6
  47. package/js/worker/compareOCRModule.js +20 -23
  48. package/js/worker/generalWorker.js +5 -5
  49. package/js/worker/optimizeFontModule.js +19 -19
  50. package/mupdf/libmupdf.js +123 -17
  51. package/mupdf/libmupdf.wasm +0 -0
  52. package/package.json +6 -3
@@ -140,11 +140,11 @@ export async function convertPageBlocks({
140
140
  // The `word` object has a `is_italic` property, but it is always false.
141
141
  // Therefore, the font name is checked to determine if the word is italic.
142
142
  // See: https://github.com/naptha/tesseract.js/issues/907
143
- if (keepItalic && /italic/i.test(word.font_name)) wordObj.style = 'italic';
143
+ if (keepItalic && /italic/i.test(word.font_name)) wordObj.style.italic = true;
144
144
 
145
145
  // Our fork of Tesseract Legacy should be able to recognize fonts, so this information is included.
146
146
  // The generic HOCR importer does not include font information, as this is assumed to be unreliable.
147
- wordObj.font = word.font_name;
147
+ wordObj.style.font = word.font_name;
148
148
 
149
149
  wordObj.chars = [];
150
150
  for (let m = 0; m < word.symbols.length; m++) {
@@ -247,8 +247,8 @@ export async function convertPageHocr({
247
247
 
248
248
  if (debugMode) wordObj.raw = match;
249
249
 
250
- if (italic) wordObj.style = 'italic';
251
- if (fontName) wordObj.font = fontName;
250
+ if (italic) wordObj.style.italic = true;
251
+ if (fontName) wordObj.style.font = fontName;
252
252
 
253
253
  wordObj.conf = wordConf;
254
254
 
@@ -302,19 +302,6 @@ export async function convertPageHocr({
302
302
 
303
303
  const styleStr = match.match(/style=['"]([^'"]+)/)?.[1];
304
304
 
305
- let smallCaps = false;
306
- /** @type {('normal'|'italic'|'bold')} */
307
- let fontStyle = 'normal';
308
- if (styleStr && /italic/i.test(styleStr)) {
309
- fontStyle = 'italic';
310
- } else if (styleStr && /bold/i.test(styleStr)) {
311
- fontStyle = 'bold';
312
- }
313
-
314
- if (styleStr && /small-caps/i.test(styleStr)) {
315
- smallCaps = true;
316
- }
317
-
318
305
  const confMatch = titleStrWord.match(/(?:;|\s)x_wconf\s+(\d+)/)?.[1] || '0';
319
306
  const wordConf = parseInt(confMatch) || 0;
320
307
 
@@ -327,16 +314,19 @@ export async function convertPageHocr({
327
314
  const wordFontSizeStr = titleStrWord.match(/(?:;|\s)x_fsize\s+(\d+)/)?.[1];
328
315
  if (wordFontSizeStr) {
329
316
  const wordFontSize = parseInt(wordFontSizeStr);
330
- if (wordFontSize) wordObj.size = wordFontSize;
317
+ if (wordFontSize) wordObj.style.size = wordFontSize;
331
318
  }
332
319
  }
333
320
 
334
- wordObj.style = fontStyle;
335
- if (fontName) wordObj.font = fontName;
321
+ if (styleStr) {
322
+ if (/italic/i.test(styleStr)) wordObj.style.italic = true;
323
+ if (/bold/i.test(styleStr)) wordObj.style.bold = true;
324
+ if (/small-caps/i.test(styleStr)) wordObj.style.smallCaps = true;
325
+ }
336
326
 
337
- wordObj.sup = wordSup;
327
+ if (wordSup) wordObj.style.sup = true;
338
328
 
339
- wordObj.smallCaps = smallCaps;
329
+ if (fontName) wordObj.style.font = fontName;
340
330
 
341
331
  wordObj.conf = wordConf;
342
332
 
@@ -41,7 +41,7 @@ export function pass2(pageObj, rotateAngle) {
41
41
  for (let j = 0; j < lineObj.words.length; j++) {
42
42
  const wordObj = lineObj.words[j];
43
43
  // Skip words that are already identified as small caps, however they can be used to validate other words.
44
- if (wordObj.smallCaps) {
44
+ if (wordObj.style.smallCaps) {
45
45
  smallCapsWordArr.push(wordObj);
46
46
  firstWord = true;
47
47
  continue;
@@ -95,7 +95,7 @@ export function pass2(pageObj, rotateAngle) {
95
95
 
96
96
  for (let k = 0; k < smallCapsWordArr.length; k++) {
97
97
  const wordObj = smallCapsWordArr[k];
98
- wordObj.smallCaps = true;
98
+ wordObj.style.smallCaps = true;
99
99
  if (!wordObj.chars || !titleCaseTotal) continue;
100
100
 
101
101
  // If title case, convert all letters after the first to lowercase.
@@ -161,8 +161,10 @@ export function pass2(pageObj, rotateAngle) {
161
161
 
162
162
  // If the entire word is a superscript, it does not need to be split.
163
163
  if (superN === wordObj.text.length) {
164
- wordObj.sup = true;
165
- wordObj.style = 'normal';
164
+ wordObj.style.sup = true;
165
+ wordObj.style.bold = false;
166
+ wordObj.style.italic = false;
167
+ wordObj.style.underline = false;
166
168
  continue;
167
169
  }
168
170
 
@@ -182,8 +184,10 @@ export function pass2(pageObj, rotateAngle) {
182
184
 
183
185
  wordObjSup.text = textSuper;
184
186
  wordObjSup.chars = charSuperArr;
185
- wordObjSup.style = 'normal';
186
- wordObjSup.sup = true;
187
+ wordObjSup.style.bold = false;
188
+ wordObjSup.style.italic = false;
189
+ wordObjSup.style.underline = false;
190
+ wordObjSup.style.sup = true;
187
191
  wordObjSup.id = `${wordObj.id}a`;
188
192
  ocr.calcWordBbox(wordObjSup);
189
193
 
@@ -280,13 +284,13 @@ export function pass3(pageObj) {
280
284
 
281
285
  // Do not include superscripts, dropcaps, and low-confidence words in all statistics.
282
286
  // Low-confidence words are included for font size calculations, as some lines only contain low-confidence words.
283
- if (wordObj.sup || wordObj.dropcap) continue;
287
+ if (wordObj.style.sup || wordObj.style.dropcap) continue;
284
288
 
285
289
  const contentStrLetter = letterArr[k];
286
290
  const charHeight = charObj.bbox.bottom - charObj.bbox.top;
287
291
 
288
- const ascChar = wordObj.smallCaps && /[A-Z0-9]/.test(contentStrLetter) || !wordObj.smallCaps && ascCharArr.includes(contentStrLetter);
289
- const xChar = wordObj.smallCaps && /[a-z]/.test(contentStrLetter) || !wordObj.smallCaps && xCharArr.includes(contentStrLetter);
292
+ const ascChar = wordObj.style.smallCaps && /[A-Z0-9]/.test(contentStrLetter) || !wordObj.style.smallCaps && ascCharArr.includes(contentStrLetter);
293
+ const xChar = wordObj.style.smallCaps && /[a-z]/.test(contentStrLetter) || !wordObj.style.smallCaps && xCharArr.includes(contentStrLetter);
290
294
 
291
295
  // Save character heights to array for font size calculations
292
296
  lineAllHeightArr.push(charHeight);
@@ -5,7 +5,6 @@ import {
5
5
  calcBoxOverlap,
6
6
  calcLang,
7
7
  mean50,
8
- quantile,
9
8
  round6,
10
9
  unescapeXml,
11
10
  } from '../utils/miscUtils.js';
@@ -98,10 +97,11 @@ export async function convertPageStext({ ocrStr, n }) {
98
97
  let baselineCurrent = 0;
99
98
 
100
99
  /** @type {Array<Array<string>>} */
101
- const text = [];
100
+ const textArr = [];
102
101
  /** @type {Array<number>} */
103
102
  const wordLetterOrFontArrIndex = [];
104
- let styleCurrent = 'normal';
103
+ let boldCurrent = false;
104
+ let italicCurrent = false;
105
105
  let familyCurrent = 'Default';
106
106
  /** Font size at the current position in the PDF, with no modifications. */
107
107
  let sizeCurrentRaw = 0;
@@ -110,8 +110,14 @@ export async function convertPageStext({ ocrStr, n }) {
110
110
  let superCurrent = false;
111
111
  let smallCapsCurrent;
112
112
  let smallCapsCurrentAlt;
113
- /** @type {Array<string>} */
114
- const styleArr = [];
113
+
114
+ /** @type {Array<boolean>} */
115
+ const boldArr = [];
116
+ /** @type {Array<boolean>} */
117
+ const italicArr = [];
118
+
119
+ /** @type {Array<boolean>} */
120
+ const underlineArr = [];
115
121
  /** @type {Array<boolean>} */
116
122
  const smallCapsArr = [];
117
123
  /** @type {Array<boolean>} */
@@ -144,6 +150,7 @@ export async function convertPageStext({ ocrStr, n }) {
144
150
  * @property {Quad} quad
145
151
  * @property {Point} origin
146
152
  * @property {string} text
153
+ * @property {number} flags
147
154
  */
148
155
 
149
156
  /**
@@ -158,8 +165,7 @@ export async function convertPageStext({ ocrStr, n }) {
158
165
  // Sometimes the font is changed before a space character, and othertimes it is changed after the space character.
159
166
  // This regex splits the string into elements that contain either (1) a font change or (2) a character.
160
167
  // The "quad" attribute includes 8 numbers (x and y coordinates for all 4 corners) however we only use capturing groups for 4
161
- const stextCharRegex = /(<font[^>]+>\s*)|<char quad=['"](\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)[^>]*?x=['"]([\d.-]+)[^>]*?y=['"]([\d.-]+)['"][^>]*?c=['"]([^'"]+)['"]\s*\/>/ig;
162
-
168
+ const stextCharRegex = /(<font[^>]+>\s*)|<char quad=['"](\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)[^>]*?x=['"]([\d.-]+)[^>]*?y=['"]([\d.-]+)['"]([^>]*?c=['"][^'"]+['"])\s*\/>/ig;
163
169
  const stextMatches = [...wordStrArr[i].matchAll(stextCharRegex)];
164
170
 
165
171
  wordCharOrFontArr[i] = [];
@@ -167,7 +173,8 @@ export async function convertPageStext({ ocrStr, n }) {
167
173
  const fontStr = stextMatches[j][1];
168
174
  const fontNameStrI = fontStr?.match(/name=['"]([^'"]*)/)?.[1];
169
175
  const fontSizeStrI = fontStr?.match(/size=['"]([^'"]*)/)?.[1];
170
- if (fontNameStrI && fontSizeStrI) {
176
+ // fontNameStrI can exist but be an empty string. Therefore, truthy/falsy checks are not sufficient.
177
+ if (fontNameStrI !== undefined && fontSizeStrI !== undefined) {
171
178
  // Skip font changes that occur at the end of a line.
172
179
  // In addition to being unnecessary, these are problematic when parsing superscripts.
173
180
  if (i + 1 === wordStrArr.length && j + 1 === stextMatches.length) continue;
@@ -209,10 +216,14 @@ export async function convertPageStext({ ocrStr, n }) {
209
216
  };
210
217
  }
211
218
 
219
+ const flags = parseInt(stextMatches[j][12]?.match(/flags=['"]([^'"]*)/)?.[1]);
220
+ const text = stextMatches[j][12]?.match(/c=['"]([^'"]*)/)?.[1];
221
+
212
222
  wordCharOrFontArr[i][j] = {
213
223
  quad,
214
224
  origin: { x: parseFloat(stextMatches[j][10]), y: parseFloat(stextMatches[j][11]) },
215
- text: stextMatches[j][12],
225
+ flags,
226
+ text,
216
227
  };
217
228
  }
218
229
  }
@@ -220,6 +231,7 @@ export async function convertPageStext({ ocrStr, n }) {
220
231
  for (let i = 0; i < wordCharOrFontArr.length; i++) {
221
232
  let textWordArr = [];
222
233
  let bboxesWordArr = [];
234
+ const underlineWordArr = [];
223
235
  let fontFamily = familyCurrent || fontFamilyLine || 'Default';
224
236
  // Font size for the word is a separate variable, as if a font size changes at the end of the word,
225
237
  // that should not be reflected until the following word.
@@ -228,7 +240,8 @@ export async function convertPageStext({ ocrStr, n }) {
228
240
  let smallCapsWordAlt = smallCapsCurrentAlt || false;
229
241
  // Title case adjustment does not carry forward between words. A word in title case may be followed by a word in all lower case.
230
242
  let smallCapsWordAltTitleCaseAdj = false;
231
- let styleWord = 'normal';
243
+ let boldWord = false;
244
+ let italicWord = false;
232
245
 
233
246
  if (wordCharOrFontArr[i].length === 0) continue;
234
247
 
@@ -276,9 +289,13 @@ export async function convertPageStext({ ocrStr, n }) {
276
289
  && ((baselineDelta < -0.25 && sizeDelta < -0.05) || (baselineDelta > 0.25 && sizeDelta > 0.05))) {
277
290
  // Split word when superscript starts or ends.
278
291
  if (textWordArr.length > 0) {
279
- text.push(textWordArr);
292
+ textArr.push(textWordArr);
280
293
  bboxes.push(bboxesWordArr);
281
- styleArr.push(styleWord);
294
+
295
+ boldArr.push(boldWord);
296
+ italicArr.push(italicWord);
297
+ underlineArr.push(underlineWordArr.reduce((a, b) => Number(a) + Number(b), 0) / underlineWordArr.length > 0.5);
298
+
282
299
  fontFamilyArr.push(fontFamily);
283
300
 
284
301
  if (sizeDelta > 0) {
@@ -341,11 +358,15 @@ export async function convertPageStext({ ocrStr, n }) {
341
358
  if (/italic/i.test(charOrFont.name) || /-\w*ital/i.test(charOrFont.name) || /-it$/i.test(charOrFont.name) || /oblique/i.test(charOrFont.name)) {
342
359
  // The word is already initialized, so we need to change the last element of the style array.
343
360
  // Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
344
- styleCurrent = 'italic';
345
- } else if (/bold|black/i.test(charOrFont.name)) {
346
- styleCurrent = 'bold';
361
+ italicCurrent = true;
347
362
  } else {
348
- styleCurrent = 'normal';
363
+ italicCurrent = false;
364
+ }
365
+
366
+ if (/bold|black/i.test(charOrFont.name)) {
367
+ boldCurrent = true;
368
+ } else {
369
+ boldCurrent = false;
349
370
  }
350
371
 
351
372
  continue;
@@ -354,7 +375,9 @@ export async function convertPageStext({ ocrStr, n }) {
354
375
  }
355
376
 
356
377
  if (!wordInit) {
357
- styleWord = styleCurrent;
378
+ boldWord = boldCurrent;
379
+ italicWord = italicCurrent;
380
+
358
381
  wordInit = true;
359
382
  }
360
383
 
@@ -411,15 +434,23 @@ export async function convertPageStext({ ocrStr, n }) {
411
434
 
412
435
  textWordArr.push(charOrFont.text);
413
436
 
437
+ underlineWordArr.push(charOrFont.flags === 2);
438
+
414
439
  bboxesWordArr.push(bbox);
415
440
  }
416
441
 
417
442
  if (textWordArr.length === 0) continue;
418
443
 
444
+ const underlineWord = underlineWordArr.reduce((a, b) => Number(a) + Number(b), 0) / underlineWordArr.length > 0.5;
445
+ underlineArr.push(underlineWord);
446
+
419
447
  wordLetterOrFontArrIndex.push(i);
420
- text.push(textWordArr);
448
+ textArr.push(textWordArr);
421
449
  bboxes.push(bboxesWordArr);
422
- styleArr.push(styleWord);
450
+
451
+ boldArr.push(boldWord);
452
+ italicArr.push(italicWord);
453
+
423
454
  fontFamilyArr.push(fontFamily);
424
455
  fontSizeArr.push(fontSizeWord);
425
456
  smallCapsAltArr.push(smallCapsWordAlt);
@@ -476,8 +507,8 @@ export async function convertPageStext({ ocrStr, n }) {
476
507
  lineObj.raw = xmlLine;
477
508
 
478
509
  let lettersKept = 0;
479
- for (let i = 0; i < text.length; i++) {
480
- const wordText = unescapeXml(text[i].join(''));
510
+ for (let i = 0; i < textArr.length; i++) {
511
+ const wordText = unescapeXml(textArr[i].join(''));
481
512
 
482
513
  if (wordText.trim() === '') continue;
483
514
 
@@ -490,8 +521,8 @@ export async function convertPageStext({ ocrStr, n }) {
490
521
  /** @type {Array<OcrChar>} */
491
522
  const charObjArr = [];
492
523
 
493
- for (let j = 0; j < text[i].length; j++) {
494
- const letter = unescapeXml(text[i][j]);
524
+ for (let j = 0; j < textArr[i].length; j++) {
525
+ const letter = unescapeXml(textArr[i][j]);
495
526
 
496
527
  const bbox = bboxesI[j];
497
528
 
@@ -526,7 +557,7 @@ export async function convertPageStext({ ocrStr, n }) {
526
557
  if (bbox.left < 0 && bbox.right < 0) continue;
527
558
 
528
559
  const wordObj = new ocr.OcrWord(lineObj, wordText, bbox, wordID);
529
- wordObj.size = fontSizeArr[i];
560
+ wordObj.style.size = fontSizeArr[i];
530
561
 
531
562
  wordObj.lang = wordLang;
532
563
 
@@ -540,7 +571,7 @@ export async function convertPageStext({ ocrStr, n }) {
540
571
  wordObj.conf = 100;
541
572
 
542
573
  if (smallCapsAltArr[i] && !/[a-z]/.test(wordObj.text) && /[A-Z].?[A-Z]/.test(wordObj.text)) {
543
- wordObj.smallCaps = true;
574
+ wordObj.style.smallCaps = true;
544
575
  if (smallCapsAltTitleCaseArr[i]) {
545
576
  wordObj.chars.slice(1).forEach((x) => {
546
577
  x.text = x.text.toLowerCase();
@@ -552,20 +583,24 @@ export async function convertPageStext({ ocrStr, n }) {
552
583
  }
553
584
  wordObj.text = wordObj.chars.map((x) => x.text).join('');
554
585
  } else if (smallCapsArr[i]) {
555
- wordObj.smallCaps = true;
586
+ wordObj.style.smallCaps = true;
556
587
  }
557
588
 
558
- if (styleArr[i] === 'italic') {
559
- wordObj.style = 'italic';
560
- } if (styleArr[i] === 'bold') {
561
- wordObj.style = 'bold';
589
+ if (italicArr[i]) {
590
+ wordObj.style.italic = true;
591
+ }
592
+
593
+ if (boldArr[i]) {
594
+ wordObj.style.bold = true;
562
595
  }
563
596
 
564
597
  wordObj.raw = wordStrArr[wordLetterOrFontArrIndex[i]];
565
598
 
566
- wordObj.font = fontFamilyArr[i];
599
+ wordObj.style.font = fontFamilyArr[i];
600
+
601
+ wordObj.style.sup = superArr[i];
567
602
 
568
- wordObj.sup = superArr[i];
603
+ wordObj.style.underline = underlineArr[i];
569
604
 
570
605
  lineObj.words.push(wordObj);
571
606
 
@@ -2,7 +2,6 @@ import { clearData } from '../clear.js';
2
2
  import { inputData, opt } from '../containers/app.js';
3
3
  import {
4
4
  convertPageWarn,
5
- fontMetricsObj,
6
5
  layoutDataTables,
7
6
  layoutRegions,
8
7
  ocrAll,
@@ -18,14 +17,15 @@ import {
18
17
  optimizeFontContainerAll, setDefaultFontAuto,
19
18
  } from '../fontContainerMain.js';
20
19
  import { runFontOptimization } from '../fontEval.js';
21
- import { calcFontMetricsFromPages } from '../fontStatistics.js';
20
+ import { calcCharMetricsFromPages } from '../fontStatistics.js';
22
21
  import { calcSuppFontInfo } from '../fontSupp.js';
23
22
  import { gs } from '../generalWorkerMain.js';
24
23
  import { imageUtils } from '../objects/imageObjects.js';
25
- import { LayoutDataTablePage, LayoutPage } from '../objects/layoutObjects.js';
24
+ import { addCircularRefsDataTables, LayoutDataTablePage, LayoutPage } from '../objects/layoutObjects.js';
25
+ import { addCircularRefsOcr } from '../objects/ocrObjects.js';
26
26
  import { PageMetrics } from '../objects/pageMetricsObjects.js';
27
27
  import { checkCharWarn, convertOCR } from '../recognizeConvert.js';
28
- import { replaceObjectProperties } from '../utils/miscUtils.js';
28
+ import { readOcrFile, clearObjectProperties, objectAssignDefined } from '../utils/miscUtils.js';
29
29
  import { importOCRFiles } from './importOCR.js';
30
30
 
31
31
  /**
@@ -141,6 +141,8 @@ export function sortInputFiles(files) {
141
141
  /** @type {Array<File|FileNode>} */
142
142
  const pdfFilesAll = [];
143
143
  /** @type {Array<File|FileNode>} */
144
+ const scribeFilesAll = [];
145
+ /** @type {Array<File|FileNode>} */
144
146
  const unsupportedFilesAll = [];
145
147
  const unsupportedExt = {};
146
148
  for (let i = 0; i < files.length; i++) {
@@ -156,6 +158,8 @@ export function sortInputFiles(files) {
156
158
  // All .gz files are assumed to be OCR data (xml) since all other file types can be compressed already
157
159
  } else if (['hocr', 'xml', 'html', 'gz', 'stext'].includes(fileExt)) {
158
160
  ocrFilesAll.push(file);
161
+ } else if (['scribe'].includes(fileExt)) {
162
+ scribeFilesAll.push(file);
159
163
  } else if (['pdf'].includes(fileExt)) {
160
164
  pdfFilesAll.push(file);
161
165
  } else {
@@ -172,7 +176,9 @@ export function sortInputFiles(files) {
172
176
  imageFilesAll.sort((a, b) => ((a.name > b.name) ? 1 : ((b.name > a.name) ? -1 : 0)));
173
177
  ocrFilesAll.sort((a, b) => ((a.name > b.name) ? 1 : ((b.name > a.name) ? -1 : 0)));
174
178
 
175
- return { pdfFiles: pdfFilesAll, imageFiles: imageFilesAll, ocrFiles: ocrFilesAll };
179
+ return {
180
+ pdfFiles: pdfFilesAll, imageFiles: imageFilesAll, ocrFiles: ocrFilesAll, scribeFiles: scribeFilesAll,
181
+ };
176
182
  }
177
183
 
178
184
  /**
@@ -184,6 +190,7 @@ export function sortInputFiles(files) {
184
190
  * @property {Array<File>|Array<string>|Array<ArrayBuffer>} [pdfFiles]
185
191
  * @property {Array<File>|Array<string>|Array<ArrayBuffer>} [imageFiles]
186
192
  * @property {Array<File>|Array<string>|Array<ArrayBuffer>} [ocrFiles]
193
+ * @property {Array<File>|Array<string>|Array<ArrayBuffer>} [scribeFiles]
187
194
  */
188
195
 
189
196
  /**
@@ -205,9 +212,11 @@ export async function importFiles(files) {
205
212
  let imageFiles = [];
206
213
  /** @type {Array<File|FileNode|ArrayBuffer>} */
207
214
  let ocrFiles = [];
215
+ /** @type {Array<File|FileNode|ArrayBuffer>} */
216
+ let scribeFiles = [];
208
217
  // These statements contain many ts-ignore comments, because the TypeScript interpreter apparently cannot properly narrow arrays.
209
218
  // See: https://github.com/microsoft/TypeScript/issues/42384
210
- if ('pdfFiles' in files || 'imageFiles' in files || 'ocrFiles' in files) {
219
+ if ('pdfFiles' in files || 'imageFiles' in files || 'ocrFiles' in files || 'scribeFiles' in files) {
211
220
  if (files.pdfFiles && files.pdfFiles[0] instanceof ArrayBuffer) {
212
221
  // @ts-ignore
213
222
  pdfFiles = files.pdfFiles;
@@ -229,14 +238,23 @@ export async function importFiles(files) {
229
238
  // @ts-ignore
230
239
  ocrFiles = await standardizeFiles(files.ocrFiles);
231
240
  }
241
+ if (files.scribeFiles && files.scribeFiles[0] instanceof ArrayBuffer) {
242
+ // @ts-ignore
243
+ scribeFiles = files.scribeFiles;
244
+ } else if (files.scribeFiles) {
245
+ // @ts-ignore
246
+ scribeFiles = await standardizeFiles(files.scribeFiles);
247
+ }
232
248
  } else {
233
249
  // @ts-ignore
234
250
  const filesStand = await standardizeFiles(files);
235
251
  if (files[0] instanceof ArrayBuffer) throw new Error('ArrayBuffer inputs must be sorted by file type.');
236
- ({ pdfFiles, imageFiles, ocrFiles } = sortInputFiles(filesStand));
252
+ ({
253
+ pdfFiles, imageFiles, ocrFiles, scribeFiles,
254
+ } = sortInputFiles(filesStand));
237
255
  }
238
256
 
239
- if (pdfFiles.length === 0 && imageFiles.length === 0 && ocrFiles.length === 0) {
257
+ if (pdfFiles.length === 0 && imageFiles.length === 0 && ocrFiles.length === 0 && scribeFiles.length === 0) {
240
258
  const errorText = 'No supported files found.';
241
259
  opt.errorHandler(errorText);
242
260
  return;
@@ -261,23 +279,61 @@ export async function importFiles(files) {
261
279
 
262
280
  // Set default download name
263
281
  if (pdfFiles.length > 0 && 'name' in pdfFiles[0]) {
264
- inputData.defaultDownloadFileName = `${pdfFiles[0].name.replace(/\.\w{1,4}$/, '')}.pdf`;
282
+ inputData.defaultDownloadFileName = `${pdfFiles[0].name.replace(/\.\w{1,6}$/, '')}.pdf`;
265
283
  } else if (imageFiles.length > 0 && 'name' in imageFiles[0]) {
266
- inputData.defaultDownloadFileName = `${imageFiles[0].name.replace(/\.\w{1,4}$/, '')}.pdf`;
284
+ inputData.defaultDownloadFileName = `${imageFiles[0].name.replace(/\.\w{1,6}$/, '')}.pdf`;
267
285
  } else if (ocrFiles.length > 0 && 'name' in ocrFiles[0]) {
268
- inputData.defaultDownloadFileName = `${ocrFiles[0].name.replace(/\.\w{1,4}$/, '')}.pdf`;
286
+ inputData.defaultDownloadFileName = `${ocrFiles[0].name.replace(/\.\w{1,6}$/, '')}.pdf`;
287
+ } else if (scribeFiles.length > 0 && 'name' in scribeFiles[0]) {
288
+ inputData.defaultDownloadFileName = `${scribeFiles[0].name.replace(/\.\w{1,6}$/, '')}.pdf`;
269
289
  }
270
290
 
291
+ let existingLayout = false;
292
+ let existingLayoutDataTable = false;
293
+
271
294
  inputData.pdfMode = pdfFiles.length === 1;
272
295
  inputData.imageMode = !!(imageFiles.length > 0 && !inputData.pdfMode);
273
296
  ImageCache.inputModes.image = !!(imageFiles.length > 0 && !inputData.pdfMode);
274
297
 
298
+ if (scribeFiles.length > 0) {
299
+ const scribeRestoreStr = await readOcrFile(scribeFiles[0]);
300
+ /** @type {ScribeSaveData} */
301
+ const scribeRestoreObj = JSON.parse(scribeRestoreStr);
302
+ if (scribeRestoreObj.fontState) {
303
+ objectAssignDefined(FontCont.state, scribeRestoreObj.fontState);
304
+ await runFontOptimization(ocrAll.active);
305
+ }
306
+ if (scribeRestoreObj.layoutRegions) {
307
+ existingLayout = true;
308
+ layoutRegions.pages = scribeRestoreObj.layoutRegions;
309
+ }
310
+ if (scribeRestoreObj.layoutDataTables) {
311
+ existingLayoutDataTable = true;
312
+ addCircularRefsDataTables(scribeRestoreObj.layoutDataTables);
313
+ layoutDataTables.pages = scribeRestoreObj.layoutDataTables;
314
+ }
315
+
316
+ const oemName = 'User Upload';
317
+ if (!ocrAll[oemName]) ocrAll[oemName] = Array(inputData.pageCount);
318
+ addCircularRefsOcr(scribeRestoreObj.ocr);
319
+ ocrAll[oemName] = scribeRestoreObj.ocr;
320
+ ocrAll.active = ocrAll[oemName];
321
+
322
+ for (let i = 0; i < ocrAll[oemName].length; i++) {
323
+ inputData.xmlMode[i] = true;
324
+ if (ocrAll[oemName][i].dims.height && ocrAll[oemName][i].dims.width) {
325
+ pageMetricsArr[i] = new PageMetrics(ocrAll[oemName][i].dims);
326
+ }
327
+ pageMetricsArr[i].angle = ocrAll[oemName][i].angle;
328
+ }
329
+ }
330
+
275
331
  const xmlModeImport = ocrFiles.length > 0;
276
332
 
277
333
  let pageCount;
278
334
  let pageCountImage;
279
335
  let abbyyMode = false;
280
- let scribeMode = false;
336
+ let reimportHocrMode = false;
281
337
 
282
338
  if (inputData.pdfMode) {
283
339
  const pdfFile = pdfFiles[0];
@@ -296,8 +352,6 @@ export async function importFiles(files) {
296
352
  pageCountImage = imageFiles.length;
297
353
  }
298
354
 
299
- let existingLayout = false;
300
- let existingLayoutDataTable = false;
301
355
  let existingOpt = false;
302
356
  const oemName = 'User Upload';
303
357
  let stextMode;
@@ -317,41 +371,32 @@ export async function importFiles(files) {
317
371
  ocrAllRaw.active = ocrAllRaw.active.slice(0, pageCountImage);
318
372
  }
319
373
 
374
+ objectAssignDefined(FontCont.state, ocrData.fontState);
375
+
320
376
  // Restore font metrics and optimize font from previous session (if applicable)
321
- if (ocrData.fontMetricsObj && Object.keys(ocrData.fontMetricsObj).length > 0) {
377
+ if (ocrData.fontState.charMetrics && Object.keys(ocrData.fontState.charMetrics).length > 0) {
322
378
  const fontPromise = loadBuiltInFontsRaw();
323
379
 
324
380
  existingOpt = true;
325
381
 
326
- replaceObjectProperties(fontMetricsObj, ocrData.fontMetricsObj);
327
382
  await gs.schedulerReady;
328
- setDefaultFontAuto(fontMetricsObj);
383
+ setDefaultFontAuto(FontCont.state.charMetrics);
329
384
 
330
385
  // If `ocrData.enableOpt` is `false`, then the metrics are present but ignored.
331
386
  // This occurs if optimization was found to decrease accuracy for both sans and serif,
332
387
  // not simply because the user disabled optimization in the view settings.
333
388
  // If no `enableOpt` property exists but metrics are present, then optimization is enabled.
334
389
  if (ocrData.enableOpt === 'false') {
335
- FontCont.enableOpt = false;
390
+ FontCont.state.enableOpt = false;
336
391
  } else {
337
392
  await fontPromise;
338
393
  if (!FontCont.raw) throw new Error('Raw font data not found.');
339
- FontCont.opt = await optimizeFontContainerAll(FontCont.raw, fontMetricsObj);
340
- FontCont.enableOpt = true;
394
+ FontCont.opt = await optimizeFontContainerAll(FontCont.raw, FontCont.state.charMetrics);
395
+ FontCont.state.enableOpt = true;
341
396
  await enableFontOpt(true);
342
397
  }
343
398
  }
344
399
 
345
- if (ocrData.defaultFont) FontCont.defaultFontName = ocrData.defaultFont;
346
-
347
- if (ocrData.sansFont) {
348
- FontCont.sansDefaultName = ocrData.sansFont;
349
- }
350
-
351
- if (ocrData.serifFont) {
352
- FontCont.serifDefaultName = ocrData.serifFont;
353
- }
354
-
355
400
  // Restore layout data from previous session (if applicable)
356
401
  if (ocrData.layoutObj) {
357
402
  for (let i = 0; i < ocrData.layoutObj.length; i++) {
@@ -368,22 +413,22 @@ export async function importFiles(files) {
368
413
  }
369
414
 
370
415
  abbyyMode = ocrData.abbyyMode;
371
- scribeMode = ocrData.scribeMode;
416
+ reimportHocrMode = ocrData.reimportHocrMode;
372
417
 
373
418
  stextMode = ocrData.stextMode;
374
419
  }
375
420
 
376
- const pageCountHOCR = ocrAllRaw.active?.length;
421
+ const pageCountOcr = ocrAllRaw.active?.length || ocrAll.active?.length || 0;
377
422
 
378
423
  // If both OCR data and image data are present, confirm they have the same number of pages
379
424
  if (xmlModeImport && (inputData.imageMode || inputData.pdfMode)) {
380
- if (pageCountImage !== pageCountHOCR) {
381
- const warningHTML = `Page mismatch detected. Image data has ${pageCountImage} pages while OCR data has ${pageCountHOCR} pages.`;
425
+ if (pageCountImage !== pageCountOcr) {
426
+ const warningHTML = `Page mismatch detected. Image data has ${pageCountImage} pages while OCR data has ${pageCountOcr} pages.`;
382
427
  opt.warningHandler(warningHTML);
383
428
  }
384
429
  }
385
430
 
386
- inputData.pageCount = pageCountImage ?? pageCountHOCR;
431
+ inputData.pageCount = pageCountImage ?? pageCountOcr;
387
432
 
388
433
  ocrAllRaw.active = ocrAllRaw.active || Array(pageCount);
389
434
 
@@ -399,10 +444,6 @@ export async function importFiles(files) {
399
444
  }
400
445
  }
401
446
 
402
- inputData.xmlMode = new Array(inputData.pageCount);
403
-
404
- inputData.xmlMode.fill(false);
405
-
406
447
  // Render first page for PDF only
407
448
  if (inputData.pdfMode && !xmlModeImport) {
408
449
  opt.progressHandler({ n: 0, type: 'importPDF', info: { } });
@@ -429,18 +470,23 @@ export async function importFiles(files) {
429
470
  if (stextMode) format = 'stext';
430
471
 
431
472
  // Process HOCR using web worker, reading from file first if that has not been done already
432
- await convertOCR(ocrAllRaw.active, true, format, oemName, scribeMode).then(async () => {
473
+ await convertOCR(ocrAllRaw.active, true, format, oemName, reimportHocrMode).then(async () => {
433
474
  // Skip this step if optimization info was already restored from a previous session, or if using stext (which is character-level but not visually accurate).
434
475
  if (!existingOpt && !stextMode) {
435
476
  await checkCharWarn(convertPageWarn);
436
- calcFontMetricsFromPages(ocrAll.active);
477
+ const charMetrics = calcCharMetricsFromPages(ocrAll.active);
478
+
479
+ if (Object.keys(charMetrics).length > 0) {
480
+ clearObjectProperties(FontCont.state.charMetrics);
481
+ Object.assign(FontCont.state.charMetrics, charMetrics);
482
+ }
437
483
  await runFontOptimization(ocrAll.active);
438
484
  }
439
485
  });
440
486
  } else if (inputData.pdfMode && (opt.usePDFText.native.main || opt.usePDFText.native.supp || opt.usePDFText.ocr.main || opt.usePDFText.ocr.supp)) {
441
487
  await extractInternalPDFText();
442
488
  if (inputData.pdfType === 'text' && opt.usePDFText.native.main || inputData.pdfType === 'ocr' && opt.usePDFText.ocr.main) {
443
- if (inputData.pdfType === 'text') FontCont.enableCleanToNimbusMono = true;
489
+ if (inputData.pdfType === 'text') FontCont.state.enableCleanToNimbusMono = true;
444
490
  if (opt.calcSuppFontInfo) await calcSuppFontInfo(ocrAll.pdf);
445
491
  }
446
492
  }
@@ -467,8 +513,6 @@ export async function importFilesSupp(files, ocrName) {
467
513
 
468
514
  const ocrData = await importOCRFiles(ocrFilesAll);
469
515
 
470
- const scribeMode = ocrData.scribeMode;
471
-
472
516
  const pageCountHOCR = ocrData.hocrRaw.length;
473
517
 
474
518
  // If both OCR data and image data are present, confirm they have the same number of pages
@@ -482,5 +526,5 @@ export async function importFilesSupp(files, ocrName) {
482
526
  if (ocrData.abbyyMode) format = 'abbyy';
483
527
  if (ocrData.stextMode) format = 'stext';
484
528
 
485
- await convertOCR(ocrData.hocrRaw, false, format, ocrName, scribeMode);
529
+ await convertOCR(ocrData.hocrRaw, false, format, ocrName, ocrData.reimportHocrMode);
486
530
  }