npm - scribe.js-ocr - Versions diffs - 0.7.3 → 0.8.0 - Mend

scribe.js-ocr 0.7.3 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

package/cli/scribe.js +2 -0
package/fonts/all/Carlito-BoldItalic.woff +0 -0
package/fonts/all/Century-BoldItalic.woff +0 -0
package/fonts/all/Garamond-BoldItalic.woff +0 -0
package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
package/fonts/all/Palatino-BoldItalic.woff +0 -0
package/fonts/latin/Carlito-BoldItalic.woff +0 -0
package/fonts/latin/Century-BoldItalic.woff +0 -0
package/fonts/latin/Garamond-BoldItalic.woff +0 -0
package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
package/fonts/latin/Palatino-BoldItalic.woff +0 -0
package/js/clear.js +5 -6
package/js/containers/app.js +1 -1
package/js/containers/dataContainer.js +0 -3
package/js/containers/fontContainer.js +91 -77
package/js/export/export.js +20 -5
package/js/export/writeHocr.js +20 -18
package/js/export/writeHtml.js +1 -1
package/js/export/writePdf.js +52 -14
package/js/export/writePdfFonts.js +11 -9
package/js/export/writeTabular.js +2 -2
package/js/export/writeText.js +10 -6
package/js/extractTables.js +5 -5
package/js/fontContainerMain.js +92 -49
package/js/fontEval.js +12 -12
package/js/fontStatistics.js +93 -92
package/js/fontSupp.js +20 -20
package/js/generalWorkerMain.js +4 -0
package/js/global.d.ts +39 -4
package/js/import/convertPageAbbyy.js +55 -26
package/js/import/convertPageBlocks.js +2 -2
package/js/import/convertPageHocr.js +10 -20
package/js/import/convertPageShared.js +13 -9
package/js/import/convertPageStext.js +67 -32
package/js/import/import.js +89 -45
package/js/import/importOCR.js +27 -33
package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
package/js/objects/layoutObjects.js +37 -0
package/js/objects/ocrObjects.js +55 -19
package/js/recognizeConvert.js +21 -8
package/js/utils/fontUtils.js +11 -11
package/js/utils/miscUtils.js +43 -6
package/js/worker/compareOCRModule.js +20 -23
package/js/worker/generalWorker.js +5 -5
package/js/worker/optimizeFontModule.js +19 -19
package/mupdf/libmupdf.js +123 -17
package/mupdf/libmupdf.wasm +0 -0
package/package.json +6 -3

package/js/import/convertPageBlocks.js CHANGED

@@ -140,11 +140,11 @@ export async function convertPageBlocks({
           // The `word` object has a `is_italic` property, but it is always false.
           // Therefore, the font name is checked to determine if the word is italic.
           // See: https://github.com/naptha/tesseract.js/issues/907
-          if (keepItalic && /italic/i.test(word.font_name)) wordObj.style = 'italic';
+          if (keepItalic && /italic/i.test(word.font_name)) wordObj.style.italic = true;
           // Our fork of Tesseract Legacy should be able to recognize fonts, so this information is included.
           // The generic HOCR importer does not include font information, as this is assumed to be unreliable.
-          wordObj.font = word.font_name;
+          wordObj.style.font = word.font_name;
           wordObj.chars = [];
           for (let m = 0; m < word.symbols.length; m++) {

package/js/import/convertPageHocr.js CHANGED

@@ -247,8 +247,8 @@ export async function convertPageHocr({
       if (debugMode) wordObj.raw = match;
-      if (italic) wordObj.style = 'italic';
-      if (fontName) wordObj.font = fontName;
+      if (italic) wordObj.style.italic = true;
+      if (fontName) wordObj.style.font = fontName;
       wordObj.conf = wordConf;
@@ -302,19 +302,6 @@ export async function convertPageHocr({
       const styleStr = match.match(/style=['"]([^'"]+)/)?.[1];
-      let smallCaps = false;
-      /** @type {('normal'|'italic'|'bold')} */
-      let fontStyle = 'normal';
-      if (styleStr && /italic/i.test(styleStr)) {
-        fontStyle = 'italic';
-      } else if (styleStr && /bold/i.test(styleStr)) {
-        fontStyle = 'bold';
-      }
-      if (styleStr && /small-caps/i.test(styleStr)) {
-        smallCaps = true;
-      }
       const confMatch = titleStrWord.match(/(?:;|\s)x_wconf\s+(\d+)/)?.[1] || '0';
       const wordConf = parseInt(confMatch) || 0;
@@ -327,16 +314,19 @@ export async function convertPageHocr({
         const wordFontSizeStr = titleStrWord.match(/(?:;|\s)x_fsize\s+(\d+)/)?.[1];
         if (wordFontSizeStr) {
           const wordFontSize = parseInt(wordFontSizeStr);
-          if (wordFontSize) wordObj.size = wordFontSize;
+          if (wordFontSize) wordObj.style.size = wordFontSize;
         }
       }
-      wordObj.style = fontStyle;
-      if (fontName) wordObj.font = fontName;
+      if (styleStr) {
+        if (/italic/i.test(styleStr)) wordObj.style.italic = true;
+        if (/bold/i.test(styleStr)) wordObj.style.bold = true;
+        if (/small-caps/i.test(styleStr)) wordObj.style.smallCaps = true;
+      }
-      wordObj.sup = wordSup;
+      if (wordSup) wordObj.style.sup = true;
-      wordObj.smallCaps = smallCaps;
+      if (fontName) wordObj.style.font = fontName;
       wordObj.conf = wordConf;

package/js/import/convertPageShared.js CHANGED

@@ -41,7 +41,7 @@ export function pass2(pageObj, rotateAngle) {
     for (let j = 0; j < lineObj.words.length; j++) {
       const wordObj = lineObj.words[j];
       // Skip words that are already identified as small caps, however they can be used to validate other words.
-      if (wordObj.smallCaps) {
+      if (wordObj.style.smallCaps) {
         smallCapsWordArr.push(wordObj);
         firstWord = true;
         continue;
@@ -95,7 +95,7 @@ export function pass2(pageObj, rotateAngle) {
       for (let k = 0; k < smallCapsWordArr.length; k++) {
         const wordObj = smallCapsWordArr[k];
-        wordObj.smallCaps = true;
+        wordObj.style.smallCaps = true;
         if (!wordObj.chars || !titleCaseTotal) continue;
         // If title case, convert all letters after the first to lowercase.
@@ -161,8 +161,10 @@ export function pass2(pageObj, rotateAngle) {
       // If the entire word is a superscript, it does not need to be split.
       if (superN === wordObj.text.length) {
-        wordObj.sup = true;
-        wordObj.style = 'normal';
+        wordObj.style.sup = true;
+        wordObj.style.bold = false;
+        wordObj.style.italic = false;
+        wordObj.style.underline = false;
         continue;
       }
@@ -182,8 +184,10 @@ export function pass2(pageObj, rotateAngle) {
       wordObjSup.text = textSuper;
       wordObjSup.chars = charSuperArr;
-      wordObjSup.style = 'normal';
-      wordObjSup.sup = true;
+      wordObjSup.style.bold = false;
+      wordObjSup.style.italic = false;
+      wordObjSup.style.underline = false;
+      wordObjSup.style.sup = true;
       wordObjSup.id = `${wordObj.id}a`;
       ocr.calcWordBbox(wordObjSup);
@@ -280,13 +284,13 @@ export function pass3(pageObj) {
           // Do not include superscripts, dropcaps, and low-confidence words in all statistics.
           // Low-confidence words are included for font size calculations, as some lines only contain low-confidence words.
-          if (wordObj.sup || wordObj.dropcap) continue;
+          if (wordObj.style.sup || wordObj.style.dropcap) continue;
           const contentStrLetter = letterArr[k];
           const charHeight = charObj.bbox.bottom - charObj.bbox.top;
-          const ascChar = wordObj.smallCaps && /[A-Z0-9]/.test(contentStrLetter) || !wordObj.smallCaps && ascCharArr.includes(contentStrLetter);
-          const xChar = wordObj.smallCaps && /[a-z]/.test(contentStrLetter) || !wordObj.smallCaps && xCharArr.includes(contentStrLetter);
+          const ascChar = wordObj.style.smallCaps && /[A-Z0-9]/.test(contentStrLetter) || !wordObj.style.smallCaps && ascCharArr.includes(contentStrLetter);
+          const xChar = wordObj.style.smallCaps && /[a-z]/.test(contentStrLetter) || !wordObj.style.smallCaps && xCharArr.includes(contentStrLetter);
           // Save character heights to array for font size calculations
           lineAllHeightArr.push(charHeight);

package/js/import/convertPageStext.js CHANGED

@@ -5,7 +5,6 @@ import {
   calcBoxOverlap,
   calcLang,
   mean50,
-  quantile,
   round6,
   unescapeXml,
 } from '../utils/miscUtils.js';
@@ -98,10 +97,11 @@ export async function convertPageStext({ ocrStr, n }) {
       let baselineCurrent = 0;
       /** @type {Array<Array<string>>} */
-      const text = [];
+      const textArr = [];
       /** @type {Array<number>} */
       const wordLetterOrFontArrIndex = [];
-      let styleCurrent = 'normal';
+      let boldCurrent = false;
+      let italicCurrent = false;
       let familyCurrent = 'Default';
       /** Font size at the current position in the PDF, with no modifications. */
       let sizeCurrentRaw = 0;
@@ -110,8 +110,14 @@ export async function convertPageStext({ ocrStr, n }) {
       let superCurrent = false;
       let smallCapsCurrent;
       let smallCapsCurrentAlt;
-      /** @type {Array<string>} */
-      const styleArr = [];
+      /** @type {Array<boolean>} */
+      const boldArr = [];
+      /** @type {Array<boolean>} */
+      const italicArr = [];
+      /** @type {Array<boolean>} */
+      const underlineArr = [];
       /** @type {Array<boolean>} */
       const smallCapsArr = [];
       /** @type {Array<boolean>} */
@@ -144,6 +150,7 @@ export async function convertPageStext({ ocrStr, n }) {
        * @property {Quad} quad
        * @property {Point} origin
        * @property {string} text
+       * @property {number} flags
        */
       /**
@@ -158,8 +165,7 @@ export async function convertPageStext({ ocrStr, n }) {
         // Sometimes the font is changed before a space character, and othertimes it is changed after the space character.
         // This regex splits the string into elements that contain either (1) a font change or (2) a character.
         // The "quad" attribute includes 8 numbers (x and y coordinates for all 4 corners) however we only use capturing groups for 4
-        const stextCharRegex = /(<font[^>]+>\s*)|<char quad=['"](\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)[^>]*?x=['"]([\d.-]+)[^>]*?y=['"]([\d.-]+)['"][^>]*?c=['"]([^'"]+)['"]\s*\/>/ig;
+        const stextCharRegex = /(<font[^>]+>\s*)|<char quad=['"](\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)[^>]*?x=['"]([\d.-]+)[^>]*?y=['"]([\d.-]+)['"]([^>]*?c=['"][^'"]+['"])\s*\/>/ig;
         const stextMatches = [...wordStrArr[i].matchAll(stextCharRegex)];
         wordCharOrFontArr[i] = [];
@@ -167,7 +173,8 @@ export async function convertPageStext({ ocrStr, n }) {
           const fontStr = stextMatches[j][1];
           const fontNameStrI = fontStr?.match(/name=['"]([^'"]*)/)?.[1];
           const fontSizeStrI = fontStr?.match(/size=['"]([^'"]*)/)?.[1];
-          if (fontNameStrI && fontSizeStrI) {
+          // fontNameStrI can exist but be an empty string. Therefore, truthy/falsy checks are not sufficient.
+          if (fontNameStrI !== undefined && fontSizeStrI !== undefined) {
             // Skip font changes that occur at the end of a line.
             // In addition to being unnecessary, these are problematic when parsing superscripts.
             if (i + 1 === wordStrArr.length && j + 1 === stextMatches.length) continue;
@@ -209,10 +216,14 @@ export async function convertPageStext({ ocrStr, n }) {
             };
           }
+          const flags = parseInt(stextMatches[j][12]?.match(/flags=['"]([^'"]*)/)?.[1]);
+          const text = stextMatches[j][12]?.match(/c=['"]([^'"]*)/)?.[1];
           wordCharOrFontArr[i][j] = {
             quad,
             origin: { x: parseFloat(stextMatches[j][10]), y: parseFloat(stextMatches[j][11]) },
-            text: stextMatches[j][12],
+            flags,
+            text,
           };
         }
       }
@@ -220,6 +231,7 @@ export async function convertPageStext({ ocrStr, n }) {
       for (let i = 0; i < wordCharOrFontArr.length; i++) {
         let textWordArr = [];
         let bboxesWordArr = [];
+        const underlineWordArr = [];
         let fontFamily = familyCurrent || fontFamilyLine || 'Default';
         // Font size for the word is a separate variable, as if a font size changes at the end of the word,
         // that should not be reflected until the following word.
@@ -228,7 +240,8 @@ export async function convertPageStext({ ocrStr, n }) {
         let smallCapsWordAlt = smallCapsCurrentAlt || false;
         // Title case adjustment does not carry forward between words. A word in title case may be followed by a word in all lower case.
         let smallCapsWordAltTitleCaseAdj = false;
-        let styleWord = 'normal';
+        let boldWord = false;
+        let italicWord = false;
         if (wordCharOrFontArr[i].length === 0) continue;
@@ -276,9 +289,13 @@ export async function convertPageStext({ ocrStr, n }) {
             && ((baselineDelta < -0.25 && sizeDelta < -0.05) || (baselineDelta > 0.25 && sizeDelta > 0.05))) {
               // Split word when superscript starts or ends.
               if (textWordArr.length > 0) {
-                text.push(textWordArr);
+                textArr.push(textWordArr);
                 bboxes.push(bboxesWordArr);
-                styleArr.push(styleWord);
+                boldArr.push(boldWord);
+                italicArr.push(italicWord);
+                underlineArr.push(underlineWordArr.reduce((a, b) => Number(a) + Number(b), 0) / underlineWordArr.length > 0.5);
                 fontFamilyArr.push(fontFamily);
                 if (sizeDelta > 0) {
@@ -341,11 +358,15 @@ export async function convertPageStext({ ocrStr, n }) {
             if (/italic/i.test(charOrFont.name) || /-\w*ital/i.test(charOrFont.name) || /-it$/i.test(charOrFont.name) || /oblique/i.test(charOrFont.name)) {
               // The word is already initialized, so we need to change the last element of the style array.
               // Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
-              styleCurrent = 'italic';
-            } else if (/bold|black/i.test(charOrFont.name)) {
-              styleCurrent = 'bold';
+              italicCurrent = true;
             } else {
-              styleCurrent = 'normal';
+              italicCurrent = false;
+            }
+            if (/bold|black/i.test(charOrFont.name)) {
+              boldCurrent = true;
+            } else {
+              boldCurrent = false;
             }
             continue;
@@ -354,7 +375,9 @@ export async function convertPageStext({ ocrStr, n }) {
           }
           if (!wordInit) {
-            styleWord = styleCurrent;
+            boldWord = boldCurrent;
+            italicWord = italicCurrent;
             wordInit = true;
           }
@@ -411,15 +434,23 @@ export async function convertPageStext({ ocrStr, n }) {
           textWordArr.push(charOrFont.text);
+          underlineWordArr.push(charOrFont.flags === 2);
           bboxesWordArr.push(bbox);
         }
         if (textWordArr.length === 0) continue;
+        const underlineWord = underlineWordArr.reduce((a, b) => Number(a) + Number(b), 0) / underlineWordArr.length > 0.5;
+        underlineArr.push(underlineWord);
         wordLetterOrFontArrIndex.push(i);
-        text.push(textWordArr);
+        textArr.push(textWordArr);
         bboxes.push(bboxesWordArr);
-        styleArr.push(styleWord);
+        boldArr.push(boldWord);
+        italicArr.push(italicWord);
         fontFamilyArr.push(fontFamily);
         fontSizeArr.push(fontSizeWord);
         smallCapsAltArr.push(smallCapsWordAlt);
@@ -476,8 +507,8 @@ export async function convertPageStext({ ocrStr, n }) {
       lineObj.raw = xmlLine;
       let lettersKept = 0;
-      for (let i = 0; i < text.length; i++) {
-        const wordText = unescapeXml(text[i].join(''));
+      for (let i = 0; i < textArr.length; i++) {
+        const wordText = unescapeXml(textArr[i].join(''));
         if (wordText.trim() === '') continue;
@@ -490,8 +521,8 @@ export async function convertPageStext({ ocrStr, n }) {
         /** @type {Array<OcrChar>} */
         const charObjArr = [];
-        for (let j = 0; j < text[i].length; j++) {
-          const letter = unescapeXml(text[i][j]);
+        for (let j = 0; j < textArr[i].length; j++) {
+          const letter = unescapeXml(textArr[i][j]);
           const bbox = bboxesI[j];
@@ -526,7 +557,7 @@ export async function convertPageStext({ ocrStr, n }) {
         if (bbox.left < 0 && bbox.right < 0) continue;
         const wordObj = new ocr.OcrWord(lineObj, wordText, bbox, wordID);
-        wordObj.size = fontSizeArr[i];
+        wordObj.style.size = fontSizeArr[i];
         wordObj.lang = wordLang;
@@ -540,7 +571,7 @@ export async function convertPageStext({ ocrStr, n }) {
         wordObj.conf = 100;
         if (smallCapsAltArr[i] && !/[a-z]/.test(wordObj.text) && /[A-Z].?[A-Z]/.test(wordObj.text)) {
-          wordObj.smallCaps = true;
+          wordObj.style.smallCaps = true;
           if (smallCapsAltTitleCaseArr[i]) {
             wordObj.chars.slice(1).forEach((x) => {
               x.text = x.text.toLowerCase();
@@ -552,20 +583,24 @@ export async function convertPageStext({ ocrStr, n }) {
           }
           wordObj.text = wordObj.chars.map((x) => x.text).join('');
         } else if (smallCapsArr[i]) {
-          wordObj.smallCaps = true;
+          wordObj.style.smallCaps = true;
         }
-        if (styleArr[i] === 'italic') {
-          wordObj.style = 'italic';
-        } if (styleArr[i] === 'bold') {
-          wordObj.style = 'bold';
+        if (italicArr[i]) {
+          wordObj.style.italic = true;
+        }
+        if (boldArr[i]) {
+          wordObj.style.bold = true;
         }
         wordObj.raw = wordStrArr[wordLetterOrFontArrIndex[i]];
-        wordObj.font = fontFamilyArr[i];
+        wordObj.style.font = fontFamilyArr[i];
+        wordObj.style.sup = superArr[i];
-        wordObj.sup = superArr[i];
+        wordObj.style.underline = underlineArr[i];
         lineObj.words.push(wordObj);

package/js/import/import.js CHANGED

@@ -2,7 +2,6 @@ import { clearData } from '../clear.js';
 import { inputData, opt } from '../containers/app.js';
 import {
   convertPageWarn,
-  fontMetricsObj,
   layoutDataTables,
   layoutRegions,
   ocrAll,
@@ -18,14 +17,15 @@ import {
   optimizeFontContainerAll, setDefaultFontAuto,
 } from '../fontContainerMain.js';
 import { runFontOptimization } from '../fontEval.js';
-import { calcFontMetricsFromPages } from '../fontStatistics.js';
+import { calcCharMetricsFromPages } from '../fontStatistics.js';
 import { calcSuppFontInfo } from '../fontSupp.js';
 import { gs } from '../generalWorkerMain.js';
 import { imageUtils } from '../objects/imageObjects.js';
-import { LayoutDataTablePage, LayoutPage } from '../objects/layoutObjects.js';
+import { addCircularRefsDataTables, LayoutDataTablePage, LayoutPage } from '../objects/layoutObjects.js';
+import { addCircularRefsOcr } from '../objects/ocrObjects.js';
 import { PageMetrics } from '../objects/pageMetricsObjects.js';
 import { checkCharWarn, convertOCR } from '../recognizeConvert.js';
-import { replaceObjectProperties } from '../utils/miscUtils.js';
+import { readOcrFile, clearObjectProperties, objectAssignDefined } from '../utils/miscUtils.js';
 import { importOCRFiles } from './importOCR.js';
 /**
@@ -141,6 +141,8 @@ export function sortInputFiles(files) {
   /** @type {Array<File|FileNode>} */
   const pdfFilesAll = [];
   /** @type {Array<File|FileNode>} */
+  const scribeFilesAll = [];
+  /** @type {Array<File|FileNode>} */
   const unsupportedFilesAll = [];
   const unsupportedExt = {};
   for (let i = 0; i < files.length; i++) {
@@ -156,6 +158,8 @@ export function sortInputFiles(files) {
       // All .gz files are assumed to be OCR data (xml) since all other file types can be compressed already
     } else if (['hocr', 'xml', 'html', 'gz', 'stext'].includes(fileExt)) {
       ocrFilesAll.push(file);
+    } else if (['scribe'].includes(fileExt)) {
+      scribeFilesAll.push(file);
     } else if (['pdf'].includes(fileExt)) {
       pdfFilesAll.push(file);
     } else {
@@ -172,7 +176,9 @@ export function sortInputFiles(files) {
   imageFilesAll.sort((a, b) => ((a.name > b.name) ? 1 : ((b.name > a.name) ? -1 : 0)));
   ocrFilesAll.sort((a, b) => ((a.name > b.name) ? 1 : ((b.name > a.name) ? -1 : 0)));
-  return { pdfFiles: pdfFilesAll, imageFiles: imageFilesAll, ocrFiles: ocrFilesAll };
+  return {
+    pdfFiles: pdfFilesAll, imageFiles: imageFilesAll, ocrFiles: ocrFilesAll, scribeFiles: scribeFilesAll,
+  };
 }
 /**
@@ -184,6 +190,7 @@ export function sortInputFiles(files) {
  * @property {Array<File>|Array<string>|Array<ArrayBuffer>} [pdfFiles]
  * @property {Array<File>|Array<string>|Array<ArrayBuffer>} [imageFiles]
  * @property {Array<File>|Array<string>|Array<ArrayBuffer>} [ocrFiles]
+ * @property {Array<File>|Array<string>|Array<ArrayBuffer>} [scribeFiles]
  */
 /**
@@ -205,9 +212,11 @@ export async function importFiles(files) {
   let imageFiles = [];
   /** @type {Array<File|FileNode|ArrayBuffer>} */
   let ocrFiles = [];
+  /** @type {Array<File|FileNode|ArrayBuffer>} */
+  let scribeFiles = [];
   // These statements contain many ts-ignore comments, because the TypeScript interpreter apparently cannot properly narrow arrays.
   // See: https://github.com/microsoft/TypeScript/issues/42384
-  if ('pdfFiles' in files || 'imageFiles' in files || 'ocrFiles' in files) {
+  if ('pdfFiles' in files || 'imageFiles' in files || 'ocrFiles' in files || 'scribeFiles' in files) {
     if (files.pdfFiles && files.pdfFiles[0] instanceof ArrayBuffer) {
       // @ts-ignore
       pdfFiles = files.pdfFiles;
@@ -229,14 +238,23 @@ export async function importFiles(files) {
       // @ts-ignore
       ocrFiles = await standardizeFiles(files.ocrFiles);
     }
+    if (files.scribeFiles && files.scribeFiles[0] instanceof ArrayBuffer) {
+      // @ts-ignore
+      scribeFiles = files.scribeFiles;
+    } else if (files.scribeFiles) {
+      // @ts-ignore
+      scribeFiles = await standardizeFiles(files.scribeFiles);
+    }
   } else {
     // @ts-ignore
     const filesStand = await standardizeFiles(files);
     if (files[0] instanceof ArrayBuffer) throw new Error('ArrayBuffer inputs must be sorted by file type.');
-    ({ pdfFiles, imageFiles, ocrFiles } = sortInputFiles(filesStand));
+    ({
+      pdfFiles, imageFiles, ocrFiles, scribeFiles,
+    } = sortInputFiles(filesStand));
   }
-  if (pdfFiles.length === 0 && imageFiles.length === 0 && ocrFiles.length === 0) {
+  if (pdfFiles.length === 0 && imageFiles.length === 0 && ocrFiles.length === 0 && scribeFiles.length === 0) {
     const errorText = 'No supported files found.';
     opt.errorHandler(errorText);
     return;
@@ -261,23 +279,61 @@ export async function importFiles(files) {
   // Set default download name
   if (pdfFiles.length > 0 && 'name' in pdfFiles[0]) {
-    inputData.defaultDownloadFileName = `${pdfFiles[0].name.replace(/\.\w{1,4}$/, '')}.pdf`;
+    inputData.defaultDownloadFileName = `${pdfFiles[0].name.replace(/\.\w{1,6}$/, '')}.pdf`;
   } else if (imageFiles.length > 0 && 'name' in imageFiles[0]) {
-    inputData.defaultDownloadFileName = `${imageFiles[0].name.replace(/\.\w{1,4}$/, '')}.pdf`;
+    inputData.defaultDownloadFileName = `${imageFiles[0].name.replace(/\.\w{1,6}$/, '')}.pdf`;
   } else if (ocrFiles.length > 0 && 'name' in ocrFiles[0]) {
-    inputData.defaultDownloadFileName = `${ocrFiles[0].name.replace(/\.\w{1,4}$/, '')}.pdf`;
+    inputData.defaultDownloadFileName = `${ocrFiles[0].name.replace(/\.\w{1,6}$/, '')}.pdf`;
+  } else if (scribeFiles.length > 0 && 'name' in scribeFiles[0]) {
+    inputData.defaultDownloadFileName = `${scribeFiles[0].name.replace(/\.\w{1,6}$/, '')}.pdf`;
   }
+  let existingLayout = false;
+  let existingLayoutDataTable = false;
   inputData.pdfMode = pdfFiles.length === 1;
   inputData.imageMode = !!(imageFiles.length > 0 && !inputData.pdfMode);
   ImageCache.inputModes.image = !!(imageFiles.length > 0 && !inputData.pdfMode);
+  if (scribeFiles.length > 0) {
+    const scribeRestoreStr = await readOcrFile(scribeFiles[0]);
+    /** @type {ScribeSaveData} */
+    const scribeRestoreObj = JSON.parse(scribeRestoreStr);
+    if (scribeRestoreObj.fontState) {
+      objectAssignDefined(FontCont.state, scribeRestoreObj.fontState);
+      await runFontOptimization(ocrAll.active);
+    }
+    if (scribeRestoreObj.layoutRegions) {
+      existingLayout = true;
+      layoutRegions.pages = scribeRestoreObj.layoutRegions;
+    }
+    if (scribeRestoreObj.layoutDataTables) {
+      existingLayoutDataTable = true;
+      addCircularRefsDataTables(scribeRestoreObj.layoutDataTables);
+      layoutDataTables.pages = scribeRestoreObj.layoutDataTables;
+    }
+    const oemName = 'User Upload';
+    if (!ocrAll[oemName]) ocrAll[oemName] = Array(inputData.pageCount);
+    addCircularRefsOcr(scribeRestoreObj.ocr);
+    ocrAll[oemName] = scribeRestoreObj.ocr;
+    ocrAll.active = ocrAll[oemName];
+    for (let i = 0; i < ocrAll[oemName].length; i++) {
+      inputData.xmlMode[i] = true;
+      if (ocrAll[oemName][i].dims.height && ocrAll[oemName][i].dims.width) {
+        pageMetricsArr[i] = new PageMetrics(ocrAll[oemName][i].dims);
+      }
+      pageMetricsArr[i].angle = ocrAll[oemName][i].angle;
+    }
+  }
   const xmlModeImport = ocrFiles.length > 0;
   let pageCount;
   let pageCountImage;
   let abbyyMode = false;
-  let scribeMode = false;
+  let reimportHocrMode = false;
   if (inputData.pdfMode) {
     const pdfFile = pdfFiles[0];
@@ -296,8 +352,6 @@ export async function importFiles(files) {
     pageCountImage = imageFiles.length;
   }
-  let existingLayout = false;
-  let existingLayoutDataTable = false;
   let existingOpt = false;
   const oemName = 'User Upload';
   let stextMode;
@@ -317,41 +371,32 @@ export async function importFiles(files) {
       ocrAllRaw.active = ocrAllRaw.active.slice(0, pageCountImage);
     }
+    objectAssignDefined(FontCont.state, ocrData.fontState);
     // Restore font metrics and optimize font from previous session (if applicable)
-    if (ocrData.fontMetricsObj && Object.keys(ocrData.fontMetricsObj).length > 0) {
+    if (ocrData.fontState.charMetrics && Object.keys(ocrData.fontState.charMetrics).length > 0) {
       const fontPromise = loadBuiltInFontsRaw();
       existingOpt = true;
-      replaceObjectProperties(fontMetricsObj, ocrData.fontMetricsObj);
       await gs.schedulerReady;
-      setDefaultFontAuto(fontMetricsObj);
+      setDefaultFontAuto(FontCont.state.charMetrics);
       // If `ocrData.enableOpt` is `false`, then the metrics are present but ignored.
       // This occurs if optimization was found to decrease accuracy for both sans and serif,
       // not simply because the user disabled optimization in the view settings.
       // If no `enableOpt` property exists but metrics are present, then optimization is enabled.
       if (ocrData.enableOpt === 'false') {
-        FontCont.enableOpt = false;
+        FontCont.state.enableOpt = false;
       } else {
         await fontPromise;
         if (!FontCont.raw) throw new Error('Raw font data not found.');
-        FontCont.opt = await optimizeFontContainerAll(FontCont.raw, fontMetricsObj);
-        FontCont.enableOpt = true;
+        FontCont.opt = await optimizeFontContainerAll(FontCont.raw, FontCont.state.charMetrics);
+        FontCont.state.enableOpt = true;
         await enableFontOpt(true);
       }
     }
-    if (ocrData.defaultFont) FontCont.defaultFontName = ocrData.defaultFont;
-    if (ocrData.sansFont) {
-      FontCont.sansDefaultName = ocrData.sansFont;
-    }
-    if (ocrData.serifFont) {
-      FontCont.serifDefaultName = ocrData.serifFont;
-    }
     // Restore layout data from previous session (if applicable)
     if (ocrData.layoutObj) {
       for (let i = 0; i < ocrData.layoutObj.length; i++) {
@@ -368,22 +413,22 @@ export async function importFiles(files) {
     }
     abbyyMode = ocrData.abbyyMode;
-    scribeMode = ocrData.scribeMode;
+    reimportHocrMode = ocrData.reimportHocrMode;
     stextMode = ocrData.stextMode;
   }
-  const pageCountHOCR = ocrAllRaw.active?.length;
+  const pageCountOcr = ocrAllRaw.active?.length || ocrAll.active?.length || 0;
   // If both OCR data and image data are present, confirm they have the same number of pages
   if (xmlModeImport && (inputData.imageMode || inputData.pdfMode)) {
-    if (pageCountImage !== pageCountHOCR) {
-      const warningHTML = `Page mismatch detected. Image data has ${pageCountImage} pages while OCR data has ${pageCountHOCR} pages.`;
+    if (pageCountImage !== pageCountOcr) {
+      const warningHTML = `Page mismatch detected. Image data has ${pageCountImage} pages while OCR data has ${pageCountOcr} pages.`;
       opt.warningHandler(warningHTML);
     }
   }
-  inputData.pageCount = pageCountImage ?? pageCountHOCR;
+  inputData.pageCount = pageCountImage ?? pageCountOcr;
   ocrAllRaw.active = ocrAllRaw.active || Array(pageCount);
@@ -399,10 +444,6 @@ export async function importFiles(files) {
     }
   }
-  inputData.xmlMode = new Array(inputData.pageCount);
-  inputData.xmlMode.fill(false);
   // Render first page for PDF only
   if (inputData.pdfMode && !xmlModeImport) {
     opt.progressHandler({ n: 0, type: 'importPDF', info: { } });
@@ -429,18 +470,23 @@ export async function importFiles(files) {
     if (stextMode) format = 'stext';
     // Process HOCR using web worker, reading from file first if that has not been done already
-    await convertOCR(ocrAllRaw.active, true, format, oemName, scribeMode).then(async () => {
+    await convertOCR(ocrAllRaw.active, true, format, oemName, reimportHocrMode).then(async () => {
       // Skip this step if optimization info was already restored from a previous session, or if using stext (which is character-level but not visually accurate).
       if (!existingOpt && !stextMode) {
         await checkCharWarn(convertPageWarn);
-        calcFontMetricsFromPages(ocrAll.active);
+        const charMetrics = calcCharMetricsFromPages(ocrAll.active);
+        if (Object.keys(charMetrics).length > 0) {
+          clearObjectProperties(FontCont.state.charMetrics);
+          Object.assign(FontCont.state.charMetrics, charMetrics);
+        }
         await runFontOptimization(ocrAll.active);
       }
     });
   } else if (inputData.pdfMode && (opt.usePDFText.native.main || opt.usePDFText.native.supp || opt.usePDFText.ocr.main || opt.usePDFText.ocr.supp)) {
     await extractInternalPDFText();
     if (inputData.pdfType === 'text' && opt.usePDFText.native.main || inputData.pdfType === 'ocr' && opt.usePDFText.ocr.main) {
-      if (inputData.pdfType === 'text') FontCont.enableCleanToNimbusMono = true;
+      if (inputData.pdfType === 'text') FontCont.state.enableCleanToNimbusMono = true;
       if (opt.calcSuppFontInfo) await calcSuppFontInfo(ocrAll.pdf);
     }
   }
@@ -467,8 +513,6 @@ export async function importFilesSupp(files, ocrName) {
   const ocrData = await importOCRFiles(ocrFilesAll);
-  const scribeMode = ocrData.scribeMode;
   const pageCountHOCR = ocrData.hocrRaw.length;
   // If both OCR data and image data are present, confirm they have the same number of pages
@@ -482,5 +526,5 @@ export async function importFilesSupp(files, ocrName) {
   if (ocrData.abbyyMode) format = 'abbyy';
   if (ocrData.stextMode) format = 'stext';
-  await convertOCR(ocrData.hocrRaw, false, format, ocrName, scribeMode);
+  await convertOCR(ocrData.hocrRaw, false, format, ocrName, ocrData.reimportHocrMode);
 }