npm - scribe.js-ocr - Versions diffs - 0.2.6 → 0.2.7 - Mend

scribe.js-ocr 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

package/README.md +1 -0
package/js/containers/app.js +4 -0
package/js/containers/fontContainer.js +13 -0
package/js/containers/imageContainer.js +47 -28
package/js/debug.js +5 -19
package/js/export/export.js +3 -3
package/js/export/exportPDF.js +61 -49
package/js/export/exportRenderText.js +5 -1
package/js/extractPDFText.js +5 -5
package/js/fontContainerMain.js +13 -10
package/js/fontEval.js +7 -25
package/js/fontSupp.js +165 -0
package/js/generalWorkerMain.js +122 -84
package/js/import/convertPageAbbyy.js +1 -6
package/js/import/convertPageHocr.js +7 -11
package/js/import/convertPageStext.js +7 -2
package/js/import/import.js +5 -3
package/js/objects/ocrObjects.js +14 -0
package/js/recognizeConvert.js +26 -43
package/js/utils/fontUtils.js +16 -7
package/js/utils/miscUtils.js +20 -3
package/js/worker/compareOCRModule.js +13 -10
package/js/worker/renderWordCanvas.js +11 -6
package/mupdf/mupdf-worker.js +1 -2
package/package.json +1 -1
package/scribe.js +12 -0

package/README.md CHANGED Viewed

@@ -35,6 +35,7 @@ When using Scribe.js in the browser, all files must be served from the same orig
 The following are template repos showing how Scribe.js can be used within various frameworks/build systems.
 - Browser with ESM (no build): https://github.com/scribeocr/scribe.js-example-esm-browser
+- Browser with Next.js: https://github.com/scribeocr/scribe.js-example-next.js
 - Browser with Webpack 5: https://github.com/scribeocr/scribe.js-example-webpack5
 - Browser with Vue.js v2: https://github.com/scribeocr/scribe.js-example-vue2

package/js/containers/app.js CHANGED Viewed

@@ -57,6 +57,10 @@ export class opt {
   /** Generate debug visualizations when running OCR. */
   static debugVis = false;
+  static extractPDFFonts = false;
+  static calcSuppFontInfo = false;
 }
 export class inputData {

package/js/containers/fontContainer.js CHANGED Viewed

@@ -67,6 +67,8 @@ export function loadFontFace(fontFamily, fontStyle, fontWeight, src) {
   const fontFace = new FontFace(fontFamily, src1, { style: fontStyle, weight: fontWeight });
+  if (fontFace.status === 'error') throw new Error(`FontFace failed to load: ${fontFamily} ${fontStyle} ${fontWeight}`);
   // Fonts are stored in `document.fonts` for the main thread and `WorkerGlobalScope.fonts` for workers
   const fontSet = globalThis.document ? globalThis.document.fonts : globalThis.fonts;
@@ -157,6 +159,10 @@ export function FontContainerFont(family, style, src, opt, opentypeObj) {
   /** @type {("sans"|"serif")} */
   this.type = determineSansSerif(this.family) === 'SansDefault' ? 'sans' : 'serif';
   this.smallCapsMult = 0.75;
+  /**
+   * @type {boolean} - Disable font. This is used to prevent a flawed font extracted from a PDF from being used.
+   */
+  this.disable = false;
   if (typeof FontFace !== 'undefined') loadFontFace(this.fontFaceName, this.fontFaceStyle, this.fontFaceWeight, this.src);
 }
@@ -228,6 +234,9 @@ export class FontCont {
   /** @type {?FontContainer} */
   static opt = null;
+  /** @type {?Object<string, FontContainerFamilyUpload>} */
+  static doc = null;
   /** @type {?FontContainer} */
   static export = null;
@@ -298,6 +307,10 @@ export class FontCont {
      * @returns {FontContainerFont}
      */
   static getFont = (family, style = 'normal', lang = 'eng') => {
+    if (FontCont.doc?.[family]?.[style] && !FontCont.doc?.[family]?.[style]?.disable) {
+      return FontCont.doc[family][style];
+    }
     if (lang === 'chi_sim') {
       if (!FontCont.supp.chi_sim) throw new Error('chi_sim font does not exist.');
       return FontCont.supp.chi_sim;

package/js/containers/imageContainer.js CHANGED Viewed

@@ -6,7 +6,7 @@ import { initMuPDFWorker } from '../../mupdf/mupdf-async.js';
 import { getImageBitmap } from '../utils/imageUtils.js';
-import { setUploadFontsWorker } from '../fontContainerMain.js';
+import { updateFontContWorkerMain } from '../fontContainerMain.js';
 import { pageMetricsArr } from './dataContainer.js';
 import {
   FontCont,
@@ -16,7 +16,7 @@ import {
 import { gs } from '../generalWorkerMain.js';
 import { imageUtils } from '../objects/imageObjects.js';
-import { determineSansSerif, range } from '../utils/miscUtils.js';
+import { range } from '../utils/miscUtils.js';
 import { opt } from './app.js';
 let skipTextMode = false;
@@ -256,12 +256,12 @@ export class ImageCache {
     // If no preference is specified for upscaling, default to false.
     const upscaleArg = props?.upscaled || false;
-    const scheduler = await gs.getGeneralScheduler();
+    await gs.getGeneralScheduler();
     const resPromise = (async () => {
     // Wait for non-rotated version before replacing with promise
       if (typeof process === 'undefined') await gs.initTesseract({ anyOk: true });
-      return scheduler.recognize({
+      return gs.recognize({
         image: inputImage.src,
         options: { rotateRadians: angleArg, upscale: upscaleArg },
         output: {
@@ -525,7 +525,7 @@ export class ImageCache {
     // For reasons that are unclear, a small number of pages have been rendered into massive files
     // so a hard-cap on resolution must be imposed.
-    const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
+    const pageDPI = ImageCache.pdfDims300.map((x) => 300 * Math.min(x.width, 3500) / x.width);
     // In addition to capping the resolution, also switch the width/height
     ImageCache.pdfDims300.forEach((x, i) => {
@@ -534,42 +534,61 @@ export class ImageCache {
     });
     // WIP: Extract fonts embedded in PDFs.
-    if (false) {
+    // This feature is disabled by default as the results are often bad.
+    // In addition to only working for certain font formats, fonts embedded in PDFs are often subsetted and/or corrupted.
+    // Therefore, before this is enabled by default, more sophisticated rules regarding when fonts should be used are needed.
+    if (opt.extractPDFFonts) {
       muPDFScheduler.extractAllFonts().then(async (x) => {
-        globalImageCache.fontArr = [];
         for (let i = 0; i < x.length; i++) {
           const src = x[i].buffer;
-          const fontObj = await loadOpentype(src);
-          const fontNameEmbedded = fontObj.names.postScriptName.en;
-          const fontFamilyEmbedded = fontObj.names?.fontFamily?.en || fontNameEmbedded.replace(/-\w+$/, '');
+          let fontObj;
+          let fontData;
+          try {
+            fontObj = await loadOpentype(src);
+            // It is common for raw fonts embedded in PDFs to be invalid and rejected by the OTS, but running them through opentype.js fixes them.
+            // This appears to be because of the way that fonts are subsetted in PDFs.
+            fontData = fontObj.toArrayBuffer();
+          } catch (error) {
+            console.error(`Error loading font ${i}.`);
+            console.error(error);
+            continue;
+          }
-          // Skip bold and bold-italic fonts for now.
-          if (fontNameEmbedded.match(/bold/i)) continue;
+          const fontNameEmbedded = fontObj.names.postScriptName.en;
           let fontStyle = 'normal';
           if (fontNameEmbedded.match(/italic/i)) {
             fontStyle = 'italic';
           } else if (fontNameEmbedded.match(/bold/i)) {
-            // Bold fonts should be enabled at some later point.
-            // While we previously found that we were unable to detect bold fonts reliably,
-            // when importing from PDFs, we do not need to guess.
-            // fontStyle = 'bold';
+            fontStyle = 'bold';
           }
-          const type = determineSansSerif(fontFamilyEmbedded) === 'SansDefault' ? 'sans' : 'serif';
-          // mupdf replaces spaces with underscores in font names.
-          const fontName = fontFamilyEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');
-          if (!FontCont.raw[fontName]) {
-            FontCont.raw[fontName] = {};
-          }
-          if (!FontCont.raw[fontName][fontStyle]) {
-            FontCont.raw[fontName][fontStyle] = new FontContainerFont(fontName, fontStyle, src, false, fontObj);
+          // mupdf makes changes to font names, so we need to do the same.
+          // Font names in the form `MEDJCO+CenturySchoolbook` are changed to `CenturySchoolbook`.
+          // Spaces are replaced with underscores.
+          const fontName = fontNameEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');
+          if (!FontCont.doc?.[fontName]?.[fontStyle]) {
+            try {
+              const fontContainer = new FontContainerFont(fontName, fontStyle, fontData, false, fontObj);
+              if (!FontCont.doc) {
+                FontCont.doc = {};
+              }
+              if (!FontCont.doc[fontName]) {
+                FontCont.doc[fontName] = {};
+              }
+              FontCont.doc[fontName][fontStyle] = fontContainer;
+            } catch (error) {
+              console.error(`Error loading font ${fontName} ${fontStyle}.`);
+            }
+          } else {
+            console.warn(`Font ${fontName} ${fontStyle} already exists.`);
           }
         }
-        await setUploadFontsWorker(gs.schedulerInner);
+        await updateFontContWorkerMain();
       });
     }
   };

package/js/debug.js CHANGED Viewed

@@ -114,25 +114,11 @@ export async function drawDebugImages(args) {
 export async function renderPageStatic(page) {
   const image = await ImageCache.getNative(page.n, { rotated: opt.autoRotate, upscaled: false });
-  // The Node.js canvas package does not currently support worker threads
-  // https://github.com/Automattic/node-canvas/issues/1394
-  let res;
-  if (!(typeof process === 'undefined')) {
-    const { renderPageStaticImp } = await import('./worker/compareOCRModule.js');
-    res = await renderPageStaticImp({
-      page,
-      image,
-      angle: pageMetricsArr[page.n].angle,
-    });
-    // Browser case
-  } else {
-    if (!gs.scheduler) throw new Error('GeneralScheduler must be defined before this function can run.');
-    res = await gs.scheduler.renderPageStaticImp({
-      page,
-      image,
-      angle: pageMetricsArr[page.n].angle,
-    });
-  }
+  const res = gs.renderPageStaticImp({
+    page,
+    image,
+    angle: pageMetricsArr[page.n].angle,
+  });
   return res;
 }

package/js/export/export.js CHANGED Viewed

@@ -3,7 +3,7 @@ import { layoutRegions, ocrAll, pageMetricsArr } from '../containers/dataContain
 import { ImageCache } from '../containers/imageContainer.js';
 import { reorderOcrPage } from '../modifyOCR.js';
 import { saveAs } from '../utils/miscUtils.js';
-import { hocrToPDF } from './exportPDF.js';
+import { renderPDF } from './exportPDF.js';
 import { renderHOCR } from './exportRenderHOCR.js';
 import { renderText } from './exportRenderText.js';
@@ -60,7 +60,7 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
       // and assume that the overlay PDF is the same size as the input images.
       // The `maxpage` argument must be set manually to `inputData.pageCount-1`, as this avoids an error in the case where there is no OCR data (`hocrDownload` has length 0).
       // In all other cases, this should be equivalent to using the default argument of `-1` (which results in `hocrDownload.length` being used).
-      const pdfStr = await hocrToPDF(ocrDownload, 0, inputData.pageCount - 1, opt.displayMode, rotateText, rotateBackground,
+      const pdfStr = await renderPDF(ocrDownload, 0, inputData.pageCount - 1, opt.displayMode, rotateText, rotateBackground,
         { width: -1, height: -1 }, opt.confThreshHigh, opt.confThreshMed, opt.overlayOpacity / 100);
       const enc = new TextEncoder();
@@ -142,7 +142,7 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
         });
       }
     } else {
-      const pdfStr = await hocrToPDF(ocrDownload, minValue, maxValue, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
+      const pdfStr = await renderPDF(ocrDownload, minValue, maxValue, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
         opt.overlayOpacity / 100);
       // The PDF is still run through muPDF, even though in eBook mode no background layer is added.

package/js/export/exportPDF.js CHANGED Viewed

@@ -31,7 +31,7 @@ import ocr from '../objects/ocrObjects.js';
  *
  * A valid PDF will be created if an empty array is provided for `hocrArr`, as long as `maxpage` is set manually.
  */
-export async function hocrToPDF(hocrArr, minpage = 0, maxpage = -1, textMode = 'ebook', rotateText = false, rotateBackground = false,
+export async function renderPDF(hocrArr, minpage = 0, maxpage = -1, textMode = 'ebook', rotateText = false, rotateBackground = false,
   dimsLimit = { width: -1, height: -1 }, confThreshHigh = 85, confThreshMed = 75, proofOpacity = 0.8) {
   if (!FontCont.raw) throw new Error('No fonts loaded.');
@@ -52,13 +52,8 @@ export async function hocrToPDF(hocrArr, minpage = 0, maxpage = -1, textMode = '
   /** @type {Array<string>} */
   const pdfFontObjStrArr = [];
   let pdfFontsStr = '';
-  for (const familyKey of Object.keys(FontCont.raw)) {
-    const useOpt = FontCont.useOptFamily(familyKey);
-    const familyObj = {
-      normal: useOpt && FontCont.opt?.[familyKey]?.normal ? FontCont.opt[familyKey].normal : FontCont.raw[familyKey].normal,
-      italic: useOpt && FontCont.opt?.[familyKey]?.italic ? FontCont.opt[familyKey].italic : FontCont.raw[familyKey].italic,
-      bold: useOpt && FontCont.opt?.[familyKey]?.bold ? FontCont.opt[familyKey].bold : FontCont.raw[familyKey].bold,
-    };
+  const addFamilyObj = async (familyKey, familyObj) => {
     pdfFonts[familyKey] = {};
     for (const [key, value] of Object.entries(familyObj)) {
       const font = await value.opentype;
@@ -87,6 +82,22 @@ export async function hocrToPDF(hocrArr, minpage = 0, maxpage = -1, textMode = '
       pdfFontsStr += `/F${String(fontI)} ${String(objectThis)} 0 R\n`;
       fontI++;
     }
+  };
+  for (const familyKeyI of Object.keys(FontCont.raw)) {
+    const useOpt = FontCont.useOptFamily(familyKeyI);
+    const familyObjI = {
+      normal: useOpt && FontCont.opt?.[familyKeyI]?.normal ? FontCont.opt[familyKeyI].normal : FontCont.raw[familyKeyI].normal,
+      italic: useOpt && FontCont.opt?.[familyKeyI]?.italic ? FontCont.opt[familyKeyI].italic : FontCont.raw[familyKeyI].italic,
+      bold: useOpt && FontCont.opt?.[familyKeyI]?.bold ? FontCont.opt[familyKeyI].bold : FontCont.raw[familyKeyI].bold,
+    };
+    await addFamilyObj(familyKeyI, familyObjI);
+  }
+  if (FontCont.doc) {
+    for (const familyKeyI of Object.keys(FontCont.doc)) {
+      await addFamilyObj(familyKeyI, FontCont.doc[familyKeyI]);
+    }
   }
   /** @type {?import('opentype.js').Font} */
@@ -308,13 +319,13 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
     const { baseline } = lineObj;
     const linebox = lineObj.bbox;
-    let firstWord = words[0];
+    let wordJ = words[0];
     let fillColor = '0 0 0 rg';
     if (textMode === 'proof') {
-      if (firstWord.conf > confThreshHigh) {
+      if (wordJ.conf > confThreshHigh) {
         fillColor = '0 1 0.5 rg';
-      } else if (firstWord.conf > confThreshMed) {
+      } else if (wordJ.conf > confThreshMed) {
         fillColor = '1 0.8 0 rg';
       } else {
         fillColor = '1 0 0 rg';
@@ -327,41 +338,41 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
     textContentObjStr += `${fillColor}\n`;
-    let wordFont = FontCont.getWordFont(firstWord);
+    let wordFont = FontCont.getWordFont(wordJ);
     // The Chinese font is subset to only relevant characters, the others currently are not.
-    let wordFontOpentype = (firstWord.lang === 'chi_sim' ? fontChiSim : wordFont.opentype);
+    let wordFontOpentype = (wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype);
     if (!wordFontOpentype) {
-      const fontNameMessage = firstWord.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${firstWord.style})`;
+      const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
       console.log(`Skipping word due to missing font (${fontNameMessage})`);
       continue;
     }
     // let wordFontSize = calcWordFontSize(word);
-    const word0Metrics = calcWordMetrics(firstWord, angle);
+    const word0Metrics = calcWordMetrics(wordJ, angle);
     let wordFontSize = word0Metrics.fontSize;
     // Set font and font size
-    ({ name: pdfFontCurrent, type: pdfFontTypeCurrent } = firstWord.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][firstWord.style]);
+    ({ name: pdfFontCurrent, type: pdfFontTypeCurrent } = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style]);
     textContentObjStr += `${pdfFontCurrent} ${String(wordFontSize)} Tf\n`;
     // Reset baseline to line baseline
     textContentObjStr += '0 Ts\n';
-    const word0LeftBearing = firstWord.visualCoords ? word0Metrics.leftSideBearing : 0;
+    const word0LeftBearing = wordJ.visualCoords ? word0Metrics.leftSideBearing : 0;
     let tz = 100;
-    if (firstWord.dropcap) {
-      const wordWidthActual = firstWord.bbox.right - firstWord.bbox.left;
+    if (wordJ.dropcap) {
+      const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
       tz = (wordWidthActual / word0Metrics.visualWidth) * 100;
     }
     // Move to next line
-    const lineLeftAdj = firstWord.bbox.left - word0LeftBearing * (tz / 100) + angleAdjLine.x;
+    const lineLeftAdj = wordJ.bbox.left - word0LeftBearing * (tz / 100) + angleAdjLine.x;
     const lineTopAdj = linebox.bottom + baseline[1] + angleAdjLine.y;
     if (rotateText) {
@@ -379,7 +390,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
     let charSpacingLast = 0;
     let spacingAdj = 0;
     let kernSpacing = false;
-    let wordLast = firstWord;
+    let wordLast = wordJ;
     let wordFontOpentypeLast = wordFontOpentype;
     let fontSizeLast = wordFontSize;
     let tsCurrent = 0;
@@ -387,27 +398,27 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
     let charLig = false;
     for (let j = 0; j < words.length; j++) {
-      firstWord = words[j];
+      wordJ = words[j];
-      const wordMetrics = calcWordMetrics(firstWord, angle);
+      const wordMetrics = calcWordMetrics(wordJ, angle);
       wordFontSize = wordMetrics.fontSize;
       const charSpacing = wordMetrics.charSpacing;
       const charArr = wordMetrics.charArr;
-      const wordLeftBearing = firstWord.visualCoords ? wordMetrics.leftSideBearing : 0;
+      const wordLeftBearing = wordJ.visualCoords ? wordMetrics.leftSideBearing : 0;
       const kerningArr = wordMetrics.kerningArr;
-      wordFont = FontCont.getWordFont(firstWord);
-      wordFontOpentype = firstWord.lang === 'chi_sim' ? fontChiSim : wordFont.opentype;
+      wordFont = FontCont.getWordFont(wordJ);
+      wordFontOpentype = wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype;
       if (!wordFontOpentype) {
-        const fontNameMessage = firstWord.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${firstWord.style})`;
+        const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
         console.log(`Skipping word due to missing font (${fontNameMessage})`);
         continue;
       }
       fillColor = '0 0 0 rg';
       if (textMode === 'proof') {
-        const wordConf = firstWord.conf;
+        const wordConf = wordJ.conf;
         if (wordConf > confThreshHigh) {
           fillColor = '0 1 0.5 rg';
@@ -417,34 +428,35 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
           fillColor = '1 0 0 rg';
         }
       } else if (textMode === 'eval') {
-        fillColor = firstWord.matchTruth ? '0 1 0.5 rg' : '1 0 0 rg';
+        fillColor = wordJ.matchTruth ? '0 1 0.5 rg' : '1 0 0 rg';
       }
-      const angleAdjWord = firstWord.sup ? ocr.calcWordAngleAdj(firstWord) : { x: 0, y: 0 };
+      const angleAdjWord = wordJ.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
       const angleAdjWordX = (rotateBackground && Math.abs(angle ?? 0) > 0.05) ? angleAdjWord.x : 0;
-      // TODO: Test whether the math here is correct for drop caps.
       let ts = 0;
-      if (firstWord.sup) {
-        ts = (linebox.bottom + baseline[1] + angleAdjLine.y) - (firstWord.bbox.bottom + angleAdjLine.y + angleAdjWord.y);
-      } else if (firstWord.dropcap) {
-        ts = (linebox.bottom + baseline[1]) - firstWord.bbox.bottom + angleAdjLine.y + angleAdjWord.y;
+      if (wordJ.sup || wordJ.dropcap) {
+        ts = (linebox.bottom + baseline[1] + angleAdjLine.y) - (wordJ.bbox.bottom + angleAdjLine.y + angleAdjWord.y);
+        if (!wordJ.visualCoords) {
+          const fontDesc = wordFont.opentype.descender / wordFont.opentype.unitsPerEm * wordMetrics.fontSize;
+          ts -= fontDesc;
+        }
       } else {
         ts = 0;
       }
       // TODO: This probably fails for Chinese, rethink.
       tz = 100;
-      if (firstWord.dropcap) {
-        const wordWidthActual = firstWord.bbox.right - firstWord.bbox.left;
+      if (wordJ.dropcap) {
+        const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
         tz = (wordWidthActual / wordMetrics.visualWidth) * 100;
       }
       // const pdfFont = word.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFontFamily][word.style];
-      const { name: pdfFont, type: pdfFontType } = firstWord.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][firstWord.style];
+      const { name: pdfFont, type: pdfFontType } = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
-      const wordWidthAdj = (firstWord.bbox.right - firstWord.bbox.left) / cosAngle;
-      const wordSpaceAdj = (firstWord.bbox.left - wordBoxLast.right) / cosAngle;
+      const wordWidthAdj = (wordJ.bbox.right - wordJ.bbox.left) / cosAngle;
+      const wordSpaceAdj = (wordJ.bbox.left - wordBoxLast.right) / cosAngle;
       // Add space character between words
       if (j > 0 && !kernSpacing) {
@@ -468,13 +480,13 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
       }
       kernSpacing = false;
-      wordBoxLast = firstWord.bbox;
+      wordBoxLast = wordJ.bbox;
       // In general, we assume that (given our adjustments to character spacing) the rendered word has the same width as the image of that word.
       // However, this assumption does not hold for single-character words, as there is no space between character to adjust.
       // Therefore, we calculate the difference between the rendered and actual word and apply an adjustment to the width of the next space.
       // (This does not apply to drop caps as those have horizontal scaling applied to exactly match the image.)
-      if (charArr.length === 1 && !firstWord.dropcap) {
+      if (charArr.length === 1 && !wordJ.dropcap) {
         const wordLastGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
         const wordLastGlyphMetrics = wordLastGlyph.getMetrics();
         const lastCharWidth = (wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth) * (wordFontSize / wordFontOpentype.unitsPerEm);
@@ -485,7 +497,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
       textContentObjStr += ' ] TJ\n';
-      const fontSize = firstWord.smallCaps && firstWord.text[0] && firstWord.text[0] !== firstWord.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
+      const fontSize = wordJ.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
       if (pdfFont !== pdfFontCurrent || fontSize !== fontSizeLast) {
         textContentObjStr += `${pdfFont} ${String(fontSize)} Tf\n`;
         pdfFontCurrent = pdfFont;
@@ -512,23 +524,23 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
       // Non-ASCII and special characters are encoded/escaped using winEncodingLookup
       for (let k = 0; k < charArr.length; k++) {
         const letterSrc = charArr[k];
-        const letter = firstWord.smallCaps ? charArr[k].toUpperCase() : charArr[k];
-        const fontSizeLetter = firstWord.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
+        const letter = wordJ.smallCaps ? charArr[k].toUpperCase() : charArr[k];
+        const fontSizeLetter = wordJ.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
         const letterEnc = pdfFontTypeCurrent === 0 ? wordFontOpentype.charToGlyphIndex(letter)?.toString(16).padStart(4, '0') : winEncodingLookup[letter];
         if (letterEnc) {
           let kern = (kerningArr[k] || 0) * (-1000 / fontSizeLetter);
-          if (firstWord.lang === 'chi_sim' && j + 1 < words.length && words[j + 1].lang === 'chi_sim') {
+          if (wordJ.lang === 'chi_sim' && j + 1 < words.length && words[j + 1].lang === 'chi_sim') {
             kernSpacing = true;
             const wordNext = words[j + 1];
-            const wordSpaceNextAdj = (wordNext.bbox.left - firstWord.bbox.right) / cosAngle;
+            const wordSpaceNextAdj = (wordNext.bbox.left - wordJ.bbox.right) / cosAngle;
             // const wordSpaceNextAdj = wordNext.bbox.left - wordBox.right;
             const wordGlyphMetrics = wordFontOpentype.charToGlyph(charArr.at(-1)).getMetrics();
             const wordNextGlyphMetrics = wordFontOpentype.charToGlyph(wordNext.text.substr(0, 1)).getMetrics();
-            const wordRightBearing = firstWord.visualCoords ? wordGlyphMetrics.rightSideBearing * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
+            const wordRightBearing = wordJ.visualCoords ? wordGlyphMetrics.rightSideBearing * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
             const wordNextLeftBearing = wordNext.visualCoords ? wordNextGlyphMetrics.xMin * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
@@ -581,7 +593,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
         }
       }
-      wordLast = firstWord;
+      wordLast = wordJ;
       wordRightBearingLast = wordLast.visualCoords ? wordMetrics.rightSideBearing : 0;
       wordFontOpentypeLast = wordFontOpentype;
       charSpacingLast = charSpacing;

package/js/export/exportRenderText.js CHANGED Viewed

@@ -11,8 +11,10 @@ import { assignParagraphs } from '../utils/reflowPars.js';
  * @param {number} maxpage - The last page to include in the document.
  * @param {boolean} reflowText - Remove line breaks within what appears to be the same paragraph.
  * @param {boolean} docxMode - Create XML for a word document rather than plain text.
+ * @param {?Array<string>} wordIds - An array of word IDs to include in the document.
+ *    If omitted, all words are included.
  */
-export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false) {
+export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false, wordIds = null) {
   let textStr = '';
   if (maxpage === -1) maxpage = ocrCurrent.length - 1;
@@ -48,6 +50,8 @@ export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = f
         const wordObj = lineObj.words[i];
         if (!wordObj) continue;
+        if (wordIds && !wordIds.includes(wordObj.id)) continue;
         if (docxMode) {
           let fontStyle = '';
           if (wordObj.style === 'italic') {

package/js/extractPDFText.js CHANGED Viewed

@@ -1,6 +1,6 @@
+import { ocrAll, ocrAllRaw } from './containers/dataContainer.js';
 import { ImageCache } from './containers/imageContainer.js';
-import { convertOCRAll } from './recognizeConvert.js';
-import { ocrAllRaw, ocrAll } from './containers/dataContainer.js';
+import { convertOCR } from './recognizeConvert.js';
 /**
  * Extract raw text content from currently loaded PDF.
@@ -21,7 +21,7 @@ const extractInternalPDFTextRaw = async () => {
   };
   const stextArr = /** @type {Array<string>} */ ([]);
-  const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
+  const pageDPI = ImageCache.pdfDims300.map((x) => 300 * Math.min(x.width, 3500) / x.width);
   const resArr = pageDPI.map(async (x, i) => {
     // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
     // The XML format is the only built-in mupdf format that includes character-level granularity.
@@ -53,7 +53,7 @@ const extractInternalPDFTextRaw = async () => {
       // (1) The total number of letters is at least 100 per page on average.
       // (2) The number of pages with text (presumably what `pageCountTotalText` counts) is at least half of the total number of pages.
     } else if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
-      && pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
+      && pdfContentStats.pageCountTotalText >= ImageCache.pageCount / 2) {
       type = 'ocr';
       // Otherwise, the PDF is considered image-native.
       // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
@@ -102,7 +102,7 @@ export const extractInternalPDFText = async (options = {}) => {
   const format = 'stext';
   // Process HOCR using web worker, reading from file first if that has not been done already
-  await convertOCRAll(ocrAllRaw.active, true, format, 'pdf', false);
+  await convertOCR(ocrAllRaw.active, true, format, 'pdf', false);
   res.content = ocrAll.pdf;

package/js/fontContainerMain.js CHANGED Viewed

@@ -183,16 +183,19 @@ export async function enableFontOpt(enableOpt, forceOpt) {
  *    Set `loadRaw` to `true` or `false` to force the raw fonts to be loaded or not loaded, respectively.
  * @param {boolean} [params.loadOpt] - By default, optimized fonts are loaded if they have not been loaded before.
  *   Set `loadOpt` to `true` or `false` to force the optimized fonts to be loaded or not loaded, respectively.
+ * @param {boolean} [params.loadDoc] - By default, fonts extracted from PDF documents are loaded if they have not been loaded before.
+ *  Set `loadDoc` to `true` or `false` to force the document fonts to be loaded or not loaded, respectively.
  * @param {boolean} [params.updateProps]
  */
 export async function updateFontContWorkerMain(params = {}) {
-  const loadRaw = params.loadRaw === true || (params.loadRaw !== false && FontCont.raw && !gs.loadedBuiltInRawWorker);
-  const loadOpt = params.loadOpt === true || (params.loadOpt !== false && FontCont.opt && !gs.loadedBuiltInOptWorker);
+  const loadRaw = params.loadRaw === true || (params.loadRaw !== false && FontCont.raw && !gs.loadedBuiltInFontsRawWorker);
+  const loadOpt = params.loadOpt === true || (params.loadOpt !== false && FontCont.opt && !gs.loadedBuiltInFontsOptWorker);
+  const loadDoc = params.loadDoc === true || (params.loadDoc !== false && FontCont.doc && !gs.loadedBuiltInFontsDocWorker);
   // If the active font data is not already loaded, load it now.
   // This assumes that only one version of the raw/optimized fonts ever exist--
   // it does not check whether the current optimized font changed since it was last loaded.
-  for (const [type, load] of [['raw', loadRaw], ['opt', loadOpt]]) {
+  for (const [type, load] of [['raw', loadRaw], ['opt', loadOpt], ['doc', loadDoc]]) {
     if (!load) continue;
     const resArr = [];
@@ -214,9 +217,11 @@ export async function updateFontContWorkerMain(params = {}) {
       // TODO: consider the race condition when `setBuiltInFontsWorkers` is called multiple times quickly and `loadFontsWorker` is still running.
       if (type === 'opt') {
-        gs.loadedBuiltInOptWorker = true;
-      } else {
-        gs.loadedBuiltInRawWorker = true;
+        gs.loadedBuiltInFontsOptWorker = true;
+      } else if (type === 'raw') {
+        gs.loadedBuiltInFontsRawWorker = true;
+      } else if (type === 'doc') {
+        gs.loadedBuiltInFontsDocWorker = true;
       }
     }
     await Promise.all(resArr);
@@ -321,8 +326,6 @@ export function setDefaultFontAuto(fontMetricsObj) {
  * @param {Object.<string, FontMetricsFamily>} fontMetricsObj
  */
 export async function optimizeFontContainerFamily(fontFamily, fontMetricsObj) {
-  if (!gs.scheduler) throw new Error('GeneralScheduler must be defined before this function can run.');
   // When we have metrics for individual fonts families, those are used to optimize the appropriate fonts.
   // Otherwise, the "default" metric is applied to whatever font the user has selected as the default font.
   const multiFontMode = checkMultiFontMode(fontMetricsObj);
@@ -342,7 +345,7 @@ export async function optimizeFontContainerFamily(fontFamily, fontMetricsObj) {
   }
   const metricsNormal = fontMetricsObj[fontMetricsType][fontFamily.normal.style];
-  const normalOptFont = gs.scheduler.optimizeFont({ fontData: fontFamily.normal.src, fontMetricsObj: metricsNormal, style: fontFamily.normal.style })
+  const normalOptFont = gs.optimizeFont({ fontData: fontFamily.normal.src, fontMetricsObj: metricsNormal, style: fontFamily.normal.style })
     .then(async (x) => {
       const font = await loadOpentype(x.fontData, x.kerningPairs);
       return new FontContainerFont(fontFamily.normal.family, fontFamily.normal.style, x.fontData, true, font);
@@ -352,7 +355,7 @@ export async function optimizeFontContainerFamily(fontFamily, fontMetricsObj) {
   /** @type {?FontContainerFont|Promise<FontContainerFont>} */
   let italicOptFont = null;
   if (metricsItalic && metricsItalic.obs >= 200) {
-    italicOptFont = gs.scheduler.optimizeFont({ fontData: fontFamily.italic.src, fontMetricsObj: metricsItalic, style: fontFamily.italic.style })
+    italicOptFont = gs.optimizeFont({ fontData: fontFamily.italic.src, fontMetricsObj: metricsItalic, style: fontFamily.italic.style })
       .then(async (x) => {
         const font = await loadOpentype(x.fontData, x.kerningPairs);
         return new FontContainerFont(fontFamily.italic.family, fontFamily.italic.style, x.fontData, true, font);