npm - scribe.js-ocr - Versions diffs - 0.3.0 → 0.4.0 - Mend

scribe.js-ocr 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

package/cli/cli.js +5 -4
package/cli/main.js +13 -7
package/cli/scribe.js +4 -0
package/js/containers/app.js +15 -0
package/js/containers/fontContainer.js +11 -0
package/js/containers/imageContainer.js +3 -3
package/js/export/export.js +4 -0
package/js/export/writePdf.js +3 -2
package/js/extractPDFText.js +3 -2
package/js/fontContainerMain.js +17 -2
package/js/fontEval.js +8 -3
package/js/fontSupp.js +1 -1
package/js/generalWorkerMain.js +9 -7
package/js/global.d.ts +1 -0
package/js/import/convertPageStext.js +124 -38
package/js/import/import.js +5 -2
package/js/objects/ocrObjects.js +32 -0
package/js/recognizeConvert.js +19 -5
package/js/utils/fontUtils.js +10 -2
package/js/utils/miscUtils.js +1 -1
package/js/worker/compareOCRModule.js +21 -4
package/js/worker/optimizeFontModule.js +6 -3
package/mupdf/libmupdf.wasm +0 -0
package/mupdf/mupdf-async.js +1 -1
package/mupdf/mupdf-worker.js +3 -1
package/package.json +1 -1
package/scribe.js +1 -1

package/cli/cli.js CHANGED Viewed

@@ -11,13 +11,13 @@ export const confCLI = async (ocrFile) => {
   process.exitCode = 0;
 };
-export const checkCLI = async (pdfFile, ocrFile) => {
-  await check(pdfFile, ocrFile);
+export const checkCLI = async (pdfFile, ocrFile, options) => {
+  await check(pdfFile, ocrFile, options);
   process.exitCode = 0;
 };
-export const evalInternalCLI = async (pdfFile, ocrFile) => {
-  const { evalMetrics } = await evalInternal(pdfFile, ocrFile);
+export const evalInternalCLI = async (pdfFile, ocrFile, options) => {
+  const { evalMetrics } = await evalInternal(pdfFile, ocrFile, options);
   const ignoreExtra = true;
   let metricWER;
@@ -53,6 +53,7 @@ export const extractCLI = async (pdfFile, outputDir, options) => {
  * @param {boolean} [options.robust]
  * @param {boolean} [options.conf]
  * @param {boolean} [options.vis]
+ * @param {number} [options.workers]
  */
 export const overlayCLI = async (pdfFile, ocrFile, outputDir, options) => {
   options.overlayMode = options.vis ? 'proof' : 'invis';

package/cli/main.js CHANGED Viewed

@@ -21,9 +21,11 @@ scribe.opt.saveDebugImages = debugMode;
  * @param {boolean} [params.robustConfMode]
  * @param {boolean} [params.printConf]
  * @param {"eval" | "ebook" | "proof" | "invis"} [params.overlayMode]
- *
+ * @param {number} [params.workerN]
  */
 async function main(func, params) {
+  scribe.opt.workerN = params.workerN || null;
   await scribe.init({
     pdf: true,
     ocr: true,
@@ -118,16 +120,20 @@ export const conf = async (ocrFile) => (main('conf', { ocrFile }));
  *
  * @param {string} pdfFile - Path to PDF file.
  * @param {string} ocrFile
+ * @param {Object} options
+ * @param {number} [options.workers]
  */
-export const check = async (pdfFile, ocrFile) => (main('check', { pdfFile, ocrFile }));
+export const check = async (pdfFile, ocrFile, options) => (main('check', { pdfFile, ocrFile, workerN: options?.workers }));
 /**
  * Evaluate internal OCR engine.
  *
  * @param {string} pdfFile - Path to PDF file.
  * @param {string} ocrFile - Path to OCR file containing ground truth.
+ * @param {Object} options
+ * @param {number} [options.workers]
  */
-export const evalInternal = async (pdfFile, ocrFile) => (main('eval', { pdfFile, ocrFile }));
+export const evalInternal = async (pdfFile, ocrFile, options) => (main('eval', { pdfFile, ocrFile, workerN: options?.workers }));
 /**
  *
@@ -138,10 +144,10 @@ export const evalInternal = async (pdfFile, ocrFile) => (main('eval', { pdfFile,
  * @param {boolean} [options.robust]
  * @param {boolean} [options.conf]
  * @param {"eval" | "ebook" | "proof" | "invis"} [options.overlayMode]
- * @returns
+ * @param {number} [options.workers]
  */
 export const overlay = async (pdfFile, ocrFile, outputDir, options) => (main('overlay', {
-  pdfFile, ocrFile, outputDir, robustConfMode: options?.robust || false, printConf: options?.conf || false, overlayMode: options?.overlayMode || 'invis',
+  pdfFile, ocrFile, outputDir, robustConfMode: options?.robust || false, printConf: options?.conf || false, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers,
 }));
 /**
@@ -149,9 +155,9 @@ export const overlay = async (pdfFile, ocrFile, outputDir, options) => (main('ov
  * @param {string} pdfFile - Path to PDF file.
  * @param {Object} options
  * @param {"eval" | "ebook" | "proof" | "invis"} [options.overlayMode]
- * @returns
+ * @param {number} [options.workers]
  */
-export const recognize = async (pdfFile, options) => (main('recognize', { pdfFile, overlayMode: options?.overlayMode || 'invis' }));
+export const recognize = async (pdfFile, options) => (main('recognize', { pdfFile, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers }));
 /**
  *

package/cli/scribe.js CHANGED Viewed

@@ -19,6 +19,7 @@ program
   .command('check')
   .argument('<pdf_file>', 'Input PDF file.')
   .argument('<ocr_file>', 'Input OCR file.  Accepts .hocr and Abbyy .xml (with character-level data enabled).')
+  .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
   .description('Calculate confidence metric for OCR data by running Tesseract OCR and comparing results.')
   .action(checkCLI);
@@ -26,6 +27,7 @@ program
   .command('eval')
   .argument('<pdf_file>', 'Input PDF file.')
   .argument('<ocr_file>', 'Input OCR file.  Accepts .hocr and Abbyy .xml (with character-level data enabled).')
+  .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
   .description('Evaluate internal OCR engine by recognizing document (provided PDF file), and comparing to ground truth (provided OCR file).')
   .action(evalInternalCLI);
@@ -46,6 +48,7 @@ program
   .option('-v, --vis', 'Print OCR text visibly over provided PDF file with colors coded by confidence.')
   .option('-c, --conf', 'Print average confidence metric for document.')
   .option('-r, --robust', 'Generate confidence metrics by running Tesseract OCR and comparing, rather than using confidence info in provided data.')
+  .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
   .description('Add OCR data to provided PDF file and save result as PDF.')
   .action(overlayCLI);
@@ -54,6 +57,7 @@ program
   .argument('<pdf_file>', 'Input PDF file.')
   .description('Recognize text in PDF file using internal OCR engine.')
   .option('-v, --vis', 'Print OCR text visibly over provided PDF file with colors coded by confidence.')
+  .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
   .action(recognizeCLI);
 program

package/js/containers/app.js CHANGED Viewed

@@ -61,6 +61,18 @@ export class opt {
   static extractPDFFonts = false;
   static calcSuppFontInfo = false;
+  static usePDFTextSupp = true;
+  static usePDFTextMain = true;
+  /**
+   * Number of workers to use. Must be set prior to initialization.
+   * If set to `null` (default), the number of workers will be set up to 6 (browser) or 8 (node),
+   * if the system has enough resources.
+   * @type {?number}
+   */
+  static workerN = null;
 }
 export class inputData {
@@ -70,6 +82,9 @@ export class inputData {
   /** `true` if user uploaded pdf */
   static pdfMode = false;
+  /** @type {?('text'|'ocr'|'image')} */
+  static pdfType = null;
   /** `true` if user uploaded image files (.png, .jpeg) */
   static imageMode = false;

package/js/containers/fontContainer.js CHANGED Viewed

@@ -263,6 +263,13 @@ export class FontCont {
   static sansDefaultName = 'NimbusSans';
+  /**
+   * If `false`, 'Courier' will not be cleaned to Nimbus Mono.
+   * This setting is useful because Tesseract sometimes misidentifies fonts as Courier, and when not the document default, Nimbus Mono is almost always incorrect.
+   * Even with this setting `false`, Nimbus Mono will still be used when the font is exactly 'NimbusMono' and Nimbus Mono can still be the document default font.
+   */
+  static enableCleanToNimbusMono = false;
   /** @type {?('latin'|'all')} */
   static glyphSet = null;
@@ -337,6 +344,8 @@ export class FontCont {
         family = 'Carlito';
       } else if (/Calibri/i.test(family)) {
         family = 'Carlito';
+      } else if (/Courier/i.test(family) && FontCont.enableCleanToNimbusMono) {
+        family = 'NimbusMono';
       }
     }
@@ -379,6 +388,8 @@ export class FontCont {
     FontCont.rawMetrics = null;
     FontCont.optMetrics = null;
+    FontCont.enableCleanToNimbusMono = false;
     FontCont.defaultFontName = 'SerifDefault';
     FontCont.serifDefaultName = 'NimbusRomNo9L';
     FontCont.sansDefaultName = 'NimbusSans';

package/js/containers/imageContainer.js CHANGED Viewed

@@ -159,9 +159,6 @@ export class ImageCache {
     image: false,
   };
-  /** @type {?('text'|'ocr'|'image')} */
-  static pdfType = null;
   static colorModeDefault = 'gray';
   /**
@@ -196,6 +193,9 @@ export class ImageCache {
     const workersPromiseArr = range(0, scheduler.workers.length - 1).map(async (x) => {
       const w = scheduler.workers[x];
+      if (w.pdfDoc) await w.freeDocument(w.pdfDoc);
       // The ArrayBuffer is transferred to the worker, so a new one must be created for each worker.
       // const fileData = await file.arrayBuffer();
       const fileDataCopy = fileData.slice(0);

package/js/export/export.js CHANGED Viewed

@@ -143,6 +143,8 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
           doc1: pdfOverlay, minpage: minValue, maxpage: maxValue, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, humanReadable: opt.humanReadablePDF,
         });
       }
+      w.freeDocument(pdfOverlay);
     } else {
       const pdfStr = await writePdf(ocrDownload, minValue, maxValue, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
         opt.overlayOpacity / 100);
@@ -169,6 +171,8 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
       content = await w.write({
         doc1: pdf, minpage: minValue, maxpage: maxValue, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, humanReadable: opt.humanReadablePDF,
       });
+      w.freeDocument(pdf);
     }
   } else if (format === 'hocr') {
     content = writeHocr(ocrAll.active, minValue, maxValue);

package/js/export/writePdf.js CHANGED Viewed

@@ -534,10 +534,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
             const wordSpaceNextAdj = (wordNext.bbox.left - wordJ.bbox.right) / cosAngle;
             // const wordSpaceNextAdj = wordNext.bbox.left - wordBox.right;
-            const wordGlyphMetrics = wordFontOpentype.charToGlyph(charArr.at(-1)).getMetrics();
+            const wordGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
+            const wordGlyphMetrics = wordGlyph.getMetrics();
             const wordNextGlyphMetrics = wordFontOpentype.charToGlyph(wordNext.text.substr(0, 1)).getMetrics();
-            const wordRightBearing = wordJ.visualCoords ? wordGlyphMetrics.rightSideBearing * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
+            const wordRightBearing = wordJ.visualCoords ? (wordGlyph.advanceWidth - wordGlyphMetrics.xMax) * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
             const wordNextLeftBearing = wordNext.visualCoords ? wordNextGlyphMetrics.xMin * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;

package/js/extractPDFText.js CHANGED Viewed

@@ -1,3 +1,4 @@
+import { inputData } from './containers/app.js';
 import { ocrAll, ocrAllRaw } from './containers/dataContainer.js';
 import { ImageCache } from './containers/imageContainer.js';
 import { convertOCR } from './recognizeConvert.js';
@@ -83,7 +84,7 @@ export const extractInternalPDFText = async (options = {}) => {
   const res = await extractInternalPDFTextRaw();
-  ImageCache.pdfType = res.type;
+  inputData.pdfType = res.type;
   ocrAllRaw.pdf = res.contentRaw;
   if (!extractPDFTextImage && res.type === 'image') return res;
@@ -102,7 +103,7 @@ export const extractInternalPDFText = async (options = {}) => {
   const format = 'stext';
   // Process HOCR using web worker, reading from file first if that has not been done already
-  await convertOCR(ocrAllRaw.active, true, format, 'pdf', false);
+  await convertOCR(ocrAllRaw.pdf, true, format, 'pdf', false);
   res.content = ocrAll.pdf;

package/js/fontContainerMain.js CHANGED Viewed

@@ -39,6 +39,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
   let /** @type {Promise<ArrayBuffer>} */nimbusSansNormal;
   let /** @type {Promise<ArrayBuffer>} */nimbusSansItalic;
   let /** @type {Promise<ArrayBuffer>} */nimbusSansBold;
+  let /** @type {Promise<ArrayBuffer>} */nimbusMonoNormal;
+  let /** @type {Promise<ArrayBuffer>} */nimbusMonoItalic;
+  let /** @type {Promise<ArrayBuffer>} */nimbusMonoBold;
   if (typeof process === 'undefined') {
     if (glyphSet === 'latin') {
       carlitoNormal = fetch(new URL('../fonts/latin/Carlito-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
@@ -59,6 +62,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
       nimbusSansNormal = fetch(new URL('../fonts/latin/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
       nimbusSansItalic = fetch(new URL('../fonts/latin/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
       nimbusSansBold = fetch(new URL('../fonts/latin/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
+      nimbusMonoNormal = fetch(new URL('../fonts/latin/NimbusMonoPS-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
+      nimbusMonoItalic = fetch(new URL('../fonts/latin/NimbusMonoPS-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
+      nimbusMonoBold = fetch(new URL('../fonts/latin/NimbusMonoPS-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
     } else {
       carlitoNormal = fetch(new URL('../fonts/all/Carlito-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
       carlitoItalic = fetch(new URL('../fonts/all/Carlito-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
@@ -78,6 +84,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
       nimbusSansNormal = fetch(new URL('../fonts/all/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
       nimbusSansItalic = fetch(new URL('../fonts/all/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
       nimbusSansBold = fetch(new URL('../fonts/all/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
+      nimbusMonoNormal = fetch(new URL('../fonts/all/NimbusMonoPS-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
+      nimbusMonoItalic = fetch(new URL('../fonts/all/NimbusMonoPS-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
+      nimbusMonoBold = fetch(new URL('../fonts/all/NimbusMonoPS-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
     }
   } else {
     const { readFile } = await import('fs/promises');
@@ -99,6 +108,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
     nimbusSansNormal = readFile(new URL('../fonts/all_ttf/NimbusSans-Regular.ttf', import.meta.url)).then((res) => res.buffer);
     nimbusSansItalic = readFile(new URL('../fonts/all_ttf/NimbusSans-Italic.ttf', import.meta.url)).then((res) => res.buffer);
     nimbusSansBold = readFile(new URL('../fonts/all_ttf/NimbusSans-Bold.ttf', import.meta.url)).then((res) => res.buffer);
+    nimbusMonoNormal = readFile(new URL('../fonts/all_ttf/NimbusMonoPS-Regular.ttf', import.meta.url)).then((res) => res.buffer);
+    nimbusMonoItalic = readFile(new URL('../fonts/all_ttf/NimbusMonoPS-Italic.ttf', import.meta.url)).then((res) => res.buffer);
+    nimbusMonoBold = readFile(new URL('../fonts/all_ttf/NimbusMonoPS-Bold.ttf', import.meta.url)).then((res) => res.buffer);
   }
   const srcObj = {
@@ -108,6 +120,7 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
     Palatino: { normal: await palatinoNormal, italic: await palatinoItalic, bold: await palatinoBold },
     NimbusRomNo9L: { normal: await nimbusRomNo9LNormal, italic: await nimbusRomNo9LItalic, bold: await nimbusRomNo9LBold },
     NimbusSans: { normal: await nimbusSansNormal, italic: await nimbusSansItalic, bold: await nimbusSansBold },
+    NimbusMono: { normal: await nimbusMonoNormal, italic: await nimbusMonoItalic, bold: await nimbusMonoBold },
   };
   FontCont.raw = await /** @type {FontContainer} */(/** @type {any} */(loadFontsFromSource(srcObj)));
@@ -256,7 +269,7 @@ export async function setUploadFontsWorker(scheduler) {
   /** @type {Object<string, fontSrcBuiltIn|fontSrcUpload>} */
   const fontsUpload = {};
   for (const [key, value] of Object.entries(FontCont.active)) {
-    if (!['Carlito', 'Century', 'Garamond', 'Palatino', 'NimbusRomNo9L', 'NimbusSans'].includes(key)) {
+    if (!['Carlito', 'Century', 'Garamond', 'Palatino', 'NimbusRomNo9L', 'NimbusSans', 'NimbusMono'].includes(key)) {
       fontsUpload[key] = {
         normal: value?.normal?.src, italic: value?.italic?.src, bold: value?.bold?.src,
       };
@@ -381,8 +394,9 @@ export async function optimizeFontContainerAll(fontPrivate, fontMetricsObj) {
   const palatinoPromise = optimizeFontContainerFamily(fontPrivate.Palatino, fontMetricsObj);
   const nimbusRomNo9LPromise = optimizeFontContainerFamily(fontPrivate.NimbusRomNo9L, fontMetricsObj);
   const nimbusSansPromise = optimizeFontContainerFamily(fontPrivate.NimbusSans, fontMetricsObj);
+  const nimbusMonoPromise = optimizeFontContainerFamily(fontPrivate.NimbusMono, fontMetricsObj);
-  const results = await Promise.all([carlitoPromise, centuryPromise, garamondPromise, palatinoPromise, nimbusRomNo9LPromise, nimbusSansPromise]);
+  const results = await Promise.all([carlitoPromise, centuryPromise, garamondPromise, palatinoPromise, nimbusRomNo9LPromise, nimbusSansPromise, nimbusMonoPromise]);
   if (results.every((x) => x === null)) return null;
@@ -393,5 +407,6 @@ export async function optimizeFontContainerAll(fontPrivate, fontMetricsObj) {
     Palatino: results[3],
     NimbusRomNo9L: results[4],
     NimbusSans: results[5],
+    NimbusMono: results[6],
   };
 }

package/js/fontEval.js CHANGED Viewed

@@ -50,6 +50,7 @@ export async function evaluateFonts(pageArr, opt) {
   const evalPalatino = !!(opt ? FontCont.opt?.Palatino : FontCont.raw?.Palatino);
   const evalGaramond = !!(opt ? FontCont.opt?.Garamond : FontCont.raw?.Garamond);
   const evalNimbusRomNo9L = !!(opt ? FontCont.opt?.NimbusRomNo9L : FontCont.raw?.NimbusRomNo9L);
+  const evalNimbusMono = !!(opt ? FontCont.opt?.NimbusMono : FontCont.raw?.NimbusMono);
   // The browser version runs in parallel using workers, however the Node.js version runs sequentially,
   // as the canvas package does not support workers, and trying to run in parallel causes problems.
@@ -63,6 +64,7 @@ export async function evaluateFonts(pageArr, opt) {
       palatino: evalPalatino ? evalPagesFont('Palatino', pageArr, opt) : null,
       garamond: evalGaramond ? evalPagesFont('Garamond', pageArr, opt) : null,
       nimbusRomNo9L: evalNimbusRomNo9L ? evalPagesFont('NimbusRomNo9L', pageArr, opt) : null,
+      nimbusMono: evalNimbusMono ? evalPagesFont('NimbusMono', pageArr, opt) : null,
     };
     fontMetricsTmp = {
@@ -72,6 +74,7 @@ export async function evaluateFonts(pageArr, opt) {
       palatino: await fontMetricsPromises.palatino,
       garamond: await fontMetricsPromises.garamond,
       nimbusRomNo9L: await fontMetricsPromises.nimbusRomNo9L,
+      nimbusMono: await fontMetricsPromises.nimbusMono,
     };
   } else {
     fontMetricsTmp = {
@@ -81,6 +84,7 @@ export async function evaluateFonts(pageArr, opt) {
       palatino: evalPalatino ? await evalPagesFont('Palatino', pageArr, opt) : null,
       garamond: evalGaramond ? await evalPagesFont('Garamond', pageArr, opt) : null,
       nimbusRomNo9L: evalNimbusRomNo9L ? await evalPagesFont('NimbusRomNo9L', pageArr, opt) : null,
+      nimbusMono: evalNimbusMono ? await evalPagesFont('NimbusMono', pageArr, opt) : null,
     };
   }
@@ -91,6 +95,7 @@ export async function evaluateFonts(pageArr, opt) {
     Palatino: fontMetricsTmp.palatino ? fontMetricsTmp.palatino.metricTotal / fontMetricsTmp.palatino.wordsTotal : null,
     Garamond: fontMetricsTmp.garamond ? fontMetricsTmp.garamond.metricTotal / fontMetricsTmp.garamond.wordsTotal : null,
     NimbusRomNo9L: fontMetricsTmp.nimbusRomNo9L ? fontMetricsTmp.nimbusRomNo9L.metricTotal / fontMetricsTmp.nimbusRomNo9L.wordsTotal : null,
+    NimbusMono: fontMetricsTmp.nimbusMono ? fontMetricsTmp.nimbusMono.metricTotal / fontMetricsTmp.nimbusMono.wordsTotal : null,
   };
   return fontMetrics;
@@ -106,7 +111,7 @@ const calcBestFonts = (fontMetrics) => {
   for (const [key, value] of Object.entries(fontMetrics)) {
     if (!['Carlito', 'NimbusSans'].includes(key)) continue;
-    if (value < minValueSans) {
+    if (value && value < minValueSans) {
       minValueSans = value;
       minKeySans = key;
     }
@@ -116,8 +121,8 @@ const calcBestFonts = (fontMetrics) => {
   let minValueSerif = Number.MAX_VALUE;
   for (const [key, value] of Object.entries(fontMetrics)) {
-    if (!['Century', 'Palatino', 'Garamond', 'NimbusRomNo9L'].includes(key)) continue;
-    if (value < minValueSerif) {
+    if (!['Century', 'Palatino', 'Garamond', 'NimbusRomNo9L', 'NimbusMono'].includes(key)) continue;
+    if (value && value < minValueSerif) {
       minValueSerif = value;
       minKeySerif = key;
     }

package/js/fontSupp.js CHANGED Viewed

@@ -159,7 +159,7 @@ export const calcSuppFontInfo = async (ocrArr) => {
     for (const line of page.lines) {
       for (const word of line.words) {
         if (word.font && word.size && FontProps.sizeMult[word.font]) {
-          word.size *= FontProps.sizeMult[word.font];
+          word.size = Math.round(word.size * FontProps.sizeMult[word.font] * 1000) / 1000;
         }
       }
     }

package/js/generalWorkerMain.js CHANGED Viewed

@@ -1,3 +1,5 @@
+import { opt } from './containers/app.js';
 /**
  * Initializes a general worker and returns an object with methods controlled by the worker.
  * @returns {Promise} A promise that resolves to an object with control methods.
@@ -265,14 +267,14 @@ export class gs {
       gs.#resReady = resolve;
     });
-    // Determine number of workers to use in the browser.
-    // This is the minimum of:
-    //      1. The number of cores
-    //      3. 6 (browser-imposed memory limits make going higher than 6 problematic, even on hardware that could support it)
-    // Node.js version only uses 1 worker.
-    let workerN = 1;
-    if (typeof process === 'undefined') {
+    let workerN;
+    if (opt.workerN) {
+      workerN = opt.workerN;
+    } else if (typeof process === 'undefined') {
       workerN = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 6);
+    } else {
+      const cpuN = Math.floor((await import('os')).cpus().length / 2);
+      workerN = Math.min(cpuN - 1, 8);
     }
     const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');

package/js/global.d.ts CHANGED Viewed

@@ -34,6 +34,7 @@ declare global {
         Palatino: FontContainerFamilyBuiltIn;
         NimbusRomNo9L: FontContainerFamilyBuiltIn;
         NimbusSans: FontContainerFamilyBuiltIn;
+        NimbusMono: FontContainerFamilyBuiltIn;
         [key: string]: FontContainerFamily;
     };

package/js/import/convertPageStext.js CHANGED Viewed

@@ -50,6 +50,10 @@ export async function convertPageStext({ ocrStr, n }) {
       const xmlLinePreChar = xmlLine.match(/^[\s\S]*?(?=<char)/)?.[0];
       if (!xmlLinePreChar) return;
+      const dirStr = xmlLinePreChar.match(/dir=['"]([^'"]*)/)?.[1];
+      const dirSlopeStr = dirStr?.match(/[-\d.]+$/)?.[0];
+      const dirSlope = dirSlopeStr ? parseFloat(dirSlopeStr) : null;
       const xmlLineFormatting = xmlLinePreChar?.match(/<font[^>]+/)?.[0];
       const fontName = xmlLineFormatting?.match(/name=['"]([^'"]*)/)?.[1];
       const fontSizeStr = xmlLineFormatting?.match(/size=['"]([^'"]*)/)?.[1];
@@ -81,7 +85,7 @@ export async function convertPageStext({ ocrStr, n }) {
       /** @type {Array<Array<{left: number, top: number, right: number, bottom: number}>>} */
       const bboxes = [];
-      const baselineSlopeArr = /** @type {Array<Number>} */ ([]);
+      let baselineFirstDone = false;
       const baselineFirst = /** @type {Array<Number>} */ ([]);
       let baselineCurrent = 0;
@@ -114,17 +118,72 @@ export async function convertPageStext({ ocrStr, n }) {
       /** @type {Array<boolean>} */
       const superArr = [];
-      const wordLetterOrFontArr = /** @type {Array<Array<RegExpExecArray>>} */([]);
+      /**
+       * @typedef {Object} Point
+       * @property {number} x - The x coordinate.
+       * @property {number} y - The y coordinate.
+       */
+      /**
+       * @typedef {Object} Quad
+       * @property {Point} ul - Upper left corner.
+       * @property {Point} ur - Upper right corner.
+       * @property {Point} ll - Lower left corner.
+       * @property {Point} lr - Lower right corner.
+       */
+      /**
+       * @typedef {Object} StextChar
+       * @property {Quad} quad
+       * @property {Point} origin
+       * @property {string} text
+       */
+      /**
+       * @typedef {Object} StextFont
+       * @property {string} name
+       * @property {number} size
+       */
+      const wordCharOrFontArr = /** @type {Array<Array<StextChar|StextFont>>} */([]);
       for (let i = 0; i < wordStrArr.length; i++) {
         // Fonts can be changed at any point in the word string.
         // Sometimes the font is changed before a space character, and other times it is changed after the space character.
         // This regex splits the string into elements that contain either (1) a font change or (2) a character.
         // The "quad" attribute includes 8 numbers (x and y coordinates for all 4 corners) however we only use capturing groups for 4
-        const stextCharRegex = /(<font[^>]+>\s*)|<char quad=['"](\s*[\d.-]+)(\s*[\d.-]+)(?:\s*[\d.-]+)(?:\s*[\d.-]+)(?:\s*[\d.-]+)(?:\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)[^>]*?y=['"]([\d.-]+)['"][^>]*?c=['"]([^'"]+)['"]\s*\/>/ig;
-        wordLetterOrFontArr[i] = [...wordStrArr[i].matchAll(stextCharRegex)];
+        const stextCharRegex = /(<font[^>]+>\s*)|<char quad=['"](\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)[^>]*?x=['"]([\d.-]+)[^>]*?y=['"]([\d.-]+)['"][^>]*?c=['"]([^'"]+)['"]\s*\/>/ig;
+        const stextMatches = [...wordStrArr[i].matchAll(stextCharRegex)];
+        wordCharOrFontArr[i] = [];
+        for (let j = 0; j < stextMatches.length; j++) {
+          const fontStr = stextMatches[j][1];
+          const fontNameStrI = fontStr?.match(/name=['"]([^'"]*)/)?.[1];
+          const fontSizeStrI = fontStr?.match(/size=['"]([^'"]*)/)?.[1];
+          if (fontNameStrI && fontSizeStrI) {
+            wordCharOrFontArr[i][j] = {
+              name: fontNameStrI,
+              size: parseFloat(fontSizeStrI),
+            };
+            continue;
+          }
+          const quad = {
+            ul: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) },
+            ur: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) },
+            ll: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) },
+            lr: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) },
+          };
+          wordCharOrFontArr[i][j] = {
+            quad,
+            origin: { x: parseFloat(stextMatches[j][10]), y: parseFloat(stextMatches[j][11]) },
+            text: stextMatches[j][12],
+          };
+        }
       }
-      for (let i = 0; i < wordLetterOrFontArr.length; i++) {
+      for (let i = 0; i < wordCharOrFontArr.length; i++) {
         let textWordArr = [];
         let bboxesWordArr = [];
         let fontFamily = familyCurrent || fontFamilyLine || 'Default';
@@ -137,28 +196,38 @@ export async function convertPageStext({ ocrStr, n }) {
         let smallCapsWordAltTitleCaseAdj = false;
         let styleWord = 'normal';
-        const letterOrFontArr = wordLetterOrFontArr[i];
-        if (letterOrFontArr.length === 0) continue;
+        if (wordCharOrFontArr[i].length === 0) continue;
         let wordInit = false;
-        for (let j = 0; j < letterOrFontArr.length; j++) {
-          const fontStr = letterOrFontArr[j][1];
-          const fontNameStrI = fontStr?.match(/name=['"]([^'"]*)/)?.[1];
-          const fontSizeStrI = fontStr?.match(/size=['"]([^'"]*)/)?.[1];
-          const baseline = parseFloat(letterOrFontArr[j][6]);
-          if (fontNameStrI && fontSizeStrI) {
+        for (let j = 0; j < wordCharOrFontArr[i].length; j++) {
+          const charOrFont = wordCharOrFontArr[i][j];
+          if ('name' in charOrFont) {
             // While small caps can be printed using special "small caps" fonts, they can also be printed using a regular font with a size change.
             // This block of code detects small caps printed in title case by checking for a decrease in font size after the first letter.
             // TODO: This logic currently fails when:
             // (1) Runs of small caps include punctuation, which is printed at the full size (and therefore is counted as a size increase ending small caps).
             // (2) Runs of small caps that start with lower-case letters, which do not conform to the expectation that runs of small caps start with a capital letter.
             const sizePrevRaw = sizeCurrentRaw;
-            sizeCurrentRaw = parseFloat(fontSizeStrI);
+            sizeCurrentRaw = charOrFont.size;
             const secondLetter = wordInit && textWordArr.length === 1 && /[A-Z]/.test(textWordArr[0]);
-            const baselineNextLetter = parseFloat(letterOrFontArr[j + 1]?.[6]) || parseFloat(wordLetterOrFontArr[i + 1]?.[0]?.[6])
-              || parseFloat(wordLetterOrFontArr[i + 1]?.[1]?.[6]) || parseFloat(wordLetterOrFontArr[i + 1]?.[2]?.[6]);
+            let baselineNextLetter;
+            const possibleNextLetter1 = wordCharOrFontArr[i][j + 1];
+            const possibleNextLetter2 = wordCharOrFontArr[i + 1]?.[0];
+            const possibleNextLetter3 = wordCharOrFontArr[i + 1]?.[1];
+            const possibleNextLetter4 = wordCharOrFontArr[i + 1]?.[2];
+            if (possibleNextLetter1 && 'origin' in possibleNextLetter1) {
+              baselineNextLetter = possibleNextLetter1.origin.y;
+            } else if (possibleNextLetter2 && 'origin' in possibleNextLetter2) {
+              baselineNextLetter = possibleNextLetter2.origin.y;
+            } else if (possibleNextLetter3 && 'origin' in possibleNextLetter3) {
+              baselineNextLetter = possibleNextLetter3.origin.y;
+            } else if (possibleNextLetter4 && 'origin' in possibleNextLetter4) {
+              baselineNextLetter = possibleNextLetter4.origin.y;
+            }
             const fontSizeMin = Math.min(sizeCurrentRaw, sizePrevRaw);
             const baselineDelta = (baselineNextLetter - baselineCurrent) / fontSizeMin;
             const sizeDelta = (sizeCurrentRaw - sizePrevRaw) / fontSizeMin;
@@ -177,7 +246,13 @@ export async function convertPageStext({ ocrStr, n }) {
                 bboxes.push(bboxesWordArr);
                 styleArr.push(styleWord);
                 fontFamilyArr.push(fontFamily);
-                fontSizeArr.push(fontSizeWord);
+                if (sizeDelta > 0) {
+                  fontSizeArr.push(sizePrevRaw);
+                } else {
+                  fontSizeArr.push(fontSizeWord);
+                }
                 smallCapsArr.push(smallCapsWord);
                 smallCapsAltArr.push(smallCapsWordAlt);
                 smallCapsAltTitleCaseArr.push(smallCapsWordAltTitleCaseAdj);
@@ -187,21 +262,25 @@ export async function convertPageStext({ ocrStr, n }) {
                 bboxesWordArr = [];
               }
-              // If the first word was determined to be a superscript, reset `baselineFirst` to avoid skewing the slope calculation.
               if (sizeDelta > 0) {
-                baselineFirst.length = 0;
-                familyCurrent = fontNameStrI || familyCurrent;
+                // If the first word was determined to be a superscript, reset `baselineFirst` to avoid skewing the slope calculation.
+                if (!baselineFirstDone) baselineFirst.length = 0;
+                familyCurrent = charOrFont.name || familyCurrent;
                 sizeCurrent = sizeCurrentRaw || sizeCurrent;
                 fontSizeWord = sizeCurrent;
                 fontFamily = familyCurrent;
                 superArr[superArr.length - 1] = true;
-                fontSizeArr[fontSizeArr.length - 1] = sizeCurrentRaw;
+              }
+              // If `baselineFirst` was set using a non-superscript word, mark it as done.
+              if (superArr.length > 0 && !superArr[superArr.length - 1] && baselineFirst.length > 0) {
+                baselineFirstDone = true;
               }
               superCurrent = sizeDelta < 0;
             } else {
               sizeCurrent = sizeCurrentRaw || sizeCurrent;
-              familyCurrent = fontNameStrI || familyCurrent;
+              familyCurrent = charOrFont.name || familyCurrent;
               // Update current word only if this is before every letter in the word.
               if (textWordArr.length === 0) {
                 fontSizeWord = sizeCurrent;
@@ -210,7 +289,7 @@ export async function convertPageStext({ ocrStr, n }) {
               // An increase in font size ends any small caps sequence.
               // A threshold is necessary because stext data has been observed to have small variations without a clear reason.
               // eslint-disable-next-line no-lonely-if
-              if (Math.abs(sizeDelta) > 0.05) {
+              if (Number.isFinite(sizeDelta) && Math.abs(sizeDelta) > 0.05) {
                 smallCapsCurrentAlt = false;
                 if (textWordArr.length === 0) {
                   superCurrent = false;
@@ -222,14 +301,14 @@ export async function convertPageStext({ ocrStr, n }) {
             // Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
             smallCapsCurrentAlt = smallCapsCurrentAlt ?? smallCapsAltArr[smallCapsAltArr.length - 1];
-            smallCapsCurrent = /(small\W?cap)|(sc$)|(caps$)/i.test(fontNameStrI);
+            smallCapsCurrent = /(small\W?cap)|(sc$)|(caps$)/i.test(charOrFont.name);
             smallCapsWord = smallCapsCurrent;
-            if (/italic/i.test(fontNameStrI) || /-\w*ital/i.test(fontNameStrI)) {
+            if (/italic/i.test(charOrFont.name) || /-\w*ital/i.test(charOrFont.name)) {
               // The word is already initialized, so we need to change the last element of the style array.
               // Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
               styleCurrent = 'italic';
-            } else if (/bold|black/i.test(fontNameStrI)) {
+            } else if (/bold|black/i.test(charOrFont.name)) {
               styleCurrent = 'bold';
             } else {
               styleCurrent = 'normal';
@@ -237,7 +316,7 @@ export async function convertPageStext({ ocrStr, n }) {
             continue;
           } else {
-            baselineCurrent = baseline;
+            baselineCurrent = charOrFont.origin.y;
           }
           if (!wordInit) {
@@ -246,24 +325,22 @@ export async function convertPageStext({ ocrStr, n }) {
           }
           const bbox = {
-            left: Math.round(parseFloat(letterOrFontArr[j][2])),
-            top: Math.round(parseFloat(letterOrFontArr[j][3])),
-            right: Math.round(parseFloat(letterOrFontArr[j][4])),
-            bottom: Math.round(parseFloat(letterOrFontArr[j][5])),
+            left: Math.round(charOrFont.origin.x),
+            top: Math.round(Math.min(charOrFont.quad.ul.y, charOrFont.quad.ur.y)),
+            right: Math.round(charOrFont.origin.x + (charOrFont.quad.ur.x - charOrFont.quad.ul.x)),
+            bottom: Math.round(Math.max(charOrFont.quad.ll.y, charOrFont.quad.lr.y)),
           };
           if (!superCurrent) {
             if (baselineFirst.length === 0) {
-              baselineFirst.push(bbox.left, baseline);
-            } else {
-              baselineSlopeArr.push((baseline - baselineFirst[1]) / (bbox.left - baselineFirst[0]));
+              baselineFirst.push(bbox.left, charOrFont.origin.y);
             }
           }
           // Small caps created by reducing font size can carry forward across multiple words.
           smallCapsCurrentAlt = smallCapsCurrentAlt ?? smallCapsAltArr[smallCapsAltArr.length - 1];
-          textWordArr.push(letterOrFontArr[j][7]);
+          textWordArr.push(charOrFont.text);
           bboxesWordArr.push(bbox);
         }
@@ -288,13 +365,19 @@ export async function convertPageStext({ ocrStr, n }) {
         }
         superArr.push(superCurrent);
+        if (superCurrent) fontSizeArr[fontSizeArr.length - 1] = sizeCurrentRaw;
       }
       // Return if there are no letters in the line.
       // This commonly happens for "lines" that contain only space characters.
       if (bboxes.length === 0) return;
-      const baselineSlope = quantile(baselineSlopeArr, 0.5) || 0;
+      let baselineSlope = 0;
+      if (dirSlope !== null) {
+        baselineSlope = dirSlope;
+      } else {
+        console.log('Unable to parse slope.');
+      }
       const lineBbox = {
         left: lineBoxArr[0], top: lineBoxArr[1], right: lineBoxArr[2], bottom: lineBoxArr[3],
@@ -427,7 +510,10 @@ export async function convertPageStext({ ocrStr, n }) {
     for (let i = 0; i < lineStrArr.length; i++) {
       const angle = convertLineStext(lineStrArr[i]);
-      if (typeof angle === 'number' && !Number.isNaN(angle)) angleRisePage.push(angle);
+      // The `Math.abs(angle) < 0.3` condition avoids vertical text impacting the angle calculation.
+      // The page angle is intended to account for page skew, not different orientations (90/180/270 degrees).
+      // TODO: Eventually different orientations should be supported.
+      if (typeof angle === 'number' && !Number.isNaN(angle) && Math.abs(angle) < 0.3) angleRisePage.push(angle);
     }
     if (parLineArr.length === 0) return;

package/js/import/import.js CHANGED Viewed

@@ -447,9 +447,12 @@ export async function importFiles(files, options = {}) {
     });
   } else if (inputData.pdfMode && (extractPDFTextNative || extractPDFTextOCR)) {
     await extractInternalPDFText({
-      setActive: true, extractPDFTextNative, extractPDFTextOCR, extractPDFTextImage,
+      setActive: opt.usePDFTextMain, extractPDFTextNative, extractPDFTextOCR, extractPDFTextImage,
     });
-    if (opt.calcSuppFontInfo) await calcSuppFontInfo(ocrAll.pdf);
+    if (opt.usePDFTextMain) {
+      if (inputData.pdfType === 'text') FontCont.enableCleanToNimbusMono = true;
+      if (opt.calcSuppFontInfo) await calcSuppFontInfo(ocrAll.pdf);
+    }
   }
 }

package/js/objects/ocrObjects.js CHANGED Viewed

@@ -644,6 +644,37 @@ function cloneChar(char) {
   return charNew;
 }
+/**
+ * Gets words that match the provided text.
+ * Matching is case-insensitive and substring-based: `text` is trimmed, lower-cased, and split on single spaces,
+ * and each run of that many consecutive page words is tested. A run matches when its first word contains the
+ * first search token and the space-joined, lower-cased text of the run contains the full search text.
+ * All words of every matching run are returned, so a word inside more than one matching run appears once per run.
+ * @param {string} text - Search text. Text that is empty after trimming yields an empty array.
+ * @param {OcrPage} ocrPage - Page whose words (as returned by `ocr.getPageWords`) are searched.
+ * @returns {Array} Word objects from the matching runs, in the order the runs are found.
+ */
+function getMatchingWords(text, ocrPage) {
+  text = text.trim().toLowerCase();
+  if (!text) return [];
+  const textArr = text.split(' ');
+  const wordArr = ocr.getPageWords(ocrPage);
+  const matchArr = [];
+  for (let i = 0; i < wordArr.length - (textArr.length - 1); i++) {
+    const word = wordArr[i];
+    if (!word.text.toLowerCase().includes(textArr[0])) continue;
+    const candArr = wordArr.slice(i, i + textArr.length);
+    const candText = candArr.map((x) => x.text).join(' ').toLowerCase();
+    if (candText.toLowerCase().includes(text)) {
+      matchArr.push(...candArr);
+    }
+  }
+  return matchArr;
+}
 /**
  * Gets word IDs that match the provided text.
  * @param {string} text
@@ -729,6 +760,7 @@ const ocr = {
   getPageWord,
   getPageWords,
   getDistinctChars,
+  getMatchingWords,
   getMatchingWordIds,
   getPageText,
   getParText,

package/js/recognizeConvert.js CHANGED Viewed

@@ -529,14 +529,21 @@ export async function recognize(options = {}) {
   if (langs.includes('rus') || langs.includes('ukr') || langs.includes('ell')) fontPromiseArr.push(loadBuiltInFontsRaw('all'));
   await Promise.all(fontPromiseArr);
-  /** @type {?OcrPage[]} */
-  const existingOCR = ocrAll['User Upload'] || ocrAll.pdf;
+  let forceMainData = false;
+  let existingOCR;
+  if (ocrAll['User Upload']) {
+    existingOCR = ocrAll['User Upload'];
+  } else if (opt.usePDFTextSupp && ocrAll.pdf) {
+    existingOCR = ocrAll.pdf;
+    // If the PDF text is not the active data, it is assumed to be for supplemental purposes only.
+    forceMainData = ocrAll.pdf !== ocrAll.active;
+  }
   // A single Tesseract engine can be used (Legacy or LSTM) or the results from both can be used and combined.
   if (oemMode === 'legacy' || oemMode === 'lstm') {
     // Tesseract is used as the "main" data unless existing OCR data (user-uploaded, or PDF text when `opt.usePDFTextSupp` is set) already exists.
     // This is because Tesseract Legacy provides very strong metrics, and Abbyy often does not.
-    await recognizeAllPages(oemMode === 'legacy', oemMode === 'lstm', !(oemMode === 'lstm' && !!existingOCR), langs, vanillaMode);
+    await recognizeAllPages(oemMode === 'legacy', oemMode === 'lstm', !existingOCR, langs, vanillaMode);
     // Metrics from the LSTM model are so inaccurate they are not worth using.
     if (oemMode === 'legacy') {
@@ -544,7 +551,7 @@ export async function recognize(options = {}) {
       await runFontOptimization(ocrAll['Tesseract Legacy']);
     }
   } else if (oemMode === 'combined') {
-    await recognizeAllPages(true, true, true, langs, vanillaMode);
+    await recognizeAllPages(true, true, !existingOCR, langs, vanillaMode);
     if (opt.saveDebugImages) {
       DebugData.debugImg.Combined = new Array(ImageCache.pageCount);
@@ -653,9 +660,16 @@ export async function recognize(options = {}) {
           ignorePunct: opt.ignorePunct,
           confThreshHigh: opt.confThreshHigh,
           confThreshMed: opt.confThreshMed,
+          // If the existing data was invisible OCR text extracted from a PDF, it is assumed to not have accurate bounding boxes.
+          useBboxB: !forceMainData && existingOCR === ocrAll.pdf && inputData.pdfMode && !!inputData.pdfType && ['image', 'ocr'].includes(inputData.pdfType),
         };
-        const res = await compareOCR(existingOCR, ocrAll['Tesseract Combined'], compOptions);
+        let res;
+        if (forceMainData) {
+          res = await compareOCR(ocrAll['Tesseract Combined'], existingOCR, compOptions);
+        } else {
+          res = await compareOCR(existingOCR, ocrAll['Tesseract Combined'], compOptions);
+        }
         if (DebugData.debugImg.Combined) DebugData.debugImg.Combined = res.debug;

package/js/utils/fontUtils.js CHANGED Viewed

@@ -237,8 +237,11 @@ export function calcWordMetrics(word, angle = 0) {
   const wordLastGlyphMetrics = fontOpentype.charToGlyph(charArr2.at(-1)).getMetrics();
   const wordFirstGlyphMetrics = fontOpentype.charToGlyph(charArr2[0]).getMetrics();
-  let wordLeftBearing = wordFirstGlyphMetrics.leftSideBearing || 0;
-  let wordRightBearing = wordLastGlyphMetrics.rightSideBearing || 0;
+  // The `leftSideBearing`/`rightSideBearing` numbers reported by Opentype.js are not accurate for mono-spaced fonts, so `xMin`/`xMax` are used instead.
+  let wordLeftBearing = wordFirstGlyphMetrics.xMin || 0;
+  let lastGlyphMax = wordLastGlyphMetrics.xMax || 0;
+  if (word.smallCaps && charArr2[charArr2.length - 1] !== charArr[charArr2.length - 1]) lastGlyphMax *= fontI.smallCapsMult;
+  let wordRightBearing = advanceArr[advanceArr.length - 1] - lastGlyphMax;
   if (word.smallCaps && charArr2[0] !== charArr[0]) wordLeftBearing *= fontI.smallCapsMult;
   if (word.smallCaps && charArr2[charArr2.length - 1] !== charArr[charArr2.length - 1]) wordRightBearing *= fontI.smallCapsMult;
@@ -290,6 +293,11 @@ export const calcWordFontSize = (word) => {
     if (word.visualCoords) {
       return getFontSize(fontOpentype, word.bbox.bottom - word.bbox.top, word.text);
     }
+    if (word.size) {
+      const mult = FontProps.sizeMult[font.family] || 1;
+      return word.size / mult;
+    }
     return (word.bbox.bottom - word.bbox.top) * (fontOpentype.unitsPerEm / (fontOpentype.ascender - fontOpentype.descender));
   }

package/js/utils/miscUtils.js CHANGED Viewed

@@ -379,7 +379,7 @@ export function replaceObjectProperties(obj, obj2 = {}) {
 // Fonts that should not be added (both Sans and Serif variants):
 // DejaVu
 const serifFonts = ['SerifDefault', 'Baskerville', 'Bookman', 'C059', 'Calibri', 'Cambria', 'Century', 'Courier', 'Garamond', 'Georgia',
-  'LucidaBright', 'Minion', 'Optima', 'P052', 'Palatino', 'Times'];
+  'LucidaBright', 'Minion', 'NimbusMono', 'Optima', 'P052', 'Palatino', 'Times'];
 const sansFonts = ['SansDefault', 'Avenir', 'Arial', 'Calibri', 'Candara', 'Carlito', 'Comic', 'Franklin', 'Futura', 'Gotham',
   'Helvetica', 'Impact', 'Interstate', 'Myriad', 'Tahoma', 'Trebuchet', 'Univers', 'Verdana'];

package/js/worker/compareOCRModule.js CHANGED Viewed

@@ -463,6 +463,7 @@ async function penalizeWord(wordObjs) {
  *    rather than simply setting `compTruth`/`matchTruth`. Enabled when using recognition to update confidence metrics, but not when comparing to ground truth.
  * @param {boolean} [params.options.legacyLSTMComb] - Whether Tesseract Legacy and Tesseract LSTM are being combined, when `mode = 'comb'`.
  *    When `legacyLSTMComb` is enabled, additional heuristics are applied that are based on specific behaviors of the Tesseract Legacy engine.
+ * @param {boolean} [params.options.useBboxB] - Use bounding boxes from `pageB` in combined output.
  * @param {string} [params.options.debugLabel]
  * @param {boolean} [params.options.evalConflicts] - Whether to evaluate word quality on conflicts. If `false` the text from `pageB` is always assumed correct.
  *    This option is useful for combining the style from Tesseract Legacy with the text from Tesseract LSTM.
@@ -494,6 +495,7 @@ export async function compareOCRPageImp({
   const mode = options?.mode === undefined ? 'stats' : options?.mode;
   const editConf = options?.editConf === undefined ? false : options?.editConf;
   const legacyLSTMComb = options?.legacyLSTMComb === undefined ? false : options?.legacyLSTMComb;
+  const useBboxB = options?.useBboxB === undefined ? false : options?.useBboxB;
   const debugLabel = options?.debugLabel === undefined ? '' : options?.debugLabel;
   const evalConflicts = options?.evalConflicts === undefined ? true : options?.evalConflicts;
   const supplementComp = options?.supplementComp === undefined ? false : options?.supplementComp;
@@ -597,8 +599,13 @@ export async function compareOCRPageImp({
           const wordBoxACore = JSON.parse(JSON.stringify(wordBoxA));
-          wordBoxACore.top = wordBoxA.top + Math.round(wordBoxAHeight * 0.1);
-          wordBoxACore.bottom = wordBoxA.bottom - Math.round(wordBoxAHeight * 0.1);
+          if (wordA.visualCoords) {
+            wordBoxACore.top = wordBoxA.top + Math.round(wordBoxAHeight * 0.1);
+            wordBoxACore.bottom = wordBoxA.bottom - Math.round(wordBoxAHeight * 0.1);
+          } else {
+            wordBoxACore.top = wordBoxA.top + Math.round(wordBoxAHeight * 0.25);
+            wordBoxACore.bottom = wordBoxA.bottom - Math.round(wordBoxAHeight * 0.25);
+          }
           for (let l = minWordB; l < lineB.words.length; l++) {
             const wordB = lineB.words[l];
@@ -612,8 +619,13 @@ export async function compareOCRPageImp({
             const wordBoxBCore = JSON.parse(JSON.stringify(wordBoxB));
-            wordBoxBCore.top = wordBoxB.top + Math.round(wordBoxBHeight * 0.1);
-            wordBoxBCore.bottom = wordBoxB.bottom - Math.round(wordBoxBHeight * 0.1);
+            if (wordB.visualCoords) {
+              wordBoxBCore.top = wordBoxB.top + Math.round(wordBoxBHeight * 0.1);
+              wordBoxBCore.bottom = wordBoxB.bottom - Math.round(wordBoxBHeight * 0.1);
+            } else {
+              wordBoxBCore.top = wordBoxB.top + Math.round(wordBoxBHeight * 0.25);
+              wordBoxBCore.bottom = wordBoxB.bottom - Math.round(wordBoxBHeight * 0.25);
+            }
             // If left of word A is past right of word B, move to next word B
             if (wordBoxACore.left > wordBoxBCore.right) {
@@ -660,6 +672,11 @@ export async function compareOCRPageImp({
                 if (mode === 'comb') wordA.conf = 100;
                 hocrACorrect[wordA.id] = 1;
                 hocrBCorrect[wordB.id] = 1;
+                if (mode === 'comb' && useBboxB) {
+                  wordA.bbox = structuredClone(wordB.bbox);
+                  wordA.visualCoords = true;
+                  wordA.chars = structuredClone(wordB.chars);
+                }
               } else if (mode === 'comb') {
                 wordA.conf = 0;
                 wordA.matchTruth = false;

package/js/worker/optimizeFontModule.js CHANGED Viewed

@@ -101,8 +101,11 @@ const calculateKerningPairs = (font, fontMetricsObj, xHeight, style) => {
     const indexFirst = font.charToGlyphIndex(charFirst);
     const indexSecond = font.charToGlyphIndex(charSecond);
-    const metricsFirst = font.glyphs.glyphs[indexFirst].getMetrics();
-    const metricsSecond = font.glyphs.glyphs[indexSecond].getMetrics();
+    const glyphFirst = font.glyphs.glyphs[indexFirst];
+    const glyphSecond = font.glyphs.glyphs[indexSecond];
+    const metricsFirst = glyphFirst.getMetrics();
+    const metricsSecond = glyphSecond.getMetrics();
     const fontKern1 = Math.round(value * xHeight);
     let spaceTarget = fontKern1;
@@ -119,7 +122,7 @@ const calculateKerningPairs = (font, fontMetricsObj, xHeight, style) => {
     }
     // Calculate current space between these 2 glyphs (without kerning adjustments)
-    const spaceCurrent = metricsFirst.rightSideBearing + metricsSecond.leftSideBearing;
+    const spaceCurrent = (glyphFirst.advanceWidth - metricsFirst.xMax) + metricsSecond.xMin;
     // Calculate kerning adjustment needed
     let fontKern = spaceTarget - spaceCurrent;

package/mupdf/libmupdf.wasm CHANGED Viewed

Binary file

package/mupdf/mupdf-async.js CHANGED Viewed

@@ -90,7 +90,7 @@ export async function initMuPDFWorker() {
     return function (...args) {
       return new Promise((resolve, reject) => {
         // Add the PDF as the first argument for most functions
-        if (!['openDocument', 'cleanFile'].includes(func)) {
+        if (!['openDocument', 'cleanFile', 'freeDocument'].includes(func)) {
           // Remove job number (appended by Tesseract scheduler function)
           // args = args.slice(0,-1)

package/mupdf/mupdf-worker.js CHANGED Viewed

@@ -165,6 +165,8 @@ mupdf.pageText = function (doc, {
   const content = Module.UTF8ToString(dataPtr);
+  Module._free(dataPtr);
   return {
     letterCountTotal,
     letterCountVis,
@@ -464,7 +466,7 @@ const handleMessage = (data) => {
   } catch (error) {
     parentPort.postMessage(['ERROR', id, { name: error.name, message: error.message }]);
   }
-}
+};
 if (typeof process === 'undefined') {
   onmessage = (event) => handleMessage(event.data);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "scribe.js-ocr",
-  "version": "0.3.0",
+  "version": "0.4.0",
   "description": "High-quality OCR and text extraction for images and PDFs.",
   "main": "scribe.js",
   "directories": {

package/scribe.js CHANGED Viewed

@@ -94,7 +94,7 @@ const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options
   init({ ocr: true, font: true });
   await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });
   if (!inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode) throw new Error('No relevant files to process.');
-  const skipRecPDF = inputData.pdfMode && (ImageCache.pdfType === 'text' && skipRecPDFTextNative || ImageCache.pdfType === 'ocr' && skipRecPDFTextOCR);
+  const skipRecPDF = inputData.pdfMode && (inputData.pdfType === 'text' && skipRecPDFTextNative || inputData.pdfType === 'ocr' && skipRecPDFTextOCR);
   const skipRecOCR = inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode;
   if (!skipRecPDF && !skipRecOCR) await recognize({ langs });
   return exportData(outputFormat);