npm - scribe.js-ocr - Versions diffs - 0.2.0 → 0.2.2 - Mend

scribe.js-ocr 0.2.0 → 0.2.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/js/containers/imageContainer.js +2 -2
package/js/export/exportDebugCsv.js +4 -4
package/js/generalWorkerMain.js +1 -1
package/js/import/convertPageBlocks.js +1 -1
package/js/import/import.js +3 -1
package/js/import/importOCR.js +9 -15
package/js/recognizeConvert.js +14 -2
package/js/worker/compareOCRModule.js +13 -4
package/js/worker/generalWorker.js +6 -6
package/package.json +2 -2
package/scribe.js +3 -1

package/js/containers/imageContainer.js CHANGED Viewed

@@ -254,7 +254,7 @@ export class ImageCache {
    * @returns
    */
   static #initMuPDFScheduler = async (numWorkers = 3) => {
-    const Tesseract = typeof process === 'undefined' ? (await import('../../tess/tesseract.esm.min.js')).default : await import('tesseract.js');
+    const Tesseract = typeof process === 'undefined' ? (await import('../../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
     const scheduler = await Tesseract.createScheduler();
     const workersPromiseArr = range(1, numWorkers).map(async () => {
       const w = await initMuPDFWorker();
@@ -300,7 +300,7 @@ export class ImageCache {
         page: n + 1, dpi, color, skipText: skipTextMode,
       }).then((res) => new ImageWrapper(n, res, color ? 'color' : 'gray'));
     }
-    throw new Error('No input mode set');
+    throw new Error('Attempted to render image without image input provided.');
   };
   /**

package/js/export/exportDebugCsv.js CHANGED Viewed

@@ -22,10 +22,10 @@ const escapeCSVField = (field) => {
 };
 /**
-     * Converts an array of objects with atomic properties (string, number, boolean) to a CSV string.
-     * @param {Array<Object>} data - The array of data objects.
-     * @returns {string} - The CSV string.
-     */
+ * Converts an array of objects with atomic properties (string, number, boolean) to a CSV string.
+ * @param {Array<Object>} data - The array of data objects.
+ * @returns {string} - The CSV string.
+ */
 export const convertToCSV = (data) => {
   if (data.length === 0) {
     return '';

package/js/generalWorkerMain.js CHANGED Viewed

@@ -198,7 +198,7 @@ export class gs {
       workerN = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 6);
     }
-    const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('tesseract.js');
+    const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
     gs.schedulerInner = await Tesseract.createScheduler();
     gs.schedulerInner.workers = new Array(workerN);

package/js/import/convertPageBlocks.js CHANGED Viewed

@@ -9,7 +9,7 @@ import { getTextScript } from '../utils/miscUtils.js';
 /**
  * @param {Object} params
- * @param {Array<import('tesseract.js').Block>} params.ocrBlocks
+ * @param {Array<import('@scribe.js/tesseract.js').Block>} params.ocrBlocks
  * @param {number} params.n
  * @param {dims} params.pageDims
  * @param {number} params.rotateAngle - The angle that the input image is rotated prior to recognition.

package/js/import/import.js CHANGED Viewed

@@ -465,6 +465,8 @@ export async function importFilesSupp(files, ocrName) {
   const ocrData = await importOCRFiles(ocrFilesAll);
+  const scribeMode = ocrData.scribeMode;
   const pageCountHOCR = ocrData.hocrRaw.length;
   // If both OCR data and image data are present, confirm they have the same number of pages
@@ -478,5 +480,5 @@ export async function importFilesSupp(files, ocrName) {
   if (ocrData.abbyyMode) format = 'abbyy';
   if (ocrData.stextMode) format = 'stext';
-  convertOCRAll(ocrData.hocrRaw, false, format, ocrName);
+  await convertOCRAll(ocrData.hocrRaw, false, format, ocrName, scribeMode);
 }

package/js/import/importOCR.js CHANGED Viewed

@@ -18,13 +18,11 @@ export async function importOCRFiles(ocrFilesAll) {
   // In the case of 1 HOCR file
   const singleHOCRMode = ocrFilesAll.length === 1;
-  let hocrStrStart = '';
-  let hocrStrEnd = '';
+  let hocrStrStart = null;
   let abbyyMode = false;
   let stextMode = false;
   let scribeMode = false;
-  let hocrArrPages;
   let pageCountHOCR;
   let hocrRaw;
   /** @type  {?Object.<string, FontMetricsFamily>} */
@@ -47,20 +45,16 @@ export async function importOCRFiles(ocrFilesAll) {
     stextMode = !!node2 && !!/<document name/.test(node2);
     if (abbyyMode) {
-      hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
+      hocrRaw = hocrStrAll.split(/(?=<page)/).slice(1);
     } else if (stextMode) {
-      hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
+      hocrRaw = hocrStrAll.split(/(?=<page)/).slice(1);
     } else {
-      hocrStrStart = hocrStrAll.match(/[\s\S]*?<body>/)[0];
-      hocrStrEnd = hocrStrAll.match(/<\/body>[\s\S]*$/)[0];
-      hocrArrPages = splitHOCRStr(hocrStrAll);
+      // `hocrStrStart` will be missing for individual HOCR pages created with Tesseract.js or the Tesseract API.
+      hocrStrStart = hocrStrAll.match(/[\s\S]*?<body>/)?.[0];
+      hocrRaw = splitHOCRStr(hocrStrAll);
     }
-    pageCountHOCR = hocrArrPages.length;
-    hocrRaw = Array(pageCountHOCR);
-    for (let i = 0; i < pageCountHOCR; i++) {
-      hocrRaw[i] = hocrStrStart + hocrArrPages[i] + hocrStrEnd;
-    }
+    pageCountHOCR = hocrRaw.length;
   } else {
     pageCountHOCR = ocrFilesAll.length;
     hocrRaw = Array(pageCountHOCR);
@@ -76,11 +70,11 @@ export async function importOCRFiles(ocrFilesAll) {
     }
   }
-  if (!abbyyMode && !stextMode && hocrRaw[0]) {
+  if (!abbyyMode && !stextMode && hocrStrStart) {
     const getMeta = (name) => {
       const regex = new RegExp(`<meta name=["']${name}["'][^<]+`, 'i');
-      const nodeStr = hocrRaw[0].match(regex)?.[0];
+      const nodeStr = hocrStrStart.match(regex)?.[0];
       if (!nodeStr) return null;
       const contentStr = nodeStr.match(/content=["']([\s\S]+?)(?=["']\s{0,5}\/?>)/i)?.[1];
       if (!contentStr) return null;

package/js/recognizeConvert.js CHANGED Viewed

@@ -24,7 +24,19 @@ import { replaceObjectProperties } from './utils/miscUtils.js';
  */
 export const compareOCRPage = async (pageA, pageB, options) => {
   const func = typeof process !== 'undefined' ? (await import('./worker/compareOCRModule.js')).compareOCRPageImp : gs.scheduler.compareOCRPageImp;
-  const binaryImage = await ImageCache.getBinary(pageA.n);
+  // Some combinations of options require the image to be provided, and some do not.
+  // We skip sending the image for those that do not, as in addition to helping performance,
+  // this is also necessary to run basic comparison scripts (e.g. benchmarking accuracy) without providing the image.
+  // TODO: Rework the options so this works better with types.
+  // At present TypeScript has no way of knowing that certain combinations of options go with each other.
+  const mode = options?.mode || 'stats';
+  const evalConflicts = options?.evalConflicts ?? true;
+  const supplementComp = options?.supplementComp ?? false;
+  const skipImage = (mode === 'stats' && !supplementComp) || (mode === 'comb' && !evalConflicts && !supplementComp);
+  const binaryImage = skipImage ? null : await ImageCache.getBinary(pageA.n);
   const pageMetricsObj = pageMetricsArr[pageA.n];
   return func({
     pageA, pageB, binaryImage, pageMetricsObj, options,
@@ -51,7 +63,7 @@ export const evalOCRPage = async (params) => {
  * Compare two sets of OCR data.
  * @param {Array<OcrPage>} ocrA
  * @param {Array<OcrPage>} ocrB
- * @param  {Parameters<import('./worker/compareOCRModule.js').compareOCRPageImp>[0]['options']} options
+ * @param  {Parameters<import('./worker/compareOCRModule.js').compareOCRPageImp>[0]['options']} [options]
  */
 export const compareOCR = async (ocrA, ocrB, options) => {
   /** @type {Parameters<typeof compareOCRPage>[2]} */

package/js/worker/compareOCRModule.js CHANGED Viewed

@@ -486,10 +486,19 @@ async function penalizeWord(wordObjs) {
 export async function compareOCRPageImp({
   pageA, pageB, binaryImage, pageMetricsObj, options = {},
 }) {
-  const binaryImageBit = binaryImage.imageBitmap || await getImageBitmap(binaryImage.src);
-  const imageUpscaled = binaryImage.upscaled;
-  const imageRotated = binaryImage.rotated;
+  // The `binaryImage` argument is not sent for certain operations, which do not require it.
+  // For example, running a basic comparison between a page and the ground truth does not require having the image.
+  // The types do not currently reflect this, so this should be reworked at some point.
+  /** @type {?ImageBitmap} */
+  let binaryImageBit = null;
+  let imageUpscaled = false;
+  let imageRotated = false;
+  if (binaryImage) {
+    binaryImageBit = binaryImage.imageBitmap || await getImageBitmap(binaryImage.src);
+    imageUpscaled = binaryImage.upscaled;
+    imageRotated = binaryImage.rotated;
+  }
   const mode = options?.mode === undefined ? 'stats' : options?.mode;
   const editConf = options?.editConf === undefined ? false : options?.editConf;

package/js/worker/generalWorker.js CHANGED Viewed

@@ -17,7 +17,7 @@ import { optimizeFont } from './optimizeFontModule.js';
 // import Tesseract from "../../tess/tesseract.esm.min.js";
 const browserMode = typeof process === 'undefined';
-const Tesseract = browserMode ? (await import('../../tess/tesseract.esm.min.js')).default : await import('tesseract.js/src/index.js');
+const Tesseract = browserMode ? (await import('../../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
 const defaultConfigs = {
   // TODO: Add back support for multiple PSM modes.
@@ -135,7 +135,7 @@ export const recognizeAndConvert = async ({
   const keepItalic = oemCurrent === 0;
-  const ocrBlocks = /** @type {Array<import('tesseract.js').Block>} */(res1.data.blocks);
+  const ocrBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res1.data.blocks);
   const res2 = await convertPageBlocks({
     ocrBlocks, n, pageDims, rotateAngle: angle, keepItalic,
@@ -184,14 +184,14 @@ export const recognizeAndConvert2 = async ({
   let resLegacy;
   let resLSTM;
   if (options.lstm && options.legacy) {
-    const legacyBlocks = /** @type {Array<import('tesseract.js').Block>} */(res0.data.blocks);
+    const legacyBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res0.data.blocks);
     resLegacy = await convertPageBlocks({
       ocrBlocks: legacyBlocks, n, pageDims, rotateAngle: angle, keepItalic: true, upscale: options.upscale,
     });
     (async () => {
       const res1 = await resArr[1];
-      const lstmBlocks = /** @type {Array<import('tesseract.js').Block>} */(res1.data.blocks);
+      const lstmBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res1.data.blocks);
       resLSTM = await convertPageBlocks({
         ocrBlocks: lstmBlocks, n, pageDims, rotateAngle: angle, keepItalic: false, upscale: options.upscale,
       });
@@ -201,12 +201,12 @@ export const recognizeAndConvert2 = async ({
       postMessage({ data: xB, id: `${id}b` });
     })();
   } else if (!options.lstm && options.legacy) {
-    const legacyBlocks = /** @type {Array<import('tesseract.js').Block>} */(res0.data.blocks);
+    const legacyBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res0.data.blocks);
     resLegacy = await convertPageBlocks({
       ocrBlocks: legacyBlocks, n, pageDims, rotateAngle: angle, keepItalic: true, upscale: options.upscale,
     });
   } else if (options.lstm && !options.legacy) {
-    const lstmBlocks = /** @type {Array<import('tesseract.js').Block>} */(res0.data.blocks);
+    const lstmBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res0.data.blocks);
     resLSTM = await convertPageBlocks({
       ocrBlocks: lstmBlocks, n, pageDims, rotateAngle: angle, keepItalic: false, upscale: options.upscale,
     });

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "scribe.js-ocr",
-  "version": "0.2.0",
+  "version": "0.2.2",
   "description": "High-quality OCR and text extraction for images and PDFs.",
   "main": "scribe.js",
   "directories": {
@@ -52,7 +52,7 @@
     "canvas": "^2.11.2",
     "commander": "^11.1.0",
     "puppeteer": "^22.13.0",
-    "tesseract.js": "scribeocr/tesseract.js#2065fd6",
+    "@scribe.js/tesseract.js": "^5.0.5",
     "web-worker": "~1.2.0"
   }
 }

package/scribe.js CHANGED Viewed

@@ -11,7 +11,7 @@ import { ImageCache } from './js/containers/imageContainer.js';
 import coords from './js/coordinates.js';
 import { drawDebugImages } from './js/debug.js';
 import { download, exportData } from './js/export/export.js';
-import { writeDebugCsv } from './js/export/exportDebugCsv.js';
+import { writeDebugCsv, convertToCSV } from './js/export/exportDebugCsv.js';
 import { extractSingleTableContent } from './js/export/exportWriteTabular.js';
 import { loadBuiltInFontsRaw, enableFontOpt } from './js/fontContainerMain.js';
 import { gs } from './js/generalWorkerMain.js';
@@ -131,6 +131,8 @@ class utils {
   // Misc utils
   static calcBoxOverlap = calcBoxOverlap;
+  static convertToCSV = convertToCSV;
   static replaceSmartQuotes = replaceSmartQuotes;
   static getRandomAlphanum = getRandomAlphanum;