npm - scribe.js-ocr - Versions diffs - 0.1.1 → 0.2.1 - Mend

scribe.js-ocr 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/.eslintrc.json +6 -0
package/README.md +1 -1
package/docs/API.md +13 -4
package/examples/browser/recognize-basic.js +1 -1
package/examples/node/recognize-basic.js +1 -1
package/js/containers/app.js +0 -3
package/js/containers/imageContainer.js +13 -69
package/js/export/exportDebugCsv.js +4 -4
package/js/extractPDFText.js +110 -0
package/js/generalWorkerMain.js +3 -3
package/js/import/convertPageBlocks.js +1 -1
package/js/import/import.js +19 -15
package/js/import/importOCR.js +6 -6
package/js/recognizeConvert.js +14 -2
package/js/worker/compareOCRModule.js +13 -4
package/js/worker/generalWorker.js +6 -6
package/package.json +2 -2
package/scribe.js +16 -7

package/.eslintrc.json CHANGED Viewed

@@ -67,6 +67,12 @@
         // "one-var": "off",
         // "one-var-declaration-per-line": "off",
+        // If this is enabled eslint breaks our import statements, such that they no longer run natively in the browser.
+        "import/no-relative-packages": "off",
+        // Using blocks for purely organizational purposes (e.g. when in-lining a function) is fine.
+        "no-lone-blocks": "off",
         // This rule was deprecated
         "no-return-await": "off",

package/README.md CHANGED Viewed

@@ -25,7 +25,7 @@ import scribe from 'node_modules/scribe.js-ocr/scribe.js';
 import scribe from 'scribe.js-ocr';
 // Basic usage
-scribe.recognizeFiles(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
+scribe.extractText(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
 	.then((res) => console.log(res))
 ```

package/docs/API.md CHANGED Viewed

@@ -4,7 +4,7 @@
 *   [init][1]
     *   [Parameters][2]
-*   [recognizeFiles][3]
+*   [extractText][3]
     *   [Parameters][4]
 *   [clear][5]
 *   [terminate][6]
@@ -35,9 +35,10 @@ Initialize the program and optionally pre-load resources.
         The PDF renderer and OCR engine are automatically loaded when needed.
         Therefore, the only reason to set `pdf` or `ocr` to `true` is to pre-load them. (optional, default `false`)
-## recognizeFiles
+## extractText
-Helper function for recognizing files with a single function call.
+Function for extracting text from image and PDF files with a single function call.
+By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
 For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
 ### Parameters
@@ -45,6 +46,10 @@ For more control, use `init`, `importFiles`, `recognize`, and `exportData` separ
 *   `files` &#x20;
 *   `langs` **[Array][21]<[string][22]>**  (optional, default `['eng']`)
 *   `outputFormat`   (optional, default `'txt'`)
+*   `options` **[Object][19]?**  (optional, default `{}`)
+    *   `options.skipRecPDFTextNative` **[boolean][20]** If the input is a text-native PDF, skip recognition and return the existing text. (optional, default `true`)
+    *   `options.skipRecPDFTextOCR` **[boolean][20]** If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text. (optional, default `false`)
 ## clear
@@ -100,6 +105,10 @@ Alternatively, for `File` objects (browser) and file paths (Node.js), a single a
 ### Parameters
 *   `files` **([Array][21]\<File> | FileList | [Array][21]<[string][22]> | [SortedInputFiles][11])**&#x20;
+*   `options` **[Object][19]?**  (optional, default `{}`)
+    *   `options.extractPDFTextNative` **[boolean][20]** Extract text from text-native PDF documents. (optional, default `false`)
+    *   `options.extractPDFTextOCR` **[boolean][20]** Extract text from image-native PDF documents with existing OCR text layers. (optional, default `false`)
 ## recognizePage
@@ -135,7 +144,7 @@ The results of recognition can be exported by calling `exportFiles` after this f
 [2]: #parameters
-[3]: #recognizefiles
+[3]: #extracttext
 [4]: #parameters-1

package/examples/browser/recognize-basic.js CHANGED Viewed

@@ -5,6 +5,6 @@ await scribe.init({ ocr: true, font: true });
 const elm = /** @type {HTMLInputElement} */ (document.getElementById('uploader'));
 elm.addEventListener('change', async () => {
   if (!elm.files) return;
-  const text = await scribe.recognizeFiles(elm.files);
+  const text = await scribe.extractText(elm.files);
   console.log(text);
 });

package/examples/node/recognize-basic.js CHANGED Viewed

@@ -5,7 +5,7 @@ import scribe from '../../scribe.js';
 const [,, imagePath] = process.argv;
 (async () => {
-  const res = await scribe.recognizeFiles([imagePath]);
+  const res = await scribe.extractText([imagePath]);
   console.log(res);
   await scribe.terminate();
 })();

package/js/containers/app.js CHANGED Viewed

@@ -74,9 +74,6 @@ export class inputData {
   /** `true` if user re-uploaded HOCR data created by Scribe OCR */
   static resumeMode = false;
-  /** `true` if stext is extracted from a PDF (rather than text layer uploaded seprately) */
-  static extractTextMode = false;
   /** `true` if ground truth data is uploaded */
   static evalMode = false;

package/js/containers/imageContainer.js CHANGED Viewed

@@ -216,52 +216,19 @@ export class ImageCache {
   static pageCount = 0;
   /**
- * The dimensions that each page would be, if it was rendered at 300 DPI.
- * @type {Array<dims>}
- */
-  static pdfDims300Arr = [];
+   * The dimensions that each page would be, if it was rendered at 300 DPI.
+   * @type {Array<dims>}
+   */
+  static pdfDims300 = [];
   static inputModes = {
     pdf: false,
     image: false,
   };
-  static pdfContentStats = {
-    /** Total number of letters in the source PDF. */
-    letterCountTotal: 0,
-    /** Total number of visible letters in the source PDF. */
-    letterCountVis: 0,
-    /** Total number of pages with 100+ letters in the source PDF. */
-    pageCountTotalText: 0,
-    /** Total number of pages with 100+ visible letters in the source PDF. */
-    pageCountVisText: 0,
-  };
   /** @type {?('text'|'ocr'|'image')} */
   static pdfType = null;
-  static setPdfType = () => {
-    // The PDF is considered text-native if:
-    // (1) The total number of visible letters is at least 100 per page on average.
-    // (2) The total number of visible letters is at least 90% of the total number of letters.
-    // (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
-    if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
-      && ImageCache.pdfContentStats.letterCountVis >= ImageCache.pdfContentStats.letterCountTotal * 0.9
-      && ImageCache.pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
-      ImageCache.pdfType = 'text';
-    // The PDF is considered ocr-native if:
-    // (1) The total number of letters is at least 100 per page on average.
-    // (2) The total number of letters is at least half of the total number of letters.
-    } else if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
-      && ImageCache.pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
-      ImageCache.pdfType = 'ocr';
-    // Otherwise, the PDF is considered image-native.
-    // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
-    } else {
-      ImageCache.pdfType = 'image';
-    }
-  };
   static colorModeDefault = 'gray';
   static cacheRenderPages = 3;
@@ -287,7 +254,7 @@ export class ImageCache {
    * @returns
    */
   static #initMuPDFScheduler = async (numWorkers = 3) => {
-    const Tesseract = typeof process === 'undefined' ? (await import('../../tess/tesseract.esm.min.js')).default : await import('tesseract.js');
+    const Tesseract = typeof process === 'undefined' ? (await import('../../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
     const scheduler = await Tesseract.createScheduler();
     const workersPromiseArr = range(1, numWorkers).map(async () => {
       const w = await initMuPDFWorker();
@@ -327,13 +294,13 @@ export class ImageCache {
     } if (ImageCache.inputModes.pdf) {
       const pageMetrics = pageMetricsArr[n];
       const targetWidth = pageMetrics.dims.width;
-      const dpi = 300 * (targetWidth / ImageCache.pdfDims300Arr[n].width);
+      const dpi = 300 * (targetWidth / ImageCache.pdfDims300[n].width);
       const muPDFScheduler = await ImageCache.getMuPDFScheduler();
       return muPDFScheduler.drawPageAsPNG({
         page: n + 1, dpi, color, skipText: skipTextMode,
       }).then((res) => new ImageWrapper(n, res, color ? 'color' : 'gray'));
     }
-    throw new Error('No input mode set');
+    throw new Error('Attempted to render image without image input provided.');
   };
   /**
@@ -566,14 +533,10 @@ export class ImageCache {
     ImageCache.inputModes.image = false;
     ImageCache.inputModes.pdf = false;
     ImageCache.pageCount = 0;
-    ImageCache.pdfDims300Arr.length = 0;
+    ImageCache.pdfDims300.length = 0;
     ImageCache.loadCount = 0;
     ImageCache.nativeProps.length = 0;
     ImageCache.binaryProps.length = 0;
-    ImageCache.pdfContentStats.letterCountTotal = 0;
-    ImageCache.pdfContentStats.letterCountVis = 0;
-    ImageCache.pdfContentStats.pageCountTotalText = 0;
-    ImageCache.pdfContentStats.pageCountVisText = 0;
   };
   static terminate = async () => {
@@ -600,9 +563,8 @@ export class ImageCache {
    *
    * @param {ArrayBuffer} fileData
    * @param {Boolean} [skipText=false] - Whether to skip native text when rendering PDF to image.
-   * @param {Boolean} [extractStext=false]
    */
-  static openMainPDF = async (fileData, skipText = false, extractStext = false) => {
+  static openMainPDF = async (fileData, skipText = false) => {
     const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
     await ImageCache.#loadFileMuPDFScheduler(fileData);
@@ -611,9 +573,9 @@ export class ImageCache {
     const pageDims1 = await muPDFScheduler.workers[0].pageSizes([300]);
-    ImageCache.pdfDims300Arr.length = 0;
+    ImageCache.pdfDims300.length = 0;
     pageDims1.forEach((x) => {
-      ImageCache.pdfDims300Arr.push({ width: x[0], height: x[1] });
+      ImageCache.pdfDims300.push({ width: x[0], height: x[1] });
     });
     ImageCache.inputModes.pdf = true;
@@ -627,10 +589,10 @@ export class ImageCache {
     // For reasons that are unclear, a small number of pages have been rendered into massive files
     // so a hard-cap on resolution must be imposed.
-    const pageDPI = ImageCache.pdfDims300Arr.map((x) => 300 * 2000 / x.width, 2000);
+    const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
     // In addition to capping the resolution, also switch the width/height
-    ImageCache.pdfDims300Arr.forEach((x, i) => {
+    ImageCache.pdfDims300.forEach((x, i) => {
       const pageDims = { width: Math.round(x.width * pageDPI[i] / 300), height: Math.round(x.height * pageDPI[i] / 300) };
       pageMetricsArr[i] = new PageMetrics(pageDims);
     });
@@ -674,23 +636,5 @@ export class ImageCache {
         await setUploadFontsWorker(gs.schedulerInner);
       });
     }
-    if (extractStext) {
-      ocrAllRaw.active = Array(ImageCache.pageCount);
-      const resArr = pageDPI.map(async (x, i) => {
-        // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
-        // The XML format is the only built-in mupdf format that includes character-level granularity.
-        const res = await muPDFScheduler.pageText({
-          page: i + 1, dpi: x, format: 'xml', calcStats: true,
-        });
-        ImageCache.pdfContentStats.letterCountTotal += res.letterCountTotal;
-        ImageCache.pdfContentStats.letterCountVis += res.letterCountVis;
-        if (res.letterCountTotal >= 100) ImageCache.pdfContentStats.pageCountTotalText++;
-        if (res.letterCountVis >= 100) ImageCache.pdfContentStats.pageCountVisText++;
-        ocrAllRaw.active[i] = res.content;
-      });
-      await Promise.all(resArr);
-      ImageCache.setPdfType();
-    }
   };
 }

package/js/export/exportDebugCsv.js CHANGED Viewed

@@ -22,10 +22,10 @@ const escapeCSVField = (field) => {
 };
 /**
-     * Converts an array of objects with atomic properties (string, number, boolean) to a CSV string.
-     * @param {Array<Object>} data - The array of data objects.
-     * @returns {string} - The CSV string.
-     */
+ * Converts an array of objects with atomic properties (string, number, boolean) to a CSV string.
+ * @param {Array<Object>} data - The array of data objects.
+ * @returns {string} - The CSV string.
+ */
 export const convertToCSV = (data) => {
   if (data.length === 0) {
     return '';

package/js/extractPDFText.js ADDED Viewed

@@ -0,0 +1,110 @@
+import { ImageCache } from './containers/imageContainer.js';
+import { convertOCRAll } from './recognizeConvert.js';
+import { ocrAllRaw, ocrAll } from './containers/dataContainer.js';
+/**
+ * Extract raw text content from currently loaded PDF.
+ * Reports whether PDF is text-native, contains invisible OCR text, or is image-only.
+ */
+const extractInternalPDFTextRaw = async () => {
+  const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
+  const pdfContentStats = {
+    /** Total number of letters in the source PDF. */
+    letterCountTotal: 0,
+    /** Total number of visible letters in the source PDF. */
+    letterCountVis: 0,
+    /** Total number of pages with 100+ letters in the source PDF. */
+    pageCountTotalText: 0,
+    /** Total number of pages with 100+ visible letters in the source PDF. */
+    pageCountVisText: 0,
+  };
+  const stextArr = /** @type {Array<string>} */ ([]);
+  const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
+  const resArr = pageDPI.map(async (x, i) => {
+    // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
+    // The XML format is the only built-in mupdf format that includes character-level granularity.
+    const res = await muPDFScheduler.pageText({
+      page: i + 1, dpi: x, format: 'xml', calcStats: true,
+    });
+    pdfContentStats.letterCountTotal += res.letterCountTotal;
+    pdfContentStats.letterCountVis += res.letterCountVis;
+    if (res.letterCountTotal >= 100) pdfContentStats.pageCountTotalText++;
+    if (res.letterCountVis >= 100) pdfContentStats.pageCountVisText++;
+    stextArr[i] = res.content;
+  });
+  await Promise.all(resArr);
+  /** @type {"image" | "text" | "ocr"} */
+  let type = 'image';
+  // Determine whether the PDF is text-native, image-only, or image + OCR.
+  {
+    // The PDF is considered text-native if:
+    // (1) The total number of visible letters is at least 100 per page on average.
+    // (2) The total number of visible letters is at least 90% of the total number of letters.
+    // (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
+    if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
+      && pdfContentStats.letterCountVis >= pdfContentStats.letterCountTotal * 0.9
+      && pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
+      type = 'text';
+      // The PDF is considered ocr-native if:
+      // (1) The total number of letters is at least 100 per page on average.
+      // (2) The total number of letters is at least half of the total number of letters.
+    } else if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
+      && pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
+      type = 'ocr';
+      // Otherwise, the PDF is considered image-native.
+      // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
+    } else {
+      type = 'image';
+    }
+  }
+  return { contentRaw: stextArr, content: /** @type {?Array<OcrPage>} */ (null), type };
+};
+/**
+ * Extract and parse text from currently loaded PDF.
+ * @param {Object} [options]
+ * @param {boolean} [options.extractPDFTextNative=true] - Extract text from text-native PDF documents.
+ * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
+ * @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
+ *   This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
+ * @param {boolean} [options.setActive=false] - Set the active OCR data to the extracted text.
+ */
+export const extractInternalPDFText = async (options = {}) => {
+  const extractPDFTextNative = options?.extractPDFTextNative ?? true;
+  const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
+  const extractPDFTextImage = options?.extractPDFTextImage ?? false;
+  const setActive = options?.setActive ?? false;
+  const res = await extractInternalPDFTextRaw();
+  ImageCache.pdfType = res.type;
+  ocrAllRaw.pdf = res.contentRaw;
+  if (!extractPDFTextImage && res.type === 'image') return res;
+  if (!extractPDFTextOCR && res.type === 'ocr') return res;
+  if (!extractPDFTextNative && res.type === 'text') return res;
+  ocrAll.pdf = Array(ImageCache.pageCount);
+  if (setActive) {
+    ocrAllRaw.active = ocrAllRaw.pdf;
+    ocrAll.active = ocrAll.pdf;
+  }
+  const format = 'stext';
+  // Process HOCR using web worker, reading from file first if that has not been done already
+  await convertOCRAll(ocrAllRaw.active, true, format, 'pdf', false);
+  res.content = ocrAll.pdf;
+  return res;
+};

package/js/generalWorkerMain.js CHANGED Viewed

@@ -198,7 +198,7 @@ export class gs {
       workerN = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 6);
     }
-    const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('tesseract.js');
+    const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
     gs.schedulerInner = await Tesseract.createScheduler();
     gs.schedulerInner.workers = new Array(workerN);
@@ -263,12 +263,12 @@ export class gs {
   static getGeneralScheduler = async () => {
     if (gs.schedulerReady) {
       await gs.schedulerReady;
-      return gs.scheduler;
+      return /** @type {GeneralScheduler} */ (gs.scheduler);
     }
     await gs.init();
-    return gs.scheduler;
+    return /** @type {GeneralScheduler} */ (gs.scheduler);
   };
   static terminate = async () => {

package/js/import/convertPageBlocks.js CHANGED Viewed

@@ -9,7 +9,7 @@ import { getTextScript } from '../utils/miscUtils.js';
 /**
  * @param {Object} params
- * @param {Array<import('tesseract.js').Block>} params.ocrBlocks
+ * @param {Array<import('@scribe.js/tesseract.js').Block>} params.ocrBlocks
  * @param {number} params.n
  * @param {dims} params.pageDims
  * @param {number} params.rotateAngle - The angle that the input image is rotated prior to recognition.

package/js/import/import.js CHANGED Viewed

@@ -11,7 +11,9 @@ import {
 } from '../containers/dataContainer.js';
 import { fontAll } from '../containers/fontContainer.js';
 import { ImageCache, imageUtils, ImageWrapper } from '../containers/imageContainer.js';
-import { enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto, loadBuiltInFontsRaw } from '../fontContainerMain.js';
+import {
+  enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto, loadBuiltInFontsRaw,
+} from '../fontContainerMain.js';
 import { runFontOptimization } from '../fontEval.js';
 import { calcFontMetricsFromPages } from '../fontStatistics.js';
 import { gs } from '../generalWorkerMain.js';
@@ -20,6 +22,7 @@ import { PageMetrics } from '../objects/pageMetricsObjects.js';
 import { checkCharWarn, convertOCRAll } from '../recognizeConvert.js';
 import { replaceObjectProperties } from '../utils/miscUtils.js';
 import { importOCRFiles } from './importOCR.js';
+import { extractInternalPDFText } from '../extractPDFText.js';
 /**
  * Automatically detects the image type (jpeg or png).
@@ -185,12 +188,18 @@ export function sortInputFiles(files) {
  * Alternatively, for `File` objects (browser) and file paths (Node.js), a single array can be provided, which is sorted based on extension.
  * @public
  * @param {Array<File>|FileList|Array<string>|SortedInputFiles} files
+ * @param {Object} [options]
+ * @param {boolean} [options.extractPDFTextNative=false] - Extract text from text-native PDF documents.
+ * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
  * @returns
  */
-export async function importFiles(files) {
+export async function importFiles(files, options = {}) {
   clearData();
   gs.getGeneralScheduler();
+  const extractPDFTextNative = options?.extractPDFTextNative ?? false;
+  const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
   /** @type {Array<File|FileNode|ArrayBuffer>} */
   let pdfFiles = [];
   /** @type {Array<File|FileNode|ArrayBuffer>} */
@@ -266,10 +275,6 @@ export async function importFiles(files) {
   const xmlModeImport = ocrFiles.length > 0;
-  // Extract text from PDF document
-  // Only enabled if (1) user selects this option, (2) user uploads a PDF, and (3) user does not upload XML data.
-  inputData.extractTextMode = opt.extractText && inputData.pdfMode && !xmlModeImport;
   let pageCount;
   let pageCountImage;
   let abbyyMode = false;
@@ -284,7 +289,7 @@ export async function importFiles(files) {
     const pdfFileData = pdfFile instanceof ArrayBuffer ? pdfFile : await pdfFile.arrayBuffer();
     // If no XML data is provided, page sizes are calculated using muPDF alone
-    await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText, inputData.extractTextMode);
+    await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText);
     pageCountImage = ImageCache.pageCount;
     ImageCache.loadCount = ImageCache.pageCount;
@@ -315,7 +320,7 @@ export async function importFiles(files) {
     // Restore font metrics and optimize font from previous session (if applicable)
     if (ocrData.fontMetricsObj && Object.keys(ocrData.fontMetricsObj).length > 0) {
-      const fontPromise = loadBuiltInFontsRaw()
+      const fontPromise = loadBuiltInFontsRaw();
       existingOpt = true;
@@ -368,11 +373,6 @@ export async function importFiles(files) {
     scribeMode = ocrData.scribeMode;
     stextMode = ocrData.stextMode;
-  } else if (inputData.extractTextMode) {
-    // Initialize a new array on `ocrAll` if one does not already exist
-    if (!ocrAll[oemName]) ocrAll[oemName] = Array(inputData.pageCount);
-    ocrAll.active = ocrAll[oemName];
-    stextMode = true;
   }
   const pageCountHOCR = ocrAllRaw.active?.length;
@@ -424,7 +424,7 @@ export async function importFiles(files) {
     }
   }
-  if (xmlModeImport || inputData.extractTextMode) {
+  if (xmlModeImport) {
     /** @type {("hocr" | "abbyy" | "stext")} */
     let format = 'hocr';
     if (abbyyMode) format = 'abbyy';
@@ -439,6 +439,8 @@ export async function importFiles(files) {
         opt.enableOpt = await runFontOptimization(ocrAll.active);
       }
     });
+  } else if (extractPDFTextNative || extractPDFTextOCR) {
+    await extractInternalPDFText({ setActive: true, extractPDFTextNative, extractPDFTextOCR });
   }
 }
@@ -463,6 +465,8 @@ export async function importFilesSupp(files, ocrName) {
   const ocrData = await importOCRFiles(ocrFilesAll);
+  const scribeMode = ocrData.scribeMode;
   const pageCountHOCR = ocrData.hocrRaw.length;
   // If both OCR data and image data are present, confirm they have the same number of pages
@@ -476,5 +480,5 @@ export async function importFilesSupp(files, ocrName) {
   if (ocrData.abbyyMode) format = 'abbyy';
   if (ocrData.stextMode) format = 'stext';
-  convertOCRAll(ocrData.hocrRaw, false, format, ocrName);
+  await convertOCRAll(ocrData.hocrRaw, false, format, ocrName, scribeMode);
 }

package/js/import/importOCR.js CHANGED Viewed

@@ -28,7 +28,7 @@ export async function importOCRFiles(ocrFilesAll) {
   let pageCountHOCR;
   let hocrRaw;
   /** @type  {?Object.<string, FontMetricsFamily>} */
-  let fontMetricsObj;
+  let fontMetricsObj = null;
   /** @type{?Array<import('../objects/layoutObjects.js').LayoutPage>} */
   let layoutObj = null;
   /** @type{?Array<import('../objects/layoutObjects.js').LayoutDataTablePage>} */
@@ -42,9 +42,9 @@ export async function importOCRFiles(ocrFilesAll) {
     const hocrStrAll = await readOcrFile(ocrFilesAll[0]);
     // Check whether input is Abbyy XML
-    const node2 = hocrStrAll.match(/>([^>]+)/)[1];
-    abbyyMode = !!/abbyy/i.test(node2);
-    stextMode = !!/<document name/.test(node2);
+    const node2 = hocrStrAll.match(/>([^>]+)/)?.[1];
+    abbyyMode = !!node2 && !!/abbyy/i.test(node2);
+    stextMode = !!node2 && !!/<document name/.test(node2);
     if (abbyyMode) {
       hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
@@ -67,8 +67,8 @@ export async function importOCRFiles(ocrFilesAll) {
     // Check whether input is Abbyy XML using the first file
     const hocrStrFirst = await readOcrFile(ocrFilesAll[0]);
-    const node2 = hocrStrFirst.match(/>([^>]+)/)[1];
-    abbyyMode = !!/abbyy/i.test(node2);
+    const node2 = hocrStrFirst.match(/>([^>]+)/)?.[1];
+    abbyyMode = !!node2 && !!/abbyy/i.test(node2);
     for (let i = 0; i < pageCountHOCR; i++) {
       const hocrFile = ocrFilesAll[i];

package/js/recognizeConvert.js CHANGED Viewed

@@ -24,7 +24,19 @@ import { replaceObjectProperties } from './utils/miscUtils.js';
  */
 export const compareOCRPage = async (pageA, pageB, options) => {
   const func = typeof process !== 'undefined' ? (await import('./worker/compareOCRModule.js')).compareOCRPageImp : gs.scheduler.compareOCRPageImp;
-  const binaryImage = await ImageCache.getBinary(pageA.n);
+  // Some combinations of options require the image to be provided, and some do not.
+  // We skip sending the image for those that do not, as in addition to helping performance,
+  // this is also necessary to run basic comparison scripts (e.g. benchmarking accuracy) without providing the image.
+  // TODO: Rework the options so this works better with types.
+  // At present TypeScript has no way of knowing that certain combinations of options go with each other.
+  const mode = options?.mode || 'stats';
+  const evalConflicts = options?.evalConflicts ?? true;
+  const supplementComp = options?.supplementComp ?? false;
+  const skipImage = (mode === 'stats' && !supplementComp) || (mode === 'comb' && !evalConflicts && !supplementComp);
+  const binaryImage = skipImage ? null : await ImageCache.getBinary(pageA.n);
   const pageMetricsObj = pageMetricsArr[pageA.n];
   return func({
     pageA, pageB, binaryImage, pageMetricsObj, options,
@@ -51,7 +63,7 @@ export const evalOCRPage = async (params) => {
  * Compare two sets of OCR data.
  * @param {Array<OcrPage>} ocrA
  * @param {Array<OcrPage>} ocrB
- * @param  {Parameters<import('./worker/compareOCRModule.js').compareOCRPageImp>[0]['options']} options
+ * @param  {Parameters<import('./worker/compareOCRModule.js').compareOCRPageImp>[0]['options']} [options]
  */
 export const compareOCR = async (ocrA, ocrB, options) => {
   /** @type {Parameters<typeof compareOCRPage>[2]} */

package/js/worker/compareOCRModule.js CHANGED Viewed

@@ -486,10 +486,19 @@ async function penalizeWord(wordObjs) {
 export async function compareOCRPageImp({
   pageA, pageB, binaryImage, pageMetricsObj, options = {},
 }) {
-  const binaryImageBit = binaryImage.imageBitmap || await getImageBitmap(binaryImage.src);
-  const imageUpscaled = binaryImage.upscaled;
-  const imageRotated = binaryImage.rotated;
+  // The `binaryImage` argument is not sent for certain operations, which do not require it.
+  // For example, running a basic comparison between a page and the ground truth does not require having the image.
+  // The types do not currently reflect this, so this should be reworked at some point.
+  /** @type {?ImageBitmap} */
+  let binaryImageBit = null;
+  let imageUpscaled = false;
+  let imageRotated = false;
+  if (binaryImage) {
+    binaryImageBit = binaryImage.imageBitmap || await getImageBitmap(binaryImage.src);
+    imageUpscaled = binaryImage.upscaled;
+    imageRotated = binaryImage.rotated;
+  }
   const mode = options?.mode === undefined ? 'stats' : options?.mode;
   const editConf = options?.editConf === undefined ? false : options?.editConf;

package/js/worker/generalWorker.js CHANGED Viewed

@@ -17,7 +17,7 @@ import { optimizeFont } from './optimizeFontModule.js';
 // import Tesseract from "../../tess/tesseract.esm.min.js";
 const browserMode = typeof process === 'undefined';
-const Tesseract = browserMode ? (await import('../../tess/tesseract.esm.min.js')).default : await import('tesseract.js/src/index.js');
+const Tesseract = browserMode ? (await import('../../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
 const defaultConfigs = {
   // TODO: Add back support for multiple PSM modes.
@@ -135,7 +135,7 @@ export const recognizeAndConvert = async ({
   const keepItalic = oemCurrent === 0;
-  const ocrBlocks = /** @type {Array<import('tesseract.js').Block>} */(res1.data.blocks);
+  const ocrBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res1.data.blocks);
   const res2 = await convertPageBlocks({
     ocrBlocks, n, pageDims, rotateAngle: angle, keepItalic,
@@ -184,14 +184,14 @@ export const recognizeAndConvert2 = async ({
   let resLegacy;
   let resLSTM;
   if (options.lstm && options.legacy) {
-    const legacyBlocks = /** @type {Array<import('tesseract.js').Block>} */(res0.data.blocks);
+    const legacyBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res0.data.blocks);
     resLegacy = await convertPageBlocks({
       ocrBlocks: legacyBlocks, n, pageDims, rotateAngle: angle, keepItalic: true, upscale: options.upscale,
     });
     (async () => {
       const res1 = await resArr[1];
-      const lstmBlocks = /** @type {Array<import('tesseract.js').Block>} */(res1.data.blocks);
+      const lstmBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res1.data.blocks);
       resLSTM = await convertPageBlocks({
         ocrBlocks: lstmBlocks, n, pageDims, rotateAngle: angle, keepItalic: false, upscale: options.upscale,
       });
@@ -201,12 +201,12 @@ export const recognizeAndConvert2 = async ({
       postMessage({ data: xB, id: `${id}b` });
     })();
   } else if (!options.lstm && options.legacy) {
-    const legacyBlocks = /** @type {Array<import('tesseract.js').Block>} */(res0.data.blocks);
+    const legacyBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res0.data.blocks);
     resLegacy = await convertPageBlocks({
       ocrBlocks: legacyBlocks, n, pageDims, rotateAngle: angle, keepItalic: true, upscale: options.upscale,
     });
   } else if (options.lstm && !options.legacy) {
-    const lstmBlocks = /** @type {Array<import('tesseract.js').Block>} */(res0.data.blocks);
+    const lstmBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res0.data.blocks);
     resLSTM = await convertPageBlocks({
       ocrBlocks: lstmBlocks, n, pageDims, rotateAngle: angle, keepItalic: false, upscale: options.upscale,
     });

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "scribe.js-ocr",
-  "version": "0.1.1",
+  "version": "0.2.1",
   "description": "High-quality OCR and text extraction for images and PDFs.",
   "main": "scribe.js",
   "directories": {
@@ -52,7 +52,7 @@
     "canvas": "^2.11.2",
     "commander": "^11.1.0",
     "puppeteer": "^22.13.0",
-    "tesseract.js": "scribeocr/tesseract.js#2065fd6",
+    "@scribe.js/tesseract.js": "^5.0.5",
     "web-worker": "~1.2.0"
   }
 }

package/scribe.js CHANGED Viewed

@@ -11,7 +11,7 @@ import { ImageCache } from './js/containers/imageContainer.js';
 import coords from './js/coordinates.js';
 import { drawDebugImages } from './js/debug.js';
 import { download, exportData } from './js/export/export.js';
-import { writeDebugCsv } from './js/export/exportDebugCsv.js';
+import { writeDebugCsv, convertToCSV } from './js/export/exportDebugCsv.js';
 import { extractSingleTableContent } from './js/export/exportWriteTabular.js';
 import { loadBuiltInFontsRaw, enableFontOpt } from './js/fontContainerMain.js';
 import { gs } from './js/generalWorkerMain.js';
@@ -30,6 +30,7 @@ import { imageStrToBlob } from './js/utils/imageUtils.js';
 import { countSubstringOccurrences, getRandomAlphanum, replaceSmartQuotes } from './js/utils/miscUtils.js';
 import { calcConf, mergeOcrWords, splitOcrWord } from './js/utils/ocrUtils.js';
 import { assignParagraphs } from './js/utils/reflowPars.js';
+import { extractInternalPDFText } from './js/extractPDFText.js';
 /**
  * Initialize the program and optionally pre-load resources.
@@ -66,18 +67,23 @@ const init = async (params) => {
 };
 /**
- * Helper function for recognizing files with a single function call.
+ * Function for extracting text from image and PDF files with a single function call.
+ * By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
  * For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
  * @public
  * @param {Parameters<typeof importFiles>[0]} files
  * @param {Array<string>} [langs=['eng']]
  * @param {Parameters<typeof exportData>[0]} [outputFormat='txt']
- * @returns
+ * @param {Object} [options]
+ * @param {boolean} [options.skipRecPDFTextNative=true] - If the input is a text-native PDF, skip recognition and return the existing text.
+ * @param {boolean} [options.skipRecPDFTextOCR=false] - If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text.
  */
-const recognizeFiles = async (files, langs = ['eng'], outputFormat = 'txt') => {
+const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options = {}) => {
+  const skipRecPDFTextNative = options?.skipRecPDFTextNative ?? true;
+  const skipRecPDFTextOCR = options?.skipRecPDFTextOCR ?? false;
   init({ ocr: true, font: true });
-  await importFiles(files);
-  await recognize({ langs });
+  await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });
+  if (!(ImageCache.pdfType === 'text' && skipRecPDFTextNative || ImageCache.pdfType === 'ocr' && skipRecPDFTextOCR)) await recognize({ langs });
   return exportData(outputFormat);
 };
@@ -125,6 +131,8 @@ class utils {
   // Misc utils
   static calcBoxOverlap = calcBoxOverlap;
+  static convertToCSV = convertToCSV;
   static replaceSmartQuotes = replaceSmartQuotes;
   static getRandomAlphanum = getRandomAlphanum;
@@ -174,7 +182,8 @@ export default {
   opt,
   recognize,
   recognizePage,
-  recognizeFiles,
+  extractText,
+  extractInternalPDFText,
   terminate,
   utils,
 };