npm - scribe.js-ocr - Versions diffs - 0.1.0 → 0.2.0 - Mend

scribe.js-ocr 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

package/.eslintrc.json +6 -0
package/README.md +47 -1
package/cli/extract.js +1 -1
package/cli/main.js +1 -1
package/docs/API.md +191 -0
package/docs/scribe_vs_tesseract.md +39 -0
package/examples/browser/recognize-basic.html +9 -0
package/examples/browser/recognize-basic.js +10 -0
package/examples/node/recognize-basic.js +11 -0
package/js/containers/app.js +0 -3
package/js/containers/imageContainer.js +11 -67
package/js/export/export.js +5 -2
package/js/extractPDFText.js +110 -0
package/js/fontContainerMain.js +5 -4
package/js/fontEval.js +8 -4
package/js/generalWorkerMain.js +2 -46
package/js/import/convertPageStext.js +1 -1
package/js/import/import.js +25 -17
package/js/import/importOCR.js +6 -6
package/js/import/nodeAdapter.js +20 -5
package/js/objects/layoutObjects.js +8 -0
package/js/objects/ocrObjects.js +22 -2
package/js/recognizeConvert.js +7 -3
package/package.json +4 -2
package/{module.js → scribe.js} +24 -10

package/.eslintrc.json CHANGED Viewed

@@ -67,6 +67,12 @@
         // "one-var": "off",
         // "one-var-declaration-per-line": "off",
+        // If this is enabled eslint breaks our import statements, such that they no longer run natively in the browser.
+        "import/no-relative-packages": "off",
+        // Using blocks for purely organizational purposes (e.g. when in-lining a function) is fine.
+        "no-lone-blocks": "off",
         // This rule was deprecated
         "no-return-await": "off",

package/README.md CHANGED Viewed

@@ -9,4 +9,50 @@ Common use cases:
 3. Write `.pdf` files that include a high-quality invisible text layer.
 	1. scribe.js can insert text into an existing `.pdf` file, making it searchable.
-Scribe.js is a library intended for developers.  End users who want to scan documents should see the officially-supported GUI at [scribeocr.com](https://scribeocr.com/) (repo [here](https://github.com/scribeocr/scribeocr)).
+Scribe.js is a library intended for developers.  End users who want to scan documents should see the officially-supported GUI at [scribeocr.com](https://scribeocr.com/) (repo [here](https://github.com/scribeocr/scribeocr)).
+# Setup
+Install from `npm` by running the following:
+```sh
+npm i scribe.js-ocr
+```
+Scribe.js is written in JavaScript using ESM, so can be imported directly from browser or Node.js JavaScript code.
+```js
+// Import statement in browser:
+import scribe from 'node_modules/scribe.js-ocr/scribe.js';
+// Import statement for Node.js:
+import scribe from 'scribe.js-ocr';
+// Basic usage
+scribe.extractText(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
+	.then((res) => console.log(res))
+```
+When using Scribe.js in the browser, all files must be served from the same origin as the file importing Scribe.js.  This means that importing Scribe.js from a CDN will not work.  There is no UMD version.
+# Scribe.js vs. Tesseract.js
+Considering whether Scribe.js or Tesseract.js is better for your project?  Read [this article](./docs/scribe_vs_tesseract.md).
+# Documentation
+- [Basic Browser Examples](./examples/browser/)
+- [Basic Node.js Examples](./examples/node/)
+- [Scribe.js vs. Tesseract.js Comparison](./docs/scribe_vs_tesseract.md)
+- [API](./docs/API.md)
+# Contributing
+To work on a local copy, simply clone with `--recurse-submodules` and install.  Please run the automated tests before making a PR.
+```sh
+## Clone the repo, including recursively cloning submodules
+git clone --recurse-submodules git@github.com:scribeocr/scribe.js.git
+cd scribe.js
+## Install dependencies
+npm i
+## Make changes
+## [...]
+## Run automated tests before making PR
+npm run test
+```

package/cli/extract.js CHANGED Viewed

@@ -1,6 +1,6 @@
 import fs from 'fs';
 import path from 'path';
-import scribe from '../module.js';
+import scribe from '../scribe.js';
 /**
  *

package/cli/main.js CHANGED Viewed

@@ -5,7 +5,7 @@ import fs from 'fs';
 import path from 'path';
 import { tmpUnique } from '../js/worker/compareOCRModule.js';
-import scribe from '../module.js';
+import scribe from '../scribe.js';
 // When `debugMode` is enabled:
 // (1) Comparison images are saved as .png files.

package/docs/API.md ADDED Viewed

@@ -0,0 +1,191 @@
+<!-- Generated by documentation.js. Update this documentation by updating the source code. -->
+### Table of Contents
+*   [init][1]
+    *   [Parameters][2]
+*   [extractText][3]
+    *   [Parameters][4]
+*   [clear][5]
+*   [terminate][6]
+*   [exportData][7]
+    *   [Parameters][8]
+*   [download][9]
+    *   [Parameters][10]
+*   [SortedInputFiles][11]
+    *   [Properties][12]
+*   [importFiles][13]
+    *   [Parameters][14]
+*   [recognizePage][15]
+    *   [Parameters][16]
+*   [recognize][17]
+    *   [Parameters][18]
+## init
+Initialize the program and optionally pre-load resources.
+### Parameters
+*   `params` **[Object][19]?**&#x20;
+    *   `params.pdf` **[boolean][20]** Load PDF renderer. (optional, default `false`)
+    *   `params.ocr` **[boolean][20]** Load OCR engine. (optional, default `false`)
+    *   `params.font` **[boolean][20]** Load built-in fonts.
+        The PDF renderer and OCR engine are automatically loaded when needed.
+        Therefore, the only reason to set `pdf` or `ocr` to `true` is to pre-load them. (optional, default `false`)
+## extractText
+Function for extracting text from image and PDF files with a single function call.
+By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
+For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
+### Parameters
+*   `files` &#x20;
+*   `langs` **[Array][21]<[string][22]>**  (optional, default `['eng']`)
+*   `outputFormat`   (optional, default `'txt'`)
+*   `options` **[Object][19]?**  (optional, default `{}`)
+    *   `options.skipRecPDFTextNative` **[boolean][20]** If the input is a text-native PDF, skip recognition and return the existing text. (optional, default `true`)
+    *   `options.skipRecPDFTextOCR` **[boolean][20]** If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text. (optional, default `false`)
+## clear
+Clears all document-specific data.
+## terminate
+Terminates the program and releases resources.
+## exportData
+Export active OCR data to specified format.
+### Parameters
+*   `format` **(`"pdf"` | `"hocr"` | `"docx"` | `"xlsx"` | `"txt"` | `"text"`)**  (optional, default `'txt'`)
+*   `minValue` **[number][23]**  (optional, default `0`)
+*   `maxValue` **[number][23]**  (optional, default `-1`)
+Returns **[Promise][24]<([string][22] | [ArrayBuffer][25])>**&#x20;
+## download
+Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
+### Parameters
+*   `format` **(`"pdf"` | `"hocr"` | `"docx"` | `"xlsx"` | `"txt"` | `"text"`)**&#x20;
+*   `fileName` **[string][22]**&#x20;
+*   `minValue` **[number][23]**  (optional, default `0`)
+*   `maxValue` **[number][23]**  (optional, default `-1`)
+## SortedInputFiles
+An object with this shape can be used to provide input to the `importFiles` function,
+without needing that function to figure out the file types.
+This is required when using ArrayBuffer inputs.
+Type: [Object][19]
+### Properties
+*   `pdfFiles` **([Array][21]\<File> | [Array][21]<[string][22]> | [Array][21]<[ArrayBuffer][25]>)?**&#x20;
+*   `imageFiles` **([Array][21]\<File> | [Array][21]<[string][22]> | [Array][21]<[ArrayBuffer][25]>)?**&#x20;
+*   `ocrFiles` **([Array][21]\<File> | [Array][21]<[string][22]> | [Array][21]<[ArrayBuffer][25]>)?**&#x20;
+## importFiles
+Import files for processing.
+An object with `pdfFiles`, `imageFiles`, and `ocrFiles` arrays can be provided to import multiple types of files.
+Alternatively, for `File` objects (browser) and file paths (Node.js), a single array can be provided, which is sorted based on extension.
+### Parameters
+*   `files` **([Array][21]\<File> | FileList | [Array][21]<[string][22]> | [SortedInputFiles][11])**&#x20;
+*   `options` **[Object][19]?**  (optional, default `{}`)
+    *   `options.extractPDFTextNative` **[boolean][20]** Extract text from text-native PDF documents. (optional, default `false`)
+    *   `options.extractPDFTextOCR` **[boolean][20]** Extract text from image-native PDF documents with existing OCR text layers. (optional, default `false`)
+## recognizePage
+Recognize a single page in active document.
+Use `recognize` instead to recognize all pages in a document.
+### Parameters
+*   `n` **[number][23]** Page number to recognize.
+*   `legacy` **[boolean][20]** *
+*   `lstm` **[boolean][20]** *
+*   `areaMode` **[boolean][20]** *
+*   `tessOptions` **[Object][19]<[string][22], [string][22]>** Options to pass to Tesseract.js. (optional, default `{}`)
+*   `debugVis` **[boolean][20]** Generate instructions for debugging visualizations. (optional, default `false`)
+## recognize
+Recognize all pages in active document.
+Files for recognition should already be imported using `importFiles` before calling this function.
+The results of recognition can be exported by calling `exportData` after this function.
+### Parameters
+*   `options` **[Object][19]**  (optional, default `{}`)
+    *   `options.mode` **(`"speed"` | `"quality"`)** Recognition mode. (optional, default `'quality'`)
+    *   `options.langs` **[Array][21]<[string][22]>** Language(s) in document. (optional, default `['eng']`)
+    *   `options.modeAdv` **(`"lstm"` | `"legacy"` | `"combined"`)** Alternative method of setting recognition mode. (optional, default `'combined'`)
+    *   `options.combineMode` **(`"conf"` | `"data"`)** Method of combining OCR results. Used if OCR data already exists. (optional, default `'data'`)
+    *   `options.vanillaMode` **[boolean][20]** Whether to use the vanilla Tesseract.js model. (optional, default `false`)
+[1]: #init
+[2]: #parameters
+[3]: #extracttext
+[4]: #parameters-1
+[5]: #clear
+[6]: #terminate
+[7]: #exportdata
+[8]: #parameters-2
+[9]: #download
+[10]: #parameters-3
+[11]: #sortedinputfiles
+[12]: #properties
+[13]: #importfiles
+[14]: #parameters-4
+[15]: #recognizepage
+[16]: #parameters-5
+[17]: #recognize
+[18]: #parameters-6
+[19]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Object
+[20]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Boolean
+[21]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array
+[22]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String
+[23]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Number
+[24]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Promise
+[25]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/ArrayBuffer

package/docs/scribe_vs_tesseract.md ADDED Viewed

@@ -0,0 +1,39 @@
+# Overview
+Scribe.js and Tesseract.js are both JavaScript packages that allow for running OCR in the browser or Node.js.  As both packages have advantages and disadvantages, this article explains how the packages differ, and should help developers decide which package is right for their project.
+## TL;DR
+Tesseract.js is smaller and faster than Scribe.js.  Projects that only need to extract text from `.png` and `.jpeg` images, and are satisfied with "pretty good" accuracy, should use Tesseract.js.  Scribe.js builds on Tesseract.js by providing more accurate OCR results and more features.  Most notably, Scribe.js provides PDF support, including the ability to extract existing text from PDFs, run OCR on PDFs, and add text layers to PDFs.  Developers unsure of the tradeoffs should try both packages using images their application is likely to encounter.
+# Scope
+The reason why Tesseract.js and Scribe.js exist as separate packages, despite providing similar features and containing shared code, is that the scope of both projects is different.  Tesseract.js has a significantly narrower scope compared to Scribe.js.
+**The goal of Tesseract.js is to bring Tesseract--a popular program we do not maintain--to JavaScript.**  As long as the JavaScript interface is user-friendly and works correctly, and recognition results are similar to Tesseract on desktop, Tesseract.js is working as intended.  All bugs inherited from the main Tesseract codebase are outside of the scope of the Tesseract.js project.  As a result, a large number of Tesseract.js Git Issues, including virtually all accuracy-related issues, are closed as out of scope.
+**The goal of Scribe.js is to provide high-quality text extraction in JavaScript.**  Scribe.js was created to build on Tesseract.js and support many valid bug reports and feature requests that are outside of the scope of Tesseract.js.  For example, two of the most common requests from Tesseract.js users are improved OCR accuracy and PDF support.  Scribe.js (optionally) includes a custom OCR model that differs from, and generally outperforms, Tesseract.  When provided a text-native `.pdf`, Scribe.js can bypass OCR entirely and return the raw text.
+# Differences
+### PDF Support
+Tesseract.js does not support `.pdf` files.  The only way to extract text from `.pdf` files using Tesseract.js is to render the `.pdf` file into a series of `.png` images using a separate library and then recognizing those `.png` images.  In addition to being slow, this process is often unnecessary, as many modern `.pdf` files are already text-native, meaning that no OCR needs to occur.
+Scribe OCR does support `.pdf` files, and can extract text from `.pdf` files in multiple ways.  Scribe OCR can recognize the contents of the `.pdf` file using OCR.  Additionally, for `.pdf` files that are text-native or contain an existing OCR layer, the existing text can be extracted directly.  The latter method is significantly faster compared to rendering the `.pdf` to images and running OCR.
+### OCR Quality
+Scribe.js produces results that are generally more accurate than Tesseract.js.
+1. Particularly for high-quality scans and screenshots, Scribe.js misidentifies fewer words.
+2. Scribe.js often recognizes words that are skipped entirely by Tesseract.
+3. Scribe.js can identify font styles, which Tesseract is incapable of.
+	1. This can be observed by using the GUI at [scribeocr.com](https://scribeocr.com/).
+### GUI
+Scribe OCR contains a GUI web application that end-users can use to scan documents.  Tesseract.js is intended for use by developers within other applications, so it is unsuitable for end users.
+### File Size
+The additional features added by Scribe.js take up more space.  Enabling PDF support requires loading multiple megabytes of dependencies.  Using the Scribe.js default `quality` OCR model loads more language data than Tesseract.js does by default.
+Notably, these resources are only loaded if requested--the PDF resources are only loaded if a PDF file is uploaded or exported, and setting OCR mode to `speed` prevents additional data from being downloaded.  However, if all optional features are disabled, Scribe.js has little to offer over Tesseract.js.
+### Speed
+The Scribe.js default `quality` recognition mode runs additional recognition and checks, which therefore increases runtime.  The amount varies significantly document-to-document, but is often in the range of a 40-90% increase versus the `speed` mode (which provides results similar to Tesseract.js).  For applications where accuracy is not critical, this increase in runtime may not be worth it.
+### License
+Tesseract.js is Apache 2.0 licensed.  This is a permissive license that imposes no meaningful restrictions on use.  Scribe.js is AGPL 3.0 licensed, which is a copy-left license.  As a result, to use Scribe.js in your program--whether on the front-end or server-side--you must either (1) publish your program under AGPL 3.0 or a compatible license or (2) obtain a proprietary license (contact admin@scribeocr.com).

package/examples/browser/recognize-basic.html ADDED Viewed

@@ -0,0 +1,9 @@
+<!DOCTYPE HTML>
+<html>
+  <head>
+    <script src="./recognize-basic.js" type="module"></script>
+  </head>
+  <body>
+    <input type="file" id="uploader" multiple>
+  </body>
+</html>

package/examples/browser/recognize-basic.js ADDED Viewed

@@ -0,0 +1,10 @@
+import scribe from '../../scribe.js';
+// Pre-load OCR and font data to avoid delay when user uploads a file.
+await scribe.init({ ocr: true, font: true });
+const elm = /** @type {HTMLInputElement} */ (document.getElementById('uploader'));
+elm.addEventListener('change', async () => {
+  if (!elm.files) return;
+  const text = await scribe.extractText(elm.files);
+  console.log(text);
+});

package/examples/node/recognize-basic.js ADDED Viewed

@@ -0,0 +1,11 @@
+#!/usr/bin/env node
+// Run `node examples/node/recognize-basic.js path/to/image.jpg` to recognize text in an image.
+import scribe from '../../scribe.js';
+const [,, imagePath] = process.argv;
+(async () => {
+  const res = await scribe.extractText([imagePath]);
+  console.log(res);
+  await scribe.terminate();
+})();

package/js/containers/app.js CHANGED Viewed

@@ -74,9 +74,6 @@ export class inputData {
   /** `true` if user re-uploaded HOCR data created by Scribe OCR */
   static resumeMode = false;
-  /** `true` if stext is extracted from a PDF (rather than text layer uploaded seprately) */
-  static extractTextMode = false;
   /** `true` if ground truth data is uploaded */
   static evalMode = false;

package/js/containers/imageContainer.js CHANGED Viewed

@@ -216,52 +216,19 @@ export class ImageCache {
   static pageCount = 0;
   /**
- * The dimensions that each page would be, if it was rendered at 300 DPI.
- * @type {Array<dims>}
- */
-  static pdfDims300Arr = [];
+   * The dimensions that each page would be, if it was rendered at 300 DPI.
+   * @type {Array<dims>}
+   */
+  static pdfDims300 = [];
   static inputModes = {
     pdf: false,
     image: false,
   };
-  static pdfContentStats = {
-    /** Total number of letters in the source PDF. */
-    letterCountTotal: 0,
-    /** Total number of visible letters in the source PDF. */
-    letterCountVis: 0,
-    /** Total number of pages with 100+ letters in the source PDF. */
-    pageCountTotalText: 0,
-    /** Total number of pages with 100+ visible letters in the source PDF. */
-    pageCountVisText: 0,
-  };
   /** @type {?('text'|'ocr'|'image')} */
   static pdfType = null;
-  static setPdfType = () => {
-    // The PDF is considered text-native if:
-    // (1) The total number of visible letters is at least 100 per page on average.
-    // (2) The total number of visible letters is at least 90% of the total number of letters.
-    // (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
-    if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
-      && ImageCache.pdfContentStats.letterCountVis >= ImageCache.pdfContentStats.letterCountTotal * 0.9
-      && ImageCache.pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
-      ImageCache.pdfType = 'text';
-    // The PDF is considered ocr-native if:
-    // (1) The total number of letters is at least 100 per page on average.
-    // (2) The total number of letters is at least half of the total number of letters.
-    } else if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
-      && ImageCache.pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
-      ImageCache.pdfType = 'ocr';
-    // Otherwise, the PDF is considered image-native.
-    // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
-    } else {
-      ImageCache.pdfType = 'image';
-    }
-  };
   static colorModeDefault = 'gray';
   static cacheRenderPages = 3;
@@ -327,7 +294,7 @@ export class ImageCache {
     } if (ImageCache.inputModes.pdf) {
       const pageMetrics = pageMetricsArr[n];
       const targetWidth = pageMetrics.dims.width;
-      const dpi = 300 * (targetWidth / ImageCache.pdfDims300Arr[n].width);
+      const dpi = 300 * (targetWidth / ImageCache.pdfDims300[n].width);
       const muPDFScheduler = await ImageCache.getMuPDFScheduler();
       return muPDFScheduler.drawPageAsPNG({
         page: n + 1, dpi, color, skipText: skipTextMode,
@@ -566,14 +533,10 @@ export class ImageCache {
     ImageCache.inputModes.image = false;
     ImageCache.inputModes.pdf = false;
     ImageCache.pageCount = 0;
-    ImageCache.pdfDims300Arr.length = 0;
+    ImageCache.pdfDims300.length = 0;
     ImageCache.loadCount = 0;
     ImageCache.nativeProps.length = 0;
     ImageCache.binaryProps.length = 0;
-    ImageCache.pdfContentStats.letterCountTotal = 0;
-    ImageCache.pdfContentStats.letterCountVis = 0;
-    ImageCache.pdfContentStats.pageCountTotalText = 0;
-    ImageCache.pdfContentStats.pageCountVisText = 0;
   };
   static terminate = async () => {
@@ -600,9 +563,8 @@ export class ImageCache {
    *
    * @param {ArrayBuffer} fileData
    * @param {Boolean} [skipText=false] - Whether to skip native text when rendering PDF to image.
-   * @param {Boolean} [extractStext=false]
    */
-  static openMainPDF = async (fileData, skipText = false, extractStext = false) => {
+  static openMainPDF = async (fileData, skipText = false) => {
     const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
     await ImageCache.#loadFileMuPDFScheduler(fileData);
@@ -611,9 +573,9 @@ export class ImageCache {
     const pageDims1 = await muPDFScheduler.workers[0].pageSizes([300]);
-    ImageCache.pdfDims300Arr.length = 0;
+    ImageCache.pdfDims300.length = 0;
     pageDims1.forEach((x) => {
-      ImageCache.pdfDims300Arr.push({ width: x[0], height: x[1] });
+      ImageCache.pdfDims300.push({ width: x[0], height: x[1] });
     });
     ImageCache.inputModes.pdf = true;
@@ -627,10 +589,10 @@ export class ImageCache {
     // For reasons that are unclear, a small number of pages have been rendered into massive files
     // so a hard-cap on resolution must be imposed.
-    const pageDPI = ImageCache.pdfDims300Arr.map((x) => 300 * 2000 / x.width, 2000);
+    const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
     // In addition to capping the resolution, also switch the width/height
-    ImageCache.pdfDims300Arr.forEach((x, i) => {
+    ImageCache.pdfDims300.forEach((x, i) => {
       const pageDims = { width: Math.round(x.width * pageDPI[i] / 300), height: Math.round(x.height * pageDPI[i] / 300) };
       pageMetricsArr[i] = new PageMetrics(pageDims);
     });
@@ -674,23 +636,5 @@ export class ImageCache {
         await setUploadFontsWorker(gs.schedulerInner);
       });
     }
-    if (extractStext) {
-      ocrAllRaw.active = Array(ImageCache.pageCount);
-      const resArr = pageDPI.map(async (x, i) => {
-        // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
-        // The XML format is the only built-in mupdf format that includes character-level granularity.
-        const res = await muPDFScheduler.pageText({
-          page: i + 1, dpi: x, format: 'xml', calcStats: true,
-        });
-        ImageCache.pdfContentStats.letterCountTotal += res.letterCountTotal;
-        ImageCache.pdfContentStats.letterCountVis += res.letterCountVis;
-        if (res.letterCountTotal >= 100) ImageCache.pdfContentStats.pageCountTotalText++;
-        if (res.letterCountVis >= 100) ImageCache.pdfContentStats.pageCountVisText++;
-        ocrAllRaw.active[i] = res.content;
-      });
-      await Promise.all(resArr);
-      ImageCache.setPdfType();
-    }
   };
 }

package/js/export/export.js CHANGED Viewed

@@ -8,12 +8,14 @@ import { renderHOCR } from './exportRenderHOCR.js';
 import { renderText } from './exportRenderText.js';
 /**
- * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} format
+ * Export active OCR data to specified format.
+ * @public
+ * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} [format='txt']
  * @param {number} [minValue=0]
  * @param {number} [maxValue=-1]
  * @returns {Promise<string|ArrayBuffer>}
  */
-export async function exportData(format, minValue = 0, maxValue = -1) {
+export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
   if (format === 'text') format = 'txt';
   if (maxValue === -1) maxValue = inputData.pageCount - 1;
@@ -184,6 +186,7 @@ export async function exportData(format, minValue = 0, maxValue = -1) {
 /**
  * Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
+ * @public
  * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} format
  * @param {string} fileName
  * @param {number} [minValue=0]

package/js/extractPDFText.js ADDED Viewed

@@ -0,0 +1,110 @@
+import { ImageCache } from './containers/imageContainer.js';
+import { convertOCRAll } from './recognizeConvert.js';
+import { ocrAllRaw, ocrAll } from './containers/dataContainer.js';
+/**
+ * Extract raw text content from currently loaded PDF.
+ * Reports whether PDF is text-native, contains invisible OCR text, or is image-only.
+ */
+const extractInternalPDFTextRaw = async () => {
+  const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
+  const pdfContentStats = {
+    /** Total number of letters in the source PDF. */
+    letterCountTotal: 0,
+    /** Total number of visible letters in the source PDF. */
+    letterCountVis: 0,
+    /** Total number of pages with 100+ letters in the source PDF. */
+    pageCountTotalText: 0,
+    /** Total number of pages with 100+ visible letters in the source PDF. */
+    pageCountVisText: 0,
+  };
+  const stextArr = /** @type {Array<string>} */ ([]);
+  const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
+  const resArr = pageDPI.map(async (x, i) => {
+    // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
+    // The XML format is the only built-in mupdf format that includes character-level granularity.
+    const res = await muPDFScheduler.pageText({
+      page: i + 1, dpi: x, format: 'xml', calcStats: true,
+    });
+    pdfContentStats.letterCountTotal += res.letterCountTotal;
+    pdfContentStats.letterCountVis += res.letterCountVis;
+    if (res.letterCountTotal >= 100) pdfContentStats.pageCountTotalText++;
+    if (res.letterCountVis >= 100) pdfContentStats.pageCountVisText++;
+    stextArr[i] = res.content;
+  });
+  await Promise.all(resArr);
+  /** @type {"image" | "text" | "ocr"} */
+  let type = 'image';
+  // Determine whether the PDF is text-native, image-only, or image + OCR.
+  {
+    // The PDF is considered text-native if:
+    // (1) The total number of visible letters is at least 100 per page on average.
+    // (2) The total number of visible letters is at least 90% of the total number of letters.
+    // (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
+    if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
+      && pdfContentStats.letterCountVis >= pdfContentStats.letterCountTotal * 0.9
+      && pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
+      type = 'text';
+      // The PDF is considered ocr-native if:
+      // (1) The total number of letters is at least 100 per page on average.
+      // (2) The total number of visible letters is at least half of the total number of pages.
+    } else if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
+      && pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
+      type = 'ocr';
+      // Otherwise, the PDF is considered image-native.
+      // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
+    } else {
+      type = 'image';
+    }
+  }
+  return { contentRaw: stextArr, content: /** @type {?Array<OcrPage>} */ (null), type };
+};
+/**
+ * Extract and parse text from currently loaded PDF.
+ * @param {Object} [options]
+ * @param {boolean} [options.extractPDFTextNative=true] - Extract text from text-native PDF documents.
+ * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
+ * @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
+ *   This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
+ * @param {boolean} [options.setActive=false] - Set the active OCR data to the extracted text.
+ */
+export const extractInternalPDFText = async (options = {}) => {
+  const extractPDFTextNative = options?.extractPDFTextNative ?? true;
+  const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
+  const extractPDFTextImage = options?.extractPDFTextImage ?? false;
+  const setActive = options?.setActive ?? false;
+  const res = await extractInternalPDFTextRaw();
+  ImageCache.pdfType = res.type;
+  ocrAllRaw.pdf = res.contentRaw;
+  if (!extractPDFTextImage && res.type === 'image') return res;
+  if (!extractPDFTextOCR && res.type === 'ocr') return res;
+  if (!extractPDFTextNative && res.type === 'text') return res;
+  ocrAll.pdf = Array(ImageCache.pageCount);
+  if (setActive) {
+    ocrAllRaw.active = ocrAllRaw.pdf;
+    ocrAll.active = ocrAll.pdf;
+  }
+  const format = 'stext';
+  // Process HOCR using web worker, reading from file first if that has not been done already
+  await convertOCRAll(ocrAllRaw.active, true, format, 'pdf', false);
+  res.content = ocrAll.pdf;
+  return res;
+};

package/js/fontContainerMain.js CHANGED Viewed

@@ -80,7 +80,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
   if (!fontAll.active || (!fontAll.active.NimbusSans.normal.opt && !fontAll.active.NimbusRomNo9L.normal.opt)) fontAll.active = fontAll.raw;
   if (typeof process === 'undefined') {
-    await gs.schedulerReadyLoadFonts;
+    // This assumes that the scheduler `init` method has at least started.
+    if (gs.schedulerReady === null) console.warn('Failed to load fonts to workers as workers have not been initialized yet.');
+    await gs.schedulerReady;
     await setBuiltInFontsWorker(gs.schedulerInner, true);
   }
@@ -114,8 +116,7 @@ export async function loadChiSimFont() {
  * @param {boolean} [forceWorkerUpdate=false] - If true, forces the worker to update the font data even if the font data of this type is already loaded.
  *    This should be used when switching from unvalidated to validated optimized fonts.
  */
-export async function enableDisableFontOpt(enable, useInitial = false, forceWorkerUpdate = false) {
-  const browserMode = typeof process === 'undefined';
+export async function enableFontOpt(enable, useInitial = false, forceWorkerUpdate = false) {
   // Enable/disable optimized font
   if (enable && useInitial && fontAll.optInitial) {
@@ -127,7 +128,7 @@ export async function enableDisableFontOpt(enable, useInitial = false, forceWork
   }
   // Enable/disable optimized font in workers
-  if (browserMode) {
+  if (typeof process === 'undefined') {
     await setBuiltInFontsWorker(gs.schedulerInner, forceWorkerUpdate);
   } else {
     // const { setFontAll } = await import('./worker/compareOCRModule.js');

package/js/fontEval.js CHANGED Viewed

@@ -1,7 +1,9 @@
 import { DebugData, fontMetricsObj, pageMetricsArr } from './containers/dataContainer.js';
 import { fontAll } from './containers/fontContainer.js';
 import { ImageCache } from './containers/imageContainer.js';
-import { enableDisableFontOpt, optimizeFontContainerAll, setDefaultFontAuto } from './fontContainerMain.js';
+import {
+  enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto, loadBuiltInFontsRaw,
+} from './fontContainerMain.js';
 import { gs } from './generalWorkerMain.js';
 /**
@@ -162,6 +164,8 @@ export async function evaluateFonts(pageArr) {
 export async function runFontOptimization(ocrArr) {
   const browserMode = typeof process === 'undefined';
+  await loadBuiltInFontsRaw();
   const fontRaw = fontAll.getContainer('raw');
   const calculateOpt = fontMetricsObj && Object.keys(fontMetricsObj).length > 0;
@@ -188,7 +192,7 @@ export async function runFontOptimization(ocrArr) {
     const pageNum = Math.min(ImageCache.pageCount, 5);
     // Set raw font in workers
-    await enableDisableFontOpt(false);
+    await enableFontOpt(false);
     // This step needs to happen here as all fonts must be registered before initializing the canvas.
     if (!browserMode) {
@@ -202,7 +206,7 @@ export async function runFontOptimization(ocrArr) {
     if (calculateOpt && Object.keys(fontAll.optInitial).length > 0) {
       // Enable optimized fonts
-      await enableDisableFontOpt(true, true, true);
+      await enableFontOpt(true, true, true);
       const evalOpt = await evaluateFonts(ocrArr.slice(0, pageNum));
@@ -248,7 +252,7 @@ export async function runFontOptimization(ocrArr) {
   }
   // Set final fonts in workers
-  await enableDisableFontOpt(true, false, true);
+  await enableFontOpt(true, false, true);
   const enableOpt = enableOptSerif || enableOptSans;

package/js/generalWorkerMain.js CHANGED Viewed

@@ -173,22 +173,6 @@ export class gs {
     });
   };
-  /** @type {?Function} */
-  static resReadyLoadFonts = null;
-  /**
-   * Promise that resolves when the scheduler is ready for font loading.
-   * Only used in browser version, as nothing using fonts is run within workers in Node.js version.
-   * @type {?Promise<void>}
-   */
-  static schedulerReadyLoadFonts = null;
-  static setSchedulerReadyLoadFonts = () => {
-    gs.schedulerReadyLoadFonts = new Promise((resolve, reject) => {
-      gs.resReadyLoadFonts = resolve;
-    });
-  };
   /** @type {?Function} */
   static resReadyTesseract = null;
@@ -201,21 +185,7 @@ export class gs {
     });
   };
-  /** @type {?Function} */
-  static resReadyFontAllRaw = null;
-  /** @type {?Promise<void>} */
-  static fontAllRawReady = null;
-  static setFontAllRawReady = () => {
-    gs.fontAllRawReady = new Promise((resolve, reject) => {
-      gs.resReadyFontAllRaw = resolve;
-    });
-    return /** @type {Function} */ (gs.resReadyFontAllRaw);
-  };
   static init = async () => {
-    gs.setSchedulerReadyLoadFonts();
     gs.setSchedulerReady();
     // Determine number of workers to use in the browser.
@@ -251,16 +221,6 @@ export class gs {
     gs.scheduler = new GeneralScheduler(gs.schedulerInner);
-    // Fonts are only loaded in the browser.
-    // The functions we would use fonts in a worker for also require node-canvas, which does not support workers yet.
-    if (typeof process === 'undefined') {
-    // @ts-ignore
-      gs.resReadyLoadFonts(true);
-      // Send raw fonts to workers after they have loaded in the main thread.
-      await gs.fontAllRawReady;
-    }
     // @ts-ignore
     gs.resReady(true);
   };
@@ -303,12 +263,12 @@ export class gs {
   static getGeneralScheduler = async () => {
     if (gs.schedulerReady) { // A readiness promise already exists: wait on it rather than calling `init` a second time.
       await gs.schedulerReady;
-      return gs.scheduler;
+      return /** @type {GeneralScheduler} */ (gs.scheduler);
     }
     await gs.init(); // No readiness promise exists (never initialized, or reset to null by `terminate`): initialize now.
-    return gs.scheduler;
+    return /** @type {GeneralScheduler} */ (gs.scheduler);
   };
   static terminate = async () => {
@@ -317,11 +277,7 @@ export class gs {
     gs.schedulerInner = null;
     gs.resReady = null;
     gs.schedulerReady = null;
-    gs.resReadyLoadFonts = null;
-    gs.schedulerReadyLoadFonts = null;
     gs.resReadyTesseract = null;
     gs.schedulerReadyTesseract = null;
-    gs.resReadyFontAllRaw = null;
-    gs.fontAllRawReady = null;
   };
 }

package/js/import/convertPageStext.js CHANGED Viewed

@@ -212,7 +212,7 @@ export async function convertPageStext({ ocrStr, n }) {
             // Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
             smallCapsCurrentAlt = smallCapsCurrentAlt ?? smallCapsAltArr[smallCapsAltArr.length - 1];
-            smallCapsCurrent = /(small\W?cap)|sc$/i.test(fontNameStrI);
+            smallCapsCurrent = /(small\W?cap)|(sc$)|(caps$)/i.test(fontNameStrI);
             smallCapsWord = smallCapsCurrent;
             if (/italic/i.test(fontNameStrI) || /-\w*ital/i.test(fontNameStrI)) {

package/js/import/import.js CHANGED Viewed

@@ -11,7 +11,9 @@ import {
 } from '../containers/dataContainer.js';
 import { fontAll } from '../containers/fontContainer.js';
 import { ImageCache, imageUtils, ImageWrapper } from '../containers/imageContainer.js';
-import { enableDisableFontOpt, optimizeFontContainerAll, setDefaultFontAuto } from '../fontContainerMain.js';
+import {
+  enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto, loadBuiltInFontsRaw,
+} from '../fontContainerMain.js';
 import { runFontOptimization } from '../fontEval.js';
 import { calcFontMetricsFromPages } from '../fontStatistics.js';
 import { gs } from '../generalWorkerMain.js';
@@ -20,6 +22,7 @@ import { PageMetrics } from '../objects/pageMetricsObjects.js';
 import { checkCharWarn, convertOCRAll } from '../recognizeConvert.js';
 import { replaceObjectProperties } from '../utils/miscUtils.js';
 import { importOCRFiles } from './importOCR.js';
+import { extractInternalPDFText } from '../extractPDFText.js';
 /**
  * Automatically detects the image type (jpeg or png).
@@ -169,7 +172,10 @@ export function sortInputFiles(files) {
 }
 /**
- *
+ * An object with this shape can be used to provide input to the `importFiles` function,
+ * without needing that function to figure out the file types.
+ * This is required when using ArrayBuffer inputs.
+ * @public
  * @typedef {Object} SortedInputFiles
  * @property {Array<File>|Array<string>|Array<ArrayBuffer>} [pdfFiles]
  * @property {Array<File>|Array<string>|Array<ArrayBuffer>} [imageFiles]
@@ -180,11 +186,19 @@ export function sortInputFiles(files) {
  * Import files for processing.
  * An object with `pdfFiles`, `imageFiles`, and `ocrFiles` arrays can be provided to import multiple types of files.
  * Alternatively, for `File` objects (browser) and file paths (Node.js), a single array can be provided, which is sorted based on extension.
+ * @public
  * @param {Array<File>|FileList|Array<string>|SortedInputFiles} files
+ * @param {Object} [options]
+ * @param {boolean} [options.extractPDFTextNative=false] - Extract text from text-native PDF documents.
+ * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
  * @returns
  */
-export async function importFiles(files) {
+export async function importFiles(files, options = {}) {
   clearData();
+  gs.getGeneralScheduler();
+  const extractPDFTextNative = options?.extractPDFTextNative ?? false;
+  const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
   /** @type {Array<File|FileNode|ArrayBuffer>} */
   let pdfFiles = [];
@@ -261,10 +275,6 @@ export async function importFiles(files) {
   const xmlModeImport = ocrFiles.length > 0;
-  // Extract text from PDF document
-  // Only enabled if (1) user selects this option, (2) user uploads a PDF, and (3) user does not upload XML data.
-  inputData.extractTextMode = opt.extractText && inputData.pdfMode && !xmlModeImport;
   let pageCount;
   let pageCountImage;
   let abbyyMode = false;
@@ -279,7 +289,7 @@ export async function importFiles(files) {
     const pdfFileData = pdfFile instanceof ArrayBuffer ? pdfFile : await pdfFile.arrayBuffer();
     // If no XML data is provided, page sizes are calculated using muPDF alone
-    await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText, inputData.extractTextMode);
+    await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText);
     pageCountImage = ImageCache.pageCount;
     ImageCache.loadCount = ImageCache.pageCount;
@@ -310,6 +320,8 @@ export async function importFiles(files) {
     // Restore font metrics and optimize font from previous session (if applicable)
     if (ocrData.fontMetricsObj && Object.keys(ocrData.fontMetricsObj).length > 0) {
+      const fontPromise = loadBuiltInFontsRaw();
       existingOpt = true;
       replaceObjectProperties(fontMetricsObj, ocrData.fontMetricsObj);
@@ -323,11 +335,12 @@ export async function importFiles(files) {
       if (ocrData.enableOpt === 'false') {
         opt.enableOpt = false;
       } else {
+        await fontPromise;
         const fontRaw = fontAll.getContainer('raw');
         if (!fontRaw) throw new Error('Raw font data not found.');
         fontAll.opt = await optimizeFontContainerAll(fontRaw, fontMetricsObj);
         opt.enableOpt = true;
-        await enableDisableFontOpt(true);
+        await enableFontOpt(true);
       }
     }
@@ -360,11 +373,6 @@ export async function importFiles(files) {
     scribeMode = ocrData.scribeMode;
     stextMode = ocrData.stextMode;
-  } else if (inputData.extractTextMode) {
-    // Initialize a new array on `ocrAll` if one does not already exist
-    if (!ocrAll[oemName]) ocrAll[oemName] = Array(inputData.pageCount);
-    ocrAll.active = ocrAll[oemName];
-    stextMode = true;
   }
   const pageCountHOCR = ocrAllRaw.active?.length;
@@ -416,7 +424,7 @@ export async function importFiles(files) {
     }
   }
-  if (xmlModeImport || inputData.extractTextMode) {
+  if (xmlModeImport) {
     /** @type {("hocr" | "abbyy" | "stext")} */
     let format = 'hocr';
     if (abbyyMode) format = 'abbyy';
@@ -431,11 +439,11 @@ export async function importFiles(files) {
         opt.enableOpt = await runFontOptimization(ocrAll.active);
       }
     });
+  } else if (extractPDFTextNative || extractPDFTextOCR) {
+    await extractInternalPDFText({ setActive: true, extractPDFTextNative, extractPDFTextOCR });
   }
 }
-// Import supplemental OCR files (from "Evaluate Accuracy" UI tab)
 /**
  * Import supplemental OCR files, such as an alternate OCR version or ground truth data.
  * This function should not be used to import the main OCR files.

package/js/import/importOCR.js CHANGED Viewed

@@ -28,7 +28,7 @@ export async function importOCRFiles(ocrFilesAll) {
   let pageCountHOCR;
   let hocrRaw;
   /** @type  {?Object.<string, FontMetricsFamily>} */
-  let fontMetricsObj;
+  let fontMetricsObj = null;
   /** @type{?Array<import('../objects/layoutObjects.js').LayoutPage>} */
   let layoutObj = null;
   /** @type{?Array<import('../objects/layoutObjects.js').LayoutDataTablePage>} */
@@ -42,9 +42,9 @@ export async function importOCRFiles(ocrFilesAll) {
     const hocrStrAll = await readOcrFile(ocrFilesAll[0]);
     // Check whether input is Abbyy XML
-    const node2 = hocrStrAll.match(/>([^>]+)/)[1];
-    abbyyMode = !!/abbyy/i.test(node2);
-    stextMode = !!/<document name/.test(node2);
+    const node2 = hocrStrAll.match(/>([^>]+)/)?.[1];
+    abbyyMode = !!node2 && !!/abbyy/i.test(node2);
+    stextMode = !!node2 && !!/<document name/.test(node2);
     if (abbyyMode) {
       hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
@@ -67,8 +67,8 @@ export async function importOCRFiles(ocrFilesAll) {
     // Check whether input is Abbyy XML using the first file
     const hocrStrFirst = await readOcrFile(ocrFilesAll[0]);
-    const node2 = hocrStrFirst.match(/>([^>]+)/)[1];
-    abbyyMode = !!/abbyy/i.test(node2);
+    const node2 = hocrStrFirst.match(/>([^>]+)/)?.[1];
+    abbyyMode = !!node2 && !!/abbyy/i.test(node2);
     for (let i = 0; i < pageCountHOCR; i++) {
       const hocrFile = ocrFilesAll[i];

package/js/import/nodeAdapter.js CHANGED Viewed

@@ -11,11 +11,13 @@ export class FileNode {
   /**
      * Creates a `FileNode` from a name and data that the caller has already obtained.
      * The constructor only stores its arguments; it does not read from disk itself.
      * @param {string} filePath - The path to the file.
+     * @param {string} name - The name of the file.
+     * @param {Buffer} fileData - The file's data.
      */
-  constructor(filePath) {
+  constructor(filePath, name, fileData) {
     this.filePath = filePath;
-    this.name = path.basename(filePath);
-    this.fileData = fs.readFileSync(filePath);
+    this.name = name;
+    this.fileData = fileData;
   }
   /**
@@ -29,7 +31,20 @@ export class FileNode {
 /**
  *
- * @param {Array<string>} files
+ * @param {Array<string>} filePaths
  * @returns
  */
-export const wrapFilesNode = (files) => files.map((file) => (new FileNode(file)));
+export const wrapFilesNode = (filePaths) => {
+  const filePromises = filePaths.map(async (filePath) => {
+    const isUrl = filePath.startsWith('http://') || filePath.startsWith('https://') || filePath.startsWith('moz-extension://')
+    || filePath.startsWith('chrome-extension://') || filePath.startsWith('file://');
+    const fileData = isUrl ? Buffer.from(await fetch(filePath).then((res) => res.arrayBuffer())) : fs.readFileSync(filePath);
+    const fileName = isUrl ? filePath.split('/').pop() : path.basename(filePath);
+    return new FileNode(filePath, fileName, fileData);
+  });
+  return Promise.all(filePromises);
+};

package/js/objects/layoutObjects.js CHANGED Viewed

@@ -84,3 +84,11 @@ export function LayoutDataTablePage() {
   /** @type {Array<LayoutDataTable>} */
   this.tables = [];
 }
+const layout = { // Groups layout definitions from this file so they can be reached through a single default export.
+  LayoutDataColumn,
+  LayoutDataTable,
+  LayoutRegion,
+};
+export default layout;

package/js/objects/ocrObjects.js CHANGED Viewed

@@ -403,8 +403,28 @@ function calcWordAngleAdj(word) {
  * @param {string} text
  */
 function replaceLigatures(text) {
-  return text.replace(/ﬂ/g, 'fl').replace(/ﬁ/g, 'fi').replace(/ﬀ/g, 'ff').replace(/ﬃ/g, 'ffi')
-    .replace(/ﬄ/g, 'ffl');
+  return text.replace(/Ĳ/g, 'IJ')
+    .replace(/ĳ/g, 'ij')
+    .replace(/ŉ/g, 'ʼn')
+    .replace(/Ǳ/g, 'DZ')
+    .replace(/ǲ/g, 'Dz')
+    .replace(/ǳ/g, 'dz')
+    .replace(/Ǆ/g, 'DŽ')
+    .replace(/ǅ/g, 'Dž')
+    .replace(/ǆ/g, 'dž')
+    .replace(/Ǉ/g, 'LJ')
+    .replace(/ǈ/g, 'Lj')
+    .replace(/ǉ/g, 'lj')
+    .replace(/Ǌ/g, 'NJ')
+    .replace(/ǋ/g, 'Nj')
+    .replace(/ǌ/g, 'nj')
+    .replace(/ﬀ/g, 'ff')
+    .replace(/ﬁ/g, 'fi')
+    .replace(/ﬂ/g, 'fl')
+    .replace(/ﬃ/g, 'ffi')
+    .replace(/ﬄ/g, 'ffl')
+    .replace(/ﬅ/g, 'ſt')
+    .replace(/ﬆ/g, 'st');
 }
 /**

package/js/recognizeConvert.js CHANGED Viewed

@@ -149,8 +149,10 @@ export const calcRecognizeRotateArgs = async (n, areaMode) => {
 };
 /**
- * Run recognition on a page and save the results, including OCR data and (possibly) auto-rotated images, to the appropriate global array.
+ * Recognize a single page in active document.
+ * Use `recognize` instead to recognize all pages in a document.
  *
+ * @public
  * @param {number} n - Page number to recognize.
  * @param {boolean} legacy -
  * @param {boolean} lstm -
@@ -493,14 +495,16 @@ export async function recognizeAllPages(legacy = true, lstm = true, mainData = f
 }
 /**
- *
+ * Recognize all pages in active document.
+ * Files for recognition should already be imported using `importFiles` before calling this function.
+ * The results of recognition can be exported by calling `exportFiles` after this function.
+ * @public
  * @param {Object} options
  * @param {'speed'|'quality'} [options.mode='quality'] - Recognition mode.
  * @param {Array<string>} [options.langs=['eng']] - Language(s) in document.
  * @param {'lstm'|'legacy'|'combined'} [options.modeAdv='combined'] - Alternative method of setting recognition mode.
  * @param {'conf'|'data'} [options.combineMode='data'] - Method of combining OCR results. Used if OCR data already exists.
  * @param {boolean} [options.vanillaMode=false] - Whether to use the vanilla Tesseract.js model.
- *
  */
 export async function recognize(options = {}) {
   await gs.getGeneralScheduler();

package/package.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
   "name": "scribe.js-ocr",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "description": "High-quality OCR and text extraction for images and PDFs.",
-  "main": "module.js",
+  "main": "scribe.js",
   "directories": {
     "lib": "lib"
   },
@@ -13,6 +13,7 @@
     "@types/node": "^18.0.6",
     "@types/opentype.js": "^1.3.8",
     "chai": "^5.1.1",
+    "documentation": "^14.0.3",
     "eslint": "^8.56.0",
     "eslint-config-airbnb-base": "^15.0.0",
     "eslint-plugin-import": "^2.29.1",
@@ -27,6 +28,7 @@
     "wait-on": "^7.2.0"
   },
   "scripts": {
+    "docs": "documentation build scribe.js -f md --access public > docs/API.md",
     "start": "node tests/scripts/server.js",
     "stop": "curl http://localhost:3031/shutdown",
     "test": "npm-run-all test:cli test:module",

package/{module.js → scribe.js} RENAMED Viewed

@@ -13,11 +13,11 @@ import { drawDebugImages } from './js/debug.js';
 import { download, exportData } from './js/export/export.js';
 import { writeDebugCsv } from './js/export/exportDebugCsv.js';
 import { extractSingleTableContent } from './js/export/exportWriteTabular.js';
-import { loadBuiltInFontsRaw } from './js/fontContainerMain.js';
+import { loadBuiltInFontsRaw, enableFontOpt } from './js/fontContainerMain.js';
 import { gs } from './js/generalWorkerMain.js';
 import { importFiles, importFilesSupp } from './js/import/import.js';
 import { calcBoxOverlap, combineOCRPage } from './js/modifyOCR.js';
-import { calcTableBbox } from './js/objects/layoutObjects.js';
+import layout, { calcTableBbox } from './js/objects/layoutObjects.js';
 import ocr from './js/objects/ocrObjects.js';
 import {
   calcEvalStatsDoc,
@@ -30,9 +30,11 @@ import { imageStrToBlob } from './js/utils/imageUtils.js';
 import { countSubstringOccurrences, getRandomAlphanum, replaceSmartQuotes } from './js/utils/miscUtils.js';
 import { calcConf, mergeOcrWords, splitOcrWord } from './js/utils/ocrUtils.js';
 import { assignParagraphs } from './js/utils/reflowPars.js';
+import { extractInternalPDFText } from './js/extractPDFText.js';
 /**
  * Initialize the program and optionally pre-load resources.
+ * @public
  * @param {Object} [params]
  * @param {boolean} [params.pdf=false] - Load PDF renderer.
  * @param {boolean} [params.ocr=false] - Load OCR engine.
@@ -58,23 +60,30 @@ const init = async (params) => {
   }
   if (initFont) {
-    const resReadyFontAllRaw = gs.setFontAllRawReady();
-    promiseArr.push(loadBuiltInFontsRaw().then(() => resReadyFontAllRaw()));
+    promiseArr.push(loadBuiltInFontsRaw());
   }
   await Promise.all(promiseArr);
 };
 /**
- *
+ * Function for extracting text from image and PDF files with a single function call.
+ * By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
+ * For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
+ * @public
  * @param {Parameters<typeof importFiles>[0]} files
  * @param {Array<string>} [langs=['eng']]
  * @param {Parameters<typeof exportData>[0]} [outputFormat='txt']
- * @returns
+ * @param {Object} [options]
+ * @param {boolean} [options.skipRecPDFTextNative=true] - If the input is a text-native PDF, skip recognition and return the existing text.
+ * @param {boolean} [options.skipRecPDFTextOCR=false] - If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text.
  */
-const recognizeFiles = async (files, langs = ['eng'], outputFormat = 'txt') => {
-  await importFiles(files);
-  await recognize({ langs });
+const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options = {}) => {
+  const skipRecPDFTextNative = options?.skipRecPDFTextNative ?? true;
+  const skipRecPDFTextOCR = options?.skipRecPDFTextOCR ?? false;
+  init({ ocr: true, font: true });
+  await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });
+  if (!(ImageCache.pdfType === 'text' && skipRecPDFTextNative || ImageCache.pdfType === 'ocr' && skipRecPDFTextOCR)) await recognize({ langs });
   return exportData(outputFormat);
 };
@@ -139,6 +148,7 @@ class utils {
 /**
  * Clears all document-specific data.
+ * @public
  */
 const clear = async () => {
   clearData();
@@ -146,6 +156,7 @@ const clear = async () => {
 /**
  * Terminates the program and releases resources.
+ * @public
  */
 const terminate = async () => {
   clearData();
@@ -157,6 +168,7 @@ export default {
   combineOCRPage,
   compareOCR,
   data,
+  enableFontOpt,
   evalOCRPage,
   exportData,
   download,
@@ -164,10 +176,12 @@ export default {
   importFilesSupp,
   inputData,
   init,
+  layout,
   opt,
   recognize,
   recognizePage,
-  recognizeFiles,
+  extractText,
+  extractInternalPDFText,
   terminate,
   utils,
 };