npm - scribe.js-ocr - Versions diffs - 0.4.0 → 0.4.1 - Mend

scribe.js-ocr 0.4.0 → 0.4.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.

Files changed (7) hide show

package/docs/API.md CHANGED Viewed

@@ -41,6 +41,7 @@ Initialize the program and optionally pre-load resources.
 Function for extracting text from image and PDF files with a single function call.
 By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
+To control how text from PDF files is handled, set the options in the `opt.usePDFText` object.
 For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
 ### Parameters
@@ -48,10 +49,7 @@ For more control, use `init`, `importFiles`, `recognize`, and `exportData` separ
 *   `files` &#x20;
 *   `langs` **[Array][23]<[string][24]>**  (optional, default `['eng']`)
 *   `outputFormat`   (optional, default `'txt'`)
-*   `options` **[Object][21]?**  (optional, default `{}`)
-    *   `options.skipRecPDFTextNative` **[boolean][22]** If the input is a text-native PDF, skip recognition and return the existing text. (optional, default `true`)
-    *   `options.skipRecPDFTextOCR` **[boolean][22]** If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text. (optional, default `false`)
+*   `options`   (optional, default `{}`)
 ## writeDebugImages
@@ -115,12 +113,6 @@ Alternatively, for `File` objects (browser) and file paths (Node.js), a single a
 ### Parameters
 *   `files` **([Array][23]\<File> | FileList | [Array][23]<[string][24]> | [SortedInputFiles][13])**&#x20;
-*   `options` **[Object][21]?**  (optional, default `{}`)
-    *   `options.extractPDFTextNative` **[boolean][22]** Extract text from text-native PDF documents. (optional, default `false`)
-    *   `options.extractPDFTextOCR` **[boolean][22]** Extract text from image-native PDF documents with existing OCR text layers. (optional, default `false`)
-    *   `options.extractPDFTextImage` **[boolean][22]** Extract text from image-native PDF documents with no existing OCR layer.
-        This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header). (optional, default `false`)
 ## recognizePage

package/js/containers/app.js CHANGED Viewed

@@ -62,9 +62,23 @@ export class opt {
   static calcSuppFontInfo = false;
-  static usePDFTextSupp = true;
-  static usePDFTextMain = true;
+  /**
+   * How to use PDF text data extracted from input PDFs (if any).
+   * The `native` option controls how native text data is used (i.e. visible text rendered by the PDF viewer),
+   * while the `ocr` option controls how OCR text data is used (i.e. invisible text printed over an image).
+   * If `main` is true, then the data will be used as the primary data source.
+   * If `supp` is true, then the data will be used as a supplemental data source (may be used to correct errors in the primary data source).
+   */
+  static usePDFText = {
+    native: {
+      supp: true,
+      main: true,
+    },
+    ocr: {
+      supp: true,
+      main: false,
+    },
+  };
   /**
    * Number of workers to use. Must be set prior to initialization.

package/js/extractPDFText.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { inputData } from './containers/app.js';
+import { inputData, opt } from './containers/app.js';
 import { ocrAll, ocrAllRaw } from './containers/dataContainer.js';
 import { ImageCache } from './containers/imageContainer.js';
 import { convertOCR } from './recognizeConvert.js';
@@ -68,19 +68,11 @@ const extractInternalPDFTextRaw = async () => {
 /**
  * Extract and parse text from currently loaded PDF.
- * @param {Object} [options]
- * @param {boolean} [options.extractPDFTextNative=true] - Extract text from text-native PDF documents.
- * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
- * @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
- *   This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
- * @param {boolean} [options.setActive=false] - Set the active OCR data to the extracted text.
  */
-export const extractInternalPDFText = async (options = {}) => {
-  const extractPDFTextNative = options?.extractPDFTextNative ?? true;
-  const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
-  const extractPDFTextImage = options?.extractPDFTextImage ?? false;
-  const setActive = options?.setActive ?? false;
+export const extractInternalPDFText = async () => {
+  const extractPDFTextNative = opt.usePDFText.native.main || opt.usePDFText.native.supp;
+  const extractPDFTextOCR = opt.usePDFText.ocr.main || opt.usePDFText.ocr.supp;
+  const extractPDFTextImage = false;
   const res = await extractInternalPDFTextRaw();
@@ -95,7 +87,7 @@ export const extractInternalPDFText = async (options = {}) => {
   ocrAll.pdf = Array(ImageCache.pageCount);
-  if (setActive) {
+  if (inputData.pdfType === 'text' && opt.usePDFText.native.main || inputData.pdfType === 'ocr' && opt.usePDFText.ocr.main) {
     ocrAllRaw.active = ocrAllRaw.pdf;
     ocrAll.active = ocrAll.pdf;
   }

package/js/import/import.js CHANGED Viewed

@@ -192,21 +192,11 @@ export function sortInputFiles(files) {
  * Alternatively, for `File` objects (browser) and file paths (Node.js), a single array can be provided, which is sorted based on extension.
  * @public
  * @param {Array<File>|FileList|Array<string>|SortedInputFiles} files
- * @param {Object} [options]
- * @param {boolean} [options.extractPDFTextNative=false] - Extract text from text-native PDF documents.
- * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
- * @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
- *   This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
- * @returns
  */
-export async function importFiles(files, options = {}) {
+export async function importFiles(files) {
   clearData();
   gs.getGeneralScheduler();
-  const extractPDFTextNative = options?.extractPDFTextNative ?? false;
-  const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
-  const extractPDFTextImage = options?.extractPDFTextImage ?? false;
   /** @type {Array<File|FileNode|ArrayBuffer>} */
   let pdfFiles = [];
   /** @type {Array<File|FileNode|ArrayBuffer>} */
@@ -445,11 +435,9 @@ export async function importFiles(files, options = {}) {
         await runFontOptimization(ocrAll.active);
       }
     });
-  } else if (inputData.pdfMode && (extractPDFTextNative || extractPDFTextOCR)) {
-    await extractInternalPDFText({
-      setActive: opt.usePDFTextMain, extractPDFTextNative, extractPDFTextOCR, extractPDFTextImage,
-    });
-    if (opt.usePDFTextMain) {
+  } else if (inputData.pdfMode && (opt.usePDFText.native.main || opt.usePDFText.native.supp || opt.usePDFText.ocr.main || opt.usePDFText.ocr.supp)) {
+    await extractInternalPDFText();
+    if (inputData.pdfType === 'text' && opt.usePDFText.native.main || inputData.pdfType === 'ocr' && opt.usePDFText.ocr.main) {
       if (inputData.pdfType === 'text') FontCont.enableCleanToNimbusMono = true;
       if (opt.calcSuppFontInfo) await calcSuppFontInfo(ocrAll.pdf);
     }

package/js/recognizeConvert.js CHANGED Viewed

@@ -533,7 +533,7 @@ export async function recognize(options = {}) {
   let existingOCR;
   if (ocrAll['User Upload']) {
     existingOCR = ocrAll['User Upload'];
-  } else if (opt.usePDFTextSupp && ocrAll.pdf) {
+  } else if (ocrAll.pdf && (inputData.pdfType === 'text' && opt.usePDFText.native.supp || inputData.pdfType === 'ocr' && opt.usePDFText.ocr.supp)) {
     existingOCR = ocrAll.pdf;
     // If the PDF text is not the active data, it is assumed to be for supplemental purposes only.
     forceMainData = ocrAll.pdf !== ocrAll.active;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "scribe.js-ocr",
-  "version": "0.4.0",
+  "version": "0.4.1",
   "description": "High-quality OCR and text extraction for images and PDFs.",
   "main": "scribe.js",
   "directories": {

package/scribe.js CHANGED Viewed

@@ -79,20 +79,18 @@ const init = async (params) => {
 /**
  * Function for extracting text from image and PDF files with a single function call.
  * By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
+ * To control how text from PDF files is handled, set the options in the `opt.usePDFText` object.
  * For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
  * @public
  * @param {Parameters<typeof importFiles>[0]} files
  * @param {Array<string>} [langs=['eng']]
  * @param {Parameters<typeof exportData>[0]} [outputFormat='txt']
- * @param {Object} [options]
- * @param {boolean} [options.skipRecPDFTextNative=true] - If the input is a text-native PDF, skip recognition and return the existing text.
- * @param {boolean} [options.skipRecPDFTextOCR=false] - If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text.
  */
 const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options = {}) => {
   const skipRecPDFTextNative = options?.skipRecPDFTextNative ?? true;
   const skipRecPDFTextOCR = options?.skipRecPDFTextOCR ?? false;
   init({ ocr: true, font: true });
-  await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });
+  await importFiles(files);
   if (!inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode) throw new Error('No relevant files to process.');
   const skipRecPDF = inputData.pdfMode && (inputData.pdfType === 'text' && skipRecPDFTextNative || inputData.pdfType === 'ocr' && skipRecPDFTextOCR);
   const skipRecOCR = inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode;