npm - scribe.js-ocr - Versions diffs - 0.9.1 → 0.9.2 - Mend

scribe.js-ocr 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/.gitmodules +1 -1
package/js/containers/imageContainer.js +19 -7
package/js/export/export.js +0 -1
package/js/export/writeDocx.js +116 -4
package/js/export/writeText.js +6 -60
package/js/extractPDFText.js +1 -1
package/js/generalWorkerMain.js +1 -1
package/js/import/import.js +5 -0
package/js/import/importOCR.js +2 -2
package/js/utils/miscUtils.js +2 -1
package/package.json +1 -1

package/.gitmodules CHANGED Viewed

@@ -3,4 +3,4 @@
 	url = https://github.com/scribeocr/scrollview-web.git
 [submodule "cloud-adapters"]
 	path = cloud-adapters
-	url = git@github.com:scribeocr/cloud-adapters.git
+	url = https://github.com/scribeocr/cloud-adapters.git

package/js/containers/imageContainer.js CHANGED Viewed

@@ -142,10 +142,23 @@ export class ImageCache {
    * Initializes the MuPDF scheduler.
    * This is separate from the function that loads the file (`#loadFileMuPDFScheduler`),
    * as the scheduler starts loading ahead of the file being available for performance reasons.
-   * @param {number} numWorkers
-   * @returns
+   * @param {number} [numWorkers]
    */
-  static #initMuPDFScheduler = async (numWorkers = 3) => {
+  static #initMuPDFScheduler = async (numWorkers) => {
+    // If `numWorkers` is not specified, use up to 3 workers based on hardware concurrency
+    // and the global `opt.workerN` setting.
+    if (!numWorkers) {
+      if (typeof process === 'undefined') {
+        numWorkers = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 3);
+      } else {
+        const cpuN = Math.floor((await import('node:os')).cpus().length / 2);
+        numWorkers = Math.max(Math.min(cpuN - 1, 3), 1);
+      }
+      if (opt.workerN && opt.workerN < numWorkers) {
+        numWorkers = opt.workerN;
+      }
+    }
     const Tesseract = typeof process === 'undefined' ? (await import('../../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
     const scheduler = await Tesseract.createScheduler();
     const workersPromiseArr = range(1, numWorkers).map(async () => {
@@ -357,10 +370,9 @@ export class ImageCache {
   /**
    * Gets the MuPDF scheduler if it exists, otherwise creates a new one.
-   * @param {number} [numWorkers=3] - Number of workers to create.
-   * @returns
+   * @param {number} [numWorkers] - Number of workers to create.
    */
-  static getMuPDFScheduler = async (numWorkers = 3) => {
+  static getMuPDFScheduler = async (numWorkers) => {
     if (ImageCache.muPDFScheduler) return ImageCache.muPDFScheduler;
     ImageCache.muPDFScheduler = ImageCache.#initMuPDFScheduler(numWorkers);
     return ImageCache.muPDFScheduler;
@@ -372,7 +384,7 @@ export class ImageCache {
    * @param {Boolean} [skipText=false] - Whether to skip native text when rendering PDF to image.
    */
   static openMainPDF = async (fileData, skipText = false) => {
-    const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
+    const muPDFScheduler = await ImageCache.getMuPDFScheduler();
     await ImageCache.#loadFileMuPDFScheduler(fileData);

package/js/export/export.js CHANGED Viewed

@@ -254,7 +254,6 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
       minpage: minPage,
       maxpage: maxPage,
       reflowText: opt.reflow,
-      docxMode: false,
     });
   // Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools.
   // @ts-ignore

package/js/export/writeDocx.js CHANGED Viewed

@@ -1,9 +1,122 @@
 import { documentEnd, documentStart, docxStrings } from './resources/docxFiles.js';
-import { writeText } from './writeText.js';
 import { opt } from '../containers/app.js';
+import { assignParagraphs } from '../utils/reflowPars.js';
+import { pageMetricsAll } from '../containers/dataContainer.js';
+import ocr from '../objects/ocrObjects.js';
+/**
+ * Convert an array of ocrPage objects to XML for a Word document.
+ *
+ * @param {Object} params
+ * @param {Array<OcrPage>} params.ocrCurrent -
+ * @param {number} [params.minpage=0] - The first page to include in the document.
+ * @param {number} [params.maxpage=-1] - The last page to include in the document.
+ * @param {boolean} [params.reflowText=false] - Remove line breaks within what appears to be the same paragraph.
+ * @param {?Array<string>} [params.wordIds=null] - An array of word IDs to include in the document.
+ *    If omitted, all words are included.
+ */
+export function writeDocxContent({
+  ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, wordIds = null,
+}) {
+  let textStr = '';
+  if (maxpage === -1) maxpage = ocrCurrent.length - 1;
+  let newLine = false;
+  for (let g = minpage; g <= maxpage; g++) {
+    if (!ocrCurrent[g] || ocrCurrent[g].lines.length === 0) continue;
+    const pageObj = ocrCurrent[g];
+    // Do not overwrite paragraphs from Abbyy or Textract.
+    if (reflowText && (!pageObj.textSource || !['textract', 'abbyy'].includes(pageObj.textSource))) {
+      const angle = pageMetricsAll[g].angle || 0;
+      assignParagraphs(pageObj, angle);
+    }
+    let parCurrent = pageObj.lines[0].par;
+    let fontStylePrev = '';
+    let supPrev = false;
+    for (let h = 0; h < pageObj.lines.length; h++) {
+      const lineObj = pageObj.lines[h];
+      if (reflowText) {
+        if (g > 0 && h === 0 || lineObj.par !== parCurrent) newLine = true;
+        parCurrent = lineObj.par;
+      } else {
+        newLine = true;
+      }
+      for (let i = 0; i < lineObj.words.length; i++) {
+        const wordObj = lineObj.words[i];
+        if (!wordObj) continue;
+        if (wordIds && !wordIds.includes(wordObj.id)) continue;
+        let fontStyle = '';
+        if (wordObj.style.italic) {
+          fontStyle += '<w:i/>';
+        } else if (wordObj.style.bold) {
+          fontStyle += '<w:b/>';
+        }
+        if (wordObj.style.smallCaps) {
+          fontStyle += '<w:smallCaps/>';
+        }
+        if (wordObj.style.underline) {
+          fontStyle += '<w:u w:val="single"/>';
+        }
+        if (wordObj.style.sup) {
+          fontStyle += '<w:vertAlign w:val="superscript"/>';
+        }
+        if (newLine || fontStyle !== fontStylePrev || (h === 0 && g === 0 && i === 0)) {
+          const styleStr = fontStyle === '' ? '' : `<w:rPr>${fontStyle}</w:rPr>`;
+          if (h === 0 && g === 0 && i === 0) {
+            textStr = `${textStr}<w:p><w:r>${styleStr}<w:t xml:space="preserve">`;
+          } else if (newLine) {
+            textStr = `${textStr}</w:t></w:r></w:p><w:p><w:r>${styleStr}<w:t xml:space="preserve">`;
+          // If the previous word was a superscript, the space is added switching back to normal text.
+          } else if (supPrev) {
+            textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve"> `;
+          // If this word is a superscript, no space is added between words.
+          } else if (wordObj.style.sup && i > 0) {
+            textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
+          } else {
+            textStr = `${textStr} </w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
+          }
+        } else {
+          textStr += ' ';
+        }
+        fontStylePrev = fontStyle;
+        supPrev = wordObj.style.sup;
+        newLine = false;
+        // DOCX is an XML format, so any escaped XML characters need to continue being escaped.
+        // TODO: Figure out how to properly export superscripts to Word
+        textStr += ocr.escapeXml(wordObj.text);
+      }
+    }
+    opt.progressHandler({ n: g, type: 'export', info: { } });
+  }
+  // Add final closing tags
+  if (textStr) textStr += '</w:t></w:r></w:p>';
+  return textStr;
+}
 /**
  * Create a Word document from an array of ocrPage objects.
  *
@@ -20,12 +133,11 @@ export async function writeDocx({ hocrCurrent, minpage = 0, maxpage = -1 }) {
   const zipFileWriter = new Uint8ArrayWriter();
   const zipWriter = new ZipWriter(zipFileWriter);
-  const textReader = new TextReader(documentStart + writeText({
+  const textReader = new TextReader(documentStart + writeDocxContent({
     ocrCurrent: hocrCurrent,
     minpage,
     maxpage,
     reflowText: opt.reflow,
-    docxMode: true,
   }) + documentEnd);
   await zipWriter.add('word/document.xml', textReader);

package/js/export/writeText.js CHANGED Viewed

@@ -1,21 +1,21 @@
 import { opt } from '../containers/app.js';
 import { pageMetricsAll } from '../containers/dataContainer.js';
-import ocr from '../objects/ocrObjects.js';
 import { assignParagraphs } from '../utils/reflowPars.js';
 /**
- * Convert an array of ocrPage objects to plain text, or XML for a Word document.
+ * Convert an array of ocrPage objects to plain text.
  *
  * @param {Object} params
  * @param {Array<OcrPage>} params.ocrCurrent -
  * @param {number} [params.minpage=0] - The first page to include in the document.
  * @param {number} [params.maxpage=-1] - The last page to include in the document.
  * @param {boolean} [params.reflowText=false] - Remove line breaks within what appears to be the same paragraph.
- * @param {boolean} [params.docxMode=false] - Create XML for a word document rather than plain text.
  * @param {?Array<string>} [params.wordIds=null] - An array of word IDs to include in the document.
  *    If omitted, all words are included.
  */
-export function writeText({ ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false, wordIds = null }) {
+export function writeText({
+  ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, wordIds = null,
+}) {
   let textStr = '';
   if (maxpage === -1) maxpage = ocrCurrent.length - 1;
@@ -35,9 +35,6 @@ export function writeText({ ocrCurrent, minpage = 0, maxpage = -1, reflowText =
     let parCurrent = pageObj.lines[0].par;
-    let fontStylePrev = '';
-    let supPrev = false;
     for (let h = 0; h < pageObj.lines.length; h++) {
       const lineObj = pageObj.lines[h];
@@ -54,49 +51,7 @@ export function writeText({ ocrCurrent, minpage = 0, maxpage = -1, reflowText =
         if (wordIds && !wordIds.includes(wordObj.id)) continue;
-        if (docxMode) {
-          let fontStyle = '';
-          if (wordObj.style.italic) {
-            fontStyle += '<w:i/>';
-          } else if (wordObj.style.bold) {
-            fontStyle += '<w:b/>';
-          }
-          if (wordObj.style.smallCaps) {
-            fontStyle += '<w:smallCaps/>';
-          }
-          if (wordObj.style.underline) {
-            fontStyle += '<w:u w:val="single"/>';
-          }
-          if (wordObj.style.sup) {
-            fontStyle += '<w:vertAlign w:val="superscript"/>';
-          }
-          if (newLine || fontStyle !== fontStylePrev || (h === 0 && g === 0 && i === 0)) {
-            const styleStr = fontStyle === '' ? '' : `<w:rPr>${fontStyle}</w:rPr>`;
-            if (h === 0 && g === 0 && i === 0) {
-              textStr = `${textStr}<w:p><w:r>${styleStr}<w:t xml:space="preserve">`;
-            } else if (newLine) {
-              textStr = `${textStr}</w:t></w:r></w:p><w:p><w:r>${styleStr}<w:t xml:space="preserve">`;
-            // If the previous word was a superscript, the space is added switching back to normal text.
-            } else if (supPrev) {
-              textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve"> `;
-            // If this word is a superscript, no space is added between words.
-            } else if (wordObj.style.sup && i > 0) {
-              textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
-            } else {
-              textStr = `${textStr} </w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
-            }
-          } else {
-            textStr += ' ';
-          }
-          fontStylePrev = fontStyle;
-          supPrev = wordObj.style.sup;
-        } else if (newLine) {
+        if (newLine) {
           textStr = `${textStr}\n`;
         } else if (h > 0 || g > 0 || i > 0) {
           textStr = `${textStr} `;
@@ -104,20 +59,11 @@ export function writeText({ ocrCurrent, minpage = 0, maxpage = -1, reflowText =
         newLine = false;
-        // DOCX is an XML format, so any escaped XML characters need to continue being escaped.
-        if (docxMode) {
-          // TODO: Figure out how to properly export superscripts to Word
-          textStr += ocr.escapeXml(wordObj.text);
-        } else {
-          textStr += wordObj.text;
-        }
+        textStr += wordObj.text;
       }
     }
     opt.progressHandler({ n: g, type: 'export', info: { } });
   }
-  // Add final closing tags
-  if (docxMode && textStr) textStr += '</w:t></w:r></w:p>';
   return textStr;
 }

package/js/extractPDFText.js CHANGED Viewed

@@ -8,7 +8,7 @@ import { convertOCR } from './recognizeConvert.js';
  * Reports whether PDF is text-native, contains invisible OCR text, or is image-only.
  */
 const extractInternalPDFTextRaw = async () => {
-  const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
+  const muPDFScheduler = await ImageCache.getMuPDFScheduler();
   const pdfContentStats = {
     /** Total number of letters in the source PDF. */

package/js/generalWorkerMain.js CHANGED Viewed

@@ -293,7 +293,7 @@ export class gs {
       workerN = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 6);
     } else {
       const cpuN = Math.floor((await import('node:os')).cpus().length / 2);
-      workerN = Math.min(cpuN - 1, 8);
+      workerN = Math.max(Math.min(cpuN - 1, 8), 1);
     }
     const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');

package/js/import/import.js CHANGED Viewed

@@ -345,6 +345,11 @@ export async function importFiles(files) {
     format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format);
+    // The text import function requires built-in fonts to be loaded.
+    if (format === 'text') {
+      await loadBuiltInFontsRaw();
+    }
     ocrAllRaw.active = ocrData.hocrRaw;
     // Subset OCR data to avoid uncaught error that occurs when there are more pages of OCR data than image data.
     // While this should be rare, it appears to be fairly common with Archive.org documents.

package/js/import/importOCR.js CHANGED Viewed

@@ -95,7 +95,7 @@ export async function importOCRFiles(ocrFilesAll) {
   if (singleHOCRMode) {
     const hocrStrAll = await readOcrFile(ocrFilesAll[0]);
-    format = detectOcrFormat(hocrStrAll);
+    format = detectOcrFormat(hocrStrAll, ocrFilesAll[0]?.name?.split('.').pop());
     if (!format) {
       console.error(ocrFilesAll[0]);
@@ -134,7 +134,7 @@ export async function importOCRFiles(ocrFilesAll) {
     // Check whether input is Abbyy XML using the first file
     const hocrStrFirst = await readOcrFile(ocrFilesAll[0]);
-    format = detectOcrFormat(hocrStrFirst);
+    format = detectOcrFormat(hocrStrFirst, ocrFilesAll[0]?.name?.split('.').pop());
     if (!format) {
       console.error(ocrFilesAll[0]);

package/js/utils/miscUtils.js CHANGED Viewed

@@ -323,7 +323,8 @@ export function countSubstringOccurrences(string, subString, allowOverlapping, c
 export const saveAs = async (content, fileName) => {
   if (typeof process !== 'undefined') {
     const { promises: fsPromises } = await import('node:fs');
-    await fsPromises.writeFile(fileName, content);
+    const buffer = content instanceof ArrayBuffer ? Buffer.from(content) : content;
+    await fsPromises.writeFile(fileName, buffer);
     return;
   }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "scribe.js-ocr",
-  "version": "0.9.1",
+  "version": "0.9.2",
   "description": "High-quality OCR and text extraction for images and PDFs.",
   "main": "scribe.js",
   "directories": {