npm - scribe.js-ocr - Versions diffs - 0.2.1 → 0.2.2 - Mend

scribe.js-ocr 0.2.1 → 0.2.2

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (2)

package/js/import/importOCR.js +9 -15
package/package.json +1 -1

package/js/import/importOCR.js CHANGED Viewed

@@ -18,13 +18,11 @@ export async function importOCRFiles(ocrFilesAll) {
   // In the case of 1 HOCR file
   const singleHOCRMode = ocrFilesAll.length === 1;
-  let hocrStrStart = '';
-  let hocrStrEnd = '';
+  let hocrStrStart = null;
   let abbyyMode = false;
   let stextMode = false;
   let scribeMode = false;
-  let hocrArrPages;
   let pageCountHOCR;
   let hocrRaw;
   /** @type  {?Object.<string, FontMetricsFamily>} */
@@ -47,20 +45,16 @@ export async function importOCRFiles(ocrFilesAll) {
     stextMode = !!node2 && !!/<document name/.test(node2);
     if (abbyyMode) {
-      hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
+      hocrRaw = hocrStrAll.split(/(?=<page)/).slice(1);
     } else if (stextMode) {
-      hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
+      hocrRaw = hocrStrAll.split(/(?=<page)/).slice(1);
     } else {
-      hocrStrStart = hocrStrAll.match(/[\s\S]*?<body>/)[0];
-      hocrStrEnd = hocrStrAll.match(/<\/body>[\s\S]*$/)[0];
-      hocrArrPages = splitHOCRStr(hocrStrAll);
+      // `hocrStrStart` will be missing for individual HOCR pages created with Tesseract.js or the Tesseract API.
+      hocrStrStart = hocrStrAll.match(/[\s\S]*?<body>/)?.[0];
+      hocrRaw = splitHOCRStr(hocrStrAll);
     }
-    pageCountHOCR = hocrArrPages.length;
-    hocrRaw = Array(pageCountHOCR);
-    for (let i = 0; i < pageCountHOCR; i++) {
-      hocrRaw[i] = hocrStrStart + hocrArrPages[i] + hocrStrEnd;
-    }
+    pageCountHOCR = hocrRaw.length;
   } else {
     pageCountHOCR = ocrFilesAll.length;
     hocrRaw = Array(pageCountHOCR);
@@ -76,11 +70,11 @@ export async function importOCRFiles(ocrFilesAll) {
     }
   }
-  if (!abbyyMode && !stextMode && hocrRaw[0]) {
+  if (!abbyyMode && !stextMode && hocrStrStart) {
     const getMeta = (name) => {
       const regex = new RegExp(`<meta name=["']${name}["'][^<]+`, 'i');
-      const nodeStr = hocrRaw[0].match(regex)?.[0];
+      const nodeStr = hocrStrStart.match(regex)?.[0];
       if (!nodeStr) return null;
       const contentStr = nodeStr.match(/content=["']([\s\S]+?)(?=["']\s{0,5}\/?>)/i)?.[1];
       if (!contentStr) return null;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "scribe.js-ocr",
-  "version": "0.2.1",
+  "version": "0.2.2",
   "description": "High-quality OCR and text extraction for images and PDFs.",
   "main": "scribe.js",
   "directories": {