scribe.js-ocr 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,13 +18,11 @@ export async function importOCRFiles(ocrFilesAll) {
18
18
  // In the case of 1 HOCR file
19
19
  const singleHOCRMode = ocrFilesAll.length === 1;
20
20
 
21
- let hocrStrStart = '';
22
- let hocrStrEnd = '';
21
+ let hocrStrStart = null;
23
22
  let abbyyMode = false;
24
23
  let stextMode = false;
25
24
  let scribeMode = false;
26
25
 
27
- let hocrArrPages;
28
26
  let pageCountHOCR;
29
27
  let hocrRaw;
30
28
  /** @type {?Object.<string, FontMetricsFamily>} */
@@ -47,20 +45,16 @@ export async function importOCRFiles(ocrFilesAll) {
47
45
  stextMode = !!node2 && !!/<document name/.test(node2);
48
46
 
49
47
  if (abbyyMode) {
50
- hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
48
+ hocrRaw = hocrStrAll.split(/(?=<page)/).slice(1);
51
49
  } else if (stextMode) {
52
- hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
50
+ hocrRaw = hocrStrAll.split(/(?=<page)/).slice(1);
53
51
  } else {
54
- hocrStrStart = hocrStrAll.match(/[\s\S]*?<body>/)[0];
55
- hocrStrEnd = hocrStrAll.match(/<\/body>[\s\S]*$/)[0];
56
- hocrArrPages = splitHOCRStr(hocrStrAll);
52
+ // `hocrStrStart` will be missing for individual HOCR pages created with Tesseract.js or the Tesseract API.
53
+ hocrStrStart = hocrStrAll.match(/[\s\S]*?<body>/)?.[0];
54
+ hocrRaw = splitHOCRStr(hocrStrAll);
57
55
  }
58
56
 
59
- pageCountHOCR = hocrArrPages.length;
60
- hocrRaw = Array(pageCountHOCR);
61
- for (let i = 0; i < pageCountHOCR; i++) {
62
- hocrRaw[i] = hocrStrStart + hocrArrPages[i] + hocrStrEnd;
63
- }
57
+ pageCountHOCR = hocrRaw.length;
64
58
  } else {
65
59
  pageCountHOCR = ocrFilesAll.length;
66
60
  hocrRaw = Array(pageCountHOCR);
@@ -76,11 +70,11 @@ export async function importOCRFiles(ocrFilesAll) {
76
70
  }
77
71
  }
78
72
 
79
- if (!abbyyMode && !stextMode && hocrRaw[0]) {
73
+ if (!abbyyMode && !stextMode && hocrStrStart) {
80
74
  const getMeta = (name) => {
81
75
  const regex = new RegExp(`<meta name=["']${name}["'][^<]+`, 'i');
82
76
 
83
- const nodeStr = hocrRaw[0].match(regex)?.[0];
77
+ const nodeStr = hocrStrStart.match(regex)?.[0];
84
78
  if (!nodeStr) return null;
85
79
  const contentStr = nodeStr.match(/content=["']([\s\S]+?)(?=["']\s{0,5}\/?>)/i)?.[1];
86
80
  if (!contentStr) return null;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "scribe.js-ocr",
3
- "version": "0.2.1",
3
+ "version": "0.2.2",
4
4
  "description": "High-quality OCR and text extraction for images and PDFs.",
5
5
  "main": "scribe.js",
6
6
  "directories": {