scribe.js-ocr 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.eslintrc.json CHANGED
@@ -67,6 +67,12 @@
67
67
  // "one-var": "off",
68
68
  // "one-var-declaration-per-line": "off",
69
69
 
70
+ // If this is enabled eslint breaks our import statements, such that they no longer run natively in the browser.
71
+ "import/no-relative-packages": "off",
72
+
73
+ // Using blocks for purely organizational purposes (e.g. when in-lining a function) is fine.
74
+ "no-lone-blocks": "off",
75
+
70
76
  // This rule was deprecated
71
77
  "no-return-await": "off",
72
78
 
package/README.md CHANGED
@@ -25,7 +25,7 @@ import scribe from 'node_modules/scribe.js-ocr/scribe.js';
25
25
  import scribe from 'scribe.js-ocr';
26
26
 
27
27
  // Basic usage
28
- scribe.recognizeFiles(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
28
+ scribe.extractText(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
29
29
  .then((res) => console.log(res))
30
30
  ```
31
31
 
package/docs/API.md CHANGED
@@ -4,7 +4,7 @@
4
4
 
5
5
  * [init][1]
6
6
  * [Parameters][2]
7
- * [recognizeFiles][3]
7
+ * [extractText][3]
8
8
  * [Parameters][4]
9
9
  * [clear][5]
10
10
  * [terminate][6]
@@ -35,9 +35,10 @@ Initialize the program and optionally pre-load resources.
35
35
  The PDF renderer and OCR engine are automatically loaded when needed.
36
36
  Therefore, the only reason to set `pdf` or `ocr` to `true` is to pre-load them. (optional, default `false`)
37
37
 
38
- ## recognizeFiles
38
+ ## extractText
39
39
 
40
- Helper function for recognizing files with a single function call.
40
+ Function for extracting text from image and PDF files with a single function call.
41
+ By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
41
42
  For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
42
43
 
43
44
  ### Parameters
@@ -45,6 +46,10 @@ For more control, use `init`, `importFiles`, `recognize`, and `exportData` separ
45
46
  * `files`  
46
47
  * `langs` **[Array][21]<[string][22]>** (optional, default `['eng']`)
47
48
  * `outputFormat` (optional, default `'txt'`)
49
+ * `options` **[Object][19]?** (optional, default `{}`)
50
+
51
+ * `options.skipRecPDFTextNative` **[boolean][20]** If the input is a text-native PDF, skip recognition and return the existing text. (optional, default `true`)
52
+ * `options.skipRecPDFTextOCR` **[boolean][20]** If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text. (optional, default `false`)
48
53
 
49
54
  ## clear
50
55
 
@@ -100,6 +105,10 @@ Alternatively, for `File` objects (browser) and file paths (Node.js), a single a
100
105
  ### Parameters
101
106
 
102
107
  * `files` **([Array][21]\<File> | FileList | [Array][21]<[string][22]> | [SortedInputFiles][11])**&#x20;
108
+ * `options` **[Object][19]?** (optional, default `{}`)
109
+
110
+ * `options.extractPDFTextNative` **[boolean][20]** Extract text from text-native PDF documents. (optional, default `false`)
111
+ * `options.extractPDFTextOCR` **[boolean][20]** Extract text from image-native PDF documents with existing OCR text layers. (optional, default `false`)
103
112
 
104
113
  ## recognizePage
105
114
 
@@ -135,7 +144,7 @@ The results of recognition can be exported by calling `exportFiles` after this f
135
144
 
136
145
  [2]: #parameters
137
146
 
138
- [3]: #recognizefiles
147
+ [3]: #extracttext
139
148
 
140
149
  [4]: #parameters-1
141
150
 
@@ -5,6 +5,6 @@ await scribe.init({ ocr: true, font: true });
5
5
  const elm = /** @type {HTMLInputElement} */ (document.getElementById('uploader'));
6
6
  elm.addEventListener('change', async () => {
7
7
  if (!elm.files) return;
8
- const text = await scribe.recognizeFiles(elm.files);
8
+ const text = await scribe.extractText(elm.files);
9
9
  console.log(text);
10
10
  });
@@ -5,7 +5,7 @@ import scribe from '../../scribe.js';
5
5
  const [,, imagePath] = process.argv;
6
6
 
7
7
  (async () => {
8
- const res = await scribe.recognizeFiles([imagePath]);
8
+ const res = await scribe.extractText([imagePath]);
9
9
  console.log(res);
10
10
  await scribe.terminate();
11
11
  })();
@@ -74,9 +74,6 @@ export class inputData {
74
74
  /** `true` if user re-uploaded HOCR data created by Scribe OCR */
75
75
  static resumeMode = false;
76
76
 
77
- /** `true` if stext is extracted from a PDF (rather than text layer uploaded seprately) */
78
- static extractTextMode = false;
79
-
80
77
  /** `true` if ground truth data is uploaded */
81
78
  static evalMode = false;
82
79
 
@@ -216,52 +216,19 @@ export class ImageCache {
216
216
  static pageCount = 0;
217
217
 
218
218
  /**
219
- * The dimensions that each page would be, if it was rendered at 300 DPI.
220
- * @type {Array<dims>}
221
- */
222
- static pdfDims300Arr = [];
219
+ * The dimensions that each page would be, if it was rendered at 300 DPI.
220
+ * @type {Array<dims>}
221
+ */
222
+ static pdfDims300 = [];
223
223
 
224
224
  static inputModes = {
225
225
  pdf: false,
226
226
  image: false,
227
227
  };
228
228
 
229
- static pdfContentStats = {
230
- /** Total number of letters in the source PDF. */
231
- letterCountTotal: 0,
232
- /** Total number of visible letters in the source PDF. */
233
- letterCountVis: 0,
234
- /** Total number of pages with 100+ letters in the source PDF. */
235
- pageCountTotalText: 0,
236
- /** Total number of pages with 100+ visible letters in the source PDF. */
237
- pageCountVisText: 0,
238
- };
239
-
240
229
  /** @type {?('text'|'ocr'|'image')} */
241
230
  static pdfType = null;
242
231
 
243
- static setPdfType = () => {
244
- // The PDF is considered text-native if:
245
- // (1) The total number of visible letters is at least 100 per page on average.
246
- // (2) The total number of visible letters is at least 90% of the total number of letters.
247
- // (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
248
- if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
249
- && ImageCache.pdfContentStats.letterCountVis >= ImageCache.pdfContentStats.letterCountTotal * 0.9
250
- && ImageCache.pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
251
- ImageCache.pdfType = 'text';
252
- // The PDF is considered ocr-native if:
253
- // (1) The total number of letters is at least 100 per page on average.
254
- // (2) The total number of letters is at least half of the total number of letters.
255
- } else if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
256
- && ImageCache.pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
257
- ImageCache.pdfType = 'ocr';
258
- // Otherwise, the PDF is considered image-native.
259
- // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
260
- } else {
261
- ImageCache.pdfType = 'image';
262
- }
263
- };
264
-
265
232
  static colorModeDefault = 'gray';
266
233
 
267
234
  static cacheRenderPages = 3;
@@ -327,7 +294,7 @@ export class ImageCache {
327
294
  } if (ImageCache.inputModes.pdf) {
328
295
  const pageMetrics = pageMetricsArr[n];
329
296
  const targetWidth = pageMetrics.dims.width;
330
- const dpi = 300 * (targetWidth / ImageCache.pdfDims300Arr[n].width);
297
+ const dpi = 300 * (targetWidth / ImageCache.pdfDims300[n].width);
331
298
  const muPDFScheduler = await ImageCache.getMuPDFScheduler();
332
299
  return muPDFScheduler.drawPageAsPNG({
333
300
  page: n + 1, dpi, color, skipText: skipTextMode,
@@ -566,14 +533,10 @@ export class ImageCache {
566
533
  ImageCache.inputModes.image = false;
567
534
  ImageCache.inputModes.pdf = false;
568
535
  ImageCache.pageCount = 0;
569
- ImageCache.pdfDims300Arr.length = 0;
536
+ ImageCache.pdfDims300.length = 0;
570
537
  ImageCache.loadCount = 0;
571
538
  ImageCache.nativeProps.length = 0;
572
539
  ImageCache.binaryProps.length = 0;
573
- ImageCache.pdfContentStats.letterCountTotal = 0;
574
- ImageCache.pdfContentStats.letterCountVis = 0;
575
- ImageCache.pdfContentStats.pageCountTotalText = 0;
576
- ImageCache.pdfContentStats.pageCountVisText = 0;
577
540
  };
578
541
 
579
542
  static terminate = async () => {
@@ -600,9 +563,8 @@ export class ImageCache {
600
563
  *
601
564
  * @param {ArrayBuffer} fileData
602
565
  * @param {Boolean} [skipText=false] - Whether to skip native text when rendering PDF to image.
603
- * @param {Boolean} [extractStext=false]
604
566
  */
605
- static openMainPDF = async (fileData, skipText = false, extractStext = false) => {
567
+ static openMainPDF = async (fileData, skipText = false) => {
606
568
  const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
607
569
 
608
570
  await ImageCache.#loadFileMuPDFScheduler(fileData);
@@ -611,9 +573,9 @@ export class ImageCache {
611
573
 
612
574
  const pageDims1 = await muPDFScheduler.workers[0].pageSizes([300]);
613
575
 
614
- ImageCache.pdfDims300Arr.length = 0;
576
+ ImageCache.pdfDims300.length = 0;
615
577
  pageDims1.forEach((x) => {
616
- ImageCache.pdfDims300Arr.push({ width: x[0], height: x[1] });
578
+ ImageCache.pdfDims300.push({ width: x[0], height: x[1] });
617
579
  });
618
580
 
619
581
  ImageCache.inputModes.pdf = true;
@@ -627,10 +589,10 @@ export class ImageCache {
627
589
 
628
590
  // For reasons that are unclear, a small number of pages have been rendered into massive files
629
591
  // so a hard-cap on resolution must be imposed.
630
- const pageDPI = ImageCache.pdfDims300Arr.map((x) => 300 * 2000 / x.width, 2000);
592
+ const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
631
593
 
632
594
  // In addition to capping the resolution, also switch the width/height
633
- ImageCache.pdfDims300Arr.forEach((x, i) => {
595
+ ImageCache.pdfDims300.forEach((x, i) => {
634
596
  const pageDims = { width: Math.round(x.width * pageDPI[i] / 300), height: Math.round(x.height * pageDPI[i] / 300) };
635
597
  pageMetricsArr[i] = new PageMetrics(pageDims);
636
598
  });
@@ -674,23 +636,5 @@ export class ImageCache {
674
636
  await setUploadFontsWorker(gs.schedulerInner);
675
637
  });
676
638
  }
677
-
678
- if (extractStext) {
679
- ocrAllRaw.active = Array(ImageCache.pageCount);
680
- const resArr = pageDPI.map(async (x, i) => {
681
- // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
682
- // The XML format is the only built-in mupdf format that includes character-level granularity.
683
- const res = await muPDFScheduler.pageText({
684
- page: i + 1, dpi: x, format: 'xml', calcStats: true,
685
- });
686
- ImageCache.pdfContentStats.letterCountTotal += res.letterCountTotal;
687
- ImageCache.pdfContentStats.letterCountVis += res.letterCountVis;
688
- if (res.letterCountTotal >= 100) ImageCache.pdfContentStats.pageCountTotalText++;
689
- if (res.letterCountVis >= 100) ImageCache.pdfContentStats.pageCountVisText++;
690
- ocrAllRaw.active[i] = res.content;
691
- });
692
- await Promise.all(resArr);
693
- ImageCache.setPdfType();
694
- }
695
639
  };
696
640
  }
@@ -0,0 +1,110 @@
1
+ import { ImageCache } from './containers/imageContainer.js';
2
+ import { convertOCRAll } from './recognizeConvert.js';
3
+ import { ocrAllRaw, ocrAll } from './containers/dataContainer.js';
4
+
5
+ /**
6
+ * Extract raw text content from currently loaded PDF.
7
+ * Reports whether PDF is text-native, contains invisible OCR text, or is image-only.
8
+ */
9
+ const extractInternalPDFTextRaw = async () => {
10
+ const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
11
+
12
+ const pdfContentStats = {
13
+ /** Total number of letters in the source PDF. */
14
+ letterCountTotal: 0,
15
+ /** Total number of visible letters in the source PDF. */
16
+ letterCountVis: 0,
17
+ /** Total number of pages with 100+ letters in the source PDF. */
18
+ pageCountTotalText: 0,
19
+ /** Total number of pages with 100+ visible letters in the source PDF. */
20
+ pageCountVisText: 0,
21
+ };
22
+
23
+ const stextArr = /** @type {Array<string>} */ ([]);
24
+ const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
25
+ const resArr = pageDPI.map(async (x, i) => {
26
+ // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
27
+ // The XML format is the only built-in mupdf format that includes character-level granularity.
28
+ const res = await muPDFScheduler.pageText({
29
+ page: i + 1, dpi: x, format: 'xml', calcStats: true,
30
+ });
31
+ pdfContentStats.letterCountTotal += res.letterCountTotal;
32
+ pdfContentStats.letterCountVis += res.letterCountVis;
33
+ if (res.letterCountTotal >= 100) pdfContentStats.pageCountTotalText++;
34
+ if (res.letterCountVis >= 100) pdfContentStats.pageCountVisText++;
35
+ stextArr[i] = res.content;
36
+ });
37
+ await Promise.all(resArr);
38
+
39
+ /** @type {"image" | "text" | "ocr"} */
40
+ let type = 'image';
41
+
42
+ // Determine whether the PDF is text-native, image-only, or image + OCR.
43
+ {
44
+ // The PDF is considered text-native if:
45
+ // (1) The total number of visible letters is at least 100 per page on average.
46
+ // (2) The total number of visible letters is at least 90% of the total number of letters.
47
+ // (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
48
+ if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
49
+ && pdfContentStats.letterCountVis >= pdfContentStats.letterCountTotal * 0.9
50
+ && pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
51
+ type = 'text';
52
+ // The PDF is considered ocr-native if:
53
+ // (1) The total number of letters is at least 100 per page on average.
54
+ // (2) The total number of visible letters is at least half of the total number of pages.
55
+ } else if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
56
+ && pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
57
+ type = 'ocr';
58
+ // Otherwise, the PDF is considered image-native.
59
+ // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
60
+ } else {
61
+ type = 'image';
62
+ }
63
+ }
64
+
65
+ return { contentRaw: stextArr, content: /** @type {?Array<OcrPage>} */ (null), type };
66
+ };
67
+
68
+ /**
69
+ * Extract and parse text from currently loaded PDF.
70
+ * @param {Object} [options]
71
+ * @param {boolean} [options.extractPDFTextNative=true] - Extract text from text-native PDF documents.
72
+ * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
73
+ * @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
74
+ * This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
75
+ * @param {boolean} [options.setActive=false] - Set the active OCR data to the extracted text.
76
+ */
77
+ export const extractInternalPDFText = async (options = {}) => {
78
+ const extractPDFTextNative = options?.extractPDFTextNative ?? true;
79
+ const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
80
+ const extractPDFTextImage = options?.extractPDFTextImage ?? false;
81
+
82
+ const setActive = options?.setActive ?? false;
83
+
84
+ const res = await extractInternalPDFTextRaw();
85
+
86
+ ImageCache.pdfType = res.type;
87
+ ocrAllRaw.pdf = res.contentRaw;
88
+
89
+ if (!extractPDFTextImage && res.type === 'image') return res;
90
+
91
+ if (!extractPDFTextOCR && res.type === 'ocr') return res;
92
+
93
+ if (!extractPDFTextNative && res.type === 'text') return res;
94
+
95
+ ocrAll.pdf = Array(ImageCache.pageCount);
96
+
97
+ if (setActive) {
98
+ ocrAllRaw.active = ocrAllRaw.pdf;
99
+ ocrAll.active = ocrAll.pdf;
100
+ }
101
+
102
+ const format = 'stext';
103
+
104
+ // Run the conversion step with format 'stext' (not HOCR); the result is read back from `ocrAll.pdf` below.
105
+ await convertOCRAll(ocrAllRaw.active, true, format, 'pdf', false);
106
+
107
+ res.content = ocrAll.pdf;
108
+
109
+ return res;
110
+ };
@@ -263,12 +263,12 @@ export class gs {
263
263
  static getGeneralScheduler = async () => {
264
264
  if (gs.schedulerReady) {
265
265
  await gs.schedulerReady;
266
- return gs.scheduler;
266
+ return /** @type {GeneralScheduler} */ (gs.scheduler);
267
267
  }
268
268
 
269
269
  await gs.init();
270
270
 
271
- return gs.scheduler;
271
+ return /** @type {GeneralScheduler} */ (gs.scheduler);
272
272
  };
273
273
 
274
274
  static terminate = async () => {
@@ -11,7 +11,9 @@ import {
11
11
  } from '../containers/dataContainer.js';
12
12
  import { fontAll } from '../containers/fontContainer.js';
13
13
  import { ImageCache, imageUtils, ImageWrapper } from '../containers/imageContainer.js';
14
- import { enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto, loadBuiltInFontsRaw } from '../fontContainerMain.js';
14
+ import {
15
+ enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto, loadBuiltInFontsRaw,
16
+ } from '../fontContainerMain.js';
15
17
  import { runFontOptimization } from '../fontEval.js';
16
18
  import { calcFontMetricsFromPages } from '../fontStatistics.js';
17
19
  import { gs } from '../generalWorkerMain.js';
@@ -20,6 +22,7 @@ import { PageMetrics } from '../objects/pageMetricsObjects.js';
20
22
  import { checkCharWarn, convertOCRAll } from '../recognizeConvert.js';
21
23
  import { replaceObjectProperties } from '../utils/miscUtils.js';
22
24
  import { importOCRFiles } from './importOCR.js';
25
+ import { extractInternalPDFText } from '../extractPDFText.js';
23
26
 
24
27
  /**
25
28
  * Automatically detects the image type (jpeg or png).
@@ -185,12 +188,18 @@ export function sortInputFiles(files) {
185
188
  * Alternatively, for `File` objects (browser) and file paths (Node.js), a single array can be provided, which is sorted based on extension.
186
189
  * @public
187
190
  * @param {Array<File>|FileList|Array<string>|SortedInputFiles} files
191
+ * @param {Object} [options]
192
+ * @param {boolean} [options.extractPDFTextNative=false] - Extract text from text-native PDF documents.
193
+ * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
188
194
  * @returns
189
195
  */
190
- export async function importFiles(files) {
196
+ export async function importFiles(files, options = {}) {
191
197
  clearData();
192
198
  gs.getGeneralScheduler();
193
199
 
200
+ const extractPDFTextNative = options?.extractPDFTextNative ?? false;
201
+ const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
202
+
194
203
  /** @type {Array<File|FileNode|ArrayBuffer>} */
195
204
  let pdfFiles = [];
196
205
  /** @type {Array<File|FileNode|ArrayBuffer>} */
@@ -266,10 +275,6 @@ export async function importFiles(files) {
266
275
 
267
276
  const xmlModeImport = ocrFiles.length > 0;
268
277
 
269
- // Extract text from PDF document
270
- // Only enabled if (1) user selects this option, (2) user uploads a PDF, and (3) user does not upload XML data.
271
- inputData.extractTextMode = opt.extractText && inputData.pdfMode && !xmlModeImport;
272
-
273
278
  let pageCount;
274
279
  let pageCountImage;
275
280
  let abbyyMode = false;
@@ -284,7 +289,7 @@ export async function importFiles(files) {
284
289
  const pdfFileData = pdfFile instanceof ArrayBuffer ? pdfFile : await pdfFile.arrayBuffer();
285
290
 
286
291
  // If no XML data is provided, page sizes are calculated using muPDF alone
287
- await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText, inputData.extractTextMode);
292
+ await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText);
288
293
 
289
294
  pageCountImage = ImageCache.pageCount;
290
295
  ImageCache.loadCount = ImageCache.pageCount;
@@ -315,7 +320,7 @@ export async function importFiles(files) {
315
320
 
316
321
  // Restore font metrics and optimize font from previous session (if applicable)
317
322
  if (ocrData.fontMetricsObj && Object.keys(ocrData.fontMetricsObj).length > 0) {
318
- const fontPromise = loadBuiltInFontsRaw()
323
+ const fontPromise = loadBuiltInFontsRaw();
319
324
 
320
325
  existingOpt = true;
321
326
 
@@ -368,11 +373,6 @@ export async function importFiles(files) {
368
373
  scribeMode = ocrData.scribeMode;
369
374
 
370
375
  stextMode = ocrData.stextMode;
371
- } else if (inputData.extractTextMode) {
372
- // Initialize a new array on `ocrAll` if one does not already exist
373
- if (!ocrAll[oemName]) ocrAll[oemName] = Array(inputData.pageCount);
374
- ocrAll.active = ocrAll[oemName];
375
- stextMode = true;
376
376
  }
377
377
 
378
378
  const pageCountHOCR = ocrAllRaw.active?.length;
@@ -424,7 +424,7 @@ export async function importFiles(files) {
424
424
  }
425
425
  }
426
426
 
427
- if (xmlModeImport || inputData.extractTextMode) {
427
+ if (xmlModeImport) {
428
428
  /** @type {("hocr" | "abbyy" | "stext")} */
429
429
  let format = 'hocr';
430
430
  if (abbyyMode) format = 'abbyy';
@@ -439,6 +439,8 @@ export async function importFiles(files) {
439
439
  opt.enableOpt = await runFontOptimization(ocrAll.active);
440
440
  }
441
441
  });
442
+ } else if (extractPDFTextNative || extractPDFTextOCR) {
443
+ await extractInternalPDFText({ setActive: true, extractPDFTextNative, extractPDFTextOCR });
442
444
  }
443
445
  }
444
446
 
@@ -28,7 +28,7 @@ export async function importOCRFiles(ocrFilesAll) {
28
28
  let pageCountHOCR;
29
29
  let hocrRaw;
30
30
  /** @type {?Object.<string, FontMetricsFamily>} */
31
- let fontMetricsObj;
31
+ let fontMetricsObj = null;
32
32
  /** @type{?Array<import('../objects/layoutObjects.js').LayoutPage>} */
33
33
  let layoutObj = null;
34
34
  /** @type{?Array<import('../objects/layoutObjects.js').LayoutDataTablePage>} */
@@ -42,9 +42,9 @@ export async function importOCRFiles(ocrFilesAll) {
42
42
  const hocrStrAll = await readOcrFile(ocrFilesAll[0]);
43
43
 
44
44
  // Check whether input is Abbyy XML
45
- const node2 = hocrStrAll.match(/>([^>]+)/)[1];
46
- abbyyMode = !!/abbyy/i.test(node2);
47
- stextMode = !!/<document name/.test(node2);
45
+ const node2 = hocrStrAll.match(/>([^>]+)/)?.[1];
46
+ abbyyMode = !!node2 && !!/abbyy/i.test(node2);
47
+ stextMode = !!node2 && !!/<document name/.test(node2);
48
48
 
49
49
  if (abbyyMode) {
50
50
  hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
@@ -67,8 +67,8 @@ export async function importOCRFiles(ocrFilesAll) {
67
67
 
68
68
  // Check whether input is Abbyy XML using the first file
69
69
  const hocrStrFirst = await readOcrFile(ocrFilesAll[0]);
70
- const node2 = hocrStrFirst.match(/>([^>]+)/)[1];
71
- abbyyMode = !!/abbyy/i.test(node2);
70
+ const node2 = hocrStrFirst.match(/>([^>]+)/)?.[1];
71
+ abbyyMode = !!node2 && !!/abbyy/i.test(node2);
72
72
 
73
73
  for (let i = 0; i < pageCountHOCR; i++) {
74
74
  const hocrFile = ocrFilesAll[i];
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "scribe.js-ocr",
3
- "version": "0.1.1",
3
+ "version": "0.2.0",
4
4
  "description": "High-quality OCR and text extraction for images and PDFs.",
5
5
  "main": "scribe.js",
6
6
  "directories": {
package/scribe.js CHANGED
@@ -30,6 +30,7 @@ import { imageStrToBlob } from './js/utils/imageUtils.js';
30
30
  import { countSubstringOccurrences, getRandomAlphanum, replaceSmartQuotes } from './js/utils/miscUtils.js';
31
31
  import { calcConf, mergeOcrWords, splitOcrWord } from './js/utils/ocrUtils.js';
32
32
  import { assignParagraphs } from './js/utils/reflowPars.js';
33
+ import { extractInternalPDFText } from './js/extractPDFText.js';
33
34
 
34
35
  /**
35
36
  * Initialize the program and optionally pre-load resources.
@@ -66,18 +67,23 @@ const init = async (params) => {
66
67
  };
67
68
 
68
69
  /**
69
- * Helper function for recognizing files with a single function call.
70
+ * Function for extracting text from image and PDF files with a single function call.
71
+ * By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
70
72
  * For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
71
73
  * @public
72
74
  * @param {Parameters<typeof importFiles>[0]} files
73
75
  * @param {Array<string>} [langs=['eng']]
74
76
  * @param {Parameters<typeof exportData>[0]} [outputFormat='txt']
75
- * @returns
77
+ * @param {Object} [options]
78
+ * @param {boolean} [options.skipRecPDFTextNative=true] - If the input is a text-native PDF, skip recognition and return the existing text.
79
+ * @param {boolean} [options.skipRecPDFTextOCR=false] - If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text.
76
80
  */
77
- const recognizeFiles = async (files, langs = ['eng'], outputFormat = 'txt') => {
81
+ const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options = {}) => {
82
+ const skipRecPDFTextNative = options?.skipRecPDFTextNative ?? true;
83
+ const skipRecPDFTextOCR = options?.skipRecPDFTextOCR ?? false;
78
84
  init({ ocr: true, font: true });
79
- await importFiles(files);
80
- await recognize({ langs });
85
+ await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });
86
+ if (!(ImageCache.pdfType === 'text' && skipRecPDFTextNative || ImageCache.pdfType === 'ocr' && skipRecPDFTextOCR)) await recognize({ langs });
81
87
  return exportData(outputFormat);
82
88
  };
83
89
 
@@ -174,7 +180,8 @@ export default {
174
180
  opt,
175
181
  recognize,
176
182
  recognizePage,
177
- recognizeFiles,
183
+ extractText,
184
+ extractInternalPDFText,
178
185
  terminate,
179
186
  utils,
180
187
  };