scribe.js-ocr 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.eslintrc.json CHANGED
@@ -67,6 +67,12 @@
67
67
  // "one-var": "off",
68
68
  // "one-var-declaration-per-line": "off",
69
69
 
70
+ // If this is enabled eslint breaks our import statements, such that they no longer run natively in the browser.
71
+ "import/no-relative-packages": "off",
72
+
73
+ // Using blocks for purely organizational purposes (e.g. when in-lining a function) is fine.
74
+ "no-lone-blocks": "off",
75
+
70
76
  // This rule was deprecated
71
77
  "no-return-await": "off",
72
78
 
package/README.md CHANGED
@@ -25,7 +25,7 @@ import scribe from 'node_modules/scribe.js-ocr/scribe.js';
25
25
  import scribe from 'scribe.js-ocr';
26
26
 
27
27
  // Basic usage
28
- scribe.recognizeFiles(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
28
+ scribe.extractText(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
29
29
  .then((res) => console.log(res))
30
30
  ```
31
31
 
package/docs/API.md CHANGED
@@ -4,7 +4,7 @@
4
4
 
5
5
  * [init][1]
6
6
  * [Parameters][2]
7
- * [recognizeFiles][3]
7
+ * [extractText][3]
8
8
  * [Parameters][4]
9
9
  * [clear][5]
10
10
  * [terminate][6]
@@ -35,9 +35,10 @@ Initialize the program and optionally pre-load resources.
35
35
  The PDF renderer and OCR engine are automatically loaded when needed.
36
36
  Therefore, the only reason to set `pdf` or `ocr` to `true` is to pre-load them. (optional, default `false`)
37
37
 
38
- ## recognizeFiles
38
+ ## extractText
39
39
 
40
- Helper function for recognizing files with a single function call.
40
+ Function for extracting text from image and PDF files with a single function call.
41
+ By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
41
42
  For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
42
43
 
43
44
  ### Parameters
@@ -45,6 +46,10 @@ For more control, use `init`, `importFiles`, `recognize`, and `exportData` separ
45
46
  * `files`  
46
47
  * `langs` **[Array][21]<[string][22]>** (optional, default `['eng']`)
47
48
  * `outputFormat` (optional, default `'txt'`)
49
+ * `options` **[Object][19]?** (optional, default `{}`)
50
+
51
+ * `options.skipRecPDFTextNative` **[boolean][20]** If the input is a text-native PDF, skip recognition and return the existing text. (optional, default `true`)
52
+ * `options.skipRecPDFTextOCR` **[boolean][20]** If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text. (optional, default `false`)
48
53
 
49
54
  ## clear
50
55
 
@@ -100,6 +105,10 @@ Alternatively, for `File` objects (browser) and file paths (Node.js), a single a
100
105
  ### Parameters
101
106
 
102
107
  * `files` **([Array][21]\<File> | FileList | [Array][21]<[string][22]> | [SortedInputFiles][11])**&#x20;
108
+ * `options` **[Object][19]?** (optional, default `{}`)
109
+
110
+ * `options.extractPDFTextNative` **[boolean][20]** Extract text from text-native PDF documents. (optional, default `false`)
111
+ * `options.extractPDFTextOCR` **[boolean][20]** Extract text from image-native PDF documents with existing OCR text layers. (optional, default `false`)
103
112
 
104
113
  ## recognizePage
105
114
 
@@ -135,7 +144,7 @@ The results of recognition can be exported by calling `exportFiles` after this f
135
144
 
136
145
  [2]: #parameters
137
146
 
138
- [3]: #recognizefiles
147
+ [3]: #extracttext
139
148
 
140
149
  [4]: #parameters-1
141
150
 
@@ -5,6 +5,6 @@ await scribe.init({ ocr: true, font: true });
5
5
  const elm = /** @type {HTMLInputElement} */ (document.getElementById('uploader'));
6
6
  elm.addEventListener('change', async () => {
7
7
  if (!elm.files) return;
8
- const text = await scribe.recognizeFiles(elm.files);
8
+ const text = await scribe.extractText(elm.files);
9
9
  console.log(text);
10
10
  });
@@ -5,7 +5,7 @@ import scribe from '../../scribe.js';
5
5
  const [,, imagePath] = process.argv;
6
6
 
7
7
  (async () => {
8
- const res = await scribe.recognizeFiles([imagePath]);
8
+ const res = await scribe.extractText([imagePath]);
9
9
  console.log(res);
10
10
  await scribe.terminate();
11
11
  })();
@@ -74,9 +74,6 @@ export class inputData {
74
74
  /** `true` if user re-uploaded HOCR data created by Scribe OCR */
75
75
  static resumeMode = false;
76
76
 
77
- /** `true` if stext is extracted from a PDF (rather than text layer uploaded seprately) */
78
- static extractTextMode = false;
79
-
80
77
  /** `true` if ground truth data is uploaded */
81
78
  static evalMode = false;
82
79
 
@@ -216,52 +216,19 @@ export class ImageCache {
216
216
  static pageCount = 0;
217
217
 
218
218
  /**
219
- * The dimensions that each page would be, if it was rendered at 300 DPI.
220
- * @type {Array<dims>}
221
- */
222
- static pdfDims300Arr = [];
219
+ * The dimensions that each page would be, if it was rendered at 300 DPI.
220
+ * @type {Array<dims>}
221
+ */
222
+ static pdfDims300 = [];
223
223
 
224
224
  static inputModes = {
225
225
  pdf: false,
226
226
  image: false,
227
227
  };
228
228
 
229
- static pdfContentStats = {
230
- /** Total number of letters in the source PDF. */
231
- letterCountTotal: 0,
232
- /** Total number of visible letters in the source PDF. */
233
- letterCountVis: 0,
234
- /** Total number of pages with 100+ letters in the source PDF. */
235
- pageCountTotalText: 0,
236
- /** Total number of pages with 100+ visible letters in the source PDF. */
237
- pageCountVisText: 0,
238
- };
239
-
240
229
  /** @type {?('text'|'ocr'|'image')} */
241
230
  static pdfType = null;
242
231
 
243
- static setPdfType = () => {
244
- // The PDF is considered text-native if:
245
- // (1) The total number of visible letters is at least 100 per page on average.
246
- // (2) The total number of visible letters is at least 90% of the total number of letters.
247
- // (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
248
- if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
249
- && ImageCache.pdfContentStats.letterCountVis >= ImageCache.pdfContentStats.letterCountTotal * 0.9
250
- && ImageCache.pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
251
- ImageCache.pdfType = 'text';
252
- // The PDF is considered ocr-native if:
253
- // (1) The total number of letters is at least 100 per page on average.
254
- // (2) The total number of letters is at least half of the total number of letters.
255
- } else if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
256
- && ImageCache.pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
257
- ImageCache.pdfType = 'ocr';
258
- // Otherwise, the PDF is considered image-native.
259
- // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
260
- } else {
261
- ImageCache.pdfType = 'image';
262
- }
263
- };
264
-
265
232
  static colorModeDefault = 'gray';
266
233
 
267
234
  static cacheRenderPages = 3;
@@ -287,7 +254,7 @@ export class ImageCache {
287
254
  * @returns
288
255
  */
289
256
  static #initMuPDFScheduler = async (numWorkers = 3) => {
290
- const Tesseract = typeof process === 'undefined' ? (await import('../../tess/tesseract.esm.min.js')).default : await import('tesseract.js');
257
+ const Tesseract = typeof process === 'undefined' ? (await import('../../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
291
258
  const scheduler = await Tesseract.createScheduler();
292
259
  const workersPromiseArr = range(1, numWorkers).map(async () => {
293
260
  const w = await initMuPDFWorker();
@@ -327,13 +294,13 @@ export class ImageCache {
327
294
  } if (ImageCache.inputModes.pdf) {
328
295
  const pageMetrics = pageMetricsArr[n];
329
296
  const targetWidth = pageMetrics.dims.width;
330
- const dpi = 300 * (targetWidth / ImageCache.pdfDims300Arr[n].width);
297
+ const dpi = 300 * (targetWidth / ImageCache.pdfDims300[n].width);
331
298
  const muPDFScheduler = await ImageCache.getMuPDFScheduler();
332
299
  return muPDFScheduler.drawPageAsPNG({
333
300
  page: n + 1, dpi, color, skipText: skipTextMode,
334
301
  }).then((res) => new ImageWrapper(n, res, color ? 'color' : 'gray'));
335
302
  }
336
- throw new Error('No input mode set');
303
+ throw new Error('Attempted to render image without image input provided.');
337
304
  };
338
305
 
339
306
  /**
@@ -566,14 +533,10 @@ export class ImageCache {
566
533
  ImageCache.inputModes.image = false;
567
534
  ImageCache.inputModes.pdf = false;
568
535
  ImageCache.pageCount = 0;
569
- ImageCache.pdfDims300Arr.length = 0;
536
+ ImageCache.pdfDims300.length = 0;
570
537
  ImageCache.loadCount = 0;
571
538
  ImageCache.nativeProps.length = 0;
572
539
  ImageCache.binaryProps.length = 0;
573
- ImageCache.pdfContentStats.letterCountTotal = 0;
574
- ImageCache.pdfContentStats.letterCountVis = 0;
575
- ImageCache.pdfContentStats.pageCountTotalText = 0;
576
- ImageCache.pdfContentStats.pageCountVisText = 0;
577
540
  };
578
541
 
579
542
  static terminate = async () => {
@@ -600,9 +563,8 @@ export class ImageCache {
600
563
  *
601
564
  * @param {ArrayBuffer} fileData
602
565
  * @param {Boolean} [skipText=false] - Whether to skip native text when rendering PDF to image.
603
- * @param {Boolean} [extractStext=false]
604
566
  */
605
- static openMainPDF = async (fileData, skipText = false, extractStext = false) => {
567
+ static openMainPDF = async (fileData, skipText = false) => {
606
568
  const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
607
569
 
608
570
  await ImageCache.#loadFileMuPDFScheduler(fileData);
@@ -611,9 +573,9 @@ export class ImageCache {
611
573
 
612
574
  const pageDims1 = await muPDFScheduler.workers[0].pageSizes([300]);
613
575
 
614
- ImageCache.pdfDims300Arr.length = 0;
576
+ ImageCache.pdfDims300.length = 0;
615
577
  pageDims1.forEach((x) => {
616
- ImageCache.pdfDims300Arr.push({ width: x[0], height: x[1] });
578
+ ImageCache.pdfDims300.push({ width: x[0], height: x[1] });
617
579
  });
618
580
 
619
581
  ImageCache.inputModes.pdf = true;
@@ -627,10 +589,10 @@ export class ImageCache {
627
589
 
628
590
  // For reasons that are unclear, a small number of pages have been rendered into massive files
629
591
  // so a hard-cap on resolution must be imposed.
630
- const pageDPI = ImageCache.pdfDims300Arr.map((x) => 300 * 2000 / x.width, 2000);
592
+ const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
631
593
 
632
594
  // In addition to capping the resolution, also switch the width/height
633
- ImageCache.pdfDims300Arr.forEach((x, i) => {
595
+ ImageCache.pdfDims300.forEach((x, i) => {
634
596
  const pageDims = { width: Math.round(x.width * pageDPI[i] / 300), height: Math.round(x.height * pageDPI[i] / 300) };
635
597
  pageMetricsArr[i] = new PageMetrics(pageDims);
636
598
  });
@@ -674,23 +636,5 @@ export class ImageCache {
674
636
  await setUploadFontsWorker(gs.schedulerInner);
675
637
  });
676
638
  }
677
-
678
- if (extractStext) {
679
- ocrAllRaw.active = Array(ImageCache.pageCount);
680
- const resArr = pageDPI.map(async (x, i) => {
681
- // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
682
- // The XML format is the only built-in mupdf format that includes character-level granularity.
683
- const res = await muPDFScheduler.pageText({
684
- page: i + 1, dpi: x, format: 'xml', calcStats: true,
685
- });
686
- ImageCache.pdfContentStats.letterCountTotal += res.letterCountTotal;
687
- ImageCache.pdfContentStats.letterCountVis += res.letterCountVis;
688
- if (res.letterCountTotal >= 100) ImageCache.pdfContentStats.pageCountTotalText++;
689
- if (res.letterCountVis >= 100) ImageCache.pdfContentStats.pageCountVisText++;
690
- ocrAllRaw.active[i] = res.content;
691
- });
692
- await Promise.all(resArr);
693
- ImageCache.setPdfType();
694
- }
695
639
  };
696
640
  }
@@ -22,10 +22,10 @@ const escapeCSVField = (field) => {
22
22
  };
23
23
 
24
24
  /**
25
- * Converts an array of objects with atomic properties (string, number, boolean) to a CSV string.
26
- * @param {Array<Object>} data - The array of data objects.
27
- * @returns {string} - The CSV string.
28
- */
25
+ * Converts an array of objects with atomic properties (string, number, boolean) to a CSV string.
26
+ * @param {Array<Object>} data - The array of data objects.
27
+ * @returns {string} - The CSV string.
28
+ */
29
29
  export const convertToCSV = (data) => {
30
30
  if (data.length === 0) {
31
31
  return '';
@@ -0,0 +1,110 @@
1
+ import { ImageCache } from './containers/imageContainer.js';
2
+ import { convertOCRAll } from './recognizeConvert.js';
3
+ import { ocrAllRaw, ocrAll } from './containers/dataContainer.js';
4
+
5
+ /**
6
+ * Extract raw text content from currently loaded PDF.
7
+ * Reports whether PDF is text-native, contains invisible OCR text, or is image-only.
8
+ */
9
+ const extractInternalPDFTextRaw = async () => {
10
+ const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
11
+
12
+ const pdfContentStats = {
13
+ /** Total number of letters in the source PDF. */
14
+ letterCountTotal: 0,
15
+ /** Total number of visible letters in the source PDF. */
16
+ letterCountVis: 0,
17
+ /** Total number of pages with 100+ letters in the source PDF. */
18
+ pageCountTotalText: 0,
19
+ /** Total number of pages with 100+ visible letters in the source PDF. */
20
+ pageCountVisText: 0,
21
+ };
22
+
23
+ const stextArr = /** @type {Array<string>} */ ([]);
24
+ const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
25
+ const resArr = pageDPI.map(async (x, i) => {
26
+ // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
27
+ // The XML format is the only built-in mupdf format that includes character-level granularity.
28
+ const res = await muPDFScheduler.pageText({
29
+ page: i + 1, dpi: x, format: 'xml', calcStats: true,
30
+ });
31
+ pdfContentStats.letterCountTotal += res.letterCountTotal;
32
+ pdfContentStats.letterCountVis += res.letterCountVis;
33
+ if (res.letterCountTotal >= 100) pdfContentStats.pageCountTotalText++;
34
+ if (res.letterCountVis >= 100) pdfContentStats.pageCountVisText++;
35
+ stextArr[i] = res.content;
36
+ });
37
+ await Promise.all(resArr);
38
+
39
+ /** @type {"image" | "text" | "ocr"} */
40
+ let type = 'image';
41
+
42
+ // Determine whether the PDF is text-native, image-only, or image + OCR.
43
+ {
44
+ // The PDF is considered text-native if:
45
+ // (1) The total number of visible letters is at least 100 per page on average.
46
+ // (2) The total number of visible letters is at least 90% of the total number of letters.
47
+ // (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
48
+ if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
49
+ && pdfContentStats.letterCountVis >= pdfContentStats.letterCountTotal * 0.9
50
+ && pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
51
+ type = 'text';
52
+ // The PDF is considered ocr-native if:
53
+ // (1) The total number of letters is at least 100 per page on average.
54
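+ // (2) The total number of visible letters is at least half of the total number of pages. NOTE(review): possibly intended to be "pages with 100+ letters is at least half of the total number of pages" — confirm.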
55
+ } else if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
56
+ && pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
57
+ type = 'ocr';
58
+ // Otherwise, the PDF is considered image-native.
59
+ // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
60
+ } else {
61
+ type = 'image';
62
+ }
63
+ }
64
+
65
+ return { contentRaw: stextArr, content: /** @type {?Array<OcrPage>} */ (null), type };
66
+ };
67
+
68
+ /**
69
+ * Extract and parse text from currently loaded PDF.
70
+ * @param {Object} [options]
71
+ * @param {boolean} [options.extractPDFTextNative=true] - Extract text from text-native PDF documents.
72
+ * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
73
+ * @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
74
+ * This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
75
+ * @param {boolean} [options.setActive=false] - Set the active OCR data to the extracted text.
76
+ */
77
+ export const extractInternalPDFText = async (options = {}) => {
78
+ const extractPDFTextNative = options?.extractPDFTextNative ?? true;
79
+ const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
80
+ const extractPDFTextImage = options?.extractPDFTextImage ?? false;
81
+
82
+ const setActive = options?.setActive ?? false;
83
+
84
+ const res = await extractInternalPDFTextRaw();
85
+
86
+ ImageCache.pdfType = res.type;
87
+ ocrAllRaw.pdf = res.contentRaw;
88
+
89
+ if (!extractPDFTextImage && res.type === 'image') return res;
90
+
91
+ if (!extractPDFTextOCR && res.type === 'ocr') return res;
92
+
93
+ if (!extractPDFTextNative && res.type === 'text') return res;
94
+
95
+ ocrAll.pdf = Array(ImageCache.pageCount);
96
+
97
+ if (setActive) {
98
+ ocrAllRaw.active = ocrAllRaw.pdf;
99
+ ocrAll.active = ocrAll.pdf;
100
+ }
101
+
102
+ const format = 'stext';
103
+
104
+ // Process HOCR using web worker, reading from file first if that has not been done already
105
+ await convertOCRAll(ocrAllRaw.active, true, format, 'pdf', false);
106
+
107
+ res.content = ocrAll.pdf;
108
+
109
+ return res;
110
+ };
@@ -198,7 +198,7 @@ export class gs {
198
198
  workerN = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 6);
199
199
  }
200
200
 
201
- const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('tesseract.js');
201
+ const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
202
202
 
203
203
  gs.schedulerInner = await Tesseract.createScheduler();
204
204
  gs.schedulerInner.workers = new Array(workerN);
@@ -263,12 +263,12 @@ export class gs {
263
263
  static getGeneralScheduler = async () => {
264
264
  if (gs.schedulerReady) {
265
265
  await gs.schedulerReady;
266
- return gs.scheduler;
266
+ return /** @type {GeneralScheduler} */ (gs.scheduler);
267
267
  }
268
268
 
269
269
  await gs.init();
270
270
 
271
- return gs.scheduler;
271
+ return /** @type {GeneralScheduler} */ (gs.scheduler);
272
272
  };
273
273
 
274
274
  static terminate = async () => {
@@ -9,7 +9,7 @@ import { getTextScript } from '../utils/miscUtils.js';
9
9
 
10
10
  /**
11
11
  * @param {Object} params
12
- * @param {Array<import('tesseract.js').Block>} params.ocrBlocks
12
+ * @param {Array<import('@scribe.js/tesseract.js').Block>} params.ocrBlocks
13
13
  * @param {number} params.n
14
14
  * @param {dims} params.pageDims
15
15
  * @param {number} params.rotateAngle - The angle that the input image is rotated prior to recognition.
@@ -11,7 +11,9 @@ import {
11
11
  } from '../containers/dataContainer.js';
12
12
  import { fontAll } from '../containers/fontContainer.js';
13
13
  import { ImageCache, imageUtils, ImageWrapper } from '../containers/imageContainer.js';
14
- import { enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto, loadBuiltInFontsRaw } from '../fontContainerMain.js';
14
+ import {
15
+ enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto, loadBuiltInFontsRaw,
16
+ } from '../fontContainerMain.js';
15
17
  import { runFontOptimization } from '../fontEval.js';
16
18
  import { calcFontMetricsFromPages } from '../fontStatistics.js';
17
19
  import { gs } from '../generalWorkerMain.js';
@@ -20,6 +22,7 @@ import { PageMetrics } from '../objects/pageMetricsObjects.js';
20
22
  import { checkCharWarn, convertOCRAll } from '../recognizeConvert.js';
21
23
  import { replaceObjectProperties } from '../utils/miscUtils.js';
22
24
  import { importOCRFiles } from './importOCR.js';
25
+ import { extractInternalPDFText } from '../extractPDFText.js';
23
26
 
24
27
  /**
25
28
  * Automatically detects the image type (jpeg or png).
@@ -185,12 +188,18 @@ export function sortInputFiles(files) {
185
188
  * Alternatively, for `File` objects (browser) and file paths (Node.js), a single array can be provided, which is sorted based on extension.
186
189
  * @public
187
190
  * @param {Array<File>|FileList|Array<string>|SortedInputFiles} files
191
+ * @param {Object} [options]
192
+ * @param {boolean} [options.extractPDFTextNative=false] - Extract text from text-native PDF documents.
193
+ * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
188
194
  * @returns
189
195
  */
190
- export async function importFiles(files) {
196
+ export async function importFiles(files, options = {}) {
191
197
  clearData();
192
198
  gs.getGeneralScheduler();
193
199
 
200
+ const extractPDFTextNative = options?.extractPDFTextNative ?? false;
201
+ const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
202
+
194
203
  /** @type {Array<File|FileNode|ArrayBuffer>} */
195
204
  let pdfFiles = [];
196
205
  /** @type {Array<File|FileNode|ArrayBuffer>} */
@@ -266,10 +275,6 @@ export async function importFiles(files) {
266
275
 
267
276
  const xmlModeImport = ocrFiles.length > 0;
268
277
 
269
- // Extract text from PDF document
270
- // Only enabled if (1) user selects this option, (2) user uploads a PDF, and (3) user does not upload XML data.
271
- inputData.extractTextMode = opt.extractText && inputData.pdfMode && !xmlModeImport;
272
-
273
278
  let pageCount;
274
279
  let pageCountImage;
275
280
  let abbyyMode = false;
@@ -284,7 +289,7 @@ export async function importFiles(files) {
284
289
  const pdfFileData = pdfFile instanceof ArrayBuffer ? pdfFile : await pdfFile.arrayBuffer();
285
290
 
286
291
  // If no XML data is provided, page sizes are calculated using muPDF alone
287
- await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText, inputData.extractTextMode);
292
+ await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText);
288
293
 
289
294
  pageCountImage = ImageCache.pageCount;
290
295
  ImageCache.loadCount = ImageCache.pageCount;
@@ -315,7 +320,7 @@ export async function importFiles(files) {
315
320
 
316
321
  // Restore font metrics and optimize font from previous session (if applicable)
317
322
  if (ocrData.fontMetricsObj && Object.keys(ocrData.fontMetricsObj).length > 0) {
318
- const fontPromise = loadBuiltInFontsRaw()
323
+ const fontPromise = loadBuiltInFontsRaw();
319
324
 
320
325
  existingOpt = true;
321
326
 
@@ -368,11 +373,6 @@ export async function importFiles(files) {
368
373
  scribeMode = ocrData.scribeMode;
369
374
 
370
375
  stextMode = ocrData.stextMode;
371
- } else if (inputData.extractTextMode) {
372
- // Initialize a new array on `ocrAll` if one does not already exist
373
- if (!ocrAll[oemName]) ocrAll[oemName] = Array(inputData.pageCount);
374
- ocrAll.active = ocrAll[oemName];
375
- stextMode = true;
376
376
  }
377
377
 
378
378
  const pageCountHOCR = ocrAllRaw.active?.length;
@@ -424,7 +424,7 @@ export async function importFiles(files) {
424
424
  }
425
425
  }
426
426
 
427
- if (xmlModeImport || inputData.extractTextMode) {
427
+ if (xmlModeImport) {
428
428
  /** @type {("hocr" | "abbyy" | "stext")} */
429
429
  let format = 'hocr';
430
430
  if (abbyyMode) format = 'abbyy';
@@ -439,6 +439,8 @@ export async function importFiles(files) {
439
439
  opt.enableOpt = await runFontOptimization(ocrAll.active);
440
440
  }
441
441
  });
442
+ } else if (extractPDFTextNative || extractPDFTextOCR) {
443
+ await extractInternalPDFText({ setActive: true, extractPDFTextNative, extractPDFTextOCR });
442
444
  }
443
445
  }
444
446
 
@@ -463,6 +465,8 @@ export async function importFilesSupp(files, ocrName) {
463
465
 
464
466
  const ocrData = await importOCRFiles(ocrFilesAll);
465
467
 
468
+ const scribeMode = ocrData.scribeMode;
469
+
466
470
  const pageCountHOCR = ocrData.hocrRaw.length;
467
471
 
468
472
  // If both OCR data and image data are present, confirm they have the same number of pages
@@ -476,5 +480,5 @@ export async function importFilesSupp(files, ocrName) {
476
480
  if (ocrData.abbyyMode) format = 'abbyy';
477
481
  if (ocrData.stextMode) format = 'stext';
478
482
 
479
- convertOCRAll(ocrData.hocrRaw, false, format, ocrName);
483
+ await convertOCRAll(ocrData.hocrRaw, false, format, ocrName, scribeMode);
480
484
  }
@@ -28,7 +28,7 @@ export async function importOCRFiles(ocrFilesAll) {
28
28
  let pageCountHOCR;
29
29
  let hocrRaw;
30
30
  /** @type {?Object.<string, FontMetricsFamily>} */
31
- let fontMetricsObj;
31
+ let fontMetricsObj = null;
32
32
  /** @type{?Array<import('../objects/layoutObjects.js').LayoutPage>} */
33
33
  let layoutObj = null;
34
34
  /** @type{?Array<import('../objects/layoutObjects.js').LayoutDataTablePage>} */
@@ -42,9 +42,9 @@ export async function importOCRFiles(ocrFilesAll) {
42
42
  const hocrStrAll = await readOcrFile(ocrFilesAll[0]);
43
43
 
44
44
  // Check whether input is Abbyy XML
45
- const node2 = hocrStrAll.match(/>([^>]+)/)[1];
46
- abbyyMode = !!/abbyy/i.test(node2);
47
- stextMode = !!/<document name/.test(node2);
45
+ const node2 = hocrStrAll.match(/>([^>]+)/)?.[1];
46
+ abbyyMode = !!node2 && !!/abbyy/i.test(node2);
47
+ stextMode = !!node2 && !!/<document name/.test(node2);
48
48
 
49
49
  if (abbyyMode) {
50
50
  hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
@@ -67,8 +67,8 @@ export async function importOCRFiles(ocrFilesAll) {
67
67
 
68
68
  // Check whether input is Abbyy XML using the first file
69
69
  const hocrStrFirst = await readOcrFile(ocrFilesAll[0]);
70
- const node2 = hocrStrFirst.match(/>([^>]+)/)[1];
71
- abbyyMode = !!/abbyy/i.test(node2);
70
+ const node2 = hocrStrFirst.match(/>([^>]+)/)?.[1];
71
+ abbyyMode = !!node2 && !!/abbyy/i.test(node2);
72
72
 
73
73
  for (let i = 0; i < pageCountHOCR; i++) {
74
74
  const hocrFile = ocrFilesAll[i];
@@ -24,7 +24,19 @@ import { replaceObjectProperties } from './utils/miscUtils.js';
24
24
  */
25
25
  export const compareOCRPage = async (pageA, pageB, options) => {
26
26
  const func = typeof process !== 'undefined' ? (await import('./worker/compareOCRModule.js')).compareOCRPageImp : gs.scheduler.compareOCRPageImp;
27
- const binaryImage = await ImageCache.getBinary(pageA.n);
27
+
28
+ // Some combinations of options require the image to be provided, and some do not.
29
+ // We skip sending the image for those that do not, as in addition to helping performance,
30
+ // this is also necessary to run basic comparison scripts (e.g. benchmarking accuracy) without providing the image.
31
+ // TODO: Rework the options so this works better with types.
32
+ // At present TypeScript has no way of knowing that certain combinations of options go with each other.
33
+ const mode = options?.mode || 'stats';
34
+ const evalConflicts = options?.evalConflicts ?? true;
35
+ const supplementComp = options?.supplementComp ?? false;
36
+ const skipImage = (mode === 'stats' && !supplementComp) || (mode === 'comb' && !evalConflicts && !supplementComp);
37
+
38
+ const binaryImage = skipImage ? null : await ImageCache.getBinary(pageA.n);
39
+
28
40
  const pageMetricsObj = pageMetricsArr[pageA.n];
29
41
  return func({
30
42
  pageA, pageB, binaryImage, pageMetricsObj, options,
@@ -51,7 +63,7 @@ export const evalOCRPage = async (params) => {
51
63
  * Compare two sets of OCR data.
52
64
  * @param {Array<OcrPage>} ocrA
53
65
  * @param {Array<OcrPage>} ocrB
54
- * @param {Parameters<import('./worker/compareOCRModule.js').compareOCRPageImp>[0]['options']} options
66
+ * @param {Parameters<import('./worker/compareOCRModule.js').compareOCRPageImp>[0]['options']} [options]
55
67
  */
56
68
  export const compareOCR = async (ocrA, ocrB, options) => {
57
69
  /** @type {Parameters<typeof compareOCRPage>[2]} */
@@ -486,10 +486,19 @@ async function penalizeWord(wordObjs) {
486
486
  export async function compareOCRPageImp({
487
487
  pageA, pageB, binaryImage, pageMetricsObj, options = {},
488
488
  }) {
489
- const binaryImageBit = binaryImage.imageBitmap || await getImageBitmap(binaryImage.src);
490
-
491
- const imageUpscaled = binaryImage.upscaled;
492
- const imageRotated = binaryImage.rotated;
489
+ // The `binaryImage` argument is not sent for certain operations, which do not require it.
490
+ // For example, running a basic comparison between a page and the ground truth does not require having the image.
491
+ // The types do not currently reflect this, so this should be reworked at some point.
492
+ /** @type {?ImageBitmap} */
493
+ let binaryImageBit = null;
494
+ let imageUpscaled = false;
495
+ let imageRotated = false;
496
+
497
+ if (binaryImage) {
498
+ binaryImageBit = binaryImage.imageBitmap || await getImageBitmap(binaryImage.src);
499
+ imageUpscaled = binaryImage.upscaled;
500
+ imageRotated = binaryImage.rotated;
501
+ }
493
502
 
494
503
  const mode = options?.mode === undefined ? 'stats' : options?.mode;
495
504
  const editConf = options?.editConf === undefined ? false : options?.editConf;
@@ -17,7 +17,7 @@ import { optimizeFont } from './optimizeFontModule.js';
17
17
  // import Tesseract from "../../tess/tesseract.esm.min.js";
18
18
  const browserMode = typeof process === 'undefined';
19
19
 
20
- const Tesseract = browserMode ? (await import('../../tess/tesseract.esm.min.js')).default : await import('tesseract.js/src/index.js');
20
+ const Tesseract = browserMode ? (await import('../../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
21
21
 
22
22
  const defaultConfigs = {
23
23
  // TODO: Add back support for multiple PSM modes.
@@ -135,7 +135,7 @@ export const recognizeAndConvert = async ({
135
135
 
136
136
  const keepItalic = oemCurrent === 0;
137
137
 
138
- const ocrBlocks = /** @type {Array<import('tesseract.js').Block>} */(res1.data.blocks);
138
+ const ocrBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res1.data.blocks);
139
139
 
140
140
  const res2 = await convertPageBlocks({
141
141
  ocrBlocks, n, pageDims, rotateAngle: angle, keepItalic,
@@ -184,14 +184,14 @@ export const recognizeAndConvert2 = async ({
184
184
  let resLegacy;
185
185
  let resLSTM;
186
186
  if (options.lstm && options.legacy) {
187
- const legacyBlocks = /** @type {Array<import('tesseract.js').Block>} */(res0.data.blocks);
187
+ const legacyBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res0.data.blocks);
188
188
  resLegacy = await convertPageBlocks({
189
189
  ocrBlocks: legacyBlocks, n, pageDims, rotateAngle: angle, keepItalic: true, upscale: options.upscale,
190
190
  });
191
191
  (async () => {
192
192
  const res1 = await resArr[1];
193
193
 
194
- const lstmBlocks = /** @type {Array<import('tesseract.js').Block>} */(res1.data.blocks);
194
+ const lstmBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res1.data.blocks);
195
195
  resLSTM = await convertPageBlocks({
196
196
  ocrBlocks: lstmBlocks, n, pageDims, rotateAngle: angle, keepItalic: false, upscale: options.upscale,
197
197
  });
@@ -201,12 +201,12 @@ export const recognizeAndConvert2 = async ({
201
201
  postMessage({ data: xB, id: `${id}b` });
202
202
  })();
203
203
  } else if (!options.lstm && options.legacy) {
204
- const legacyBlocks = /** @type {Array<import('tesseract.js').Block>} */(res0.data.blocks);
204
+ const legacyBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res0.data.blocks);
205
205
  resLegacy = await convertPageBlocks({
206
206
  ocrBlocks: legacyBlocks, n, pageDims, rotateAngle: angle, keepItalic: true, upscale: options.upscale,
207
207
  });
208
208
  } else if (options.lstm && !options.legacy) {
209
- const lstmBlocks = /** @type {Array<import('tesseract.js').Block>} */(res0.data.blocks);
209
+ const lstmBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res0.data.blocks);
210
210
  resLSTM = await convertPageBlocks({
211
211
  ocrBlocks: lstmBlocks, n, pageDims, rotateAngle: angle, keepItalic: false, upscale: options.upscale,
212
212
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "scribe.js-ocr",
3
- "version": "0.1.1",
3
+ "version": "0.2.1",
4
4
  "description": "High-quality OCR and text extraction for images and PDFs.",
5
5
  "main": "scribe.js",
6
6
  "directories": {
@@ -52,7 +52,7 @@
52
52
  "canvas": "^2.11.2",
53
53
  "commander": "^11.1.0",
54
54
  "puppeteer": "^22.13.0",
55
- "tesseract.js": "scribeocr/tesseract.js#2065fd6",
55
+ "@scribe.js/tesseract.js": "^5.0.5",
56
56
  "web-worker": "~1.2.0"
57
57
  }
58
58
  }
package/scribe.js CHANGED
@@ -11,7 +11,7 @@ import { ImageCache } from './js/containers/imageContainer.js';
11
11
  import coords from './js/coordinates.js';
12
12
  import { drawDebugImages } from './js/debug.js';
13
13
  import { download, exportData } from './js/export/export.js';
14
- import { writeDebugCsv } from './js/export/exportDebugCsv.js';
14
+ import { writeDebugCsv, convertToCSV } from './js/export/exportDebugCsv.js';
15
15
  import { extractSingleTableContent } from './js/export/exportWriteTabular.js';
16
16
  import { loadBuiltInFontsRaw, enableFontOpt } from './js/fontContainerMain.js';
17
17
  import { gs } from './js/generalWorkerMain.js';
@@ -30,6 +30,7 @@ import { imageStrToBlob } from './js/utils/imageUtils.js';
30
30
  import { countSubstringOccurrences, getRandomAlphanum, replaceSmartQuotes } from './js/utils/miscUtils.js';
31
31
  import { calcConf, mergeOcrWords, splitOcrWord } from './js/utils/ocrUtils.js';
32
32
  import { assignParagraphs } from './js/utils/reflowPars.js';
33
+ import { extractInternalPDFText } from './js/extractPDFText.js';
33
34
 
34
35
  /**
35
36
  * Initialize the program and optionally pre-load resources.
@@ -66,18 +67,23 @@ const init = async (params) => {
66
67
  };
67
68
 
68
69
  /**
69
- * Helper function for recognizing files with a single function call.
70
+ * Function for extracting text from image and PDF files with a single function call.
71
+ * By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
70
72
  * For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
71
73
  * @public
72
74
  * @param {Parameters<typeof importFiles>[0]} files
73
75
  * @param {Array<string>} [langs=['eng']]
74
76
  * @param {Parameters<typeof exportData>[0]} [outputFormat='txt']
75
- * @returns
77
+ * @param {Object} [options]
78
+ * @param {boolean} [options.skipRecPDFTextNative=true] - If the input is a text-native PDF, skip recognition and return the existing text.
79
+ * @param {boolean} [options.skipRecPDFTextOCR=false] - If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text.
76
80
  */
77
- const recognizeFiles = async (files, langs = ['eng'], outputFormat = 'txt') => {
81
+ const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options = {}) => {
82
+ const skipRecPDFTextNative = options?.skipRecPDFTextNative ?? true;
83
+ const skipRecPDFTextOCR = options?.skipRecPDFTextOCR ?? false;
78
84
  init({ ocr: true, font: true });
79
- await importFiles(files);
80
- await recognize({ langs });
85
+ await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });
86
+ if (!(ImageCache.pdfType === 'text' && skipRecPDFTextNative || ImageCache.pdfType === 'ocr' && skipRecPDFTextOCR)) await recognize({ langs });
81
87
  return exportData(outputFormat);
82
88
  };
83
89
 
@@ -125,6 +131,8 @@ class utils {
125
131
  // Misc utils
126
132
  static calcBoxOverlap = calcBoxOverlap;
127
133
 
134
+ static convertToCSV = convertToCSV;
135
+
128
136
  static replaceSmartQuotes = replaceSmartQuotes;
129
137
 
130
138
  static getRandomAlphanum = getRandomAlphanum;
@@ -174,7 +182,8 @@ export default {
174
182
  opt,
175
183
  recognize,
176
184
  recognizePage,
177
- recognizeFiles,
185
+ extractText,
186
+ extractInternalPDFText,
178
187
  terminate,
179
188
  utils,
180
189
  };