scribe.js-ocr 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.eslintrc.json CHANGED
@@ -67,6 +67,12 @@
67
67
  // "one-var": "off",
68
68
  // "one-var-declaration-per-line": "off",
69
69
 
70
+ // If this is enabled eslint breaks our import statements, such that they no longer run natively in the browser.
71
+ "import/no-relative-packages": "off",
72
+
73
+ // Using blocks for purely organizational purposes (e.g. when in-lining a function) is fine.
74
+ "no-lone-blocks": "off",
75
+
70
76
  // This rule was deprecated
71
77
  "no-return-await": "off",
72
78
 
package/README.md CHANGED
@@ -9,4 +9,50 @@ Common use cases:
9
9
  3. Write `.pdf` files that include a high-quality invisible text layer.
10
10
  1. scribe.js can insert text into an existing `.pdf` file, making it searchable.
11
11
 
12
- Scribe.js is a library intended for developers. End users who want to scan documents should see the officially-supported GUI at [scribeocr.com](https://scribeocr.com/) (repo [here](https://github.com/scribeocr/scribeocr)).
12
+ Scribe.js is a library intended for developers. End users who want to scan documents should see the officially-supported GUI at [scribeocr.com](https://scribeocr.com/) (repo [here](https://github.com/scribeocr/scribeocr)).
13
+
14
+ # Setup
15
+ Install from `npm` by running the following:
16
+ ```sh
17
+ npm i scribe.js-ocr
18
+ ```
19
+
20
+ Scribe.js is written in JavaScript using ESM, so it can be imported directly from browser or Node.js JavaScript code.
21
+ ```js
22
+ // Import statement in browser:
23
+ import scribe from 'node_modules/scribe.js-ocr/scribe.js';
24
+ // Import statement for Node.js:
25
+ import scribe from 'scribe.js-ocr';
26
+
27
+ // Basic usage
28
+ scribe.extractText(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
29
+ .then((res) => console.log(res))
30
+ ```
31
+
32
+ When using Scribe.js in the browser, all files must be served from the same origin as the file importing Scribe.js. This means that importing Scribe.js from a CDN will not work. There is no UMD version.
33
+
34
+ # Scribe.js vs. Tesseract.js
35
+ Considering whether Scribe.js or Tesseract.js is better for your project? Read [this article](./docs/scribe_vs_tesseract.md).
36
+
37
+ # Documentation
38
+ - [Basic Browser Examples](./examples/browser/)
39
+ - [Basic Node.js Examples](./examples/node/)
40
+ - [Scribe.js vs. Tesseract.js Comparison](./docs/scribe_vs_tesseract.md)
41
+ - [API](./docs/API.md)
42
+
43
+ # Contributing
44
+ To work on a local copy, simply clone with `--recurse-submodules` and install. Please run the automated tests before making a PR.
45
+ ```sh
46
+ ## Clone the repo, including recursively cloning submodules
47
+ git clone --recurse-submodules git@github.com:scribeocr/scribe.js.git
48
+ cd scribe.js
49
+
50
+ ## Install dependencies
51
+ npm i
52
+
53
+ ## Make changes
54
+ ## [...]
55
+
56
+ ## Run automated tests before making PR
57
+ npm run test
58
+ ```
package/cli/extract.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import fs from 'fs';
2
2
  import path from 'path';
3
- import scribe from '../module.js';
3
+ import scribe from '../scribe.js';
4
4
 
5
5
  /**
6
6
  *
package/cli/main.js CHANGED
@@ -5,7 +5,7 @@ import fs from 'fs';
5
5
  import path from 'path';
6
6
 
7
7
  import { tmpUnique } from '../js/worker/compareOCRModule.js';
8
- import scribe from '../module.js';
8
+ import scribe from '../scribe.js';
9
9
 
10
10
  // When `debugMode` is enabled:
11
11
  // (1) Comparison images are saved as .png files.
package/docs/API.md ADDED
@@ -0,0 +1,191 @@
1
+ <!-- Generated by documentation.js. Update this documentation by updating the source code. -->
2
+
3
+ ### Table of Contents
4
+
5
+ * [init][1]
6
+ * [Parameters][2]
7
+ * [extractText][3]
8
+ * [Parameters][4]
9
+ * [clear][5]
10
+ * [terminate][6]
11
+ * [exportData][7]
12
+ * [Parameters][8]
13
+ * [download][9]
14
+ * [Parameters][10]
15
+ * [SortedInputFiles][11]
16
+ * [Properties][12]
17
+ * [importFiles][13]
18
+ * [Parameters][14]
19
+ * [recognizePage][15]
20
+ * [Parameters][16]
21
+ * [recognize][17]
22
+ * [Parameters][18]
23
+
24
+ ## init
25
+
26
+ Initialize the program and optionally pre-load resources.
27
+
28
+ ### Parameters
29
+
30
+ * `params` **[Object][19]?**&#x20;
31
+
32
+ * `params.pdf` **[boolean][20]** Load PDF renderer. (optional, default `false`)
33
+ * `params.ocr` **[boolean][20]** Load OCR engine. (optional, default `false`)
34
+ * `params.font` **[boolean][20]** Load built-in fonts.
35
+ The PDF renderer and OCR engine are automatically loaded when needed.
36
+ Therefore, the only reason to set `pdf` or `ocr` to `true` is to pre-load them. (optional, default `false`)
37
+
38
+ ## extractText
39
+
40
+ Function for extracting text from image and PDF files with a single function call.
41
+ By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
42
+ For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
43
+
44
+ ### Parameters
45
+
46
+ * `files` &#x20;
47
+ * `langs` **[Array][21]<[string][22]>** (optional, default `['eng']`)
48
+ * `outputFormat` (optional, default `'txt'`)
49
+ * `options` **[Object][19]?** (optional, default `{}`)
50
+
51
+ * `options.skipRecPDFTextNative` **[boolean][20]** If the input is a text-native PDF, skip recognition and return the existing text. (optional, default `true`)
52
+ * `options.skipRecPDFTextOCR` **[boolean][20]** If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text. (optional, default `false`)
53
+
54
+ ## clear
55
+
56
+ Clears all document-specific data.
57
+
58
+ ## terminate
59
+
60
+ Terminates the program and releases resources.
61
+
62
+ ## exportData
63
+
64
+ Export active OCR data to specified format.
65
+
66
+ ### Parameters
67
+
68
+ * `format` **(`"pdf"` | `"hocr"` | `"docx"` | `"xlsx"` | `"txt"` | `"text"`)** (optional, default `'txt'`)
69
+ * `minValue` **[number][23]** (optional, default `0`)
70
+ * `maxValue` **[number][23]** (optional, default `-1`)
71
+
72
+ Returns **[Promise][24]<([string][22] | [ArrayBuffer][25])>**&#x20;
73
+
74
+ ## download
75
+
76
+ Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
77
+
78
+ ### Parameters
79
+
80
+ * `format` **(`"pdf"` | `"hocr"` | `"docx"` | `"xlsx"` | `"txt"` | `"text"`)**&#x20;
81
+ * `fileName` **[string][22]**&#x20;
82
+ * `minValue` **[number][23]** (optional, default `0`)
83
+ * `maxValue` **[number][23]** (optional, default `-1`)
84
+
85
+ ## SortedInputFiles
86
+
87
+ An object with this shape can be used to provide input to the `importFiles` function,
88
+ without needing that function to figure out the file types.
89
+ This is required when using ArrayBuffer inputs.
90
+
91
+ Type: [Object][19]
92
+
93
+ ### Properties
94
+
95
+ * `pdfFiles` **([Array][21]\<File> | [Array][21]<[string][22]> | [Array][21]<[ArrayBuffer][25]>)?**&#x20;
96
+ * `imageFiles` **([Array][21]\<File> | [Array][21]<[string][22]> | [Array][21]<[ArrayBuffer][25]>)?**&#x20;
97
+ * `ocrFiles` **([Array][21]\<File> | [Array][21]<[string][22]> | [Array][21]<[ArrayBuffer][25]>)?**&#x20;
98
+
99
+ ## importFiles
100
+
101
+ Import files for processing.
102
+ An object with `pdfFiles`, `imageFiles`, and `ocrFiles` arrays can be provided to import multiple types of files.
103
+ Alternatively, for `File` objects (browser) and file paths (Node.js), a single array can be provided, which is sorted based on extension.
104
+
105
+ ### Parameters
106
+
107
+ * `files` **([Array][21]\<File> | FileList | [Array][21]<[string][22]> | [SortedInputFiles][11])**&#x20;
108
+ * `options` **[Object][19]?** (optional, default `{}`)
109
+
110
+ * `options.extractPDFTextNative` **[boolean][20]** Extract text from text-native PDF documents. (optional, default `false`)
111
+ * `options.extractPDFTextOCR` **[boolean][20]** Extract text from image-native PDF documents with existing OCR text layers. (optional, default `false`)
112
+
113
+ ## recognizePage
114
+
115
+ Recognize a single page in active document.
116
+ Use `recognize` instead to recognize all pages in a document.
117
+
118
+ ### Parameters
119
+
120
+ * `n` **[number][23]** Page number to recognize.
121
+ * `legacy` **[boolean][20]** *
122
+ * `lstm` **[boolean][20]** *
123
+ * `areaMode` **[boolean][20]** *
124
+ * `tessOptions` **[Object][19]<[string][22], [string][22]>** Options to pass to Tesseract.js. (optional, default `{}`)
125
+ * `debugVis` **[boolean][20]** Generate instructions for debugging visualizations. (optional, default `false`)
126
+
127
+ ## recognize
128
+
129
+ Recognize all pages in active document.
130
+ Files for recognition should already be imported using `importFiles` before calling this function.
131
+ The results of recognition can be exported by calling `exportData` after this function.
132
+
133
+ ### Parameters
134
+
135
+ * `options` **[Object][19]** (optional, default `{}`)
136
+
137
+ * `options.mode` **(`"speed"` | `"quality"`)** Recognition mode. (optional, default `'quality'`)
138
+ * `options.langs` **[Array][21]<[string][22]>** Language(s) in document. (optional, default `['eng']`)
139
+ * `options.modeAdv` **(`"lstm"` | `"legacy"` | `"combined"`)** Alternative method of setting recognition mode. (optional, default `'combined'`)
140
+ * `options.combineMode` **(`"conf"` | `"data"`)** Method of combining OCR results. Used if OCR data already exists. (optional, default `'data'`)
141
+ * `options.vanillaMode` **[boolean][20]** Whether to use the vanilla Tesseract.js model. (optional, default `false`)
142
+
143
+ [1]: #init
144
+
145
+ [2]: #parameters
146
+
147
+ [3]: #extracttext
148
+
149
+ [4]: #parameters-1
150
+
151
+ [5]: #clear
152
+
153
+ [6]: #terminate
154
+
155
+ [7]: #exportdata
156
+
157
+ [8]: #parameters-2
158
+
159
+ [9]: #download
160
+
161
+ [10]: #parameters-3
162
+
163
+ [11]: #sortedinputfiles
164
+
165
+ [12]: #properties
166
+
167
+ [13]: #importfiles
168
+
169
+ [14]: #parameters-4
170
+
171
+ [15]: #recognizepage
172
+
173
+ [16]: #parameters-5
174
+
175
+ [17]: #recognize
176
+
177
+ [18]: #parameters-6
178
+
179
+ [19]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Object
180
+
181
+ [20]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Boolean
182
+
183
+ [21]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array
184
+
185
+ [22]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String
186
+
187
+ [23]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Number
188
+
189
+ [24]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Promise
190
+
191
+ [25]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/ArrayBuffer
@@ -0,0 +1,39 @@
1
+ # Overview
2
+ Scribe.js and Tesseract.js are both JavaScript packages that allow for running OCR in the browser or Node.js. As both packages have advantages and disadvantages, this article explains how the packages differ, and should help developers decide which package is right for their project.
3
+
4
+ ## TL;DR
5
+ Tesseract.js is smaller and faster than Scribe.js. Projects that only need to extract text from `.png` and `.jpeg` images, and are satisfied with "pretty good" accuracy, should use Tesseract.js. Scribe.js builds on Tesseract.js by providing more accurate OCR results and more features. Most notably, Scribe.js provides PDF support, including the ability to extract existing text from PDFs, run OCR on PDFs, and add text layers to PDFs. Developers unsure of the tradeoffs should try both packages using images their application is likely to encounter.
6
+
7
+ # Scope
8
+ The reason why Tesseract.js and Scribe.js exist as separate packages, despite providing similar features and containing shared code, is that the scope of both projects is different. Tesseract.js has a significantly narrower scope compared to Scribe.js.
9
+
10
+ **The goal of Tesseract.js is to bring Tesseract--a popular program we do not maintain--to JavaScript.** As long as the JavaScript interface is user-friendly and works correctly, and recognition results are similar to Tesseract on desktop, Tesseract.js is working as intended. All bugs inherited from the main Tesseract codebase are outside of the scope of the Tesseract.js project. As a result, a large number of Tesseract.js Git Issues, including virtually all accuracy-related issues, are closed as out of scope.
11
+
12
+ **The goal of Scribe.js is to provide high-quality text extraction in JavaScript.** Scribe.js was created to build on Tesseract.js and support many valid bug reports and feature requests that are outside of the scope of Tesseract.js. For example, two of the most common requests from Tesseract.js users are improved OCR accuracy and PDF support. Scribe.js (optionally) includes a custom OCR model that differs from, and generally outperforms, Tesseract. When provided a text-native `.pdf`, Scribe.js can bypass OCR entirely and return the raw text.
13
+
14
+ # Differences
15
+ ### PDF Support
16
+ Tesseract.js does not support `.pdf` files. The only way to extract text from `.pdf` files using Tesseract.js is to render the `.pdf` file into a series of `.png` images using a separate library and then recognize those `.png` images. In addition to being slow, this process is often unnecessary, as many modern `.pdf` files are already text-native, meaning that no OCR needs to occur.
17
+
18
+ Scribe OCR does support `.pdf` files, and can extract text from `.pdf` files in multiple ways. Scribe OCR can recognize the contents of the `.pdf` file using OCR. Additionally, for `.pdf` files that are text-native or contain an existing OCR layer, the existing text can be extracted directly. The latter method is significantly faster compared to rendering the `.pdf` to images and running OCR.
19
+
20
+ ### OCR Quality
21
+ Scribe.js produces results that are generally more accurate than Tesseract.js.
22
+ 1. Particularly for high-quality scans and screenshots, Scribe.js misidentifies fewer words.
23
+ 2. Scribe.js often recognizes words that are skipped entirely by Tesseract.
24
+ 3. Scribe.js can identify font styles, which Tesseract is incapable of.
25
+ 1. This can be observed by using the GUI at [scribeocr.com](https://scribeocr.com/).
26
+
27
+ ### GUI
28
+ Scribe OCR contains a GUI web application that end users can use to scan documents. Tesseract.js is intended for use by developers within other applications, so it is unsuitable for end users.
29
+
30
+ ### File Size
31
+ The additional features added by Scribe.js take up more space. Enabling PDF support requires loading multiple megabytes of dependencies. Using the Scribe.js default `quality` OCR model loads more language data than Tesseract.js does by default.
32
+
33
+ Notably, these resources are only loaded if requested--the PDF resources are only loaded if a PDF file is uploaded or exported, and setting OCR mode to `speed` prevents additional data from being downloaded. However, if all optional features are disabled, Scribe.js has little to offer over Tesseract.js.
34
+
35
+ ### Speed
36
+ The Scribe.js default `quality` recognition mode runs additional recognition and checks, which therefore increases runtime. The amount varies significantly document-to-document, but is often in the range of a 40-90% increase versus the `speed` mode (which provides results similar to Tesseract.js). For applications where accuracy is not critical, this increase in runtime may not be worth it.
37
+
38
+ ### License
39
+ Tesseract.js is Apache 2.0 licensed. This is a permissive license that imposes no meaningful restrictions on use. Scribe.js is AGPL 3.0 licensed, which is a copyleft license. As a result, to use Scribe.js in your program--whether on the front-end or server-side--you must either (1) publish your program under AGPL 3.0 or a compatible license or (2) obtain a proprietary license (contact admin@scribeocr.com).
@@ -0,0 +1,9 @@
1
+ <!DOCTYPE HTML>
2
+ <html>
3
+ <head>
4
+ <script src="./recognize-basic.js" type="module"></script>
5
+ </head>
6
+ <body>
7
+ <input type="file" id="uploader" multiple>
8
+ </body>
9
+ </html>
@@ -0,0 +1,10 @@
1
+ import scribe from '../../scribe.js';
2
+ // Pre-load OCR and font data to avoid delay when user uploads a file.
3
+ await scribe.init({ ocr: true, font: true });
4
+
5
+ const elm = /** @type {HTMLInputElement} */ (document.getElementById('uploader'));
6
+ elm.addEventListener('change', async () => {
7
+ if (!elm.files) return;
8
+ const text = await scribe.extractText(elm.files);
9
+ console.log(text);
10
+ });
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env node
2
+ // Run `node examples/node/recognize-basic.js path/to/image.jpg` to recognize text in an image.
3
+ import scribe from '../../scribe.js';
4
+
5
+ const [,, imagePath] = process.argv;
6
+
7
+ (async () => {
8
+ const res = await scribe.extractText([imagePath]);
9
+ console.log(res);
10
+ await scribe.terminate();
11
+ })();
@@ -74,9 +74,6 @@ export class inputData {
74
74
  /** `true` if user re-uploaded HOCR data created by Scribe OCR */
75
75
  static resumeMode = false;
76
76
 
77
- /** `true` if stext is extracted from a PDF (rather than text layer uploaded seprately) */
78
- static extractTextMode = false;
79
-
80
77
  /** `true` if ground truth data is uploaded */
81
78
  static evalMode = false;
82
79
 
@@ -216,52 +216,19 @@ export class ImageCache {
216
216
  static pageCount = 0;
217
217
 
218
218
  /**
219
- * The dimensions that each page would be, if it was rendered at 300 DPI.
220
- * @type {Array<dims>}
221
- */
222
- static pdfDims300Arr = [];
219
+ * The dimensions that each page would be, if it was rendered at 300 DPI.
220
+ * @type {Array<dims>}
221
+ */
222
+ static pdfDims300 = [];
223
223
 
224
224
  static inputModes = {
225
225
  pdf: false,
226
226
  image: false,
227
227
  };
228
228
 
229
- static pdfContentStats = {
230
- /** Total number of letters in the source PDF. */
231
- letterCountTotal: 0,
232
- /** Total number of visible letters in the source PDF. */
233
- letterCountVis: 0,
234
- /** Total number of pages with 100+ letters in the source PDF. */
235
- pageCountTotalText: 0,
236
- /** Total number of pages with 100+ visible letters in the source PDF. */
237
- pageCountVisText: 0,
238
- };
239
-
240
229
  /** @type {?('text'|'ocr'|'image')} */
241
230
  static pdfType = null;
242
231
 
243
- static setPdfType = () => {
244
- // The PDF is considered text-native if:
245
- // (1) The total number of visible letters is at least 100 per page on average.
246
- // (2) The total number of visible letters is at least 90% of the total number of letters.
247
- // (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
248
- if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
249
- && ImageCache.pdfContentStats.letterCountVis >= ImageCache.pdfContentStats.letterCountTotal * 0.9
250
- && ImageCache.pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
251
- ImageCache.pdfType = 'text';
252
- // The PDF is considered ocr-native if:
253
- // (1) The total number of letters is at least 100 per page on average.
254
- // (2) The total number of letters is at least half of the total number of letters.
255
- } else if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
256
- && ImageCache.pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
257
- ImageCache.pdfType = 'ocr';
258
- // Otherwise, the PDF is considered image-native.
259
- // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
260
- } else {
261
- ImageCache.pdfType = 'image';
262
- }
263
- };
264
-
265
232
  static colorModeDefault = 'gray';
266
233
 
267
234
  static cacheRenderPages = 3;
@@ -327,7 +294,7 @@ export class ImageCache {
327
294
  } if (ImageCache.inputModes.pdf) {
328
295
  const pageMetrics = pageMetricsArr[n];
329
296
  const targetWidth = pageMetrics.dims.width;
330
- const dpi = 300 * (targetWidth / ImageCache.pdfDims300Arr[n].width);
297
+ const dpi = 300 * (targetWidth / ImageCache.pdfDims300[n].width);
331
298
  const muPDFScheduler = await ImageCache.getMuPDFScheduler();
332
299
  return muPDFScheduler.drawPageAsPNG({
333
300
  page: n + 1, dpi, color, skipText: skipTextMode,
@@ -566,14 +533,10 @@ export class ImageCache {
566
533
  ImageCache.inputModes.image = false;
567
534
  ImageCache.inputModes.pdf = false;
568
535
  ImageCache.pageCount = 0;
569
- ImageCache.pdfDims300Arr.length = 0;
536
+ ImageCache.pdfDims300.length = 0;
570
537
  ImageCache.loadCount = 0;
571
538
  ImageCache.nativeProps.length = 0;
572
539
  ImageCache.binaryProps.length = 0;
573
- ImageCache.pdfContentStats.letterCountTotal = 0;
574
- ImageCache.pdfContentStats.letterCountVis = 0;
575
- ImageCache.pdfContentStats.pageCountTotalText = 0;
576
- ImageCache.pdfContentStats.pageCountVisText = 0;
577
540
  };
578
541
 
579
542
  static terminate = async () => {
@@ -600,9 +563,8 @@ export class ImageCache {
600
563
  *
601
564
  * @param {ArrayBuffer} fileData
602
565
  * @param {Boolean} [skipText=false] - Whether to skip native text when rendering PDF to image.
603
- * @param {Boolean} [extractStext=false]
604
566
  */
605
- static openMainPDF = async (fileData, skipText = false, extractStext = false) => {
567
+ static openMainPDF = async (fileData, skipText = false) => {
606
568
  const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
607
569
 
608
570
  await ImageCache.#loadFileMuPDFScheduler(fileData);
@@ -611,9 +573,9 @@ export class ImageCache {
611
573
 
612
574
  const pageDims1 = await muPDFScheduler.workers[0].pageSizes([300]);
613
575
 
614
- ImageCache.pdfDims300Arr.length = 0;
576
+ ImageCache.pdfDims300.length = 0;
615
577
  pageDims1.forEach((x) => {
616
- ImageCache.pdfDims300Arr.push({ width: x[0], height: x[1] });
578
+ ImageCache.pdfDims300.push({ width: x[0], height: x[1] });
617
579
  });
618
580
 
619
581
  ImageCache.inputModes.pdf = true;
@@ -627,10 +589,10 @@ export class ImageCache {
627
589
 
628
590
  // For reasons that are unclear, a small number of pages have been rendered into massive files
629
591
  // so a hard-cap on resolution must be imposed.
630
- const pageDPI = ImageCache.pdfDims300Arr.map((x) => 300 * 2000 / x.width, 2000);
592
+ const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
631
593
 
632
594
  // In addition to capping the resolution, also switch the width/height
633
- ImageCache.pdfDims300Arr.forEach((x, i) => {
595
+ ImageCache.pdfDims300.forEach((x, i) => {
634
596
  const pageDims = { width: Math.round(x.width * pageDPI[i] / 300), height: Math.round(x.height * pageDPI[i] / 300) };
635
597
  pageMetricsArr[i] = new PageMetrics(pageDims);
636
598
  });
@@ -674,23 +636,5 @@ export class ImageCache {
674
636
  await setUploadFontsWorker(gs.schedulerInner);
675
637
  });
676
638
  }
677
-
678
- if (extractStext) {
679
- ocrAllRaw.active = Array(ImageCache.pageCount);
680
- const resArr = pageDPI.map(async (x, i) => {
681
- // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
682
- // The XML format is the only built-in mupdf format that includes character-level granularity.
683
- const res = await muPDFScheduler.pageText({
684
- page: i + 1, dpi: x, format: 'xml', calcStats: true,
685
- });
686
- ImageCache.pdfContentStats.letterCountTotal += res.letterCountTotal;
687
- ImageCache.pdfContentStats.letterCountVis += res.letterCountVis;
688
- if (res.letterCountTotal >= 100) ImageCache.pdfContentStats.pageCountTotalText++;
689
- if (res.letterCountVis >= 100) ImageCache.pdfContentStats.pageCountVisText++;
690
- ocrAllRaw.active[i] = res.content;
691
- });
692
- await Promise.all(resArr);
693
- ImageCache.setPdfType();
694
- }
695
639
  };
696
640
  }
@@ -8,12 +8,14 @@ import { renderHOCR } from './exportRenderHOCR.js';
8
8
  import { renderText } from './exportRenderText.js';
9
9
 
10
10
  /**
11
- * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} format
11
+ * Export active OCR data to specified format.
12
+ * @public
13
+ * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} [format='txt']
12
14
  * @param {number} [minValue=0]
13
15
  * @param {number} [maxValue=-1]
14
16
  * @returns {Promise<string|ArrayBuffer>}
15
17
  */
16
- export async function exportData(format, minValue = 0, maxValue = -1) {
18
+ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
17
19
  if (format === 'text') format = 'txt';
18
20
 
19
21
  if (maxValue === -1) maxValue = inputData.pageCount - 1;
@@ -184,6 +186,7 @@ export async function exportData(format, minValue = 0, maxValue = -1) {
184
186
 
185
187
  /**
186
188
  * Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
189
+ * @public
187
190
  * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} format
188
191
  * @param {string} fileName
189
192
  * @param {number} [minValue=0]
@@ -0,0 +1,110 @@
1
+ import { ImageCache } from './containers/imageContainer.js';
2
+ import { convertOCRAll } from './recognizeConvert.js';
3
+ import { ocrAllRaw, ocrAll } from './containers/dataContainer.js';
4
+
5
+ /**
6
+ * Extract raw text content from currently loaded PDF.
7
+ * Reports whether PDF is text-native, contains invisible OCR text, or is image-only.
8
+ */
9
+ const extractInternalPDFTextRaw = async () => {
10
+ const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
11
+
12
+ const pdfContentStats = {
13
+ /** Total number of letters in the source PDF. */
14
+ letterCountTotal: 0,
15
+ /** Total number of visible letters in the source PDF. */
16
+ letterCountVis: 0,
17
+ /** Total number of pages with 100+ letters in the source PDF. */
18
+ pageCountTotalText: 0,
19
+ /** Total number of pages with 100+ visible letters in the source PDF. */
20
+ pageCountVisText: 0,
21
+ };
22
+
23
+ const stextArr = /** @type {Array<string>} */ ([]);
24
+ const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
25
+ const resArr = pageDPI.map(async (x, i) => {
26
+ // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
27
+ // The XML format is the only built-in mupdf format that includes character-level granularity.
28
+ const res = await muPDFScheduler.pageText({
29
+ page: i + 1, dpi: x, format: 'xml', calcStats: true,
30
+ });
31
+ pdfContentStats.letterCountTotal += res.letterCountTotal;
32
+ pdfContentStats.letterCountVis += res.letterCountVis;
33
+ if (res.letterCountTotal >= 100) pdfContentStats.pageCountTotalText++;
34
+ if (res.letterCountVis >= 100) pdfContentStats.pageCountVisText++;
35
+ stextArr[i] = res.content;
36
+ });
37
+ await Promise.all(resArr);
38
+
39
+ /** @type {"image" | "text" | "ocr"} */
40
+ let type = 'image';
41
+
42
+ // Determine whether the PDF is text-native, image-only, or image + OCR.
43
+ {
44
+ // The PDF is considered text-native if:
45
+ // (1) The total number of visible letters is at least 100 per page on average.
46
+ // (2) The total number of visible letters is at least 90% of the total number of letters.
47
+ // (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
48
+ if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
49
+ && pdfContentStats.letterCountVis >= pdfContentStats.letterCountTotal * 0.9
50
+ && pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
51
+ type = 'text';
52
+ // The PDF is considered ocr-native if:
53
+ // (1) The total number of letters is at least 100 per page on average.
54
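+ // (2) The total number of visible letters is at least half of the total number of pages (this is what the condition below checks; NOTE(review): confirm this is the intended threshold).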
55
+ } else if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
56
+ && pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
57
+ type = 'ocr';
58
+ // Otherwise, the PDF is considered image-native.
59
+ // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
60
+ } else {
61
+ type = 'image';
62
+ }
63
+ }
64
+
65
+ return { contentRaw: stextArr, content: /** @type {?Array<OcrPage>} */ (null), type };
66
+ };
67
+
68
+ /**
69
+ * Extract and parse text from currently loaded PDF.
70
+ * @param {Object} [options]
71
+ * @param {boolean} [options.extractPDFTextNative=true] - Extract text from text-native PDF documents.
72
+ * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
73
+ * @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
74
+ * This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
75
+ * @param {boolean} [options.setActive=false] - Set the active OCR data to the extracted text.
76
+ */
77
+ export const extractInternalPDFText = async (options = {}) => {
78
+ const extractPDFTextNative = options?.extractPDFTextNative ?? true;
79
+ const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
80
+ const extractPDFTextImage = options?.extractPDFTextImage ?? false;
81
+
82
+ const setActive = options?.setActive ?? false;
83
+
84
+ const res = await extractInternalPDFTextRaw();
85
+
86
+ ImageCache.pdfType = res.type;
87
+ ocrAllRaw.pdf = res.contentRaw;
88
+
89
+ if (!extractPDFTextImage && res.type === 'image') return res;
90
+
91
+ if (!extractPDFTextOCR && res.type === 'ocr') return res;
92
+
93
+ if (!extractPDFTextNative && res.type === 'text') return res;
94
+
95
+ ocrAll.pdf = Array(ImageCache.pageCount);
96
+
97
+ if (setActive) {
98
+ ocrAllRaw.active = ocrAllRaw.pdf;
99
+ ocrAll.active = ocrAll.pdf;
100
+ }
101
+
102
+ const format = 'stext';
103
+
104
+ // Process HOCR using web worker, reading from file first if that has not been done already
105
+ await convertOCRAll(ocrAllRaw.active, true, format, 'pdf', false);
106
+
107
+ res.content = ocrAll.pdf;
108
+
109
+ return res;
110
+ };
@@ -80,7 +80,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
80
80
  if (!fontAll.active || (!fontAll.active.NimbusSans.normal.opt && !fontAll.active.NimbusRomNo9L.normal.opt)) fontAll.active = fontAll.raw;
81
81
 
82
82
  if (typeof process === 'undefined') {
83
- await gs.schedulerReadyLoadFonts;
83
+ // This assumes that the scheduler `init` method has at least started.
84
+ if (gs.schedulerReady === null) console.warn('Failed to load fonts to workers as workers have not been initialized yet.');
85
+ await gs.schedulerReady;
84
86
  await setBuiltInFontsWorker(gs.schedulerInner, true);
85
87
  }
86
88
 
@@ -114,8 +116,7 @@ export async function loadChiSimFont() {
114
116
  * @param {boolean} [forceWorkerUpdate=false] - If true, forces the worker to update the font data even if the font data of this type is already loaded.
115
117
  * This should be used when switching from unvalidated to validated optimized fonts.
116
118
  */
117
- export async function enableDisableFontOpt(enable, useInitial = false, forceWorkerUpdate = false) {
118
- const browserMode = typeof process === 'undefined';
119
+ export async function enableFontOpt(enable, useInitial = false, forceWorkerUpdate = false) {
119
120
 
120
121
  // Enable/disable optimized font
121
122
  if (enable && useInitial && fontAll.optInitial) {
@@ -127,7 +128,7 @@ export async function enableDisableFontOpt(enable, useInitial = false, forceWork
127
128
  }
128
129
 
129
130
  // Enable/disable optimized font in workers
130
- if (browserMode) {
131
+ if (typeof process === 'undefined') {
131
132
  await setBuiltInFontsWorker(gs.schedulerInner, forceWorkerUpdate);
132
133
  } else {
133
134
  // const { setFontAll } = await import('./worker/compareOCRModule.js');
package/js/fontEval.js CHANGED
@@ -1,7 +1,9 @@
1
1
  import { DebugData, fontMetricsObj, pageMetricsArr } from './containers/dataContainer.js';
2
2
  import { fontAll } from './containers/fontContainer.js';
3
3
  import { ImageCache } from './containers/imageContainer.js';
4
- import { enableDisableFontOpt, optimizeFontContainerAll, setDefaultFontAuto } from './fontContainerMain.js';
4
+ import {
5
+ enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto, loadBuiltInFontsRaw,
6
+ } from './fontContainerMain.js';
5
7
  import { gs } from './generalWorkerMain.js';
6
8
 
7
9
  /**
@@ -162,6 +164,8 @@ export async function evaluateFonts(pageArr) {
162
164
  export async function runFontOptimization(ocrArr) {
163
165
  const browserMode = typeof process === 'undefined';
164
166
 
167
+ await loadBuiltInFontsRaw();
168
+
165
169
  const fontRaw = fontAll.getContainer('raw');
166
170
 
167
171
  const calculateOpt = fontMetricsObj && Object.keys(fontMetricsObj).length > 0;
@@ -188,7 +192,7 @@ export async function runFontOptimization(ocrArr) {
188
192
  const pageNum = Math.min(ImageCache.pageCount, 5);
189
193
 
190
194
  // Set raw font in workers
191
- await enableDisableFontOpt(false);
195
+ await enableFontOpt(false);
192
196
 
193
197
  // This step needs to happen here as all fonts must be registered before initializing the canvas.
194
198
  if (!browserMode) {
@@ -202,7 +206,7 @@ export async function runFontOptimization(ocrArr) {
202
206
 
203
207
  if (calculateOpt && Object.keys(fontAll.optInitial).length > 0) {
204
208
  // Enable optimized fonts
205
- await enableDisableFontOpt(true, true, true);
209
+ await enableFontOpt(true, true, true);
206
210
 
207
211
  const evalOpt = await evaluateFonts(ocrArr.slice(0, pageNum));
208
212
 
@@ -248,7 +252,7 @@ export async function runFontOptimization(ocrArr) {
248
252
  }
249
253
 
250
254
  // Set final fonts in workers
251
- await enableDisableFontOpt(true, false, true);
255
+ await enableFontOpt(true, false, true);
252
256
 
253
257
  const enableOpt = enableOptSerif || enableOptSans;
254
258
 
@@ -173,22 +173,6 @@ export class gs {
173
173
  });
174
174
  };
175
175
 
176
- /** @type {?Function} */
177
- static resReadyLoadFonts = null;
178
-
179
- /**
180
- * Promise that resolves when the scheduler is ready for font loading.
181
- * Only used in browser version, as nothing using fonts is run within workers in Node.js version.
182
- * @type {?Promise<void>}
183
- */
184
- static schedulerReadyLoadFonts = null;
185
-
186
- static setSchedulerReadyLoadFonts = () => {
187
- gs.schedulerReadyLoadFonts = new Promise((resolve, reject) => {
188
- gs.resReadyLoadFonts = resolve;
189
- });
190
- };
191
-
192
176
  /** @type {?Function} */
193
177
  static resReadyTesseract = null;
194
178
 
@@ -201,21 +185,7 @@ export class gs {
201
185
  });
202
186
  };
203
187
 
204
- /** @type {?Function} */
205
- static resReadyFontAllRaw = null;
206
-
207
- /** @type {?Promise<void>} */
208
- static fontAllRawReady = null;
209
-
210
- static setFontAllRawReady = () => {
211
- gs.fontAllRawReady = new Promise((resolve, reject) => {
212
- gs.resReadyFontAllRaw = resolve;
213
- });
214
- return /** @type {Function} */ (gs.resReadyFontAllRaw);
215
- };
216
-
217
188
  static init = async () => {
218
- gs.setSchedulerReadyLoadFonts();
219
189
  gs.setSchedulerReady();
220
190
 
221
191
  // Determine number of workers to use in the browser.
@@ -251,16 +221,6 @@ export class gs {
251
221
 
252
222
  gs.scheduler = new GeneralScheduler(gs.schedulerInner);
253
223
 
254
- // Fonts are only loaded in the browser.
255
- // The functions we would use fonts in a worker for also require node-canvas, which does not support workers yet.
256
- if (typeof process === 'undefined') {
257
- // @ts-ignore
258
- gs.resReadyLoadFonts(true);
259
-
260
- // Send raw fonts to workers after they have loaded in the main thread.
261
- await gs.fontAllRawReady;
262
- }
263
-
264
224
  // @ts-ignore
265
225
  gs.resReady(true);
266
226
  };
@@ -303,12 +263,12 @@ export class gs {
303
263
  static getGeneralScheduler = async () => {
304
264
  if (gs.schedulerReady) {
305
265
  await gs.schedulerReady;
306
- return gs.scheduler;
266
+ return /** @type {GeneralScheduler} */ (gs.scheduler);
307
267
  }
308
268
 
309
269
  await gs.init();
310
270
 
311
- return gs.scheduler;
271
+ return /** @type {GeneralScheduler} */ (gs.scheduler);
312
272
  };
313
273
 
314
274
  static terminate = async () => {
@@ -317,11 +277,7 @@ export class gs {
317
277
  gs.schedulerInner = null;
318
278
  gs.resReady = null;
319
279
  gs.schedulerReady = null;
320
- gs.resReadyLoadFonts = null;
321
- gs.schedulerReadyLoadFonts = null;
322
280
  gs.resReadyTesseract = null;
323
281
  gs.schedulerReadyTesseract = null;
324
- gs.resReadyFontAllRaw = null;
325
- gs.fontAllRawReady = null;
326
282
  };
327
283
  }
@@ -212,7 +212,7 @@ export async function convertPageStext({ ocrStr, n }) {
212
212
 
213
213
  // Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
214
214
  smallCapsCurrentAlt = smallCapsCurrentAlt ?? smallCapsAltArr[smallCapsAltArr.length - 1];
215
- smallCapsCurrent = /(small\W?cap)|sc$/i.test(fontNameStrI);
215
+ smallCapsCurrent = /(small\W?cap)|(sc$)|(caps$)/i.test(fontNameStrI);
216
216
  smallCapsWord = smallCapsCurrent;
217
217
 
218
218
  if (/italic/i.test(fontNameStrI) || /-\w*ital/i.test(fontNameStrI)) {
@@ -11,7 +11,9 @@ import {
11
11
  } from '../containers/dataContainer.js';
12
12
  import { fontAll } from '../containers/fontContainer.js';
13
13
  import { ImageCache, imageUtils, ImageWrapper } from '../containers/imageContainer.js';
14
- import { enableDisableFontOpt, optimizeFontContainerAll, setDefaultFontAuto } from '../fontContainerMain.js';
14
+ import {
15
+ enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto, loadBuiltInFontsRaw,
16
+ } from '../fontContainerMain.js';
15
17
  import { runFontOptimization } from '../fontEval.js';
16
18
  import { calcFontMetricsFromPages } from '../fontStatistics.js';
17
19
  import { gs } from '../generalWorkerMain.js';
@@ -20,6 +22,7 @@ import { PageMetrics } from '../objects/pageMetricsObjects.js';
20
22
  import { checkCharWarn, convertOCRAll } from '../recognizeConvert.js';
21
23
  import { replaceObjectProperties } from '../utils/miscUtils.js';
22
24
  import { importOCRFiles } from './importOCR.js';
25
+ import { extractInternalPDFText } from '../extractPDFText.js';
23
26
 
24
27
  /**
25
28
  * Automatically detects the image type (jpeg or png).
@@ -169,7 +172,10 @@ export function sortInputFiles(files) {
169
172
  }
170
173
 
171
174
  /**
172
- *
175
+ * An object with this shape can be used to provide input to the `importFiles` function,
176
+ * without needing that function to figure out the file types.
177
+ * This is required when using ArrayBuffer inputs.
178
+ * @public
173
179
  * @typedef {Object} SortedInputFiles
174
180
  * @property {Array<File>|Array<string>|Array<ArrayBuffer>} [pdfFiles]
175
181
  * @property {Array<File>|Array<string>|Array<ArrayBuffer>} [imageFiles]
@@ -180,11 +186,19 @@ export function sortInputFiles(files) {
180
186
  * Import files for processing.
181
187
  * An object with `pdfFiles`, `imageFiles`, and `ocrFiles` arrays can be provided to import multiple types of files.
182
188
  * Alternatively, for `File` objects (browser) and file paths (Node.js), a single array can be provided, which is sorted based on extension.
189
+ * @public
183
190
  * @param {Array<File>|FileList|Array<string>|SortedInputFiles} files
191
+ * @param {Object} [options]
192
+ * @param {boolean} [options.extractPDFTextNative=false] - Extract text from text-native PDF documents.
193
+ * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
184
194
  * @returns
185
195
  */
186
- export async function importFiles(files) {
196
+ export async function importFiles(files, options = {}) {
187
197
  clearData();
198
+ gs.getGeneralScheduler();
199
+
200
+ const extractPDFTextNative = options?.extractPDFTextNative ?? false;
201
+ const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
188
202
 
189
203
  /** @type {Array<File|FileNode|ArrayBuffer>} */
190
204
  let pdfFiles = [];
@@ -261,10 +275,6 @@ export async function importFiles(files) {
261
275
 
262
276
  const xmlModeImport = ocrFiles.length > 0;
263
277
 
264
- // Extract text from PDF document
265
- // Only enabled if (1) user selects this option, (2) user uploads a PDF, and (3) user does not upload XML data.
266
- inputData.extractTextMode = opt.extractText && inputData.pdfMode && !xmlModeImport;
267
-
268
278
  let pageCount;
269
279
  let pageCountImage;
270
280
  let abbyyMode = false;
@@ -279,7 +289,7 @@ export async function importFiles(files) {
279
289
  const pdfFileData = pdfFile instanceof ArrayBuffer ? pdfFile : await pdfFile.arrayBuffer();
280
290
 
281
291
  // If no XML data is provided, page sizes are calculated using muPDF alone
282
- await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText, inputData.extractTextMode);
292
+ await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText);
283
293
 
284
294
  pageCountImage = ImageCache.pageCount;
285
295
  ImageCache.loadCount = ImageCache.pageCount;
@@ -310,6 +320,8 @@ export async function importFiles(files) {
310
320
 
311
321
  // Restore font metrics and optimize font from previous session (if applicable)
312
322
  if (ocrData.fontMetricsObj && Object.keys(ocrData.fontMetricsObj).length > 0) {
323
+ const fontPromise = loadBuiltInFontsRaw();
324
+
313
325
  existingOpt = true;
314
326
 
315
327
  replaceObjectProperties(fontMetricsObj, ocrData.fontMetricsObj);
@@ -323,11 +335,12 @@ export async function importFiles(files) {
323
335
  if (ocrData.enableOpt === 'false') {
324
336
  opt.enableOpt = false;
325
337
  } else {
338
+ await fontPromise;
326
339
  const fontRaw = fontAll.getContainer('raw');
327
340
  if (!fontRaw) throw new Error('Raw font data not found.');
328
341
  fontAll.opt = await optimizeFontContainerAll(fontRaw, fontMetricsObj);
329
342
  opt.enableOpt = true;
330
- await enableDisableFontOpt(true);
343
+ await enableFontOpt(true);
331
344
  }
332
345
  }
333
346
 
@@ -360,11 +373,6 @@ export async function importFiles(files) {
360
373
  scribeMode = ocrData.scribeMode;
361
374
 
362
375
  stextMode = ocrData.stextMode;
363
- } else if (inputData.extractTextMode) {
364
- // Initialize a new array on `ocrAll` if one does not already exist
365
- if (!ocrAll[oemName]) ocrAll[oemName] = Array(inputData.pageCount);
366
- ocrAll.active = ocrAll[oemName];
367
- stextMode = true;
368
376
  }
369
377
 
370
378
  const pageCountHOCR = ocrAllRaw.active?.length;
@@ -416,7 +424,7 @@ export async function importFiles(files) {
416
424
  }
417
425
  }
418
426
 
419
- if (xmlModeImport || inputData.extractTextMode) {
427
+ if (xmlModeImport) {
420
428
  /** @type {("hocr" | "abbyy" | "stext")} */
421
429
  let format = 'hocr';
422
430
  if (abbyyMode) format = 'abbyy';
@@ -431,11 +439,11 @@ export async function importFiles(files) {
431
439
  opt.enableOpt = await runFontOptimization(ocrAll.active);
432
440
  }
433
441
  });
442
+ } else if (extractPDFTextNative || extractPDFTextOCR) {
443
+ await extractInternalPDFText({ setActive: true, extractPDFTextNative, extractPDFTextOCR });
434
444
  }
435
445
  }
436
446
 
437
- // Import supplemental OCR files (from "Evaluate Accuracy" UI tab)
438
-
439
447
  /**
440
448
  * Import supplemental OCR files, such as an alternate OCR version or ground truth data.
441
449
  * This function should not be used to import the main OCR files.
@@ -28,7 +28,7 @@ export async function importOCRFiles(ocrFilesAll) {
28
28
  let pageCountHOCR;
29
29
  let hocrRaw;
30
30
  /** @type {?Object.<string, FontMetricsFamily>} */
31
- let fontMetricsObj;
31
+ let fontMetricsObj = null;
32
32
  /** @type{?Array<import('../objects/layoutObjects.js').LayoutPage>} */
33
33
  let layoutObj = null;
34
34
  /** @type{?Array<import('../objects/layoutObjects.js').LayoutDataTablePage>} */
@@ -42,9 +42,9 @@ export async function importOCRFiles(ocrFilesAll) {
42
42
  const hocrStrAll = await readOcrFile(ocrFilesAll[0]);
43
43
 
44
44
  // Check whether input is Abbyy XML
45
- const node2 = hocrStrAll.match(/>([^>]+)/)[1];
46
- abbyyMode = !!/abbyy/i.test(node2);
47
- stextMode = !!/<document name/.test(node2);
45
+ const node2 = hocrStrAll.match(/>([^>]+)/)?.[1];
46
+ abbyyMode = !!node2 && !!/abbyy/i.test(node2);
47
+ stextMode = !!node2 && !!/<document name/.test(node2);
48
48
 
49
49
  if (abbyyMode) {
50
50
  hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
@@ -67,8 +67,8 @@ export async function importOCRFiles(ocrFilesAll) {
67
67
 
68
68
  // Check whether input is Abbyy XML using the first file
69
69
  const hocrStrFirst = await readOcrFile(ocrFilesAll[0]);
70
- const node2 = hocrStrFirst.match(/>([^>]+)/)[1];
71
- abbyyMode = !!/abbyy/i.test(node2);
70
+ const node2 = hocrStrFirst.match(/>([^>]+)/)?.[1];
71
+ abbyyMode = !!node2 && !!/abbyy/i.test(node2);
72
72
 
73
73
  for (let i = 0; i < pageCountHOCR; i++) {
74
74
  const hocrFile = ocrFilesAll[i];
@@ -11,11 +11,13 @@ export class FileNode {
11
11
  /**
12
12
  * Creates an instance of the File class.
13
13
  * @param {string} filePath - The path to the file.
14
+ * @param {string} name - The name of the file.
15
+ * @param {Buffer} fileData - The file's data.
14
16
  */
15
- constructor(filePath) {
17
+ constructor(filePath, name, fileData) {
16
18
  this.filePath = filePath;
17
- this.name = path.basename(filePath);
18
- this.fileData = fs.readFileSync(filePath);
19
+ this.name = name;
20
+ this.fileData = fileData;
19
21
  }
20
22
 
21
23
  /**
@@ -29,7 +31,20 @@ export class FileNode {
29
31
 
30
32
  /**
31
33
  *
32
- * @param {Array<string>} files
34
+ * @param {Array<string>} filePaths
33
35
  * @returns
34
36
  */
35
- export const wrapFilesNode = (files) => files.map((file) => (new FileNode(file)));
37
+ export const wrapFilesNode = (filePaths) => {
38
+ const filePromises = filePaths.map(async (filePath) => {
39
+ const isUrl = filePath.startsWith('http://') || filePath.startsWith('https://') || filePath.startsWith('moz-extension://')
40
+ || filePath.startsWith('chrome-extension://') || filePath.startsWith('file://');
41
+
42
+ const fileData = isUrl ? Buffer.from(await fetch(filePath).then((res) => res.arrayBuffer())) : fs.readFileSync(filePath);
43
+
44
+ const fileName = isUrl ? filePath.split('/').pop() : path.basename(filePath);
45
+
46
+ return new FileNode(filePath, fileName, fileData);
47
+ });
48
+
49
+ return Promise.all(filePromises);
50
+ };
@@ -84,3 +84,11 @@ export function LayoutDataTablePage() {
84
84
  /** @type {Array<LayoutDataTable>} */
85
85
  this.tables = [];
86
86
  }
87
+
88
+ const layout = {
89
+ LayoutDataColumn,
90
+ LayoutDataTable,
91
+ LayoutRegion,
92
+ };
93
+
94
+ export default layout;
@@ -403,8 +403,28 @@ function calcWordAngleAdj(word) {
403
403
  * @param {string} text
404
404
  */
405
405
  function replaceLigatures(text) {
406
- return text.replace(/fl/g, 'fl').replace(/fi/g, 'fi').replace(/ff/g, 'ff').replace(/ffi/g, 'ffi')
407
- .replace(/ffl/g, 'ffl');
406
+ return text.replace(/IJ/g, 'IJ')
407
+ .replace(/ij/g, 'ij')
408
+ .replace(/ʼn/g, 'ʼn')
409
+ .replace(/DZ/g, 'DZ')
410
+ .replace(/Dz/g, 'Dz')
411
+ .replace(/dz/g, 'dz')
412
+ .replace(/DŽ/g, 'DŽ')
413
+ .replace(/Dž/g, 'Dž')
414
+ .replace(/dž/g, 'dž')
415
+ .replace(/LJ/g, 'LJ')
416
+ .replace(/Lj/g, 'Lj')
417
+ .replace(/lj/g, 'lj')
418
+ .replace(/NJ/g, 'NJ')
419
+ .replace(/Nj/g, 'Nj')
420
+ .replace(/nj/g, 'nj')
421
+ .replace(/ff/g, 'ff')
422
+ .replace(/fi/g, 'fi')
423
+ .replace(/fl/g, 'fl')
424
+ .replace(/ffi/g, 'ffi')
425
+ .replace(/ffl/g, 'ffl')
426
+ .replace(/ſt/g, 'ſt')
427
+ .replace(/st/g, 'st');
408
428
  }
409
429
 
410
430
  /**
@@ -149,8 +149,10 @@ export const calcRecognizeRotateArgs = async (n, areaMode) => {
149
149
  };
150
150
 
151
151
  /**
152
- * Run recognition on a page and save the results, including OCR data and (possibly) auto-rotated images, to the appropriate global array.
152
+ * Recognize a single page in active document.
153
+ * Use `recognize` instead to recognize all pages in a document.
153
154
  *
155
+ * @public
154
156
  * @param {number} n - Page number to recognize.
155
157
  * @param {boolean} legacy -
156
158
  * @param {boolean} lstm -
@@ -493,14 +495,16 @@ export async function recognizeAllPages(legacy = true, lstm = true, mainData = f
493
495
  }
494
496
 
495
497
  /**
496
- *
498
+ * Recognize all pages in active document.
499
+ * Files for recognition should already be imported using `importFiles` before calling this function.
500
+ * The results of recognition can be exported by calling `exportFiles` after this function.
501
+ * @public
497
502
  * @param {Object} options
498
503
  * @param {'speed'|'quality'} [options.mode='quality'] - Recognition mode.
499
504
  * @param {Array<string>} [options.langs=['eng']] - Language(s) in document.
500
505
  * @param {'lstm'|'legacy'|'combined'} [options.modeAdv='combined'] - Alternative method of setting recognition mode.
501
506
  * @param {'conf'|'data'} [options.combineMode='data'] - Method of combining OCR results. Used if OCR data already exists.
502
507
  * @param {boolean} [options.vanillaMode=false] - Whether to use the vanilla Tesseract.js model.
503
- *
504
508
  */
505
509
  export async function recognize(options = {}) {
506
510
  await gs.getGeneralScheduler();
package/package.json CHANGED
@@ -1,8 +1,8 @@
1
1
  {
2
2
  "name": "scribe.js-ocr",
3
- "version": "0.1.0",
3
+ "version": "0.2.0",
4
4
  "description": "High-quality OCR and text extraction for images and PDFs.",
5
- "main": "module.js",
5
+ "main": "scribe.js",
6
6
  "directories": {
7
7
  "lib": "lib"
8
8
  },
@@ -13,6 +13,7 @@
13
13
  "@types/node": "^18.0.6",
14
14
  "@types/opentype.js": "^1.3.8",
15
15
  "chai": "^5.1.1",
16
+ "documentation": "^14.0.3",
16
17
  "eslint": "^8.56.0",
17
18
  "eslint-config-airbnb-base": "^15.0.0",
18
19
  "eslint-plugin-import": "^2.29.1",
@@ -27,6 +28,7 @@
27
28
  "wait-on": "^7.2.0"
28
29
  },
29
30
  "scripts": {
31
+ "docs": "documentation build scribe.js -f md --access public > docs/API.md",
30
32
  "start": "node tests/scripts/server.js",
31
33
  "stop": "curl http://localhost:3031/shutdown",
32
34
  "test": "npm-run-all test:cli test:module",
@@ -13,11 +13,11 @@ import { drawDebugImages } from './js/debug.js';
13
13
  import { download, exportData } from './js/export/export.js';
14
14
  import { writeDebugCsv } from './js/export/exportDebugCsv.js';
15
15
  import { extractSingleTableContent } from './js/export/exportWriteTabular.js';
16
- import { loadBuiltInFontsRaw } from './js/fontContainerMain.js';
16
+ import { loadBuiltInFontsRaw, enableFontOpt } from './js/fontContainerMain.js';
17
17
  import { gs } from './js/generalWorkerMain.js';
18
18
  import { importFiles, importFilesSupp } from './js/import/import.js';
19
19
  import { calcBoxOverlap, combineOCRPage } from './js/modifyOCR.js';
20
- import { calcTableBbox } from './js/objects/layoutObjects.js';
20
+ import layout, { calcTableBbox } from './js/objects/layoutObjects.js';
21
21
  import ocr from './js/objects/ocrObjects.js';
22
22
  import {
23
23
  calcEvalStatsDoc,
@@ -30,9 +30,11 @@ import { imageStrToBlob } from './js/utils/imageUtils.js';
30
30
  import { countSubstringOccurrences, getRandomAlphanum, replaceSmartQuotes } from './js/utils/miscUtils.js';
31
31
  import { calcConf, mergeOcrWords, splitOcrWord } from './js/utils/ocrUtils.js';
32
32
  import { assignParagraphs } from './js/utils/reflowPars.js';
33
+ import { extractInternalPDFText } from './js/extractPDFText.js';
33
34
 
34
35
  /**
35
36
  * Initialize the program and optionally pre-load resources.
37
+ * @public
36
38
  * @param {Object} [params]
37
39
  * @param {boolean} [params.pdf=false] - Load PDF renderer.
38
40
  * @param {boolean} [params.ocr=false] - Load OCR engine.
@@ -58,23 +60,30 @@ const init = async (params) => {
58
60
  }
59
61
 
60
62
  if (initFont) {
61
- const resReadyFontAllRaw = gs.setFontAllRawReady();
62
- promiseArr.push(loadBuiltInFontsRaw().then(() => resReadyFontAllRaw()));
63
+ promiseArr.push(loadBuiltInFontsRaw());
63
64
  }
64
65
 
65
66
  await Promise.all(promiseArr);
66
67
  };
67
68
 
68
69
  /**
69
- *
70
+ * Function for extracting text from image and PDF files with a single function call.
71
+ * By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
72
+ * For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
73
+ * @public
70
74
  * @param {Parameters<typeof importFiles>[0]} files
71
75
  * @param {Array<string>} [langs=['eng']]
72
76
  * @param {Parameters<typeof exportData>[0]} [outputFormat='txt']
73
- * @returns
77
+ * @param {Object} [options]
78
+ * @param {boolean} [options.skipRecPDFTextNative=true] - If the input is a text-native PDF, skip recognition and return the existing text.
79
+ * @param {boolean} [options.skipRecPDFTextOCR=false] - If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text.
74
80
  */
75
- const recognizeFiles = async (files, langs = ['eng'], outputFormat = 'txt') => {
76
- await importFiles(files);
77
- await recognize({ langs });
81
+ const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options = {}) => {
82
+ const skipRecPDFTextNative = options?.skipRecPDFTextNative ?? true;
83
+ const skipRecPDFTextOCR = options?.skipRecPDFTextOCR ?? false;
84
+ init({ ocr: true, font: true });
85
+ await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });
86
+ if (!(ImageCache.pdfType === 'text' && skipRecPDFTextNative || ImageCache.pdfType === 'ocr' && skipRecPDFTextOCR)) await recognize({ langs });
78
87
  return exportData(outputFormat);
79
88
  };
80
89
 
@@ -139,6 +148,7 @@ class utils {
139
148
 
140
149
  /**
141
150
  * Clears all document-specific data.
151
+ * @public
142
152
  */
143
153
  const clear = async () => {
144
154
  clearData();
@@ -146,6 +156,7 @@ const clear = async () => {
146
156
 
147
157
  /**
148
158
  * Terminates the program and releases resources.
159
+ * @public
149
160
  */
150
161
  const terminate = async () => {
151
162
  clearData();
@@ -157,6 +168,7 @@ export default {
157
168
  combineOCRPage,
158
169
  compareOCR,
159
170
  data,
171
+ enableFontOpt,
160
172
  evalOCRPage,
161
173
  exportData,
162
174
  download,
@@ -164,10 +176,12 @@ export default {
164
176
  importFilesSupp,
165
177
  inputData,
166
178
  init,
179
+ layout,
167
180
  opt,
168
181
  recognize,
169
182
  recognizePage,
170
- recognizeFiles,
183
+ extractText,
184
+ extractInternalPDFText,
171
185
  terminate,
172
186
  utils,
173
187
  };