scribe.js-ocr 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +6 -0
- package/README.md +1 -1
- package/docs/API.md +13 -4
- package/examples/browser/recognize-basic.js +1 -1
- package/examples/node/recognize-basic.js +1 -1
- package/js/containers/app.js +0 -3
- package/js/containers/imageContainer.js +11 -67
- package/js/extractPDFText.js +110 -0
- package/js/generalWorkerMain.js +2 -2
- package/js/import/import.js +16 -14
- package/js/import/importOCR.js +6 -6
- package/package.json +1 -1
- package/scribe.js +13 -6
package/.eslintrc.json
CHANGED
|
@@ -67,6 +67,12 @@
|
|
|
67
67
|
// "one-var": "off",
|
|
68
68
|
// "one-var-declaration-per-line": "off",
|
|
69
69
|
|
|
70
|
+
// If this is enabled eslint breaks our import statements, such that they no longer run natively in the browser.
|
|
71
|
+
"import/no-relative-packages": "off",
|
|
72
|
+
|
|
73
|
+
// Using blocks for purely organizational purposes (e.g. when in-lining a function) is fine.
|
|
74
|
+
"no-lone-blocks": "off",
|
|
75
|
+
|
|
70
76
|
// This rule was deprecated
|
|
71
77
|
"no-return-await": "off",
|
|
72
78
|
|
package/README.md
CHANGED
|
@@ -25,7 +25,7 @@ import scribe from 'node_modules/scribe.js-ocr/scribe.js';
|
|
|
25
25
|
import scribe from 'scribe.js-ocr';
|
|
26
26
|
|
|
27
27
|
// Basic usage
|
|
28
|
-
scribe.
|
|
28
|
+
scribe.extractText(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
|
|
29
29
|
.then((res) => console.log(res))
|
|
30
30
|
```
|
|
31
31
|
|
package/docs/API.md
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
* [init][1]
|
|
6
6
|
* [Parameters][2]
|
|
7
|
-
* [
|
|
7
|
+
* [extractText][3]
|
|
8
8
|
* [Parameters][4]
|
|
9
9
|
* [clear][5]
|
|
10
10
|
* [terminate][6]
|
|
@@ -35,9 +35,10 @@ Initialize the program and optionally pre-load resources.
|
|
|
35
35
|
The PDF renderer and OCR engine are automatically loaded when needed.
|
|
36
36
|
Therefore, the only reason to set `pdf` or `ocr` to `true` is to pre-load them. (optional, default `false`)
|
|
37
37
|
|
|
38
|
-
##
|
|
38
|
+
## extractText
|
|
39
39
|
|
|
40
|
-
|
|
40
|
+
Function for extracting text from image and PDF files with a single function call.
|
|
41
|
+
By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
|
|
41
42
|
For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
|
|
42
43
|
|
|
43
44
|
### Parameters
|
|
@@ -45,6 +46,10 @@ For more control, use `init`, `importFiles`, `recognize`, and `exportData` separ
|
|
|
45
46
|
* `files`  
|
|
46
47
|
* `langs` **[Array][21]<[string][22]>** (optional, default `['eng']`)
|
|
47
48
|
* `outputFormat` (optional, default `'txt'`)
|
|
49
|
+
* `options` **[Object][19]?** (optional, default `{}`)
|
|
50
|
+
|
|
51
|
+
* `options.skipRecPDFTextNative` **[boolean][20]** If the input is a text-native PDF, skip recognition and return the existing text. (optional, default `true`)
|
|
52
|
+
* `options.skipRecPDFTextOCR` **[boolean][20]** If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text. (optional, default `false`)
|
|
48
53
|
|
|
49
54
|
## clear
|
|
50
55
|
|
|
@@ -100,6 +105,10 @@ Alternatively, for `File` objects (browser) and file paths (Node.js), a single a
|
|
|
100
105
|
### Parameters
|
|
101
106
|
|
|
102
107
|
* `files` **([Array][21]\<File> | FileList | [Array][21]<[string][22]> | [SortedInputFiles][11])** 
|
|
108
|
+
* `options` **[Object][19]?** (optional, default `{}`)
|
|
109
|
+
|
|
110
|
+
* `options.extractPDFTextNative` **[boolean][20]** Extract text from text-native PDF documents. (optional, default `false`)
|
|
111
|
+
* `options.extractPDFTextOCR` **[boolean][20]** Extract text from image-native PDF documents with existing OCR text layers. (optional, default `false`)
|
|
103
112
|
|
|
104
113
|
## recognizePage
|
|
105
114
|
|
|
@@ -135,7 +144,7 @@ The results of recognition can be exported by calling `exportFiles` after this f
|
|
|
135
144
|
|
|
136
145
|
[2]: #parameters
|
|
137
146
|
|
|
138
|
-
[3]: #
|
|
147
|
+
[3]: #extracttext
|
|
139
148
|
|
|
140
149
|
[4]: #parameters-1
|
|
141
150
|
|
|
@@ -5,6 +5,6 @@ await scribe.init({ ocr: true, font: true });
|
|
|
5
5
|
const elm = /** @type {HTMLInputElement} */ (document.getElementById('uploader'));
|
|
6
6
|
elm.addEventListener('change', async () => {
|
|
7
7
|
if (!elm.files) return;
|
|
8
|
-
const text = await scribe.
|
|
8
|
+
const text = await scribe.extractText(elm.files);
|
|
9
9
|
console.log(text);
|
|
10
10
|
});
|
package/js/containers/app.js
CHANGED
|
@@ -74,9 +74,6 @@ export class inputData {
|
|
|
74
74
|
/** `true` if user re-uploaded HOCR data created by Scribe OCR */
|
|
75
75
|
static resumeMode = false;
|
|
76
76
|
|
|
77
|
-
/** `true` if stext is extracted from a PDF (rather than text layer uploaded seprately) */
|
|
78
|
-
static extractTextMode = false;
|
|
79
|
-
|
|
80
77
|
/** `true` if ground truth data is uploaded */
|
|
81
78
|
static evalMode = false;
|
|
82
79
|
|
|
@@ -216,52 +216,19 @@ export class ImageCache {
|
|
|
216
216
|
static pageCount = 0;
|
|
217
217
|
|
|
218
218
|
/**
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
static
|
|
219
|
+
* The dimensions that each page would be, if it was rendered at 300 DPI.
|
|
220
|
+
* @type {Array<dims>}
|
|
221
|
+
*/
|
|
222
|
+
static pdfDims300 = [];
|
|
223
223
|
|
|
224
224
|
static inputModes = {
|
|
225
225
|
pdf: false,
|
|
226
226
|
image: false,
|
|
227
227
|
};
|
|
228
228
|
|
|
229
|
-
static pdfContentStats = {
|
|
230
|
-
/** Total number of letters in the source PDF. */
|
|
231
|
-
letterCountTotal: 0,
|
|
232
|
-
/** Total number of visible letters in the source PDF. */
|
|
233
|
-
letterCountVis: 0,
|
|
234
|
-
/** Total number of pages with 100+ letters in the source PDF. */
|
|
235
|
-
pageCountTotalText: 0,
|
|
236
|
-
/** Total number of pages with 100+ visible letters in the source PDF. */
|
|
237
|
-
pageCountVisText: 0,
|
|
238
|
-
};
|
|
239
|
-
|
|
240
229
|
/** @type {?('text'|'ocr'|'image')} */
|
|
241
230
|
static pdfType = null;
|
|
242
231
|
|
|
243
|
-
static setPdfType = () => {
|
|
244
|
-
// The PDF is considered text-native if:
|
|
245
|
-
// (1) The total number of visible letters is at least 100 per page on average.
|
|
246
|
-
// (2) The total number of visible letters is at least 90% of the total number of letters.
|
|
247
|
-
// (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
|
|
248
|
-
if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
|
|
249
|
-
&& ImageCache.pdfContentStats.letterCountVis >= ImageCache.pdfContentStats.letterCountTotal * 0.9
|
|
250
|
-
&& ImageCache.pdfContentStats.pageCountVisText >= ImageCache.pageCount / 2) {
|
|
251
|
-
ImageCache.pdfType = 'text';
|
|
252
|
-
// The PDF is considered ocr-native if:
|
|
253
|
-
// (1) The total number of letters is at least 100 per page on average.
|
|
254
|
-
// (2) The total number of letters is at least half of the total number of letters.
|
|
255
|
-
} else if (ImageCache.pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
|
|
256
|
-
&& ImageCache.pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
|
|
257
|
-
ImageCache.pdfType = 'ocr';
|
|
258
|
-
// Otherwise, the PDF is considered image-native.
|
|
259
|
-
// This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
|
|
260
|
-
} else {
|
|
261
|
-
ImageCache.pdfType = 'image';
|
|
262
|
-
}
|
|
263
|
-
};
|
|
264
|
-
|
|
265
232
|
static colorModeDefault = 'gray';
|
|
266
233
|
|
|
267
234
|
static cacheRenderPages = 3;
|
|
@@ -327,7 +294,7 @@ export class ImageCache {
|
|
|
327
294
|
} if (ImageCache.inputModes.pdf) {
|
|
328
295
|
const pageMetrics = pageMetricsArr[n];
|
|
329
296
|
const targetWidth = pageMetrics.dims.width;
|
|
330
|
-
const dpi = 300 * (targetWidth / ImageCache.
|
|
297
|
+
const dpi = 300 * (targetWidth / ImageCache.pdfDims300[n].width);
|
|
331
298
|
const muPDFScheduler = await ImageCache.getMuPDFScheduler();
|
|
332
299
|
return muPDFScheduler.drawPageAsPNG({
|
|
333
300
|
page: n + 1, dpi, color, skipText: skipTextMode,
|
|
@@ -566,14 +533,10 @@ export class ImageCache {
|
|
|
566
533
|
ImageCache.inputModes.image = false;
|
|
567
534
|
ImageCache.inputModes.pdf = false;
|
|
568
535
|
ImageCache.pageCount = 0;
|
|
569
|
-
ImageCache.
|
|
536
|
+
ImageCache.pdfDims300.length = 0;
|
|
570
537
|
ImageCache.loadCount = 0;
|
|
571
538
|
ImageCache.nativeProps.length = 0;
|
|
572
539
|
ImageCache.binaryProps.length = 0;
|
|
573
|
-
ImageCache.pdfContentStats.letterCountTotal = 0;
|
|
574
|
-
ImageCache.pdfContentStats.letterCountVis = 0;
|
|
575
|
-
ImageCache.pdfContentStats.pageCountTotalText = 0;
|
|
576
|
-
ImageCache.pdfContentStats.pageCountVisText = 0;
|
|
577
540
|
};
|
|
578
541
|
|
|
579
542
|
static terminate = async () => {
|
|
@@ -600,9 +563,8 @@ export class ImageCache {
|
|
|
600
563
|
*
|
|
601
564
|
* @param {ArrayBuffer} fileData
|
|
602
565
|
* @param {Boolean} [skipText=false] - Whether to skip native text when rendering PDF to image.
|
|
603
|
-
* @param {Boolean} [extractStext=false]
|
|
604
566
|
*/
|
|
605
|
-
static openMainPDF = async (fileData, skipText = false
|
|
567
|
+
static openMainPDF = async (fileData, skipText = false) => {
|
|
606
568
|
const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
|
|
607
569
|
|
|
608
570
|
await ImageCache.#loadFileMuPDFScheduler(fileData);
|
|
@@ -611,9 +573,9 @@ export class ImageCache {
|
|
|
611
573
|
|
|
612
574
|
const pageDims1 = await muPDFScheduler.workers[0].pageSizes([300]);
|
|
613
575
|
|
|
614
|
-
ImageCache.
|
|
576
|
+
ImageCache.pdfDims300.length = 0;
|
|
615
577
|
pageDims1.forEach((x) => {
|
|
616
|
-
ImageCache.
|
|
578
|
+
ImageCache.pdfDims300.push({ width: x[0], height: x[1] });
|
|
617
579
|
});
|
|
618
580
|
|
|
619
581
|
ImageCache.inputModes.pdf = true;
|
|
@@ -627,10 +589,10 @@ export class ImageCache {
|
|
|
627
589
|
|
|
628
590
|
// For reasons that are unclear, a small number of pages have been rendered into massive files
|
|
629
591
|
// so a hard-cap on resolution must be imposed.
|
|
630
|
-
const pageDPI = ImageCache.
|
|
592
|
+
const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
|
|
631
593
|
|
|
632
594
|
// In addition to capping the resolution, also switch the width/height
|
|
633
|
-
ImageCache.
|
|
595
|
+
ImageCache.pdfDims300.forEach((x, i) => {
|
|
634
596
|
const pageDims = { width: Math.round(x.width * pageDPI[i] / 300), height: Math.round(x.height * pageDPI[i] / 300) };
|
|
635
597
|
pageMetricsArr[i] = new PageMetrics(pageDims);
|
|
636
598
|
});
|
|
@@ -674,23 +636,5 @@ export class ImageCache {
|
|
|
674
636
|
await setUploadFontsWorker(gs.schedulerInner);
|
|
675
637
|
});
|
|
676
638
|
}
|
|
677
|
-
|
|
678
|
-
if (extractStext) {
|
|
679
|
-
ocrAllRaw.active = Array(ImageCache.pageCount);
|
|
680
|
-
const resArr = pageDPI.map(async (x, i) => {
|
|
681
|
-
// While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
|
|
682
|
-
// The XML format is the only built-in mupdf format that includes character-level granularity.
|
|
683
|
-
const res = await muPDFScheduler.pageText({
|
|
684
|
-
page: i + 1, dpi: x, format: 'xml', calcStats: true,
|
|
685
|
-
});
|
|
686
|
-
ImageCache.pdfContentStats.letterCountTotal += res.letterCountTotal;
|
|
687
|
-
ImageCache.pdfContentStats.letterCountVis += res.letterCountVis;
|
|
688
|
-
if (res.letterCountTotal >= 100) ImageCache.pdfContentStats.pageCountTotalText++;
|
|
689
|
-
if (res.letterCountVis >= 100) ImageCache.pdfContentStats.pageCountVisText++;
|
|
690
|
-
ocrAllRaw.active[i] = res.content;
|
|
691
|
-
});
|
|
692
|
-
await Promise.all(resArr);
|
|
693
|
-
ImageCache.setPdfType();
|
|
694
|
-
}
|
|
695
639
|
};
|
|
696
640
|
}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import { ImageCache } from './containers/imageContainer.js';
|
|
2
|
+
import { convertOCRAll } from './recognizeConvert.js';
|
|
3
|
+
import { ocrAllRaw, ocrAll } from './containers/dataContainer.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Extract raw text content from currently loaded PDF.
 * Reports whether PDF is text-native, contains invisible OCR text, or is image-only.
 *
 * @returns {Promise<{contentRaw: Array<string>, content: ?Array<OcrPage>, type: ("image" | "text" | "ocr")}>}
 *    `contentRaw` holds the raw text returned by mupdf for each page (requested in XML format),
 *    `content` is always `null` here (callers fill it in after parsing),
 *    and `type` is the detected PDF type.
 */
const extractInternalPDFTextRaw = async () => {
  const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);

  // DPI at which each page is 2000 pixels wide; this is the same expression `ImageCache.openMainPDF` uses.
  // (A stray second argument to `map` was removed: it was passed as `thisArg`, which an arrow function ignores.)
  const pageDPI = ImageCache.pdfDims300.map((dims) => 300 * 2000 / dims.width);

  // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
  // The XML format is the only built-in mupdf format that includes character-level granularity.
  const pageResults = await Promise.all(pageDPI.map((dpi, i) => muPDFScheduler.pageText({
    page: i + 1, dpi, format: 'xml', calcStats: true,
  })));

  const pdfContentStats = {
    /** Total number of letters in the source PDF. */
    letterCountTotal: 0,
    /** Total number of visible letters in the source PDF. */
    letterCountVis: 0,
    /** Total number of pages with 100+ letters in the source PDF. */
    pageCountTotalText: 0,
    /** Total number of pages with 100+ visible letters in the source PDF. */
    pageCountVisText: 0,
  };

  const stextArr = /** @type {Array<string>} */ ([]);
  for (const res of pageResults) {
    pdfContentStats.letterCountTotal += res.letterCountTotal;
    pdfContentStats.letterCountVis += res.letterCountVis;
    if (res.letterCountTotal >= 100) pdfContentStats.pageCountTotalText++;
    if (res.letterCountVis >= 100) pdfContentStats.pageCountVisText++;
    stextArr.push(res.content);
  }

  // Determine whether the PDF is text-native, image-only, or image + OCR.
  const { pageCount } = ImageCache;
  // Shared by both the text-native and ocr-native checks: at least 100 letters per page on average.
  const hasEnoughLetters = pdfContentStats.letterCountTotal >= pageCount * 100;

  /** @type {"image" | "text" | "ocr"} */
  let type = 'image';

  // The PDF is considered text-native if:
  // (1) The total number of letters is at least 100 per page on average.
  // (2) The total number of visible letters is at least 90% of the total number of letters.
  // (3) The total number of pages with 100+ visible letters is at least half of the total number of pages.
  if (hasEnoughLetters
    && pdfContentStats.letterCountVis >= pdfContentStats.letterCountTotal * 0.9
    && pdfContentStats.pageCountVisText >= pageCount / 2) {
    type = 'text';
  // The PDF is considered ocr-native if:
  // (1) The total number of letters is at least 100 per page on average.
  // (2) The total number of pages with 100+ letters (visible or not) is at least half of the total number of pages.
  // Visible-letter counts are deliberately not used here, as an OCR text layer that is invisible
  // contributes to the total counts but not to the visible counts.
  } else if (hasEnoughLetters
    && pdfContentStats.pageCountTotalText >= pageCount / 2) {
    type = 'ocr';
  }
  // Otherwise, the PDF is considered image-native (the initial value of `type`).
  // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.

  return { contentRaw: stextArr, content: /** @type {?Array<OcrPage>} */ (null), type };
};
|
|
67
|
+
|
|
68
|
+
/**
 * Extract and parse text from currently loaded PDF.
 *
 * The detected PDF type is always stored on `ImageCache.pdfType` and the raw per-page text on `ocrAllRaw.pdf`.
 * Parsing only happens when extraction is enabled for the detected type; otherwise the result is returned
 * with `content` still `null`.
 *
 * @param {Object} [options]
 * @param {boolean} [options.extractPDFTextNative=true] - Extract text from text-native PDF documents.
 * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
 * @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
 *    This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
 * @param {boolean} [options.setActive=false] - Set the active OCR data to the extracted text.
 * @returns The object produced by `extractInternalPDFTextRaw`, with `content` set to `ocrAll.pdf` when parsing was performed.
 */
export const extractInternalPDFText = async (options = {}) => {
  const extractPDFTextNative = options?.extractPDFTextNative ?? true;
  const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
  const extractPDFTextImage = options?.extractPDFTextImage ?? false;
  const setActive = options?.setActive ?? false;

  const res = await extractInternalPDFTextRaw();

  ImageCache.pdfType = res.type;
  ocrAllRaw.pdf = res.contentRaw;

  // Stop before parsing unless extraction is enabled for the detected PDF type.
  const extractEnabledByType = {
    image: extractPDFTextImage,
    ocr: extractPDFTextOCR,
    text: extractPDFTextNative,
  };
  if (!extractEnabledByType[res.type]) return res;

  ocrAll.pdf = Array(ImageCache.pageCount);

  if (setActive) {
    ocrAllRaw.active = ocrAllRaw.pdf;
    ocrAll.active = ocrAll.pdf;
  }

  const format = 'stext';

  // Parse the raw text that was just extracted.
  // `ocrAllRaw.pdf` is passed rather than `ocrAllRaw.active`, as the latter only refers to the
  // extracted PDF text when `setActive` is `true` (in which case both are the same array).
  // NOTE(review): presumably `convertOCRAll` writes the parsed pages to `ocrAll.pdf` (selected by the 'pdf' argument),
  // as the result is read back from there below — confirm against `recognizeConvert.js`.
  await convertOCRAll(ocrAllRaw.pdf, true, format, 'pdf', false);

  res.content = ocrAll.pdf;

  return res;
};
|
package/js/generalWorkerMain.js
CHANGED
|
@@ -263,12 +263,12 @@ export class gs {
|
|
|
263
263
|
static getGeneralScheduler = async () => {
|
|
264
264
|
if (gs.schedulerReady) {
|
|
265
265
|
await gs.schedulerReady;
|
|
266
|
-
return gs.scheduler;
|
|
266
|
+
return /** @type {GeneralScheduler} */ (gs.scheduler);
|
|
267
267
|
}
|
|
268
268
|
|
|
269
269
|
await gs.init();
|
|
270
270
|
|
|
271
|
-
return gs.scheduler;
|
|
271
|
+
return /** @type {GeneralScheduler} */ (gs.scheduler);
|
|
272
272
|
};
|
|
273
273
|
|
|
274
274
|
static terminate = async () => {
|
package/js/import/import.js
CHANGED
|
@@ -11,7 +11,9 @@ import {
|
|
|
11
11
|
} from '../containers/dataContainer.js';
|
|
12
12
|
import { fontAll } from '../containers/fontContainer.js';
|
|
13
13
|
import { ImageCache, imageUtils, ImageWrapper } from '../containers/imageContainer.js';
|
|
14
|
-
import {
|
|
14
|
+
import {
|
|
15
|
+
enableFontOpt, optimizeFontContainerAll, setDefaultFontAuto, loadBuiltInFontsRaw,
|
|
16
|
+
} from '../fontContainerMain.js';
|
|
15
17
|
import { runFontOptimization } from '../fontEval.js';
|
|
16
18
|
import { calcFontMetricsFromPages } from '../fontStatistics.js';
|
|
17
19
|
import { gs } from '../generalWorkerMain.js';
|
|
@@ -20,6 +22,7 @@ import { PageMetrics } from '../objects/pageMetricsObjects.js';
|
|
|
20
22
|
import { checkCharWarn, convertOCRAll } from '../recognizeConvert.js';
|
|
21
23
|
import { replaceObjectProperties } from '../utils/miscUtils.js';
|
|
22
24
|
import { importOCRFiles } from './importOCR.js';
|
|
25
|
+
import { extractInternalPDFText } from '../extractPDFText.js';
|
|
23
26
|
|
|
24
27
|
/**
|
|
25
28
|
* Automatically detects the image type (jpeg or png).
|
|
@@ -185,12 +188,18 @@ export function sortInputFiles(files) {
|
|
|
185
188
|
* Alternatively, for `File` objects (browser) and file paths (Node.js), a single array can be provided, which is sorted based on extension.
|
|
186
189
|
* @public
|
|
187
190
|
* @param {Array<File>|FileList|Array<string>|SortedInputFiles} files
|
|
191
|
+
* @param {Object} [options]
|
|
192
|
+
* @param {boolean} [options.extractPDFTextNative=false] - Extract text from text-native PDF documents.
|
|
193
|
+
* @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
|
|
188
194
|
* @returns
|
|
189
195
|
*/
|
|
190
|
-
export async function importFiles(files) {
|
|
196
|
+
export async function importFiles(files, options = {}) {
|
|
191
197
|
clearData();
|
|
192
198
|
gs.getGeneralScheduler();
|
|
193
199
|
|
|
200
|
+
const extractPDFTextNative = options?.extractPDFTextNative ?? false;
|
|
201
|
+
const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
|
|
202
|
+
|
|
194
203
|
/** @type {Array<File|FileNode|ArrayBuffer>} */
|
|
195
204
|
let pdfFiles = [];
|
|
196
205
|
/** @type {Array<File|FileNode|ArrayBuffer>} */
|
|
@@ -266,10 +275,6 @@ export async function importFiles(files) {
|
|
|
266
275
|
|
|
267
276
|
const xmlModeImport = ocrFiles.length > 0;
|
|
268
277
|
|
|
269
|
-
// Extract text from PDF document
|
|
270
|
-
// Only enabled if (1) user selects this option, (2) user uploads a PDF, and (3) user does not upload XML data.
|
|
271
|
-
inputData.extractTextMode = opt.extractText && inputData.pdfMode && !xmlModeImport;
|
|
272
|
-
|
|
273
278
|
let pageCount;
|
|
274
279
|
let pageCountImage;
|
|
275
280
|
let abbyyMode = false;
|
|
@@ -284,7 +289,7 @@ export async function importFiles(files) {
|
|
|
284
289
|
const pdfFileData = pdfFile instanceof ArrayBuffer ? pdfFile : await pdfFile.arrayBuffer();
|
|
285
290
|
|
|
286
291
|
// If no XML data is provided, page sizes are calculated using muPDF alone
|
|
287
|
-
await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText
|
|
292
|
+
await ImageCache.openMainPDF(pdfFileData, opt.omitNativeText);
|
|
288
293
|
|
|
289
294
|
pageCountImage = ImageCache.pageCount;
|
|
290
295
|
ImageCache.loadCount = ImageCache.pageCount;
|
|
@@ -315,7 +320,7 @@ export async function importFiles(files) {
|
|
|
315
320
|
|
|
316
321
|
// Restore font metrics and optimize font from previous session (if applicable)
|
|
317
322
|
if (ocrData.fontMetricsObj && Object.keys(ocrData.fontMetricsObj).length > 0) {
|
|
318
|
-
const fontPromise = loadBuiltInFontsRaw()
|
|
323
|
+
const fontPromise = loadBuiltInFontsRaw();
|
|
319
324
|
|
|
320
325
|
existingOpt = true;
|
|
321
326
|
|
|
@@ -368,11 +373,6 @@ export async function importFiles(files) {
|
|
|
368
373
|
scribeMode = ocrData.scribeMode;
|
|
369
374
|
|
|
370
375
|
stextMode = ocrData.stextMode;
|
|
371
|
-
} else if (inputData.extractTextMode) {
|
|
372
|
-
// Initialize a new array on `ocrAll` if one does not already exist
|
|
373
|
-
if (!ocrAll[oemName]) ocrAll[oemName] = Array(inputData.pageCount);
|
|
374
|
-
ocrAll.active = ocrAll[oemName];
|
|
375
|
-
stextMode = true;
|
|
376
376
|
}
|
|
377
377
|
|
|
378
378
|
const pageCountHOCR = ocrAllRaw.active?.length;
|
|
@@ -424,7 +424,7 @@ export async function importFiles(files) {
|
|
|
424
424
|
}
|
|
425
425
|
}
|
|
426
426
|
|
|
427
|
-
if (xmlModeImport
|
|
427
|
+
if (xmlModeImport) {
|
|
428
428
|
/** @type {("hocr" | "abbyy" | "stext")} */
|
|
429
429
|
let format = 'hocr';
|
|
430
430
|
if (abbyyMode) format = 'abbyy';
|
|
@@ -439,6 +439,8 @@ export async function importFiles(files) {
|
|
|
439
439
|
opt.enableOpt = await runFontOptimization(ocrAll.active);
|
|
440
440
|
}
|
|
441
441
|
});
|
|
442
|
+
} else if (extractPDFTextNative || extractPDFTextOCR) {
|
|
443
|
+
await extractInternalPDFText({ setActive: true, extractPDFTextNative, extractPDFTextOCR });
|
|
442
444
|
}
|
|
443
445
|
}
|
|
444
446
|
|
package/js/import/importOCR.js
CHANGED
|
@@ -28,7 +28,7 @@ export async function importOCRFiles(ocrFilesAll) {
|
|
|
28
28
|
let pageCountHOCR;
|
|
29
29
|
let hocrRaw;
|
|
30
30
|
/** @type {?Object.<string, FontMetricsFamily>} */
|
|
31
|
-
let fontMetricsObj;
|
|
31
|
+
let fontMetricsObj = null;
|
|
32
32
|
/** @type{?Array<import('../objects/layoutObjects.js').LayoutPage>} */
|
|
33
33
|
let layoutObj = null;
|
|
34
34
|
/** @type{?Array<import('../objects/layoutObjects.js').LayoutDataTablePage>} */
|
|
@@ -42,9 +42,9 @@ export async function importOCRFiles(ocrFilesAll) {
|
|
|
42
42
|
const hocrStrAll = await readOcrFile(ocrFilesAll[0]);
|
|
43
43
|
|
|
44
44
|
// Check whether input is Abbyy XML
|
|
45
|
-
const node2 = hocrStrAll.match(/>([^>]+)/)[1];
|
|
46
|
-
abbyyMode = !!/abbyy/i.test(node2);
|
|
47
|
-
stextMode = !!/<document name/.test(node2);
|
|
45
|
+
const node2 = hocrStrAll.match(/>([^>]+)/)?.[1];
|
|
46
|
+
abbyyMode = !!node2 && !!/abbyy/i.test(node2);
|
|
47
|
+
stextMode = !!node2 && !!/<document name/.test(node2);
|
|
48
48
|
|
|
49
49
|
if (abbyyMode) {
|
|
50
50
|
hocrArrPages = hocrStrAll.split(/(?=<page)/).slice(1);
|
|
@@ -67,8 +67,8 @@ export async function importOCRFiles(ocrFilesAll) {
|
|
|
67
67
|
|
|
68
68
|
// Check whether input is Abbyy XML using the first file
|
|
69
69
|
const hocrStrFirst = await readOcrFile(ocrFilesAll[0]);
|
|
70
|
-
const node2 = hocrStrFirst.match(/>([^>]+)/)[1];
|
|
71
|
-
abbyyMode = !!/abbyy/i.test(node2);
|
|
70
|
+
const node2 = hocrStrFirst.match(/>([^>]+)/)?.[1];
|
|
71
|
+
abbyyMode = !!node2 && !!/abbyy/i.test(node2);
|
|
72
72
|
|
|
73
73
|
for (let i = 0; i < pageCountHOCR; i++) {
|
|
74
74
|
const hocrFile = ocrFilesAll[i];
|
package/package.json
CHANGED
package/scribe.js
CHANGED
|
@@ -30,6 +30,7 @@ import { imageStrToBlob } from './js/utils/imageUtils.js';
|
|
|
30
30
|
import { countSubstringOccurrences, getRandomAlphanum, replaceSmartQuotes } from './js/utils/miscUtils.js';
|
|
31
31
|
import { calcConf, mergeOcrWords, splitOcrWord } from './js/utils/ocrUtils.js';
|
|
32
32
|
import { assignParagraphs } from './js/utils/reflowPars.js';
|
|
33
|
+
import { extractInternalPDFText } from './js/extractPDFText.js';
|
|
33
34
|
|
|
34
35
|
/**
|
|
35
36
|
* Initialize the program and optionally pre-load resources.
|
|
@@ -66,18 +67,23 @@ const init = async (params) => {
|
|
|
66
67
|
};
|
|
67
68
|
|
|
68
69
|
/**
|
|
69
|
-
*
|
|
70
|
+
* Function for extracting text from image and PDF files with a single function call.
|
|
71
|
+
* By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
|
|
70
72
|
* For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
|
|
71
73
|
* @public
|
|
72
74
|
* @param {Parameters<typeof importFiles>[0]} files
|
|
73
75
|
* @param {Array<string>} [langs=['eng']]
|
|
74
76
|
* @param {Parameters<typeof exportData>[0]} [outputFormat='txt']
|
|
75
|
-
* @
|
|
77
|
+
* @param {Object} [options]
|
|
78
|
+
* @param {boolean} [options.skipRecPDFTextNative=true] - If the input is a text-native PDF, skip recognition and return the existing text.
|
|
79
|
+
* @param {boolean} [options.skipRecPDFTextOCR=false] - If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text.
|
|
76
80
|
*/
|
|
77
|
-
const
|
|
81
|
+
const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options = {}) => {
|
|
82
|
+
const skipRecPDFTextNative = options?.skipRecPDFTextNative ?? true;
|
|
83
|
+
const skipRecPDFTextOCR = options?.skipRecPDFTextOCR ?? false;
|
|
78
84
|
init({ ocr: true, font: true });
|
|
79
|
-
await importFiles(files);
|
|
80
|
-
await recognize({ langs });
|
|
85
|
+
await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });
|
|
86
|
+
if (!(ImageCache.pdfType === 'text' && skipRecPDFTextNative || ImageCache.pdfType === 'ocr' && skipRecPDFTextOCR)) await recognize({ langs });
|
|
81
87
|
return exportData(outputFormat);
|
|
82
88
|
};
|
|
83
89
|
|
|
@@ -174,7 +180,8 @@ export default {
|
|
|
174
180
|
opt,
|
|
175
181
|
recognize,
|
|
176
182
|
recognizePage,
|
|
177
|
-
|
|
183
|
+
extractText,
|
|
184
|
+
extractInternalPDFText,
|
|
178
185
|
terminate,
|
|
179
186
|
utils,
|
|
180
187
|
};
|