scribe.js-ocr 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/API.md +2 -10
- package/js/containers/app.js +17 -3
- package/js/extractPDFText.js +6 -14
- package/js/import/import.js +4 -16
- package/js/recognizeConvert.js +1 -1
- package/package.json +1 -1
- package/scribe.js +2 -4
package/docs/API.md
CHANGED
|
@@ -41,6 +41,7 @@ Initialize the program and optionally pre-load resources.
|
|
|
41
41
|
|
|
42
42
|
Function for extracting text from image and PDF files with a single function call.
|
|
43
43
|
By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
|
|
44
|
+
To control how text from PDF files is handled, set the options in the `opt.usePDFText` object.
|
|
44
45
|
For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
|
|
45
46
|
|
|
46
47
|
### Parameters
|
|
@@ -48,10 +49,7 @@ For more control, use `init`, `importFiles`, `recognize`, and `exportData` separ
|
|
|
48
49
|
* `files`  
|
|
49
50
|
* `langs` **[Array][23]<[string][24]>** (optional, default `['eng']`)
|
|
50
51
|
* `outputFormat` (optional, default `'txt'`)
|
|
51
|
-
* `options`
|
|
52
|
-
|
|
53
|
-
* `options.skipRecPDFTextNative` **[boolean][22]** If the input is a text-native PDF, skip recognition and return the existing text. (optional, default `true`)
|
|
54
|
-
* `options.skipRecPDFTextOCR` **[boolean][22]** If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text. (optional, default `false`)
|
|
52
|
+
* `options` (optional, default `{}`)
|
|
55
53
|
|
|
56
54
|
## writeDebugImages
|
|
57
55
|
|
|
@@ -115,12 +113,6 @@ Alternatively, for `File` objects (browser) and file paths (Node.js), a single a
|
|
|
115
113
|
### Parameters
|
|
116
114
|
|
|
117
115
|
* `files` **([Array][23]\<File> | FileList | [Array][23]<[string][24]> | [SortedInputFiles][13])** 
|
|
118
|
-
* `options` **[Object][21]?** (optional, default `{}`)
|
|
119
|
-
|
|
120
|
-
* `options.extractPDFTextNative` **[boolean][22]** Extract text from text-native PDF documents. (optional, default `false`)
|
|
121
|
-
* `options.extractPDFTextOCR` **[boolean][22]** Extract text from image-native PDF documents with existing OCR text layers. (optional, default `false`)
|
|
122
|
-
* `options.extractPDFTextImage` **[boolean][22]** Extract text from image-native PDF documents with no existing OCR layer.
|
|
123
|
-
This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header). (optional, default `false`)
|
|
124
116
|
|
|
125
117
|
## recognizePage
|
|
126
118
|
|
package/js/containers/app.js
CHANGED
|
@@ -62,9 +62,23 @@ export class opt {
|
|
|
62
62
|
|
|
63
63
|
static calcSuppFontInfo = false;
|
|
64
64
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
65
|
+
/**
|
|
66
|
+
* How to use PDF text data extracted from input PDFs (if any).
|
|
67
|
+
* The `native` option controls how native text data is used (i.e. visible text rendered by the PDF viewer),
|
|
68
|
+
* while the `ocr` option controls how OCR text data is used (i.e. invisible text printed over an image).
|
|
69
|
+
* If `main` is true, then the data will be used as the primary data source.
|
|
70
|
+
* If `supp` is true, then the data will be used as a supplemental data source (may be used to correct errors in the primary data source).
|
|
71
|
+
*/
|
|
72
|
+
static usePDFText = {
|
|
73
|
+
native: {
|
|
74
|
+
supp: true,
|
|
75
|
+
main: true,
|
|
76
|
+
},
|
|
77
|
+
ocr: {
|
|
78
|
+
supp: true,
|
|
79
|
+
main: false,
|
|
80
|
+
},
|
|
81
|
+
};
|
|
68
82
|
|
|
69
83
|
/**
|
|
70
84
|
* Number of workers to use. Must be set prior to initialization.
|
package/js/extractPDFText.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { inputData } from './containers/app.js';
|
|
1
|
+
import { inputData, opt } from './containers/app.js';
|
|
2
2
|
import { ocrAll, ocrAllRaw } from './containers/dataContainer.js';
|
|
3
3
|
import { ImageCache } from './containers/imageContainer.js';
|
|
4
4
|
import { convertOCR } from './recognizeConvert.js';
|
|
@@ -68,19 +68,11 @@ const extractInternalPDFTextRaw = async () => {
|
|
|
68
68
|
|
|
69
69
|
/**
|
|
70
70
|
* Extract and parse text from currently loaded PDF.
|
|
71
|
-
* @param {Object} [options]
|
|
72
|
-
* @param {boolean} [options.extractPDFTextNative=true] - Extract text from text-native PDF documents.
|
|
73
|
-
* @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
|
|
74
|
-
* @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
|
|
75
|
-
* This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
|
|
76
|
-
* @param {boolean} [options.setActive=false] - Set the active OCR data to the extracted text.
|
|
77
71
|
*/
|
|
78
|
-
export const extractInternalPDFText = async (options = {}) => {
|
|
79
|
-
const extractPDFTextNative =
|
|
80
|
-
const extractPDFTextOCR =
|
|
81
|
-
const extractPDFTextImage =
|
|
82
|
-
|
|
83
|
-
const setActive = options?.setActive ?? false;
|
|
72
|
+
export const extractInternalPDFText = async () => {
|
|
73
|
+
const extractPDFTextNative = opt.usePDFText.native.main || opt.usePDFText.native.supp;
|
|
74
|
+
const extractPDFTextOCR = opt.usePDFText.ocr.main || opt.usePDFText.ocr.supp;
|
|
75
|
+
const extractPDFTextImage = false;
|
|
84
76
|
|
|
85
77
|
const res = await extractInternalPDFTextRaw();
|
|
86
78
|
|
|
@@ -95,7 +87,7 @@ export const extractInternalPDFText = async (options = {}) => {
|
|
|
95
87
|
|
|
96
88
|
ocrAll.pdf = Array(ImageCache.pageCount);
|
|
97
89
|
|
|
98
|
-
if (
|
|
90
|
+
if (inputData.pdfType === 'text' && opt.usePDFText.native.main || inputData.pdfType === 'ocr' && opt.usePDFText.ocr.main) {
|
|
99
91
|
ocrAllRaw.active = ocrAllRaw.pdf;
|
|
100
92
|
ocrAll.active = ocrAll.pdf;
|
|
101
93
|
}
|
package/js/import/import.js
CHANGED
|
@@ -192,21 +192,11 @@ export function sortInputFiles(files) {
|
|
|
192
192
|
* Alternatively, for `File` objects (browser) and file paths (Node.js), a single array can be provided, which is sorted based on extension.
|
|
193
193
|
* @public
|
|
194
194
|
* @param {Array<File>|FileList|Array<string>|SortedInputFiles} files
|
|
195
|
-
* @param {Object} [options]
|
|
196
|
-
* @param {boolean} [options.extractPDFTextNative=false] - Extract text from text-native PDF documents.
|
|
197
|
-
* @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
|
|
198
|
-
* @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
|
|
199
|
-
* This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
|
|
200
|
-
* @returns
|
|
201
195
|
*/
|
|
202
|
-
export async function importFiles(files, options = {}) {
|
|
196
|
+
export async function importFiles(files) {
|
|
203
197
|
clearData();
|
|
204
198
|
gs.getGeneralScheduler();
|
|
205
199
|
|
|
206
|
-
const extractPDFTextNative = options?.extractPDFTextNative ?? false;
|
|
207
|
-
const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
|
|
208
|
-
const extractPDFTextImage = options?.extractPDFTextImage ?? false;
|
|
209
|
-
|
|
210
200
|
/** @type {Array<File|FileNode|ArrayBuffer>} */
|
|
211
201
|
let pdfFiles = [];
|
|
212
202
|
/** @type {Array<File|FileNode|ArrayBuffer>} */
|
|
@@ -445,11 +435,9 @@ export async function importFiles(files, options = {}) {
|
|
|
445
435
|
await runFontOptimization(ocrAll.active);
|
|
446
436
|
}
|
|
447
437
|
});
|
|
448
|
-
} else if (inputData.pdfMode && (
|
|
449
|
-
await extractInternalPDFText(
|
|
450
|
-
|
|
451
|
-
});
|
|
452
|
-
if (opt.usePDFTextMain) {
|
|
438
|
+
} else if (inputData.pdfMode && (opt.usePDFText.native.main || opt.usePDFText.native.supp || opt.usePDFText.ocr.main || opt.usePDFText.ocr.supp)) {
|
|
439
|
+
await extractInternalPDFText();
|
|
440
|
+
if (inputData.pdfType === 'text' && opt.usePDFText.native.main || inputData.pdfType === 'ocr' && opt.usePDFText.ocr.main) {
|
|
453
441
|
if (inputData.pdfType === 'text') FontCont.enableCleanToNimbusMono = true;
|
|
454
442
|
if (opt.calcSuppFontInfo) await calcSuppFontInfo(ocrAll.pdf);
|
|
455
443
|
}
|
package/js/recognizeConvert.js
CHANGED
|
@@ -533,7 +533,7 @@ export async function recognize(options = {}) {
|
|
|
533
533
|
let existingOCR;
|
|
534
534
|
if (ocrAll['User Upload']) {
|
|
535
535
|
existingOCR = ocrAll['User Upload'];
|
|
536
|
-
} else if (opt.
|
|
536
|
+
} else if (ocrAll.pdf && (inputData.pdfType === 'text' && opt.usePDFText.native.supp || inputData.pdfType === 'ocr' && opt.usePDFText.ocr.supp)) {
|
|
537
537
|
existingOCR = ocrAll.pdf;
|
|
538
538
|
// If the PDF text is not the active data, it is assumed to be for supplemental purposes only.
|
|
539
539
|
forceMainData = ocrAll.pdf !== ocrAll.active;
|
package/package.json
CHANGED
package/scribe.js
CHANGED
|
@@ -79,20 +79,18 @@ const init = async (params) => {
|
|
|
79
79
|
/**
|
|
80
80
|
* Function for extracting text from image and PDF files with a single function call.
|
|
81
81
|
* By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
|
|
82
|
+
* To control how text from PDF files is handled, set the options in the `opt.usePDFText` object.
|
|
82
83
|
* For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
|
|
83
84
|
* @public
|
|
84
85
|
* @param {Parameters<typeof importFiles>[0]} files
|
|
85
86
|
* @param {Array<string>} [langs=['eng']]
|
|
86
87
|
* @param {Parameters<typeof exportData>[0]} [outputFormat='txt']
|
|
87
|
-
* @param {Object} [options]
|
|
88
|
-
* @param {boolean} [options.skipRecPDFTextNative=true] - If the input is a text-native PDF, skip recognition and return the existing text.
|
|
89
|
-
* @param {boolean} [options.skipRecPDFTextOCR=false] - If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text.
|
|
90
88
|
*/
|
|
91
89
|
const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options = {}) => {
|
|
92
90
|
const skipRecPDFTextNative = options?.skipRecPDFTextNative ?? true;
|
|
93
91
|
const skipRecPDFTextOCR = options?.skipRecPDFTextOCR ?? false;
|
|
94
92
|
init({ ocr: true, font: true });
|
|
95
|
-
await importFiles(files
|
|
93
|
+
await importFiles(files);
|
|
96
94
|
if (!inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode) throw new Error('No relevant files to process.');
|
|
97
95
|
const skipRecPDF = inputData.pdfMode && (inputData.pdfType === 'text' && skipRecPDFTextNative || inputData.pdfType === 'ocr' && skipRecPDFTextOCR);
|
|
98
96
|
const skipRecOCR = inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode;
|