scribe.js-ocr 0.4.0 → 0.4.1

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.
package/docs/API.md CHANGED
@@ -41,6 +41,7 @@ Initialize the program and optionally pre-load resources.
41
41
 
42
42
  Function for extracting text from image and PDF files with a single function call.
43
43
  By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
44
+ To control how text from PDF files is handled, set the options in the `opt.usePDFText` object.
44
45
  For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
45
46
 
46
47
  ### Parameters
@@ -48,10 +49,7 @@ For more control, use `init`, `importFiles`, `recognize`, and `exportData` separ
48
49
  * `files`  
49
50
  * `langs` **[Array][23]<[string][24]>** (optional, default `['eng']`)
50
51
  * `outputFormat` (optional, default `'txt'`)
51
- * `options` **[Object][21]?** (optional, default `{}`)
52
-
53
- * `options.skipRecPDFTextNative` **[boolean][22]** If the input is a text-native PDF, skip recognition and return the existing text. (optional, default `true`)
54
- * `options.skipRecPDFTextOCR` **[boolean][22]** If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text. (optional, default `false`)
52
+ * `options` (optional, default `{}`)
55
53
 
56
54
  ## writeDebugImages
57
55
 
@@ -115,12 +113,6 @@ Alternatively, for `File` objects (browser) and file paths (Node.js), a single a
115
113
  ### Parameters
116
114
 
117
115
  * `files` **([Array][23]\<File> | FileList | [Array][23]<[string][24]> | [SortedInputFiles][13])**&#x20;
118
- * `options` **[Object][21]?** (optional, default `{}`)
119
-
120
- * `options.extractPDFTextNative` **[boolean][22]** Extract text from text-native PDF documents. (optional, default `false`)
121
- * `options.extractPDFTextOCR` **[boolean][22]** Extract text from image-native PDF documents with existing OCR text layers. (optional, default `false`)
122
- * `options.extractPDFTextImage` **[boolean][22]** Extract text from image-native PDF documents with no existing OCR layer.
123
- This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header). (optional, default `false`)
124
116
 
125
117
  ## recognizePage
126
118
 
@@ -62,9 +62,23 @@ export class opt {
62
62
 
63
63
  static calcSuppFontInfo = false;
64
64
 
65
- static usePDFTextSupp = true;
66
-
67
- static usePDFTextMain = true;
65
+ /**
66
+ * How to use PDF text data extracted from input PDFs (if any).
67
+ * The `native` option controls how native text data is used (i.e. visible text rendered by the PDF viewer),
68
+ * while the `ocr` option controls how OCR text data is used (i.e. invisible text printed over an image).
69
+ * If `main` is true, then the data will be used as the primary data source.
70
+ * If `supp` is true, then the data will be used as a supplemental data source (may be used to correct errors in the primary data source).
71
+ */
72
+ static usePDFText = {
73
+ native: {
74
+ supp: true,
75
+ main: true,
76
+ },
77
+ ocr: {
78
+ supp: true,
79
+ main: false,
80
+ },
81
+ };
68
82
 
69
83
  /**
70
84
  * Number of workers to use. Must be set prior to initialization.
@@ -1,4 +1,4 @@
1
- import { inputData } from './containers/app.js';
1
+ import { inputData, opt } from './containers/app.js';
2
2
  import { ocrAll, ocrAllRaw } from './containers/dataContainer.js';
3
3
  import { ImageCache } from './containers/imageContainer.js';
4
4
  import { convertOCR } from './recognizeConvert.js';
@@ -68,19 +68,11 @@ const extractInternalPDFTextRaw = async () => {
68
68
 
69
69
  /**
70
70
  * Extract and parse text from currently loaded PDF.
71
- * @param {Object} [options]
72
- * @param {boolean} [options.extractPDFTextNative=true] - Extract text from text-native PDF documents.
73
- * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
74
- * @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
75
- * This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
76
- * @param {boolean} [options.setActive=false] - Set the active OCR data to the extracted text.
77
71
  */
78
- export const extractInternalPDFText = async (options = {}) => {
79
- const extractPDFTextNative = options?.extractPDFTextNative ?? true;
80
- const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
81
- const extractPDFTextImage = options?.extractPDFTextImage ?? false;
82
-
83
- const setActive = options?.setActive ?? false;
72
+ export const extractInternalPDFText = async () => {
73
+ const extractPDFTextNative = opt.usePDFText.native.main || opt.usePDFText.native.supp;
74
+ const extractPDFTextOCR = opt.usePDFText.ocr.main || opt.usePDFText.ocr.supp;
75
+ const extractPDFTextImage = false;
84
76
 
85
77
  const res = await extractInternalPDFTextRaw();
86
78
 
@@ -95,7 +87,7 @@ export const extractInternalPDFText = async (options = {}) => {
95
87
 
96
88
  ocrAll.pdf = Array(ImageCache.pageCount);
97
89
 
98
- if (setActive) {
90
+ if (inputData.pdfType === 'text' && opt.usePDFText.native.main || inputData.pdfType === 'ocr' && opt.usePDFText.ocr.main) {
99
91
  ocrAllRaw.active = ocrAllRaw.pdf;
100
92
  ocrAll.active = ocrAll.pdf;
101
93
  }
@@ -192,21 +192,11 @@ export function sortInputFiles(files) {
192
192
  * Alternatively, for `File` objects (browser) and file paths (Node.js), a single array can be provided, which is sorted based on extension.
193
193
  * @public
194
194
  * @param {Array<File>|FileList|Array<string>|SortedInputFiles} files
195
- * @param {Object} [options]
196
- * @param {boolean} [options.extractPDFTextNative=false] - Extract text from text-native PDF documents.
197
- * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
198
- * @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
199
- * This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
200
- * @returns
201
195
  */
202
- export async function importFiles(files, options = {}) {
196
+ export async function importFiles(files) {
203
197
  clearData();
204
198
  gs.getGeneralScheduler();
205
199
 
206
- const extractPDFTextNative = options?.extractPDFTextNative ?? false;
207
- const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
208
- const extractPDFTextImage = options?.extractPDFTextImage ?? false;
209
-
210
200
  /** @type {Array<File|FileNode|ArrayBuffer>} */
211
201
  let pdfFiles = [];
212
202
  /** @type {Array<File|FileNode|ArrayBuffer>} */
@@ -445,11 +435,9 @@ export async function importFiles(files, options = {}) {
445
435
  await runFontOptimization(ocrAll.active);
446
436
  }
447
437
  });
448
- } else if (inputData.pdfMode && (extractPDFTextNative || extractPDFTextOCR)) {
449
- await extractInternalPDFText({
450
- setActive: opt.usePDFTextMain, extractPDFTextNative, extractPDFTextOCR, extractPDFTextImage,
451
- });
452
- if (opt.usePDFTextMain) {
438
+ } else if (inputData.pdfMode && (opt.usePDFText.native.main || opt.usePDFText.native.supp || opt.usePDFText.ocr.main || opt.usePDFText.ocr.supp)) {
439
+ await extractInternalPDFText();
440
+ if (inputData.pdfType === 'text' && opt.usePDFText.native.main || inputData.pdfType === 'ocr' && opt.usePDFText.ocr.main) {
453
441
  if (inputData.pdfType === 'text') FontCont.enableCleanToNimbusMono = true;
454
442
  if (opt.calcSuppFontInfo) await calcSuppFontInfo(ocrAll.pdf);
455
443
  }
@@ -533,7 +533,7 @@ export async function recognize(options = {}) {
533
533
  let existingOCR;
534
534
  if (ocrAll['User Upload']) {
535
535
  existingOCR = ocrAll['User Upload'];
536
- } else if (opt.usePDFTextSupp && ocrAll.pdf) {
536
+ } else if (ocrAll.pdf && (inputData.pdfType === 'text' && opt.usePDFText.native.supp || inputData.pdfType === 'ocr' && opt.usePDFText.ocr.supp)) {
537
537
  existingOCR = ocrAll.pdf;
538
538
  // If the PDF text is not the active data, it is assumed to be for supplemental purposes only.
539
539
  forceMainData = ocrAll.pdf !== ocrAll.active;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "scribe.js-ocr",
3
- "version": "0.4.0",
3
+ "version": "0.4.1",
4
4
  "description": "High-quality OCR and text extraction for images and PDFs.",
5
5
  "main": "scribe.js",
6
6
  "directories": {
package/scribe.js CHANGED
@@ -79,20 +79,18 @@ const init = async (params) => {
79
79
  /**
80
80
  * Function for extracting text from image and PDF files with a single function call.
81
81
  * By default, existing text content is extracted for text-native PDF files; otherwise text is extracted using OCR.
82
+ * To control how text from PDF files is handled, set the options in the `opt.usePDFText` object.
82
83
  * For more control, use `init`, `importFiles`, `recognize`, and `exportData` separately.
83
84
  * @public
84
85
  * @param {Parameters<typeof importFiles>[0]} files
85
86
  * @param {Array<string>} [langs=['eng']]
86
87
  * @param {Parameters<typeof exportData>[0]} [outputFormat='txt']
87
- * @param {Object} [options]
88
- * @param {boolean} [options.skipRecPDFTextNative=true] - If the input is a text-native PDF, skip recognition and return the existing text.
89
- * @param {boolean} [options.skipRecPDFTextOCR=false] - If the input is an image-native PDF with existing OCR layer, skip recognition and return the existing text.
90
88
  */
91
89
  const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options = {}) => {
92
90
  const skipRecPDFTextNative = options?.skipRecPDFTextNative ?? true;
93
91
  const skipRecPDFTextOCR = options?.skipRecPDFTextOCR ?? false;
94
92
  init({ ocr: true, font: true });
95
- await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });
93
+ await importFiles(files);
96
94
  if (!inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode) throw new Error('No relevant files to process.');
97
95
  const skipRecPDF = inputData.pdfMode && (inputData.pdfType === 'text' && skipRecPDFTextNative || inputData.pdfType === 'ocr' && skipRecPDFTextOCR);
98
96
  const skipRecOCR = inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode;