scribe.js-ocr 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -191,6 +191,8 @@ export function sortInputFiles(files) {
191
191
  * @param {Object} [options]
192
192
  * @param {boolean} [options.extractPDFTextNative=false] - Extract text from text-native PDF documents.
193
193
  * @param {boolean} [options.extractPDFTextOCR=false] - Extract text from image-native PDF documents with existing OCR text layers.
194
+ * @param {boolean} [options.extractPDFTextImage=false] - Extract text from image-native PDF documents with no existing OCR layer.
195
+ * This option exists because documents may still contain some text even if they are determined to be image-native (for example, scanned documents with a text-native header).
194
196
  * @returns
195
197
  */
196
198
  export async function importFiles(files, options = {}) {
@@ -199,6 +201,7 @@ export async function importFiles(files, options = {}) {
199
201
 
200
202
  const extractPDFTextNative = options?.extractPDFTextNative ?? false;
201
203
  const extractPDFTextOCR = options?.extractPDFTextOCR ?? false;
204
+ const extractPDFTextImage = options?.extractPDFTextImage ?? false;
202
205
 
203
206
  /** @type {Array<File|FileNode|ArrayBuffer>} */
204
207
  let pdfFiles = [];
@@ -440,7 +443,7 @@ export async function importFiles(files, options = {}) {
440
443
  }
441
444
  });
442
445
  } else if (extractPDFTextNative || extractPDFTextOCR) {
443
- await extractInternalPDFText({ setActive: true, extractPDFTextNative, extractPDFTextOCR });
446
+ await extractInternalPDFText({ setActive: true, extractPDFTextNative, extractPDFTextOCR, extractPDFTextImage });
444
447
  }
445
448
  }
446
449
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "scribe.js-ocr",
3
- "version": "0.2.2",
3
+ "version": "0.2.3",
4
4
  "description": "High-quality OCR and text extraction for images and PDFs.",
5
5
  "main": "scribe.js",
6
6
  "directories": {