scribe.js-ocr 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/js/containers/imageContainer.js +2 -2
- package/js/export/exportDebugCsv.js +4 -4
- package/js/generalWorkerMain.js +1 -1
- package/js/import/convertPageBlocks.js +1 -1
- package/js/import/import.js +3 -1
- package/js/import/importOCR.js +9 -15
- package/js/recognizeConvert.js +14 -2
- package/js/worker/compareOCRModule.js +13 -4
- package/js/worker/generalWorker.js +6 -6
- package/package.json +2 -2
- package/scribe.js +3 -1
package/js/containers/imageContainer.js
CHANGED
|
@@ -254,7 +254,7 @@ export class ImageCache {
|
|
|
254
254
|
* @returns
|
|
255
255
|
*/
|
|
256
256
|
static #initMuPDFScheduler = async (numWorkers = 3) => {
|
|
257
|
-
const Tesseract = typeof process === 'undefined' ? (await import('../../tess/tesseract.esm.min.js')).default : await import('tesseract.js');
|
|
257
|
+
const Tesseract = typeof process === 'undefined' ? (await import('../../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
|
|
258
258
|
const scheduler = await Tesseract.createScheduler();
|
|
259
259
|
const workersPromiseArr = range(1, numWorkers).map(async () => {
|
|
260
260
|
const w = await initMuPDFWorker();
|
|
@@ -300,7 +300,7 @@ export class ImageCache {
|
|
|
300
300
|
page: n + 1, dpi, color, skipText: skipTextMode,
|
|
301
301
|
}).then((res) => new ImageWrapper(n, res, color ? 'color' : 'gray'));
|
|
302
302
|
}
|
|
303
|
-
throw new Error('
|
|
303
|
+
throw new Error('Attempted to render image without image input provided.');
|
|
304
304
|
};
|
|
305
305
|
|
|
306
306
|
/**
|
package/js/export/exportDebugCsv.js
CHANGED
|
@@ -22,10 +22,10 @@ const escapeCSVField = (field) => {
|
|
|
22
22
|
};
|
|
23
23
|
|
|
24
24
|
/**
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
25
|
+
* Converts an array of objects with atomic properties (string, number, boolean) to a CSV string.
|
|
26
|
+
* @param {Array<Object>} data - The array of data objects.
|
|
27
|
+
* @returns {string} - The CSV string.
|
|
28
|
+
*/
|
|
29
29
|
export const convertToCSV = (data) => {
|
|
30
30
|
if (data.length === 0) {
|
|
31
31
|
return '';
|
package/js/generalWorkerMain.js
CHANGED
|
@@ -198,7 +198,7 @@ export class gs {
|
|
|
198
198
|
workerN = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 6);
|
|
199
199
|
}
|
|
200
200
|
|
|
201
|
-
const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('tesseract.js');
|
|
201
|
+
const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
|
|
202
202
|
|
|
203
203
|
gs.schedulerInner = await Tesseract.createScheduler();
|
|
204
204
|
gs.schedulerInner.workers = new Array(workerN);
|
package/js/import/convertPageBlocks.js
CHANGED
|
@@ -9,7 +9,7 @@ import { getTextScript } from '../utils/miscUtils.js';
|
|
|
9
9
|
|
|
10
10
|
/**
|
|
11
11
|
* @param {Object} params
|
|
12
|
-
* @param {Array<import('tesseract.js').Block>} params.ocrBlocks
|
|
12
|
+
* @param {Array<import('@scribe.js/tesseract.js').Block>} params.ocrBlocks
|
|
13
13
|
* @param {number} params.n
|
|
14
14
|
* @param {dims} params.pageDims
|
|
15
15
|
* @param {number} params.rotateAngle - The angle that the input image is rotated prior to recognition.
|
package/js/import/import.js
CHANGED
|
@@ -465,6 +465,8 @@ export async function importFilesSupp(files, ocrName) {
|
|
|
465
465
|
|
|
466
466
|
const ocrData = await importOCRFiles(ocrFilesAll);
|
|
467
467
|
|
|
468
|
+
const scribeMode = ocrData.scribeMode;
|
|
469
|
+
|
|
468
470
|
const pageCountHOCR = ocrData.hocrRaw.length;
|
|
469
471
|
|
|
470
472
|
// If both OCR data and image data are present, confirm they have the same number of pages
|
|
@@ -478,5 +480,5 @@ export async function importFilesSupp(files, ocrName) {
|
|
|
478
480
|
if (ocrData.abbyyMode) format = 'abbyy';
|
|
479
481
|
if (ocrData.stextMode) format = 'stext';
|
|
480
482
|
|
|
481
|
-
convertOCRAll(ocrData.hocrRaw, false, format, ocrName);
|
|
483
|
+
await convertOCRAll(ocrData.hocrRaw, false, format, ocrName, scribeMode);
|
|
482
484
|
}
|
package/js/import/importOCR.js
CHANGED
|
@@ -18,13 +18,11 @@ export async function importOCRFiles(ocrFilesAll) {
|
|
|
18
18
|
// In the case of 1 HOCR file
|
|
19
19
|
const singleHOCRMode = ocrFilesAll.length === 1;
|
|
20
20
|
|
|
21
|
-
let hocrStrStart =
|
|
22
|
-
let hocrStrEnd = '';
|
|
21
|
+
let hocrStrStart = null;
|
|
23
22
|
let abbyyMode = false;
|
|
24
23
|
let stextMode = false;
|
|
25
24
|
let scribeMode = false;
|
|
26
25
|
|
|
27
|
-
let hocrArrPages;
|
|
28
26
|
let pageCountHOCR;
|
|
29
27
|
let hocrRaw;
|
|
30
28
|
/** @type {?Object.<string, FontMetricsFamily>} */
|
|
@@ -47,20 +45,16 @@ export async function importOCRFiles(ocrFilesAll) {
|
|
|
47
45
|
stextMode = !!node2 && !!/<document name/.test(node2);
|
|
48
46
|
|
|
49
47
|
if (abbyyMode) {
|
|
50
|
-
|
|
48
|
+
hocrRaw = hocrStrAll.split(/(?=<page)/).slice(1);
|
|
51
49
|
} else if (stextMode) {
|
|
52
|
-
|
|
50
|
+
hocrRaw = hocrStrAll.split(/(?=<page)/).slice(1);
|
|
53
51
|
} else {
|
|
54
|
-
hocrStrStart
|
|
55
|
-
|
|
56
|
-
|
|
52
|
+
// `hocrStrStart` will be missing for individual HOCR pages created with Tesseract.js or the Tesseract API.
|
|
53
|
+
hocrStrStart = hocrStrAll.match(/[\s\S]*?<body>/)?.[0];
|
|
54
|
+
hocrRaw = splitHOCRStr(hocrStrAll);
|
|
57
55
|
}
|
|
58
56
|
|
|
59
|
-
pageCountHOCR =
|
|
60
|
-
hocrRaw = Array(pageCountHOCR);
|
|
61
|
-
for (let i = 0; i < pageCountHOCR; i++) {
|
|
62
|
-
hocrRaw[i] = hocrStrStart + hocrArrPages[i] + hocrStrEnd;
|
|
63
|
-
}
|
|
57
|
+
pageCountHOCR = hocrRaw.length;
|
|
64
58
|
} else {
|
|
65
59
|
pageCountHOCR = ocrFilesAll.length;
|
|
66
60
|
hocrRaw = Array(pageCountHOCR);
|
|
@@ -76,11 +70,11 @@ export async function importOCRFiles(ocrFilesAll) {
|
|
|
76
70
|
}
|
|
77
71
|
}
|
|
78
72
|
|
|
79
|
-
if (!abbyyMode && !stextMode &&
|
|
73
|
+
if (!abbyyMode && !stextMode && hocrStrStart) {
|
|
80
74
|
const getMeta = (name) => {
|
|
81
75
|
const regex = new RegExp(`<meta name=["']${name}["'][^<]+`, 'i');
|
|
82
76
|
|
|
83
|
-
const nodeStr =
|
|
77
|
+
const nodeStr = hocrStrStart.match(regex)?.[0];
|
|
84
78
|
if (!nodeStr) return null;
|
|
85
79
|
const contentStr = nodeStr.match(/content=["']([\s\S]+?)(?=["']\s{0,5}\/?>)/i)?.[1];
|
|
86
80
|
if (!contentStr) return null;
|
package/js/recognizeConvert.js
CHANGED
|
@@ -24,7 +24,19 @@ import { replaceObjectProperties } from './utils/miscUtils.js';
|
|
|
24
24
|
*/
|
|
25
25
|
export const compareOCRPage = async (pageA, pageB, options) => {
|
|
26
26
|
const func = typeof process !== 'undefined' ? (await import('./worker/compareOCRModule.js')).compareOCRPageImp : gs.scheduler.compareOCRPageImp;
|
|
27
|
-
|
|
27
|
+
|
|
28
|
+
// Some combinations of options require the image to be provided, and some do not.
|
|
29
|
+
// We skip sending the image for those that do not, as in addition to helping performance,
|
|
30
|
+
// this is also necessary to run basic comparison scripts (e.g. benchmarking accuracy) without providing the image.
|
|
31
|
+
// TODO: Rework the options so this works better with types.
|
|
32
|
+
// At present TypeScript has no way of knowing that certain combinations of options go with each other.
|
|
33
|
+
const mode = options?.mode || 'stats';
|
|
34
|
+
const evalConflicts = options?.evalConflicts ?? true;
|
|
35
|
+
const supplementComp = options?.supplementComp ?? false;
|
|
36
|
+
const skipImage = (mode === 'stats' && !supplementComp) || (mode === 'comb' && !evalConflicts && !supplementComp);
|
|
37
|
+
|
|
38
|
+
const binaryImage = skipImage ? null : await ImageCache.getBinary(pageA.n);
|
|
39
|
+
|
|
28
40
|
const pageMetricsObj = pageMetricsArr[pageA.n];
|
|
29
41
|
return func({
|
|
30
42
|
pageA, pageB, binaryImage, pageMetricsObj, options,
|
|
@@ -51,7 +63,7 @@ export const evalOCRPage = async (params) => {
|
|
|
51
63
|
* Compare two sets of OCR data.
|
|
52
64
|
* @param {Array<OcrPage>} ocrA
|
|
53
65
|
* @param {Array<OcrPage>} ocrB
|
|
54
|
-
* @param {Parameters<import('./worker/compareOCRModule.js').compareOCRPageImp>[0]['options']} options
|
|
66
|
+
* @param {Parameters<import('./worker/compareOCRModule.js').compareOCRPageImp>[0]['options']} [options]
|
|
55
67
|
*/
|
|
56
68
|
export const compareOCR = async (ocrA, ocrB, options) => {
|
|
57
69
|
/** @type {Parameters<typeof compareOCRPage>[2]} */
|
package/js/worker/compareOCRModule.js
CHANGED
|
@@ -486,10 +486,19 @@ async function penalizeWord(wordObjs) {
|
|
|
486
486
|
export async function compareOCRPageImp({
|
|
487
487
|
pageA, pageB, binaryImage, pageMetricsObj, options = {},
|
|
488
488
|
}) {
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
489
|
+
// The `binaryImage` argument is not sent for certain operations, which do not require it.
|
|
490
|
+
// For example, running a basic comparison between a page and the ground truth does not require having the image.
|
|
491
|
+
// The types do not currently reflect this, so this should be reworked at some point.
|
|
492
|
+
/** @type {?ImageBitmap} */
|
|
493
|
+
let binaryImageBit = null;
|
|
494
|
+
let imageUpscaled = false;
|
|
495
|
+
let imageRotated = false;
|
|
496
|
+
|
|
497
|
+
if (binaryImage) {
|
|
498
|
+
binaryImageBit = binaryImage.imageBitmap || await getImageBitmap(binaryImage.src);
|
|
499
|
+
imageUpscaled = binaryImage.upscaled;
|
|
500
|
+
imageRotated = binaryImage.rotated;
|
|
501
|
+
}
|
|
493
502
|
|
|
494
503
|
const mode = options?.mode === undefined ? 'stats' : options?.mode;
|
|
495
504
|
const editConf = options?.editConf === undefined ? false : options?.editConf;
|
package/js/worker/generalWorker.js
CHANGED
|
@@ -17,7 +17,7 @@ import { optimizeFont } from './optimizeFontModule.js';
|
|
|
17
17
|
// import Tesseract from "../../tess/tesseract.esm.min.js";
|
|
18
18
|
const browserMode = typeof process === 'undefined';
|
|
19
19
|
|
|
20
|
-
const Tesseract = browserMode ? (await import('../../tess/tesseract.esm.min.js')).default : await import('tesseract.js');
|
|
20
|
+
const Tesseract = browserMode ? (await import('../../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
|
|
21
21
|
|
|
22
22
|
const defaultConfigs = {
|
|
23
23
|
// TODO: Add back support for multiple PSM modes.
|
|
@@ -135,7 +135,7 @@ export const recognizeAndConvert = async ({
|
|
|
135
135
|
|
|
136
136
|
const keepItalic = oemCurrent === 0;
|
|
137
137
|
|
|
138
|
-
const ocrBlocks = /** @type {Array<import('tesseract.js').Block>} */(res1.data.blocks);
|
|
138
|
+
const ocrBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res1.data.blocks);
|
|
139
139
|
|
|
140
140
|
const res2 = await convertPageBlocks({
|
|
141
141
|
ocrBlocks, n, pageDims, rotateAngle: angle, keepItalic,
|
|
@@ -184,14 +184,14 @@ export const recognizeAndConvert2 = async ({
|
|
|
184
184
|
let resLegacy;
|
|
185
185
|
let resLSTM;
|
|
186
186
|
if (options.lstm && options.legacy) {
|
|
187
|
-
const legacyBlocks = /** @type {Array<import('tesseract.js').Block>} */(res0.data.blocks);
|
|
187
|
+
const legacyBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res0.data.blocks);
|
|
188
188
|
resLegacy = await convertPageBlocks({
|
|
189
189
|
ocrBlocks: legacyBlocks, n, pageDims, rotateAngle: angle, keepItalic: true, upscale: options.upscale,
|
|
190
190
|
});
|
|
191
191
|
(async () => {
|
|
192
192
|
const res1 = await resArr[1];
|
|
193
193
|
|
|
194
|
-
const lstmBlocks = /** @type {Array<import('tesseract.js').Block>} */(res1.data.blocks);
|
|
194
|
+
const lstmBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res1.data.blocks);
|
|
195
195
|
resLSTM = await convertPageBlocks({
|
|
196
196
|
ocrBlocks: lstmBlocks, n, pageDims, rotateAngle: angle, keepItalic: false, upscale: options.upscale,
|
|
197
197
|
});
|
|
@@ -201,12 +201,12 @@ export const recognizeAndConvert2 = async ({
|
|
|
201
201
|
postMessage({ data: xB, id: `${id}b` });
|
|
202
202
|
})();
|
|
203
203
|
} else if (!options.lstm && options.legacy) {
|
|
204
|
-
const legacyBlocks = /** @type {Array<import('tesseract.js').Block>} */(res0.data.blocks);
|
|
204
|
+
const legacyBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res0.data.blocks);
|
|
205
205
|
resLegacy = await convertPageBlocks({
|
|
206
206
|
ocrBlocks: legacyBlocks, n, pageDims, rotateAngle: angle, keepItalic: true, upscale: options.upscale,
|
|
207
207
|
});
|
|
208
208
|
} else if (options.lstm && !options.legacy) {
|
|
209
|
-
const lstmBlocks = /** @type {Array<import('tesseract.js').Block>} */(res0.data.blocks);
|
|
209
|
+
const lstmBlocks = /** @type {Array<import('@scribe.js/tesseract.js').Block>} */(res0.data.blocks);
|
|
210
210
|
resLSTM = await convertPageBlocks({
|
|
211
211
|
ocrBlocks: lstmBlocks, n, pageDims, rotateAngle: angle, keepItalic: false, upscale: options.upscale,
|
|
212
212
|
});
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "scribe.js-ocr",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.2.2",
|
|
4
4
|
"description": "High-quality OCR and text extraction for images and PDFs.",
|
|
5
5
|
"main": "scribe.js",
|
|
6
6
|
"directories": {
|
|
@@ -52,7 +52,7 @@
|
|
|
52
52
|
"canvas": "^2.11.2",
|
|
53
53
|
"commander": "^11.1.0",
|
|
54
54
|
"puppeteer": "^22.13.0",
|
|
55
|
-
"tesseract.js": "
|
|
55
|
+
"@scribe.js/tesseract.js": "^5.0.5",
|
|
56
56
|
"web-worker": "~1.2.0"
|
|
57
57
|
}
|
|
58
58
|
}
|
package/scribe.js
CHANGED
|
@@ -11,7 +11,7 @@ import { ImageCache } from './js/containers/imageContainer.js';
|
|
|
11
11
|
import coords from './js/coordinates.js';
|
|
12
12
|
import { drawDebugImages } from './js/debug.js';
|
|
13
13
|
import { download, exportData } from './js/export/export.js';
|
|
14
|
-
import { writeDebugCsv } from './js/export/exportDebugCsv.js';
|
|
14
|
+
import { writeDebugCsv, convertToCSV } from './js/export/exportDebugCsv.js';
|
|
15
15
|
import { extractSingleTableContent } from './js/export/exportWriteTabular.js';
|
|
16
16
|
import { loadBuiltInFontsRaw, enableFontOpt } from './js/fontContainerMain.js';
|
|
17
17
|
import { gs } from './js/generalWorkerMain.js';
|
|
@@ -131,6 +131,8 @@ class utils {
|
|
|
131
131
|
// Misc utils
|
|
132
132
|
static calcBoxOverlap = calcBoxOverlap;
|
|
133
133
|
|
|
134
|
+
static convertToCSV = convertToCSV;
|
|
135
|
+
|
|
134
136
|
static replaceSmartQuotes = replaceSmartQuotes;
|
|
135
137
|
|
|
136
138
|
static getRandomAlphanum = getRandomAlphanum;
|