scribe.js-ocr 0.7.4 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/scribe.js +2 -0
- package/js/clear.js +5 -6
- package/js/containers/dataContainer.js +0 -3
- package/js/containers/fontContainer.js +51 -39
- package/js/export/export.js +20 -5
- package/js/export/writeHocr.js +5 -5
- package/js/fontContainerMain.js +42 -42
- package/js/fontEval.js +12 -12
- package/js/fontStatistics.js +86 -90
- package/js/generalWorkerMain.js +4 -0
- package/js/global.d.ts +22 -4
- package/js/import/convertPageAbbyy.js +8 -1
- package/js/import/convertPageStext.js +1 -1
- package/js/import/import.js +89 -45
- package/js/import/importOCR.js +27 -33
- package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
- package/js/objects/layoutObjects.js +37 -0
- package/js/objects/ocrObjects.js +42 -0
- package/js/recognizeConvert.js +21 -8
- package/js/utils/miscUtils.js +27 -6
- package/js/worker/compareOCRModule.js +7 -7
- package/js/worker/generalWorker.js +5 -5
- package/js/worker/optimizeFontModule.js +16 -16
- package/package.json +6 -3
package/js/utils/miscUtils.js
CHANGED
|
@@ -201,6 +201,13 @@ export function calcLang(str) {
|
|
|
201
201
|
*/
|
|
202
202
|
export async function readOcrFile(file) {
|
|
203
203
|
if (file instanceof ArrayBuffer) {
|
|
204
|
+
const fileUint8Array = new Uint8Array(file);
|
|
205
|
+
|
|
206
|
+
const isGzipped = fileUint8Array[0] === 0x1F && fileUint8Array[1] === 0x8B;
|
|
207
|
+
if (isGzipped) {
|
|
208
|
+
const pako = await import('../../lib/pako.esm.min.js');
|
|
209
|
+
file = pako.inflate(file)?.buffer;
|
|
210
|
+
}
|
|
204
211
|
const decoder = new TextDecoder('utf-8');
|
|
205
212
|
return decoder.decode(file);
|
|
206
213
|
}
|
|
@@ -210,7 +217,7 @@ export async function readOcrFile(file) {
|
|
|
210
217
|
|
|
211
218
|
// The `typeof process` condition is necessary to avoid error in Node.js versions <20, where `File` is not defined.
|
|
212
219
|
if (typeof process === 'undefined' && file instanceof File) {
|
|
213
|
-
if (/\.gz$/i.test(file.name)) {
|
|
220
|
+
if (/\.gz|\.scribe$/i.test(file.name)) {
|
|
214
221
|
return (readTextFileGz(file));
|
|
215
222
|
}
|
|
216
223
|
return (readTextFile(file));
|
|
@@ -361,18 +368,32 @@ export const reduceEvalMetrics = (evalMetricsArr) => evalMetricsArr.reduce((acc,
|
|
|
361
368
|
});
|
|
362
369
|
|
|
363
370
|
/**
|
|
364
|
-
* Delete all properties from `obj
|
|
365
|
-
*
|
|
371
|
+
* Delete all properties from `obj`.
|
|
372
|
+
* This should be used instead of `obj = {}` to avoid creating a new object.
|
|
366
373
|
* @param {Object} obj
|
|
367
|
-
* @param {Object} [obj2={}]
|
|
368
374
|
*/
|
|
369
|
-
export function
|
|
375
|
+
export function clearObjectProperties(obj) {
|
|
370
376
|
for (const prop in obj) {
|
|
371
377
|
if (Object.hasOwnProperty.call(obj, prop)) {
|
|
372
378
|
delete obj[prop];
|
|
373
379
|
}
|
|
374
380
|
}
|
|
375
|
-
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
/**
|
|
384
|
+
* Version of `Object.assign` that only assigns properties that are not `undefined`.
|
|
385
|
+
* @param {Object} target - The target object to assign properties to.
|
|
386
|
+
* @param {...Object} sources - The source objects to assign properties from.
|
|
387
|
+
*/
|
|
388
|
+
export function objectAssignDefined(target, ...sources) {
|
|
389
|
+
for (const source of sources) {
|
|
390
|
+
for (const key in source) {
|
|
391
|
+
if (Object.hasOwnProperty.call(source, key) && source[key] !== undefined) {
|
|
392
|
+
target[key] = source[key];
|
|
393
|
+
}
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
return target;
|
|
376
397
|
}
|
|
377
398
|
|
|
378
399
|
// Sans/serif lookup for common font families. These should not include spaces or underscores--multi-word font names should be concatenated.
|
|
package/js/worker/compareOCRModule.js
CHANGED
|
@@ -1286,18 +1286,18 @@ export async function evalPageBase({
|
|
|
1286
1286
|
export async function evalPageFont({
|
|
1287
1287
|
page, binaryImage, pageMetricsObj, font, opt = false,
|
|
1288
1288
|
}) {
|
|
1289
|
-
const enableOptSave = FontCont.enableOpt;
|
|
1290
|
-
const forceOptSave = FontCont.forceOpt;
|
|
1289
|
+
const enableOptSave = FontCont.state.enableOpt;
|
|
1290
|
+
const forceOptSave = FontCont.state.forceOpt;
|
|
1291
1291
|
|
|
1292
1292
|
// Allowing the font to be set here allows for better performance during font optimization compared to using the `enableFontOpt` function.
|
|
1293
1293
|
// This is because the `enableFontOpt` function requires a response from the main thread and *every* worker before completing, which leads to non-trivial waiting time.
|
|
1294
1294
|
if (opt === true) {
|
|
1295
1295
|
if (!FontCont.opt) throw new Error('Optimized fonts requested but not defined.');
|
|
1296
|
-
FontCont.forceOpt = true;
|
|
1296
|
+
FontCont.state.forceOpt = true;
|
|
1297
1297
|
} else if (opt === false) {
|
|
1298
1298
|
if (!FontCont.raw) throw new Error('Raw fonts requested but not defined.');
|
|
1299
|
-
FontCont.enableOpt = false;
|
|
1300
|
-
FontCont.forceOpt = false;
|
|
1299
|
+
FontCont.state.enableOpt = false;
|
|
1300
|
+
FontCont.state.forceOpt = false;
|
|
1301
1301
|
}
|
|
1302
1302
|
|
|
1303
1303
|
/**
|
|
@@ -1329,8 +1329,8 @@ export async function evalPageFont({
|
|
|
1329
1329
|
page, binaryImage, pageMetricsObj, func: transformLineFont,
|
|
1330
1330
|
});
|
|
1331
1331
|
|
|
1332
|
-
FontCont.enableOpt = enableOptSave;
|
|
1333
|
-
FontCont.forceOpt = forceOptSave;
|
|
1332
|
+
FontCont.state.enableOpt = enableOptSave;
|
|
1333
|
+
FontCont.state.forceOpt = forceOptSave;
|
|
1334
1334
|
|
|
1335
1335
|
return res;
|
|
1336
1336
|
}
|
|
package/js/worker/generalWorker.js
CHANGED
|
@@ -369,13 +369,13 @@ async function loadFontsWorker({ src, opt }) {
|
|
|
369
369
|
async function updateFontContWorker({
|
|
370
370
|
rawMetrics, optMetrics, defaultFontName, sansDefaultName, serifDefaultName, enableOpt, forceOpt,
|
|
371
371
|
}) {
|
|
372
|
-
if (sansDefaultName) FontCont.sansDefaultName = sansDefaultName;
|
|
373
|
-
if (serifDefaultName) FontCont.serifDefaultName = serifDefaultName;
|
|
374
|
-
if (defaultFontName) FontCont.defaultFontName = defaultFontName;
|
|
372
|
+
if (sansDefaultName) FontCont.state.sansDefaultName = sansDefaultName;
|
|
373
|
+
if (serifDefaultName) FontCont.state.serifDefaultName = serifDefaultName;
|
|
374
|
+
if (defaultFontName) FontCont.state.defaultFontName = defaultFontName;
|
|
375
375
|
if (rawMetrics) FontCont.rawMetrics = rawMetrics;
|
|
376
376
|
if (optMetrics) FontCont.optMetrics = optMetrics;
|
|
377
|
-
if (enableOpt === true || enableOpt === false) FontCont.enableOpt = enableOpt;
|
|
378
|
-
if (forceOpt === true || forceOpt === false) FontCont.forceOpt = forceOpt;
|
|
377
|
+
if (enableOpt === true || enableOpt === false) FontCont.state.enableOpt = enableOpt;
|
|
378
|
+
if (forceOpt === true || forceOpt === false) FontCont.state.forceOpt = forceOpt;
|
|
379
379
|
}
|
|
380
380
|
|
|
381
381
|
async function compareOCRPageImpWrap(args) {
|
|
package/js/worker/optimizeFontModule.js
CHANGED
|
@@ -76,18 +76,18 @@ function transformGlyph(glyph, func, transX = false, transY = false) {
|
|
|
76
76
|
* Calculate pair kerning adjustments for font given provided metrics.
|
|
77
77
|
*
|
|
78
78
|
* @param {opentype.Font} font
|
|
79
|
-
* @param {
|
|
79
|
+
* @param {CharMetricsFont} charMetricsObj
|
|
80
80
|
* @param {number} xHeight
|
|
81
81
|
* @param {StyleLookup} styleLookup
|
|
82
82
|
*/
|
|
83
|
-
const calculateKerningPairs = (font, fontMetricsObj, xHeight, styleLookup) => {
|
|
83
|
+
const calculateKerningPairs = (font, charMetricsObj, xHeight, styleLookup) => {
|
|
84
84
|
const fontKerningObj = {};
|
|
85
85
|
|
|
86
86
|
// Kerning is limited to +/-10% of the em size for most pairs. Anything beyond this is likely not correct.
|
|
87
87
|
const maxKern = Math.round(font.unitsPerEm * 0.1);
|
|
88
88
|
const minKern = maxKern * -1;
|
|
89
89
|
|
|
90
|
-
for (const [key, value] of Object.entries(fontMetricsObj.kerning)) {
|
|
90
|
+
for (const [key, value] of Object.entries(charMetricsObj.kerning)) {
|
|
91
91
|
// Do not adjust pair kerning for italic "ff".
|
|
92
92
|
// Given the amount of overlap between these glyphs, this metric is rarely accurate.
|
|
93
93
|
if (key === '102,102' && ['italic', 'boldItalic'].includes(styleLookup)) continue;
|
|
@@ -113,8 +113,8 @@ const calculateKerningPairs = (font, fontMetricsObj, xHeight, styleLookup) => {
|
|
|
113
113
|
// Calculate target (measured) space between two characters.
|
|
114
114
|
// This is calculated as the average between two measurements.
|
|
115
115
|
// This did not exist in an older version of the code, so this should be optional and skipped if the data is not present.
|
|
116
|
-
if (fontMetricsObj.kerning2) {
|
|
117
|
-
const value2 = fontMetricsObj.kerning2[key];
|
|
116
|
+
if (charMetricsObj.kerning2) {
|
|
117
|
+
const value2 = charMetricsObj.kerning2[key];
|
|
118
118
|
if (value2) {
|
|
119
119
|
const fontKern2 = Math.round(value2 * xHeight);
|
|
120
120
|
spaceTarget = Math.round((fontKern1 + fontKern2) / 2);
|
|
@@ -151,7 +151,7 @@ const calculateKerningPairs = (font, fontMetricsObj, xHeight, styleLookup) => {
|
|
|
151
151
|
* Creates optimized version of font based on metrics provided.
|
|
152
152
|
* @param {Object} params
|
|
153
153
|
* @param {string|ArrayBuffer} params.fontData
|
|
154
|
-
* @param {
|
|
154
|
+
* @param {CharMetricsFont} params.charMetricsObj
|
|
155
155
|
* @param {StyleLookup} params.style -
|
|
156
156
|
* @param {boolean} [params.adjustAllLeftBearings] - Edit left bearings for all characters based on provided metrics.
|
|
157
157
|
* @param {boolean} [params.standardizeSize] - Scale such that size of 'o' is 0.47x em size.
|
|
@@ -160,7 +160,7 @@ const calculateKerningPairs = (font, fontMetricsObj, xHeight, styleLookup) => {
|
|
|
160
160
|
* If `false`, only font-level transformations (adjusting em size and standardizing 'o' height) are performed.
|
|
161
161
|
*/
|
|
162
162
|
export async function optimizeFont({
|
|
163
|
-
fontData,
|
|
163
|
+
fontData, charMetricsObj, style, adjustAllLeftBearings = false, standardizeSize = false, targetEmSize = null, transGlyphs = true,
|
|
164
164
|
}) {
|
|
165
165
|
/** @type {opentype.Font} */
|
|
166
166
|
const workingFont = typeof (fontData) === 'string' ? await opentype.load(fontData) : opentype.parse(fontData, { lowMemory: false });
|
|
@@ -202,7 +202,7 @@ export async function optimizeFont({
|
|
|
202
202
|
|
|
203
203
|
// If no glyph-level transformations are requested, return early.
|
|
204
204
|
if (!transGlyphs) {
|
|
205
|
-
workingFont.kerningPairs = calculateKerningPairs(workingFont,
|
|
205
|
+
workingFont.kerningPairs = calculateKerningPairs(workingFont, charMetricsObj, xHeight, style);
|
|
206
206
|
|
|
207
207
|
return { fontData: workingFont.toArrayBuffer(), kerningPairs: workingFont.kerningPairs };
|
|
208
208
|
}
|
|
@@ -210,7 +210,7 @@ export async function optimizeFont({
|
|
|
210
210
|
oGlyph = workingFont.charToGlyph('o').getMetrics();
|
|
211
211
|
xHeight = oGlyph.yMax - oGlyph.yMin;
|
|
212
212
|
|
|
213
|
-
const heightCapsBelievable =
|
|
213
|
+
const heightCapsBelievable = charMetricsObj.obsCaps >= 10 && charMetricsObj.heightCaps >= 1.1 && charMetricsObj.heightCaps < 2;
|
|
214
214
|
|
|
215
215
|
const fontAscHeight = workingFont.charToGlyph('A').getMetrics().yMax;
|
|
216
216
|
|
|
@@ -224,7 +224,7 @@ export async function optimizeFont({
|
|
|
224
224
|
// console.log("workingFontRightBearingMedian: " + workingFontRightBearingMedian);
|
|
225
225
|
|
|
226
226
|
// Adjust character width and advance
|
|
227
|
-
for (const [key, value] of Object.entries(
|
|
227
|
+
for (const [key, value] of Object.entries(charMetricsObj.width)) {
|
|
228
228
|
// 33 is the first latin glyph (excluding space which is 32)
|
|
229
229
|
if (parseInt(key) < 33) { continue; }
|
|
230
230
|
|
|
@@ -305,7 +305,7 @@ export async function optimizeFont({
|
|
|
305
305
|
|
|
306
306
|
// Adjust height for capital letters (if heightCaps is believable)
|
|
307
307
|
if (heightCapsBelievable) {
|
|
308
|
-
const capsMult = xHeight *
|
|
308
|
+
const capsMult = xHeight * charMetricsObj.heightCaps / fontAscHeight;
|
|
309
309
|
for (const key of [...Array(26).keys()].map((x) => x + 65)) {
|
|
310
310
|
const charLit = String.fromCharCode(key);
|
|
311
311
|
|
|
@@ -320,8 +320,8 @@ export async function optimizeFont({
|
|
|
320
320
|
// This purposefully does not include numbers, as those are normalized differently.
|
|
321
321
|
const upperAsc = ['A', 'B', 'D', 'E', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'R', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];
|
|
322
322
|
const upperAscCodes = upperAsc.map((x) => String(x.charCodeAt(0)));
|
|
323
|
-
const charHeightKeys = Object.keys(
|
|
324
|
-
const heightAscArr = Object.values(
|
|
323
|
+
const charHeightKeys = Object.keys(charMetricsObj.height);
|
|
324
|
+
const heightAscArr = Object.values(charMetricsObj.height).filter((element, index) => upperAscCodes.includes(charHeightKeys[index]));
|
|
325
325
|
|
|
326
326
|
// At least 10 observations are required to adjust from the default.
|
|
327
327
|
if (heightAscArr.length >= 10) {
|
|
@@ -332,7 +332,7 @@ export async function optimizeFont({
|
|
|
332
332
|
// TODO: Extend similar logic to apply to other descenders such as "p" and "q"
|
|
333
333
|
// Adjust height of capital J (which often has a height greater than other capital letters)
|
|
334
334
|
// All height from "J" above that of "A" is assumed to occur under the baseline
|
|
335
|
-
const actJMult = Math.max(round6(
|
|
335
|
+
const actJMult = Math.max(round6(charMetricsObj.height[74]) / charHeightA, 0);
|
|
336
336
|
const fontJMetrics = workingFont.charToGlyph('J').getMetrics();
|
|
337
337
|
const fontAMetrics = workingFont.charToGlyph('A').getMetrics();
|
|
338
338
|
const fontJMult = Math.max((fontJMetrics.yMax - fontJMetrics.yMin) / (fontAMetrics.yMax - fontAMetrics.yMin), 1);
|
|
@@ -358,7 +358,7 @@ export async function optimizeFont({
|
|
|
358
358
|
for (let i = 0; i < descAdjArr.length; i++) {
|
|
359
359
|
const charI = descAdjArr[i];
|
|
360
360
|
const charICode = charI.charCodeAt(0);
|
|
361
|
-
const actMult = Math.max(
|
|
361
|
+
const actMult = Math.max(charMetricsObj.height[charICode] / charMetricsObj.height[97], 0);
|
|
362
362
|
const metrics = workingFont.charToGlyph(charI).getMetrics();
|
|
363
363
|
const fontMult = (metrics.yMax - metrics.yMin) / (fontAMetrics.yMax - fontAMetrics.yMin);
|
|
364
364
|
const actFontMult = actMult / fontMult;
|
|
@@ -391,7 +391,7 @@ export async function optimizeFont({
|
|
|
391
391
|
}
|
|
392
392
|
}
|
|
393
393
|
|
|
394
|
-
workingFont.kerningPairs = calculateKerningPairs(workingFont,
|
|
394
|
+
workingFont.kerningPairs = calculateKerningPairs(workingFont, charMetricsObj, xHeight, style);
|
|
395
395
|
|
|
396
396
|
// Append suffix to avoid naming conflict with raw font.
|
|
397
397
|
// This is necessary for the Node.js version due to quirks with node-canvas.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "scribe.js-ocr",
|
|
3
|
-
"version": "0.7.4",
|
|
3
|
+
"version": "0.8.0",
|
|
4
4
|
"description": "High-quality OCR and text extraction for images and PDFs.",
|
|
5
5
|
"main": "scribe.js",
|
|
6
6
|
"directories": {
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
"eslint-config-airbnb-base": "^15.0.0",
|
|
18
18
|
"eslint-plugin-import": "^2.29.1",
|
|
19
19
|
"eslint-plugin-jsdoc": "^50.6.2",
|
|
20
|
-
"express": "^4.
|
|
20
|
+
"express": "^4.21.2",
|
|
21
21
|
"karma": "^6.4.4",
|
|
22
22
|
"karma-chrome-launcher": "^3.2.0",
|
|
23
23
|
"karma-firefox-launcher": "^2.1.3",
|
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
"karma-mocha-reporter": "^2.2.5",
|
|
26
26
|
"mocha": "^10.6.0",
|
|
27
27
|
"npm-run-all": "^4.1.5",
|
|
28
|
-
"wait-on": "^
|
|
28
|
+
"wait-on": "^8.0.2"
|
|
29
29
|
},
|
|
30
30
|
"scripts": {
|
|
31
31
|
"docs": "documentation build scribe.js -f md --access public > docs/API.md",
|
|
@@ -38,6 +38,9 @@
|
|
|
38
38
|
"test:cli": "mocha tests/cli",
|
|
39
39
|
"wait": "wait-on http://localhost:3031/"
|
|
40
40
|
},
|
|
41
|
+
"bin": {
|
|
42
|
+
"scribe": "cli/scribe.js"
|
|
43
|
+
},
|
|
41
44
|
"repository": {
|
|
42
45
|
"type": "git",
|
|
43
46
|
"url": "git+https://github.com/scribeocr/scribe.js"
|