scribe.js-ocr 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -2
- package/cli/main.js +12 -46
- package/js/clear.js +3 -3
- package/js/containers/app.js +11 -2
- package/js/containers/dataContainer.js +0 -6
- package/js/containers/fontContainer.js +139 -97
- package/js/containers/imageContainer.js +20 -84
- package/js/debug.js +34 -0
- package/js/export/exportPDF.js +52 -57
- package/js/export/exportRenderHOCR.js +5 -5
- package/js/fontContainerMain.js +95 -108
- package/js/fontEval.js +83 -111
- package/js/generalWorkerMain.js +28 -3
- package/js/global.d.ts +3 -0
- package/js/import/convertPageBlocks.js +9 -0
- package/js/import/convertPageShared.js +13 -7
- package/js/import/import.js +15 -13
- package/js/objects/imageObjects.js +97 -0
- package/js/objects/ocrObjects.js +53 -1
- package/js/recognizeConvert.js +8 -4
- package/js/utils/fontUtils.js +5 -5
- package/js/utils/miscUtils.js +7 -2
- package/js/worker/compareOCRModule.js +279 -81
- package/js/worker/generalWorker.js +98 -28
- package/js/worker/renderWordCanvas.js +14 -29
- package/package.json +1 -1
- package/scribe.js +69 -5
package/js/fontContainerMain.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
checkMultiFontMode,
|
|
3
|
-
|
|
3
|
+
FontCont,
|
|
4
4
|
FontContainerFont,
|
|
5
5
|
loadFont,
|
|
6
6
|
loadFontsFromSource,
|
|
@@ -15,9 +15,9 @@ import { gs } from './generalWorkerMain.js';
|
|
|
15
15
|
*/
|
|
16
16
|
export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
|
|
17
17
|
// Return early if the font set is already loaded, or a superset of the requested set is loaded.
|
|
18
|
-
if (
|
|
18
|
+
if (FontCont.glyphSet === glyphSet || FontCont.glyphSet === 'all' && glyphSet === 'latin') return;
|
|
19
19
|
|
|
20
|
-
|
|
20
|
+
FontCont.glyphSet = glyphSet;
|
|
21
21
|
|
|
22
22
|
// Note: this function is intentionally verbose, and should not be refactored to generate the paths dynamically.
|
|
23
23
|
// Build systems will not be able to resolve the paths if they are generated dynamically.
|
|
@@ -110,14 +110,14 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
|
|
|
110
110
|
NimbusSans: { normal: await nimbusSansNormal, italic: await nimbusSansItalic, bold: await nimbusSansBold },
|
|
111
111
|
};
|
|
112
112
|
|
|
113
|
-
|
|
114
|
-
if (!fontAll.active || (!fontAll.active.NimbusSans.normal.opt && !fontAll.active.NimbusRomNo9L.normal.opt)) fontAll.active = fontAll.raw;
|
|
113
|
+
FontCont.raw = await /** @type {FontContainer} */(/** @type {any} */(loadFontsFromSource(srcObj)));
|
|
115
114
|
|
|
116
115
|
if (typeof process === 'undefined') {
|
|
117
116
|
// This assumes that the scheduler `init` method has at least started.
|
|
118
117
|
if (gs.schedulerReady === null) console.warn('Failed to load fonts to workers as workers have not been initialized yet.');
|
|
119
118
|
await gs.schedulerReady;
|
|
120
|
-
|
|
119
|
+
// Reaching this point presumably means a new glyph set is being loaded, so force the workers to reload the raw fonts.
|
|
120
|
+
await updateFontContWorkerMain({ loadRaw: true });
|
|
121
121
|
}
|
|
122
122
|
|
|
123
123
|
return;
|
|
@@ -144,7 +144,7 @@ export async function loadChiSimFont() {
|
|
|
144
144
|
chiSimSrc = readFile(new URL('../fonts/NotoSansSC-Regular.ttf', import.meta.url)).then((res) => res.buffer);
|
|
145
145
|
}
|
|
146
146
|
|
|
147
|
-
|
|
147
|
+
FontCont.supp.chi_sim = await loadFont('NotoSansSC', 'normal', 'sans', await chiSimSrc, false);
|
|
148
148
|
|
|
149
149
|
chiReadyRes();
|
|
150
150
|
|
|
@@ -152,102 +152,89 @@ export async function loadChiSimFont() {
|
|
|
152
152
|
}
|
|
153
153
|
|
|
154
154
|
/**
|
|
155
|
-
*
|
|
156
|
-
*
|
|
157
|
-
* @param {boolean}
|
|
158
|
-
* @param {boolean} [
|
|
159
|
-
* This should be used when switching from unvalidated to validated optimized fonts.
|
|
155
|
+
* Enable or disable font optimization settings.
|
|
156
|
+
* This function is used rather than exposing the settings using the `opt` object, as these settings exist on the font container in both the main thread and the worker threads.
|
|
157
|
+
* @param {boolean} enableOpt
|
|
158
|
+
* @param {boolean} [forceOpt]
|
|
160
159
|
*/
|
|
161
|
-
export async function enableFontOpt(
|
|
162
|
-
|
|
163
|
-
if (
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
160
|
+
export async function enableFontOpt(enableOpt, forceOpt) {
|
|
161
|
+
let change = false;
|
|
162
|
+
if (enableOpt === true || enableOpt === false) {
|
|
163
|
+
if (FontCont.enableOpt !== enableOpt) {
|
|
164
|
+
change = true;
|
|
165
|
+
FontCont.enableOpt = enableOpt;
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
if (forceOpt === true || forceOpt === false) {
|
|
169
|
+
if (FontCont.forceOpt !== forceOpt) {
|
|
170
|
+
change = true;
|
|
171
|
+
FontCont.forceOpt = forceOpt;
|
|
172
|
+
}
|
|
169
173
|
}
|
|
170
174
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
await setBuiltInFontsWorker(gs.schedulerInner, forceWorkerUpdate);
|
|
174
|
-
} else {
|
|
175
|
-
// const { setFontAll } = await import('./worker/compareOCRModule.js');
|
|
176
|
-
// setFontAll(fontAll);
|
|
175
|
+
if (typeof process === 'undefined' && change) {
|
|
176
|
+
await updateFontContWorkerMain();
|
|
177
177
|
}
|
|
178
178
|
}
|
|
179
179
|
|
|
180
180
|
/**
|
|
181
|
-
*
|
|
182
|
-
* @param {
|
|
183
|
-
*
|
|
181
|
+
* @param {Object} [params]
|
|
182
|
+
* @param {boolean} [params.loadRaw] - By default, raw fonts are loaded if they have not been loaded before.
|
|
183
|
+
* Set `loadRaw` to `true` or `false` to force the raw fonts to be loaded or not loaded, respectively.
|
|
184
|
+
* @param {boolean} [params.loadOpt] - By default, optimized fonts are loaded if they have not been loaded before.
|
|
185
|
+
* Set `loadOpt` to `true` or `false` to force the optimized fonts to be loaded or not loaded, respectively.
|
|
186
|
+
* @param {boolean} [params.updateProps]
|
|
184
187
|
*/
|
|
185
|
-
export async function
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
const opt = fontAll.active.Carlito.normal.opt || fontAll.active.NimbusRomNo9L.normal.opt;
|
|
189
|
-
|
|
190
|
-
const loadedBuiltIn = (!opt && fontAll.loadedBuiltInRawWorker) || (opt && fontAll.loadedBuiltInOptWorker);
|
|
188
|
+
export async function updateFontContWorkerMain(params = {}) {
|
|
189
|
+
const loadRaw = params.loadRaw === true || (params.loadRaw !== false && FontCont.raw && !gs.loadedBuiltInRawWorker);
|
|
190
|
+
const loadOpt = params.loadOpt === true || (params.loadOpt !== false && FontCont.opt && !gs.loadedBuiltInOptWorker);
|
|
191
191
|
|
|
192
192
|
// Load the raw and/or optimized font data into the workers if it has not already been loaded there (or if loading is forced).
|
|
193
193
|
// This assumes that only one version of the raw/optimized fonts ever exists--
|
|
194
194
|
// it does not check whether the current optimized font changed since it was last loaded.
|
|
195
|
-
|
|
195
|
+
for (const [type, load] of [['raw', loadRaw], ['opt', loadOpt]]) {
|
|
196
|
+
if (!load) continue;
|
|
197
|
+
|
|
196
198
|
const resArr = [];
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
Century: {
|
|
207
|
-
normal: fontAll.active.Century.normal.src,
|
|
208
|
-
italic: fontAll.active.Century.italic.src,
|
|
209
|
-
bold: fontAll.active.Century.bold.src,
|
|
210
|
-
},
|
|
211
|
-
Garamond: {
|
|
212
|
-
normal: fontAll.active.Garamond.normal.src,
|
|
213
|
-
italic: fontAll.active.Garamond.italic.src,
|
|
214
|
-
bold: fontAll.active.Garamond.bold.src,
|
|
215
|
-
},
|
|
216
|
-
Palatino: {
|
|
217
|
-
normal: fontAll.active.Palatino.normal.src,
|
|
218
|
-
italic: fontAll.active.Palatino.italic.src,
|
|
219
|
-
bold: fontAll.active.Palatino.bold.src,
|
|
220
|
-
},
|
|
221
|
-
NimbusRomNo9L: {
|
|
222
|
-
normal: fontAll.active.NimbusRomNo9L.normal.src,
|
|
223
|
-
italic: fontAll.active.NimbusRomNo9L.italic.src,
|
|
224
|
-
bold: fontAll.active.NimbusRomNo9L.bold.src,
|
|
225
|
-
},
|
|
226
|
-
NimbusSans: {
|
|
227
|
-
normal: fontAll.active.NimbusSans.normal.src,
|
|
228
|
-
italic: fontAll.active.NimbusSans.italic.src,
|
|
229
|
-
bold: fontAll.active.NimbusSans.bold.src,
|
|
230
|
-
},
|
|
231
|
-
},
|
|
232
|
-
opt,
|
|
233
|
-
});
|
|
234
|
-
resArr.push(res);
|
|
199
|
+
|
|
200
|
+
const input = { opt: type === 'opt', src: {} };
|
|
201
|
+
for (const [key, value] of Object.entries(FontCont[type])) {
|
|
202
|
+
if (!value || !value.normal) continue;
|
|
203
|
+
input.src[key] = {
|
|
204
|
+
normal: value.normal.src,
|
|
205
|
+
};
|
|
206
|
+
if (value.italic) input.src[key].italic = value.italic.src;
|
|
207
|
+
if (value.bold) input.src[key].bold = value.bold.src;
|
|
235
208
|
}
|
|
236
|
-
await Promise.all(resArr);
|
|
237
209
|
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
210
|
+
for (let i = 0; i < gs.schedulerInner.workers.length; i++) {
|
|
211
|
+
const worker = gs.schedulerInner.workers[i];
|
|
212
|
+
const res = worker.loadFontsWorker(input);
|
|
213
|
+
resArr.push(res);
|
|
214
|
+
|
|
215
|
+
// TODO: consider the race condition when `updateFontContWorkerMain` is called multiple times quickly and `loadFontsWorker` is still running.
|
|
216
|
+
if (type === 'opt') {
|
|
217
|
+
gs.loadedBuiltInOptWorker = true;
|
|
218
|
+
} else {
|
|
219
|
+
gs.loadedBuiltInRawWorker = true;
|
|
220
|
+
}
|
|
243
221
|
}
|
|
222
|
+
await Promise.all(resArr);
|
|
244
223
|
}
|
|
245
224
|
|
|
246
225
|
// Sync the font settings in the workers (metrics, default font names, and optimization flags) with `FontCont`
|
|
247
226
|
const resArr = [];
|
|
248
|
-
for (let i = 0; i <
|
|
249
|
-
const worker =
|
|
250
|
-
const res = worker.
|
|
227
|
+
for (let i = 0; i < gs.schedulerInner.workers.length; i++) {
|
|
228
|
+
const worker = gs.schedulerInner.workers[i];
|
|
229
|
+
const res = worker.updateFontContWorker({
|
|
230
|
+
rawMetrics: FontCont.rawMetrics,
|
|
231
|
+
optMetrics: FontCont.optMetrics,
|
|
232
|
+
sansDefaultName: FontCont.sansDefaultName,
|
|
233
|
+
serifDefaultName: FontCont.serifDefaultName,
|
|
234
|
+
defaultFontName: FontCont.defaultFontName,
|
|
235
|
+
enableOpt: FontCont.enableOpt,
|
|
236
|
+
forceOpt: FontCont.forceOpt,
|
|
237
|
+
});
|
|
251
238
|
resArr.push(res);
|
|
252
239
|
}
|
|
253
240
|
await Promise.all(resArr);
|
|
@@ -255,15 +242,15 @@ export async function setBuiltInFontsWorker(scheduler, force = false) {
|
|
|
255
242
|
|
|
256
243
|
/**
|
|
257
244
|
* WIP: Import fonts embedded in PDFs.
|
|
258
|
-
* This function is not currently used.
|
|
245
|
+
* This function is out of date and not currently used.
|
|
259
246
|
* @param {*} scheduler
|
|
260
247
|
*/
|
|
261
248
|
export async function setUploadFontsWorker(scheduler) {
|
|
262
|
-
if (!
|
|
249
|
+
if (!FontCont.active) return;
|
|
263
250
|
|
|
264
251
|
/** @type {Object<string, fontSrcBuiltIn|fontSrcUpload>} */
|
|
265
252
|
const fontsUpload = {};
|
|
266
|
-
for (const [key, value] of Object.entries(
|
|
253
|
+
for (const [key, value] of Object.entries(FontCont.active)) {
|
|
267
254
|
if (!['Carlito', 'Century', 'Garamond', 'Palatino', 'NimbusRomNo9L', 'NimbusSans'].includes(key)) {
|
|
268
255
|
fontsUpload[key] = {
|
|
269
256
|
normal: value?.normal?.src, italic: value?.italic?.src, bold: value?.bold?.src,
|
|
@@ -286,10 +273,18 @@ export async function setUploadFontsWorker(scheduler) {
|
|
|
286
273
|
|
|
287
274
|
// Sync the font settings in the workers (metrics, default font names, and optimization flags) with `FontCont`
|
|
288
275
|
const resArr = [];
|
|
289
|
-
const opt =
|
|
276
|
+
const opt = FontCont.active.Carlito.normal.opt || FontCont.active.NimbusRomNo9L.normal.opt;
|
|
290
277
|
for (let i = 0; i < scheduler.workers.length; i++) {
|
|
291
278
|
const worker = scheduler.workers[i];
|
|
292
|
-
const res = worker.
|
|
279
|
+
const res = worker.updateFontContWorker({
|
|
280
|
+
rawMetrics: FontCont.rawMetrics,
|
|
281
|
+
optMetrics: FontCont.optMetrics,
|
|
282
|
+
sansDefaultName: FontCont.sansDefaultName,
|
|
283
|
+
serifDefaultName: FontCont.serifDefaultName,
|
|
284
|
+
defaultFontName: FontCont.defaultFontName,
|
|
285
|
+
enableOpt: FontCont.enableOpt,
|
|
286
|
+
forceOpt: FontCont.forceOpt,
|
|
287
|
+
});
|
|
293
288
|
resArr.push(res);
|
|
294
289
|
}
|
|
295
290
|
await Promise.all(resArr);
|
|
@@ -307,15 +302,15 @@ export function setDefaultFontAuto(fontMetricsObj) {
|
|
|
307
302
|
|
|
308
303
|
// Change the default font to whichever font type (serif or sans) appears more often
|
|
309
304
|
if ((fontMetricsObj.SerifDefault?.obs || 0) > (fontMetricsObj.SansDefault?.obs || 0)) {
|
|
310
|
-
|
|
305
|
+
FontCont.defaultFontName = 'SerifDefault';
|
|
311
306
|
} else {
|
|
312
|
-
|
|
307
|
+
FontCont.defaultFontName = 'SansDefault';
|
|
313
308
|
}
|
|
314
309
|
|
|
315
310
|
if (gs.schedulerInner) {
|
|
316
311
|
for (let i = 0; i < gs.schedulerInner.workers.length; i++) {
|
|
317
312
|
const worker = gs.schedulerInner.workers[i];
|
|
318
|
-
worker.
|
|
313
|
+
worker.updateFontContWorker({ defaultFontName: FontCont.defaultFontName });
|
|
319
314
|
}
|
|
320
315
|
}
|
|
321
316
|
}
|
|
@@ -342,14 +337,8 @@ export async function optimizeFontContainerFamily(fontFamily, fontMetricsObj) {
|
|
|
342
337
|
|
|
343
338
|
// If there are no statistics to use for optimization, or too few observations (fewer than 200), skip optimization for this family and return `null`.
|
|
344
339
|
// Missing statistics should only occur when `multiFontMode` is true, but a document contains no sans words or no serif words.
|
|
345
|
-
if (!fontMetricsObj[fontMetricsType] || !fontMetricsObj[fontMetricsType][fontFamily.normal.style]) {
|
|
346
|
-
|
|
347
|
-
const normalOptFont = new FontContainerFont(fontFamily.normal.family, fontFamily.normal.style, fontFamily.normal.src, true, opentypeFontArr[0]);
|
|
348
|
-
const italicOptFont = new FontContainerFont(fontFamily.italic.family, fontFamily.italic.style, fontFamily.italic.src, true, opentypeFontArr[1]);
|
|
349
|
-
const boldOptFont = new FontContainerFont(fontFamily.bold.family, fontFamily.bold.style, fontFamily.bold.src, true, opentypeFontArr[2]);
|
|
350
|
-
return {
|
|
351
|
-
normal: await normalOptFont, italic: await italicOptFont, bold: await boldOptFont,
|
|
352
|
-
};
|
|
340
|
+
if (!fontMetricsObj[fontMetricsType] || !fontMetricsObj[fontMetricsType][fontFamily.normal.style] || fontMetricsObj[fontMetricsType][fontFamily.normal.style].obs < 200) {
|
|
341
|
+
return null;
|
|
353
342
|
}
|
|
354
343
|
|
|
355
344
|
const metricsNormal = fontMetricsObj[fontMetricsType][fontFamily.normal.style];
|
|
@@ -360,29 +349,25 @@ export async function optimizeFontContainerFamily(fontFamily, fontMetricsObj) {
|
|
|
360
349
|
});
|
|
361
350
|
|
|
362
351
|
const metricsItalic = fontMetricsObj[fontMetricsType][fontFamily.italic.style];
|
|
363
|
-
/** @type {FontContainerFont|Promise<FontContainerFont>} */
|
|
364
|
-
let italicOptFont;
|
|
365
|
-
if (metricsItalic) {
|
|
352
|
+
/** @type {?FontContainerFont|Promise<FontContainerFont>} */
|
|
353
|
+
let italicOptFont = null;
|
|
354
|
+
if (metricsItalic && metricsItalic.obs >= 200) {
|
|
366
355
|
italicOptFont = gs.scheduler.optimizeFont({ fontData: fontFamily.italic.src, fontMetricsObj: metricsItalic, style: fontFamily.italic.style })
|
|
367
356
|
.then(async (x) => {
|
|
368
357
|
const font = await loadOpentype(x.fontData, x.kerningPairs);
|
|
369
358
|
return new FontContainerFont(fontFamily.italic.family, fontFamily.italic.style, x.fontData, true, font);
|
|
370
359
|
});
|
|
371
|
-
} else {
|
|
372
|
-
const font = await loadOpentype(fontFamily.italic.src, null);
|
|
373
|
-
italicOptFont = new FontContainerFont(fontFamily.italic.family, fontFamily.italic.style, fontFamily.italic.src, true, font);
|
|
374
360
|
}
|
|
375
361
|
|
|
376
362
|
// Bold fonts are not optimized, as we currently have no accurate way to determine if characters are bold within OCR, so do not have bold metrics.
|
|
377
|
-
const boldOptFont = loadOpentype(fontFamily.bold.src, null).then((opentypeFont) => new FontContainerFont(fontFamily.bold.family, fontFamily.bold.style, fontFamily.bold.src, true, opentypeFont));
|
|
378
|
-
|
|
379
363
|
return {
|
|
380
|
-
normal: await normalOptFont, italic: await italicOptFont, bold:
|
|
364
|
+
normal: await normalOptFont, italic: await italicOptFont, bold: null,
|
|
381
365
|
};
|
|
382
366
|
}
|
|
383
367
|
|
|
384
368
|
/**
|
|
385
369
|
* Optimize all fonts.
|
|
370
|
+
* If a font cannot be optimized, then the raw font is returned.
|
|
386
371
|
* @param {Object<string, FontContainerFamilyBuiltIn>} fontPrivate
|
|
387
372
|
* @param {Object.<string, FontMetricsFamily>} fontMetricsObj
|
|
388
373
|
*/
|
|
@@ -396,6 +381,8 @@ export async function optimizeFontContainerAll(fontPrivate, fontMetricsObj) {
|
|
|
396
381
|
|
|
397
382
|
const results = await Promise.all([carlitoPromise, centuryPromise, garamondPromise, palatinoPromise, nimbusRomNo9LPromise, nimbusSansPromise]);
|
|
398
383
|
|
|
384
|
+
if (results.every((x) => x === null)) return null;
|
|
385
|
+
|
|
399
386
|
return {
|
|
400
387
|
Carlito: results[0],
|
|
401
388
|
Century: results[1],
|
package/js/fontEval.js
CHANGED
|
@@ -1,20 +1,21 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { fontMetricsObj, pageMetricsArr } from './containers/dataContainer.js';
|
|
2
|
+
import { FontCont } from './containers/fontContainer.js';
|
|
3
3
|
import { ImageCache } from './containers/imageContainer.js';
|
|
4
4
|
import {
|
|
5
|
-
enableFontOpt,
|
|
6
5
|
loadBuiltInFontsRaw,
|
|
7
6
|
optimizeFontContainerAll, setDefaultFontAuto,
|
|
7
|
+
updateFontContWorkerMain,
|
|
8
8
|
} from './fontContainerMain.js';
|
|
9
9
|
import { gs } from './generalWorkerMain.js';
|
|
10
10
|
|
|
11
11
|
/**
|
|
12
|
-
*
|
|
13
|
-
* @param {
|
|
12
|
+
* Evaluate how well a font matches the provided array of pages.
|
|
13
|
+
* @param {string} font - Name of font family.
|
|
14
14
|
* @param {Array<OcrPage>} pageArr
|
|
15
|
+
* @param {boolean} opt - Whether to use optimized fonts.
|
|
15
16
|
* @param {number} n - Number of words to compare
|
|
16
17
|
*/
|
|
17
|
-
export async function
|
|
18
|
+
export async function evalPagesFont(font, pageArr, opt, n = 500) {
|
|
18
19
|
if (!gs.scheduler) throw new Error('GeneralScheduler must be defined before this function can run.');
|
|
19
20
|
|
|
20
21
|
let metricTotal = 0;
|
|
@@ -25,25 +26,27 @@ export async function evalPageFonts(font, pageArr, n = 500) {
|
|
|
25
26
|
|
|
26
27
|
const imageI = await ImageCache.getBinary(i);
|
|
27
28
|
|
|
28
|
-
// The Node.js canvas package does not currently support
|
|
29
|
+
// The Node.js canvas package does not currently support worker threads
|
|
29
30
|
// https://github.com/Automattic/node-canvas/issues/1394
|
|
30
31
|
let res;
|
|
31
32
|
if (!(typeof process === 'undefined')) {
|
|
32
33
|
const { evalPageFont } = await import('./worker/compareOCRModule.js');
|
|
33
34
|
|
|
34
35
|
res = await evalPageFont({
|
|
35
|
-
font
|
|
36
|
+
font,
|
|
36
37
|
page: pageArr[i],
|
|
37
38
|
binaryImage: imageI,
|
|
38
39
|
pageMetricsObj: pageMetricsArr[i],
|
|
40
|
+
opt,
|
|
39
41
|
});
|
|
40
42
|
// Browser case
|
|
41
43
|
} else {
|
|
42
44
|
res = await gs.scheduler.evalPageFont({
|
|
43
|
-
font
|
|
45
|
+
font,
|
|
44
46
|
page: pageArr[i],
|
|
45
47
|
binaryImage: imageI,
|
|
46
48
|
pageMetricsObj: pageMetricsArr[i],
|
|
49
|
+
opt,
|
|
47
50
|
});
|
|
48
51
|
}
|
|
49
52
|
|
|
@@ -56,28 +59,31 @@ export async function evalPageFonts(font, pageArr, n = 500) {
|
|
|
56
59
|
|
|
57
60
|
/**
|
|
58
61
|
* @param {Array<OcrPage>} pageArr
|
|
62
|
+
* @param {boolean} opt - Whether to use optimized fonts.
|
|
59
63
|
*/
|
|
60
|
-
export async function evaluateFonts(pageArr) {
|
|
61
|
-
const
|
|
62
|
-
|
|
63
|
-
const
|
|
64
|
+
export async function evaluateFonts(pageArr, opt) {
|
|
65
|
+
const evalCarlito = !!(opt ? FontCont.opt?.Carlito : FontCont.raw?.Carlito);
|
|
66
|
+
const evalNimbusSans = !!(opt ? FontCont.opt?.NimbusSans : FontCont.raw?.NimbusSans);
|
|
67
|
+
const evalCentury = !!(opt ? FontCont.opt?.Century : FontCont.raw?.Century);
|
|
68
|
+
const evalPalatino = !!(opt ? FontCont.opt?.Palatino : FontCont.raw?.Palatino);
|
|
69
|
+
const evalGaramond = !!(opt ? FontCont.opt?.Garamond : FontCont.raw?.Garamond);
|
|
70
|
+
const evalNimbusRomNo9L = !!(opt ? FontCont.opt?.NimbusRomNo9L : FontCont.raw?.NimbusRomNo9L);
|
|
64
71
|
|
|
65
72
|
// The browser version runs in parallel using workers, however the Node.js version runs sequentially,
|
|
66
73
|
// as the canvas package does not support workers, and trying to run in parallel causes problems.
|
|
67
74
|
// The logic is the same in both versions.
|
|
68
|
-
let
|
|
69
|
-
let serifMetrics;
|
|
75
|
+
let fontMetricsTmp;
|
|
70
76
|
if (typeof process === 'undefined') {
|
|
71
77
|
const fontMetricsPromises = {
|
|
72
|
-
carlito:
|
|
73
|
-
nimbusSans:
|
|
74
|
-
century:
|
|
75
|
-
palatino:
|
|
76
|
-
garamond:
|
|
77
|
-
nimbusRomNo9L:
|
|
78
|
+
carlito: evalCarlito ? evalPagesFont('Carlito', pageArr, opt) : null,
|
|
79
|
+
nimbusSans: evalNimbusSans ? evalPagesFont('NimbusSans', pageArr, opt) : null,
|
|
80
|
+
century: evalCentury ? evalPagesFont('Century', pageArr, opt) : null,
|
|
81
|
+
palatino: evalPalatino ? evalPagesFont('Palatino', pageArr, opt) : null,
|
|
82
|
+
garamond: evalGaramond ? evalPagesFont('Garamond', pageArr, opt) : null,
|
|
83
|
+
nimbusRomNo9L: evalNimbusRomNo9L ? evalPagesFont('NimbusRomNo9L', pageArr, opt) : null,
|
|
78
84
|
};
|
|
79
85
|
|
|
80
|
-
|
|
86
|
+
fontMetricsTmp = {
|
|
81
87
|
carlito: await fontMetricsPromises.carlito,
|
|
82
88
|
nimbusSans: await fontMetricsPromises.nimbusSans,
|
|
83
89
|
century: await fontMetricsPromises.century,
|
|
@@ -85,46 +91,39 @@ export async function evaluateFonts(pageArr) {
|
|
|
85
91
|
garamond: await fontMetricsPromises.garamond,
|
|
86
92
|
nimbusRomNo9L: await fontMetricsPromises.nimbusRomNo9L,
|
|
87
93
|
};
|
|
88
|
-
|
|
89
|
-
sansMetrics = {
|
|
90
|
-
Carlito: fontMetrics.carlito.metricTotal / fontMetrics.carlito.wordsTotal,
|
|
91
|
-
NimbusSans: fontMetrics.nimbusSans.metricTotal / fontMetrics.nimbusSans.wordsTotal,
|
|
92
|
-
};
|
|
93
|
-
|
|
94
|
-
serifMetrics = {
|
|
95
|
-
Century: fontMetrics.century.metricTotal / fontMetrics.century.wordsTotal,
|
|
96
|
-
Palatino: fontMetrics.palatino.metricTotal / fontMetrics.palatino.wordsTotal,
|
|
97
|
-
Garamond: fontMetrics.garamond.metricTotal / fontMetrics.garamond.wordsTotal,
|
|
98
|
-
NimbusRomNo9L: fontMetrics.nimbusRomNo9L.metricTotal / fontMetrics.nimbusRomNo9L.wordsTotal,
|
|
99
|
-
};
|
|
100
94
|
} else {
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
95
|
+
fontMetricsTmp = {
|
|
96
|
+
carlito: evalCarlito ? await evalPagesFont('Carlito', pageArr, opt) : null,
|
|
97
|
+
nimbusSans: evalNimbusSans ? await evalPagesFont('NimbusSans', pageArr, opt) : null,
|
|
98
|
+
century: evalCentury ? await evalPagesFont('Century', pageArr, opt) : null,
|
|
99
|
+
palatino: evalPalatino ? await evalPagesFont('Palatino', pageArr, opt) : null,
|
|
100
|
+
garamond: evalGaramond ? await evalPagesFont('Garamond', pageArr, opt) : null,
|
|
101
|
+
nimbusRomNo9L: evalNimbusRomNo9L ? await evalPagesFont('NimbusRomNo9L', pageArr, opt) : null,
|
|
108
102
|
};
|
|
103
|
+
}
|
|
109
104
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
105
|
+
const fontMetrics = {
|
|
106
|
+
Carlito: fontMetricsTmp.carlito ? fontMetricsTmp.carlito.metricTotal / fontMetricsTmp.carlito.wordsTotal : null,
|
|
107
|
+
NimbusSans: fontMetricsTmp.nimbusSans ? fontMetricsTmp.nimbusSans.metricTotal / fontMetricsTmp.nimbusSans.wordsTotal : null,
|
|
108
|
+
Century: fontMetricsTmp.century ? fontMetricsTmp.century.metricTotal / fontMetricsTmp.century.wordsTotal : null,
|
|
109
|
+
Palatino: fontMetricsTmp.palatino ? fontMetricsTmp.palatino.metricTotal / fontMetricsTmp.palatino.wordsTotal : null,
|
|
110
|
+
Garamond: fontMetricsTmp.garamond ? fontMetricsTmp.garamond.metricTotal / fontMetricsTmp.garamond.wordsTotal : null,
|
|
111
|
+
NimbusRomNo9L: fontMetricsTmp.nimbusRomNo9L ? fontMetricsTmp.nimbusRomNo9L.metricTotal / fontMetricsTmp.nimbusRomNo9L.wordsTotal : null,
|
|
112
|
+
};
|
|
114
113
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
Palatino: fontMetrics.Palatino.metricTotal / fontMetrics.Palatino.wordsTotal,
|
|
118
|
-
Garamond: fontMetrics.Garamond.metricTotal / fontMetrics.Garamond.wordsTotal,
|
|
119
|
-
NimbusRomNo9L: fontMetrics.NimbusRomNo9L.metricTotal / fontMetrics.NimbusRomNo9L.wordsTotal,
|
|
120
|
-
};
|
|
121
|
-
}
|
|
114
|
+
return fontMetrics;
|
|
115
|
+
}
|
|
122
116
|
|
|
117
|
+
/**
|
|
118
|
+
*
|
|
119
|
+
* @param {Awaited<ReturnType<evaluateFonts>>} fontMetrics
|
|
120
|
+
*/
|
|
121
|
+
const calcBestFonts = (fontMetrics) => {
|
|
123
122
|
let minKeySans = 'NimbusSans';
|
|
124
123
|
let minValueSans = Number.MAX_VALUE;
|
|
125
124
|
|
|
126
|
-
for (const [key, value] of Object.entries(
|
|
127
|
-
if (
|
|
125
|
+
for (const [key, value] of Object.entries(fontMetrics)) {
|
|
126
|
+
if (!['Carlito', 'NimbusSans'].includes(key)) continue;
|
|
128
127
|
if (value < minValueSans) {
|
|
129
128
|
minValueSans = value;
|
|
130
129
|
minKeySans = key;
|
|
@@ -134,8 +133,8 @@ export async function evaluateFonts(pageArr) {
|
|
|
134
133
|
let minKeySerif = 'NimbusRomNo9L';
|
|
135
134
|
let minValueSerif = Number.MAX_VALUE;
|
|
136
135
|
|
|
137
|
-
for (const [key, value] of Object.entries(
|
|
138
|
-
if (
|
|
136
|
+
for (const [key, value] of Object.entries(fontMetrics)) {
|
|
137
|
+
if (!['Century', 'Palatino', 'Garamond', 'NimbusRomNo9L'].includes(key)) continue;
|
|
139
138
|
if (value < minValueSerif) {
|
|
140
139
|
minValueSerif = value;
|
|
141
140
|
minKeySerif = key;
|
|
@@ -143,12 +142,10 @@ export async function evaluateFonts(pageArr) {
|
|
|
143
142
|
}
|
|
144
143
|
|
|
145
144
|
return {
|
|
146
|
-
sansMetrics,
|
|
147
|
-
serifMetrics,
|
|
148
145
|
minKeySans,
|
|
149
146
|
minKeySerif,
|
|
150
147
|
};
|
|
151
|
-
}
|
|
148
|
+
};
|
|
152
149
|
|
|
153
150
|
/**
|
|
154
151
|
* Runs font optimization and validation. Sets `FontCont` defaults to best fonts,
|
|
@@ -164,24 +161,19 @@ export async function evaluateFonts(pageArr) {
|
|
|
164
161
|
export async function runFontOptimization(ocrArr) {
|
|
165
162
|
await loadBuiltInFontsRaw();
|
|
166
163
|
|
|
167
|
-
const fontRaw = fontAll.getContainer('raw');
|
|
168
|
-
|
|
169
164
|
const calculateOpt = fontMetricsObj && Object.keys(fontMetricsObj).length > 0;
|
|
170
165
|
|
|
171
166
|
let enableOptSerif = false;
|
|
172
167
|
let enableOptSans = false;
|
|
173
168
|
|
|
169
|
+
let optimizeFontContainerAllPromise;
|
|
174
170
|
if (calculateOpt) {
|
|
175
171
|
setDefaultFontAuto(fontMetricsObj);
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
// If this ever comes up in actual usage and is a problem, then the behavior can be changed for that specific case.
|
|
182
|
-
if (!ImageCache.inputModes.image && !ImageCache.inputModes.pdf) {
|
|
183
|
-
fontAll.opt = { ...fontAll.optInitial };
|
|
184
|
-
}
|
|
172
|
+
|
|
173
|
+
optimizeFontContainerAllPromise = optimizeFontContainerAll(FontCont.raw, fontMetricsObj)
|
|
174
|
+
.then((res) => {
|
|
175
|
+
FontCont.opt = res;
|
|
176
|
+
});
|
|
185
177
|
}
|
|
186
178
|
|
|
187
179
|
// If image data exists, select the correct font by comparing to the image.
|
|
@@ -189,70 +181,50 @@ export async function runFontOptimization(ocrArr) {
|
|
|
189
181
|
// Evaluate default fonts using up to 5 pages.
|
|
190
182
|
const pageNum = Math.min(ImageCache.pageCount, 5);
|
|
191
183
|
|
|
192
|
-
// Set raw font in workers
|
|
193
|
-
await enableFontOpt(false);
|
|
194
|
-
|
|
195
184
|
// This step needs to happen here as all fonts must be registered before initializing the canvas.
|
|
196
185
|
if (!(typeof process === 'undefined')) {
|
|
186
|
+
await optimizeFontContainerAllPromise;
|
|
197
187
|
const { initCanvasNode } = await import('./worker/compareOCRModule.js');
|
|
198
188
|
await initCanvasNode();
|
|
199
189
|
}
|
|
200
190
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
DebugData.evalRaw = evalRaw;
|
|
191
|
+
FontCont.rawMetrics = await evaluateFonts(ocrArr.slice(0, pageNum), false);
|
|
192
|
+
const bestMetricsRaw = calcBestFonts(FontCont.rawMetrics);
|
|
204
193
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
await
|
|
194
|
+
await optimizeFontContainerAllPromise;
|
|
195
|
+
if (FontCont.opt && Object.keys(FontCont.opt).length > 0) {
|
|
196
|
+
await updateFontContWorkerMain();
|
|
208
197
|
|
|
209
|
-
|
|
198
|
+
FontCont.optMetrics = await evaluateFonts(ocrArr.slice(0, pageNum), true);
|
|
210
199
|
|
|
211
|
-
|
|
200
|
+
const bestMetricsOpt = calcBestFonts(FontCont.optMetrics);
|
|
212
201
|
|
|
213
202
|
// The default font for both the optimized and unoptimized versions is set to the same font.
|
|
214
203
|
// This ensures that switching on/off "font optimization" does not change the font, which would be confusing.
|
|
215
|
-
if (
|
|
216
|
-
fontAll.sansDefaultName = evalOpt.minKeySans;
|
|
204
|
+
if (FontCont.optMetrics[bestMetricsOpt.minKeySans] < FontCont.rawMetrics[bestMetricsRaw.minKeySans]) {
|
|
217
205
|
enableOptSans = true;
|
|
206
|
+
FontCont.sansDefaultName = bestMetricsOpt.minKeySans;
|
|
218
207
|
} else {
|
|
219
|
-
|
|
208
|
+
FontCont.sansDefaultName = bestMetricsRaw.minKeySans;
|
|
220
209
|
}
|
|
221
210
|
|
|
222
211
|
// Repeat for serif fonts
|
|
223
|
-
if (
|
|
224
|
-
fontAll.serifDefaultName = evalOpt.minKeySerif;
|
|
212
|
+
if (FontCont.optMetrics[bestMetricsOpt.minKeySerif] < FontCont.rawMetrics[bestMetricsRaw.minKeySerif]) {
|
|
225
213
|
enableOptSerif = true;
|
|
214
|
+
FontCont.serifDefaultName = bestMetricsOpt.minKeySerif;
|
|
226
215
|
} else {
|
|
227
|
-
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
// Create final optimized font object.
|
|
231
|
-
// The final optimized font is set to either the initial optimized font or the raw font depending on what fits better.
|
|
232
|
-
// Make shallow copy to allow for changing individual fonts without copying the entire object.
|
|
233
|
-
fontAll.opt = { ...fontAll.optInitial };
|
|
234
|
-
|
|
235
|
-
if (!enableOptSans) {
|
|
236
|
-
fontAll.opt.Carlito = fontRaw.Carlito;
|
|
237
|
-
fontAll.opt.NimbusSans = fontRaw.NimbusSans;
|
|
238
|
-
}
|
|
239
|
-
|
|
240
|
-
if (!enableOptSerif) {
|
|
241
|
-
fontAll.opt.Century = fontRaw.Century;
|
|
242
|
-
fontAll.opt.Garamond = fontRaw.Garamond;
|
|
243
|
-
fontAll.opt.NimbusRomNo9L = fontRaw.NimbusRomNo9L;
|
|
244
|
-
fontAll.opt.Palatino = fontRaw.Palatino;
|
|
216
|
+
FontCont.serifDefaultName = bestMetricsRaw.minKeySerif;
|
|
245
217
|
}
|
|
246
218
|
} else {
|
|
247
|
-
|
|
248
|
-
|
|
219
|
+
FontCont.sansDefaultName = bestMetricsRaw.minKeySans;
|
|
220
|
+
FontCont.serifDefaultName = bestMetricsRaw.minKeySerif;
|
|
249
221
|
}
|
|
250
|
-
}
|
|
251
222
|
|
|
252
|
-
|
|
253
|
-
await enableFontOpt(true, false, true);
|
|
223
|
+
FontCont.enableOpt = enableOptSerif || enableOptSans;
|
|
254
224
|
|
|
255
|
-
|
|
225
|
+
// Send updated state to all workers.
|
|
226
|
+
await updateFontContWorkerMain();
|
|
227
|
+
}
|
|
256
228
|
|
|
257
|
-
return enableOpt;
|
|
229
|
+
return FontCont.enableOpt;
|
|
258
230
|
}
|