scribe.js-ocr 0.7.4 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/scribe.js +2 -0
- package/js/clear.js +5 -6
- package/js/containers/dataContainer.js +0 -3
- package/js/containers/fontContainer.js +51 -39
- package/js/export/export.js +20 -5
- package/js/export/writeHocr.js +5 -5
- package/js/fontContainerMain.js +42 -42
- package/js/fontEval.js +12 -12
- package/js/fontStatistics.js +86 -90
- package/js/generalWorkerMain.js +4 -0
- package/js/global.d.ts +22 -4
- package/js/import/convertPageAbbyy.js +8 -1
- package/js/import/convertPageStext.js +1 -1
- package/js/import/import.js +89 -45
- package/js/import/importOCR.js +27 -33
- package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
- package/js/objects/layoutObjects.js +37 -0
- package/js/objects/ocrObjects.js +42 -0
- package/js/recognizeConvert.js +21 -8
- package/js/utils/miscUtils.js +27 -6
- package/js/worker/compareOCRModule.js +7 -7
- package/js/worker/generalWorker.js +5 -5
- package/js/worker/optimizeFontModule.js +16 -16
- package/package.json +6 -3
package/cli/scribe.js
CHANGED
package/js/clear.js
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import { inputData } from './containers/app.js';
|
|
2
2
|
import {
|
|
3
3
|
convertPageWarn,
|
|
4
|
-
fontMetricsObj,
|
|
5
4
|
layoutDataTables,
|
|
6
5
|
layoutRegions,
|
|
7
6
|
ocrAll,
|
|
@@ -10,18 +9,18 @@ import {
|
|
|
10
9
|
} from './containers/dataContainer.js';
|
|
11
10
|
import { FontCont } from './containers/fontContainer.js';
|
|
12
11
|
import { ImageCache } from './containers/imageContainer.js';
|
|
13
|
-
import {
|
|
12
|
+
import { clearObjectProperties } from './utils/miscUtils.js';
|
|
14
13
|
|
|
15
14
|
export function clearData() {
|
|
16
15
|
inputData.clear();
|
|
17
|
-
|
|
18
|
-
|
|
16
|
+
clearObjectProperties(ocrAll);
|
|
17
|
+
ocrAll.active = [];
|
|
18
|
+
clearObjectProperties(ocrAllRaw);
|
|
19
|
+
ocrAllRaw.active = [];
|
|
19
20
|
layoutRegions.pages.length = 0;
|
|
20
21
|
layoutDataTables.pages.length = 0;
|
|
21
22
|
pageMetricsArr.length = 0;
|
|
22
23
|
convertPageWarn.length = 0;
|
|
23
24
|
ImageCache.clear();
|
|
24
|
-
// Clear optimized font data and reset fontAll to raw data.
|
|
25
|
-
replaceObjectProperties(fontMetricsObj);
|
|
26
25
|
FontCont.clear();
|
|
27
26
|
}
|
|
@@ -1,9 +1,6 @@
|
|
|
1
1
|
// This file contains various objects that are imported by other modules.
|
|
2
2
|
// Everything here is essentially a global variable; none of them are technically "containers".
|
|
3
3
|
|
|
4
|
-
/** @type {Object.<string, FontMetricsFamily>} */
|
|
5
|
-
export const fontMetricsObj = {};
|
|
6
|
-
|
|
7
4
|
export class layoutRegions {
|
|
8
5
|
/** @type {Array<LayoutPage>} */
|
|
9
6
|
static pages = [];
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
// Node.js case
|
|
7
7
|
import opentype from '../../lib/opentype.module.js';
|
|
8
|
-
import { determineSansSerif, getStyleLookup } from '../utils/miscUtils.js';
|
|
8
|
+
import { determineSansSerif, getStyleLookup, clearObjectProperties } from '../utils/miscUtils.js';
|
|
9
9
|
import { ca } from '../canvasAdapter.js';
|
|
10
10
|
|
|
11
11
|
if (typeof process === 'object') {
|
|
@@ -21,7 +21,7 @@ if (typeof process === 'object') {
|
|
|
21
21
|
|
|
22
22
|
/**
|
|
23
23
|
* Checks whether `multiFontMode` should be enabled or disabled.
|
|
24
|
-
* @param {Object.<string,
|
|
24
|
+
* @param {Object.<string, CharMetricsFamily>} charMetricsObj
|
|
25
25
|
*
|
|
26
26
|
* Usually (including when the built-in OCR engine is used) we will have metrics for individual font families,
|
|
27
27
|
* which are used to optimize the appropriate fonts ("multiFontMode" is `true` in this case).
|
|
@@ -29,12 +29,12 @@ if (typeof process === 'object') {
|
|
|
29
29
|
* but no font identification information for most or all words.
|
|
30
30
|
* If this is encountered the "default" metric is applied to the default font ("multiFontMode" is `false` in this case).
|
|
31
31
|
*/
|
|
32
|
-
export function checkMultiFontMode(
|
|
32
|
+
export function checkMultiFontMode(charMetricsObj) {
|
|
33
33
|
let defaultFontObs = 0;
|
|
34
34
|
let namedFontObs = 0;
|
|
35
|
-
if (
|
|
36
|
-
if (
|
|
37
|
-
if (
|
|
35
|
+
if (charMetricsObj.Default?.obs) { defaultFontObs += (charMetricsObj.Default?.obs || 0); }
|
|
36
|
+
if (charMetricsObj.SerifDefault?.obs) { namedFontObs += (charMetricsObj.SerifDefault?.obs || 0); }
|
|
37
|
+
if (charMetricsObj.SansDefault?.obs) { namedFontObs += (charMetricsObj.SansDefault?.obs || 0); }
|
|
38
38
|
|
|
39
39
|
return namedFontObs > defaultFontObs;
|
|
40
40
|
}
|
|
@@ -250,33 +250,43 @@ export class FontCont {
|
|
|
250
250
|
chi_sim: null,
|
|
251
251
|
};
|
|
252
252
|
|
|
253
|
-
/**
|
|
254
|
-
|
|
253
|
+
/**
|
|
254
|
+
* This object contains all data that is saved and restored from intermediate .scribe files.
|
|
255
|
+
* Anything outside of this object is not saved or restored.
|
|
256
|
+
* @type {FontState}
|
|
257
|
+
*/
|
|
258
|
+
static state = {
|
|
259
|
+
/** Optimized fonts will be used when believed to improve quality. */
|
|
260
|
+
enableOpt: false,
|
|
255
261
|
|
|
256
|
-
|
|
257
|
-
|
|
262
|
+
/** Optimized fonts will always be used when they exist, even if believed to reduce quality. */
|
|
263
|
+
forceOpt: false,
|
|
258
264
|
|
|
259
|
-
|
|
260
|
-
|
|
265
|
+
/**
|
|
266
|
+
* If `false`, 'Courier' will not be cleaned to Nimbus Mono.
|
|
267
|
+
* This setting is useful because Tesseract sometimes misidentifies fonts as Courier, and when not the document default, Nimbus Mono is almost always incorrect.
|
|
268
|
+
* Even with this setting `false`, Nimbus Mono will still be used when the font is exactly 'NimbusMono' and Nimbus Mono can still be the document default font.
|
|
269
|
+
*/
|
|
270
|
+
enableCleanToNimbusMono: false,
|
|
261
271
|
|
|
262
|
-
|
|
263
|
-
static optMetrics = null;
|
|
272
|
+
defaultFontName: 'SerifDefault',
|
|
264
273
|
|
|
265
|
-
|
|
274
|
+
serifDefaultName: 'NimbusRoman',
|
|
266
275
|
|
|
267
|
-
|
|
276
|
+
sansDefaultName: 'NimbusSans',
|
|
268
277
|
|
|
269
|
-
|
|
278
|
+
glyphSet: null,
|
|
270
279
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
* This setting is useful because Tesseract sometimes misidentifies fonts as Courier, and when not the document default, Nimbus Mono is almost always incorrect.
|
|
274
|
-
* Even with this setting `false`, Nimbus Mono will still be used when the font is exactly 'NimbusMono' and Nimbus Mono can still be the document default font.
|
|
275
|
-
*/
|
|
276
|
-
static enableCleanToNimbusMono = false;
|
|
280
|
+
/** @type {Object.<string, CharMetricsFamily>} */
|
|
281
|
+
charMetrics: {},
|
|
277
282
|
|
|
278
|
-
|
|
279
|
-
|
|
283
|
+
};
|
|
284
|
+
|
|
285
|
+
/** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
|
|
286
|
+
static rawMetrics = null;
|
|
287
|
+
|
|
288
|
+
/** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
|
|
289
|
+
static optMetrics = null;
|
|
280
290
|
|
|
281
291
|
/**
|
|
282
292
|
* Load fonts from an ArrayBuffer containing arbitrary font data.
|
|
@@ -344,15 +354,15 @@ export class FontCont {
|
|
|
344
354
|
const raw = FontCont.raw?.[family]?.normal;
|
|
345
355
|
if (!raw) return false;
|
|
346
356
|
const opt = FontCont.opt?.[family]?.normal;
|
|
347
|
-
if (opt && FontCont.forceOpt) {
|
|
357
|
+
if (opt && FontCont.state.forceOpt) {
|
|
348
358
|
return true;
|
|
349
359
|
// If optimized fonts are enabled (but not forced), the optimized version of a font will be used if:
|
|
350
360
|
// (1) The optimized version exists
|
|
351
361
|
// (2) The optimized version has a better metric (so quality should improve).
|
|
352
362
|
// (3) The optimized version of the default sans/serif font also has a better metric.
|
|
353
363
|
// This last condition avoids font optimization being enabled in the UI when it only improves an unused font.
|
|
354
|
-
} if (opt && FontCont.enableOpt) {
|
|
355
|
-
const defaultFamily = raw.type === 'serif' ? FontCont.serifDefaultName : FontCont.sansDefaultName;
|
|
364
|
+
} if (opt && FontCont.state.enableOpt) {
|
|
365
|
+
const defaultFamily = raw.type === 'serif' ? FontCont.state.serifDefaultName : FontCont.state.sansDefaultName;
|
|
356
366
|
|
|
357
367
|
const rawMetricDefault = FontCont.rawMetrics?.[defaultFamily];
|
|
358
368
|
const optMetricDefault = FontCont.optMetrics?.[defaultFamily];
|
|
@@ -375,7 +385,7 @@ export class FontCont {
|
|
|
375
385
|
* @returns {FontContainerFont}
|
|
376
386
|
*/
|
|
377
387
|
static getFont = (style, lang = 'eng') => {
|
|
378
|
-
let family = style.font || FontCont.defaultFontName;
|
|
388
|
+
let family = style.font || FontCont.state.defaultFontName;
|
|
379
389
|
|
|
380
390
|
const styleLookup = getStyleLookup(style);
|
|
381
391
|
|
|
@@ -413,9 +423,9 @@ export class FontCont {
|
|
|
413
423
|
family = 'Carlito';
|
|
414
424
|
} else if (/Calibri/i.test(family)) {
|
|
415
425
|
family = 'Carlito';
|
|
416
|
-
} else if (/Courier/i.test(family) && FontCont.enableCleanToNimbusMono) {
|
|
426
|
+
} else if (/Courier/i.test(family) && FontCont.state.enableCleanToNimbusMono) {
|
|
417
427
|
family = 'NimbusMono';
|
|
418
|
-
} else if (/NimbusMono/i.test(family) && FontCont.enableCleanToNimbusMono) {
|
|
428
|
+
} else if (/NimbusMono/i.test(family) && FontCont.state.enableCleanToNimbusMono) {
|
|
419
429
|
family = 'NimbusMono';
|
|
420
430
|
}
|
|
421
431
|
}
|
|
@@ -426,10 +436,10 @@ export class FontCont {
|
|
|
426
436
|
}
|
|
427
437
|
|
|
428
438
|
// This needs to come first as `defaultFontName` maps to either 'SerifDefault' or 'SansDefault'.
|
|
429
|
-
if (family === 'Default') family = FontCont.defaultFontName;
|
|
439
|
+
if (family === 'Default') family = FontCont.state.defaultFontName;
|
|
430
440
|
|
|
431
|
-
if (family === 'SerifDefault') family = FontCont.serifDefaultName;
|
|
432
|
-
if (family === 'SansDefault') family = FontCont.sansDefaultName;
|
|
441
|
+
if (family === 'SerifDefault') family = FontCont.state.serifDefaultName;
|
|
442
|
+
if (family === 'SansDefault') family = FontCont.state.sansDefaultName;
|
|
433
443
|
|
|
434
444
|
/** @type {FontContainerFont} */
|
|
435
445
|
let fontRes = FontCont.raw?.[family]?.[styleLookup];
|
|
@@ -456,16 +466,18 @@ export class FontCont {
|
|
|
456
466
|
FontCont.rawMetrics = null;
|
|
457
467
|
FontCont.optMetrics = null;
|
|
458
468
|
|
|
459
|
-
FontCont.enableCleanToNimbusMono = false;
|
|
469
|
+
FontCont.state.enableCleanToNimbusMono = false;
|
|
470
|
+
|
|
471
|
+
FontCont.state.defaultFontName = 'SerifDefault';
|
|
472
|
+
FontCont.state.serifDefaultName = 'NimbusRoman';
|
|
473
|
+
FontCont.state.sansDefaultName = 'NimbusSans';
|
|
460
474
|
|
|
461
|
-
FontCont.
|
|
462
|
-
FontCont.serifDefaultName = 'NimbusRoman';
|
|
463
|
-
FontCont.sansDefaultName = 'NimbusSans';
|
|
475
|
+
clearObjectProperties(FontCont.state.charMetrics);
|
|
464
476
|
};
|
|
465
477
|
|
|
466
478
|
static terminate = () => {
|
|
467
479
|
FontCont.clear();
|
|
468
480
|
FontCont.raw = null;
|
|
469
|
-
FontCont.glyphSet = null;
|
|
481
|
+
FontCont.state.glyphSet = null;
|
|
470
482
|
};
|
|
471
483
|
}
|
package/js/export/export.js
CHANGED
|
@@ -9,11 +9,14 @@ import { writePdf } from './writePdf.js';
|
|
|
9
9
|
import { writeHocr } from './writeHocr.js';
|
|
10
10
|
import { writeText } from './writeText.js';
|
|
11
11
|
import { writeHtml } from './writeHtml.js';
|
|
12
|
+
import { removeCircularRefsOcr } from '../objects/ocrObjects.js';
|
|
13
|
+
import { removeCircularRefsDataTables } from '../objects/layoutObjects.js';
|
|
14
|
+
import { FontCont } from '../containers/fontContainer.js';
|
|
12
15
|
|
|
13
16
|
/**
|
|
14
17
|
* Export active OCR data to specified format.
|
|
15
18
|
* @public
|
|
16
|
-
* @param {'pdf'|'hocr'|'docx'|'html'|'xlsx'|'txt'|'text'} [format='txt']
|
|
19
|
+
* @param {'pdf'|'hocr'|'docx'|'html'|'xlsx'|'txt'|'text'|'scribe'} [format='txt']
|
|
17
20
|
* @param {number} [minPage=0] - First page to export.
|
|
18
21
|
* @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page.
|
|
19
22
|
* @returns {Promise<string|ArrayBuffer>}
|
|
@@ -183,9 +186,9 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
183
186
|
w.freeDocument(pdf);
|
|
184
187
|
}
|
|
185
188
|
} else if (format === 'hocr') {
|
|
186
|
-
content = writeHocr(
|
|
189
|
+
content = writeHocr(ocrDownload, minPage, maxPage);
|
|
187
190
|
} else if (format === 'html') {
|
|
188
|
-
content = writeHtml(
|
|
191
|
+
content = writeHtml(ocrDownload, minPage, maxPage, opt.reflow, opt.removeMargins);
|
|
189
192
|
} else if (format === 'txt') {
|
|
190
193
|
content = writeText(ocrDownload, minPage, maxPage, opt.reflow, false);
|
|
191
194
|
// Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools.
|
|
@@ -199,6 +202,18 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
199
202
|
// Less common export formats are loaded dynamically to reduce initial load time.
|
|
200
203
|
const writeXlsx = (await import('./writeTabular.js')).writeXlsx;
|
|
201
204
|
content = await writeXlsx(ocrDownload, layoutDataTables.pages, minPage, maxPage);
|
|
205
|
+
} else if (format === 'scribe') {
|
|
206
|
+
const data = {
|
|
207
|
+
ocr: removeCircularRefsOcr(ocrDownload),
|
|
208
|
+
fontState: FontCont.state,
|
|
209
|
+
layoutRegions: layoutRegions.pages,
|
|
210
|
+
layoutDataTables: removeCircularRefsDataTables(layoutDataTables.pages),
|
|
211
|
+
};
|
|
212
|
+
const contentStr = JSON.stringify(data);
|
|
213
|
+
|
|
214
|
+
const pako = await import('../../lib/pako.esm.mjs');
|
|
215
|
+
const enc = new TextEncoder();
|
|
216
|
+
content = pako.gzip(enc.encode(contentStr))?.buffer;
|
|
202
217
|
}
|
|
203
218
|
|
|
204
219
|
return content;
|
|
@@ -207,14 +222,14 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
207
222
|
/**
|
|
208
223
|
* Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
|
|
209
224
|
* @public
|
|
210
|
-
* @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'|'html'} format
|
|
225
|
+
* @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'|'html'|'scribe'} format
|
|
211
226
|
* @param {string} fileName
|
|
212
227
|
* @param {number} [minPage=0] - First page to export.
|
|
213
228
|
* @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page.
|
|
214
229
|
*/
|
|
215
230
|
export async function download(format, fileName, minPage = 0, maxPage = -1) {
|
|
216
231
|
if (format === 'text') format = 'txt';
|
|
217
|
-
fileName = fileName.replace(/\.\w{1,
|
|
232
|
+
fileName = fileName.replace(/\.\w{1,6}$/, `.${format}`);
|
|
218
233
|
const content = await exportData(format, minPage, maxPage);
|
|
219
234
|
await saveAs(content, fileName);
|
|
220
235
|
}
|
package/js/export/writeHocr.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { opt } from '../containers/app.js';
|
|
2
2
|
import {
|
|
3
|
-
|
|
3
|
+
layoutDataTables, layoutRegions, pageMetricsArr,
|
|
4
4
|
} from '../containers/dataContainer.js';
|
|
5
5
|
import { FontCont } from '../containers/fontContainer.js';
|
|
6
6
|
import ocr from '../objects/ocrObjects.js';
|
|
@@ -17,10 +17,10 @@ export function writeHocr(ocrData, minValue, maxValue) {
|
|
|
17
17
|
if (maxValue === null || maxValue === undefined || maxValue < 0) maxValue = ocrData.length - 1;
|
|
18
18
|
|
|
19
19
|
const meta = {
|
|
20
|
-
'font-metrics':
|
|
21
|
-
'default-font': FontCont.defaultFontName,
|
|
22
|
-
'sans-font': FontCont.sansDefaultName,
|
|
23
|
-
'serif-font': FontCont.serifDefaultName,
|
|
20
|
+
'font-metrics': FontCont.state.charMetrics,
|
|
21
|
+
'default-font': FontCont.state.defaultFontName,
|
|
22
|
+
'sans-font': FontCont.state.sansDefaultName,
|
|
23
|
+
'serif-font': FontCont.state.serifDefaultName,
|
|
24
24
|
'enable-opt': opt.enableOpt,
|
|
25
25
|
layout: layoutRegions.pages,
|
|
26
26
|
'layout-data-table': layoutDataTables.serialize(),
|
package/js/fontContainerMain.js
CHANGED
|
@@ -15,9 +15,9 @@ import { gs } from './generalWorkerMain.js';
|
|
|
15
15
|
*/
|
|
16
16
|
export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
|
|
17
17
|
// Return early if the font set is already loaded, or a superset of the requested set is loaded.
|
|
18
|
-
if (FontCont.glyphSet === glyphSet || FontCont.glyphSet === 'all' && glyphSet === 'latin') return;
|
|
18
|
+
if (FontCont.state.glyphSet === glyphSet || FontCont.state.glyphSet === 'all' && glyphSet === 'latin') return;
|
|
19
19
|
|
|
20
|
-
FontCont.glyphSet = glyphSet;
|
|
20
|
+
FontCont.state.glyphSet = glyphSet;
|
|
21
21
|
|
|
22
22
|
// Note: this function is intentionally verbose, and should not be refactored to generate the paths dynamically.
|
|
23
23
|
// Build systems will not be able to resolve the paths if they are generated dynamically.
|
|
@@ -213,15 +213,15 @@ export async function loadChiSimFont() {
|
|
|
213
213
|
export async function enableFontOpt(enableOpt, forceOpt) {
|
|
214
214
|
let change = false;
|
|
215
215
|
if (enableOpt === true || enableOpt === false) {
|
|
216
|
-
if (FontCont.enableOpt !== enableOpt) {
|
|
216
|
+
if (FontCont.state.enableOpt !== enableOpt) {
|
|
217
217
|
change = true;
|
|
218
|
-
FontCont.enableOpt = enableOpt;
|
|
218
|
+
FontCont.state.enableOpt = enableOpt;
|
|
219
219
|
}
|
|
220
220
|
}
|
|
221
221
|
if (forceOpt === true || forceOpt === false) {
|
|
222
|
-
if (FontCont.forceOpt !== forceOpt) {
|
|
222
|
+
if (FontCont.state.forceOpt !== forceOpt) {
|
|
223
223
|
change = true;
|
|
224
|
-
FontCont.forceOpt = forceOpt;
|
|
224
|
+
FontCont.state.forceOpt = forceOpt;
|
|
225
225
|
}
|
|
226
226
|
}
|
|
227
227
|
|
|
@@ -286,11 +286,11 @@ export async function updateFontContWorkerMain(params = {}) {
|
|
|
286
286
|
const res = worker.updateFontContWorker({
|
|
287
287
|
rawMetrics: FontCont.rawMetrics,
|
|
288
288
|
optMetrics: FontCont.optMetrics,
|
|
289
|
-
sansDefaultName: FontCont.sansDefaultName,
|
|
290
|
-
serifDefaultName: FontCont.serifDefaultName,
|
|
291
|
-
defaultFontName: FontCont.defaultFontName,
|
|
292
|
-
enableOpt: FontCont.enableOpt,
|
|
293
|
-
forceOpt: FontCont.forceOpt,
|
|
289
|
+
sansDefaultName: FontCont.state.sansDefaultName,
|
|
290
|
+
serifDefaultName: FontCont.state.serifDefaultName,
|
|
291
|
+
defaultFontName: FontCont.state.defaultFontName,
|
|
292
|
+
enableOpt: FontCont.state.enableOpt,
|
|
293
|
+
forceOpt: FontCont.state.forceOpt,
|
|
294
294
|
});
|
|
295
295
|
resArr.push(res);
|
|
296
296
|
}
|
|
@@ -336,11 +336,11 @@ export async function setUploadFontsWorker(scheduler) {
|
|
|
336
336
|
const res = worker.updateFontContWorker({
|
|
337
337
|
rawMetrics: FontCont.rawMetrics,
|
|
338
338
|
optMetrics: FontCont.optMetrics,
|
|
339
|
-
sansDefaultName: FontCont.sansDefaultName,
|
|
340
|
-
serifDefaultName: FontCont.serifDefaultName,
|
|
341
|
-
defaultFontName: FontCont.defaultFontName,
|
|
342
|
-
enableOpt: FontCont.enableOpt,
|
|
343
|
-
forceOpt: FontCont.forceOpt,
|
|
339
|
+
sansDefaultName: FontCont.state.sansDefaultName,
|
|
340
|
+
serifDefaultName: FontCont.state.serifDefaultName,
|
|
341
|
+
defaultFontName: FontCont.state.defaultFontName,
|
|
342
|
+
enableOpt: FontCont.state.enableOpt,
|
|
343
|
+
forceOpt: FontCont.state.forceOpt,
|
|
344
344
|
});
|
|
345
345
|
resArr.push(res);
|
|
346
346
|
}
|
|
@@ -351,23 +351,23 @@ export async function setUploadFontsWorker(scheduler) {
|
|
|
351
351
|
* Automatically sets the default font to whatever font is most common in the provided font metrics.
|
|
352
352
|
*
|
|
353
353
|
*/
|
|
354
|
-
export function setDefaultFontAuto(
|
|
355
|
-
const multiFontMode = checkMultiFontMode(
|
|
354
|
+
export function setDefaultFontAuto(charMetricsObj) {
|
|
355
|
+
const multiFontMode = checkMultiFontMode(charMetricsObj);
|
|
356
356
|
|
|
357
357
|
// Return early if the OCR data does not contain font info.
|
|
358
358
|
if (!multiFontMode) return;
|
|
359
359
|
|
|
360
360
|
// Change default font to whatever named font appears more
|
|
361
|
-
if ((
|
|
362
|
-
FontCont.defaultFontName = 'SerifDefault';
|
|
361
|
+
if ((charMetricsObj.SerifDefault?.obs || 0) > (charMetricsObj.SansDefault?.obs || 0)) {
|
|
362
|
+
FontCont.state.defaultFontName = 'SerifDefault';
|
|
363
363
|
} else {
|
|
364
|
-
FontCont.defaultFontName = 'SansDefault';
|
|
364
|
+
FontCont.state.defaultFontName = 'SansDefault';
|
|
365
365
|
}
|
|
366
366
|
|
|
367
367
|
if (gs.schedulerInner) {
|
|
368
368
|
for (let i = 0; i < gs.schedulerInner.workers.length; i++) {
|
|
369
369
|
const worker = gs.schedulerInner.workers[i];
|
|
370
|
-
worker.updateFontContWorker({ defaultFontName: FontCont.defaultFontName });
|
|
370
|
+
worker.updateFontContWorker({ defaultFontName: FontCont.state.defaultFontName });
|
|
371
371
|
}
|
|
372
372
|
}
|
|
373
373
|
}
|
|
@@ -375,39 +375,39 @@ export function setDefaultFontAuto(fontMetricsObj) {
|
|
|
375
375
|
/**
|
|
376
376
|
*
|
|
377
377
|
* @param {FontContainerFamilyBuiltIn} fontFamily
|
|
378
|
-
* @param {Object.<string,
|
|
378
|
+
* @param {Object.<string, CharMetricsFamily>} charMetricsObj
|
|
379
379
|
*/
|
|
380
|
-
export async function optimizeFontContainerFamily(fontFamily,
|
|
380
|
+
export async function optimizeFontContainerFamily(fontFamily, charMetricsObj) {
|
|
381
381
|
// When we have metrics for individual fonts families, those are used to optimize the appropriate fonts.
|
|
382
382
|
// Otherwise, the "default" metric is applied to whatever font the user has selected as the default font.
|
|
383
|
-
const multiFontMode = checkMultiFontMode(
|
|
384
|
-
let
|
|
383
|
+
const multiFontMode = checkMultiFontMode(charMetricsObj);
|
|
384
|
+
let charMetricsType = 'Default';
|
|
385
385
|
if (multiFontMode) {
|
|
386
386
|
if (fontFamily.normal.type === 'sans') {
|
|
387
|
-
|
|
387
|
+
charMetricsType = 'SansDefault';
|
|
388
388
|
} else {
|
|
389
|
-
|
|
389
|
+
charMetricsType = 'SerifDefault';
|
|
390
390
|
}
|
|
391
391
|
}
|
|
392
392
|
|
|
393
393
|
// If there are no statistics to use for optimization, create "optimized" font by simply copying the raw font without modification.
|
|
394
394
|
// This should only occur when `multiFontMode` is true, but a document contains no sans words or no serif words.
|
|
395
|
-
if (!
|
|
395
|
+
if (!charMetricsObj[charMetricsType] || !charMetricsObj[charMetricsType][fontFamily.normal.style] || charMetricsObj[charMetricsType][fontFamily.normal.style].obs < 200) {
|
|
396
396
|
return null;
|
|
397
397
|
}
|
|
398
398
|
|
|
399
|
-
const metricsNormal =
|
|
400
|
-
const normalOptFont = gs.optimizeFont({ fontData: fontFamily.normal.src,
|
|
399
|
+
const metricsNormal = charMetricsObj[charMetricsType][fontFamily.normal.style];
|
|
400
|
+
const normalOptFont = gs.optimizeFont({ fontData: fontFamily.normal.src, charMetricsObj: metricsNormal, style: fontFamily.normal.style })
|
|
401
401
|
.then(async (x) => {
|
|
402
402
|
const font = await loadOpentype(x.fontData, x.kerningPairs);
|
|
403
403
|
return new FontContainerFont(fontFamily.normal.family, fontFamily.normal.style, x.fontData, true, font);
|
|
404
404
|
});
|
|
405
405
|
|
|
406
|
-
const metricsItalic =
|
|
406
|
+
const metricsItalic = charMetricsObj[charMetricsType][fontFamily.italic.style];
|
|
407
407
|
/** @type {?FontContainerFont|Promise<FontContainerFont>} */
|
|
408
408
|
let italicOptFont = null;
|
|
409
409
|
if (metricsItalic && metricsItalic.obs >= 200) {
|
|
410
|
-
italicOptFont = gs.optimizeFont({ fontData: fontFamily.italic.src,
|
|
410
|
+
italicOptFont = gs.optimizeFont({ fontData: fontFamily.italic.src, charMetricsObj: metricsItalic, style: fontFamily.italic.style })
|
|
411
411
|
.then(async (x) => {
|
|
412
412
|
const font = await loadOpentype(x.fontData, x.kerningPairs);
|
|
413
413
|
return new FontContainerFont(fontFamily.italic.family, fontFamily.italic.style, x.fontData, true, font);
|
|
@@ -424,16 +424,16 @@ export async function optimizeFontContainerFamily(fontFamily, fontMetricsObj) {
|
|
|
424
424
|
* Optimize all fonts.
|
|
425
425
|
* If a font cannot be optimized, then the raw font is returned.
|
|
426
426
|
* @param {Object<string, FontContainerFamilyBuiltIn>} fontPrivate
|
|
427
|
-
* @param {Object.<string,
|
|
427
|
+
* @param {Object.<string, CharMetricsFamily>} charMetricsObj
|
|
428
428
|
*/
|
|
429
|
-
export async function optimizeFontContainerAll(fontPrivate,
|
|
430
|
-
const carlitoPromise = optimizeFontContainerFamily(fontPrivate.Carlito,
|
|
431
|
-
const centuryPromise = optimizeFontContainerFamily(fontPrivate.Century,
|
|
432
|
-
const garamondPromise = optimizeFontContainerFamily(fontPrivate.Garamond,
|
|
433
|
-
const palatinoPromise = optimizeFontContainerFamily(fontPrivate.Palatino,
|
|
434
|
-
const nimbusRomanPromise = optimizeFontContainerFamily(fontPrivate.NimbusRoman,
|
|
435
|
-
const nimbusSansPromise = optimizeFontContainerFamily(fontPrivate.NimbusSans,
|
|
436
|
-
const nimbusMonoPromise = optimizeFontContainerFamily(fontPrivate.NimbusMono,
|
|
429
|
+
export async function optimizeFontContainerAll(fontPrivate, charMetricsObj) {
|
|
430
|
+
const carlitoPromise = optimizeFontContainerFamily(fontPrivate.Carlito, charMetricsObj);
|
|
431
|
+
const centuryPromise = optimizeFontContainerFamily(fontPrivate.Century, charMetricsObj);
|
|
432
|
+
const garamondPromise = optimizeFontContainerFamily(fontPrivate.Garamond, charMetricsObj);
|
|
433
|
+
const palatinoPromise = optimizeFontContainerFamily(fontPrivate.Palatino, charMetricsObj);
|
|
434
|
+
const nimbusRomanPromise = optimizeFontContainerFamily(fontPrivate.NimbusRoman, charMetricsObj);
|
|
435
|
+
const nimbusSansPromise = optimizeFontContainerFamily(fontPrivate.NimbusSans, charMetricsObj);
|
|
436
|
+
const nimbusMonoPromise = optimizeFontContainerFamily(fontPrivate.NimbusMono, charMetricsObj);
|
|
437
437
|
|
|
438
438
|
const results = await Promise.all([carlitoPromise, centuryPromise, garamondPromise, palatinoPromise, nimbusRomanPromise, nimbusSansPromise, nimbusMonoPromise]);
|
|
439
439
|
|
package/js/fontEval.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { pageMetricsArr } from './containers/dataContainer.js';
|
|
2
2
|
import { FontCont } from './containers/fontContainer.js';
|
|
3
3
|
import { ImageCache } from './containers/imageContainer.js';
|
|
4
4
|
import {
|
|
@@ -132,16 +132,16 @@ const calcBestFonts = (fontMetrics) => {
|
|
|
132
132
|
export async function runFontOptimization(ocrArr) {
|
|
133
133
|
await loadBuiltInFontsRaw();
|
|
134
134
|
|
|
135
|
-
const calculateOpt =
|
|
135
|
+
const calculateOpt = FontCont.state.charMetrics && Object.keys(FontCont.state.charMetrics).length > 0;
|
|
136
136
|
|
|
137
137
|
let enableOptSerif = false;
|
|
138
138
|
let enableOptSans = false;
|
|
139
139
|
|
|
140
140
|
let optimizeFontContainerAllPromise;
|
|
141
141
|
if (calculateOpt) {
|
|
142
|
-
setDefaultFontAuto(
|
|
142
|
+
setDefaultFontAuto(FontCont.state.charMetrics);
|
|
143
143
|
|
|
144
|
-
optimizeFontContainerAllPromise = optimizeFontContainerAll(FontCont.raw,
|
|
144
|
+
optimizeFontContainerAllPromise = optimizeFontContainerAll(FontCont.raw, FontCont.state.charMetrics)
|
|
145
145
|
.then((res) => {
|
|
146
146
|
FontCont.opt = res;
|
|
147
147
|
});
|
|
@@ -167,28 +167,28 @@ export async function runFontOptimization(ocrArr) {
|
|
|
167
167
|
// This ensures that switching on/off "font optimization" does not change the font, which would be confusing.
|
|
168
168
|
if (FontCont.optMetrics[bestMetricsOpt.minKeySans] < FontCont.rawMetrics[bestMetricsRaw.minKeySans]) {
|
|
169
169
|
enableOptSans = true;
|
|
170
|
-
FontCont.sansDefaultName = bestMetricsOpt.minKeySans;
|
|
170
|
+
FontCont.state.sansDefaultName = bestMetricsOpt.minKeySans;
|
|
171
171
|
} else {
|
|
172
|
-
FontCont.sansDefaultName = bestMetricsRaw.minKeySans;
|
|
172
|
+
FontCont.state.sansDefaultName = bestMetricsRaw.minKeySans;
|
|
173
173
|
}
|
|
174
174
|
|
|
175
175
|
// Repeat for serif fonts
|
|
176
176
|
if (FontCont.optMetrics[bestMetricsOpt.minKeySerif] < FontCont.rawMetrics[bestMetricsRaw.minKeySerif]) {
|
|
177
177
|
enableOptSerif = true;
|
|
178
|
-
FontCont.serifDefaultName = bestMetricsOpt.minKeySerif;
|
|
178
|
+
FontCont.state.serifDefaultName = bestMetricsOpt.minKeySerif;
|
|
179
179
|
} else {
|
|
180
|
-
FontCont.serifDefaultName = bestMetricsRaw.minKeySerif;
|
|
180
|
+
FontCont.state.serifDefaultName = bestMetricsRaw.minKeySerif;
|
|
181
181
|
}
|
|
182
182
|
} else {
|
|
183
|
-
FontCont.sansDefaultName = bestMetricsRaw.minKeySans;
|
|
184
|
-
FontCont.serifDefaultName = bestMetricsRaw.minKeySerif;
|
|
183
|
+
FontCont.state.sansDefaultName = bestMetricsRaw.minKeySans;
|
|
184
|
+
FontCont.state.serifDefaultName = bestMetricsRaw.minKeySerif;
|
|
185
185
|
}
|
|
186
186
|
|
|
187
|
-
FontCont.enableOpt = enableOptSerif || enableOptSans;
|
|
187
|
+
FontCont.state.enableOpt = enableOptSerif || enableOptSans;
|
|
188
188
|
|
|
189
189
|
// Send updated state to all workers.
|
|
190
190
|
await updateFontContWorkerMain();
|
|
191
191
|
}
|
|
192
192
|
|
|
193
|
-
return FontCont.enableOpt;
|
|
193
|
+
return FontCont.state.enableOpt;
|
|
194
194
|
}
|