scribe.js-ocr 0.7.3 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/scribe.js +2 -0
- package/fonts/all/Carlito-BoldItalic.woff +0 -0
- package/fonts/all/Century-BoldItalic.woff +0 -0
- package/fonts/all/Garamond-BoldItalic.woff +0 -0
- package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/all/Palatino-BoldItalic.woff +0 -0
- package/fonts/latin/Carlito-BoldItalic.woff +0 -0
- package/fonts/latin/Century-BoldItalic.woff +0 -0
- package/fonts/latin/Garamond-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/latin/Palatino-BoldItalic.woff +0 -0
- package/js/clear.js +5 -6
- package/js/containers/app.js +1 -1
- package/js/containers/dataContainer.js +0 -3
- package/js/containers/fontContainer.js +91 -77
- package/js/export/export.js +20 -5
- package/js/export/writeHocr.js +20 -18
- package/js/export/writeHtml.js +1 -1
- package/js/export/writePdf.js +52 -14
- package/js/export/writePdfFonts.js +11 -9
- package/js/export/writeTabular.js +2 -2
- package/js/export/writeText.js +10 -6
- package/js/extractTables.js +5 -5
- package/js/fontContainerMain.js +92 -49
- package/js/fontEval.js +12 -12
- package/js/fontStatistics.js +93 -92
- package/js/fontSupp.js +20 -20
- package/js/generalWorkerMain.js +4 -0
- package/js/global.d.ts +39 -4
- package/js/import/convertPageAbbyy.js +55 -26
- package/js/import/convertPageBlocks.js +2 -2
- package/js/import/convertPageHocr.js +10 -20
- package/js/import/convertPageShared.js +13 -9
- package/js/import/convertPageStext.js +67 -32
- package/js/import/import.js +89 -45
- package/js/import/importOCR.js +27 -33
- package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
- package/js/objects/layoutObjects.js +37 -0
- package/js/objects/ocrObjects.js +55 -19
- package/js/recognizeConvert.js +21 -8
- package/js/utils/fontUtils.js +11 -11
- package/js/utils/miscUtils.js +43 -6
- package/js/worker/compareOCRModule.js +20 -23
- package/js/worker/generalWorker.js +5 -5
- package/js/worker/optimizeFontModule.js +19 -19
- package/mupdf/libmupdf.js +123 -17
- package/mupdf/libmupdf.wasm +0 -0
- package/package.json +6 -3
package/cli/scribe.js
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/js/clear.js
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import { inputData } from './containers/app.js';
|
|
2
2
|
import {
|
|
3
3
|
convertPageWarn,
|
|
4
|
-
fontMetricsObj,
|
|
5
4
|
layoutDataTables,
|
|
6
5
|
layoutRegions,
|
|
7
6
|
ocrAll,
|
|
@@ -10,18 +9,18 @@ import {
|
|
|
10
9
|
} from './containers/dataContainer.js';
|
|
11
10
|
import { FontCont } from './containers/fontContainer.js';
|
|
12
11
|
import { ImageCache } from './containers/imageContainer.js';
|
|
13
|
-
import {
|
|
12
|
+
import { clearObjectProperties } from './utils/miscUtils.js';
|
|
14
13
|
|
|
15
14
|
export function clearData() {
|
|
16
15
|
inputData.clear();
|
|
17
|
-
|
|
18
|
-
|
|
16
|
+
clearObjectProperties(ocrAll);
|
|
17
|
+
ocrAll.active = [];
|
|
18
|
+
clearObjectProperties(ocrAllRaw);
|
|
19
|
+
ocrAllRaw.active = [];
|
|
19
20
|
layoutRegions.pages.length = 0;
|
|
20
21
|
layoutDataTables.pages.length = 0;
|
|
21
22
|
pageMetricsArr.length = 0;
|
|
22
23
|
convertPageWarn.length = 0;
|
|
23
24
|
ImageCache.clear();
|
|
24
|
-
// Clear optimized font data and reset fontAll to raw data.
|
|
25
|
-
replaceObjectProperties(fontMetricsObj);
|
|
26
25
|
FontCont.clear();
|
|
27
26
|
}
|
package/js/containers/app.js
CHANGED
|
@@ -1,9 +1,6 @@
|
|
|
1
1
|
// This file contains various objects that are imported by other modules.
|
|
2
2
|
// Everything here is essentially a global variable; none of them are technically "containers".
|
|
3
3
|
|
|
4
|
-
/** @type {Object.<string, FontMetricsFamily>} */
|
|
5
|
-
export const fontMetricsObj = {};
|
|
6
|
-
|
|
7
4
|
export class layoutRegions {
|
|
8
5
|
/** @type {Array<LayoutPage>} */
|
|
9
6
|
static pages = [];
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
// Node.js case
|
|
7
7
|
import opentype from '../../lib/opentype.module.js';
|
|
8
|
-
import { determineSansSerif } from '../utils/miscUtils.js';
|
|
8
|
+
import { determineSansSerif, getStyleLookup, clearObjectProperties } from '../utils/miscUtils.js';
|
|
9
9
|
import { ca } from '../canvasAdapter.js';
|
|
10
10
|
|
|
11
11
|
if (typeof process === 'object') {
|
|
@@ -21,7 +21,7 @@ if (typeof process === 'object') {
|
|
|
21
21
|
|
|
22
22
|
/**
|
|
23
23
|
* Checks whether `multiFontMode` should be enabled or disabled.
|
|
24
|
-
* @param {Object.<string,
|
|
24
|
+
* @param {Object.<string, CharMetricsFamily>} charMetricsObj
|
|
25
25
|
*
|
|
26
26
|
* Usually (including when the built-in OCR engine is used) we will have metrics for individual font families,
|
|
27
27
|
* which are used to optimize the appropriate fonts ("multiFontMode" is `true` in this case).
|
|
@@ -29,12 +29,12 @@ if (typeof process === 'object') {
|
|
|
29
29
|
* but no font identification information for most or all words.
|
|
30
30
|
* If this is encountered the "default" metric is applied to the default font ("multiFontMode" is `false` in this case).
|
|
31
31
|
*/
|
|
32
|
-
export function checkMultiFontMode(
|
|
32
|
+
export function checkMultiFontMode(charMetricsObj) {
|
|
33
33
|
let defaultFontObs = 0;
|
|
34
34
|
let namedFontObs = 0;
|
|
35
|
-
if (
|
|
36
|
-
if (
|
|
37
|
-
if (
|
|
35
|
+
if (charMetricsObj.Default?.obs) { defaultFontObs += (charMetricsObj.Default?.obs || 0); }
|
|
36
|
+
if (charMetricsObj.SerifDefault?.obs) { namedFontObs += (charMetricsObj.SerifDefault?.obs || 0); }
|
|
37
|
+
if (charMetricsObj.SansDefault?.obs) { namedFontObs += (charMetricsObj.SansDefault?.obs || 0); }
|
|
38
38
|
|
|
39
39
|
return namedFontObs > defaultFontObs;
|
|
40
40
|
}
|
|
@@ -104,26 +104,26 @@ export function loadFontFace(fontFamily, fontStyle, fontWeight, src) {
|
|
|
104
104
|
* Load font from source and return a FontContainerFont object.
|
|
105
105
|
* This function is used to load the Chinese font.
|
|
106
106
|
* @param {string} family
|
|
107
|
-
* @param {
|
|
107
|
+
* @param {StyleLookup} styleLookup
|
|
108
108
|
* @param {("sans"|"serif")} type
|
|
109
109
|
* @param {ArrayBuffer} src
|
|
110
110
|
* @param {boolean} opt
|
|
111
111
|
*
|
|
112
112
|
*/
|
|
113
|
-
export async function loadFont(family,
|
|
113
|
+
export async function loadFont(family, styleLookup, type, src, opt) {
|
|
114
114
|
const fontObj = await loadOpentype(src);
|
|
115
|
-
return new FontContainerFont(family,
|
|
115
|
+
return new FontContainerFont(family, styleLookup, src, opt, fontObj);
|
|
116
116
|
}
|
|
117
117
|
|
|
118
118
|
/**
|
|
119
119
|
*
|
|
120
120
|
* @param {string} family
|
|
121
|
-
* @param {
|
|
121
|
+
* @param {StyleLookup} styleLookup
|
|
122
122
|
* @param {ArrayBuffer} src
|
|
123
123
|
* @param {boolean} opt
|
|
124
124
|
* @param {opentype.Font} opentypeObj - Kerning pairs to re-apply
|
|
125
125
|
* @property {string} family -
|
|
126
|
-
* @property {
|
|
126
|
+
* @property {StyleLookup} style -
|
|
127
127
|
* @property {ArrayBuffer} src
|
|
128
128
|
* @property {opentype.Font} opentype -
|
|
129
129
|
* @property {string} fontFaceName -
|
|
@@ -135,7 +135,7 @@ export async function loadFont(family, style, type, src, opt) {
|
|
|
135
135
|
* First, it is not necessary. Setting the font on a canvas (the only reason loading a `FontFace` is needed) is done by referring to `fontFaceName` and `fontFaceStyle`.
|
|
136
136
|
* Second, it results in errors being thrown when used in Node.js, as `FontFace` will be undefined in this case.
|
|
137
137
|
*/
|
|
138
|
-
export function FontContainerFont(family,
|
|
138
|
+
export function FontContainerFont(family, styleLookup, src, opt, opentypeObj) {
|
|
139
139
|
// As FontFace objects are included in the document FontFaceSet object,
|
|
140
140
|
// they need to all have unique names.
|
|
141
141
|
let fontFaceName = family;
|
|
@@ -143,8 +143,8 @@ export function FontContainerFont(family, style, src, opt, opentypeObj) {
|
|
|
143
143
|
|
|
144
144
|
/** @type {string} */
|
|
145
145
|
this.family = family;
|
|
146
|
-
/** @type {
|
|
147
|
-
this.style =
|
|
146
|
+
/** @type {StyleLookup} */
|
|
147
|
+
this.style = styleLookup;
|
|
148
148
|
/** @type {boolean} */
|
|
149
149
|
this.opt = opt;
|
|
150
150
|
/** @type {ArrayBuffer} */
|
|
@@ -154,9 +154,9 @@ export function FontContainerFont(family, style, src, opt, opentypeObj) {
|
|
|
154
154
|
/** @type {string} */
|
|
155
155
|
this.fontFaceName = fontFaceName;
|
|
156
156
|
/** @type {('normal'|'italic')} */
|
|
157
|
-
this.fontFaceStyle = this.style
|
|
157
|
+
this.fontFaceStyle = ['italic', 'boldItalic'].includes(this.style) ? 'italic' : 'normal';
|
|
158
158
|
/** @type {('normal'|'bold')} */
|
|
159
|
-
this.fontFaceWeight = this.style
|
|
159
|
+
this.fontFaceWeight = ['bold', 'boldItalic'].includes(this.style) ? 'bold' : 'normal';
|
|
160
160
|
/** @type {("sans"|"serif")} */
|
|
161
161
|
this.type = determineSansSerif(this.family) === 'SansDefault' ? 'sans' : 'serif';
|
|
162
162
|
this.smallCapsMult = 0.75;
|
|
@@ -185,27 +185,27 @@ export async function loadFontContainerFamily(family, src, opt = false) {
|
|
|
185
185
|
normal: null,
|
|
186
186
|
italic: null,
|
|
187
187
|
bold: null,
|
|
188
|
+
boldItalic: null,
|
|
188
189
|
};
|
|
189
190
|
|
|
190
191
|
/**
|
|
191
192
|
*
|
|
192
|
-
* @param {
|
|
193
|
+
* @param {StyleLookup} styleLookup
|
|
193
194
|
* @returns
|
|
194
195
|
*/
|
|
195
|
-
const loadType = (
|
|
196
|
-
const srcType = (src[
|
|
196
|
+
const loadType = (styleLookup) => new Promise((resolve) => {
|
|
197
|
+
const srcType = (src[styleLookup]);
|
|
197
198
|
if (!srcType) {
|
|
198
199
|
resolve(false);
|
|
199
200
|
return;
|
|
200
201
|
}
|
|
201
|
-
// const scrNormal = typeof srcType === 'string' ? getFontAbsPath(srcType) : srcType;
|
|
202
202
|
loadOpentype(srcType).then((font) => {
|
|
203
|
-
res[
|
|
203
|
+
res[styleLookup] = new FontContainerFont(family, styleLookup, srcType, opt, font);
|
|
204
204
|
resolve(true);
|
|
205
205
|
});
|
|
206
206
|
});
|
|
207
207
|
|
|
208
|
-
Promise.allSettled([loadType('normal'), loadType('italic'), loadType('bold')]);
|
|
208
|
+
Promise.allSettled([loadType('normal'), loadType('italic'), loadType('bold'), loadType('boldItalic')]);
|
|
209
209
|
|
|
210
210
|
return res;
|
|
211
211
|
}
|
|
@@ -250,33 +250,43 @@ export class FontCont {
|
|
|
250
250
|
chi_sim: null,
|
|
251
251
|
};
|
|
252
252
|
|
|
253
|
-
/**
|
|
254
|
-
|
|
253
|
+
/**
|
|
254
|
+
* This object contains all data that is saved and restored from intermediate .scribe files.
|
|
255
|
+
* Anything outside of this object is not saved or restored.
|
|
256
|
+
* @type {FontState}
|
|
257
|
+
*/
|
|
258
|
+
static state = {
|
|
259
|
+
/** Optimized fonts will be used when believed to improve quality. */
|
|
260
|
+
enableOpt: false,
|
|
255
261
|
|
|
256
|
-
|
|
257
|
-
|
|
262
|
+
/** Optimized fonts will always be used when they exist, even if believed to reduce quality. */
|
|
263
|
+
forceOpt: false,
|
|
258
264
|
|
|
259
|
-
|
|
260
|
-
|
|
265
|
+
/**
|
|
266
|
+
* If `false`, 'Courier' will not be cleaned to Nimbus Mono.
|
|
267
|
+
* This setting is useful because Tesseract sometimes misidentifies fonts as Courier, and when not the document default, Nimbus Mono is almost always incorrect.
|
|
268
|
+
* Even with this setting `false`, Nimbus Mono will still be used when the font is exactly 'NimbusMono' and Nimbus Mono can still be the document default font.
|
|
269
|
+
*/
|
|
270
|
+
enableCleanToNimbusMono: false,
|
|
261
271
|
|
|
262
|
-
|
|
263
|
-
static optMetrics = null;
|
|
272
|
+
defaultFontName: 'SerifDefault',
|
|
264
273
|
|
|
265
|
-
|
|
274
|
+
serifDefaultName: 'NimbusRoman',
|
|
266
275
|
|
|
267
|
-
|
|
276
|
+
sansDefaultName: 'NimbusSans',
|
|
268
277
|
|
|
269
|
-
|
|
278
|
+
glyphSet: null,
|
|
270
279
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
*/
|
|
276
|
-
static enableCleanToNimbusMono = false;
|
|
280
|
+
/** @type {Object.<string, CharMetricsFamily>} */
|
|
281
|
+
charMetrics: {},
|
|
282
|
+
|
|
283
|
+
};
|
|
277
284
|
|
|
278
|
-
/** @type {?('
|
|
279
|
-
static
|
|
285
|
+
/** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
|
|
286
|
+
static rawMetrics = null;
|
|
287
|
+
|
|
288
|
+
/** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
|
|
289
|
+
static optMetrics = null;
|
|
280
290
|
|
|
281
291
|
/**
|
|
282
292
|
* Load fonts from an ArrayBuffer containing arbitrary font data.
|
|
@@ -300,11 +310,13 @@ export class FontCont {
|
|
|
300
310
|
|
|
301
311
|
const fontNameEmbedded = fontObj.names.postScriptName.en;
|
|
302
312
|
|
|
303
|
-
let
|
|
304
|
-
if (fontNameEmbedded.match(/
|
|
305
|
-
|
|
313
|
+
let styleLookup = /** @type {StyleLookup} */ ('normal');
|
|
314
|
+
if (fontNameEmbedded.match(/boldit|bdit/i)) {
|
|
315
|
+
styleLookup = 'boldItalic';
|
|
316
|
+
} else if (fontNameEmbedded.match(/italic/i)) {
|
|
317
|
+
styleLookup = 'italic';
|
|
306
318
|
} else if (fontNameEmbedded.match(/bold/i)) {
|
|
307
|
-
|
|
319
|
+
styleLookup = 'bold';
|
|
308
320
|
}
|
|
309
321
|
|
|
310
322
|
// mupdf makes changes to font names, so we need to do the same.
|
|
@@ -312,9 +324,9 @@ export class FontCont {
|
|
|
312
324
|
// Spaces are replaced with underscores.
|
|
313
325
|
const fontName = fontNameEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');
|
|
314
326
|
|
|
315
|
-
if (!FontCont.doc?.[fontName]?.[
|
|
327
|
+
if (!FontCont.doc?.[fontName]?.[styleLookup]) {
|
|
316
328
|
try {
|
|
317
|
-
const fontContainer = new FontContainerFont(fontName,
|
|
329
|
+
const fontContainer = new FontContainerFont(fontName, styleLookup, fontData, false, fontObj);
|
|
318
330
|
|
|
319
331
|
if (!FontCont.doc) {
|
|
320
332
|
FontCont.doc = {};
|
|
@@ -324,12 +336,12 @@ export class FontCont {
|
|
|
324
336
|
FontCont.doc[fontName] = {};
|
|
325
337
|
}
|
|
326
338
|
|
|
327
|
-
FontCont.doc[fontName][
|
|
339
|
+
FontCont.doc[fontName][styleLookup] = fontContainer;
|
|
328
340
|
} catch (error) {
|
|
329
|
-
console.error(`Error loading font ${fontName} ${
|
|
341
|
+
console.error(`Error loading font ${fontName} ${styleLookup}.`);
|
|
330
342
|
}
|
|
331
343
|
} else {
|
|
332
|
-
console.warn(`Font ${fontName} ${
|
|
344
|
+
console.warn(`Font ${fontName} ${styleLookup} already exists.`);
|
|
333
345
|
}
|
|
334
346
|
};
|
|
335
347
|
|
|
@@ -342,15 +354,15 @@ export class FontCont {
|
|
|
342
354
|
const raw = FontCont.raw?.[family]?.normal;
|
|
343
355
|
if (!raw) return false;
|
|
344
356
|
const opt = FontCont.opt?.[family]?.normal;
|
|
345
|
-
if (opt && FontCont.forceOpt) {
|
|
357
|
+
if (opt && FontCont.state.forceOpt) {
|
|
346
358
|
return true;
|
|
347
359
|
// If optimized fonts are enabled (but not forced), the optimized version of a font will be used if:
|
|
348
360
|
// (1) The optimized version exists
|
|
349
361
|
// (2) The optimized version has a better metric (so quality should improve).
|
|
350
362
|
// (3) The optimized version of the default sans/serif font also has a better metric.
|
|
351
363
|
// This last condition avoids font optimization being enabled in the UI when it only improves an unused font.
|
|
352
|
-
} if (opt && FontCont.enableOpt) {
|
|
353
|
-
const defaultFamily = raw.type === 'serif' ? FontCont.serifDefaultName : FontCont.sansDefaultName;
|
|
364
|
+
} if (opt && FontCont.state.enableOpt) {
|
|
365
|
+
const defaultFamily = raw.type === 'serif' ? FontCont.state.serifDefaultName : FontCont.state.sansDefaultName;
|
|
354
366
|
|
|
355
367
|
const rawMetricDefault = FontCont.rawMetrics?.[defaultFamily];
|
|
356
368
|
const optMetricDefault = FontCont.optMetrics?.[defaultFamily];
|
|
@@ -368,14 +380,17 @@ export class FontCont {
|
|
|
368
380
|
* Gets a font object. Unlike accessing the font containers directly,
|
|
369
381
|
* this method allows for special values 'Default', 'SansDefault', and 'SerifDefault' to be used.
|
|
370
382
|
*
|
|
371
|
-
* @param {
|
|
372
|
-
* @param {('normal'|'italic'|'bold'|string)} [style='normal']
|
|
383
|
+
* @param {Partial<Style>} style
|
|
373
384
|
* @param {string} [lang='eng']
|
|
374
385
|
* @returns {FontContainerFont}
|
|
375
386
|
*/
|
|
376
|
-
static getFont = (
|
|
377
|
-
|
|
378
|
-
|
|
387
|
+
static getFont = (style, lang = 'eng') => {
|
|
388
|
+
let family = style.font || FontCont.state.defaultFontName;
|
|
389
|
+
|
|
390
|
+
const styleLookup = getStyleLookup(style);
|
|
391
|
+
|
|
392
|
+
if (FontCont.doc?.[family]?.[styleLookup] && !FontCont.doc?.[family]?.[styleLookup]?.disable) {
|
|
393
|
+
return FontCont.doc[family][styleLookup];
|
|
379
394
|
}
|
|
380
395
|
|
|
381
396
|
if (lang === 'chi_sim') {
|
|
@@ -387,7 +402,7 @@ export class FontCont {
|
|
|
387
402
|
|
|
388
403
|
// Option 1: If we have access to the font, use it.
|
|
389
404
|
// Option 2: If we do not have access to the font, but it closely resembles a built-in font, use the built-in font.
|
|
390
|
-
if (!FontCont.raw?.[family]?.[
|
|
405
|
+
if (!FontCont.raw?.[family]?.[styleLookup]) {
|
|
391
406
|
if (/NimbusRom/i.test(family)) {
|
|
392
407
|
family = 'NimbusRoman';
|
|
393
408
|
} else if (/Times/i.test(family)) {
|
|
@@ -408,29 +423,29 @@ export class FontCont {
|
|
|
408
423
|
family = 'Carlito';
|
|
409
424
|
} else if (/Calibri/i.test(family)) {
|
|
410
425
|
family = 'Carlito';
|
|
411
|
-
} else if (/Courier/i.test(family) && FontCont.enableCleanToNimbusMono) {
|
|
426
|
+
} else if (/Courier/i.test(family) && FontCont.state.enableCleanToNimbusMono) {
|
|
412
427
|
family = 'NimbusMono';
|
|
413
|
-
} else if (/NimbusMono/i.test(family) && FontCont.enableCleanToNimbusMono) {
|
|
428
|
+
} else if (/NimbusMono/i.test(family) && FontCont.state.enableCleanToNimbusMono) {
|
|
414
429
|
family = 'NimbusMono';
|
|
415
430
|
}
|
|
416
431
|
}
|
|
417
432
|
|
|
418
433
|
// Option 3: If the font still is not identified, use the default sans/serif font.
|
|
419
|
-
if (!FontCont.raw?.[family]?.[
|
|
434
|
+
if (!FontCont.raw?.[family]?.[styleLookup]) {
|
|
420
435
|
family = determineSansSerif(family);
|
|
421
436
|
}
|
|
422
437
|
|
|
423
438
|
// This needs to come first as `defaultFontName` maps to either 'SerifDefault' or 'SansDefault'.
|
|
424
|
-
if (family === 'Default') family = FontCont.defaultFontName;
|
|
439
|
+
if (family === 'Default') family = FontCont.state.defaultFontName;
|
|
425
440
|
|
|
426
|
-
if (family === 'SerifDefault') family = FontCont.serifDefaultName;
|
|
427
|
-
if (family === 'SansDefault') family = FontCont.sansDefaultName;
|
|
441
|
+
if (family === 'SerifDefault') family = FontCont.state.serifDefaultName;
|
|
442
|
+
if (family === 'SansDefault') family = FontCont.state.sansDefaultName;
|
|
428
443
|
|
|
429
444
|
/** @type {FontContainerFont} */
|
|
430
|
-
let fontRes = FontCont.raw?.[family]?.[
|
|
431
|
-
if (!fontRes) throw new Error(`Font container does not contain ${family} (${
|
|
445
|
+
let fontRes = FontCont.raw?.[family]?.[styleLookup];
|
|
446
|
+
if (!fontRes) throw new Error(`Font container does not contain ${family} (${styleLookup}).`);
|
|
432
447
|
|
|
433
|
-
const opt = FontCont.opt?.[family]?.[
|
|
448
|
+
const opt = FontCont.opt?.[family]?.[styleLookup];
|
|
434
449
|
const useOpt = FontCont.useOptFamily(family);
|
|
435
450
|
if (opt && useOpt) fontRes = opt;
|
|
436
451
|
|
|
@@ -441,10 +456,7 @@ export class FontCont {
|
|
|
441
456
|
*
|
|
442
457
|
* @param {OcrWord} word
|
|
443
458
|
*/
|
|
444
|
-
static getWordFont = (word) =>
|
|
445
|
-
const wordFontFamily = word.font || FontCont.defaultFontName;
|
|
446
|
-
return FontCont.getFont(wordFontFamily, word.style, word.lang);
|
|
447
|
-
};
|
|
459
|
+
static getWordFont = (word) => FontCont.getFont(word.style, word.lang);
|
|
448
460
|
|
|
449
461
|
/**
|
|
450
462
|
* Reset font container to original state but do not unload default resources.
|
|
@@ -454,16 +466,18 @@ export class FontCont {
|
|
|
454
466
|
FontCont.rawMetrics = null;
|
|
455
467
|
FontCont.optMetrics = null;
|
|
456
468
|
|
|
457
|
-
FontCont.enableCleanToNimbusMono = false;
|
|
469
|
+
FontCont.state.enableCleanToNimbusMono = false;
|
|
470
|
+
|
|
471
|
+
FontCont.state.defaultFontName = 'SerifDefault';
|
|
472
|
+
FontCont.state.serifDefaultName = 'NimbusRoman';
|
|
473
|
+
FontCont.state.sansDefaultName = 'NimbusSans';
|
|
458
474
|
|
|
459
|
-
FontCont.
|
|
460
|
-
FontCont.serifDefaultName = 'NimbusRoman';
|
|
461
|
-
FontCont.sansDefaultName = 'NimbusSans';
|
|
475
|
+
clearObjectProperties(FontCont.state.charMetrics);
|
|
462
476
|
};
|
|
463
477
|
|
|
464
478
|
static terminate = () => {
|
|
465
479
|
FontCont.clear();
|
|
466
480
|
FontCont.raw = null;
|
|
467
|
-
FontCont.glyphSet = null;
|
|
481
|
+
FontCont.state.glyphSet = null;
|
|
468
482
|
};
|
|
469
483
|
}
|
package/js/export/export.js
CHANGED
|
@@ -9,11 +9,14 @@ import { writePdf } from './writePdf.js';
|
|
|
9
9
|
import { writeHocr } from './writeHocr.js';
|
|
10
10
|
import { writeText } from './writeText.js';
|
|
11
11
|
import { writeHtml } from './writeHtml.js';
|
|
12
|
+
import { removeCircularRefsOcr } from '../objects/ocrObjects.js';
|
|
13
|
+
import { removeCircularRefsDataTables } from '../objects/layoutObjects.js';
|
|
14
|
+
import { FontCont } from '../containers/fontContainer.js';
|
|
12
15
|
|
|
13
16
|
/**
|
|
14
17
|
* Export active OCR data to specified format.
|
|
15
18
|
* @public
|
|
16
|
-
* @param {'pdf'|'hocr'|'docx'|'html'|'xlsx'|'txt'|'text'} [format='txt']
|
|
19
|
+
* @param {'pdf'|'hocr'|'docx'|'html'|'xlsx'|'txt'|'text'|'scribe'} [format='txt']
|
|
17
20
|
* @param {number} [minPage=0] - First page to export.
|
|
18
21
|
* @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page.
|
|
19
22
|
* @returns {Promise<string|ArrayBuffer>}
|
|
@@ -183,9 +186,9 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
183
186
|
w.freeDocument(pdf);
|
|
184
187
|
}
|
|
185
188
|
} else if (format === 'hocr') {
|
|
186
|
-
content = writeHocr(
|
|
189
|
+
content = writeHocr(ocrDownload, minPage, maxPage);
|
|
187
190
|
} else if (format === 'html') {
|
|
188
|
-
content = writeHtml(
|
|
191
|
+
content = writeHtml(ocrDownload, minPage, maxPage, opt.reflow, opt.removeMargins);
|
|
189
192
|
} else if (format === 'txt') {
|
|
190
193
|
content = writeText(ocrDownload, minPage, maxPage, opt.reflow, false);
|
|
191
194
|
// Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools.
|
|
@@ -199,6 +202,18 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
199
202
|
// Less common export formats are loaded dynamically to reduce initial load time.
|
|
200
203
|
const writeXlsx = (await import('./writeTabular.js')).writeXlsx;
|
|
201
204
|
content = await writeXlsx(ocrDownload, layoutDataTables.pages, minPage, maxPage);
|
|
205
|
+
} else if (format === 'scribe') {
|
|
206
|
+
const data = {
|
|
207
|
+
ocr: removeCircularRefsOcr(ocrDownload),
|
|
208
|
+
fontState: FontCont.state,
|
|
209
|
+
layoutRegions: layoutRegions.pages,
|
|
210
|
+
layoutDataTables: removeCircularRefsDataTables(layoutDataTables.pages),
|
|
211
|
+
};
|
|
212
|
+
const contentStr = JSON.stringify(data);
|
|
213
|
+
|
|
214
|
+
const pako = await import('../../lib/pako.esm.mjs');
|
|
215
|
+
const enc = new TextEncoder();
|
|
216
|
+
content = pako.gzip(enc.encode(contentStr))?.buffer;
|
|
202
217
|
}
|
|
203
218
|
|
|
204
219
|
return content;
|
|
@@ -207,14 +222,14 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
207
222
|
/**
|
|
208
223
|
* Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
|
|
209
224
|
* @public
|
|
210
|
-
* @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'|'html'} format
|
|
225
|
+
* @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'|'html'|'scribe'} format
|
|
211
226
|
* @param {string} fileName
|
|
212
227
|
* @param {number} [minPage=0] - First page to export.
|
|
213
228
|
* @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page.
|
|
214
229
|
*/
|
|
215
230
|
export async function download(format, fileName, minPage = 0, maxPage = -1) {
|
|
216
231
|
if (format === 'text') format = 'txt';
|
|
217
|
-
fileName = fileName.replace(/\.\w{1,
|
|
232
|
+
fileName = fileName.replace(/\.\w{1,6}$/, `.${format}`);
|
|
218
233
|
const content = await exportData(format, minPage, maxPage);
|
|
219
234
|
await saveAs(content, fileName);
|
|
220
235
|
}
|
package/js/export/writeHocr.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { opt } from '../containers/app.js';
|
|
2
2
|
import {
|
|
3
|
-
|
|
3
|
+
layoutDataTables, layoutRegions, pageMetricsArr,
|
|
4
4
|
} from '../containers/dataContainer.js';
|
|
5
5
|
import { FontCont } from '../containers/fontContainer.js';
|
|
6
6
|
import ocr from '../objects/ocrObjects.js';
|
|
@@ -17,10 +17,10 @@ export function writeHocr(ocrData, minValue, maxValue) {
|
|
|
17
17
|
if (maxValue === null || maxValue === undefined || maxValue < 0) maxValue = ocrData.length - 1;
|
|
18
18
|
|
|
19
19
|
const meta = {
|
|
20
|
-
'font-metrics':
|
|
21
|
-
'default-font': FontCont.defaultFontName,
|
|
22
|
-
'sans-font': FontCont.sansDefaultName,
|
|
23
|
-
'serif-font': FontCont.serifDefaultName,
|
|
20
|
+
'font-metrics': FontCont.state.charMetrics,
|
|
21
|
+
'default-font': FontCont.state.defaultFontName,
|
|
22
|
+
'sans-font': FontCont.state.sansDefaultName,
|
|
23
|
+
'serif-font': FontCont.state.serifDefaultName,
|
|
24
24
|
'enable-opt': opt.enableOpt,
|
|
25
25
|
layout: layoutRegions.pages,
|
|
26
26
|
'layout-data-table': layoutDataTables.serialize(),
|
|
@@ -75,38 +75,40 @@ export function writeHocr(ocrData, minValue, maxValue) {
|
|
|
75
75
|
hocrOut += `bbox ${Math.round(wordObj.bbox.left)} ${Math.round(wordObj.bbox.top)} ${Math.round(wordObj.bbox.right)} ${Math.round(wordObj.bbox.bottom)}`;
|
|
76
76
|
hocrOut += `;x_wconf ${wordObj.conf}`;
|
|
77
77
|
|
|
78
|
-
if (wordObj.font && wordObj.font !== 'Default') {
|
|
79
|
-
hocrOut += `;x_font ${wordObj.font}`;
|
|
78
|
+
if (wordObj.style.font && wordObj.style.font !== 'Default') {
|
|
79
|
+
hocrOut += `;x_font ${wordObj.style.font}`;
|
|
80
80
|
}
|
|
81
81
|
|
|
82
|
-
if (wordObj.size) {
|
|
83
|
-
hocrOut += `;x_fsize ${wordObj.size}`;
|
|
82
|
+
if (wordObj.style.size) {
|
|
83
|
+
hocrOut += `;x_fsize ${wordObj.style.size}`;
|
|
84
84
|
}
|
|
85
85
|
|
|
86
86
|
hocrOut += "'";
|
|
87
87
|
|
|
88
88
|
// Tesseract HOCR specifies default language for a paragraph in the "ocr_par" element,
|
|
89
89
|
// however, as ScribeOCR does not currently have a paragraph object, every word must have its language specified.
|
|
90
|
-
hocrOut += ` lang='${wordObj.lang}'`;
|
|
90
|
+
if (wordObj.lang) hocrOut += ` lang='${wordObj.lang}'`;
|
|
91
91
|
|
|
92
92
|
// TODO: Why are we representing font family and style using the `style` HTML element here?
|
|
93
93
|
// This is not how Tesseract does things, and our own parsing script does not appear to be written to re-import it properly.
|
|
94
94
|
// Add "style" attribute (if applicable)
|
|
95
|
-
if (
|
|
95
|
+
if (wordObj.style.bold || wordObj.style.italic || wordObj.style.smallCaps || (wordObj.style.font && wordObj.style.font !== 'Default')) {
|
|
96
96
|
hocrOut += ' style=\'';
|
|
97
97
|
|
|
98
|
-
if (wordObj.style
|
|
98
|
+
if (wordObj.style.italic) {
|
|
99
99
|
hocrOut += 'font-style:italic;';
|
|
100
|
-
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
if (wordObj.style.bold) {
|
|
101
103
|
hocrOut += 'font-weight:bold;';
|
|
102
104
|
}
|
|
103
105
|
|
|
104
|
-
if (wordObj.smallCaps) {
|
|
106
|
+
if (wordObj.style.smallCaps) {
|
|
105
107
|
hocrOut += 'font-variant:small-caps;';
|
|
106
108
|
}
|
|
107
109
|
|
|
108
|
-
if (wordObj.font && wordObj.font !== 'Default') {
|
|
109
|
-
hocrOut += `font-family:${wordObj.font}`;
|
|
110
|
+
if (wordObj.style.font && wordObj.style.font !== 'Default') {
|
|
111
|
+
hocrOut += `font-family:${wordObj.style.font}`;
|
|
110
112
|
}
|
|
111
113
|
|
|
112
114
|
hocrOut += '\'>';
|
|
@@ -115,9 +117,9 @@ export function writeHocr(ocrData, minValue, maxValue) {
|
|
|
115
117
|
}
|
|
116
118
|
|
|
117
119
|
// Add word text, along with any formatting that uses nested elements rather than attributes
|
|
118
|
-
if (wordObj.sup) {
|
|
120
|
+
if (wordObj.style.sup) {
|
|
119
121
|
hocrOut += `<sup>${ocr.escapeXml(wordObj.text)}</sup>`;
|
|
120
|
-
} else if (wordObj.dropcap) {
|
|
122
|
+
} else if (wordObj.style.dropcap) {
|
|
121
123
|
hocrOut += `<span class='ocr_dropcap'>${ocr.escapeXml(wordObj.text)}</span>`;
|
|
122
124
|
} else {
|
|
123
125
|
hocrOut += ocr.escapeXml(wordObj.text);
|
package/js/export/writeHtml.js
CHANGED
|
@@ -175,7 +175,7 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
175
175
|
// Therefore, we handle small caps by making all text print as uppercase using the `text-transform` CSS property,
|
|
176
176
|
// and then wrapping each letter in a span with a smaller font size.
|
|
177
177
|
let innerHTML;
|
|
178
|
-
if (wordObj.smallCaps) {
|
|
178
|
+
if (wordObj.style.smallCaps) {
|
|
179
179
|
styleStr += 'text-transform:uppercase;';
|
|
180
180
|
innerHTML = makeSmallCapsDivs(wordStr, fontSizeHTMLSmallCaps);
|
|
181
181
|
} else {
|