scribe.js-ocr 0.7.4 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/scribe.js +2 -0
- package/js/clear.js +5 -6
- package/js/containers/dataContainer.js +0 -3
- package/js/containers/fontContainer.js +51 -39
- package/js/export/export.js +20 -5
- package/js/export/writeHocr.js +5 -5
- package/js/fontContainerMain.js +42 -42
- package/js/fontEval.js +12 -12
- package/js/fontStatistics.js +86 -90
- package/js/generalWorkerMain.js +4 -0
- package/js/global.d.ts +22 -4
- package/js/import/convertPageAbbyy.js +8 -1
- package/js/import/convertPageStext.js +1 -1
- package/js/import/import.js +89 -45
- package/js/import/importOCR.js +27 -33
- package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
- package/js/objects/layoutObjects.js +37 -0
- package/js/objects/ocrObjects.js +42 -0
- package/js/recognizeConvert.js +21 -8
- package/js/utils/miscUtils.js +27 -6
- package/js/worker/compareOCRModule.js +7 -7
- package/js/worker/generalWorker.js +5 -5
- package/js/worker/optimizeFontModule.js +16 -16
- package/package.json +6 -3
package/js/fontStatistics.js
CHANGED
|
@@ -5,15 +5,10 @@ import {
|
|
|
5
5
|
determineSansSerif,
|
|
6
6
|
getStyleLookup,
|
|
7
7
|
quantile,
|
|
8
|
-
replaceObjectProperties,
|
|
9
8
|
round6,
|
|
10
9
|
} from './utils/miscUtils.js';
|
|
11
10
|
|
|
12
|
-
import {
|
|
13
|
-
|
|
14
|
-
import { fontMetricsObj } from './containers/dataContainer.js';
|
|
15
|
-
|
|
16
|
-
// import { glyphAlts } from "../fonts/glyphs.js";
|
|
11
|
+
import { CharMetricsFamily, CharMetricsFont, CharMetricsRawFamily } from './objects/charMetricsObjects.js';
|
|
17
12
|
|
|
18
13
|
/**
|
|
19
14
|
* Combine page-level character statistics to calculate overall font metrics.
|
|
@@ -21,102 +16,102 @@ import { fontMetricsObj } from './containers/dataContainer.js';
|
|
|
21
16
|
*
|
|
22
17
|
* @param {Array<OcrPage>} pageArr
|
|
23
18
|
*/
|
|
24
|
-
export function
|
|
25
|
-
const
|
|
19
|
+
export function calcCharMetricsFromPages(pageArr) {
  // Collect the raw, per-observation character measurements of every page.
  const pageCharMetricsArr = pageArr.map((x) => calcCharMetricsPage(x));

  /** @type {Object.<string, CharMetricsFamily>} */
  const charMetricsOut = {};

  // `reduce` without an initial value throws a TypeError on an empty array,
  // so a document with no pages yields an empty result instead of crashing.
  if (pageCharMetricsArr.length === 0) return charMetricsOut;

  // Fold all pages into the first page's object, which `unionCharMetricsRawObj` modifies in place.
  const charMetricsRawObj = pageCharMetricsArr.reduce((x, y) => unionCharMetricsRawObj(x, y));

  for (const [family, obj] of Object.entries(charMetricsRawObj)) {
    charMetricsOut[family] = new CharMetricsFamily();
    for (const [style, obj2] of Object.entries(obj)) {
      charMetricsOut[family][style] = calculateCharMetrics(obj2);
      // The family-level observation count is the sum of the counts of its styles.
      charMetricsOut[family].obs += charMetricsOut[family][style].obs;
    }
  }

  return charMetricsOut;
}
|
|
42
37
|
|
|
43
|
-
// The following functions are used for combining an array of page-level
|
|
38
|
+
// The following functions are used for combining an array of page-level charMetrics objects produced by convertPage.js into a single document-level object.
|
|
44
39
|
|
|
45
40
|
/**
 * Adds observations from `charMetricsRawFontB` into `charMetricsRawFontA`.
 * Modifies `charMetricsRawFontA` in place whenever it is provided.
 *
 * @param {?CharMetricsRawFont} charMetricsRawFontA - Destination object. May be null/undefined.
 * @param {?CharMetricsRawFont} charMetricsRawFontB - Source object. Never modified. May be null/undefined.
 * @param {?number} xHeight - If specified, values from `charMetricsRawFontB` will be normalized by dividing by `xHeight`.
 *    Normalized values that are falsy (0 or NaN) are dropped.
 * @returns {?CharMetricsRawFont} - `charMetricsRawFontA` after modifying in place.
 *    When `charMetricsRawFontA` is missing, a deep copy of `charMetricsRawFontB` is returned instead
 *    (callers must use the return value in that case), or `null` when both are missing.
 */
function unionCharMetricsFont(charMetricsRawFontA, charMetricsRawFontB, xHeight = null) {
  // If one of the inputs is undefined, return early with the only valid object
  if (!charMetricsRawFontA) {
    if (!charMetricsRawFontB) return null;
    // Deep copy so that later edits to the result never leak into the source object.
    return structuredClone(charMetricsRawFontB);
  }
  if (!charMetricsRawFontB) return charMetricsRawFontA;

  if (charMetricsRawFontB.obs) charMetricsRawFontA.obs += charMetricsRawFontB.obs;

  for (const [prop, obj] of Object.entries(charMetricsRawFontB)) {
    // Only the per-character maps hold arrays to merge; scalars such as `obs` are handled above.
    if (obj === null || typeof obj !== 'object') continue;

    // Create the container on the destination rather than throwing when it is absent.
    if (!charMetricsRawFontA[prop]) charMetricsRawFontA[prop] = {};
    const target = charMetricsRawFontA[prop];

    for (const [key, value] of Object.entries(obj)) {
      if (!target[key]) target[key] = [];

      const incoming = xHeight ? value.map((x) => x / xHeight).filter((x) => x) : value;

      // Push element by element: `push.apply` passes every element as a separate call argument,
      // which can exceed the engine's argument limit (RangeError) for very large arrays.
      for (const observation of incoming) target[key].push(observation);
    }
  }

  return charMetricsRawFontA;
}
|
|
81
76
|
|
|
82
77
|
/**
 * Adds observations from `charMetricsRawObjB` into `charMetricsRawObjA`. Modifies `charMetricsRawObjA` in place.
 *
 * A family that exists only in `charMetricsRawObjB` is created in `charMetricsRawObjA` only when at least
 * one of its styles contains width observations; families without any observations are not copied.
 *
 * @param {Object.<string, CharMetricsRawFamily>} charMetricsRawObjA
 * @param {Object.<string, CharMetricsRawFamily>} charMetricsRawObjB
 * @returns {Object.<string, CharMetricsRawFamily>} - Returns charMetricsRawObjA after modifying in place
 */
function unionCharMetricsRawObj(charMetricsRawObjA, charMetricsRawObjB) {
  for (const [family, familyB] of Object.entries(charMetricsRawObjB)) {
    if (!charMetricsRawObjA[family]) {
      const hasObservations = Object.values(familyB)
        .some((fontB) => Object.keys(fontB?.width ?? {}).length > 0);
      if (!hasObservations) continue;
      charMetricsRawObjA[family] = new CharMetricsRawFamily();
    }

    const familyA = charMetricsRawObjA[family];
    for (const [style, fontB] of Object.entries(familyB)) {
      // `unionCharMetricsFont` cannot modify a missing destination in place; it returns a copy of the
      // source instead. Store that result so the observations are not silently discarded.
      const merged = unionCharMetricsFont(familyA[style], fontB);
      if (merged && merged !== familyA[style]) familyA[style] = merged;
    }
  }

  return charMetricsRawObjA;
}
|
|
107
102
|
|
|
108
103
|
/**
|
|
109
104
|
* Calculates final font statistics from individual observations.
|
|
110
105
|
*
|
|
111
|
-
* @param {
|
|
112
|
-
* @returns {
|
|
106
|
+
* @param {CharMetricsRawFont} charMetricsRawFontObj
|
|
107
|
+
* @returns {CharMetricsFont} -
|
|
113
108
|
*/
|
|
114
|
-
function
|
|
115
|
-
const fontMetricOut = new
|
|
109
|
+
function calculateCharMetrics(charMetricsRawFontObj) {
|
|
110
|
+
const fontMetricOut = new CharMetricsFont();
|
|
116
111
|
|
|
117
112
|
// Take the median of each array
|
|
118
113
|
for (const prop of ['width', 'height', 'kerning', 'kerning2']) {
|
|
119
|
-
for (const [key, value] of Object.entries(
|
|
114
|
+
for (const [key, value] of Object.entries(charMetricsRawFontObj[prop])) {
|
|
120
115
|
if (value.length > 0) {
|
|
121
116
|
fontMetricOut[prop][key] = round6(quantile(value, 0.5));
|
|
122
117
|
}
|
|
@@ -125,7 +120,7 @@ function calculateFontMetrics(fontMetricsRawFontObj) {
|
|
|
125
120
|
|
|
126
121
|
// Calculate median height of capital letters only
|
|
127
122
|
const heightCapsArr = [];
|
|
128
|
-
for (const [key, value] of Object.entries(
|
|
123
|
+
for (const [key, value] of Object.entries(charMetricsRawFontObj.height)) {
|
|
129
124
|
if (/[A-Z]/.test(String.fromCharCode(parseInt(key)))) {
|
|
130
125
|
Array.prototype.push.apply(heightCapsArr, value);
|
|
131
126
|
}
|
|
@@ -134,12 +129,12 @@ function calculateFontMetrics(fontMetricsRawFontObj) {
|
|
|
134
129
|
fontMetricOut.heightCaps = round6(quantile(heightCapsArr, 0.5));
|
|
135
130
|
fontMetricOut.obsCaps = heightCapsArr.length;
|
|
136
131
|
|
|
137
|
-
fontMetricOut.obs =
|
|
132
|
+
fontMetricOut.obs = charMetricsRawFontObj.obs;
|
|
138
133
|
|
|
139
134
|
// Standardize all metrics so that they are normalized by x-height
|
|
140
135
|
// The raw metrics may be normalized by ascHeight (for numbers) or x-height (for all other characters).
|
|
141
136
|
for (const prop of ['width', 'height', 'kerning', 'kerning2']) {
|
|
142
|
-
for (const [key, value] of Object.entries(
|
|
137
|
+
for (const [key, value] of Object.entries(charMetricsRawFontObj[prop])) {
|
|
143
138
|
const nameFirst = key.match(/\w+/)[0];
|
|
144
139
|
const charFirst = String.fromCharCode(parseInt(nameFirst));
|
|
145
140
|
if (/\d/.test(charFirst)) {
|
|
@@ -151,7 +146,7 @@ function calculateFontMetrics(fontMetricsRawFontObj) {
|
|
|
151
146
|
// The `kerning2` observations contain the measurement between the end of char 1 and the end of char 2.
|
|
152
147
|
// Therefore, the width of char 2 must be subtracted to get a measurement comparable with `kerning`.
|
|
153
148
|
for (const prop of ['kerning2']) {
|
|
154
|
-
for (const [key, value] of Object.entries(
|
|
149
|
+
for (const [key, value] of Object.entries(charMetricsRawFontObj[prop])) {
|
|
155
150
|
if (value.length > 0) {
|
|
156
151
|
const nameSecond = key.match(/\w+$/)[0];
|
|
157
152
|
|
|
@@ -205,42 +200,43 @@ const roundedVWRegex = new RegExp(roundedVWArr.reduce((x, y) => `${x}|${y}`), 'i
|
|
|
205
200
|
const serifStemSerifPQArr = ['Bookman', 'Century_Schoolbook', 'Courier', 'Georgia', 'Times'];
|
|
206
201
|
const serifStemSerifPQRegex = new RegExp(serifStemSerifPQArr.reduce((x, y) => `${x}|${y}`), 'i');
|
|
207
202
|
|
|
203
|
+
// This function is currently unused. Keeping as we may restore this feature in the future.
|
|
208
204
|
// While the majority of glyphs can be approximated by applying geometric transformations to a single sans and serif font,
|
|
209
205
|
// there are some exceptions (e.g. the lowercase "g" has 2 distinct variations).
|
|
210
206
|
// This function identifies variations that require switching out a glyph from the default font entirely.
|
|
211
|
-
|
|
212
|
-
if (
|
|
207
|
+
/**
 * Sets the glyph-variant flags for the default sans and serif fonts based on per-character font scores.
 * Modifies `charMetrics` in place.
 *
 * @param {Object} fontScores - Scores looked up as `fontScores[family][style][char]` and passed to `calcTopFont`.
 *    NOTE(review): the exact shape of each score entry is not visible here; confirm against `calcTopFont`.
 * @param {Object} charMetrics - Presumably the family-keyed char metrics object (`Object.<string, CharMetricsFamily>`);
 *    each style entry that is present must carry a `variants` object.
 * @returns {Object} - Returns `charMetrics` after modifying in place
 */
function identifyFontVariants(fontScores, charMetrics) {
  const sansNormal = charMetrics?.SansDefault?.normal;
  if (sansNormal) {
    const sansScores = fontScores?.SansDefault?.normal;

    const sansG = calcTopFont(sansScores?.g);
    sansNormal.variants.sans_g = singleGRegex.test(sansG);

    const sans1 = calcTopFont(sansScores?.['1']);
    sansNormal.variants.sans_1 = base1Regex.test(sans1);
  }

  const serifItalic = charMetrics?.SerifDefault?.italic;
  if (serifItalic) {
    const serifScores = fontScores?.SerifDefault?.italic;

    const minY = calcTopFont(serifScores?.y);
    serifItalic.variants.serif_italic_y = minYRegex.test(minY);

    // The "k" variant must be judged from the scores for "k" (previously the "y" scores were reused here).
    const closedK = calcTopFont(serifScores?.k);
    serifItalic.variants.serif_open_k = !closedKRegex.test(closedK);

    const roundedV = calcTopFont(serifScores?.v);
    const roundedW = calcTopFont(serifScores?.w);
    serifItalic.variants.serif_pointy_vw = !(roundedVWRegex.test(roundedV) || roundedVWRegex.test(roundedW));

    const serifItalicP = calcTopFont(serifScores?.p);
    const serifItalicQ = calcTopFont(serifScores?.q);
    serifItalic.variants.serif_stem_sans_pq = !(serifStemSerifPQRegex.test(serifItalicP) || serifStemSerifPQRegex.test(serifItalicQ));
  }

  return charMetrics;
}
|
|
236
232
|
|
|
237
233
|
/**
|
|
238
234
|
*
|
|
239
235
|
* @param {OcrPage} pageObj
|
|
240
236
|
*/
|
|
241
|
-
function
|
|
242
|
-
/** @type {Object.<string,
|
|
243
|
-
const
|
|
237
|
+
function calcCharMetricsPage(pageObj) {
|
|
238
|
+
/** @type {Object.<string, CharMetricsRawFamily>} */
|
|
239
|
+
const charMetricsRawPage = {};
|
|
244
240
|
|
|
245
241
|
for (const lineObj of pageObj.lines) {
|
|
246
242
|
for (const wordObj of lineObj.words) {
|
|
@@ -251,8 +247,8 @@ function calcFontMetricsPage(pageObj) {
|
|
|
251
247
|
|
|
252
248
|
// Do not include superscripts, dropcaps, and low-confidence words in statistics for font optimization.
|
|
253
249
|
if (wordObj.conf < 80 || wordObj.lang === 'chi_sim' || wordObj.style.sup || wordObj.style.smallCaps) continue;
|
|
254
|
-
/** @type {Object.<string,
|
|
255
|
-
const
|
|
250
|
+
/** @type {Object.<string, CharMetricsRawFamily>} */
|
|
251
|
+
const charMetricsRawLine = {};
|
|
256
252
|
|
|
257
253
|
if (wordObj.chars) {
|
|
258
254
|
for (let k = 0; k < wordObj.chars.length; k++) {
|
|
@@ -272,22 +268,22 @@ function calcFontMetricsPage(pageObj) {
|
|
|
272
268
|
// May cause future issues as this code assumes one character per <ocrx_cinfo> tag.
|
|
273
269
|
const charUnicode = String(charObj.text.charCodeAt(0));
|
|
274
270
|
|
|
275
|
-
if (!
|
|
276
|
-
|
|
271
|
+
if (!charMetricsRawLine[wordFontFamily]) {
|
|
272
|
+
charMetricsRawLine[wordFontFamily] = new CharMetricsRawFamily();
|
|
277
273
|
}
|
|
278
274
|
|
|
279
275
|
const styleLookup = getStyleLookup(wordObj.style);
|
|
280
276
|
|
|
281
277
|
if (!['normal', 'italic', 'bold'].includes(styleLookup)) continue;
|
|
282
278
|
|
|
283
|
-
if (!
|
|
284
|
-
|
|
285
|
-
|
|
279
|
+
if (!charMetricsRawLine[wordFontFamily][styleLookup].width[charUnicode]) {
|
|
280
|
+
charMetricsRawLine[wordFontFamily][styleLookup].width[charUnicode] = [];
|
|
281
|
+
charMetricsRawLine[wordFontFamily][styleLookup].height[charUnicode] = [];
|
|
286
282
|
}
|
|
287
283
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
284
|
+
charMetricsRawLine[wordFontFamily][styleLookup].width[charUnicode].push(charWidth / charNorm);
|
|
285
|
+
charMetricsRawLine[wordFontFamily][styleLookup].height[charUnicode].push(charHeight / charNorm);
|
|
286
|
+
charMetricsRawLine[wordFontFamily][styleLookup].obs += 1;
|
|
291
287
|
|
|
292
288
|
if (k + 1 < wordObj.chars.length) {
|
|
293
289
|
const charObjNext = wordObj.chars[k + 1];
|
|
@@ -300,33 +296,33 @@ function calcFontMetricsPage(pageObj) {
|
|
|
300
296
|
if (trailingSpace + charWidthNext > 0) {
|
|
301
297
|
const bigramUnicode = `${charUnicode},${wordObj.chars[k + 1].text.charCodeAt(0)}`;
|
|
302
298
|
|
|
303
|
-
if (!
|
|
304
|
-
|
|
305
|
-
|
|
299
|
+
if (!charMetricsRawLine[wordFontFamily][styleLookup].kerning[bigramUnicode]) {
|
|
300
|
+
charMetricsRawLine[wordFontFamily][styleLookup].kerning[bigramUnicode] = [];
|
|
301
|
+
charMetricsRawLine[wordFontFamily][styleLookup].kerning2[bigramUnicode] = [];
|
|
306
302
|
}
|
|
307
|
-
|
|
308
|
-
|
|
303
|
+
charMetricsRawLine[wordFontFamily][styleLookup].kerning[bigramUnicode].push(trailingSpace / charNorm);
|
|
304
|
+
charMetricsRawLine[wordFontFamily][styleLookup].kerning2[bigramUnicode].push((trailingSpace + charWidthNext) / charNorm);
|
|
309
305
|
}
|
|
310
306
|
}
|
|
311
307
|
}
|
|
312
308
|
}
|
|
313
309
|
|
|
314
|
-
for (const [family, obj] of Object.entries(
|
|
310
|
+
for (const [family, obj] of Object.entries(charMetricsRawLine)) {
|
|
315
311
|
for (const [style, obj2] of Object.entries(obj)) {
|
|
316
312
|
if (Object.keys(obj2.width).length === 0) continue;
|
|
317
|
-
if (!
|
|
318
|
-
|
|
313
|
+
if (!charMetricsRawPage[family]) {
|
|
314
|
+
charMetricsRawPage[family] = new CharMetricsRawFamily();
|
|
319
315
|
}
|
|
320
316
|
}
|
|
321
317
|
}
|
|
322
318
|
|
|
323
|
-
for (const [family, obj] of Object.entries(
|
|
319
|
+
for (const [family, obj] of Object.entries(charMetricsRawPage)) {
|
|
324
320
|
for (const [style, obj2] of Object.entries(obj)) {
|
|
325
|
-
|
|
321
|
+
unionCharMetricsFont(charMetricsRawPage?.[family]?.[style], charMetricsRawLine?.[family]?.[style]);
|
|
326
322
|
}
|
|
327
323
|
}
|
|
328
324
|
}
|
|
329
325
|
}
|
|
330
326
|
|
|
331
|
-
return
|
|
327
|
+
return charMetricsRawPage;
|
|
332
328
|
}
|
package/js/generalWorkerMain.js
CHANGED
|
@@ -326,6 +326,10 @@ export class gs {
|
|
|
326
326
|
|
|
327
327
|
static terminate = async () => {
|
|
328
328
|
gs.clear();
|
|
329
|
+
// This function can be run while the scheduler is still initializing.
|
|
330
|
+
// This happens when we pre-load the scheduler, but then terminate before it finishes loading,
|
|
331
|
+
// and it is never actually used.
|
|
332
|
+
await gs.schedulerReady;
|
|
329
333
|
await gs.schedulerInner.terminate();
|
|
330
334
|
gs.schedulerInner = null;
|
|
331
335
|
gs.schedulerReady = null;
|
package/js/global.d.ts
CHANGED
|
@@ -10,6 +10,24 @@ declare global {
|
|
|
10
10
|
sup: boolean;
|
|
11
11
|
dropcap: boolean;
|
|
12
12
|
};
|
|
13
|
+
|
|
14
|
+
type FontState = {
|
|
15
|
+
enableOpt: boolean;
|
|
16
|
+
forceOpt: boolean;
|
|
17
|
+
enableCleanToNimbusMono: boolean;
|
|
18
|
+
defaultFontName: string;
|
|
19
|
+
serifDefaultName: string;
|
|
20
|
+
sansDefaultName: string;
|
|
21
|
+
glyphSet: null | 'latin' | 'all';
|
|
22
|
+
charMetrics: { [key: string]: CharMetricsFamily };
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
type ScribeSaveData = {
|
|
26
|
+
ocr: OcrPage[];
|
|
27
|
+
fontState: FontState;
|
|
28
|
+
layoutRegions: LayoutPage[];
|
|
29
|
+
layoutDataTables: LayoutDataTablePage[];
|
|
30
|
+
}
|
|
13
31
|
|
|
14
32
|
type StyleLookup = ('normal'|'bold'|'italic'|'boldItalic');
|
|
15
33
|
|
|
@@ -20,10 +38,10 @@ declare global {
|
|
|
20
38
|
type OcrChar = import("./objects/ocrObjects.js").OcrChar;
|
|
21
39
|
|
|
22
40
|
// Font objects
|
|
23
|
-
type
|
|
24
|
-
type
|
|
25
|
-
type
|
|
26
|
-
type
|
|
41
|
+
type CharMetricsFont = import("./objects/charMetricsObjects.js").CharMetricsFont;
|
|
42
|
+
type CharMetricsRawFamily = import("./objects/charMetricsObjects.js").CharMetricsRawFamily;
|
|
43
|
+
type CharMetricsFamily = import("./objects/charMetricsObjects.js").CharMetricsFamily;
|
|
44
|
+
type CharMetricsRawFont = import("./objects/charMetricsObjects.js").CharMetricsRawFont;
|
|
27
45
|
type FontContainerFont = import("./containers/fontContainer.js").FontContainerFont;
|
|
28
46
|
|
|
29
47
|
type FontContainerFamilyBuiltIn = {
|
|
@@ -58,7 +58,7 @@ export async function convertPageAbbyy({ ocrStr, n }) {
|
|
|
58
58
|
const warn = { char: 'char_error' };
|
|
59
59
|
|
|
60
60
|
return {
|
|
61
|
-
pageObj,
|
|
61
|
+
pageObj, charMetricsObj: {}, dataTables: new LayoutDataTablePage(n), warn,
|
|
62
62
|
};
|
|
63
63
|
}
|
|
64
64
|
|
|
@@ -82,6 +82,13 @@ export async function convertPageAbbyy({ ocrStr, n }) {
|
|
|
82
82
|
|
|
83
83
|
const textOrientationFinal = (pageOrientation + textOrientation) % 4;
|
|
84
84
|
|
|
85
|
+
/**
|
|
86
|
+
* Convert Abbyy XML paragraph to internal format.
|
|
87
|
+
* Note that Abbyy XML paragraphs are not preserved because paragraphs are re-assigned by the `assignParagraphs` function.
|
|
88
|
+
* Even if this function call was skipped in the code, when saving/restoring the state using .scribe files, paragraph data is not saved.
|
|
89
|
+
* Further development would be needed to preserve paragraph data.
|
|
90
|
+
* @param {string} xmlPar
|
|
91
|
+
*/
|
|
85
92
|
function convertParAbbyy(xmlPar) {
|
|
86
93
|
/** @type {Array<OcrLine>} */
|
|
87
94
|
const parLineArr = [];
|
|
@@ -5,7 +5,6 @@ import {
|
|
|
5
5
|
calcBoxOverlap,
|
|
6
6
|
calcLang,
|
|
7
7
|
mean50,
|
|
8
|
-
quantile,
|
|
9
8
|
round6,
|
|
10
9
|
unescapeXml,
|
|
11
10
|
} from '../utils/miscUtils.js';
|
|
@@ -295,6 +294,7 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
295
294
|
|
|
296
295
|
boldArr.push(boldWord);
|
|
297
296
|
italicArr.push(italicWord);
|
|
297
|
+
underlineArr.push(underlineWordArr.reduce((a, b) => Number(a) + Number(b), 0) / underlineWordArr.length > 0.5);
|
|
298
298
|
|
|
299
299
|
fontFamilyArr.push(fontFamily);
|
|
300
300
|
|