scribe.js-ocr 0.7.3 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/scribe.js +2 -0
- package/fonts/all/Carlito-BoldItalic.woff +0 -0
- package/fonts/all/Century-BoldItalic.woff +0 -0
- package/fonts/all/Garamond-BoldItalic.woff +0 -0
- package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/all/Palatino-BoldItalic.woff +0 -0
- package/fonts/latin/Carlito-BoldItalic.woff +0 -0
- package/fonts/latin/Century-BoldItalic.woff +0 -0
- package/fonts/latin/Garamond-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/latin/Palatino-BoldItalic.woff +0 -0
- package/js/clear.js +5 -6
- package/js/containers/app.js +1 -1
- package/js/containers/dataContainer.js +0 -3
- package/js/containers/fontContainer.js +91 -77
- package/js/export/export.js +20 -5
- package/js/export/writeHocr.js +20 -18
- package/js/export/writeHtml.js +1 -1
- package/js/export/writePdf.js +52 -14
- package/js/export/writePdfFonts.js +11 -9
- package/js/export/writeTabular.js +2 -2
- package/js/export/writeText.js +10 -6
- package/js/extractTables.js +5 -5
- package/js/fontContainerMain.js +92 -49
- package/js/fontEval.js +12 -12
- package/js/fontStatistics.js +93 -92
- package/js/fontSupp.js +20 -20
- package/js/generalWorkerMain.js +4 -0
- package/js/global.d.ts +39 -4
- package/js/import/convertPageAbbyy.js +55 -26
- package/js/import/convertPageBlocks.js +2 -2
- package/js/import/convertPageHocr.js +10 -20
- package/js/import/convertPageShared.js +13 -9
- package/js/import/convertPageStext.js +67 -32
- package/js/import/import.js +89 -45
- package/js/import/importOCR.js +27 -33
- package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
- package/js/objects/layoutObjects.js +37 -0
- package/js/objects/ocrObjects.js +55 -19
- package/js/recognizeConvert.js +21 -8
- package/js/utils/fontUtils.js +11 -11
- package/js/utils/miscUtils.js +43 -6
- package/js/worker/compareOCRModule.js +20 -23
- package/js/worker/generalWorker.js +5 -5
- package/js/worker/optimizeFontModule.js +19 -19
- package/mupdf/libmupdf.js +123 -17
- package/mupdf/libmupdf.wasm +0 -0
- package/package.json +6 -3
|
@@ -140,11 +140,11 @@ export async function convertPageBlocks({
|
|
|
140
140
|
// The `word` object has a `is_italic` property, but it is always false.
|
|
141
141
|
// Therefore, the font name is checked to determine if the word is italic.
|
|
142
142
|
// See: https://github.com/naptha/tesseract.js/issues/907
|
|
143
|
-
if (keepItalic && /italic/i.test(word.font_name)) wordObj.style =
|
|
143
|
+
if (keepItalic && /italic/i.test(word.font_name)) wordObj.style.italic = true;
|
|
144
144
|
|
|
145
145
|
// Our fork of Tesseract Legacy should be able to recognize fonts, so this information is included.
|
|
146
146
|
// The generic HOCR importer does not include font information, as this is assumed to be unreliable.
|
|
147
|
-
wordObj.font = word.font_name;
|
|
147
|
+
wordObj.style.font = word.font_name;
|
|
148
148
|
|
|
149
149
|
wordObj.chars = [];
|
|
150
150
|
for (let m = 0; m < word.symbols.length; m++) {
|
|
@@ -247,8 +247,8 @@ export async function convertPageHocr({
|
|
|
247
247
|
|
|
248
248
|
if (debugMode) wordObj.raw = match;
|
|
249
249
|
|
|
250
|
-
if (italic) wordObj.style =
|
|
251
|
-
if (fontName) wordObj.font = fontName;
|
|
250
|
+
if (italic) wordObj.style.italic = true;
|
|
251
|
+
if (fontName) wordObj.style.font = fontName;
|
|
252
252
|
|
|
253
253
|
wordObj.conf = wordConf;
|
|
254
254
|
|
|
@@ -302,19 +302,6 @@ export async function convertPageHocr({
|
|
|
302
302
|
|
|
303
303
|
const styleStr = match.match(/style=['"]([^'"]+)/)?.[1];
|
|
304
304
|
|
|
305
|
-
let smallCaps = false;
|
|
306
|
-
/** @type {('normal'|'italic'|'bold')} */
|
|
307
|
-
let fontStyle = 'normal';
|
|
308
|
-
if (styleStr && /italic/i.test(styleStr)) {
|
|
309
|
-
fontStyle = 'italic';
|
|
310
|
-
} else if (styleStr && /bold/i.test(styleStr)) {
|
|
311
|
-
fontStyle = 'bold';
|
|
312
|
-
}
|
|
313
|
-
|
|
314
|
-
if (styleStr && /small-caps/i.test(styleStr)) {
|
|
315
|
-
smallCaps = true;
|
|
316
|
-
}
|
|
317
|
-
|
|
318
305
|
const confMatch = titleStrWord.match(/(?:;|\s)x_wconf\s+(\d+)/)?.[1] || '0';
|
|
319
306
|
const wordConf = parseInt(confMatch) || 0;
|
|
320
307
|
|
|
@@ -327,16 +314,19 @@ export async function convertPageHocr({
|
|
|
327
314
|
const wordFontSizeStr = titleStrWord.match(/(?:;|\s)x_fsize\s+(\d+)/)?.[1];
|
|
328
315
|
if (wordFontSizeStr) {
|
|
329
316
|
const wordFontSize = parseInt(wordFontSizeStr);
|
|
330
|
-
if (wordFontSize) wordObj.size = wordFontSize;
|
|
317
|
+
if (wordFontSize) wordObj.style.size = wordFontSize;
|
|
331
318
|
}
|
|
332
319
|
}
|
|
333
320
|
|
|
334
|
-
|
|
335
|
-
|
|
321
|
+
if (styleStr) {
|
|
322
|
+
if (/italic/i.test(styleStr)) wordObj.style.italic = true;
|
|
323
|
+
if (/bold/i.test(styleStr)) wordObj.style.bold = true;
|
|
324
|
+
if (/small-caps/i.test(styleStr)) wordObj.style.smallCaps = true;
|
|
325
|
+
}
|
|
336
326
|
|
|
337
|
-
wordObj.sup =
|
|
327
|
+
if (wordSup) wordObj.style.sup = true;
|
|
338
328
|
|
|
339
|
-
wordObj.
|
|
329
|
+
if (fontName) wordObj.style.font = fontName;
|
|
340
330
|
|
|
341
331
|
wordObj.conf = wordConf;
|
|
342
332
|
|
|
@@ -41,7 +41,7 @@ export function pass2(pageObj, rotateAngle) {
|
|
|
41
41
|
for (let j = 0; j < lineObj.words.length; j++) {
|
|
42
42
|
const wordObj = lineObj.words[j];
|
|
43
43
|
// Skip words that are already identified as small caps, however they can be used to validate other words.
|
|
44
|
-
if (wordObj.smallCaps) {
|
|
44
|
+
if (wordObj.style.smallCaps) {
|
|
45
45
|
smallCapsWordArr.push(wordObj);
|
|
46
46
|
firstWord = true;
|
|
47
47
|
continue;
|
|
@@ -95,7 +95,7 @@ export function pass2(pageObj, rotateAngle) {
|
|
|
95
95
|
|
|
96
96
|
for (let k = 0; k < smallCapsWordArr.length; k++) {
|
|
97
97
|
const wordObj = smallCapsWordArr[k];
|
|
98
|
-
wordObj.smallCaps = true;
|
|
98
|
+
wordObj.style.smallCaps = true;
|
|
99
99
|
if (!wordObj.chars || !titleCaseTotal) continue;
|
|
100
100
|
|
|
101
101
|
// If title case, convert all letters after the first to lowercase.
|
|
@@ -161,8 +161,10 @@ export function pass2(pageObj, rotateAngle) {
|
|
|
161
161
|
|
|
162
162
|
// If the entire word is a superscript, it does not need to be split.
|
|
163
163
|
if (superN === wordObj.text.length) {
|
|
164
|
-
wordObj.sup = true;
|
|
165
|
-
wordObj.style =
|
|
164
|
+
wordObj.style.sup = true;
|
|
165
|
+
wordObj.style.bold = false;
|
|
166
|
+
wordObj.style.italic = false;
|
|
167
|
+
wordObj.style.underline = false;
|
|
166
168
|
continue;
|
|
167
169
|
}
|
|
168
170
|
|
|
@@ -182,8 +184,10 @@ export function pass2(pageObj, rotateAngle) {
|
|
|
182
184
|
|
|
183
185
|
wordObjSup.text = textSuper;
|
|
184
186
|
wordObjSup.chars = charSuperArr;
|
|
185
|
-
wordObjSup.style =
|
|
186
|
-
wordObjSup.
|
|
187
|
+
wordObjSup.style.bold = false;
|
|
188
|
+
wordObjSup.style.italic = false;
|
|
189
|
+
wordObjSup.style.underline = false;
|
|
190
|
+
wordObjSup.style.sup = true;
|
|
187
191
|
wordObjSup.id = `${wordObj.id}a`;
|
|
188
192
|
ocr.calcWordBbox(wordObjSup);
|
|
189
193
|
|
|
@@ -280,13 +284,13 @@ export function pass3(pageObj) {
|
|
|
280
284
|
|
|
281
285
|
// Do not include superscripts, dropcaps, and low-confidence words in all statistics.
|
|
282
286
|
// Low-confidence words are included for font size calculations, as some lines only contain low-confidence words.
|
|
283
|
-
if (wordObj.sup || wordObj.dropcap) continue;
|
|
287
|
+
if (wordObj.style.sup || wordObj.style.dropcap) continue;
|
|
284
288
|
|
|
285
289
|
const contentStrLetter = letterArr[k];
|
|
286
290
|
const charHeight = charObj.bbox.bottom - charObj.bbox.top;
|
|
287
291
|
|
|
288
|
-
const ascChar = wordObj.smallCaps && /[A-Z0-9]/.test(contentStrLetter) || !wordObj.smallCaps && ascCharArr.includes(contentStrLetter);
|
|
289
|
-
const xChar = wordObj.smallCaps && /[a-z]/.test(contentStrLetter) || !wordObj.smallCaps && xCharArr.includes(contentStrLetter);
|
|
292
|
+
const ascChar = wordObj.style.smallCaps && /[A-Z0-9]/.test(contentStrLetter) || !wordObj.style.smallCaps && ascCharArr.includes(contentStrLetter);
|
|
293
|
+
const xChar = wordObj.style.smallCaps && /[a-z]/.test(contentStrLetter) || !wordObj.style.smallCaps && xCharArr.includes(contentStrLetter);
|
|
290
294
|
|
|
291
295
|
// Save character heights to array for font size calculations
|
|
292
296
|
lineAllHeightArr.push(charHeight);
|
|
@@ -5,7 +5,6 @@ import {
|
|
|
5
5
|
calcBoxOverlap,
|
|
6
6
|
calcLang,
|
|
7
7
|
mean50,
|
|
8
|
-
quantile,
|
|
9
8
|
round6,
|
|
10
9
|
unescapeXml,
|
|
11
10
|
} from '../utils/miscUtils.js';
|
|
@@ -98,10 +97,11 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
98
97
|
let baselineCurrent = 0;
|
|
99
98
|
|
|
100
99
|
/** @type {Array<Array<string>>} */
|
|
101
|
-
const
|
|
100
|
+
const textArr = [];
|
|
102
101
|
/** @type {Array<number>} */
|
|
103
102
|
const wordLetterOrFontArrIndex = [];
|
|
104
|
-
let
|
|
103
|
+
let boldCurrent = false;
|
|
104
|
+
let italicCurrent = false;
|
|
105
105
|
let familyCurrent = 'Default';
|
|
106
106
|
/** Font size at the current position in the PDF, with no modifications. */
|
|
107
107
|
let sizeCurrentRaw = 0;
|
|
@@ -110,8 +110,14 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
110
110
|
let superCurrent = false;
|
|
111
111
|
let smallCapsCurrent;
|
|
112
112
|
let smallCapsCurrentAlt;
|
|
113
|
-
|
|
114
|
-
|
|
113
|
+
|
|
114
|
+
/** @type {Array<boolean>} */
|
|
115
|
+
const boldArr = [];
|
|
116
|
+
/** @type {Array<boolean>} */
|
|
117
|
+
const italicArr = [];
|
|
118
|
+
|
|
119
|
+
/** @type {Array<boolean>} */
|
|
120
|
+
const underlineArr = [];
|
|
115
121
|
/** @type {Array<boolean>} */
|
|
116
122
|
const smallCapsArr = [];
|
|
117
123
|
/** @type {Array<boolean>} */
|
|
@@ -144,6 +150,7 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
144
150
|
* @property {Quad} quad
|
|
145
151
|
* @property {Point} origin
|
|
146
152
|
* @property {string} text
|
|
153
|
+
* @property {number} flags
|
|
147
154
|
*/
|
|
148
155
|
|
|
149
156
|
/**
|
|
@@ -158,8 +165,7 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
158
165
|
// Sometimes the font is changed before a space character, and othertimes it is changed after the space character.
|
|
159
166
|
// This regex splits the string into elements that contain either (1) a font change or (2) a character.
|
|
160
167
|
// The "quad" attribute includes 8 numbers (x and y coordinates for all 4 corners) however we only use capturing groups for 4
|
|
161
|
-
const stextCharRegex = /(<font[^>]+>\s*)|<char quad=['"](\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)[^>]*?x=['"]([\d.-]+)[^>]*?y=['"]([\d.-]+)['"][^>]*?c=['"]
|
|
162
|
-
|
|
168
|
+
const stextCharRegex = /(<font[^>]+>\s*)|<char quad=['"](\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)[^>]*?x=['"]([\d.-]+)[^>]*?y=['"]([\d.-]+)['"]([^>]*?c=['"][^'"]+['"])\s*\/>/ig;
|
|
163
169
|
const stextMatches = [...wordStrArr[i].matchAll(stextCharRegex)];
|
|
164
170
|
|
|
165
171
|
wordCharOrFontArr[i] = [];
|
|
@@ -167,7 +173,8 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
167
173
|
const fontStr = stextMatches[j][1];
|
|
168
174
|
const fontNameStrI = fontStr?.match(/name=['"]([^'"]*)/)?.[1];
|
|
169
175
|
const fontSizeStrI = fontStr?.match(/size=['"]([^'"]*)/)?.[1];
|
|
170
|
-
|
|
176
|
+
// fontNameStrI can exist but be an empty string. Therefore, truthy/falsy checks are not sufficient.
|
|
177
|
+
if (fontNameStrI !== undefined && fontSizeStrI !== undefined) {
|
|
171
178
|
// Skip font changes that occur at the end of a line.
|
|
172
179
|
// In addition to being unnecessary, these are problematic when parsing superscripts.
|
|
173
180
|
if (i + 1 === wordStrArr.length && j + 1 === stextMatches.length) continue;
|
|
@@ -209,10 +216,14 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
209
216
|
};
|
|
210
217
|
}
|
|
211
218
|
|
|
219
|
+
const flags = parseInt(stextMatches[j][12]?.match(/flags=['"]([^'"]*)/)?.[1]);
|
|
220
|
+
const text = stextMatches[j][12]?.match(/c=['"]([^'"]*)/)?.[1];
|
|
221
|
+
|
|
212
222
|
wordCharOrFontArr[i][j] = {
|
|
213
223
|
quad,
|
|
214
224
|
origin: { x: parseFloat(stextMatches[j][10]), y: parseFloat(stextMatches[j][11]) },
|
|
215
|
-
|
|
225
|
+
flags,
|
|
226
|
+
text,
|
|
216
227
|
};
|
|
217
228
|
}
|
|
218
229
|
}
|
|
@@ -220,6 +231,7 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
220
231
|
for (let i = 0; i < wordCharOrFontArr.length; i++) {
|
|
221
232
|
let textWordArr = [];
|
|
222
233
|
let bboxesWordArr = [];
|
|
234
|
+
const underlineWordArr = [];
|
|
223
235
|
let fontFamily = familyCurrent || fontFamilyLine || 'Default';
|
|
224
236
|
// Font size for the word is a separate variable, as if a font size changes at the end of the word,
|
|
225
237
|
// that should not be reflected until the following word.
|
|
@@ -228,7 +240,8 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
228
240
|
let smallCapsWordAlt = smallCapsCurrentAlt || false;
|
|
229
241
|
// Title case adjustment does not carry forward between words. A word in title case may be followed by a word in all lower case.
|
|
230
242
|
let smallCapsWordAltTitleCaseAdj = false;
|
|
231
|
-
let
|
|
243
|
+
let boldWord = false;
|
|
244
|
+
let italicWord = false;
|
|
232
245
|
|
|
233
246
|
if (wordCharOrFontArr[i].length === 0) continue;
|
|
234
247
|
|
|
@@ -276,9 +289,13 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
276
289
|
&& ((baselineDelta < -0.25 && sizeDelta < -0.05) || (baselineDelta > 0.25 && sizeDelta > 0.05))) {
|
|
277
290
|
// Split word when superscript starts or ends.
|
|
278
291
|
if (textWordArr.length > 0) {
|
|
279
|
-
|
|
292
|
+
textArr.push(textWordArr);
|
|
280
293
|
bboxes.push(bboxesWordArr);
|
|
281
|
-
|
|
294
|
+
|
|
295
|
+
boldArr.push(boldWord);
|
|
296
|
+
italicArr.push(italicWord);
|
|
297
|
+
underlineArr.push(underlineWordArr.reduce((a, b) => Number(a) + Number(b), 0) / underlineWordArr.length > 0.5);
|
|
298
|
+
|
|
282
299
|
fontFamilyArr.push(fontFamily);
|
|
283
300
|
|
|
284
301
|
if (sizeDelta > 0) {
|
|
@@ -341,11 +358,15 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
341
358
|
if (/italic/i.test(charOrFont.name) || /-\w*ital/i.test(charOrFont.name) || /-it$/i.test(charOrFont.name) || /oblique/i.test(charOrFont.name)) {
|
|
342
359
|
// The word is already initialized, so we need to change the last element of the style array.
|
|
343
360
|
// Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
|
|
344
|
-
|
|
345
|
-
} else if (/bold|black/i.test(charOrFont.name)) {
|
|
346
|
-
styleCurrent = 'bold';
|
|
361
|
+
italicCurrent = true;
|
|
347
362
|
} else {
|
|
348
|
-
|
|
363
|
+
italicCurrent = false;
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
if (/bold|black/i.test(charOrFont.name)) {
|
|
367
|
+
boldCurrent = true;
|
|
368
|
+
} else {
|
|
369
|
+
boldCurrent = false;
|
|
349
370
|
}
|
|
350
371
|
|
|
351
372
|
continue;
|
|
@@ -354,7 +375,9 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
354
375
|
}
|
|
355
376
|
|
|
356
377
|
if (!wordInit) {
|
|
357
|
-
|
|
378
|
+
boldWord = boldCurrent;
|
|
379
|
+
italicWord = italicCurrent;
|
|
380
|
+
|
|
358
381
|
wordInit = true;
|
|
359
382
|
}
|
|
360
383
|
|
|
@@ -411,15 +434,23 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
411
434
|
|
|
412
435
|
textWordArr.push(charOrFont.text);
|
|
413
436
|
|
|
437
|
+
underlineWordArr.push(charOrFont.flags === 2);
|
|
438
|
+
|
|
414
439
|
bboxesWordArr.push(bbox);
|
|
415
440
|
}
|
|
416
441
|
|
|
417
442
|
if (textWordArr.length === 0) continue;
|
|
418
443
|
|
|
444
|
+
const underlineWord = underlineWordArr.reduce((a, b) => Number(a) + Number(b), 0) / underlineWordArr.length > 0.5;
|
|
445
|
+
underlineArr.push(underlineWord);
|
|
446
|
+
|
|
419
447
|
wordLetterOrFontArrIndex.push(i);
|
|
420
|
-
|
|
448
|
+
textArr.push(textWordArr);
|
|
421
449
|
bboxes.push(bboxesWordArr);
|
|
422
|
-
|
|
450
|
+
|
|
451
|
+
boldArr.push(boldWord);
|
|
452
|
+
italicArr.push(italicWord);
|
|
453
|
+
|
|
423
454
|
fontFamilyArr.push(fontFamily);
|
|
424
455
|
fontSizeArr.push(fontSizeWord);
|
|
425
456
|
smallCapsAltArr.push(smallCapsWordAlt);
|
|
@@ -476,8 +507,8 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
476
507
|
lineObj.raw = xmlLine;
|
|
477
508
|
|
|
478
509
|
let lettersKept = 0;
|
|
479
|
-
for (let i = 0; i <
|
|
480
|
-
const wordText = unescapeXml(
|
|
510
|
+
for (let i = 0; i < textArr.length; i++) {
|
|
511
|
+
const wordText = unescapeXml(textArr[i].join(''));
|
|
481
512
|
|
|
482
513
|
if (wordText.trim() === '') continue;
|
|
483
514
|
|
|
@@ -490,8 +521,8 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
490
521
|
/** @type {Array<OcrChar>} */
|
|
491
522
|
const charObjArr = [];
|
|
492
523
|
|
|
493
|
-
for (let j = 0; j <
|
|
494
|
-
const letter = unescapeXml(
|
|
524
|
+
for (let j = 0; j < textArr[i].length; j++) {
|
|
525
|
+
const letter = unescapeXml(textArr[i][j]);
|
|
495
526
|
|
|
496
527
|
const bbox = bboxesI[j];
|
|
497
528
|
|
|
@@ -526,7 +557,7 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
526
557
|
if (bbox.left < 0 && bbox.right < 0) continue;
|
|
527
558
|
|
|
528
559
|
const wordObj = new ocr.OcrWord(lineObj, wordText, bbox, wordID);
|
|
529
|
-
wordObj.size = fontSizeArr[i];
|
|
560
|
+
wordObj.style.size = fontSizeArr[i];
|
|
530
561
|
|
|
531
562
|
wordObj.lang = wordLang;
|
|
532
563
|
|
|
@@ -540,7 +571,7 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
540
571
|
wordObj.conf = 100;
|
|
541
572
|
|
|
542
573
|
if (smallCapsAltArr[i] && !/[a-z]/.test(wordObj.text) && /[A-Z].?[A-Z]/.test(wordObj.text)) {
|
|
543
|
-
wordObj.smallCaps = true;
|
|
574
|
+
wordObj.style.smallCaps = true;
|
|
544
575
|
if (smallCapsAltTitleCaseArr[i]) {
|
|
545
576
|
wordObj.chars.slice(1).forEach((x) => {
|
|
546
577
|
x.text = x.text.toLowerCase();
|
|
@@ -552,20 +583,24 @@ export async function convertPageStext({ ocrStr, n }) {
|
|
|
552
583
|
}
|
|
553
584
|
wordObj.text = wordObj.chars.map((x) => x.text).join('');
|
|
554
585
|
} else if (smallCapsArr[i]) {
|
|
555
|
-
wordObj.smallCaps = true;
|
|
586
|
+
wordObj.style.smallCaps = true;
|
|
556
587
|
}
|
|
557
588
|
|
|
558
|
-
if (
|
|
559
|
-
wordObj.style =
|
|
560
|
-
}
|
|
561
|
-
|
|
589
|
+
if (italicArr[i]) {
|
|
590
|
+
wordObj.style.italic = true;
|
|
591
|
+
}
|
|
592
|
+
|
|
593
|
+
if (boldArr[i]) {
|
|
594
|
+
wordObj.style.bold = true;
|
|
562
595
|
}
|
|
563
596
|
|
|
564
597
|
wordObj.raw = wordStrArr[wordLetterOrFontArrIndex[i]];
|
|
565
598
|
|
|
566
|
-
wordObj.font = fontFamilyArr[i];
|
|
599
|
+
wordObj.style.font = fontFamilyArr[i];
|
|
600
|
+
|
|
601
|
+
wordObj.style.sup = superArr[i];
|
|
567
602
|
|
|
568
|
-
wordObj.
|
|
603
|
+
wordObj.style.underline = underlineArr[i];
|
|
569
604
|
|
|
570
605
|
lineObj.words.push(wordObj);
|
|
571
606
|
|
package/js/import/import.js
CHANGED
|
@@ -2,7 +2,6 @@ import { clearData } from '../clear.js';
|
|
|
2
2
|
import { inputData, opt } from '../containers/app.js';
|
|
3
3
|
import {
|
|
4
4
|
convertPageWarn,
|
|
5
|
-
fontMetricsObj,
|
|
6
5
|
layoutDataTables,
|
|
7
6
|
layoutRegions,
|
|
8
7
|
ocrAll,
|
|
@@ -18,14 +17,15 @@ import {
|
|
|
18
17
|
optimizeFontContainerAll, setDefaultFontAuto,
|
|
19
18
|
} from '../fontContainerMain.js';
|
|
20
19
|
import { runFontOptimization } from '../fontEval.js';
|
|
21
|
-
import {
|
|
20
|
+
import { calcCharMetricsFromPages } from '../fontStatistics.js';
|
|
22
21
|
import { calcSuppFontInfo } from '../fontSupp.js';
|
|
23
22
|
import { gs } from '../generalWorkerMain.js';
|
|
24
23
|
import { imageUtils } from '../objects/imageObjects.js';
|
|
25
|
-
import { LayoutDataTablePage, LayoutPage } from '../objects/layoutObjects.js';
|
|
24
|
+
import { addCircularRefsDataTables, LayoutDataTablePage, LayoutPage } from '../objects/layoutObjects.js';
|
|
25
|
+
import { addCircularRefsOcr } from '../objects/ocrObjects.js';
|
|
26
26
|
import { PageMetrics } from '../objects/pageMetricsObjects.js';
|
|
27
27
|
import { checkCharWarn, convertOCR } from '../recognizeConvert.js';
|
|
28
|
-
import {
|
|
28
|
+
import { readOcrFile, clearObjectProperties, objectAssignDefined } from '../utils/miscUtils.js';
|
|
29
29
|
import { importOCRFiles } from './importOCR.js';
|
|
30
30
|
|
|
31
31
|
/**
|
|
@@ -141,6 +141,8 @@ export function sortInputFiles(files) {
|
|
|
141
141
|
/** @type {Array<File|FileNode>} */
|
|
142
142
|
const pdfFilesAll = [];
|
|
143
143
|
/** @type {Array<File|FileNode>} */
|
|
144
|
+
const scribeFilesAll = [];
|
|
145
|
+
/** @type {Array<File|FileNode>} */
|
|
144
146
|
const unsupportedFilesAll = [];
|
|
145
147
|
const unsupportedExt = {};
|
|
146
148
|
for (let i = 0; i < files.length; i++) {
|
|
@@ -156,6 +158,8 @@ export function sortInputFiles(files) {
|
|
|
156
158
|
// All .gz files are assumed to be OCR data (xml) since all other file types can be compressed already
|
|
157
159
|
} else if (['hocr', 'xml', 'html', 'gz', 'stext'].includes(fileExt)) {
|
|
158
160
|
ocrFilesAll.push(file);
|
|
161
|
+
} else if (['scribe'].includes(fileExt)) {
|
|
162
|
+
scribeFilesAll.push(file);
|
|
159
163
|
} else if (['pdf'].includes(fileExt)) {
|
|
160
164
|
pdfFilesAll.push(file);
|
|
161
165
|
} else {
|
|
@@ -172,7 +176,9 @@ export function sortInputFiles(files) {
|
|
|
172
176
|
imageFilesAll.sort((a, b) => ((a.name > b.name) ? 1 : ((b.name > a.name) ? -1 : 0)));
|
|
173
177
|
ocrFilesAll.sort((a, b) => ((a.name > b.name) ? 1 : ((b.name > a.name) ? -1 : 0)));
|
|
174
178
|
|
|
175
|
-
return {
|
|
179
|
+
return {
|
|
180
|
+
pdfFiles: pdfFilesAll, imageFiles: imageFilesAll, ocrFiles: ocrFilesAll, scribeFiles: scribeFilesAll,
|
|
181
|
+
};
|
|
176
182
|
}
|
|
177
183
|
|
|
178
184
|
/**
|
|
@@ -184,6 +190,7 @@ export function sortInputFiles(files) {
|
|
|
184
190
|
* @property {Array<File>|Array<string>|Array<ArrayBuffer>} [pdfFiles]
|
|
185
191
|
* @property {Array<File>|Array<string>|Array<ArrayBuffer>} [imageFiles]
|
|
186
192
|
* @property {Array<File>|Array<string>|Array<ArrayBuffer>} [ocrFiles]
|
|
193
|
+
* @property {Array<File>|Array<string>|Array<ArrayBuffer>} [scribeFiles]
|
|
187
194
|
*/
|
|
188
195
|
|
|
189
196
|
/**
|
|
@@ -205,9 +212,11 @@ export async function importFiles(files) {
|
|
|
205
212
|
let imageFiles = [];
|
|
206
213
|
/** @type {Array<File|FileNode|ArrayBuffer>} */
|
|
207
214
|
let ocrFiles = [];
|
|
215
|
+
/** @type {Array<File|FileNode|ArrayBuffer>} */
|
|
216
|
+
let scribeFiles = [];
|
|
208
217
|
// These statements contain many ts-ignore comments, because the TypeScript interpreter apparently cannot properly narrow arrays.
|
|
209
218
|
// See: https://github.com/microsoft/TypeScript/issues/42384
|
|
210
|
-
if ('pdfFiles' in files || 'imageFiles' in files || 'ocrFiles' in files) {
|
|
219
|
+
if ('pdfFiles' in files || 'imageFiles' in files || 'ocrFiles' in files || 'scribeFiles' in files) {
|
|
211
220
|
if (files.pdfFiles && files.pdfFiles[0] instanceof ArrayBuffer) {
|
|
212
221
|
// @ts-ignore
|
|
213
222
|
pdfFiles = files.pdfFiles;
|
|
@@ -229,14 +238,23 @@ export async function importFiles(files) {
|
|
|
229
238
|
// @ts-ignore
|
|
230
239
|
ocrFiles = await standardizeFiles(files.ocrFiles);
|
|
231
240
|
}
|
|
241
|
+
if (files.scribeFiles && files.scribeFiles[0] instanceof ArrayBuffer) {
|
|
242
|
+
// @ts-ignore
|
|
243
|
+
scribeFiles = files.scribeFiles;
|
|
244
|
+
} else if (files.scribeFiles) {
|
|
245
|
+
// @ts-ignore
|
|
246
|
+
scribeFiles = await standardizeFiles(files.scribeFiles);
|
|
247
|
+
}
|
|
232
248
|
} else {
|
|
233
249
|
// @ts-ignore
|
|
234
250
|
const filesStand = await standardizeFiles(files);
|
|
235
251
|
if (files[0] instanceof ArrayBuffer) throw new Error('ArrayBuffer inputs must be sorted by file type.');
|
|
236
|
-
({
|
|
252
|
+
({
|
|
253
|
+
pdfFiles, imageFiles, ocrFiles, scribeFiles,
|
|
254
|
+
} = sortInputFiles(filesStand));
|
|
237
255
|
}
|
|
238
256
|
|
|
239
|
-
if (pdfFiles.length === 0 && imageFiles.length === 0 && ocrFiles.length === 0) {
|
|
257
|
+
if (pdfFiles.length === 0 && imageFiles.length === 0 && ocrFiles.length === 0 && scribeFiles.length === 0) {
|
|
240
258
|
const errorText = 'No supported files found.';
|
|
241
259
|
opt.errorHandler(errorText);
|
|
242
260
|
return;
|
|
@@ -261,23 +279,61 @@ export async function importFiles(files) {
|
|
|
261
279
|
|
|
262
280
|
// Set default download name
|
|
263
281
|
if (pdfFiles.length > 0 && 'name' in pdfFiles[0]) {
|
|
264
|
-
inputData.defaultDownloadFileName = `${pdfFiles[0].name.replace(/\.\w{1,
|
|
282
|
+
inputData.defaultDownloadFileName = `${pdfFiles[0].name.replace(/\.\w{1,6}$/, '')}.pdf`;
|
|
265
283
|
} else if (imageFiles.length > 0 && 'name' in imageFiles[0]) {
|
|
266
|
-
inputData.defaultDownloadFileName = `${imageFiles[0].name.replace(/\.\w{1,
|
|
284
|
+
inputData.defaultDownloadFileName = `${imageFiles[0].name.replace(/\.\w{1,6}$/, '')}.pdf`;
|
|
267
285
|
} else if (ocrFiles.length > 0 && 'name' in ocrFiles[0]) {
|
|
268
|
-
inputData.defaultDownloadFileName = `${ocrFiles[0].name.replace(/\.\w{1,
|
|
286
|
+
inputData.defaultDownloadFileName = `${ocrFiles[0].name.replace(/\.\w{1,6}$/, '')}.pdf`;
|
|
287
|
+
} else if (scribeFiles.length > 0 && 'name' in scribeFiles[0]) {
|
|
288
|
+
inputData.defaultDownloadFileName = `${scribeFiles[0].name.replace(/\.\w{1,6}$/, '')}.pdf`;
|
|
269
289
|
}
|
|
270
290
|
|
|
291
|
+
let existingLayout = false;
|
|
292
|
+
let existingLayoutDataTable = false;
|
|
293
|
+
|
|
271
294
|
inputData.pdfMode = pdfFiles.length === 1;
|
|
272
295
|
inputData.imageMode = !!(imageFiles.length > 0 && !inputData.pdfMode);
|
|
273
296
|
ImageCache.inputModes.image = !!(imageFiles.length > 0 && !inputData.pdfMode);
|
|
274
297
|
|
|
298
|
+
if (scribeFiles.length > 0) {
|
|
299
|
+
const scribeRestoreStr = await readOcrFile(scribeFiles[0]);
|
|
300
|
+
/** @type {ScribeSaveData} */
|
|
301
|
+
const scribeRestoreObj = JSON.parse(scribeRestoreStr);
|
|
302
|
+
if (scribeRestoreObj.fontState) {
|
|
303
|
+
objectAssignDefined(FontCont.state, scribeRestoreObj.fontState);
|
|
304
|
+
await runFontOptimization(ocrAll.active);
|
|
305
|
+
}
|
|
306
|
+
if (scribeRestoreObj.layoutRegions) {
|
|
307
|
+
existingLayout = true;
|
|
308
|
+
layoutRegions.pages = scribeRestoreObj.layoutRegions;
|
|
309
|
+
}
|
|
310
|
+
if (scribeRestoreObj.layoutDataTables) {
|
|
311
|
+
existingLayoutDataTable = true;
|
|
312
|
+
addCircularRefsDataTables(scribeRestoreObj.layoutDataTables);
|
|
313
|
+
layoutDataTables.pages = scribeRestoreObj.layoutDataTables;
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
const oemName = 'User Upload';
|
|
317
|
+
if (!ocrAll[oemName]) ocrAll[oemName] = Array(inputData.pageCount);
|
|
318
|
+
addCircularRefsOcr(scribeRestoreObj.ocr);
|
|
319
|
+
ocrAll[oemName] = scribeRestoreObj.ocr;
|
|
320
|
+
ocrAll.active = ocrAll[oemName];
|
|
321
|
+
|
|
322
|
+
for (let i = 0; i < ocrAll[oemName].length; i++) {
|
|
323
|
+
inputData.xmlMode[i] = true;
|
|
324
|
+
if (ocrAll[oemName][i].dims.height && ocrAll[oemName][i].dims.width) {
|
|
325
|
+
pageMetricsArr[i] = new PageMetrics(ocrAll[oemName][i].dims);
|
|
326
|
+
}
|
|
327
|
+
pageMetricsArr[i].angle = ocrAll[oemName][i].angle;
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
|
|
275
331
|
const xmlModeImport = ocrFiles.length > 0;
|
|
276
332
|
|
|
277
333
|
let pageCount;
|
|
278
334
|
let pageCountImage;
|
|
279
335
|
let abbyyMode = false;
|
|
280
|
-
let
|
|
336
|
+
let reimportHocrMode = false;
|
|
281
337
|
|
|
282
338
|
if (inputData.pdfMode) {
|
|
283
339
|
const pdfFile = pdfFiles[0];
|
|
@@ -296,8 +352,6 @@ export async function importFiles(files) {
|
|
|
296
352
|
pageCountImage = imageFiles.length;
|
|
297
353
|
}
|
|
298
354
|
|
|
299
|
-
let existingLayout = false;
|
|
300
|
-
let existingLayoutDataTable = false;
|
|
301
355
|
let existingOpt = false;
|
|
302
356
|
const oemName = 'User Upload';
|
|
303
357
|
let stextMode;
|
|
@@ -317,41 +371,32 @@ export async function importFiles(files) {
|
|
|
317
371
|
ocrAllRaw.active = ocrAllRaw.active.slice(0, pageCountImage);
|
|
318
372
|
}
|
|
319
373
|
|
|
374
|
+
objectAssignDefined(FontCont.state, ocrData.fontState);
|
|
375
|
+
|
|
320
376
|
// Restore font metrics and optimize font from previous session (if applicable)
|
|
321
|
-
if (ocrData.
|
|
377
|
+
if (ocrData.fontState.charMetrics && Object.keys(ocrData.fontState.charMetrics).length > 0) {
|
|
322
378
|
const fontPromise = loadBuiltInFontsRaw();
|
|
323
379
|
|
|
324
380
|
existingOpt = true;
|
|
325
381
|
|
|
326
|
-
replaceObjectProperties(fontMetricsObj, ocrData.fontMetricsObj);
|
|
327
382
|
await gs.schedulerReady;
|
|
328
|
-
setDefaultFontAuto(
|
|
383
|
+
setDefaultFontAuto(FontCont.state.charMetrics);
|
|
329
384
|
|
|
330
385
|
// If `ocrData.enableOpt` is `false`, then the metrics are present but ignored.
|
|
331
386
|
// This occurs if optimization was found to decrease accuracy for both sans and serif,
|
|
332
387
|
// not simply because the user disabled optimization in the view settings.
|
|
333
388
|
// If no `enableOpt` property exists but metrics are present, then optimization is enabled.
|
|
334
389
|
if (ocrData.enableOpt === 'false') {
|
|
335
|
-
FontCont.enableOpt = false;
|
|
390
|
+
FontCont.state.enableOpt = false;
|
|
336
391
|
} else {
|
|
337
392
|
await fontPromise;
|
|
338
393
|
if (!FontCont.raw) throw new Error('Raw font data not found.');
|
|
339
|
-
FontCont.opt = await optimizeFontContainerAll(FontCont.raw,
|
|
340
|
-
FontCont.enableOpt = true;
|
|
394
|
+
FontCont.opt = await optimizeFontContainerAll(FontCont.raw, FontCont.state.charMetrics);
|
|
395
|
+
FontCont.state.enableOpt = true;
|
|
341
396
|
await enableFontOpt(true);
|
|
342
397
|
}
|
|
343
398
|
}
|
|
344
399
|
|
|
345
|
-
if (ocrData.defaultFont) FontCont.defaultFontName = ocrData.defaultFont;
|
|
346
|
-
|
|
347
|
-
if (ocrData.sansFont) {
|
|
348
|
-
FontCont.sansDefaultName = ocrData.sansFont;
|
|
349
|
-
}
|
|
350
|
-
|
|
351
|
-
if (ocrData.serifFont) {
|
|
352
|
-
FontCont.serifDefaultName = ocrData.serifFont;
|
|
353
|
-
}
|
|
354
|
-
|
|
355
400
|
// Restore layout data from previous session (if applicable)
|
|
356
401
|
if (ocrData.layoutObj) {
|
|
357
402
|
for (let i = 0; i < ocrData.layoutObj.length; i++) {
|
|
@@ -368,22 +413,22 @@ export async function importFiles(files) {
|
|
|
368
413
|
}
|
|
369
414
|
|
|
370
415
|
abbyyMode = ocrData.abbyyMode;
|
|
371
|
-
|
|
416
|
+
reimportHocrMode = ocrData.reimportHocrMode;
|
|
372
417
|
|
|
373
418
|
stextMode = ocrData.stextMode;
|
|
374
419
|
}
|
|
375
420
|
|
|
376
|
-
const
|
|
421
|
+
const pageCountOcr = ocrAllRaw.active?.length || ocrAll.active?.length || 0;
|
|
377
422
|
|
|
378
423
|
// If both OCR data and image data are present, confirm they have the same number of pages
|
|
379
424
|
if (xmlModeImport && (inputData.imageMode || inputData.pdfMode)) {
|
|
380
|
-
if (pageCountImage !==
|
|
381
|
-
const warningHTML = `Page mismatch detected. Image data has ${pageCountImage} pages while OCR data has ${
|
|
425
|
+
if (pageCountImage !== pageCountOcr) {
|
|
426
|
+
const warningHTML = `Page mismatch detected. Image data has ${pageCountImage} pages while OCR data has ${pageCountOcr} pages.`;
|
|
382
427
|
opt.warningHandler(warningHTML);
|
|
383
428
|
}
|
|
384
429
|
}
|
|
385
430
|
|
|
386
|
-
inputData.pageCount = pageCountImage ??
|
|
431
|
+
inputData.pageCount = pageCountImage ?? pageCountOcr;
|
|
387
432
|
|
|
388
433
|
ocrAllRaw.active = ocrAllRaw.active || Array(pageCount);
|
|
389
434
|
|
|
@@ -399,10 +444,6 @@ export async function importFiles(files) {
|
|
|
399
444
|
}
|
|
400
445
|
}
|
|
401
446
|
|
|
402
|
-
inputData.xmlMode = new Array(inputData.pageCount);
|
|
403
|
-
|
|
404
|
-
inputData.xmlMode.fill(false);
|
|
405
|
-
|
|
406
447
|
// Render first page for PDF only
|
|
407
448
|
if (inputData.pdfMode && !xmlModeImport) {
|
|
408
449
|
opt.progressHandler({ n: 0, type: 'importPDF', info: { } });
|
|
@@ -429,18 +470,23 @@ export async function importFiles(files) {
|
|
|
429
470
|
if (stextMode) format = 'stext';
|
|
430
471
|
|
|
431
472
|
// Process HOCR using web worker, reading from file first if that has not been done already
|
|
432
|
-
await convertOCR(ocrAllRaw.active, true, format, oemName,
|
|
473
|
+
await convertOCR(ocrAllRaw.active, true, format, oemName, reimportHocrMode).then(async () => {
|
|
433
474
|
// Skip this step if optimization info was already restored from a previous session, or if using stext (which is character-level but not visually accurate).
|
|
434
475
|
if (!existingOpt && !stextMode) {
|
|
435
476
|
await checkCharWarn(convertPageWarn);
|
|
436
|
-
|
|
477
|
+
const charMetrics = calcCharMetricsFromPages(ocrAll.active);
|
|
478
|
+
|
|
479
|
+
if (Object.keys(charMetrics).length > 0) {
|
|
480
|
+
clearObjectProperties(FontCont.state.charMetrics);
|
|
481
|
+
Object.assign(FontCont.state.charMetrics, charMetrics);
|
|
482
|
+
}
|
|
437
483
|
await runFontOptimization(ocrAll.active);
|
|
438
484
|
}
|
|
439
485
|
});
|
|
440
486
|
} else if (inputData.pdfMode && (opt.usePDFText.native.main || opt.usePDFText.native.supp || opt.usePDFText.ocr.main || opt.usePDFText.ocr.supp)) {
|
|
441
487
|
await extractInternalPDFText();
|
|
442
488
|
if (inputData.pdfType === 'text' && opt.usePDFText.native.main || inputData.pdfType === 'ocr' && opt.usePDFText.ocr.main) {
|
|
443
|
-
if (inputData.pdfType === 'text') FontCont.enableCleanToNimbusMono = true;
|
|
489
|
+
if (inputData.pdfType === 'text') FontCont.state.enableCleanToNimbusMono = true;
|
|
444
490
|
if (opt.calcSuppFontInfo) await calcSuppFontInfo(ocrAll.pdf);
|
|
445
491
|
}
|
|
446
492
|
}
|
|
@@ -467,8 +513,6 @@ export async function importFilesSupp(files, ocrName) {
|
|
|
467
513
|
|
|
468
514
|
const ocrData = await importOCRFiles(ocrFilesAll);
|
|
469
515
|
|
|
470
|
-
const scribeMode = ocrData.scribeMode;
|
|
471
|
-
|
|
472
516
|
const pageCountHOCR = ocrData.hocrRaw.length;
|
|
473
517
|
|
|
474
518
|
// If both OCR data and image data are present, confirm they have the same number of pages
|
|
@@ -482,5 +526,5 @@ export async function importFilesSupp(files, ocrName) {
|
|
|
482
526
|
if (ocrData.abbyyMode) format = 'abbyy';
|
|
483
527
|
if (ocrData.stextMode) format = 'stext';
|
|
484
528
|
|
|
485
|
-
await convertOCR(ocrData.hocrRaw, false, format, ocrName,
|
|
529
|
+
await convertOCR(ocrData.hocrRaw, false, format, ocrName, ocrData.reimportHocrMode);
|
|
486
530
|
}
|