scribe.js-ocr 0.7.3 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/fonts/all/Carlito-BoldItalic.woff +0 -0
- package/fonts/all/Century-BoldItalic.woff +0 -0
- package/fonts/all/Garamond-BoldItalic.woff +0 -0
- package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/all/Palatino-BoldItalic.woff +0 -0
- package/fonts/latin/Carlito-BoldItalic.woff +0 -0
- package/fonts/latin/Century-BoldItalic.woff +0 -0
- package/fonts/latin/Garamond-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/latin/Palatino-BoldItalic.woff +0 -0
- package/js/containers/app.js +1 -1
- package/js/containers/fontContainer.js +42 -40
- package/js/export/writeHocr.js +15 -13
- package/js/export/writeHtml.js +1 -1
- package/js/export/writePdf.js +52 -14
- package/js/export/writePdfFonts.js +11 -9
- package/js/export/writeTabular.js +2 -2
- package/js/export/writeText.js +10 -6
- package/js/extractTables.js +5 -5
- package/js/fontContainerMain.js +50 -7
- package/js/fontStatistics.js +18 -13
- package/js/fontSupp.js +20 -20
- package/js/global.d.ts +17 -0
- package/js/import/convertPageAbbyy.js +47 -25
- package/js/import/convertPageBlocks.js +2 -2
- package/js/import/convertPageHocr.js +10 -20
- package/js/import/convertPageShared.js +13 -9
- package/js/import/convertPageStext.js +66 -31
- package/js/objects/ocrObjects.js +13 -19
- package/js/utils/fontUtils.js +11 -11
- package/js/utils/miscUtils.js +16 -0
- package/js/worker/compareOCRModule.js +13 -16
- package/js/worker/optimizeFontModule.js +4 -4
- package/mupdf/libmupdf.js +123 -17
- package/mupdf/libmupdf.wasm +0 -0
- package/package.json +1 -1
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/js/containers/app.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
// Node.js case
|
|
7
7
|
import opentype from '../../lib/opentype.module.js';
|
|
8
|
-
import { determineSansSerif } from '../utils/miscUtils.js';
|
|
8
|
+
import { determineSansSerif, getStyleLookup } from '../utils/miscUtils.js';
|
|
9
9
|
import { ca } from '../canvasAdapter.js';
|
|
10
10
|
|
|
11
11
|
if (typeof process === 'object') {
|
|
@@ -104,26 +104,26 @@ export function loadFontFace(fontFamily, fontStyle, fontWeight, src) {
|
|
|
104
104
|
* Load font from source and return a FontContainerFont object.
|
|
105
105
|
* This function is used to load the Chinese font.
|
|
106
106
|
* @param {string} family
|
|
107
|
-
* @param {
|
|
107
|
+
* @param {StyleLookup} styleLookup
|
|
108
108
|
* @param {("sans"|"serif")} type
|
|
109
109
|
* @param {ArrayBuffer} src
|
|
110
110
|
* @param {boolean} opt
|
|
111
111
|
*
|
|
112
112
|
*/
|
|
113
|
-
export async function loadFont(family,
|
|
113
|
+
export async function loadFont(family, styleLookup, type, src, opt) {
|
|
114
114
|
const fontObj = await loadOpentype(src);
|
|
115
|
-
return new FontContainerFont(family,
|
|
115
|
+
return new FontContainerFont(family, styleLookup, src, opt, fontObj);
|
|
116
116
|
}
|
|
117
117
|
|
|
118
118
|
/**
|
|
119
119
|
*
|
|
120
120
|
* @param {string} family
|
|
121
|
-
* @param {
|
|
121
|
+
* @param {StyleLookup} styleLookup
|
|
122
122
|
* @param {ArrayBuffer} src
|
|
123
123
|
* @param {boolean} opt
|
|
124
124
|
* @param {opentype.Font} opentypeObj - Kerning pairs to re-apply
|
|
125
125
|
* @property {string} family -
|
|
126
|
-
* @property {
|
|
126
|
+
* @property {StyleLookup} style -
|
|
127
127
|
* @property {ArrayBuffer} src
|
|
128
128
|
* @property {opentype.Font} opentype -
|
|
129
129
|
* @property {string} fontFaceName -
|
|
@@ -135,7 +135,7 @@ export async function loadFont(family, style, type, src, opt) {
|
|
|
135
135
|
* First, it is not necessary. Setting the font on a canvas (the only reason loading a `FontFace` is needed) is done through referring to `fontFaceName` and `fontFaceStyle`.
|
|
136
136
|
* Second, it results in errors being thrown when used in Node.js, as `FontFace` will be undefined in this case.
|
|
137
137
|
*/
|
|
138
|
-
export function FontContainerFont(family,
|
|
138
|
+
export function FontContainerFont(family, styleLookup, src, opt, opentypeObj) {
|
|
139
139
|
// As FontFace objects are included in the document FontFaceSet object,
|
|
140
140
|
// they need to all have unique names.
|
|
141
141
|
let fontFaceName = family;
|
|
@@ -143,8 +143,8 @@ export function FontContainerFont(family, style, src, opt, opentypeObj) {
|
|
|
143
143
|
|
|
144
144
|
/** @type {string} */
|
|
145
145
|
this.family = family;
|
|
146
|
-
/** @type {
|
|
147
|
-
this.style =
|
|
146
|
+
/** @type {StyleLookup} */
|
|
147
|
+
this.style = styleLookup;
|
|
148
148
|
/** @type {boolean} */
|
|
149
149
|
this.opt = opt;
|
|
150
150
|
/** @type {ArrayBuffer} */
|
|
@@ -154,9 +154,9 @@ export function FontContainerFont(family, style, src, opt, opentypeObj) {
|
|
|
154
154
|
/** @type {string} */
|
|
155
155
|
this.fontFaceName = fontFaceName;
|
|
156
156
|
/** @type {('normal'|'italic')} */
|
|
157
|
-
this.fontFaceStyle = this.style
|
|
157
|
+
this.fontFaceStyle = ['italic', 'boldItalic'].includes(this.style) ? 'italic' : 'normal';
|
|
158
158
|
/** @type {('normal'|'bold')} */
|
|
159
|
-
this.fontFaceWeight = this.style
|
|
159
|
+
this.fontFaceWeight = ['bold', 'boldItalic'].includes(this.style) ? 'bold' : 'normal';
|
|
160
160
|
/** @type {("sans"|"serif")} */
|
|
161
161
|
this.type = determineSansSerif(this.family) === 'SansDefault' ? 'sans' : 'serif';
|
|
162
162
|
this.smallCapsMult = 0.75;
|
|
@@ -185,27 +185,27 @@ export async function loadFontContainerFamily(family, src, opt = false) {
|
|
|
185
185
|
normal: null,
|
|
186
186
|
italic: null,
|
|
187
187
|
bold: null,
|
|
188
|
+
boldItalic: null,
|
|
188
189
|
};
|
|
189
190
|
|
|
190
191
|
/**
|
|
191
192
|
*
|
|
192
|
-
* @param {
|
|
193
|
+
* @param {StyleLookup} styleLookup
|
|
193
194
|
* @returns
|
|
194
195
|
*/
|
|
195
|
-
const loadType = (
|
|
196
|
-
const srcType = (src[
|
|
196
|
+
const loadType = (styleLookup) => new Promise((resolve) => {
|
|
197
|
+
const srcType = (src[styleLookup]);
|
|
197
198
|
if (!srcType) {
|
|
198
199
|
resolve(false);
|
|
199
200
|
return;
|
|
200
201
|
}
|
|
201
|
-
// const scrNormal = typeof srcType === 'string' ? getFontAbsPath(srcType) : srcType;
|
|
202
202
|
loadOpentype(srcType).then((font) => {
|
|
203
|
-
res[
|
|
203
|
+
res[styleLookup] = new FontContainerFont(family, styleLookup, srcType, opt, font);
|
|
204
204
|
resolve(true);
|
|
205
205
|
});
|
|
206
206
|
});
|
|
207
207
|
|
|
208
|
-
Promise.allSettled([loadType('normal'), loadType('italic'), loadType('bold')]);
|
|
208
|
+
Promise.allSettled([loadType('normal'), loadType('italic'), loadType('bold'), loadType('boldItalic')]);
|
|
209
209
|
|
|
210
210
|
return res;
|
|
211
211
|
}
|
|
@@ -300,11 +300,13 @@ export class FontCont {
|
|
|
300
300
|
|
|
301
301
|
const fontNameEmbedded = fontObj.names.postScriptName.en;
|
|
302
302
|
|
|
303
|
-
let
|
|
304
|
-
if (fontNameEmbedded.match(/
|
|
305
|
-
|
|
303
|
+
let styleLookup = /** @type {StyleLookup} */ ('normal');
|
|
304
|
+
if (fontNameEmbedded.match(/boldit|bdit/i)) {
|
|
305
|
+
styleLookup = 'boldItalic';
|
|
306
|
+
} else if (fontNameEmbedded.match(/italic/i)) {
|
|
307
|
+
styleLookup = 'italic';
|
|
306
308
|
} else if (fontNameEmbedded.match(/bold/i)) {
|
|
307
|
-
|
|
309
|
+
styleLookup = 'bold';
|
|
308
310
|
}
|
|
309
311
|
|
|
310
312
|
// mupdf makes changes to font names, so we need to do the same.
|
|
@@ -312,9 +314,9 @@ export class FontCont {
|
|
|
312
314
|
// Spaces are replaced with underscores.
|
|
313
315
|
const fontName = fontNameEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');
|
|
314
316
|
|
|
315
|
-
if (!FontCont.doc?.[fontName]?.[
|
|
317
|
+
if (!FontCont.doc?.[fontName]?.[styleLookup]) {
|
|
316
318
|
try {
|
|
317
|
-
const fontContainer = new FontContainerFont(fontName,
|
|
319
|
+
const fontContainer = new FontContainerFont(fontName, styleLookup, fontData, false, fontObj);
|
|
318
320
|
|
|
319
321
|
if (!FontCont.doc) {
|
|
320
322
|
FontCont.doc = {};
|
|
@@ -324,12 +326,12 @@ export class FontCont {
|
|
|
324
326
|
FontCont.doc[fontName] = {};
|
|
325
327
|
}
|
|
326
328
|
|
|
327
|
-
FontCont.doc[fontName][
|
|
329
|
+
FontCont.doc[fontName][styleLookup] = fontContainer;
|
|
328
330
|
} catch (error) {
|
|
329
|
-
console.error(`Error loading font ${fontName} ${
|
|
331
|
+
console.error(`Error loading font ${fontName} ${styleLookup}.`);
|
|
330
332
|
}
|
|
331
333
|
} else {
|
|
332
|
-
console.warn(`Font ${fontName} ${
|
|
334
|
+
console.warn(`Font ${fontName} ${styleLookup} already exists.`);
|
|
333
335
|
}
|
|
334
336
|
};
|
|
335
337
|
|
|
@@ -368,14 +370,17 @@ export class FontCont {
|
|
|
368
370
|
* Gets a font object. Unlike accessing the font containers directly,
|
|
369
371
|
* this method allows for special values 'Default', 'SansDefault', and 'SerifDefault' to be used.
|
|
370
372
|
*
|
|
371
|
-
* @param {
|
|
372
|
-
* @param {('normal'|'italic'|'bold'|string)} [style='normal']
|
|
373
|
+
* @param {Partial<Style>} style
|
|
373
374
|
* @param {string} [lang='eng']
|
|
374
375
|
* @returns {FontContainerFont}
|
|
375
376
|
*/
|
|
376
|
-
static getFont = (
|
|
377
|
-
|
|
378
|
-
|
|
377
|
+
static getFont = (style, lang = 'eng') => {
|
|
378
|
+
let family = style.font || FontCont.defaultFontName;
|
|
379
|
+
|
|
380
|
+
const styleLookup = getStyleLookup(style);
|
|
381
|
+
|
|
382
|
+
if (FontCont.doc?.[family]?.[styleLookup] && !FontCont.doc?.[family]?.[styleLookup]?.disable) {
|
|
383
|
+
return FontCont.doc[family][styleLookup];
|
|
379
384
|
}
|
|
380
385
|
|
|
381
386
|
if (lang === 'chi_sim') {
|
|
@@ -387,7 +392,7 @@ export class FontCont {
|
|
|
387
392
|
|
|
388
393
|
// Option 1: If we have access to the font, use it.
|
|
389
394
|
// Option 2: If we do not have access to the font, but it closely resembles a built-in font, use the built-in font.
|
|
390
|
-
if (!FontCont.raw?.[family]?.[
|
|
395
|
+
if (!FontCont.raw?.[family]?.[styleLookup]) {
|
|
391
396
|
if (/NimbusRom/i.test(family)) {
|
|
392
397
|
family = 'NimbusRoman';
|
|
393
398
|
} else if (/Times/i.test(family)) {
|
|
@@ -416,7 +421,7 @@ export class FontCont {
|
|
|
416
421
|
}
|
|
417
422
|
|
|
418
423
|
// Option 3: If the font still is not identified, use the default sans/serif font.
|
|
419
|
-
if (!FontCont.raw?.[family]?.[
|
|
424
|
+
if (!FontCont.raw?.[family]?.[styleLookup]) {
|
|
420
425
|
family = determineSansSerif(family);
|
|
421
426
|
}
|
|
422
427
|
|
|
@@ -427,10 +432,10 @@ export class FontCont {
|
|
|
427
432
|
if (family === 'SansDefault') family = FontCont.sansDefaultName;
|
|
428
433
|
|
|
429
434
|
/** @type {FontContainerFont} */
|
|
430
|
-
let fontRes = FontCont.raw?.[family]?.[
|
|
431
|
-
if (!fontRes) throw new Error(`Font container does not contain ${family} (${
|
|
435
|
+
let fontRes = FontCont.raw?.[family]?.[styleLookup];
|
|
436
|
+
if (!fontRes) throw new Error(`Font container does not contain ${family} (${styleLookup}).`);
|
|
432
437
|
|
|
433
|
-
const opt = FontCont.opt?.[family]?.[
|
|
438
|
+
const opt = FontCont.opt?.[family]?.[styleLookup];
|
|
434
439
|
const useOpt = FontCont.useOptFamily(family);
|
|
435
440
|
if (opt && useOpt) fontRes = opt;
|
|
436
441
|
|
|
@@ -441,10 +446,7 @@ export class FontCont {
|
|
|
441
446
|
*
|
|
442
447
|
* @param {OcrWord} word
|
|
443
448
|
*/
|
|
444
|
-
static getWordFont = (word) =>
|
|
445
|
-
const wordFontFamily = word.font || FontCont.defaultFontName;
|
|
446
|
-
return FontCont.getFont(wordFontFamily, word.style, word.lang);
|
|
447
|
-
};
|
|
449
|
+
static getWordFont = (word) => FontCont.getFont(word.style, word.lang);
|
|
448
450
|
|
|
449
451
|
/**
|
|
450
452
|
* Reset font container to original state but do not unload default resources.
|
package/js/export/writeHocr.js
CHANGED
|
@@ -75,38 +75,40 @@ export function writeHocr(ocrData, minValue, maxValue) {
|
|
|
75
75
|
hocrOut += `bbox ${Math.round(wordObj.bbox.left)} ${Math.round(wordObj.bbox.top)} ${Math.round(wordObj.bbox.right)} ${Math.round(wordObj.bbox.bottom)}`;
|
|
76
76
|
hocrOut += `;x_wconf ${wordObj.conf}`;
|
|
77
77
|
|
|
78
|
-
if (wordObj.font && wordObj.font !== 'Default') {
|
|
79
|
-
hocrOut += `;x_font ${wordObj.font}`;
|
|
78
|
+
if (wordObj.style.font && wordObj.style.font !== 'Default') {
|
|
79
|
+
hocrOut += `;x_font ${wordObj.style.font}`;
|
|
80
80
|
}
|
|
81
81
|
|
|
82
|
-
if (wordObj.size) {
|
|
83
|
-
hocrOut += `;x_fsize ${wordObj.size}`;
|
|
82
|
+
if (wordObj.style.size) {
|
|
83
|
+
hocrOut += `;x_fsize ${wordObj.style.size}`;
|
|
84
84
|
}
|
|
85
85
|
|
|
86
86
|
hocrOut += "'";
|
|
87
87
|
|
|
88
88
|
// Tesseract HOCR specifies default language for a paragraph in the "ocr_par" element,
|
|
89
89
|
// however as ScribeOCR does not currently have a paragraph object, every word must have its language specified.
|
|
90
|
-
hocrOut += ` lang='${wordObj.lang}'`;
|
|
90
|
+
if (wordObj.lang) hocrOut += ` lang='${wordObj.lang}'`;
|
|
91
91
|
|
|
92
92
|
// TODO: Why are we representing font family and style using the `style` HTML element here?
|
|
93
93
|
// This is not how Tesseract does things, and our own parsing script does not appear to be written to re-import it properly.
|
|
94
94
|
// Add "style" attribute (if applicable)
|
|
95
|
-
if (
|
|
95
|
+
if (wordObj.style.bold || wordObj.style.italic || wordObj.style.smallCaps || (wordObj.style.font && wordObj.style.font !== 'Default')) {
|
|
96
96
|
hocrOut += ' style=\'';
|
|
97
97
|
|
|
98
|
-
if (wordObj.style
|
|
98
|
+
if (wordObj.style.italic) {
|
|
99
99
|
hocrOut += 'font-style:italic;';
|
|
100
|
-
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
if (wordObj.style.bold) {
|
|
101
103
|
hocrOut += 'font-weight:bold;';
|
|
102
104
|
}
|
|
103
105
|
|
|
104
|
-
if (wordObj.smallCaps) {
|
|
106
|
+
if (wordObj.style.smallCaps) {
|
|
105
107
|
hocrOut += 'font-variant:small-caps;';
|
|
106
108
|
}
|
|
107
109
|
|
|
108
|
-
if (wordObj.font && wordObj.font !== 'Default') {
|
|
109
|
-
hocrOut += `font-family:${wordObj.font}`;
|
|
110
|
+
if (wordObj.style.font && wordObj.style.font !== 'Default') {
|
|
111
|
+
hocrOut += `font-family:${wordObj.style.font}`;
|
|
110
112
|
}
|
|
111
113
|
|
|
112
114
|
hocrOut += '\'>';
|
|
@@ -115,9 +117,9 @@ export function writeHocr(ocrData, minValue, maxValue) {
|
|
|
115
117
|
}
|
|
116
118
|
|
|
117
119
|
// Add word text, along with any formatting that uses nested elements rather than attributes
|
|
118
|
-
if (wordObj.sup) {
|
|
120
|
+
if (wordObj.style.sup) {
|
|
119
121
|
hocrOut += `<sup>${ocr.escapeXml(wordObj.text)}</sup>`;
|
|
120
|
-
} else if (wordObj.dropcap) {
|
|
122
|
+
} else if (wordObj.style.dropcap) {
|
|
121
123
|
hocrOut += `<span class='ocr_dropcap'>${ocr.escapeXml(wordObj.text)}</span>`;
|
|
122
124
|
} else {
|
|
123
125
|
hocrOut += ocr.escapeXml(wordObj.text);
|
package/js/export/writeHtml.js
CHANGED
|
@@ -175,7 +175,7 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
175
175
|
// Therefore, we handle small caps by making all text print as uppercase using the `text-transform` CSS property,
|
|
176
176
|
// and then wrapping each letter in a span with a smaller font size.
|
|
177
177
|
let innerHTML;
|
|
178
|
-
if (wordObj.smallCaps) {
|
|
178
|
+
if (wordObj.style.smallCaps) {
|
|
179
179
|
styleStr += 'text-transform:uppercase;';
|
|
180
180
|
innerHTML = makeSmallCapsDivs(wordStr, fontSizeHTMLSmallCaps);
|
|
181
181
|
} else {
|
package/js/export/writePdf.js
CHANGED
|
@@ -10,6 +10,7 @@ import { createEmbeddedFontType0, createEmbeddedFontType1 } from './writePdfFont
|
|
|
10
10
|
import { opt } from '../containers/app.js';
|
|
11
11
|
import { pageMetricsArr } from '../containers/dataContainer.js';
|
|
12
12
|
import ocr from '../objects/ocrObjects.js';
|
|
13
|
+
import { getStyleLookup } from '../utils/miscUtils.js';
|
|
13
14
|
|
|
14
15
|
/**
|
|
15
16
|
* @param {number} x
|
|
@@ -97,6 +98,7 @@ export async function writePdf(hocrArr, minpage = 0, maxpage = -1, textMode = 'e
|
|
|
97
98
|
normal: useOpt && FontCont.opt?.[familyKeyI]?.normal ? FontCont.opt[familyKeyI].normal : FontCont.raw[familyKeyI].normal,
|
|
98
99
|
italic: useOpt && FontCont.opt?.[familyKeyI]?.italic ? FontCont.opt[familyKeyI].italic : FontCont.raw[familyKeyI].italic,
|
|
99
100
|
bold: useOpt && FontCont.opt?.[familyKeyI]?.bold ? FontCont.opt[familyKeyI].bold : FontCont.raw[familyKeyI].bold,
|
|
101
|
+
boldItalic: useOpt && FontCont.opt?.[familyKeyI]?.boldItalic ? FontCont.opt[familyKeyI].boldItalic : FontCont.raw[familyKeyI].boldItalic,
|
|
100
102
|
};
|
|
101
103
|
await addFamilyObj(familyKeyI, familyObjI);
|
|
102
104
|
}
|
|
@@ -301,6 +303,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
301
303
|
|
|
302
304
|
const pdfFontsUsed = new Set();
|
|
303
305
|
|
|
306
|
+
const underlines = /** @type {Array<{left: number, right: number, top: number, height: number, fontSize: number, bold: boolean}>} */ ([]);
|
|
307
|
+
|
|
304
308
|
// Start 1st object: Text Content
|
|
305
309
|
let textContentObjStr = '';
|
|
306
310
|
|
|
@@ -349,7 +353,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
349
353
|
let wordFontOpentype = (wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype);
|
|
350
354
|
|
|
351
355
|
if (!wordFontOpentype) {
|
|
352
|
-
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
|
|
356
|
+
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${getStyleLookup(wordJ.style)})`;
|
|
353
357
|
console.log(`Skipping word due to missing font (${fontNameMessage})`);
|
|
354
358
|
continue;
|
|
355
359
|
}
|
|
@@ -359,7 +363,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
359
363
|
let wordFontSize = word0Metrics.fontSize;
|
|
360
364
|
|
|
361
365
|
// Set font and font size
|
|
362
|
-
const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
|
|
366
|
+
const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)];
|
|
363
367
|
pdfFontNameCurrent = pdfFontCurrent.name;
|
|
364
368
|
pdfFontTypeCurrent = pdfFontCurrent.type;
|
|
365
369
|
pdfFontsUsed.add(pdfFontCurrent);
|
|
@@ -372,7 +376,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
372
376
|
const word0LeftBearing = wordJ.visualCoords ? word0Metrics.leftSideBearing : 0;
|
|
373
377
|
|
|
374
378
|
let tz = 100;
|
|
375
|
-
if (wordJ.dropcap) {
|
|
379
|
+
if (wordJ.style.dropcap) {
|
|
376
380
|
const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
|
|
377
381
|
tz = (wordWidthActual / word0Metrics.visualWidth) * 100;
|
|
378
382
|
}
|
|
@@ -406,6 +410,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
406
410
|
let spacingAdj = 0;
|
|
407
411
|
let kernSpacing = false;
|
|
408
412
|
let wordLast = wordJ;
|
|
413
|
+
let underlineLeft = /** @type {?number} */ null;
|
|
414
|
+
let underlineRight = /** @type {?number} */ null;
|
|
409
415
|
let wordFontOpentypeLast = wordFontOpentype;
|
|
410
416
|
let fontSizeLast = wordFontSize;
|
|
411
417
|
let tsCurrent = 0;
|
|
@@ -426,7 +432,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
426
432
|
wordFontOpentype = wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype;
|
|
427
433
|
|
|
428
434
|
if (!wordFontOpentype) {
|
|
429
|
-
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
|
|
435
|
+
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${getStyleLookup(wordJ.style)})`;
|
|
430
436
|
console.log(`Skipping word due to missing font (${fontNameMessage})`);
|
|
431
437
|
continue;
|
|
432
438
|
}
|
|
@@ -446,11 +452,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
446
452
|
fillColor = wordJ.matchTruth ? '0 1 0.5 rg' : '1 0 0 rg';
|
|
447
453
|
}
|
|
448
454
|
|
|
449
|
-
const angleAdjWord = wordJ.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
|
|
455
|
+
const angleAdjWord = wordJ.style.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
|
|
450
456
|
const angleAdjWordX = (rotateBackground && Math.abs(angle ?? 0) > 0.05) ? angleAdjWord.x : 0;
|
|
451
457
|
|
|
452
458
|
let ts = 0;
|
|
453
|
-
if (wordJ.sup || wordJ.dropcap) {
|
|
459
|
+
if (wordJ.style.sup || wordJ.style.dropcap) {
|
|
454
460
|
ts = (lineObj.bbox.bottom + lineObj.baseline[1] + angleAdjLine.y) - (wordJ.bbox.bottom + angleAdjLine.y + angleAdjWord.y);
|
|
455
461
|
if (!wordJ.visualCoords) {
|
|
456
462
|
const fontDesc = wordFont.opentype.descender / wordFont.opentype.unitsPerEm * wordMetrics.fontSize;
|
|
@@ -462,12 +468,12 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
462
468
|
|
|
463
469
|
// TODO: This probably fails for Chinese, rethink.
|
|
464
470
|
tz = 100;
|
|
465
|
-
if (wordJ.dropcap) {
|
|
471
|
+
if (wordJ.style.dropcap) {
|
|
466
472
|
const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
|
|
467
473
|
tz = (wordWidthActual / wordMetrics.visualWidth) * 100;
|
|
468
474
|
}
|
|
469
475
|
|
|
470
|
-
const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
|
|
476
|
+
const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)];
|
|
471
477
|
const pdfFontName = pdfFont.name;
|
|
472
478
|
const pdfFontType = pdfFont.type;
|
|
473
479
|
pdfFontsUsed.add(pdfFont);
|
|
@@ -480,7 +486,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
480
486
|
// The space between words determined by:
|
|
481
487
|
// (1) The right bearing of the last word, (2) the left bearing of the current word, (3) the width of the space character between words,
|
|
482
488
|
// (4) the current character spacing value (applied twice--both before and after the space character).
|
|
483
|
-
const
|
|
489
|
+
const spaceAdvance = wordFontOpentypeLast.charToGlyph(' ').advanceWidth || wordFontOpentypeLast.unitsPerEm / 2;
|
|
490
|
+
const spaceWidthGlyph = spaceAdvance * (fontSizeLast / wordFontOpentypeLast.unitsPerEm);
|
|
484
491
|
|
|
485
492
|
const wordSpaceExpectedPx = (spaceWidthGlyph + charSpacingLast * 2 + wordRightBearingLast) + wordLeftBearing;
|
|
486
493
|
|
|
@@ -503,10 +510,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
503
510
|
// However, this assumption does not hold for single-character words, as there is no space between characters to adjust.
|
|
504
511
|
// Therefore, we calculate the difference between the rendered and actual word and apply an adjustment to the width of the next space.
|
|
505
512
|
// (This does not apply to drop caps as those have horizontal scaling applied to exactly match the image.)
|
|
506
|
-
if (charArr.length === 1 && !wordJ.dropcap) {
|
|
513
|
+
if (charArr.length === 1 && !wordJ.style.dropcap) {
|
|
507
514
|
const wordLastGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
|
|
508
515
|
const wordLastGlyphMetrics = wordLastGlyph.getMetrics();
|
|
509
|
-
const
|
|
516
|
+
const lastCharAdvance = wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth || wordFontOpentype.unitsPerEm / 2;
|
|
517
|
+
const lastCharWidth = lastCharAdvance * (wordFontSize / wordFontOpentype.unitsPerEm);
|
|
510
518
|
spacingAdj = wordWidthAdj - lastCharWidth - angleAdjWordX;
|
|
511
519
|
} else {
|
|
512
520
|
spacingAdj = 0 - angleAdjWordX;
|
|
@@ -514,7 +522,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
514
522
|
|
|
515
523
|
textContentObjStr += ' ] TJ\n';
|
|
516
524
|
|
|
517
|
-
const fontSize = wordJ.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
525
|
+
const fontSize = wordJ.style.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
518
526
|
if (pdfFontName !== pdfFontNameCurrent || fontSize !== fontSizeLast) {
|
|
519
527
|
textContentObjStr += `${pdfFontName} ${String(fontSize)} Tf\n`;
|
|
520
528
|
pdfFontNameCurrent = pdfFontName;
|
|
@@ -541,8 +549,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
541
549
|
// Non-ASCII and special characters are encoded/escaped using winEncodingLookup
|
|
542
550
|
for (let k = 0; k < charArr.length; k++) {
|
|
543
551
|
const letterSrc = charArr[k];
|
|
544
|
-
const letter = wordJ.smallCaps ? charArr[k].toUpperCase() : charArr[k];
|
|
545
|
-
const fontSizeLetter = wordJ.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
552
|
+
const letter = wordJ.style.smallCaps ? charArr[k].toUpperCase() : charArr[k];
|
|
553
|
+
const fontSizeLetter = wordJ.style.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
546
554
|
|
|
547
555
|
const letterEnc = pdfFontTypeCurrent === 0 ? wordFontOpentype.charToGlyphIndex(letter)?.toString(16).padStart(4, '0') : winEncodingLookup[letter];
|
|
548
556
|
if (letterEnc) {
|
|
@@ -611,6 +619,28 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
611
619
|
}
|
|
612
620
|
}
|
|
613
621
|
|
|
622
|
+
if (wordJ.style.underline && underlineLeft === null) {
|
|
623
|
+
underlineLeft = wordJ.bbox.left;
|
|
624
|
+
}
|
|
625
|
+
|
|
626
|
+
if (wordJ.style.underline) {
|
|
627
|
+
underlineRight = wordJ.bbox.right;
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
if (underlineLeft !== null && (!wordJ.style.underline || j === words.length - 1)) {
|
|
631
|
+
underlines.push({
|
|
632
|
+
left: underlineLeft,
|
|
633
|
+
right: underlineRight,
|
|
634
|
+
top: lineTopAdj,
|
|
635
|
+
height: lineObj.bbox.bottom - lineObj.bbox.top,
|
|
636
|
+
fontSize: wordFontSize,
|
|
637
|
+
bold: wordJ.style.bold,
|
|
638
|
+
});
|
|
639
|
+
|
|
640
|
+
underlineLeft = null;
|
|
641
|
+
underlineRight = null;
|
|
642
|
+
}
|
|
643
|
+
|
|
614
644
|
wordLast = wordJ;
|
|
615
645
|
wordRightBearingLast = wordLast.visualCoords ? wordMetrics.rightSideBearing : 0;
|
|
616
646
|
wordFontOpentypeLast = wordFontOpentype;
|
|
@@ -622,5 +652,13 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
622
652
|
|
|
623
653
|
textContentObjStr += 'ET';
|
|
624
654
|
|
|
655
|
+
// Add underlines
|
|
656
|
+
underlines.forEach((underline) => {
|
|
657
|
+
const underlineThickness = underline.bold ? Math.ceil(underline.fontSize / 12) : Math.ceil(underline.fontSize / 24);
|
|
658
|
+
const underlineOffset = Math.ceil(underline.fontSize / 12) + underlineThickness;
|
|
659
|
+
|
|
660
|
+
textContentObjStr += `\n${String(underline.left)} ${String(outputDims.height - underline.top - underlineOffset)} ${String(underline.right - underline.left)} ${underlineThickness} re\nf\n`;
|
|
661
|
+
});
|
|
662
|
+
|
|
625
663
|
return { textContentObjStr, pdfFontsUsed };
|
|
626
664
|
}
|
|
@@ -108,12 +108,12 @@ const generateFontFlags = (serif, italic, smallcap, symbolic) => { /* eslint-dis
|
|
|
108
108
|
*
|
|
109
109
|
* @param {opentype.Font} font - Opentype.js font object
|
|
110
110
|
* @param {number} objIndex - Index for font descriptor PDF object
|
|
111
|
-
* @param {
|
|
111
|
+
* @param {boolean} italic
|
|
112
112
|
* @param {?number} embeddedObjIndex - Index for embedded font file PDF object.
|
|
113
113
|
* If not provided, the font will not be embedded in the PDF.
|
|
114
114
|
* @returns {string} The font descriptor object string.
|
|
115
115
|
*/
|
|
116
|
-
function createFontDescriptor(font, objIndex,
|
|
116
|
+
function createFontDescriptor(font, objIndex, italic, embeddedObjIndex = null) {
|
|
117
117
|
let objOut = `${String(objIndex)} 0 obj\n<</Type/FontDescriptor`;
|
|
118
118
|
|
|
119
119
|
const namesTable = font.names.windows || font.names;
|
|
@@ -155,7 +155,7 @@ function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex
|
|
|
155
155
|
|
|
156
156
|
// Symbolic is always set to false, even if the font contains glyphs outside the Adobe standard Latin character set.
|
|
157
157
|
// This is because symbolic fonts are only used when embedded, and this does not appear to matter for embedded fonts.
|
|
158
|
-
objOut += `/Flags ${String(generateFontFlags(serif,
|
|
158
|
+
objOut += `/Flags ${String(generateFontFlags(serif, italic, false, false))}`;
|
|
159
159
|
|
|
160
160
|
if (embeddedObjIndex === null || embeddedObjIndex === undefined) {
|
|
161
161
|
objOut += '>>\nendobj\n\n';
|
|
@@ -175,12 +175,12 @@ function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex
|
|
|
175
175
|
*
|
|
176
176
|
* @param {opentype.Font} font - Opentype.js font object
|
|
177
177
|
* @param {number} firstObjIndex - Index for the first PDF object
|
|
178
|
-
* @param {
|
|
178
|
+
* @param {boolean} [italic=false] - Whether the font is italic.
|
|
179
179
|
* @param {boolean} [isStandardFont=false] - Whether the font is a standard font.
|
|
180
180
|
* Standard fonts are not embedded in the PDF.
|
|
181
181
|
* @returns {Array<string>}
|
|
182
182
|
*/
|
|
183
|
-
export function createEmbeddedFontType1(font, firstObjIndex,
|
|
183
|
+
export function createEmbeddedFontType1(font, firstObjIndex, italic = false, isStandardFont = false) {
|
|
184
184
|
// Start 1st object: Font Dictionary
|
|
185
185
|
let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type1`;
|
|
186
186
|
|
|
@@ -193,7 +193,8 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
|
|
|
193
193
|
|
|
194
194
|
fontDictObjStr += '/Widths[';
|
|
195
195
|
for (let i = 0; i < win1252Chars.length; i++) {
|
|
196
|
-
const
|
|
196
|
+
const advance = font.charToGlyph(win1252Chars[i]).advanceWidth || font.unitsPerEm;
|
|
197
|
+
const advanceNorm = Math.round(advance * (1000 / font.unitsPerEm));
|
|
197
198
|
fontDictObjStr += `${String(advanceNorm)} `;
|
|
198
199
|
}
|
|
199
200
|
fontDictObjStr += ']/FirstChar 32/LastChar 255';
|
|
@@ -201,7 +202,7 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
|
|
|
201
202
|
fontDictObjStr += `/FontDescriptor ${String(firstObjIndex + 1)} 0 R>>\nendobj\n\n`;
|
|
202
203
|
|
|
203
204
|
// Start 2nd object: Font Descriptor
|
|
204
|
-
const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1,
|
|
205
|
+
const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, isStandardFont ? null : firstObjIndex + 2);
|
|
205
206
|
|
|
206
207
|
// objOut += `${String(firstObjIndex + 1)} 0 obj\n<</Type/FontDescriptor`;
|
|
207
208
|
|
|
@@ -249,13 +250,14 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
|
|
|
249
250
|
*
|
|
250
251
|
* @param {opentype.Font} font - Opentype.js font object
|
|
251
252
|
* @param {number} firstObjIndex - Index for the first PDF object
|
|
253
|
+
* @param {boolean} [italic=false] - Whether the font is italic.
|
|
252
254
|
*
|
|
253
255
|
* This function does not produce "toUnicode" or "Widths" objects,
|
|
254
256
|
* so any PDF it creates directly will lack usable copy/paste.
|
|
255
257
|
* However, both of these objects will be created from the embedded file
|
|
256
258
|
* when the result is run through mupdf.
|
|
257
259
|
*/
|
|
258
|
-
export function createEmbeddedFontType0(font, firstObjIndex,
|
|
260
|
+
export function createEmbeddedFontType0(font, firstObjIndex, italic = false) {
|
|
259
261
|
// Start 1st object: Font Dictionary
|
|
260
262
|
let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type0`;
|
|
261
263
|
|
|
@@ -282,7 +284,7 @@ export function createEmbeddedFontType0(font, firstObjIndex, style = 'normal') {
|
|
|
282
284
|
toUnicodeStr += '\nendstream\nendobj\n\n';
|
|
283
285
|
|
|
284
286
|
// Start 3rd object: FontDescriptor
|
|
285
|
-
const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1,
|
|
287
|
+
const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, firstObjIndex + 3);
|
|
286
288
|
|
|
287
289
|
// objOut += `${String(firstObjIndex + 2)} 0 obj\n`;
|
|
288
290
|
|
|
@@ -86,9 +86,9 @@ function createCellsSingle(ocrTableWords, extraCols = [], startRow = 0, xlsxMode
|
|
|
86
86
|
|
|
87
87
|
if (xlsxMode) {
|
|
88
88
|
let fontStyle;
|
|
89
|
-
if (wordObj.style
|
|
89
|
+
if (wordObj.style.italic) {
|
|
90
90
|
fontStyle = '<i/>';
|
|
91
|
-
} else if (wordObj.smallCaps) {
|
|
91
|
+
} else if (wordObj.style.smallCaps) {
|
|
92
92
|
fontStyle = '<smallCaps/>';
|
|
93
93
|
} else {
|
|
94
94
|
fontStyle = '';
|