scribe.js-ocr 0.7.3 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/fonts/all/Carlito-BoldItalic.woff +0 -0
  2. package/fonts/all/Century-BoldItalic.woff +0 -0
  3. package/fonts/all/Garamond-BoldItalic.woff +0 -0
  4. package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
  5. package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
  6. package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
  7. package/fonts/all/Palatino-BoldItalic.woff +0 -0
  8. package/fonts/latin/Carlito-BoldItalic.woff +0 -0
  9. package/fonts/latin/Century-BoldItalic.woff +0 -0
  10. package/fonts/latin/Garamond-BoldItalic.woff +0 -0
  11. package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
  12. package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
  13. package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
  14. package/fonts/latin/Palatino-BoldItalic.woff +0 -0
  15. package/js/containers/app.js +1 -1
  16. package/js/containers/fontContainer.js +42 -40
  17. package/js/export/writeHocr.js +15 -13
  18. package/js/export/writeHtml.js +1 -1
  19. package/js/export/writePdf.js +52 -14
  20. package/js/export/writePdfFonts.js +11 -9
  21. package/js/export/writeTabular.js +2 -2
  22. package/js/export/writeText.js +10 -6
  23. package/js/extractTables.js +5 -5
  24. package/js/fontContainerMain.js +50 -7
  25. package/js/fontStatistics.js +18 -13
  26. package/js/fontSupp.js +20 -20
  27. package/js/global.d.ts +17 -0
  28. package/js/import/convertPageAbbyy.js +47 -25
  29. package/js/import/convertPageBlocks.js +2 -2
  30. package/js/import/convertPageHocr.js +10 -20
  31. package/js/import/convertPageShared.js +13 -9
  32. package/js/import/convertPageStext.js +66 -31
  33. package/js/objects/ocrObjects.js +13 -19
  34. package/js/utils/fontUtils.js +11 -11
  35. package/js/utils/miscUtils.js +16 -0
  36. package/js/worker/compareOCRModule.js +13 -16
  37. package/js/worker/optimizeFontModule.js +4 -4
  38. package/mupdf/libmupdf.js +123 -17
  39. package/mupdf/libmupdf.wasm +0 -0
  40. package/package.json +1 -1
@@ -1,5 +1,5 @@
1
1
  export class opt {
2
- static ligatures = true;
2
+ static ligatures = false;
3
3
 
4
4
  static kerning = true;
5
5
 
@@ -5,7 +5,7 @@
5
5
 
6
6
  // Node.js case
7
7
  import opentype from '../../lib/opentype.module.js';
8
- import { determineSansSerif } from '../utils/miscUtils.js';
8
+ import { determineSansSerif, getStyleLookup } from '../utils/miscUtils.js';
9
9
  import { ca } from '../canvasAdapter.js';
10
10
 
11
11
  if (typeof process === 'object') {
@@ -104,26 +104,26 @@ export function loadFontFace(fontFamily, fontStyle, fontWeight, src) {
104
104
  * Load font from source and return a FontContainerFont object.
105
105
  * This function is used to load the Chinese font.
106
106
  * @param {string} family
107
- * @param {string} style
107
+ * @param {StyleLookup} styleLookup
108
108
  * @param {("sans"|"serif")} type
109
109
  * @param {ArrayBuffer} src
110
110
  * @param {boolean} opt
111
111
  *
112
112
  */
113
- export async function loadFont(family, style, type, src, opt) {
113
+ export async function loadFont(family, styleLookup, type, src, opt) {
114
114
  const fontObj = await loadOpentype(src);
115
- return new FontContainerFont(family, style, src, opt, fontObj);
115
+ return new FontContainerFont(family, styleLookup, src, opt, fontObj);
116
116
  }
117
117
 
118
118
  /**
119
119
  *
120
120
  * @param {string} family
121
- * @param {string} style
121
+ * @param {StyleLookup} styleLookup
122
122
  * @param {ArrayBuffer} src
123
123
  * @param {boolean} opt
124
124
  * @param {opentype.Font} opentypeObj - Kerning pairs to re-apply
125
125
  * @property {string} family -
126
- * @property {string} style -
126
+ * @property {StyleLookup} style -
127
127
  * @property {ArrayBuffer} src
128
128
  * @property {opentype.Font} opentype -
129
129
  * @property {string} fontFaceName -
@@ -135,7 +135,7 @@ export async function loadFont(family, style, type, src, opt) {
135
135
  * First, it is not necessary. Setting the font on a canvas (the only reason loading a `FontFace` is needed) is done through referring to `fontFaceName` and `fontFaceStyle`.
136
136
  * Second, it results in errors being thrown when used in Node.js, as `FontFace` will be undefined in this case.
137
137
  */
138
- export function FontContainerFont(family, style, src, opt, opentypeObj) {
138
+ export function FontContainerFont(family, styleLookup, src, opt, opentypeObj) {
139
139
  // As FontFace objects are included in the document FontFaceSet object,
140
140
  // they need to all have unique names.
141
141
  let fontFaceName = family;
@@ -143,8 +143,8 @@ export function FontContainerFont(family, style, src, opt, opentypeObj) {
143
143
 
144
144
  /** @type {string} */
145
145
  this.family = family;
146
- /** @type {string} */
147
- this.style = style;
146
+ /** @type {StyleLookup} */
147
+ this.style = styleLookup;
148
148
  /** @type {boolean} */
149
149
  this.opt = opt;
150
150
  /** @type {ArrayBuffer} */
@@ -154,9 +154,9 @@ export function FontContainerFont(family, style, src, opt, opentypeObj) {
154
154
  /** @type {string} */
155
155
  this.fontFaceName = fontFaceName;
156
156
  /** @type {('normal'|'italic')} */
157
- this.fontFaceStyle = this.style === 'italic' ? 'italic' : 'normal';
157
+ this.fontFaceStyle = ['italic', 'boldItalic'].includes(this.style) ? 'italic' : 'normal';
158
158
  /** @type {('normal'|'bold')} */
159
- this.fontFaceWeight = this.style === 'bold' ? 'bold' : 'normal';
159
+ this.fontFaceWeight = ['bold', 'boldItalic'].includes(this.style) ? 'bold' : 'normal';
160
160
  /** @type {("sans"|"serif")} */
161
161
  this.type = determineSansSerif(this.family) === 'SansDefault' ? 'sans' : 'serif';
162
162
  this.smallCapsMult = 0.75;
@@ -185,27 +185,27 @@ export async function loadFontContainerFamily(family, src, opt = false) {
185
185
  normal: null,
186
186
  italic: null,
187
187
  bold: null,
188
+ boldItalic: null,
188
189
  };
189
190
 
190
191
  /**
191
192
  *
192
- * @param {('normal'|'bold'|'italic')} type
193
+ * @param {StyleLookup} styleLookup
193
194
  * @returns
194
195
  */
195
- const loadType = (type) => new Promise((resolve) => {
196
- const srcType = (src[type]);
196
+ const loadType = (styleLookup) => new Promise((resolve) => {
197
+ const srcType = (src[styleLookup]);
197
198
  if (!srcType) {
198
199
  resolve(false);
199
200
  return;
200
201
  }
201
- // const scrNormal = typeof srcType === 'string' ? getFontAbsPath(srcType) : srcType;
202
202
  loadOpentype(srcType).then((font) => {
203
- res[type] = new FontContainerFont(family, type, srcType, opt, font);
203
+ res[styleLookup] = new FontContainerFont(family, styleLookup, srcType, opt, font);
204
204
  resolve(true);
205
205
  });
206
206
  });
207
207
 
208
- Promise.allSettled([loadType('normal'), loadType('italic'), loadType('bold')]);
208
+ Promise.allSettled([loadType('normal'), loadType('italic'), loadType('bold'), loadType('boldItalic')]);
209
209
 
210
210
  return res;
211
211
  }
@@ -300,11 +300,13 @@ export class FontCont {
300
300
 
301
301
  const fontNameEmbedded = fontObj.names.postScriptName.en;
302
302
 
303
- let fontStyle = 'normal';
304
- if (fontNameEmbedded.match(/italic/i)) {
305
- fontStyle = 'italic';
303
+ let styleLookup = /** @type {StyleLookup} */ ('normal');
304
+ if (fontNameEmbedded.match(/boldit|bdit/i)) {
305
+ styleLookup = 'boldItalic';
306
+ } else if (fontNameEmbedded.match(/italic/i)) {
307
+ styleLookup = 'italic';
306
308
  } else if (fontNameEmbedded.match(/bold/i)) {
307
- fontStyle = 'bold';
309
+ styleLookup = 'bold';
308
310
  }
309
311
 
310
312
  // mupdf makes changes to font names, so we need to do the same.
@@ -312,9 +314,9 @@ export class FontCont {
312
314
  // Spaces are replaced with underscores.
313
315
  const fontName = fontNameEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');
314
316
 
315
- if (!FontCont.doc?.[fontName]?.[fontStyle]) {
317
+ if (!FontCont.doc?.[fontName]?.[styleLookup]) {
316
318
  try {
317
- const fontContainer = new FontContainerFont(fontName, fontStyle, fontData, false, fontObj);
319
+ const fontContainer = new FontContainerFont(fontName, styleLookup, fontData, false, fontObj);
318
320
 
319
321
  if (!FontCont.doc) {
320
322
  FontCont.doc = {};
@@ -324,12 +326,12 @@ export class FontCont {
324
326
  FontCont.doc[fontName] = {};
325
327
  }
326
328
 
327
- FontCont.doc[fontName][fontStyle] = fontContainer;
329
+ FontCont.doc[fontName][styleLookup] = fontContainer;
328
330
  } catch (error) {
329
- console.error(`Error loading font ${fontName} ${fontStyle}.`);
331
+ console.error(`Error loading font ${fontName} ${styleLookup}.`);
330
332
  }
331
333
  } else {
332
- console.warn(`Font ${fontName} ${fontStyle} already exists.`);
334
+ console.warn(`Font ${fontName} ${styleLookup} already exists.`);
333
335
  }
334
336
  };
335
337
 
@@ -368,14 +370,17 @@ export class FontCont {
368
370
  * Gets a font object. Unlike accessing the font containers directly,
369
371
  * this method allows for special values 'Default', 'SansDefault', and 'SerifDefault' to be used.
370
372
  *
371
- * @param {('Default'|'SansDefault'|'SerifDefault'|string)} family - Font family name.
372
- * @param {('normal'|'italic'|'bold'|string)} [style='normal']
373
+ * @param {Partial<Style>} style
373
374
  * @param {string} [lang='eng']
374
375
  * @returns {FontContainerFont}
375
376
  */
376
- static getFont = (family, style = 'normal', lang = 'eng') => {
377
- if (FontCont.doc?.[family]?.[style] && !FontCont.doc?.[family]?.[style]?.disable) {
378
- return FontCont.doc[family][style];
377
+ static getFont = (style, lang = 'eng') => {
378
+ let family = style.font || FontCont.defaultFontName;
379
+
380
+ const styleLookup = getStyleLookup(style);
381
+
382
+ if (FontCont.doc?.[family]?.[styleLookup] && !FontCont.doc?.[family]?.[styleLookup]?.disable) {
383
+ return FontCont.doc[family][styleLookup];
379
384
  }
380
385
 
381
386
  if (lang === 'chi_sim') {
@@ -387,7 +392,7 @@ export class FontCont {
387
392
 
388
393
  // Option 1: If we have access to the font, use it.
389
394
  // Option 2: If we do not have access to the font, but it closely resembles a built-in font, use the built-in font.
390
- if (!FontCont.raw?.[family]?.[style]) {
395
+ if (!FontCont.raw?.[family]?.[styleLookup]) {
391
396
  if (/NimbusRom/i.test(family)) {
392
397
  family = 'NimbusRoman';
393
398
  } else if (/Times/i.test(family)) {
@@ -416,7 +421,7 @@ export class FontCont {
416
421
  }
417
422
 
418
423
  // Option 3: If the font still is not identified, use the default sans/serif font.
419
- if (!FontCont.raw?.[family]?.[style]) {
424
+ if (!FontCont.raw?.[family]?.[styleLookup]) {
420
425
  family = determineSansSerif(family);
421
426
  }
422
427
 
@@ -427,10 +432,10 @@ export class FontCont {
427
432
  if (family === 'SansDefault') family = FontCont.sansDefaultName;
428
433
 
429
434
  /** @type {FontContainerFont} */
430
- let fontRes = FontCont.raw?.[family]?.[style];
431
- if (!fontRes) throw new Error(`Font container does not contain ${family} (${style}).`);
435
+ let fontRes = FontCont.raw?.[family]?.[styleLookup];
436
+ if (!fontRes) throw new Error(`Font container does not contain ${family} (${styleLookup}).`);
432
437
 
433
- const opt = FontCont.opt?.[family]?.[style];
438
+ const opt = FontCont.opt?.[family]?.[styleLookup];
434
439
  const useOpt = FontCont.useOptFamily(family);
435
440
  if (opt && useOpt) fontRes = opt;
436
441
 
@@ -441,10 +446,7 @@ export class FontCont {
441
446
  *
442
447
  * @param {OcrWord} word
443
448
  */
444
- static getWordFont = (word) => {
445
- const wordFontFamily = word.font || FontCont.defaultFontName;
446
- return FontCont.getFont(wordFontFamily, word.style, word.lang);
447
- };
449
+ static getWordFont = (word) => FontCont.getFont(word.style, word.lang);
448
450
 
449
451
  /**
450
452
  * Reset font container to original state but do not unload default resources.
@@ -75,38 +75,40 @@ export function writeHocr(ocrData, minValue, maxValue) {
75
75
  hocrOut += `bbox ${Math.round(wordObj.bbox.left)} ${Math.round(wordObj.bbox.top)} ${Math.round(wordObj.bbox.right)} ${Math.round(wordObj.bbox.bottom)}`;
76
76
  hocrOut += `;x_wconf ${wordObj.conf}`;
77
77
 
78
- if (wordObj.font && wordObj.font !== 'Default') {
79
- hocrOut += `;x_font ${wordObj.font}`;
78
+ if (wordObj.style.font && wordObj.style.font !== 'Default') {
79
+ hocrOut += `;x_font ${wordObj.style.font}`;
80
80
  }
81
81
 
82
- if (wordObj.size) {
83
- hocrOut += `;x_fsize ${wordObj.size}`;
82
+ if (wordObj.style.size) {
83
+ hocrOut += `;x_fsize ${wordObj.style.size}`;
84
84
  }
85
85
 
86
86
  hocrOut += "'";
87
87
 
88
88
  // Tesseract HOCR specifies default language for a paragraph in the "ocr_par" element,
89
89
  // however as ScribeOCR does not currently have a paragraph object, every word must have its language specified.
90
- hocrOut += ` lang='${wordObj.lang}'`;
90
+ if (wordObj.lang) hocrOut += ` lang='${wordObj.lang}'`;
91
91
 
92
92
  // TODO: Why are we representing font family and style using the `style` HTML element here?
93
93
  // This is not how Tesseract does things, and our own parsing script does not appear to be written to re-import it properly.
94
94
  // Add "style" attribute (if applicable)
95
- if (['italic', 'bold'].includes(wordObj.style) || wordObj.smallCaps || (wordObj.font && wordObj.font !== 'Default')) {
95
+ if (wordObj.style.bold || wordObj.style.italic || wordObj.style.smallCaps || (wordObj.style.font && wordObj.style.font !== 'Default')) {
96
96
  hocrOut += ' style=\'';
97
97
 
98
- if (wordObj.style === 'italic') {
98
+ if (wordObj.style.italic) {
99
99
  hocrOut += 'font-style:italic;';
100
- } else if (wordObj.style === 'bold') {
100
+ }
101
+
102
+ if (wordObj.style.bold) {
101
103
  hocrOut += 'font-weight:bold;';
102
104
  }
103
105
 
104
- if (wordObj.smallCaps) {
106
+ if (wordObj.style.smallCaps) {
105
107
  hocrOut += 'font-variant:small-caps;';
106
108
  }
107
109
 
108
- if (wordObj.font && wordObj.font !== 'Default') {
109
- hocrOut += `font-family:${wordObj.font}`;
110
+ if (wordObj.style.font && wordObj.style.font !== 'Default') {
111
+ hocrOut += `font-family:${wordObj.style.font}`;
110
112
  }
111
113
 
112
114
  hocrOut += '\'>';
@@ -115,9 +117,9 @@ export function writeHocr(ocrData, minValue, maxValue) {
115
117
  }
116
118
 
117
119
  // Add word text, along with any formatting that uses nested elements rather than attributes
118
- if (wordObj.sup) {
120
+ if (wordObj.style.sup) {
119
121
  hocrOut += `<sup>${ocr.escapeXml(wordObj.text)}</sup>`;
120
- } else if (wordObj.dropcap) {
122
+ } else if (wordObj.style.dropcap) {
121
123
  hocrOut += `<span class='ocr_dropcap'>${ocr.escapeXml(wordObj.text)}</span>`;
122
124
  } else {
123
125
  hocrOut += ocr.escapeXml(wordObj.text);
@@ -175,7 +175,7 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
175
175
  // Therefore, we handle small caps by making all text print as uppercase using the `text-transform` CSS property,
176
176
  // and then wrapping each letter in a span with a smaller font size.
177
177
  let innerHTML;
178
- if (wordObj.smallCaps) {
178
+ if (wordObj.style.smallCaps) {
179
179
  styleStr += 'text-transform:uppercase;';
180
180
  innerHTML = makeSmallCapsDivs(wordStr, fontSizeHTMLSmallCaps);
181
181
  } else {
@@ -10,6 +10,7 @@ import { createEmbeddedFontType0, createEmbeddedFontType1 } from './writePdfFont
10
10
  import { opt } from '../containers/app.js';
11
11
  import { pageMetricsArr } from '../containers/dataContainer.js';
12
12
  import ocr from '../objects/ocrObjects.js';
13
+ import { getStyleLookup } from '../utils/miscUtils.js';
13
14
 
14
15
  /**
15
16
  * @param {number} x
@@ -97,6 +98,7 @@ export async function writePdf(hocrArr, minpage = 0, maxpage = -1, textMode = 'e
97
98
  normal: useOpt && FontCont.opt?.[familyKeyI]?.normal ? FontCont.opt[familyKeyI].normal : FontCont.raw[familyKeyI].normal,
98
99
  italic: useOpt && FontCont.opt?.[familyKeyI]?.italic ? FontCont.opt[familyKeyI].italic : FontCont.raw[familyKeyI].italic,
99
100
  bold: useOpt && FontCont.opt?.[familyKeyI]?.bold ? FontCont.opt[familyKeyI].bold : FontCont.raw[familyKeyI].bold,
101
+ boldItalic: useOpt && FontCont.opt?.[familyKeyI]?.boldItalic ? FontCont.opt[familyKeyI].boldItalic : FontCont.raw[familyKeyI].boldItalic,
100
102
  };
101
103
  await addFamilyObj(familyKeyI, familyObjI);
102
104
  }
@@ -301,6 +303,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
301
303
 
302
304
  const pdfFontsUsed = new Set();
303
305
 
306
+ const underlines = /** @type {Array<{left: number, right: number, top: number, height: number, fontSize: number, bold: boolean}>} */ ([]);
307
+
304
308
  // Start 1st object: Text Content
305
309
  let textContentObjStr = '';
306
310
 
@@ -349,7 +353,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
349
353
  let wordFontOpentype = (wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype);
350
354
 
351
355
  if (!wordFontOpentype) {
352
- const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
356
+ const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${getStyleLookup(wordJ.style)})`;
353
357
  console.log(`Skipping word due to missing font (${fontNameMessage})`);
354
358
  continue;
355
359
  }
@@ -359,7 +363,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
359
363
  let wordFontSize = word0Metrics.fontSize;
360
364
 
361
365
  // Set font and font size
362
- const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
366
+ const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)];
363
367
  pdfFontNameCurrent = pdfFontCurrent.name;
364
368
  pdfFontTypeCurrent = pdfFontCurrent.type;
365
369
  pdfFontsUsed.add(pdfFontCurrent);
@@ -372,7 +376,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
372
376
  const word0LeftBearing = wordJ.visualCoords ? word0Metrics.leftSideBearing : 0;
373
377
 
374
378
  let tz = 100;
375
- if (wordJ.dropcap) {
379
+ if (wordJ.style.dropcap) {
376
380
  const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
377
381
  tz = (wordWidthActual / word0Metrics.visualWidth) * 100;
378
382
  }
@@ -406,6 +410,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
406
410
  let spacingAdj = 0;
407
411
  let kernSpacing = false;
408
412
  let wordLast = wordJ;
413
+ let underlineLeft = /** @type {?number} */ null;
414
+ let underlineRight = /** @type {?number} */ null;
409
415
  let wordFontOpentypeLast = wordFontOpentype;
410
416
  let fontSizeLast = wordFontSize;
411
417
  let tsCurrent = 0;
@@ -426,7 +432,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
426
432
  wordFontOpentype = wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype;
427
433
 
428
434
  if (!wordFontOpentype) {
429
- const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
435
+ const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${getStyleLookup(wordJ.style)})`;
430
436
  console.log(`Skipping word due to missing font (${fontNameMessage})`);
431
437
  continue;
432
438
  }
@@ -446,11 +452,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
446
452
  fillColor = wordJ.matchTruth ? '0 1 0.5 rg' : '1 0 0 rg';
447
453
  }
448
454
 
449
- const angleAdjWord = wordJ.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
455
+ const angleAdjWord = wordJ.style.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
450
456
  const angleAdjWordX = (rotateBackground && Math.abs(angle ?? 0) > 0.05) ? angleAdjWord.x : 0;
451
457
 
452
458
  let ts = 0;
453
- if (wordJ.sup || wordJ.dropcap) {
459
+ if (wordJ.style.sup || wordJ.style.dropcap) {
454
460
  ts = (lineObj.bbox.bottom + lineObj.baseline[1] + angleAdjLine.y) - (wordJ.bbox.bottom + angleAdjLine.y + angleAdjWord.y);
455
461
  if (!wordJ.visualCoords) {
456
462
  const fontDesc = wordFont.opentype.descender / wordFont.opentype.unitsPerEm * wordMetrics.fontSize;
@@ -462,12 +468,12 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
462
468
 
463
469
  // TODO: This probably fails for Chinese, rethink.
464
470
  tz = 100;
465
- if (wordJ.dropcap) {
471
+ if (wordJ.style.dropcap) {
466
472
  const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
467
473
  tz = (wordWidthActual / wordMetrics.visualWidth) * 100;
468
474
  }
469
475
 
470
- const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
476
+ const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)];
471
477
  const pdfFontName = pdfFont.name;
472
478
  const pdfFontType = pdfFont.type;
473
479
  pdfFontsUsed.add(pdfFont);
@@ -480,7 +486,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
480
486
  // The space between words determined by:
481
487
  // (1) The right bearing of the last word, (2) the left bearing of the current word, (3) the width of the space character between words,
482
488
  // (4) the current character spacing value (applied twice--both before and after the space character).
483
- const spaceWidthGlyph = wordFontOpentypeLast.charToGlyph(' ').advanceWidth * (fontSizeLast / wordFontOpentypeLast.unitsPerEm);
489
+ const spaceAdvance = wordFontOpentypeLast.charToGlyph(' ').advanceWidth || wordFontOpentypeLast.unitsPerEm / 2;
490
+ const spaceWidthGlyph = spaceAdvance * (fontSizeLast / wordFontOpentypeLast.unitsPerEm);
484
491
 
485
492
  const wordSpaceExpectedPx = (spaceWidthGlyph + charSpacingLast * 2 + wordRightBearingLast) + wordLeftBearing;
486
493
 
@@ -503,10 +510,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
503
510
  // However, this assumption does not hold for single-character words, as there is no space between character to adjust.
504
511
  // Therefore, we calculate the difference between the rendered and actual word and apply an adjustment to the width of the next space.
505
512
  // (This does not apply to drop caps as those have horizontal scaling applied to exactly match the image.)
506
- if (charArr.length === 1 && !wordJ.dropcap) {
513
+ if (charArr.length === 1 && !wordJ.style.dropcap) {
507
514
  const wordLastGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
508
515
  const wordLastGlyphMetrics = wordLastGlyph.getMetrics();
509
- const lastCharWidth = (wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth) * (wordFontSize / wordFontOpentype.unitsPerEm);
516
+ const lastCharAdvance = wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth || wordFontOpentype.unitsPerEm / 2;
517
+ const lastCharWidth = lastCharAdvance * (wordFontSize / wordFontOpentype.unitsPerEm);
510
518
  spacingAdj = wordWidthAdj - lastCharWidth - angleAdjWordX;
511
519
  } else {
512
520
  spacingAdj = 0 - angleAdjWordX;
@@ -514,7 +522,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
514
522
 
515
523
  textContentObjStr += ' ] TJ\n';
516
524
 
517
- const fontSize = wordJ.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
525
+ const fontSize = wordJ.style.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
518
526
  if (pdfFontName !== pdfFontNameCurrent || fontSize !== fontSizeLast) {
519
527
  textContentObjStr += `${pdfFontName} ${String(fontSize)} Tf\n`;
520
528
  pdfFontNameCurrent = pdfFontName;
@@ -541,8 +549,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
541
549
  // Non-ASCII and special characters are encoded/escaped using winEncodingLookup
542
550
  for (let k = 0; k < charArr.length; k++) {
543
551
  const letterSrc = charArr[k];
544
- const letter = wordJ.smallCaps ? charArr[k].toUpperCase() : charArr[k];
545
- const fontSizeLetter = wordJ.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
552
+ const letter = wordJ.style.smallCaps ? charArr[k].toUpperCase() : charArr[k];
553
+ const fontSizeLetter = wordJ.style.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
546
554
 
547
555
  const letterEnc = pdfFontTypeCurrent === 0 ? wordFontOpentype.charToGlyphIndex(letter)?.toString(16).padStart(4, '0') : winEncodingLookup[letter];
548
556
  if (letterEnc) {
@@ -611,6 +619,28 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
611
619
  }
612
620
  }
613
621
 
622
+ if (wordJ.style.underline && underlineLeft === null) {
623
+ underlineLeft = wordJ.bbox.left;
624
+ }
625
+
626
+ if (wordJ.style.underline) {
627
+ underlineRight = wordJ.bbox.right;
628
+ }
629
+
630
+ if (underlineLeft !== null && (!wordJ.style.underline || j === words.length - 1)) {
631
+ underlines.push({
632
+ left: underlineLeft,
633
+ right: underlineRight,
634
+ top: lineTopAdj,
635
+ height: lineObj.bbox.bottom - lineObj.bbox.top,
636
+ fontSize: wordFontSize,
637
+ bold: wordJ.style.bold,
638
+ });
639
+
640
+ underlineLeft = null;
641
+ underlineRight = null;
642
+ }
643
+
614
644
  wordLast = wordJ;
615
645
  wordRightBearingLast = wordLast.visualCoords ? wordMetrics.rightSideBearing : 0;
616
646
  wordFontOpentypeLast = wordFontOpentype;
@@ -622,5 +652,13 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
622
652
 
623
653
  textContentObjStr += 'ET';
624
654
 
655
+ // Add underlines
656
+ underlines.forEach((underline) => {
657
+ const underlineThickness = underline.bold ? Math.ceil(underline.fontSize / 12) : Math.ceil(underline.fontSize / 24);
658
+ const underlineOffset = Math.ceil(underline.fontSize / 12) + underlineThickness;
659
+
660
+ textContentObjStr += `\n${String(underline.left)} ${String(outputDims.height - underline.top - underlineOffset)} ${String(underline.right - underline.left)} ${underlineThickness} re\nf\n`;
661
+ });
662
+
625
663
  return { textContentObjStr, pdfFontsUsed };
626
664
  }
@@ -108,12 +108,12 @@ const generateFontFlags = (serif, italic, smallcap, symbolic) => { /* eslint-dis
108
108
  *
109
109
  * @param {opentype.Font} font - Opentype.js font object
110
110
  * @param {number} objIndex - Index for font descriptor PDF object
111
- * @param {('normal'|'italic'|'bold')} style - Style of the font
111
+ * @param {boolean} italic
112
112
  * @param {?number} embeddedObjIndex - Index for embedded font file PDF object.
113
113
  * If not provided, the font will not be embedded in the PDF.
114
114
  * @returns {string} The font descriptor object string.
115
115
  */
116
- function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex = null) {
116
+ function createFontDescriptor(font, objIndex, italic, embeddedObjIndex = null) {
117
117
  let objOut = `${String(objIndex)} 0 obj\n<</Type/FontDescriptor`;
118
118
 
119
119
  const namesTable = font.names.windows || font.names;
@@ -155,7 +155,7 @@ function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex
155
155
 
156
156
  // Symbolic is always set to false, even if the font contains glyphs outside the Adobe standard Latin character set.
157
157
  // This is because symbolic fonts are only used when embedded, and this does not appear to matter for embedded fonts.
158
- objOut += `/Flags ${String(generateFontFlags(serif, style === 'italic', false, false))}`;
158
+ objOut += `/Flags ${String(generateFontFlags(serif, italic, false, false))}`;
159
159
 
160
160
  if (embeddedObjIndex === null || embeddedObjIndex === undefined) {
161
161
  objOut += '>>\nendobj\n\n';
@@ -175,12 +175,12 @@ function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex
175
175
  *
176
176
  * @param {opentype.Font} font - Opentype.js font object
177
177
  * @param {number} firstObjIndex - Index for the first PDF object
178
- * @param {('normal'|'italic'|'bold')} style - Style of the font
178
+ * @param {boolean} [italic=false] - Whether the font is italic.
179
179
  * @param {boolean} [isStandardFont=false] - Whether the font is a standard font.
180
180
  * Standard fonts are not embedded in the PDF.
181
181
  * @returns {Array<string>}
182
182
  */
183
- export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', isStandardFont = false) {
183
+ export function createEmbeddedFontType1(font, firstObjIndex, italic = false, isStandardFont = false) {
184
184
  // Start 1st object: Font Dictionary
185
185
  let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type1`;
186
186
 
@@ -193,7 +193,8 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
193
193
 
194
194
  fontDictObjStr += '/Widths[';
195
195
  for (let i = 0; i < win1252Chars.length; i++) {
196
- const advanceNorm = Math.round(font.charToGlyph(win1252Chars[i]).advanceWidth * (1000 / font.unitsPerEm));
196
+ const advance = font.charToGlyph(win1252Chars[i]).advanceWidth || font.unitsPerEm;
197
+ const advanceNorm = Math.round(advance * (1000 / font.unitsPerEm));
197
198
  fontDictObjStr += `${String(advanceNorm)} `;
198
199
  }
199
200
  fontDictObjStr += ']/FirstChar 32/LastChar 255';
@@ -201,7 +202,7 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
201
202
  fontDictObjStr += `/FontDescriptor ${String(firstObjIndex + 1)} 0 R>>\nendobj\n\n`;
202
203
 
203
204
  // Start 2nd object: Font Descriptor
204
- const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, style, isStandardFont ? null : firstObjIndex + 2);
205
+ const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, isStandardFont ? null : firstObjIndex + 2);
205
206
 
206
207
  // objOut += `${String(firstObjIndex + 1)} 0 obj\n<</Type/FontDescriptor`;
207
208
 
@@ -249,13 +250,14 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
249
250
  *
250
251
  * @param {opentype.Font} font - Opentype.js font object
251
252
  * @param {number} firstObjIndex - Index for the first PDF object
253
+ * @param {boolean} [italic=false] - Whether the font is italic.
252
254
  *
253
255
  * This function does not produce "toUnicode" or "Widths" objects,
254
256
  * so any PDF it creates directly will lack usable copy/paste.
255
257
  * However, both of these objects will be created from the embedded file
256
258
  * when the result is run through mupdf.
257
259
  */
258
- export function createEmbeddedFontType0(font, firstObjIndex, style = 'normal') {
260
+ export function createEmbeddedFontType0(font, firstObjIndex, italic = false) {
259
261
  // Start 1st object: Font Dictionary
260
262
  let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type0`;
261
263
 
@@ -282,7 +284,7 @@ export function createEmbeddedFontType0(font, firstObjIndex, style = 'normal') {
282
284
  toUnicodeStr += '\nendstream\nendobj\n\n';
283
285
 
284
286
  // Start 3rd object: FontDescriptor
285
- const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, style, firstObjIndex + 3);
287
+ const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, firstObjIndex + 3);
286
288
 
287
289
  // objOut += `${String(firstObjIndex + 2)} 0 obj\n`;
288
290
 
@@ -86,9 +86,9 @@ function createCellsSingle(ocrTableWords, extraCols = [], startRow = 0, xlsxMode
86
86
 
87
87
  if (xlsxMode) {
88
88
  let fontStyle;
89
- if (wordObj.style === 'italic') {
89
+ if (wordObj.style.italic) {
90
90
  fontStyle = '<i/>';
91
- } else if (wordObj.smallCaps) {
91
+ } else if (wordObj.style.smallCaps) {
92
92
  fontStyle = '<smallCaps/>';
93
93
  } else {
94
94
  fontStyle = '';