scribe.js-ocr 0.7.3 → 0.8.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/cli/scribe.js +2 -0
  2. package/fonts/all/Carlito-BoldItalic.woff +0 -0
  3. package/fonts/all/Century-BoldItalic.woff +0 -0
  4. package/fonts/all/Garamond-BoldItalic.woff +0 -0
  5. package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
  6. package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
  7. package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
  8. package/fonts/all/Palatino-BoldItalic.woff +0 -0
  9. package/fonts/latin/Carlito-BoldItalic.woff +0 -0
  10. package/fonts/latin/Century-BoldItalic.woff +0 -0
  11. package/fonts/latin/Garamond-BoldItalic.woff +0 -0
  12. package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
  13. package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
  14. package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
  15. package/fonts/latin/Palatino-BoldItalic.woff +0 -0
  16. package/js/clear.js +5 -6
  17. package/js/containers/app.js +1 -1
  18. package/js/containers/dataContainer.js +0 -3
  19. package/js/containers/fontContainer.js +91 -77
  20. package/js/export/export.js +20 -5
  21. package/js/export/writeHocr.js +20 -18
  22. package/js/export/writeHtml.js +1 -1
  23. package/js/export/writePdf.js +52 -14
  24. package/js/export/writePdfFonts.js +11 -9
  25. package/js/export/writeTabular.js +2 -2
  26. package/js/export/writeText.js +10 -6
  27. package/js/extractTables.js +5 -5
  28. package/js/fontContainerMain.js +92 -49
  29. package/js/fontEval.js +12 -12
  30. package/js/fontStatistics.js +93 -92
  31. package/js/fontSupp.js +20 -20
  32. package/js/generalWorkerMain.js +4 -0
  33. package/js/global.d.ts +39 -4
  34. package/js/import/convertPageAbbyy.js +55 -26
  35. package/js/import/convertPageBlocks.js +2 -2
  36. package/js/import/convertPageHocr.js +10 -20
  37. package/js/import/convertPageShared.js +13 -9
  38. package/js/import/convertPageStext.js +67 -32
  39. package/js/import/import.js +89 -45
  40. package/js/import/importOCR.js +27 -33
  41. package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
  42. package/js/objects/layoutObjects.js +37 -0
  43. package/js/objects/ocrObjects.js +55 -19
  44. package/js/recognizeConvert.js +21 -8
  45. package/js/utils/fontUtils.js +11 -11
  46. package/js/utils/miscUtils.js +43 -6
  47. package/js/worker/compareOCRModule.js +20 -23
  48. package/js/worker/generalWorker.js +5 -5
  49. package/js/worker/optimizeFontModule.js +19 -19
  50. package/mupdf/libmupdf.js +123 -17
  51. package/mupdf/libmupdf.wasm +0 -0
  52. package/package.json +6 -3
@@ -10,6 +10,7 @@ import { createEmbeddedFontType0, createEmbeddedFontType1 } from './writePdfFont
10
10
  import { opt } from '../containers/app.js';
11
11
  import { pageMetricsArr } from '../containers/dataContainer.js';
12
12
  import ocr from '../objects/ocrObjects.js';
13
+ import { getStyleLookup } from '../utils/miscUtils.js';
13
14
 
14
15
  /**
15
16
  * @param {number} x
@@ -97,6 +98,7 @@ export async function writePdf(hocrArr, minpage = 0, maxpage = -1, textMode = 'e
97
98
  normal: useOpt && FontCont.opt?.[familyKeyI]?.normal ? FontCont.opt[familyKeyI].normal : FontCont.raw[familyKeyI].normal,
98
99
  italic: useOpt && FontCont.opt?.[familyKeyI]?.italic ? FontCont.opt[familyKeyI].italic : FontCont.raw[familyKeyI].italic,
99
100
  bold: useOpt && FontCont.opt?.[familyKeyI]?.bold ? FontCont.opt[familyKeyI].bold : FontCont.raw[familyKeyI].bold,
101
+ boldItalic: useOpt && FontCont.opt?.[familyKeyI]?.boldItalic ? FontCont.opt[familyKeyI].boldItalic : FontCont.raw[familyKeyI].boldItalic,
100
102
  };
101
103
  await addFamilyObj(familyKeyI, familyObjI);
102
104
  }
@@ -301,6 +303,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
301
303
 
302
304
  const pdfFontsUsed = new Set();
303
305
 
306
+ const underlines = /** @type {Array<{left: number, right: number, top: number, height: number, fontSize: number, bold: boolean}>} */ ([]);
307
+
304
308
  // Start 1st object: Text Content
305
309
  let textContentObjStr = '';
306
310
 
@@ -349,7 +353,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
349
353
  let wordFontOpentype = (wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype);
350
354
 
351
355
  if (!wordFontOpentype) {
352
- const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
356
+ const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${getStyleLookup(wordJ.style)})`;
353
357
  console.log(`Skipping word due to missing font (${fontNameMessage})`);
354
358
  continue;
355
359
  }
@@ -359,7 +363,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
359
363
  let wordFontSize = word0Metrics.fontSize;
360
364
 
361
365
  // Set font and font size
362
- const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
366
+ const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)];
363
367
  pdfFontNameCurrent = pdfFontCurrent.name;
364
368
  pdfFontTypeCurrent = pdfFontCurrent.type;
365
369
  pdfFontsUsed.add(pdfFontCurrent);
@@ -372,7 +376,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
372
376
  const word0LeftBearing = wordJ.visualCoords ? word0Metrics.leftSideBearing : 0;
373
377
 
374
378
  let tz = 100;
375
- if (wordJ.dropcap) {
379
+ if (wordJ.style.dropcap) {
376
380
  const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
377
381
  tz = (wordWidthActual / word0Metrics.visualWidth) * 100;
378
382
  }
@@ -406,6 +410,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
406
410
  let spacingAdj = 0;
407
411
  let kernSpacing = false;
408
412
  let wordLast = wordJ;
413
+ let underlineLeft = /** @type {?number} */ null;
414
+ let underlineRight = /** @type {?number} */ null;
409
415
  let wordFontOpentypeLast = wordFontOpentype;
410
416
  let fontSizeLast = wordFontSize;
411
417
  let tsCurrent = 0;
@@ -426,7 +432,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
426
432
  wordFontOpentype = wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype;
427
433
 
428
434
  if (!wordFontOpentype) {
429
- const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
435
+ const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${getStyleLookup(wordJ.style)})`;
430
436
  console.log(`Skipping word due to missing font (${fontNameMessage})`);
431
437
  continue;
432
438
  }
@@ -446,11 +452,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
446
452
  fillColor = wordJ.matchTruth ? '0 1 0.5 rg' : '1 0 0 rg';
447
453
  }
448
454
 
449
- const angleAdjWord = wordJ.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
455
+ const angleAdjWord = wordJ.style.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
450
456
  const angleAdjWordX = (rotateBackground && Math.abs(angle ?? 0) > 0.05) ? angleAdjWord.x : 0;
451
457
 
452
458
  let ts = 0;
453
- if (wordJ.sup || wordJ.dropcap) {
459
+ if (wordJ.style.sup || wordJ.style.dropcap) {
454
460
  ts = (lineObj.bbox.bottom + lineObj.baseline[1] + angleAdjLine.y) - (wordJ.bbox.bottom + angleAdjLine.y + angleAdjWord.y);
455
461
  if (!wordJ.visualCoords) {
456
462
  const fontDesc = wordFont.opentype.descender / wordFont.opentype.unitsPerEm * wordMetrics.fontSize;
@@ -462,12 +468,12 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
462
468
 
463
469
  // TODO: This probably fails for Chinese, rethink.
464
470
  tz = 100;
465
- if (wordJ.dropcap) {
471
+ if (wordJ.style.dropcap) {
466
472
  const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
467
473
  tz = (wordWidthActual / wordMetrics.visualWidth) * 100;
468
474
  }
469
475
 
470
- const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
476
+ const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)];
471
477
  const pdfFontName = pdfFont.name;
472
478
  const pdfFontType = pdfFont.type;
473
479
  pdfFontsUsed.add(pdfFont);
@@ -480,7 +486,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
480
486
  // The space between words determined by:
481
487
  // (1) The right bearing of the last word, (2) the left bearing of the current word, (3) the width of the space character between words,
482
488
  // (4) the current character spacing value (applied twice--both before and after the space character).
483
- const spaceWidthGlyph = wordFontOpentypeLast.charToGlyph(' ').advanceWidth * (fontSizeLast / wordFontOpentypeLast.unitsPerEm);
489
+ const spaceAdvance = wordFontOpentypeLast.charToGlyph(' ').advanceWidth || wordFontOpentypeLast.unitsPerEm / 2;
490
+ const spaceWidthGlyph = spaceAdvance * (fontSizeLast / wordFontOpentypeLast.unitsPerEm);
484
491
 
485
492
  const wordSpaceExpectedPx = (spaceWidthGlyph + charSpacingLast * 2 + wordRightBearingLast) + wordLeftBearing;
486
493
 
@@ -503,10 +510,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
503
510
  // However, this assumption does not hold for single-character words, as there is no space between character to adjust.
504
511
  // Therefore, we calculate the difference between the rendered and actual word and apply an adjustment to the width of the next space.
505
512
  // (This does not apply to drop caps as those have horizontal scaling applied to exactly match the image.)
506
- if (charArr.length === 1 && !wordJ.dropcap) {
513
+ if (charArr.length === 1 && !wordJ.style.dropcap) {
507
514
  const wordLastGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
508
515
  const wordLastGlyphMetrics = wordLastGlyph.getMetrics();
509
- const lastCharWidth = (wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth) * (wordFontSize / wordFontOpentype.unitsPerEm);
516
+ const lastCharAdvance = wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth || wordFontOpentype.unitsPerEm / 2;
517
+ const lastCharWidth = lastCharAdvance * (wordFontSize / wordFontOpentype.unitsPerEm);
510
518
  spacingAdj = wordWidthAdj - lastCharWidth - angleAdjWordX;
511
519
  } else {
512
520
  spacingAdj = 0 - angleAdjWordX;
@@ -514,7 +522,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
514
522
 
515
523
  textContentObjStr += ' ] TJ\n';
516
524
 
517
- const fontSize = wordJ.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
525
+ const fontSize = wordJ.style.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
518
526
  if (pdfFontName !== pdfFontNameCurrent || fontSize !== fontSizeLast) {
519
527
  textContentObjStr += `${pdfFontName} ${String(fontSize)} Tf\n`;
520
528
  pdfFontNameCurrent = pdfFontName;
@@ -541,8 +549,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
541
549
  // Non-ASCII and special characters are encoded/escaped using winEncodingLookup
542
550
  for (let k = 0; k < charArr.length; k++) {
543
551
  const letterSrc = charArr[k];
544
- const letter = wordJ.smallCaps ? charArr[k].toUpperCase() : charArr[k];
545
- const fontSizeLetter = wordJ.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
552
+ const letter = wordJ.style.smallCaps ? charArr[k].toUpperCase() : charArr[k];
553
+ const fontSizeLetter = wordJ.style.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
546
554
 
547
555
  const letterEnc = pdfFontTypeCurrent === 0 ? wordFontOpentype.charToGlyphIndex(letter)?.toString(16).padStart(4, '0') : winEncodingLookup[letter];
548
556
  if (letterEnc) {
@@ -611,6 +619,28 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
611
619
  }
612
620
  }
613
621
 
622
+ if (wordJ.style.underline && underlineLeft === null) {
623
+ underlineLeft = wordJ.bbox.left;
624
+ }
625
+
626
+ if (wordJ.style.underline) {
627
+ underlineRight = wordJ.bbox.right;
628
+ }
629
+
630
+ if (underlineLeft !== null && (!wordJ.style.underline || j === words.length - 1)) {
631
+ underlines.push({
632
+ left: underlineLeft,
633
+ right: underlineRight,
634
+ top: lineTopAdj,
635
+ height: lineObj.bbox.bottom - lineObj.bbox.top,
636
+ fontSize: wordFontSize,
637
+ bold: wordJ.style.bold,
638
+ });
639
+
640
+ underlineLeft = null;
641
+ underlineRight = null;
642
+ }
643
+
614
644
  wordLast = wordJ;
615
645
  wordRightBearingLast = wordLast.visualCoords ? wordMetrics.rightSideBearing : 0;
616
646
  wordFontOpentypeLast = wordFontOpentype;
@@ -622,5 +652,13 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
622
652
 
623
653
  textContentObjStr += 'ET';
624
654
 
655
+ // Add underlines
656
+ underlines.forEach((underline) => {
657
+ const underlineThickness = underline.bold ? Math.ceil(underline.fontSize / 12) : Math.ceil(underline.fontSize / 24);
658
+ const underlineOffset = Math.ceil(underline.fontSize / 12) + underlineThickness;
659
+
660
+ textContentObjStr += `\n${String(underline.left)} ${String(outputDims.height - underline.top - underlineOffset)} ${String(underline.right - underline.left)} ${underlineThickness} re\nf\n`;
661
+ });
662
+
625
663
  return { textContentObjStr, pdfFontsUsed };
626
664
  }
@@ -108,12 +108,12 @@ const generateFontFlags = (serif, italic, smallcap, symbolic) => { /* eslint-dis
108
108
  *
109
109
  * @param {opentype.Font} font - Opentype.js font object
110
110
  * @param {number} objIndex - Index for font descriptor PDF object
111
- * @param {('normal'|'italic'|'bold')} style - Style of the font
111
+ * @param {boolean} italic
112
112
  * @param {?number} embeddedObjIndex - Index for embedded font file PDF object.
113
113
  * If not provided, the font will not be embedded in the PDF.
114
114
  * @returns {string} The font descriptor object string.
115
115
  */
116
- function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex = null) {
116
+ function createFontDescriptor(font, objIndex, italic, embeddedObjIndex = null) {
117
117
  let objOut = `${String(objIndex)} 0 obj\n<</Type/FontDescriptor`;
118
118
 
119
119
  const namesTable = font.names.windows || font.names;
@@ -155,7 +155,7 @@ function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex
155
155
 
156
156
  // Symbolic is always set to false, even if the font contains glyphs outside the Adobe standard Latin character set.
157
157
  // This is because symbolic fonts are only used when embedded, and this does not appear to matter for embedded fonts.
158
- objOut += `/Flags ${String(generateFontFlags(serif, style === 'italic', false, false))}`;
158
+ objOut += `/Flags ${String(generateFontFlags(serif, italic, false, false))}`;
159
159
 
160
160
  if (embeddedObjIndex === null || embeddedObjIndex === undefined) {
161
161
  objOut += '>>\nendobj\n\n';
@@ -175,12 +175,12 @@ function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex
175
175
  *
176
176
  * @param {opentype.Font} font - Opentype.js font object
177
177
  * @param {number} firstObjIndex - Index for the first PDF object
178
- * @param {('normal'|'italic'|'bold')} style - Style of the font
178
+ * @param {boolean} [italic=false] - Whether the font is italic.
179
179
  * @param {boolean} [isStandardFont=false] - Whether the font is a standard font.
180
180
  * Standard fonts are not embedded in the PDF.
181
181
  * @returns {Array<string>}
182
182
  */
183
- export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', isStandardFont = false) {
183
+ export function createEmbeddedFontType1(font, firstObjIndex, italic = false, isStandardFont = false) {
184
184
  // Start 1st object: Font Dictionary
185
185
  let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type1`;
186
186
 
@@ -193,7 +193,8 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
193
193
 
194
194
  fontDictObjStr += '/Widths[';
195
195
  for (let i = 0; i < win1252Chars.length; i++) {
196
- const advanceNorm = Math.round(font.charToGlyph(win1252Chars[i]).advanceWidth * (1000 / font.unitsPerEm));
196
+ const advance = font.charToGlyph(win1252Chars[i]).advanceWidth || font.unitsPerEm;
197
+ const advanceNorm = Math.round(advance * (1000 / font.unitsPerEm));
197
198
  fontDictObjStr += `${String(advanceNorm)} `;
198
199
  }
199
200
  fontDictObjStr += ']/FirstChar 32/LastChar 255';
@@ -201,7 +202,7 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
201
202
  fontDictObjStr += `/FontDescriptor ${String(firstObjIndex + 1)} 0 R>>\nendobj\n\n`;
202
203
 
203
204
  // Start 2nd object: Font Descriptor
204
- const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, style, isStandardFont ? null : firstObjIndex + 2);
205
+ const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, isStandardFont ? null : firstObjIndex + 2);
205
206
 
206
207
  // objOut += `${String(firstObjIndex + 1)} 0 obj\n<</Type/FontDescriptor`;
207
208
 
@@ -249,13 +250,14 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
249
250
  *
250
251
  * @param {opentype.Font} font - Opentype.js font object
251
252
  * @param {number} firstObjIndex - Index for the first PDF object
253
+ * @param {boolean} [italic=false] - Whether the font is italic.
252
254
  *
253
255
  * This function does not produce "toUnicode" or "Widths" objects,
254
256
  * so any PDF it creates directly will lack usable copy/paste.
255
257
  * However, both of these objects will be created from the embedded file
256
258
  * when the result is run through mupdf.
257
259
  */
258
- export function createEmbeddedFontType0(font, firstObjIndex, style = 'normal') {
260
+ export function createEmbeddedFontType0(font, firstObjIndex, italic = false) {
259
261
  // Start 1st object: Font Dictionary
260
262
  let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type0`;
261
263
 
@@ -282,7 +284,7 @@ export function createEmbeddedFontType0(font, firstObjIndex, style = 'normal') {
282
284
  toUnicodeStr += '\nendstream\nendobj\n\n';
283
285
 
284
286
  // Start 3rd object: FontDescriptor
285
- const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, style, firstObjIndex + 3);
287
+ const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, firstObjIndex + 3);
286
288
 
287
289
  // objOut += `${String(firstObjIndex + 2)} 0 obj\n`;
288
290
 
@@ -86,9 +86,9 @@ function createCellsSingle(ocrTableWords, extraCols = [], startRow = 0, xlsxMode
86
86
 
87
87
  if (xlsxMode) {
88
88
  let fontStyle;
89
- if (wordObj.style === 'italic') {
89
+ if (wordObj.style.italic) {
90
90
  fontStyle = '<i/>';
91
- } else if (wordObj.smallCaps) {
91
+ } else if (wordObj.style.smallCaps) {
92
92
  fontStyle = '<smallCaps/>';
93
93
  } else {
94
94
  fontStyle = '';
@@ -54,17 +54,21 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
54
54
 
55
55
  if (docxMode) {
56
56
  let fontStyle = '';
57
- if (wordObj.style === 'italic') {
57
+ if (wordObj.style.italic) {
58
58
  fontStyle += '<w:i/>';
59
- } else if (wordObj.style === 'bold') {
59
+ } else if (wordObj.style.bold) {
60
60
  fontStyle += '<w:b/>';
61
61
  }
62
62
 
63
- if (wordObj.smallCaps) {
63
+ if (wordObj.style.smallCaps) {
64
64
  fontStyle += '<w:smallCaps/>';
65
65
  }
66
66
 
67
- if (wordObj.sup) {
67
+ if (wordObj.style.underline) {
68
+ fontStyle += '<w:u w:val="single"/>';
69
+ }
70
+
71
+ if (wordObj.style.sup) {
68
72
  fontStyle += '<w:vertAlign w:val="superscript"/>';
69
73
  }
70
74
 
@@ -79,7 +83,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
79
83
  } else if (supPrev) {
80
84
  textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve"> `;
81
85
  // If this word is a superscript, no space is added between words.
82
- } else if (wordObj.sup && i > 0) {
86
+ } else if (wordObj.style.sup && i > 0) {
83
87
  textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
84
88
  } else {
85
89
  textStr = `${textStr} </w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
@@ -89,7 +93,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
89
93
  }
90
94
 
91
95
  fontStylePrev = fontStyle;
92
- supPrev = wordObj.sup;
96
+ supPrev = wordObj.style.sup;
93
97
  } else if (newLine) {
94
98
  textStr = `${textStr}\n`;
95
99
  } else if (h > 0 || g > 0 || i > 0) {
@@ -22,11 +22,11 @@ export function extractTableContent(pageObj, layoutObj) {
22
22
 
23
23
  // TODO: This currently creates junk rows with only punctuation, as those bounding boxes are so small they often do not overlap with other lines.
24
24
  /**
25
- * Extracts words from a page that are within the bounding boxes of the table, organized into arrays of rows and columns.
26
- * The output is in the form of a 3D array, where the first dimension is the row, the second dimension is the column, and the third dimension is the word.
27
- * @param {OcrPage} pageObj
28
- * @param {Array<import('./objects/layoutObjects.js').LayoutBoxBase>} boxes
29
- */
25
+ * Extracts words from a page that are within the bounding boxes of the table, organized into arrays of rows and columns.
26
+ * The output is in the form of a 3D array, where the first dimension is the row, the second dimension is the column, and the third dimension is the word.
27
+ * @param {OcrPage} pageObj
28
+ * @param {Array<import('./objects/layoutObjects.js').LayoutBoxBase>} boxes
29
+ */
30
30
  export function extractSingleTableContent(pageObj, boxes) {
31
31
  /** @type {Array<OcrWord>} */
32
32
  const wordArr = [];