scribe.js-ocr 0.7.3 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/scribe.js +2 -0
- package/fonts/all/Carlito-BoldItalic.woff +0 -0
- package/fonts/all/Century-BoldItalic.woff +0 -0
- package/fonts/all/Garamond-BoldItalic.woff +0 -0
- package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/all/Palatino-BoldItalic.woff +0 -0
- package/fonts/latin/Carlito-BoldItalic.woff +0 -0
- package/fonts/latin/Century-BoldItalic.woff +0 -0
- package/fonts/latin/Garamond-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/latin/Palatino-BoldItalic.woff +0 -0
- package/js/clear.js +5 -6
- package/js/containers/app.js +1 -1
- package/js/containers/dataContainer.js +0 -3
- package/js/containers/fontContainer.js +91 -77
- package/js/export/export.js +20 -5
- package/js/export/writeHocr.js +20 -18
- package/js/export/writeHtml.js +1 -1
- package/js/export/writePdf.js +52 -14
- package/js/export/writePdfFonts.js +11 -9
- package/js/export/writeTabular.js +2 -2
- package/js/export/writeText.js +10 -6
- package/js/extractTables.js +5 -5
- package/js/fontContainerMain.js +92 -49
- package/js/fontEval.js +12 -12
- package/js/fontStatistics.js +93 -92
- package/js/fontSupp.js +20 -20
- package/js/generalWorkerMain.js +4 -0
- package/js/global.d.ts +39 -4
- package/js/import/convertPageAbbyy.js +55 -26
- package/js/import/convertPageBlocks.js +2 -2
- package/js/import/convertPageHocr.js +10 -20
- package/js/import/convertPageShared.js +13 -9
- package/js/import/convertPageStext.js +67 -32
- package/js/import/import.js +89 -45
- package/js/import/importOCR.js +27 -33
- package/js/objects/{fontMetricsObjects.js → charMetricsObjects.js} +12 -12
- package/js/objects/layoutObjects.js +37 -0
- package/js/objects/ocrObjects.js +55 -19
- package/js/recognizeConvert.js +21 -8
- package/js/utils/fontUtils.js +11 -11
- package/js/utils/miscUtils.js +43 -6
- package/js/worker/compareOCRModule.js +20 -23
- package/js/worker/generalWorker.js +5 -5
- package/js/worker/optimizeFontModule.js +19 -19
- package/mupdf/libmupdf.js +123 -17
- package/mupdf/libmupdf.wasm +0 -0
- package/package.json +6 -3
package/js/export/writePdf.js
CHANGED
|
@@ -10,6 +10,7 @@ import { createEmbeddedFontType0, createEmbeddedFontType1 } from './writePdfFont
|
|
|
10
10
|
import { opt } from '../containers/app.js';
|
|
11
11
|
import { pageMetricsArr } from '../containers/dataContainer.js';
|
|
12
12
|
import ocr from '../objects/ocrObjects.js';
|
|
13
|
+
import { getStyleLookup } from '../utils/miscUtils.js';
|
|
13
14
|
|
|
14
15
|
/**
|
|
15
16
|
* @param {number} x
|
|
@@ -97,6 +98,7 @@ export async function writePdf(hocrArr, minpage = 0, maxpage = -1, textMode = 'e
|
|
|
97
98
|
normal: useOpt && FontCont.opt?.[familyKeyI]?.normal ? FontCont.opt[familyKeyI].normal : FontCont.raw[familyKeyI].normal,
|
|
98
99
|
italic: useOpt && FontCont.opt?.[familyKeyI]?.italic ? FontCont.opt[familyKeyI].italic : FontCont.raw[familyKeyI].italic,
|
|
99
100
|
bold: useOpt && FontCont.opt?.[familyKeyI]?.bold ? FontCont.opt[familyKeyI].bold : FontCont.raw[familyKeyI].bold,
|
|
101
|
+
boldItalic: useOpt && FontCont.opt?.[familyKeyI]?.boldItalic ? FontCont.opt[familyKeyI].boldItalic : FontCont.raw[familyKeyI].boldItalic,
|
|
100
102
|
};
|
|
101
103
|
await addFamilyObj(familyKeyI, familyObjI);
|
|
102
104
|
}
|
|
@@ -301,6 +303,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
301
303
|
|
|
302
304
|
const pdfFontsUsed = new Set();
|
|
303
305
|
|
|
306
|
+
const underlines = /** @type {Array<{left: number, right: number, top: number, height: number, fontSize: number, bold: boolean}>} */ ([]);
|
|
307
|
+
|
|
304
308
|
// Start 1st object: Text Content
|
|
305
309
|
let textContentObjStr = '';
|
|
306
310
|
|
|
@@ -349,7 +353,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
349
353
|
let wordFontOpentype = (wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype);
|
|
350
354
|
|
|
351
355
|
if (!wordFontOpentype) {
|
|
352
|
-
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
|
|
356
|
+
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${getStyleLookup(wordJ.style)})`;
|
|
353
357
|
console.log(`Skipping word due to missing font (${fontNameMessage})`);
|
|
354
358
|
continue;
|
|
355
359
|
}
|
|
@@ -359,7 +363,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
359
363
|
let wordFontSize = word0Metrics.fontSize;
|
|
360
364
|
|
|
361
365
|
// Set font and font size
|
|
362
|
-
const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
|
|
366
|
+
const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)];
|
|
363
367
|
pdfFontNameCurrent = pdfFontCurrent.name;
|
|
364
368
|
pdfFontTypeCurrent = pdfFontCurrent.type;
|
|
365
369
|
pdfFontsUsed.add(pdfFontCurrent);
|
|
@@ -372,7 +376,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
372
376
|
const word0LeftBearing = wordJ.visualCoords ? word0Metrics.leftSideBearing : 0;
|
|
373
377
|
|
|
374
378
|
let tz = 100;
|
|
375
|
-
if (wordJ.dropcap) {
|
|
379
|
+
if (wordJ.style.dropcap) {
|
|
376
380
|
const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
|
|
377
381
|
tz = (wordWidthActual / word0Metrics.visualWidth) * 100;
|
|
378
382
|
}
|
|
@@ -406,6 +410,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
406
410
|
let spacingAdj = 0;
|
|
407
411
|
let kernSpacing = false;
|
|
408
412
|
let wordLast = wordJ;
|
|
413
|
+
let underlineLeft = /** @type {?number} */ null;
|
|
414
|
+
let underlineRight = /** @type {?number} */ null;
|
|
409
415
|
let wordFontOpentypeLast = wordFontOpentype;
|
|
410
416
|
let fontSizeLast = wordFontSize;
|
|
411
417
|
let tsCurrent = 0;
|
|
@@ -426,7 +432,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
426
432
|
wordFontOpentype = wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype;
|
|
427
433
|
|
|
428
434
|
if (!wordFontOpentype) {
|
|
429
|
-
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
|
|
435
|
+
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${getStyleLookup(wordJ.style)})`;
|
|
430
436
|
console.log(`Skipping word due to missing font (${fontNameMessage})`);
|
|
431
437
|
continue;
|
|
432
438
|
}
|
|
@@ -446,11 +452,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
446
452
|
fillColor = wordJ.matchTruth ? '0 1 0.5 rg' : '1 0 0 rg';
|
|
447
453
|
}
|
|
448
454
|
|
|
449
|
-
const angleAdjWord = wordJ.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
|
|
455
|
+
const angleAdjWord = wordJ.style.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
|
|
450
456
|
const angleAdjWordX = (rotateBackground && Math.abs(angle ?? 0) > 0.05) ? angleAdjWord.x : 0;
|
|
451
457
|
|
|
452
458
|
let ts = 0;
|
|
453
|
-
if (wordJ.sup || wordJ.dropcap) {
|
|
459
|
+
if (wordJ.style.sup || wordJ.style.dropcap) {
|
|
454
460
|
ts = (lineObj.bbox.bottom + lineObj.baseline[1] + angleAdjLine.y) - (wordJ.bbox.bottom + angleAdjLine.y + angleAdjWord.y);
|
|
455
461
|
if (!wordJ.visualCoords) {
|
|
456
462
|
const fontDesc = wordFont.opentype.descender / wordFont.opentype.unitsPerEm * wordMetrics.fontSize;
|
|
@@ -462,12 +468,12 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
462
468
|
|
|
463
469
|
// TODO: This probably fails for Chinese, rethink.
|
|
464
470
|
tz = 100;
|
|
465
|
-
if (wordJ.dropcap) {
|
|
471
|
+
if (wordJ.style.dropcap) {
|
|
466
472
|
const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
|
|
467
473
|
tz = (wordWidthActual / wordMetrics.visualWidth) * 100;
|
|
468
474
|
}
|
|
469
475
|
|
|
470
|
-
const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
|
|
476
|
+
const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)];
|
|
471
477
|
const pdfFontName = pdfFont.name;
|
|
472
478
|
const pdfFontType = pdfFont.type;
|
|
473
479
|
pdfFontsUsed.add(pdfFont);
|
|
@@ -480,7 +486,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
480
486
|
// The space between words determined by:
|
|
481
487
|
// (1) The right bearing of the last word, (2) the left bearing of the current word, (3) the width of the space character between words,
|
|
482
488
|
// (4) the current character spacing value (applied twice--both before and after the space character).
|
|
483
|
-
const
|
|
489
|
+
const spaceAdvance = wordFontOpentypeLast.charToGlyph(' ').advanceWidth || wordFontOpentypeLast.unitsPerEm / 2;
|
|
490
|
+
const spaceWidthGlyph = spaceAdvance * (fontSizeLast / wordFontOpentypeLast.unitsPerEm);
|
|
484
491
|
|
|
485
492
|
const wordSpaceExpectedPx = (spaceWidthGlyph + charSpacingLast * 2 + wordRightBearingLast) + wordLeftBearing;
|
|
486
493
|
|
|
@@ -503,10 +510,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
503
510
|
// However, this assumption does not hold for single-character words, as there is no space between character to adjust.
|
|
504
511
|
// Therefore, we calculate the difference between the rendered and actual word and apply an adjustment to the width of the next space.
|
|
505
512
|
// (This does not apply to drop caps as those have horizontal scaling applied to exactly match the image.)
|
|
506
|
-
if (charArr.length === 1 && !wordJ.dropcap) {
|
|
513
|
+
if (charArr.length === 1 && !wordJ.style.dropcap) {
|
|
507
514
|
const wordLastGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
|
|
508
515
|
const wordLastGlyphMetrics = wordLastGlyph.getMetrics();
|
|
509
|
-
const
|
|
516
|
+
const lastCharAdvance = wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth || wordFontOpentype.unitsPerEm / 2;
|
|
517
|
+
const lastCharWidth = lastCharAdvance * (wordFontSize / wordFontOpentype.unitsPerEm);
|
|
510
518
|
spacingAdj = wordWidthAdj - lastCharWidth - angleAdjWordX;
|
|
511
519
|
} else {
|
|
512
520
|
spacingAdj = 0 - angleAdjWordX;
|
|
@@ -514,7 +522,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
514
522
|
|
|
515
523
|
textContentObjStr += ' ] TJ\n';
|
|
516
524
|
|
|
517
|
-
const fontSize = wordJ.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
525
|
+
const fontSize = wordJ.style.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
518
526
|
if (pdfFontName !== pdfFontNameCurrent || fontSize !== fontSizeLast) {
|
|
519
527
|
textContentObjStr += `${pdfFontName} ${String(fontSize)} Tf\n`;
|
|
520
528
|
pdfFontNameCurrent = pdfFontName;
|
|
@@ -541,8 +549,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
541
549
|
// Non-ASCII and special characters are encoded/escaped using winEncodingLookup
|
|
542
550
|
for (let k = 0; k < charArr.length; k++) {
|
|
543
551
|
const letterSrc = charArr[k];
|
|
544
|
-
const letter = wordJ.smallCaps ? charArr[k].toUpperCase() : charArr[k];
|
|
545
|
-
const fontSizeLetter = wordJ.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
552
|
+
const letter = wordJ.style.smallCaps ? charArr[k].toUpperCase() : charArr[k];
|
|
553
|
+
const fontSizeLetter = wordJ.style.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
546
554
|
|
|
547
555
|
const letterEnc = pdfFontTypeCurrent === 0 ? wordFontOpentype.charToGlyphIndex(letter)?.toString(16).padStart(4, '0') : winEncodingLookup[letter];
|
|
548
556
|
if (letterEnc) {
|
|
@@ -611,6 +619,28 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
611
619
|
}
|
|
612
620
|
}
|
|
613
621
|
|
|
622
|
+
if (wordJ.style.underline && underlineLeft === null) {
|
|
623
|
+
underlineLeft = wordJ.bbox.left;
|
|
624
|
+
}
|
|
625
|
+
|
|
626
|
+
if (wordJ.style.underline) {
|
|
627
|
+
underlineRight = wordJ.bbox.right;
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
if (underlineLeft !== null && (!wordJ.style.underline || j === words.length - 1)) {
|
|
631
|
+
underlines.push({
|
|
632
|
+
left: underlineLeft,
|
|
633
|
+
right: underlineRight,
|
|
634
|
+
top: lineTopAdj,
|
|
635
|
+
height: lineObj.bbox.bottom - lineObj.bbox.top,
|
|
636
|
+
fontSize: wordFontSize,
|
|
637
|
+
bold: wordJ.style.bold,
|
|
638
|
+
});
|
|
639
|
+
|
|
640
|
+
underlineLeft = null;
|
|
641
|
+
underlineRight = null;
|
|
642
|
+
}
|
|
643
|
+
|
|
614
644
|
wordLast = wordJ;
|
|
615
645
|
wordRightBearingLast = wordLast.visualCoords ? wordMetrics.rightSideBearing : 0;
|
|
616
646
|
wordFontOpentypeLast = wordFontOpentype;
|
|
@@ -622,5 +652,13 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
622
652
|
|
|
623
653
|
textContentObjStr += 'ET';
|
|
624
654
|
|
|
655
|
+
// Add underlines
|
|
656
|
+
underlines.forEach((underline) => {
|
|
657
|
+
const underlineThickness = underline.bold ? Math.ceil(underline.fontSize / 12) : Math.ceil(underline.fontSize / 24);
|
|
658
|
+
const underlineOffset = Math.ceil(underline.fontSize / 12) + underlineThickness;
|
|
659
|
+
|
|
660
|
+
textContentObjStr += `\n${String(underline.left)} ${String(outputDims.height - underline.top - underlineOffset)} ${String(underline.right - underline.left)} ${underlineThickness} re\nf\n`;
|
|
661
|
+
});
|
|
662
|
+
|
|
625
663
|
return { textContentObjStr, pdfFontsUsed };
|
|
626
664
|
}
|
|
@@ -108,12 +108,12 @@ const generateFontFlags = (serif, italic, smallcap, symbolic) => { /* eslint-dis
|
|
|
108
108
|
*
|
|
109
109
|
* @param {opentype.Font} font - Opentype.js font object
|
|
110
110
|
* @param {number} objIndex - Index for font descriptor PDF object
|
|
111
|
-
* @param {
|
|
111
|
+
* @param {boolean} italic
|
|
112
112
|
* @param {?number} embeddedObjIndex - Index for embedded font file PDF object.
|
|
113
113
|
* If not provided, the font will not be embedded in the PDF.
|
|
114
114
|
* @returns {string} The font descriptor object string.
|
|
115
115
|
*/
|
|
116
|
-
function createFontDescriptor(font, objIndex,
|
|
116
|
+
function createFontDescriptor(font, objIndex, italic, embeddedObjIndex = null) {
|
|
117
117
|
let objOut = `${String(objIndex)} 0 obj\n<</Type/FontDescriptor`;
|
|
118
118
|
|
|
119
119
|
const namesTable = font.names.windows || font.names;
|
|
@@ -155,7 +155,7 @@ function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex
|
|
|
155
155
|
|
|
156
156
|
// Symbolic is always set to false, even if the font contains glyphs outside the Adobe standard Latin character set.
|
|
157
157
|
// This is because symbolic fonts are only used when embedded, and this does not appear to matter for embedded fonts.
|
|
158
|
-
objOut += `/Flags ${String(generateFontFlags(serif,
|
|
158
|
+
objOut += `/Flags ${String(generateFontFlags(serif, italic, false, false))}`;
|
|
159
159
|
|
|
160
160
|
if (embeddedObjIndex === null || embeddedObjIndex === undefined) {
|
|
161
161
|
objOut += '>>\nendobj\n\n';
|
|
@@ -175,12 +175,12 @@ function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex
|
|
|
175
175
|
*
|
|
176
176
|
* @param {opentype.Font} font - Opentype.js font object
|
|
177
177
|
* @param {number} firstObjIndex - Index for the first PDF object
|
|
178
|
-
* @param {
|
|
178
|
+
* @param {boolean} [italic=false] - Whether the font is italic.
|
|
179
179
|
* @param {boolean} [isStandardFont=false] - Whether the font is a standard font.
|
|
180
180
|
* Standard fonts are not embedded in the PDF.
|
|
181
181
|
* @returns {Array<string>}
|
|
182
182
|
*/
|
|
183
|
-
export function createEmbeddedFontType1(font, firstObjIndex,
|
|
183
|
+
export function createEmbeddedFontType1(font, firstObjIndex, italic = false, isStandardFont = false) {
|
|
184
184
|
// Start 1st object: Font Dictionary
|
|
185
185
|
let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type1`;
|
|
186
186
|
|
|
@@ -193,7 +193,8 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
|
|
|
193
193
|
|
|
194
194
|
fontDictObjStr += '/Widths[';
|
|
195
195
|
for (let i = 0; i < win1252Chars.length; i++) {
|
|
196
|
-
const
|
|
196
|
+
const advance = font.charToGlyph(win1252Chars[i]).advanceWidth || font.unitsPerEm;
|
|
197
|
+
const advanceNorm = Math.round(advance * (1000 / font.unitsPerEm));
|
|
197
198
|
fontDictObjStr += `${String(advanceNorm)} `;
|
|
198
199
|
}
|
|
199
200
|
fontDictObjStr += ']/FirstChar 32/LastChar 255';
|
|
@@ -201,7 +202,7 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
|
|
|
201
202
|
fontDictObjStr += `/FontDescriptor ${String(firstObjIndex + 1)} 0 R>>\nendobj\n\n`;
|
|
202
203
|
|
|
203
204
|
// Start 2nd object: Font Descriptor
|
|
204
|
-
const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1,
|
|
205
|
+
const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, isStandardFont ? null : firstObjIndex + 2);
|
|
205
206
|
|
|
206
207
|
// objOut += `${String(firstObjIndex + 1)} 0 obj\n<</Type/FontDescriptor`;
|
|
207
208
|
|
|
@@ -249,13 +250,14 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
|
|
|
249
250
|
*
|
|
250
251
|
* @param {opentype.Font} font - Opentype.js font object
|
|
251
252
|
* @param {number} firstObjIndex - Index for the first PDF object
|
|
253
|
+
* @param {boolean} [italic=false] - Whether the font is italic.
|
|
252
254
|
*
|
|
253
255
|
* This function does not produce "toUnicode" or "Widths" objects,
|
|
254
256
|
* so any PDF it creates directly will lack usable copy/paste.
|
|
255
257
|
* However, both of these objects will be created from the embedded file
|
|
256
258
|
* when the result is run through mupdf.
|
|
257
259
|
*/
|
|
258
|
-
export function createEmbeddedFontType0(font, firstObjIndex,
|
|
260
|
+
export function createEmbeddedFontType0(font, firstObjIndex, italic = false) {
|
|
259
261
|
// Start 1st object: Font Dictionary
|
|
260
262
|
let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type0`;
|
|
261
263
|
|
|
@@ -282,7 +284,7 @@ export function createEmbeddedFontType0(font, firstObjIndex, style = 'normal') {
|
|
|
282
284
|
toUnicodeStr += '\nendstream\nendobj\n\n';
|
|
283
285
|
|
|
284
286
|
// Start 3rd object: FontDescriptor
|
|
285
|
-
const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1,
|
|
287
|
+
const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, firstObjIndex + 3);
|
|
286
288
|
|
|
287
289
|
// objOut += `${String(firstObjIndex + 2)} 0 obj\n`;
|
|
288
290
|
|
|
@@ -86,9 +86,9 @@ function createCellsSingle(ocrTableWords, extraCols = [], startRow = 0, xlsxMode
|
|
|
86
86
|
|
|
87
87
|
if (xlsxMode) {
|
|
88
88
|
let fontStyle;
|
|
89
|
-
if (wordObj.style
|
|
89
|
+
if (wordObj.style.italic) {
|
|
90
90
|
fontStyle = '<i/>';
|
|
91
|
-
} else if (wordObj.smallCaps) {
|
|
91
|
+
} else if (wordObj.style.smallCaps) {
|
|
92
92
|
fontStyle = '<smallCaps/>';
|
|
93
93
|
} else {
|
|
94
94
|
fontStyle = '';
|
package/js/export/writeText.js
CHANGED
|
@@ -54,17 +54,21 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
54
54
|
|
|
55
55
|
if (docxMode) {
|
|
56
56
|
let fontStyle = '';
|
|
57
|
-
if (wordObj.style
|
|
57
|
+
if (wordObj.style.italic) {
|
|
58
58
|
fontStyle += '<w:i/>';
|
|
59
|
-
} else if (wordObj.style
|
|
59
|
+
} else if (wordObj.style.bold) {
|
|
60
60
|
fontStyle += '<w:b/>';
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
-
if (wordObj.smallCaps) {
|
|
63
|
+
if (wordObj.style.smallCaps) {
|
|
64
64
|
fontStyle += '<w:smallCaps/>';
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
-
if (wordObj.
|
|
67
|
+
if (wordObj.style.underline) {
|
|
68
|
+
fontStyle += '<w:u w:val="single"/>';
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
if (wordObj.style.sup) {
|
|
68
72
|
fontStyle += '<w:vertAlign w:val="superscript"/>';
|
|
69
73
|
}
|
|
70
74
|
|
|
@@ -79,7 +83,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
79
83
|
} else if (supPrev) {
|
|
80
84
|
textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve"> `;
|
|
81
85
|
// If this word is a superscript, no space is added between words.
|
|
82
|
-
} else if (wordObj.sup && i > 0) {
|
|
86
|
+
} else if (wordObj.style.sup && i > 0) {
|
|
83
87
|
textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
84
88
|
} else {
|
|
85
89
|
textStr = `${textStr} </w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
@@ -89,7 +93,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
89
93
|
}
|
|
90
94
|
|
|
91
95
|
fontStylePrev = fontStyle;
|
|
92
|
-
supPrev = wordObj.sup;
|
|
96
|
+
supPrev = wordObj.style.sup;
|
|
93
97
|
} else if (newLine) {
|
|
94
98
|
textStr = `${textStr}\n`;
|
|
95
99
|
} else if (h > 0 || g > 0 || i > 0) {
|
package/js/extractTables.js
CHANGED
|
@@ -22,11 +22,11 @@ export function extractTableContent(pageObj, layoutObj) {
|
|
|
22
22
|
|
|
23
23
|
// TODO: This currently creates junk rows with only punctuation, as those bounding boxes are so small they often do not overlap with other lines.
|
|
24
24
|
/**
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
25
|
+
* Extracts words from a page that are within the bounding boxes of the table, organized into arrays of rows and columns.
|
|
26
|
+
* The output is in the form of a 3D array, where the first dimension is the row, the second dimension is the column, and the third dimension is the word.
|
|
27
|
+
* @param {OcrPage} pageObj
|
|
28
|
+
* @param {Array<import('./objects/layoutObjects.js').LayoutBoxBase>} boxes
|
|
29
|
+
*/
|
|
30
30
|
export function extractSingleTableContent(pageObj, boxes) {
|
|
31
31
|
/** @type {Array<OcrWord>} */
|
|
32
32
|
const wordArr = [];
|