scribe.js-ocr 0.7.2 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/cli.js +12 -1
- package/cli/detectPDFType.js +13 -20
- package/cli/extract.js +4 -2
- package/cli/scribe.js +9 -1
- package/fonts/all/Carlito-BoldItalic.woff +0 -0
- package/fonts/all/Century-BoldItalic.woff +0 -0
- package/fonts/all/Garamond-BoldItalic.woff +0 -0
- package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/all/Palatino-BoldItalic.woff +0 -0
- package/fonts/latin/Carlito-BoldItalic.woff +0 -0
- package/fonts/latin/Century-BoldItalic.woff +0 -0
- package/fonts/latin/Garamond-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/latin/Palatino-BoldItalic.woff +0 -0
- package/js/containers/app.js +1 -1
- package/js/containers/fontContainer.js +42 -40
- package/js/export/export.js +1 -1
- package/js/export/writeHocr.js +15 -13
- package/js/export/writeHtml.js +45 -29
- package/js/export/writePdf.js +52 -14
- package/js/export/writePdfFonts.js +11 -9
- package/js/export/writeTabular.js +2 -2
- package/js/export/writeText.js +10 -6
- package/js/extractTables.js +5 -5
- package/js/fontContainerMain.js +50 -7
- package/js/fontStatistics.js +18 -13
- package/js/fontSupp.js +20 -20
- package/js/global.d.ts +17 -0
- package/js/import/convertPageAbbyy.js +47 -25
- package/js/import/convertPageBlocks.js +2 -2
- package/js/import/convertPageHocr.js +10 -20
- package/js/import/convertPageShared.js +13 -9
- package/js/import/convertPageStext.js +66 -31
- package/js/objects/ocrObjects.js +13 -19
- package/js/utils/fontUtils.js +11 -11
- package/js/utils/miscUtils.js +16 -0
- package/js/worker/compareOCRModule.js +13 -16
- package/js/worker/optimizeFontModule.js +4 -4
- package/mupdf/libmupdf.js +123 -17
- package/mupdf/libmupdf.wasm +0 -0
- package/package.json +1 -1
package/js/export/writePdf.js
CHANGED
|
@@ -10,6 +10,7 @@ import { createEmbeddedFontType0, createEmbeddedFontType1 } from './writePdfFont
|
|
|
10
10
|
import { opt } from '../containers/app.js';
|
|
11
11
|
import { pageMetricsArr } from '../containers/dataContainer.js';
|
|
12
12
|
import ocr from '../objects/ocrObjects.js';
|
|
13
|
+
import { getStyleLookup } from '../utils/miscUtils.js';
|
|
13
14
|
|
|
14
15
|
/**
|
|
15
16
|
* @param {number} x
|
|
@@ -97,6 +98,7 @@ export async function writePdf(hocrArr, minpage = 0, maxpage = -1, textMode = 'e
|
|
|
97
98
|
normal: useOpt && FontCont.opt?.[familyKeyI]?.normal ? FontCont.opt[familyKeyI].normal : FontCont.raw[familyKeyI].normal,
|
|
98
99
|
italic: useOpt && FontCont.opt?.[familyKeyI]?.italic ? FontCont.opt[familyKeyI].italic : FontCont.raw[familyKeyI].italic,
|
|
99
100
|
bold: useOpt && FontCont.opt?.[familyKeyI]?.bold ? FontCont.opt[familyKeyI].bold : FontCont.raw[familyKeyI].bold,
|
|
101
|
+
boldItalic: useOpt && FontCont.opt?.[familyKeyI]?.boldItalic ? FontCont.opt[familyKeyI].boldItalic : FontCont.raw[familyKeyI].boldItalic,
|
|
100
102
|
};
|
|
101
103
|
await addFamilyObj(familyKeyI, familyObjI);
|
|
102
104
|
}
|
|
@@ -301,6 +303,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
301
303
|
|
|
302
304
|
const pdfFontsUsed = new Set();
|
|
303
305
|
|
|
306
|
+
const underlines = /** @type {Array<{left: number, right: number, top: number, height: number, fontSize: number, bold: boolean}>} */ ([]);
|
|
307
|
+
|
|
304
308
|
// Start 1st object: Text Content
|
|
305
309
|
let textContentObjStr = '';
|
|
306
310
|
|
|
@@ -349,7 +353,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
349
353
|
let wordFontOpentype = (wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype);
|
|
350
354
|
|
|
351
355
|
if (!wordFontOpentype) {
|
|
352
|
-
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
|
|
356
|
+
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${getStyleLookup(wordJ.style)})`;
|
|
353
357
|
console.log(`Skipping word due to missing font (${fontNameMessage})`);
|
|
354
358
|
continue;
|
|
355
359
|
}
|
|
@@ -359,7 +363,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
359
363
|
let wordFontSize = word0Metrics.fontSize;
|
|
360
364
|
|
|
361
365
|
// Set font and font size
|
|
362
|
-
const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
|
|
366
|
+
const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)];
|
|
363
367
|
pdfFontNameCurrent = pdfFontCurrent.name;
|
|
364
368
|
pdfFontTypeCurrent = pdfFontCurrent.type;
|
|
365
369
|
pdfFontsUsed.add(pdfFontCurrent);
|
|
@@ -372,7 +376,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
372
376
|
const word0LeftBearing = wordJ.visualCoords ? word0Metrics.leftSideBearing : 0;
|
|
373
377
|
|
|
374
378
|
let tz = 100;
|
|
375
|
-
if (wordJ.dropcap) {
|
|
379
|
+
if (wordJ.style.dropcap) {
|
|
376
380
|
const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
|
|
377
381
|
tz = (wordWidthActual / word0Metrics.visualWidth) * 100;
|
|
378
382
|
}
|
|
@@ -406,6 +410,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
406
410
|
let spacingAdj = 0;
|
|
407
411
|
let kernSpacing = false;
|
|
408
412
|
let wordLast = wordJ;
|
|
413
|
+
let underlineLeft = /** @type {?number} */ null;
|
|
414
|
+
let underlineRight = /** @type {?number} */ null;
|
|
409
415
|
let wordFontOpentypeLast = wordFontOpentype;
|
|
410
416
|
let fontSizeLast = wordFontSize;
|
|
411
417
|
let tsCurrent = 0;
|
|
@@ -426,7 +432,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
426
432
|
wordFontOpentype = wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype;
|
|
427
433
|
|
|
428
434
|
if (!wordFontOpentype) {
|
|
429
|
-
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
|
|
435
|
+
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${getStyleLookup(wordJ.style)})`;
|
|
430
436
|
console.log(`Skipping word due to missing font (${fontNameMessage})`);
|
|
431
437
|
continue;
|
|
432
438
|
}
|
|
@@ -446,11 +452,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
446
452
|
fillColor = wordJ.matchTruth ? '0 1 0.5 rg' : '1 0 0 rg';
|
|
447
453
|
}
|
|
448
454
|
|
|
449
|
-
const angleAdjWord = wordJ.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
|
|
455
|
+
const angleAdjWord = wordJ.style.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
|
|
450
456
|
const angleAdjWordX = (rotateBackground && Math.abs(angle ?? 0) > 0.05) ? angleAdjWord.x : 0;
|
|
451
457
|
|
|
452
458
|
let ts = 0;
|
|
453
|
-
if (wordJ.sup || wordJ.dropcap) {
|
|
459
|
+
if (wordJ.style.sup || wordJ.style.dropcap) {
|
|
454
460
|
ts = (lineObj.bbox.bottom + lineObj.baseline[1] + angleAdjLine.y) - (wordJ.bbox.bottom + angleAdjLine.y + angleAdjWord.y);
|
|
455
461
|
if (!wordJ.visualCoords) {
|
|
456
462
|
const fontDesc = wordFont.opentype.descender / wordFont.opentype.unitsPerEm * wordMetrics.fontSize;
|
|
@@ -462,12 +468,12 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
462
468
|
|
|
463
469
|
// TODO: This probably fails for Chinese, rethink.
|
|
464
470
|
tz = 100;
|
|
465
|
-
if (wordJ.dropcap) {
|
|
471
|
+
if (wordJ.style.dropcap) {
|
|
466
472
|
const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
|
|
467
473
|
tz = (wordWidthActual / wordMetrics.visualWidth) * 100;
|
|
468
474
|
}
|
|
469
475
|
|
|
470
|
-
const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
|
|
476
|
+
const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)];
|
|
471
477
|
const pdfFontName = pdfFont.name;
|
|
472
478
|
const pdfFontType = pdfFont.type;
|
|
473
479
|
pdfFontsUsed.add(pdfFont);
|
|
@@ -480,7 +486,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
480
486
|
// The space between words determined by:
|
|
481
487
|
// (1) The right bearing of the last word, (2) the left bearing of the current word, (3) the width of the space character between words,
|
|
482
488
|
// (4) the current character spacing value (applied twice--both before and after the space character).
|
|
483
|
-
const
|
|
489
|
+
const spaceAdvance = wordFontOpentypeLast.charToGlyph(' ').advanceWidth || wordFontOpentypeLast.unitsPerEm / 2;
|
|
490
|
+
const spaceWidthGlyph = spaceAdvance * (fontSizeLast / wordFontOpentypeLast.unitsPerEm);
|
|
484
491
|
|
|
485
492
|
const wordSpaceExpectedPx = (spaceWidthGlyph + charSpacingLast * 2 + wordRightBearingLast) + wordLeftBearing;
|
|
486
493
|
|
|
@@ -503,10 +510,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
503
510
|
// However, this assumption does not hold for single-character words, as there is no space between character to adjust.
|
|
504
511
|
// Therefore, we calculate the difference between the rendered and actual word and apply an adjustment to the width of the next space.
|
|
505
512
|
// (This does not apply to drop caps as those have horizontal scaling applied to exactly match the image.)
|
|
506
|
-
if (charArr.length === 1 && !wordJ.dropcap) {
|
|
513
|
+
if (charArr.length === 1 && !wordJ.style.dropcap) {
|
|
507
514
|
const wordLastGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
|
|
508
515
|
const wordLastGlyphMetrics = wordLastGlyph.getMetrics();
|
|
509
|
-
const
|
|
516
|
+
const lastCharAdvance = wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth || wordFontOpentype.unitsPerEm / 2;
|
|
517
|
+
const lastCharWidth = lastCharAdvance * (wordFontSize / wordFontOpentype.unitsPerEm);
|
|
510
518
|
spacingAdj = wordWidthAdj - lastCharWidth - angleAdjWordX;
|
|
511
519
|
} else {
|
|
512
520
|
spacingAdj = 0 - angleAdjWordX;
|
|
@@ -514,7 +522,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
514
522
|
|
|
515
523
|
textContentObjStr += ' ] TJ\n';
|
|
516
524
|
|
|
517
|
-
const fontSize = wordJ.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
525
|
+
const fontSize = wordJ.style.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
518
526
|
if (pdfFontName !== pdfFontNameCurrent || fontSize !== fontSizeLast) {
|
|
519
527
|
textContentObjStr += `${pdfFontName} ${String(fontSize)} Tf\n`;
|
|
520
528
|
pdfFontNameCurrent = pdfFontName;
|
|
@@ -541,8 +549,8 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
541
549
|
// Non-ASCII and special characters are encoded/escaped using winEncodingLookup
|
|
542
550
|
for (let k = 0; k < charArr.length; k++) {
|
|
543
551
|
const letterSrc = charArr[k];
|
|
544
|
-
const letter = wordJ.smallCaps ? charArr[k].toUpperCase() : charArr[k];
|
|
545
|
-
const fontSizeLetter = wordJ.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
552
|
+
const letter = wordJ.style.smallCaps ? charArr[k].toUpperCase() : charArr[k];
|
|
553
|
+
const fontSizeLetter = wordJ.style.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
546
554
|
|
|
547
555
|
const letterEnc = pdfFontTypeCurrent === 0 ? wordFontOpentype.charToGlyphIndex(letter)?.toString(16).padStart(4, '0') : winEncodingLookup[letter];
|
|
548
556
|
if (letterEnc) {
|
|
@@ -611,6 +619,28 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
611
619
|
}
|
|
612
620
|
}
|
|
613
621
|
|
|
622
|
+
if (wordJ.style.underline && underlineLeft === null) {
|
|
623
|
+
underlineLeft = wordJ.bbox.left;
|
|
624
|
+
}
|
|
625
|
+
|
|
626
|
+
if (wordJ.style.underline) {
|
|
627
|
+
underlineRight = wordJ.bbox.right;
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
if (underlineLeft !== null && (!wordJ.style.underline || j === words.length - 1)) {
|
|
631
|
+
underlines.push({
|
|
632
|
+
left: underlineLeft,
|
|
633
|
+
right: underlineRight,
|
|
634
|
+
top: lineTopAdj,
|
|
635
|
+
height: lineObj.bbox.bottom - lineObj.bbox.top,
|
|
636
|
+
fontSize: wordFontSize,
|
|
637
|
+
bold: wordJ.style.bold,
|
|
638
|
+
});
|
|
639
|
+
|
|
640
|
+
underlineLeft = null;
|
|
641
|
+
underlineRight = null;
|
|
642
|
+
}
|
|
643
|
+
|
|
614
644
|
wordLast = wordJ;
|
|
615
645
|
wordRightBearingLast = wordLast.visualCoords ? wordMetrics.rightSideBearing : 0;
|
|
616
646
|
wordFontOpentypeLast = wordFontOpentype;
|
|
@@ -622,5 +652,13 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
622
652
|
|
|
623
653
|
textContentObjStr += 'ET';
|
|
624
654
|
|
|
655
|
+
// Add underlines
|
|
656
|
+
underlines.forEach((underline) => {
|
|
657
|
+
const underlineThickness = underline.bold ? Math.ceil(underline.fontSize / 12) : Math.ceil(underline.fontSize / 24);
|
|
658
|
+
const underlineOffset = Math.ceil(underline.fontSize / 12) + underlineThickness;
|
|
659
|
+
|
|
660
|
+
textContentObjStr += `\n${String(underline.left)} ${String(outputDims.height - underline.top - underlineOffset)} ${String(underline.right - underline.left)} ${underlineThickness} re\nf\n`;
|
|
661
|
+
});
|
|
662
|
+
|
|
625
663
|
return { textContentObjStr, pdfFontsUsed };
|
|
626
664
|
}
|
package/js/export/writePdfFonts.js
CHANGED
|
@@ -108,12 +108,12 @@ const generateFontFlags = (serif, italic, smallcap, symbolic) => { /* eslint-dis
|
|
|
108
108
|
*
|
|
109
109
|
* @param {opentype.Font} font - Opentype.js font object
|
|
110
110
|
* @param {number} objIndex - Index for font descriptor PDF object
|
|
111
|
-
* @param {
|
|
111
|
+
* @param {boolean} italic
|
|
112
112
|
* @param {?number} embeddedObjIndex - Index for embedded font file PDF object.
|
|
113
113
|
* If not provided, the font will not be embedded in the PDF.
|
|
114
114
|
* @returns {string} The font descriptor object string.
|
|
115
115
|
*/
|
|
116
|
-
function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex = null) {
|
|
116
|
+
function createFontDescriptor(font, objIndex, italic, embeddedObjIndex = null) {
|
|
117
117
|
let objOut = `${String(objIndex)} 0 obj\n<</Type/FontDescriptor`;
|
|
118
118
|
|
|
119
119
|
const namesTable = font.names.windows || font.names;
|
|
@@ -155,7 +155,7 @@ function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex
|
|
|
155
155
|
|
|
156
156
|
// Symbolic is always set to false, even if the font contains glyphs outside the Adobe standard Latin character set.
|
|
157
157
|
// This is because symbolic fonts are only used when embedded, and this does not appear to matter for embedded fonts.
|
|
158
|
-
objOut += `/Flags ${String(generateFontFlags(serif,
|
|
158
|
+
objOut += `/Flags ${String(generateFontFlags(serif, italic, false, false))}`;
|
|
159
159
|
|
|
160
160
|
if (embeddedObjIndex === null || embeddedObjIndex === undefined) {
|
|
161
161
|
objOut += '>>\nendobj\n\n';
|
|
@@ -175,12 +175,12 @@ function createFontDescriptor(font, objIndex, style = 'normal', embeddedObjIndex
|
|
|
175
175
|
*
|
|
176
176
|
* @param {opentype.Font} font - Opentype.js font object
|
|
177
177
|
* @param {number} firstObjIndex - Index for the first PDF object
|
|
178
|
-
* @param {
|
|
178
|
+
* @param {boolean} [italic=false] - Whether the font is italic.
|
|
179
179
|
* @param {boolean} [isStandardFont=false] - Whether the font is a standard font.
|
|
180
180
|
* Standard fonts are not embedded in the PDF.
|
|
181
181
|
* @returns {Array<string>}
|
|
182
182
|
*/
|
|
183
|
-
export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', isStandardFont = false) {
|
|
183
|
+
export function createEmbeddedFontType1(font, firstObjIndex, italic = false, isStandardFont = false) {
|
|
184
184
|
// Start 1st object: Font Dictionary
|
|
185
185
|
let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type1`;
|
|
186
186
|
|
|
@@ -193,7 +193,8 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
|
|
|
193
193
|
|
|
194
194
|
fontDictObjStr += '/Widths[';
|
|
195
195
|
for (let i = 0; i < win1252Chars.length; i++) {
|
|
196
|
-
const
|
|
196
|
+
const advance = font.charToGlyph(win1252Chars[i]).advanceWidth || font.unitsPerEm;
|
|
197
|
+
const advanceNorm = Math.round(advance * (1000 / font.unitsPerEm));
|
|
197
198
|
fontDictObjStr += `${String(advanceNorm)} `;
|
|
198
199
|
}
|
|
199
200
|
fontDictObjStr += ']/FirstChar 32/LastChar 255';
|
|
@@ -201,7 +202,7 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
|
|
|
201
202
|
fontDictObjStr += `/FontDescriptor ${String(firstObjIndex + 1)} 0 R>>\nendobj\n\n`;
|
|
202
203
|
|
|
203
204
|
// Start 2nd object: Font Descriptor
|
|
204
|
-
const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1,
|
|
205
|
+
const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, isStandardFont ? null : firstObjIndex + 2);
|
|
205
206
|
|
|
206
207
|
// objOut += `${String(firstObjIndex + 1)} 0 obj\n<</Type/FontDescriptor`;
|
|
207
208
|
|
|
@@ -249,13 +250,14 @@ export function createEmbeddedFontType1(font, firstObjIndex, style = 'normal', i
|
|
|
249
250
|
*
|
|
250
251
|
* @param {opentype.Font} font - Opentype.js font object
|
|
251
252
|
* @param {number} firstObjIndex - Index for the first PDF object
|
|
253
|
+
* @param {boolean} [italic=false] - Whether the font is italic.
|
|
252
254
|
*
|
|
253
255
|
* This function does not produce "toUnicode" or "Widths" objects,
|
|
254
256
|
* so any PDF it creates directly will lack usable copy/paste.
|
|
255
257
|
* However, both of these objects will be created from the embedded file
|
|
256
258
|
* when the result is run through mupdf.
|
|
257
259
|
*/
|
|
258
|
-
export function createEmbeddedFontType0(font, firstObjIndex, style = 'normal') {
|
|
260
|
+
export function createEmbeddedFontType0(font, firstObjIndex, italic = false) {
|
|
259
261
|
// Start 1st object: Font Dictionary
|
|
260
262
|
let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<</Type/Font/Subtype/Type0`;
|
|
261
263
|
|
|
@@ -282,7 +284,7 @@ export function createEmbeddedFontType0(font, firstObjIndex, style = 'normal') {
|
|
|
282
284
|
toUnicodeStr += '\nendstream\nendobj\n\n';
|
|
283
285
|
|
|
284
286
|
// Start 3rd object: FontDescriptor
|
|
285
|
-
const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1,
|
|
287
|
+
const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, firstObjIndex + 3);
|
|
286
288
|
|
|
287
289
|
// objOut += `${String(firstObjIndex + 2)} 0 obj\n`;
|
|
288
290
|
|
package/js/export/writeTabular.js
CHANGED
|
@@ -86,9 +86,9 @@ function createCellsSingle(ocrTableWords, extraCols = [], startRow = 0, xlsxMode
|
|
|
86
86
|
|
|
87
87
|
if (xlsxMode) {
|
|
88
88
|
let fontStyle;
|
|
89
|
-
if (wordObj.style
|
|
89
|
+
if (wordObj.style.italic) {
|
|
90
90
|
fontStyle = '<i/>';
|
|
91
|
-
} else if (wordObj.smallCaps) {
|
|
91
|
+
} else if (wordObj.style.smallCaps) {
|
|
92
92
|
fontStyle = '<smallCaps/>';
|
|
93
93
|
} else {
|
|
94
94
|
fontStyle = '';
|
package/js/export/writeText.js
CHANGED
|
@@ -54,17 +54,21 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
54
54
|
|
|
55
55
|
if (docxMode) {
|
|
56
56
|
let fontStyle = '';
|
|
57
|
-
if (wordObj.style
|
|
57
|
+
if (wordObj.style.italic) {
|
|
58
58
|
fontStyle += '<w:i/>';
|
|
59
|
-
} else if (wordObj.style
|
|
59
|
+
} else if (wordObj.style.bold) {
|
|
60
60
|
fontStyle += '<w:b/>';
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
-
if (wordObj.smallCaps) {
|
|
63
|
+
if (wordObj.style.smallCaps) {
|
|
64
64
|
fontStyle += '<w:smallCaps/>';
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
-
if (wordObj.
|
|
67
|
+
if (wordObj.style.underline) {
|
|
68
|
+
fontStyle += '<w:u w:val="single"/>';
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
if (wordObj.style.sup) {
|
|
68
72
|
fontStyle += '<w:vertAlign w:val="superscript"/>';
|
|
69
73
|
}
|
|
70
74
|
|
|
@@ -79,7 +83,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
79
83
|
} else if (supPrev) {
|
|
80
84
|
textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve"> `;
|
|
81
85
|
// If this word is a superscript, no space is added between words.
|
|
82
|
-
} else if (wordObj.sup && i > 0) {
|
|
86
|
+
} else if (wordObj.style.sup && i > 0) {
|
|
83
87
|
textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
84
88
|
} else {
|
|
85
89
|
textStr = `${textStr} </w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
@@ -89,7 +93,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
89
93
|
}
|
|
90
94
|
|
|
91
95
|
fontStylePrev = fontStyle;
|
|
92
|
-
supPrev = wordObj.sup;
|
|
96
|
+
supPrev = wordObj.style.sup;
|
|
93
97
|
} else if (newLine) {
|
|
94
98
|
textStr = `${textStr}\n`;
|
|
95
99
|
} else if (h > 0 || g > 0 || i > 0) {
|
package/js/extractTables.js
CHANGED
|
@@ -22,11 +22,11 @@ export function extractTableContent(pageObj, layoutObj) {
|
|
|
22
22
|
|
|
23
23
|
// TODO: This currently creates junk rows with only punctuation, as those bounding boxes are so small they often do not overlap with other lines.
|
|
24
24
|
/**
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
25
|
+
* Extracts words from a page that are within the bounding boxes of the table, organized into arrays of rows and columns.
|
|
26
|
+
* The output is in the form of a 3D array, where the first dimension is the row, the second dimension is the column, and the third dimension is the word.
|
|
27
|
+
* @param {OcrPage} pageObj
|
|
28
|
+
* @param {Array<import('./objects/layoutObjects.js').LayoutBoxBase>} boxes
|
|
29
|
+
*/
|
|
30
30
|
export function extractSingleTableContent(pageObj, boxes) {
|
|
31
31
|
/** @type {Array<OcrWord>} */
|
|
32
32
|
const wordArr = [];
|
package/js/fontContainerMain.js
CHANGED
|
@@ -24,103 +24,145 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
|
|
|
24
24
|
let /** @type {Promise<ArrayBuffer>} */carlitoNormal;
|
|
25
25
|
let /** @type {Promise<ArrayBuffer>} */carlitoItalic;
|
|
26
26
|
let /** @type {Promise<ArrayBuffer>} */carlitoBold;
|
|
27
|
+
let /** @type {Promise<ArrayBuffer>} */carlitoBoldItalic;
|
|
27
28
|
let /** @type {Promise<ArrayBuffer>} */centuryNormal;
|
|
28
29
|
let /** @type {Promise<ArrayBuffer>} */centuryItalic;
|
|
29
30
|
let /** @type {Promise<ArrayBuffer>} */centuryBold;
|
|
31
|
+
let /** @type {Promise<ArrayBuffer>} */centuryBoldItalic;
|
|
30
32
|
let /** @type {Promise<ArrayBuffer>} */garamondNormal;
|
|
31
33
|
let /** @type {Promise<ArrayBuffer>} */garamondItalic;
|
|
32
34
|
let /** @type {Promise<ArrayBuffer>} */garamondBold;
|
|
35
|
+
let /** @type {Promise<ArrayBuffer>} */garamondBoldItalic;
|
|
33
36
|
let /** @type {Promise<ArrayBuffer>} */palatinoNormal;
|
|
34
37
|
let /** @type {Promise<ArrayBuffer>} */palatinoItalic;
|
|
35
38
|
let /** @type {Promise<ArrayBuffer>} */palatinoBold;
|
|
39
|
+
let /** @type {Promise<ArrayBuffer>} */palatinoBoldItalic;
|
|
36
40
|
let /** @type {Promise<ArrayBuffer>} */nimbusRomanNormal;
|
|
37
41
|
let /** @type {Promise<ArrayBuffer>} */nimbusRomanItalic;
|
|
38
42
|
let /** @type {Promise<ArrayBuffer>} */nimbusRomanBold;
|
|
43
|
+
let /** @type {Promise<ArrayBuffer>} */nimbusRomanBoldItalic;
|
|
39
44
|
let /** @type {Promise<ArrayBuffer>} */nimbusSansNormal;
|
|
40
45
|
let /** @type {Promise<ArrayBuffer>} */nimbusSansItalic;
|
|
41
46
|
let /** @type {Promise<ArrayBuffer>} */nimbusSansBold;
|
|
47
|
+
let /** @type {Promise<ArrayBuffer>} */nimbusSansBoldItalic;
|
|
42
48
|
let /** @type {Promise<ArrayBuffer>} */nimbusMonoNormal;
|
|
43
49
|
let /** @type {Promise<ArrayBuffer>} */nimbusMonoItalic;
|
|
44
50
|
let /** @type {Promise<ArrayBuffer>} */nimbusMonoBold;
|
|
51
|
+
let /** @type {Promise<ArrayBuffer>} */nimbusMonoBoldItalic;
|
|
45
52
|
if (typeof process === 'undefined') {
|
|
46
53
|
if (glyphSet === 'latin') {
|
|
47
54
|
carlitoNormal = fetch(new URL('../fonts/latin/Carlito-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
48
55
|
carlitoItalic = fetch(new URL('../fonts/latin/Carlito-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
49
56
|
carlitoBold = fetch(new URL('../fonts/latin/Carlito-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
57
|
+
carlitoBoldItalic = fetch(new URL('../fonts/latin/Carlito-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
50
58
|
centuryNormal = fetch(new URL('../fonts/latin/Century-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
51
59
|
centuryItalic = fetch(new URL('../fonts/latin/Century-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
52
60
|
centuryBold = fetch(new URL('../fonts/latin/Century-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
61
|
+
centuryBoldItalic = fetch(new URL('../fonts/latin/Century-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
53
62
|
garamondNormal = fetch(new URL('../fonts/latin/Garamond-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
54
63
|
garamondItalic = fetch(new URL('../fonts/latin/Garamond-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
55
64
|
garamondBold = fetch(new URL('../fonts/latin/Garamond-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
65
|
+
garamondBoldItalic = fetch(new URL('../fonts/latin/Garamond-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
56
66
|
palatinoNormal = fetch(new URL('../fonts/latin/Palatino-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
57
67
|
palatinoItalic = fetch(new URL('../fonts/latin/Palatino-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
58
68
|
palatinoBold = fetch(new URL('../fonts/latin/Palatino-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
69
|
+
palatinoBoldItalic = fetch(new URL('../fonts/latin/Palatino-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
59
70
|
nimbusRomanNormal = fetch(new URL('../fonts/latin/NimbusRoman-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
60
71
|
nimbusRomanItalic = fetch(new URL('../fonts/latin/NimbusRoman-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
61
72
|
nimbusRomanBold = fetch(new URL('../fonts/latin/NimbusRoman-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
73
|
+
nimbusRomanBoldItalic = fetch(new URL('../fonts/latin/NimbusRoman-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
62
74
|
nimbusSansNormal = fetch(new URL('../fonts/latin/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
63
75
|
nimbusSansItalic = fetch(new URL('../fonts/latin/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
64
76
|
nimbusSansBold = fetch(new URL('../fonts/latin/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
77
|
+
nimbusSansBoldItalic = fetch(new URL('../fonts/latin/NimbusSans-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
65
78
|
nimbusMonoNormal = fetch(new URL('../fonts/latin/NimbusMono-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
66
79
|
nimbusMonoItalic = fetch(new URL('../fonts/latin/NimbusMono-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
67
80
|
nimbusMonoBold = fetch(new URL('../fonts/latin/NimbusMono-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
81
|
+
nimbusMonoBoldItalic = fetch(new URL('../fonts/latin/NimbusMono-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
68
82
|
} else {
|
|
69
83
|
carlitoNormal = fetch(new URL('../fonts/all/Carlito-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
70
84
|
carlitoItalic = fetch(new URL('../fonts/all/Carlito-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
71
85
|
carlitoBold = fetch(new URL('../fonts/all/Carlito-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
86
|
+
carlitoBoldItalic = fetch(new URL('../fonts/all/Carlito-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
72
87
|
centuryNormal = fetch(new URL('../fonts/all/Century-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
73
88
|
centuryItalic = fetch(new URL('../fonts/all/Century-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
74
89
|
centuryBold = fetch(new URL('../fonts/all/Century-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
90
|
+
centuryBoldItalic = fetch(new URL('../fonts/all/Century-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
75
91
|
garamondNormal = fetch(new URL('../fonts/all/Garamond-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
76
92
|
garamondItalic = fetch(new URL('../fonts/all/Garamond-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
77
93
|
garamondBold = fetch(new URL('../fonts/all/Garamond-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
94
|
+
garamondBoldItalic = fetch(new URL('../fonts/all/Garamond-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
78
95
|
palatinoNormal = fetch(new URL('../fonts/all/Palatino-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
79
96
|
palatinoItalic = fetch(new URL('../fonts/all/Palatino-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
80
97
|
palatinoBold = fetch(new URL('../fonts/all/Palatino-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
98
|
+
palatinoBoldItalic = fetch(new URL('../fonts/all/Palatino-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
81
99
|
nimbusRomanNormal = fetch(new URL('../fonts/all/NimbusRoman-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
82
100
|
nimbusRomanItalic = fetch(new URL('../fonts/all/NimbusRoman-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
83
101
|
nimbusRomanBold = fetch(new URL('../fonts/all/NimbusRoman-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
102
|
+
nimbusRomanBoldItalic = fetch(new URL('../fonts/all/NimbusRoman-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
84
103
|
nimbusSansNormal = fetch(new URL('../fonts/all/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
85
104
|
nimbusSansItalic = fetch(new URL('../fonts/all/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
86
105
|
nimbusSansBold = fetch(new URL('../fonts/all/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
106
|
+
nimbusSansBoldItalic = fetch(new URL('../fonts/all/NimbusSans-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
87
107
|
nimbusMonoNormal = fetch(new URL('../fonts/all/NimbusMono-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
88
108
|
nimbusMonoItalic = fetch(new URL('../fonts/all/NimbusMono-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
89
109
|
nimbusMonoBold = fetch(new URL('../fonts/all/NimbusMono-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
110
|
+
nimbusMonoBoldItalic = fetch(new URL('../fonts/all/NimbusMono-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
90
111
|
}
|
|
91
112
|
} else {
|
|
92
113
|
const { readFile } = await import('fs/promises');
|
|
93
114
|
carlitoNormal = readFile(new URL('../fonts/all/Carlito-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
94
115
|
carlitoItalic = readFile(new URL('../fonts/all/Carlito-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
95
116
|
carlitoBold = readFile(new URL('../fonts/all/Carlito-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
117
|
+
carlitoBoldItalic = readFile(new URL('../fonts/all/Carlito-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
96
118
|
centuryNormal = readFile(new URL('../fonts/all/Century-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
97
119
|
centuryItalic = readFile(new URL('../fonts/all/Century-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
98
120
|
centuryBold = readFile(new URL('../fonts/all/Century-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
121
|
+
centuryBoldItalic = readFile(new URL('../fonts/all/Century-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
99
122
|
garamondNormal = readFile(new URL('../fonts/all/Garamond-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
100
123
|
garamondItalic = readFile(new URL('../fonts/all/Garamond-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
101
124
|
garamondBold = readFile(new URL('../fonts/all/Garamond-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
125
|
+
garamondBoldItalic = readFile(new URL('../fonts/all/Garamond-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
102
126
|
palatinoNormal = readFile(new URL('../fonts/all/Palatino-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
103
127
|
palatinoItalic = readFile(new URL('../fonts/all/Palatino-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
104
128
|
palatinoBold = readFile(new URL('../fonts/all/Palatino-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
129
|
+
palatinoBoldItalic = readFile(new URL('../fonts/all/Palatino-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
105
130
|
nimbusRomanNormal = readFile(new URL('../fonts/all/NimbusRoman-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
106
131
|
nimbusRomanItalic = readFile(new URL('../fonts/all/NimbusRoman-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
107
132
|
nimbusRomanBold = readFile(new URL('../fonts/all/NimbusRoman-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
133
|
+
nimbusRomanBoldItalic = readFile(new URL('../fonts/all/NimbusRoman-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
108
134
|
nimbusSansNormal = readFile(new URL('../fonts/all/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
109
135
|
nimbusSansItalic = readFile(new URL('../fonts/all/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
110
136
|
nimbusSansBold = readFile(new URL('../fonts/all/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
137
|
+
nimbusSansBoldItalic = readFile(new URL('../fonts/all/NimbusSans-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
111
138
|
nimbusMonoNormal = readFile(new URL('../fonts/all/NimbusMono-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
112
139
|
nimbusMonoItalic = readFile(new URL('../fonts/all/NimbusMono-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
113
140
|
nimbusMonoBold = readFile(new URL('../fonts/all/NimbusMono-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
141
|
+
nimbusMonoBoldItalic = readFile(new URL('../fonts/all/NimbusMono-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
114
142
|
}
|
|
115
143
|
|
|
116
144
|
const srcObj = {
|
|
117
|
-
Carlito: {
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
145
|
+
Carlito: {
|
|
146
|
+
normal: await carlitoNormal, italic: await carlitoItalic, bold: await carlitoBold, boldItalic: await carlitoBoldItalic,
|
|
147
|
+
},
|
|
148
|
+
Century: {
|
|
149
|
+
normal: await centuryNormal, italic: await centuryItalic, bold: await centuryBold, boldItalic: await centuryBoldItalic,
|
|
150
|
+
},
|
|
151
|
+
Garamond: {
|
|
152
|
+
normal: await garamondNormal, italic: await garamondItalic, bold: await garamondBold, boldItalic: await garamondBoldItalic,
|
|
153
|
+
},
|
|
154
|
+
Palatino: {
|
|
155
|
+
normal: await palatinoNormal, italic: await palatinoItalic, bold: await palatinoBold, boldItalic: await palatinoBoldItalic,
|
|
156
|
+
},
|
|
157
|
+
NimbusRoman: {
|
|
158
|
+
normal: await nimbusRomanNormal, italic: await nimbusRomanItalic, bold: await nimbusRomanBold, boldItalic: await nimbusRomanBoldItalic,
|
|
159
|
+
},
|
|
160
|
+
NimbusSans: {
|
|
161
|
+
normal: await nimbusSansNormal, italic: await nimbusSansItalic, bold: await nimbusSansBold, boldItalic: await nimbusSansBoldItalic,
|
|
162
|
+
},
|
|
163
|
+
NimbusMono: {
|
|
164
|
+
normal: await nimbusMonoNormal, italic: await nimbusMonoItalic, bold: await nimbusMonoBold, boldItalic: await nimbusMonoBoldItalic,
|
|
165
|
+
},
|
|
124
166
|
};
|
|
125
167
|
|
|
126
168
|
FontCont.raw = await /** @type {FontContainer} */(/** @type {any} */(loadFontsFromSource(srcObj)));
|
|
@@ -217,6 +259,7 @@ export async function updateFontContWorkerMain(params = {}) {
|
|
|
217
259
|
};
|
|
218
260
|
if (value.italic) input.src[key].italic = value.italic.src;
|
|
219
261
|
if (value.bold) input.src[key].bold = value.bold.src;
|
|
262
|
+
if (value.boldItalic) input.src[key].boldItalic = value.boldItalic.src;
|
|
220
263
|
}
|
|
221
264
|
|
|
222
265
|
for (let i = 0; i < gs.schedulerInner.workers.length; i++) {
|
package/js/fontStatistics.js
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
import {
|
|
5
5
|
determineSansSerif,
|
|
6
|
+
getStyleLookup,
|
|
6
7
|
quantile,
|
|
7
8
|
replaceObjectProperties,
|
|
8
9
|
round6,
|
|
@@ -243,13 +244,13 @@ function calcFontMetricsPage(pageObj) {
|
|
|
243
244
|
|
|
244
245
|
for (const lineObj of pageObj.lines) {
|
|
245
246
|
for (const wordObj of lineObj.words) {
|
|
246
|
-
const wordFontFamily = determineSansSerif(wordObj.font) || 'Default';
|
|
247
|
+
const wordFontFamily = determineSansSerif(wordObj.style.font) || 'Default';
|
|
247
248
|
|
|
248
249
|
// This condition should not occur, however has in the past due to parsing bugs. Skipping to avoid entire program crashing if this occurs.
|
|
249
250
|
if (wordObj.chars && wordObj.chars.length !== wordObj.text.length) continue;
|
|
250
251
|
|
|
251
252
|
// Do not include superscripts, dropcaps, and low-confidence words in statistics for font optimization.
|
|
252
|
-
if (wordObj.conf < 80 || wordObj.lang === 'chi_sim' || wordObj.sup || wordObj.smallCaps) continue;
|
|
253
|
+
if (wordObj.conf < 80 || wordObj.lang === 'chi_sim' || wordObj.style.sup || wordObj.style.smallCaps) continue;
|
|
253
254
|
/** @type {Object.<string, FontMetricsRawFamily>} */
|
|
254
255
|
const fontMetricsRawLine = {};
|
|
255
256
|
|
|
@@ -275,14 +276,18 @@ function calcFontMetricsPage(pageObj) {
|
|
|
275
276
|
fontMetricsRawLine[wordFontFamily] = new FontMetricsRawFamily();
|
|
276
277
|
}
|
|
277
278
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
279
|
+
const styleLookup = getStyleLookup(wordObj.style);
|
|
280
|
+
|
|
281
|
+
if (!['normal', 'italic', 'bold'].includes(styleLookup)) continue;
|
|
282
|
+
|
|
283
|
+
if (!fontMetricsRawLine[wordFontFamily][styleLookup].width[charUnicode]) {
|
|
284
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].width[charUnicode] = [];
|
|
285
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].height[charUnicode] = [];
|
|
281
286
|
}
|
|
282
287
|
|
|
283
|
-
fontMetricsRawLine[wordFontFamily][
|
|
284
|
-
fontMetricsRawLine[wordFontFamily][
|
|
285
|
-
fontMetricsRawLine[wordFontFamily][
|
|
288
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].width[charUnicode].push(charWidth / charNorm);
|
|
289
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].height[charUnicode].push(charHeight / charNorm);
|
|
290
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].obs += 1;
|
|
286
291
|
|
|
287
292
|
if (k + 1 < wordObj.chars.length) {
|
|
288
293
|
const charObjNext = wordObj.chars[k + 1];
|
|
@@ -295,12 +300,12 @@ function calcFontMetricsPage(pageObj) {
|
|
|
295
300
|
if (trailingSpace + charWidthNext > 0) {
|
|
296
301
|
const bigramUnicode = `${charUnicode},${wordObj.chars[k + 1].text.charCodeAt(0)}`;
|
|
297
302
|
|
|
298
|
-
if (!fontMetricsRawLine[wordFontFamily][
|
|
299
|
-
fontMetricsRawLine[wordFontFamily][
|
|
300
|
-
fontMetricsRawLine[wordFontFamily][
|
|
303
|
+
if (!fontMetricsRawLine[wordFontFamily][styleLookup].kerning[bigramUnicode]) {
|
|
304
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].kerning[bigramUnicode] = [];
|
|
305
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].kerning2[bigramUnicode] = [];
|
|
301
306
|
}
|
|
302
|
-
fontMetricsRawLine[wordFontFamily][
|
|
303
|
-
fontMetricsRawLine[wordFontFamily][
|
|
307
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].kerning[bigramUnicode].push(trailingSpace / charNorm);
|
|
308
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].kerning2[bigramUnicode].push((trailingSpace + charWidthNext) / charNorm);
|
|
304
309
|
}
|
|
305
310
|
}
|
|
306
311
|
}
|