scribe.js-ocr 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -35,6 +35,7 @@ When using Scribe.js in the browser, all files must be served from the same orig
35
35
  The following are template repos showing how Scribe.js can be used within various frameworks/build systems.
36
36
 
37
37
  - Browser with ESM (no build): https://github.com/scribeocr/scribe.js-example-esm-browser
38
+ - Browser with Next.js: https://github.com/scribeocr/scribe.js-example-next.js
38
39
  - Browser with Webpack 5: https://github.com/scribeocr/scribe.js-example-webpack5
39
40
  - Browser with Vue.js v2: https://github.com/scribeocr/scribe.js-example-vue2
40
41
 
@@ -57,6 +57,10 @@ export class opt {
57
57
 
58
58
  /** Generate debug visualizations when running OCR. */
59
59
  static debugVis = false;
60
+
61
+ static extractPDFFonts = false;
62
+
63
+ static calcSuppFontInfo = false;
60
64
  }
61
65
 
62
66
  export class inputData {
@@ -67,6 +67,8 @@ export function loadFontFace(fontFamily, fontStyle, fontWeight, src) {
67
67
 
68
68
  const fontFace = new FontFace(fontFamily, src1, { style: fontStyle, weight: fontWeight });
69
69
 
70
+ if (fontFace.status === 'error') throw new Error(`FontFace failed to load: ${fontFamily} ${fontStyle} ${fontWeight}`);
71
+
70
72
  // Fonts are stored in `document.fonts` for the main thread and `WorkerGlobalScope.fonts` for workers
71
73
  const fontSet = globalThis.document ? globalThis.document.fonts : globalThis.fonts;
72
74
 
@@ -157,6 +159,10 @@ export function FontContainerFont(family, style, src, opt, opentypeObj) {
157
159
  /** @type {("sans"|"serif")} */
158
160
  this.type = determineSansSerif(this.family) === 'SansDefault' ? 'sans' : 'serif';
159
161
  this.smallCapsMult = 0.75;
162
+ /**
163
+ * @type {boolean} - Disable font. This is used to prevent a flawed font extracted from a PDF from being used.
164
+ */
165
+ this.disable = false;
160
166
 
161
167
  if (typeof FontFace !== 'undefined') loadFontFace(this.fontFaceName, this.fontFaceStyle, this.fontFaceWeight, this.src);
162
168
  }
@@ -228,6 +234,9 @@ export class FontCont {
228
234
  /** @type {?FontContainer} */
229
235
  static opt = null;
230
236
 
237
+ /** @type {?Object<string, FontContainerFamilyUpload>} */
238
+ static doc = null;
239
+
231
240
  /** @type {?FontContainer} */
232
241
  static export = null;
233
242
 
@@ -298,6 +307,10 @@ export class FontCont {
298
307
  * @returns {FontContainerFont}
299
308
  */
300
309
  static getFont = (family, style = 'normal', lang = 'eng') => {
310
+ if (FontCont.doc?.[family]?.[style] && !FontCont.doc?.[family]?.[style]?.disable) {
311
+ return FontCont.doc[family][style];
312
+ }
313
+
301
314
  if (lang === 'chi_sim') {
302
315
  if (!FontCont.supp.chi_sim) throw new Error('chi_sim font does not exist.');
303
316
  return FontCont.supp.chi_sim;
@@ -6,7 +6,7 @@ import { initMuPDFWorker } from '../../mupdf/mupdf-async.js';
6
6
 
7
7
  import { getImageBitmap } from '../utils/imageUtils.js';
8
8
 
9
- import { setUploadFontsWorker } from '../fontContainerMain.js';
9
+ import { updateFontContWorkerMain } from '../fontContainerMain.js';
10
10
  import { pageMetricsArr } from './dataContainer.js';
11
11
  import {
12
12
  FontCont,
@@ -16,7 +16,7 @@ import {
16
16
 
17
17
  import { gs } from '../generalWorkerMain.js';
18
18
  import { imageUtils } from '../objects/imageObjects.js';
19
- import { determineSansSerif, range } from '../utils/miscUtils.js';
19
+ import { range } from '../utils/miscUtils.js';
20
20
  import { opt } from './app.js';
21
21
 
22
22
  let skipTextMode = false;
@@ -256,12 +256,12 @@ export class ImageCache {
256
256
  // If no preference is specified for upscaling, default to false.
257
257
  const upscaleArg = props?.upscaled || false;
258
258
 
259
- const scheduler = await gs.getGeneralScheduler();
259
+ await gs.getGeneralScheduler();
260
260
 
261
261
  const resPromise = (async () => {
262
262
  // Wait for non-rotated version before replacing with promise
263
263
  if (typeof process === 'undefined') await gs.initTesseract({ anyOk: true });
264
- return scheduler.recognize({
264
+ return gs.recognize({
265
265
  image: inputImage.src,
266
266
  options: { rotateRadians: angleArg, upscale: upscaleArg },
267
267
  output: {
@@ -525,7 +525,7 @@ export class ImageCache {
525
525
 
526
526
  // For reasons that are unclear, a small number of pages have been rendered into massive files
527
527
  // so a hard-cap on resolution must be imposed.
528
- const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
528
+ const pageDPI = ImageCache.pdfDims300.map((x) => 300 * Math.min(x.width, 3500) / x.width);
529
529
 
530
530
  // In addition to capping the resolution, also switch the width/height
531
531
  ImageCache.pdfDims300.forEach((x, i) => {
@@ -534,42 +534,61 @@ export class ImageCache {
534
534
  });
535
535
 
536
536
  // WIP: Extract fonts embedded in PDFs.
537
- if (false) {
537
+ // This feature is disabled by default as the results are often bad.
538
+ // In addition to only working for certain font formats, fonts embedded in PDFs are often subsetted and/or corrupted.
539
+ // Therefore, before this is enabled by default, more sophisticated rules regarding when fonts should be used are needed.
540
+ if (opt.extractPDFFonts) {
538
541
  muPDFScheduler.extractAllFonts().then(async (x) => {
539
- globalImageCache.fontArr = [];
540
542
  for (let i = 0; i < x.length; i++) {
541
543
  const src = x[i].buffer;
542
- const fontObj = await loadOpentype(src);
543
- const fontNameEmbedded = fontObj.names.postScriptName.en;
544
- const fontFamilyEmbedded = fontObj.names?.fontFamily?.en || fontNameEmbedded.replace(/-\w+$/, '');
544
+ let fontObj;
545
+ let fontData;
546
+ try {
547
+ fontObj = await loadOpentype(src);
548
+ // It is common for raw fonts embedded in PDFs to be invalid and rejected by the OTS, but running them through opentype.js fixes them.
549
+ // This appears to be because of the way that fonts are subsetted in PDFs.
550
+ fontData = fontObj.toArrayBuffer();
551
+ } catch (error) {
552
+ console.error(`Error loading font ${i}.`);
553
+ console.error(error);
554
+ continue;
555
+ }
545
556
 
546
- // Skip bold and bold-italic fonts for now.
547
- if (fontNameEmbedded.match(/bold/i)) continue;
557
+ const fontNameEmbedded = fontObj.names.postScriptName.en;
548
558
 
549
559
  let fontStyle = 'normal';
550
560
  if (fontNameEmbedded.match(/italic/i)) {
551
561
  fontStyle = 'italic';
552
562
  } else if (fontNameEmbedded.match(/bold/i)) {
553
- // Bold fonts should be enabled at some later point.
554
- // While we previously found that we were unable to detect bold fonts reliably,
555
- // when importing from PDFs, we do not need to guess.
556
- // fontStyle = 'bold';
563
+ fontStyle = 'bold';
557
564
  }
558
- const type = determineSansSerif(fontFamilyEmbedded) === 'SansDefault' ? 'sans' : 'serif';
559
-
560
- // mupdf replaces spaces with underscores in font names.
561
- const fontName = fontFamilyEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');
562
565
 
563
- if (!FontCont.raw[fontName]) {
564
- FontCont.raw[fontName] = {};
565
- }
566
-
567
- if (!FontCont.raw[fontName][fontStyle]) {
568
- FontCont.raw[fontName][fontStyle] = new FontContainerFont(fontName, fontStyle, src, false, fontObj);
566
+ // mupdf makes changes to font names, so we need to do the same.
567
+ // Font names in the form `MEDJCO+CenturySchoolbook` are changed to `CenturySchoolbook`.
568
+ // Spaces are replaced with underscores.
569
+ const fontName = fontNameEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');
570
+
571
+ if (!FontCont.doc?.[fontName]?.[fontStyle]) {
572
+ try {
573
+ const fontContainer = new FontContainerFont(fontName, fontStyle, fontData, false, fontObj);
574
+
575
+ if (!FontCont.doc) {
576
+ FontCont.doc = {};
577
+ }
578
+
579
+ if (!FontCont.doc[fontName]) {
580
+ FontCont.doc[fontName] = {};
581
+ }
582
+
583
+ FontCont.doc[fontName][fontStyle] = fontContainer;
584
+ } catch (error) {
585
+ console.error(`Error loading font ${fontName} ${fontStyle}.`);
586
+ }
587
+ } else {
588
+ console.warn(`Font ${fontName} ${fontStyle} already exists.`);
569
589
  }
570
590
  }
571
-
572
- await setUploadFontsWorker(gs.schedulerInner);
591
+ await updateFontContWorkerMain();
573
592
  });
574
593
  }
575
594
  };
package/js/debug.js CHANGED
@@ -114,25 +114,11 @@ export async function drawDebugImages(args) {
114
114
  export async function renderPageStatic(page) {
115
115
  const image = await ImageCache.getNative(page.n, { rotated: opt.autoRotate, upscaled: false });
116
116
 
117
- // The Node.js canvas package does not currently support worker threads
118
- // https://github.com/Automattic/node-canvas/issues/1394
119
- let res;
120
- if (!(typeof process === 'undefined')) {
121
- const { renderPageStaticImp } = await import('./worker/compareOCRModule.js');
122
- res = await renderPageStaticImp({
123
- page,
124
- image,
125
- angle: pageMetricsArr[page.n].angle,
126
- });
127
- // Browser case
128
- } else {
129
- if (!gs.scheduler) throw new Error('GeneralScheduler must be defined before this function can run.');
130
- res = await gs.scheduler.renderPageStaticImp({
131
- page,
132
- image,
133
- angle: pageMetricsArr[page.n].angle,
134
- });
135
- }
117
+ const res = gs.renderPageStaticImp({
118
+ page,
119
+ image,
120
+ angle: pageMetricsArr[page.n].angle,
121
+ });
136
122
 
137
123
  return res;
138
124
  }
@@ -3,7 +3,7 @@ import { layoutRegions, ocrAll, pageMetricsArr } from '../containers/dataContain
3
3
  import { ImageCache } from '../containers/imageContainer.js';
4
4
  import { reorderOcrPage } from '../modifyOCR.js';
5
5
  import { saveAs } from '../utils/miscUtils.js';
6
- import { hocrToPDF } from './exportPDF.js';
6
+ import { renderPDF } from './exportPDF.js';
7
7
  import { renderHOCR } from './exportRenderHOCR.js';
8
8
  import { renderText } from './exportRenderText.js';
9
9
 
@@ -60,7 +60,7 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
60
60
  // and assume that the overlay PDF is the same size as the input images.
61
61
  // The `maxpage` argument must be set manually to `inputData.pageCount-1`, as this avoids an error in the case where there is no OCR data (`hocrDownload` has length 0).
62
62
  // In all other cases, this should be equivalent to using the default argument of `-1` (which results in `hocrDownload.length` being used).
63
- const pdfStr = await hocrToPDF(ocrDownload, 0, inputData.pageCount - 1, opt.displayMode, rotateText, rotateBackground,
63
+ const pdfStr = await renderPDF(ocrDownload, 0, inputData.pageCount - 1, opt.displayMode, rotateText, rotateBackground,
64
64
  { width: -1, height: -1 }, opt.confThreshHigh, opt.confThreshMed, opt.overlayOpacity / 100);
65
65
 
66
66
  const enc = new TextEncoder();
@@ -142,7 +142,7 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
142
142
  });
143
143
  }
144
144
  } else {
145
- const pdfStr = await hocrToPDF(ocrDownload, minValue, maxValue, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
145
+ const pdfStr = await renderPDF(ocrDownload, minValue, maxValue, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
146
146
  opt.overlayOpacity / 100);
147
147
 
148
148
  // The PDF is still run through muPDF, even though in eBook mode no background layer is added.
@@ -31,7 +31,7 @@ import ocr from '../objects/ocrObjects.js';
31
31
  *
32
32
  * A valid PDF will be created if an empty array is provided for `hocrArr`, as long as `maxpage` is set manually.
33
33
  */
34
- export async function hocrToPDF(hocrArr, minpage = 0, maxpage = -1, textMode = 'ebook', rotateText = false, rotateBackground = false,
34
+ export async function renderPDF(hocrArr, minpage = 0, maxpage = -1, textMode = 'ebook', rotateText = false, rotateBackground = false,
35
35
  dimsLimit = { width: -1, height: -1 }, confThreshHigh = 85, confThreshMed = 75, proofOpacity = 0.8) {
36
36
  if (!FontCont.raw) throw new Error('No fonts loaded.');
37
37
 
@@ -52,13 +52,8 @@ export async function hocrToPDF(hocrArr, minpage = 0, maxpage = -1, textMode = '
52
52
  /** @type {Array<string>} */
53
53
  const pdfFontObjStrArr = [];
54
54
  let pdfFontsStr = '';
55
- for (const familyKey of Object.keys(FontCont.raw)) {
56
- const useOpt = FontCont.useOptFamily(familyKey);
57
- const familyObj = {
58
- normal: useOpt && FontCont.opt?.[familyKey]?.normal ? FontCont.opt[familyKey].normal : FontCont.raw[familyKey].normal,
59
- italic: useOpt && FontCont.opt?.[familyKey]?.italic ? FontCont.opt[familyKey].italic : FontCont.raw[familyKey].italic,
60
- bold: useOpt && FontCont.opt?.[familyKey]?.bold ? FontCont.opt[familyKey].bold : FontCont.raw[familyKey].bold,
61
- };
55
+
56
+ const addFamilyObj = async (familyKey, familyObj) => {
62
57
  pdfFonts[familyKey] = {};
63
58
  for (const [key, value] of Object.entries(familyObj)) {
64
59
  const font = await value.opentype;
@@ -87,6 +82,22 @@ export async function hocrToPDF(hocrArr, minpage = 0, maxpage = -1, textMode = '
87
82
  pdfFontsStr += `/F${String(fontI)} ${String(objectThis)} 0 R\n`;
88
83
  fontI++;
89
84
  }
85
+ };
86
+
87
+ for (const familyKeyI of Object.keys(FontCont.raw)) {
88
+ const useOpt = FontCont.useOptFamily(familyKeyI);
89
+ const familyObjI = {
90
+ normal: useOpt && FontCont.opt?.[familyKeyI]?.normal ? FontCont.opt[familyKeyI].normal : FontCont.raw[familyKeyI].normal,
91
+ italic: useOpt && FontCont.opt?.[familyKeyI]?.italic ? FontCont.opt[familyKeyI].italic : FontCont.raw[familyKeyI].italic,
92
+ bold: useOpt && FontCont.opt?.[familyKeyI]?.bold ? FontCont.opt[familyKeyI].bold : FontCont.raw[familyKeyI].bold,
93
+ };
94
+ await addFamilyObj(familyKeyI, familyObjI);
95
+ }
96
+
97
+ if (FontCont.doc) {
98
+ for (const familyKeyI of Object.keys(FontCont.doc)) {
99
+ await addFamilyObj(familyKeyI, FontCont.doc[familyKeyI]);
100
+ }
90
101
  }
91
102
 
92
103
  /** @type {?import('opentype.js').Font} */
@@ -308,13 +319,13 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
308
319
  const { baseline } = lineObj;
309
320
  const linebox = lineObj.bbox;
310
321
 
311
- let firstWord = words[0];
322
+ let wordJ = words[0];
312
323
 
313
324
  let fillColor = '0 0 0 rg';
314
325
  if (textMode === 'proof') {
315
- if (firstWord.conf > confThreshHigh) {
326
+ if (wordJ.conf > confThreshHigh) {
316
327
  fillColor = '0 1 0.5 rg';
317
- } else if (firstWord.conf > confThreshMed) {
328
+ } else if (wordJ.conf > confThreshMed) {
318
329
  fillColor = '1 0.8 0 rg';
319
330
  } else {
320
331
  fillColor = '1 0 0 rg';
@@ -327,41 +338,41 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
327
338
 
328
339
  textContentObjStr += `${fillColor}\n`;
329
340
 
330
- let wordFont = FontCont.getWordFont(firstWord);
341
+ let wordFont = FontCont.getWordFont(wordJ);
331
342
 
332
343
  // The Chinese font is subset to only relevant characters, the others currently are not.
333
- let wordFontOpentype = (firstWord.lang === 'chi_sim' ? fontChiSim : wordFont.opentype);
344
+ let wordFontOpentype = (wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype);
334
345
 
335
346
  if (!wordFontOpentype) {
336
- const fontNameMessage = firstWord.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${firstWord.style})`;
347
+ const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
337
348
  console.log(`Skipping word due to missing font (${fontNameMessage})`);
338
349
  continue;
339
350
  }
340
351
 
341
352
  // let wordFontSize = calcWordFontSize(word);
342
353
 
343
- const word0Metrics = calcWordMetrics(firstWord, angle);
354
+ const word0Metrics = calcWordMetrics(wordJ, angle);
344
355
 
345
356
  let wordFontSize = word0Metrics.fontSize;
346
357
 
347
358
  // Set font and font size
348
- ({ name: pdfFontCurrent, type: pdfFontTypeCurrent } = firstWord.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][firstWord.style]);
359
+ ({ name: pdfFontCurrent, type: pdfFontTypeCurrent } = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style]);
349
360
 
350
361
  textContentObjStr += `${pdfFontCurrent} ${String(wordFontSize)} Tf\n`;
351
362
 
352
363
  // Reset baseline to line baseline
353
364
  textContentObjStr += '0 Ts\n';
354
365
 
355
- const word0LeftBearing = firstWord.visualCoords ? word0Metrics.leftSideBearing : 0;
366
+ const word0LeftBearing = wordJ.visualCoords ? word0Metrics.leftSideBearing : 0;
356
367
 
357
368
  let tz = 100;
358
- if (firstWord.dropcap) {
359
- const wordWidthActual = firstWord.bbox.right - firstWord.bbox.left;
369
+ if (wordJ.dropcap) {
370
+ const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
360
371
  tz = (wordWidthActual / word0Metrics.visualWidth) * 100;
361
372
  }
362
373
 
363
374
  // Move to next line
364
- const lineLeftAdj = firstWord.bbox.left - word0LeftBearing * (tz / 100) + angleAdjLine.x;
375
+ const lineLeftAdj = wordJ.bbox.left - word0LeftBearing * (tz / 100) + angleAdjLine.x;
365
376
  const lineTopAdj = linebox.bottom + baseline[1] + angleAdjLine.y;
366
377
 
367
378
  if (rotateText) {
@@ -379,7 +390,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
379
390
  let charSpacingLast = 0;
380
391
  let spacingAdj = 0;
381
392
  let kernSpacing = false;
382
- let wordLast = firstWord;
393
+ let wordLast = wordJ;
383
394
  let wordFontOpentypeLast = wordFontOpentype;
384
395
  let fontSizeLast = wordFontSize;
385
396
  let tsCurrent = 0;
@@ -387,27 +398,27 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
387
398
  let charLig = false;
388
399
 
389
400
  for (let j = 0; j < words.length; j++) {
390
- firstWord = words[j];
401
+ wordJ = words[j];
391
402
 
392
- const wordMetrics = calcWordMetrics(firstWord, angle);
403
+ const wordMetrics = calcWordMetrics(wordJ, angle);
393
404
  wordFontSize = wordMetrics.fontSize;
394
405
  const charSpacing = wordMetrics.charSpacing;
395
406
  const charArr = wordMetrics.charArr;
396
- const wordLeftBearing = firstWord.visualCoords ? wordMetrics.leftSideBearing : 0;
407
+ const wordLeftBearing = wordJ.visualCoords ? wordMetrics.leftSideBearing : 0;
397
408
  const kerningArr = wordMetrics.kerningArr;
398
409
 
399
- wordFont = FontCont.getWordFont(firstWord);
400
- wordFontOpentype = firstWord.lang === 'chi_sim' ? fontChiSim : wordFont.opentype;
410
+ wordFont = FontCont.getWordFont(wordJ);
411
+ wordFontOpentype = wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype;
401
412
 
402
413
  if (!wordFontOpentype) {
403
- const fontNameMessage = firstWord.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${firstWord.style})`;
414
+ const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
404
415
  console.log(`Skipping word due to missing font (${fontNameMessage})`);
405
416
  continue;
406
417
  }
407
418
 
408
419
  fillColor = '0 0 0 rg';
409
420
  if (textMode === 'proof') {
410
- const wordConf = firstWord.conf;
421
+ const wordConf = wordJ.conf;
411
422
 
412
423
  if (wordConf > confThreshHigh) {
413
424
  fillColor = '0 1 0.5 rg';
@@ -417,34 +428,35 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
417
428
  fillColor = '1 0 0 rg';
418
429
  }
419
430
  } else if (textMode === 'eval') {
420
- fillColor = firstWord.matchTruth ? '0 1 0.5 rg' : '1 0 0 rg';
431
+ fillColor = wordJ.matchTruth ? '0 1 0.5 rg' : '1 0 0 rg';
421
432
  }
422
433
 
423
- const angleAdjWord = firstWord.sup ? ocr.calcWordAngleAdj(firstWord) : { x: 0, y: 0 };
434
+ const angleAdjWord = wordJ.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
424
435
  const angleAdjWordX = (rotateBackground && Math.abs(angle ?? 0) > 0.05) ? angleAdjWord.x : 0;
425
436
 
426
- // TODO: Test whether the math here is correct for drop caps.
427
437
  let ts = 0;
428
- if (firstWord.sup) {
429
- ts = (linebox.bottom + baseline[1] + angleAdjLine.y) - (firstWord.bbox.bottom + angleAdjLine.y + angleAdjWord.y);
430
- } else if (firstWord.dropcap) {
431
- ts = (linebox.bottom + baseline[1]) - firstWord.bbox.bottom + angleAdjLine.y + angleAdjWord.y;
438
+ if (wordJ.sup || wordJ.dropcap) {
439
+ ts = (linebox.bottom + baseline[1] + angleAdjLine.y) - (wordJ.bbox.bottom + angleAdjLine.y + angleAdjWord.y);
440
+ if (!wordJ.visualCoords) {
441
+ const fontDesc = wordFont.opentype.descender / wordFont.opentype.unitsPerEm * wordMetrics.fontSize;
442
+ ts -= fontDesc;
443
+ }
432
444
  } else {
433
445
  ts = 0;
434
446
  }
435
447
 
436
448
  // TODO: This probably fails for Chinese, rethink.
437
449
  tz = 100;
438
- if (firstWord.dropcap) {
439
- const wordWidthActual = firstWord.bbox.right - firstWord.bbox.left;
450
+ if (wordJ.dropcap) {
451
+ const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
440
452
  tz = (wordWidthActual / wordMetrics.visualWidth) * 100;
441
453
  }
442
454
 
443
455
  // const pdfFont = word.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFontFamily][word.style];
444
- const { name: pdfFont, type: pdfFontType } = firstWord.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][firstWord.style];
456
+ const { name: pdfFont, type: pdfFontType } = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
445
457
 
446
- const wordWidthAdj = (firstWord.bbox.right - firstWord.bbox.left) / cosAngle;
447
- const wordSpaceAdj = (firstWord.bbox.left - wordBoxLast.right) / cosAngle;
458
+ const wordWidthAdj = (wordJ.bbox.right - wordJ.bbox.left) / cosAngle;
459
+ const wordSpaceAdj = (wordJ.bbox.left - wordBoxLast.right) / cosAngle;
448
460
 
449
461
  // Add space character between words
450
462
  if (j > 0 && !kernSpacing) {
@@ -468,13 +480,13 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
468
480
  }
469
481
  kernSpacing = false;
470
482
 
471
- wordBoxLast = firstWord.bbox;
483
+ wordBoxLast = wordJ.bbox;
472
484
 
473
485
  // In general, we assume that (given our adjustments to character spacing) the rendered word has the same width as the image of that word.
474
486
  // However, this assumption does not hold for single-character words, as there is no space between characters to adjust.
475
487
  // Therefore, we calculate the difference between the rendered and actual word and apply an adjustment to the width of the next space.
476
488
  // (This does not apply to drop caps as those have horizontal scaling applied to exactly match the image.)
477
- if (charArr.length === 1 && !firstWord.dropcap) {
489
+ if (charArr.length === 1 && !wordJ.dropcap) {
478
490
  const wordLastGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
479
491
  const wordLastGlyphMetrics = wordLastGlyph.getMetrics();
480
492
  const lastCharWidth = (wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth) * (wordFontSize / wordFontOpentype.unitsPerEm);
@@ -485,7 +497,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
485
497
 
486
498
  textContentObjStr += ' ] TJ\n';
487
499
 
488
- const fontSize = firstWord.smallCaps && firstWord.text[0] && firstWord.text[0] !== firstWord.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
500
+ const fontSize = wordJ.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
489
501
  if (pdfFont !== pdfFontCurrent || fontSize !== fontSizeLast) {
490
502
  textContentObjStr += `${pdfFont} ${String(fontSize)} Tf\n`;
491
503
  pdfFontCurrent = pdfFont;
@@ -512,23 +524,23 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
512
524
  // Non-ASCII and special characters are encoded/escaped using winEncodingLookup
513
525
  for (let k = 0; k < charArr.length; k++) {
514
526
  const letterSrc = charArr[k];
515
- const letter = firstWord.smallCaps ? charArr[k].toUpperCase() : charArr[k];
516
- const fontSizeLetter = firstWord.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
527
+ const letter = wordJ.smallCaps ? charArr[k].toUpperCase() : charArr[k];
528
+ const fontSizeLetter = wordJ.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
517
529
 
518
530
  const letterEnc = pdfFontTypeCurrent === 0 ? wordFontOpentype.charToGlyphIndex(letter)?.toString(16).padStart(4, '0') : winEncodingLookup[letter];
519
531
  if (letterEnc) {
520
532
  let kern = (kerningArr[k] || 0) * (-1000 / fontSizeLetter);
521
533
 
522
- if (firstWord.lang === 'chi_sim' && j + 1 < words.length && words[j + 1].lang === 'chi_sim') {
534
+ if (wordJ.lang === 'chi_sim' && j + 1 < words.length && words[j + 1].lang === 'chi_sim') {
523
535
  kernSpacing = true;
524
536
  const wordNext = words[j + 1];
525
- const wordSpaceNextAdj = (wordNext.bbox.left - firstWord.bbox.right) / cosAngle;
537
+ const wordSpaceNextAdj = (wordNext.bbox.left - wordJ.bbox.right) / cosAngle;
526
538
  // const wordSpaceNextAdj = wordNext.bbox.left - wordBox.right;
527
539
 
528
540
  const wordGlyphMetrics = wordFontOpentype.charToGlyph(charArr.at(-1)).getMetrics();
529
541
  const wordNextGlyphMetrics = wordFontOpentype.charToGlyph(wordNext.text.substr(0, 1)).getMetrics();
530
542
 
531
- const wordRightBearing = firstWord.visualCoords ? wordGlyphMetrics.rightSideBearing * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
543
+ const wordRightBearing = wordJ.visualCoords ? wordGlyphMetrics.rightSideBearing * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
532
544
 
533
545
  const wordNextLeftBearing = wordNext.visualCoords ? wordNextGlyphMetrics.xMin * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
534
546
 
@@ -581,7 +593,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
581
593
  }
582
594
  }
583
595
 
584
- wordLast = firstWord;
596
+ wordLast = wordJ;
585
597
  wordRightBearingLast = wordLast.visualCoords ? wordMetrics.rightSideBearing : 0;
586
598
  wordFontOpentypeLast = wordFontOpentype;
587
599
  charSpacingLast = charSpacing;
@@ -11,8 +11,10 @@ import { assignParagraphs } from '../utils/reflowPars.js';
11
11
  * @param {number} maxpage - The last page to include in the document.
12
12
  * @param {boolean} reflowText - Remove line breaks within what appears to be the same paragraph.
13
13
  * @param {boolean} docxMode - Create XML for a word document rather than plain text.
14
+ * @param {?Array<string>} wordIds - An array of word IDs to include in the document.
15
+ * If omitted, all words are included.
14
16
  */
15
- export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false) {
17
+ export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false, wordIds = null) {
16
18
  let textStr = '';
17
19
 
18
20
  if (maxpage === -1) maxpage = ocrCurrent.length - 1;
@@ -48,6 +50,8 @@ export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = f
48
50
  const wordObj = lineObj.words[i];
49
51
  if (!wordObj) continue;
50
52
 
53
+ if (wordIds && !wordIds.includes(wordObj.id)) continue;
54
+
51
55
  if (docxMode) {
52
56
  let fontStyle = '';
53
57
  if (wordObj.style === 'italic') {
@@ -1,6 +1,6 @@
1
+ import { ocrAll, ocrAllRaw } from './containers/dataContainer.js';
1
2
  import { ImageCache } from './containers/imageContainer.js';
2
- import { convertOCRAll } from './recognizeConvert.js';
3
- import { ocrAllRaw, ocrAll } from './containers/dataContainer.js';
3
+ import { convertOCR } from './recognizeConvert.js';
4
4
 
5
5
  /**
6
6
  * Extract raw text content from currently loaded PDF.
@@ -21,7 +21,7 @@ const extractInternalPDFTextRaw = async () => {
21
21
  };
22
22
 
23
23
  const stextArr = /** @type {Array<string>} */ ([]);
24
- const pageDPI = ImageCache.pdfDims300.map((x) => 300 * 2000 / x.width, 2000);
24
+ const pageDPI = ImageCache.pdfDims300.map((x) => 300 * Math.min(x.width, 3500) / x.width);
25
25
  const resArr = pageDPI.map(async (x, i) => {
26
26
  // While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
27
27
  // The XML format is the only built-in mupdf format that includes character-level granularity.
@@ -53,7 +53,7 @@ const extractInternalPDFTextRaw = async () => {
53
53
  // (1) The total number of letters is at least 100 per page on average.
54
54
  // (2) The number of pages containing text is at least half of the total number of pages.
55
55
  } else if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
56
- && pdfContentStats.letterCountVis >= ImageCache.pageCount / 2) {
56
+ && pdfContentStats.pageCountTotalText >= ImageCache.pageCount / 2) {
57
57
  type = 'ocr';
58
58
  // Otherwise, the PDF is considered image-native.
59
59
  // This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
@@ -102,7 +102,7 @@ export const extractInternalPDFText = async (options = {}) => {
102
102
  const format = 'stext';
103
103
 
104
104
  // Process HOCR using web worker, reading from file first if that has not been done already
105
- await convertOCRAll(ocrAllRaw.active, true, format, 'pdf', false);
105
+ await convertOCR(ocrAllRaw.active, true, format, 'pdf', false);
106
106
 
107
107
  res.content = ocrAll.pdf;
108
108
 
@@ -183,16 +183,19 @@ export async function enableFontOpt(enableOpt, forceOpt) {
183
183
  * Set `loadRaw` to `true` or `false` to force the raw fonts to be loaded or not loaded, respectively.
184
184
  * @param {boolean} [params.loadOpt] - By default, optimized fonts are loaded if they have not been loaded before.
185
185
  * Set `loadOpt` to `true` or `false` to force the optimized fonts to be loaded or not loaded, respectively.
186
+ * @param {boolean} [params.loadDoc] - By default, fonts extracted from PDF documents are loaded if they have not been loaded before.
187
+ * Set `loadDoc` to `true` or `false` to force the document fonts to be loaded or not loaded, respectively.
186
188
  * @param {boolean} [params.updateProps]
187
189
  */
188
190
  export async function updateFontContWorkerMain(params = {}) {
189
- const loadRaw = params.loadRaw === true || (params.loadRaw !== false && FontCont.raw && !gs.loadedBuiltInRawWorker);
190
- const loadOpt = params.loadOpt === true || (params.loadOpt !== false && FontCont.opt && !gs.loadedBuiltInOptWorker);
191
+ const loadRaw = params.loadRaw === true || (params.loadRaw !== false && FontCont.raw && !gs.loadedBuiltInFontsRawWorker);
192
+ const loadOpt = params.loadOpt === true || (params.loadOpt !== false && FontCont.opt && !gs.loadedBuiltInFontsOptWorker);
193
+ const loadDoc = params.loadDoc === true || (params.loadDoc !== false && FontCont.doc && !gs.loadedBuiltInFontsDocWorker);
191
194
 
192
195
  // If the active font data is not already loaded, load it now.
193
196
  // This assumes that only one version of the raw/optimized fonts ever exist--
194
197
  // it does not check whether the current optimized font changed since it was last loaded.
195
- for (const [type, load] of [['raw', loadRaw], ['opt', loadOpt]]) {
198
+ for (const [type, load] of [['raw', loadRaw], ['opt', loadOpt], ['doc', loadDoc]]) {
196
199
  if (!load) continue;
197
200
 
198
201
  const resArr = [];
@@ -214,9 +217,11 @@ export async function updateFontContWorkerMain(params = {}) {
214
217
 
215
218
  // TODO: consider the race condition when `setBuiltInFontsWorkers` is called multiple times quickly and `loadFontsWorker` is still running.
216
219
  if (type === 'opt') {
217
- gs.loadedBuiltInOptWorker = true;
218
- } else {
219
- gs.loadedBuiltInRawWorker = true;
220
+ gs.loadedBuiltInFontsOptWorker = true;
221
+ } else if (type === 'raw') {
222
+ gs.loadedBuiltInFontsRawWorker = true;
223
+ } else if (type === 'doc') {
224
+ gs.loadedBuiltInFontsDocWorker = true;
220
225
  }
221
226
  }
222
227
  await Promise.all(resArr);
@@ -321,8 +326,6 @@ export function setDefaultFontAuto(fontMetricsObj) {
321
326
  * @param {Object.<string, FontMetricsFamily>} fontMetricsObj
322
327
  */
323
328
  export async function optimizeFontContainerFamily(fontFamily, fontMetricsObj) {
324
- if (!gs.scheduler) throw new Error('GeneralScheduler must be defined before this function can run.');
325
-
326
329
  // When we have metrics for individual fonts families, those are used to optimize the appropriate fonts.
327
330
  // Otherwise, the "default" metric is applied to whatever font the user has selected as the default font.
328
331
  const multiFontMode = checkMultiFontMode(fontMetricsObj);
@@ -342,7 +345,7 @@ export async function optimizeFontContainerFamily(fontFamily, fontMetricsObj) {
342
345
  }
343
346
 
344
347
  const metricsNormal = fontMetricsObj[fontMetricsType][fontFamily.normal.style];
345
- const normalOptFont = gs.scheduler.optimizeFont({ fontData: fontFamily.normal.src, fontMetricsObj: metricsNormal, style: fontFamily.normal.style })
348
+ const normalOptFont = gs.optimizeFont({ fontData: fontFamily.normal.src, fontMetricsObj: metricsNormal, style: fontFamily.normal.style })
346
349
  .then(async (x) => {
347
350
  const font = await loadOpentype(x.fontData, x.kerningPairs);
348
351
  return new FontContainerFont(fontFamily.normal.family, fontFamily.normal.style, x.fontData, true, font);
@@ -352,7 +355,7 @@ export async function optimizeFontContainerFamily(fontFamily, fontMetricsObj) {
352
355
  /** @type {?FontContainerFont|Promise<FontContainerFont>} */
353
356
  let italicOptFont = null;
354
357
  if (metricsItalic && metricsItalic.obs >= 200) {
355
- italicOptFont = gs.scheduler.optimizeFont({ fontData: fontFamily.italic.src, fontMetricsObj: metricsItalic, style: fontFamily.italic.style })
358
+ italicOptFont = gs.optimizeFont({ fontData: fontFamily.italic.src, fontMetricsObj: metricsItalic, style: fontFamily.italic.style })
356
359
  .then(async (x) => {
357
360
  const font = await loadOpentype(x.fontData, x.kerningPairs);
358
361
  return new FontContainerFont(fontFamily.italic.family, fontFamily.italic.style, x.fontData, true, font);