scribe.js-ocr 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/js/containers/app.js +4 -0
- package/js/containers/fontContainer.js +13 -0
- package/js/containers/imageContainer.js +47 -28
- package/js/debug.js +5 -19
- package/js/export/export.js +3 -3
- package/js/export/exportPDF.js +61 -49
- package/js/export/exportRenderText.js +5 -1
- package/js/extractPDFText.js +5 -5
- package/js/fontContainerMain.js +13 -10
- package/js/fontEval.js +7 -25
- package/js/fontSupp.js +165 -0
- package/js/generalWorkerMain.js +122 -84
- package/js/import/convertPageAbbyy.js +1 -6
- package/js/import/convertPageHocr.js +7 -11
- package/js/import/convertPageStext.js +7 -2
- package/js/import/import.js +5 -3
- package/js/objects/ocrObjects.js +14 -0
- package/js/recognizeConvert.js +26 -43
- package/js/utils/fontUtils.js +16 -7
- package/js/utils/miscUtils.js +20 -3
- package/js/worker/compareOCRModule.js +13 -10
- package/js/worker/renderWordCanvas.js +11 -6
- package/mupdf/mupdf-worker.js +1 -2
- package/package.json +1 -1
- package/scribe.js +12 -0
package/README.md
CHANGED
|
@@ -35,6 +35,7 @@ When using Scribe.js in the browser, all files must be served from the same orig
|
|
|
35
35
|
The following are template repos showing how Scribe.js can be used within various frameworks/build systems.
|
|
36
36
|
|
|
37
37
|
- Browser with ESM (no build): https://github.com/scribeocr/scribe.js-example-esm-browser
|
|
38
|
+
- Browser with Next.js: https://github.com/scribeocr/scribe.js-example-next.js
|
|
38
39
|
- Browser with Webpack 5: https://github.com/scribeocr/scribe.js-example-webpack5
|
|
39
40
|
- Browser with Vue.js v2: https://github.com/scribeocr/scribe.js-example-vue2
|
|
40
41
|
|
package/js/containers/app.js
CHANGED
|
@@ -67,6 +67,8 @@ export function loadFontFace(fontFamily, fontStyle, fontWeight, src) {
|
|
|
67
67
|
|
|
68
68
|
const fontFace = new FontFace(fontFamily, src1, { style: fontStyle, weight: fontWeight });
|
|
69
69
|
|
|
70
|
+
if (fontFace.status === 'error') throw new Error(`FontFace failed to load: ${fontFamily} ${fontStyle} ${fontWeight}`);
|
|
71
|
+
|
|
70
72
|
// Fonts are stored in `document.fonts` for the main thread and `WorkerGlobalScope.fonts` for workers
|
|
71
73
|
const fontSet = globalThis.document ? globalThis.document.fonts : globalThis.fonts;
|
|
72
74
|
|
|
@@ -157,6 +159,10 @@ export function FontContainerFont(family, style, src, opt, opentypeObj) {
|
|
|
157
159
|
/** @type {("sans"|"serif")} */
|
|
158
160
|
this.type = determineSansSerif(this.family) === 'SansDefault' ? 'sans' : 'serif';
|
|
159
161
|
this.smallCapsMult = 0.75;
|
|
162
|
+
/**
|
|
163
|
+
* @type {boolean} - Disable font. This is used to prevent a flawed font extracted from a PDF from being used.
|
|
164
|
+
*/
|
|
165
|
+
this.disable = false;
|
|
160
166
|
|
|
161
167
|
if (typeof FontFace !== 'undefined') loadFontFace(this.fontFaceName, this.fontFaceStyle, this.fontFaceWeight, this.src);
|
|
162
168
|
}
|
|
@@ -228,6 +234,9 @@ export class FontCont {
|
|
|
228
234
|
/** @type {?FontContainer} */
|
|
229
235
|
static opt = null;
|
|
230
236
|
|
|
237
|
+
/** @type {?Object<string, FontContainerFamilyUpload>} */
|
|
238
|
+
static doc = null;
|
|
239
|
+
|
|
231
240
|
/** @type {?FontContainer} */
|
|
232
241
|
static export = null;
|
|
233
242
|
|
|
@@ -298,6 +307,10 @@ export class FontCont {
|
|
|
298
307
|
* @returns {FontContainerFont}
|
|
299
308
|
*/
|
|
300
309
|
static getFont = (family, style = 'normal', lang = 'eng') => {
|
|
310
|
+
if (FontCont.doc?.[family]?.[style] && !FontCont.doc?.[family]?.[style]?.disable) {
|
|
311
|
+
return FontCont.doc[family][style];
|
|
312
|
+
}
|
|
313
|
+
|
|
301
314
|
if (lang === 'chi_sim') {
|
|
302
315
|
if (!FontCont.supp.chi_sim) throw new Error('chi_sim font does not exist.');
|
|
303
316
|
return FontCont.supp.chi_sim;
|
|
@@ -6,7 +6,7 @@ import { initMuPDFWorker } from '../../mupdf/mupdf-async.js';
|
|
|
6
6
|
|
|
7
7
|
import { getImageBitmap } from '../utils/imageUtils.js';
|
|
8
8
|
|
|
9
|
-
import {
|
|
9
|
+
import { updateFontContWorkerMain } from '../fontContainerMain.js';
|
|
10
10
|
import { pageMetricsArr } from './dataContainer.js';
|
|
11
11
|
import {
|
|
12
12
|
FontCont,
|
|
@@ -16,7 +16,7 @@ import {
|
|
|
16
16
|
|
|
17
17
|
import { gs } from '../generalWorkerMain.js';
|
|
18
18
|
import { imageUtils } from '../objects/imageObjects.js';
|
|
19
|
-
import {
|
|
19
|
+
import { range } from '../utils/miscUtils.js';
|
|
20
20
|
import { opt } from './app.js';
|
|
21
21
|
|
|
22
22
|
let skipTextMode = false;
|
|
@@ -256,12 +256,12 @@ export class ImageCache {
|
|
|
256
256
|
// If no preference is specified for upscaling, default to false.
|
|
257
257
|
const upscaleArg = props?.upscaled || false;
|
|
258
258
|
|
|
259
|
-
|
|
259
|
+
await gs.getGeneralScheduler();
|
|
260
260
|
|
|
261
261
|
const resPromise = (async () => {
|
|
262
262
|
// Wait for non-rotated version before replacing with promise
|
|
263
263
|
if (typeof process === 'undefined') await gs.initTesseract({ anyOk: true });
|
|
264
|
-
return
|
|
264
|
+
return gs.recognize({
|
|
265
265
|
image: inputImage.src,
|
|
266
266
|
options: { rotateRadians: angleArg, upscale: upscaleArg },
|
|
267
267
|
output: {
|
|
@@ -525,7 +525,7 @@ export class ImageCache {
|
|
|
525
525
|
|
|
526
526
|
// For reasons that are unclear, a small number of pages have been rendered into massive files
|
|
527
527
|
// so a hard-cap on resolution must be imposed.
|
|
528
|
-
const pageDPI = ImageCache.pdfDims300.map((x) => 300 *
|
|
528
|
+
const pageDPI = ImageCache.pdfDims300.map((x) => 300 * Math.min(x.width, 3500) / x.width);
|
|
529
529
|
|
|
530
530
|
// In addition to capping the resolution, also switch the width/height
|
|
531
531
|
ImageCache.pdfDims300.forEach((x, i) => {
|
|
@@ -534,42 +534,61 @@ export class ImageCache {
|
|
|
534
534
|
});
|
|
535
535
|
|
|
536
536
|
// WIP: Extract fonts embedded in PDFs.
|
|
537
|
-
|
|
537
|
+
// This feature is disabled by default as the results are often bad.
|
|
538
|
+
// In addition to only working for certain font formats, fonts embedded in PDFs are often subsetted and/or corrupted.
|
|
539
|
+
// Therefore, before this is enabled by default, more sophisticated rules regarding when fonts should be used are needed.
|
|
540
|
+
if (opt.extractPDFFonts) {
|
|
538
541
|
muPDFScheduler.extractAllFonts().then(async (x) => {
|
|
539
|
-
globalImageCache.fontArr = [];
|
|
540
542
|
for (let i = 0; i < x.length; i++) {
|
|
541
543
|
const src = x[i].buffer;
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
544
|
+
let fontObj;
|
|
545
|
+
let fontData;
|
|
546
|
+
try {
|
|
547
|
+
fontObj = await loadOpentype(src);
|
|
548
|
+
// It is common for raw fonts embedded in PDFs to be invalid and rejected by the OTS, but running them through opentype.js fixes them.
|
|
549
|
+
// This appears to be because of the way that fonts are subsetted in PDFs.
|
|
550
|
+
fontData = fontObj.toArrayBuffer();
|
|
551
|
+
} catch (error) {
|
|
552
|
+
console.error(`Error loading font ${i}.`);
|
|
553
|
+
console.error(error);
|
|
554
|
+
continue;
|
|
555
|
+
}
|
|
545
556
|
|
|
546
|
-
|
|
547
|
-
if (fontNameEmbedded.match(/bold/i)) continue;
|
|
557
|
+
const fontNameEmbedded = fontObj.names.postScriptName.en;
|
|
548
558
|
|
|
549
559
|
let fontStyle = 'normal';
|
|
550
560
|
if (fontNameEmbedded.match(/italic/i)) {
|
|
551
561
|
fontStyle = 'italic';
|
|
552
562
|
} else if (fontNameEmbedded.match(/bold/i)) {
|
|
553
|
-
|
|
554
|
-
// While we previously found that we were unable to detect bold fonts reliably,
|
|
555
|
-
// when importing from PDFs, we do not need to guess.
|
|
556
|
-
// fontStyle = 'bold';
|
|
563
|
+
fontStyle = 'bold';
|
|
557
564
|
}
|
|
558
|
-
const type = determineSansSerif(fontFamilyEmbedded) === 'SansDefault' ? 'sans' : 'serif';
|
|
559
|
-
|
|
560
|
-
// mupdf replaces spaces with underscores in font names.
|
|
561
|
-
const fontName = fontFamilyEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');
|
|
562
565
|
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
566
|
+
// mupdf makes changes to font names, so we need to do the same.
|
|
567
|
+
// Font names in the form `MEDJCO+CenturySchoolbook` are changed to `CenturySchoolbook`.
|
|
568
|
+
// Spaces are replaced with underscores.
|
|
569
|
+
const fontName = fontNameEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');
|
|
570
|
+
|
|
571
|
+
if (!FontCont.doc?.[fontName]?.[fontStyle]) {
|
|
572
|
+
try {
|
|
573
|
+
const fontContainer = new FontContainerFont(fontName, fontStyle, fontData, false, fontObj);
|
|
574
|
+
|
|
575
|
+
if (!FontCont.doc) {
|
|
576
|
+
FontCont.doc = {};
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
if (!FontCont.doc[fontName]) {
|
|
580
|
+
FontCont.doc[fontName] = {};
|
|
581
|
+
}
|
|
582
|
+
|
|
583
|
+
FontCont.doc[fontName][fontStyle] = fontContainer;
|
|
584
|
+
} catch (error) {
|
|
585
|
+
console.error(`Error loading font ${fontName} ${fontStyle}.`);
|
|
586
|
+
}
|
|
587
|
+
} else {
|
|
588
|
+
console.warn(`Font ${fontName} ${fontStyle} already exists.`);
|
|
569
589
|
}
|
|
570
590
|
}
|
|
571
|
-
|
|
572
|
-
await setUploadFontsWorker(gs.schedulerInner);
|
|
591
|
+
await updateFontContWorkerMain();
|
|
573
592
|
});
|
|
574
593
|
}
|
|
575
594
|
};
|
package/js/debug.js
CHANGED
|
@@ -114,25 +114,11 @@ export async function drawDebugImages(args) {
|
|
|
114
114
|
export async function renderPageStatic(page) {
|
|
115
115
|
const image = await ImageCache.getNative(page.n, { rotated: opt.autoRotate, upscaled: false });
|
|
116
116
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
res = await renderPageStaticImp({
|
|
123
|
-
page,
|
|
124
|
-
image,
|
|
125
|
-
angle: pageMetricsArr[page.n].angle,
|
|
126
|
-
});
|
|
127
|
-
// Browser case
|
|
128
|
-
} else {
|
|
129
|
-
if (!gs.scheduler) throw new Error('GeneralScheduler must be defined before this function can run.');
|
|
130
|
-
res = await gs.scheduler.renderPageStaticImp({
|
|
131
|
-
page,
|
|
132
|
-
image,
|
|
133
|
-
angle: pageMetricsArr[page.n].angle,
|
|
134
|
-
});
|
|
135
|
-
}
|
|
117
|
+
const res = gs.renderPageStaticImp({
|
|
118
|
+
page,
|
|
119
|
+
image,
|
|
120
|
+
angle: pageMetricsArr[page.n].angle,
|
|
121
|
+
});
|
|
136
122
|
|
|
137
123
|
return res;
|
|
138
124
|
}
|
package/js/export/export.js
CHANGED
|
@@ -3,7 +3,7 @@ import { layoutRegions, ocrAll, pageMetricsArr } from '../containers/dataContain
|
|
|
3
3
|
import { ImageCache } from '../containers/imageContainer.js';
|
|
4
4
|
import { reorderOcrPage } from '../modifyOCR.js';
|
|
5
5
|
import { saveAs } from '../utils/miscUtils.js';
|
|
6
|
-
import {
|
|
6
|
+
import { renderPDF } from './exportPDF.js';
|
|
7
7
|
import { renderHOCR } from './exportRenderHOCR.js';
|
|
8
8
|
import { renderText } from './exportRenderText.js';
|
|
9
9
|
|
|
@@ -60,7 +60,7 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
|
|
|
60
60
|
// and assume that the overlay PDF is the same size as the input images.
|
|
61
61
|
// The `maxpage` argument must be set manually to `inputData.pageCount-1`, as this avoids an error in the case where there is no OCR data (`hocrDownload` has length 0).
|
|
62
62
|
// In all other cases, this should be equivalent to using the default argument of `-1` (which results in `hocrDownload.length` being used).
|
|
63
|
-
const pdfStr = await
|
|
63
|
+
const pdfStr = await renderPDF(ocrDownload, 0, inputData.pageCount - 1, opt.displayMode, rotateText, rotateBackground,
|
|
64
64
|
{ width: -1, height: -1 }, opt.confThreshHigh, opt.confThreshMed, opt.overlayOpacity / 100);
|
|
65
65
|
|
|
66
66
|
const enc = new TextEncoder();
|
|
@@ -142,7 +142,7 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
|
|
|
142
142
|
});
|
|
143
143
|
}
|
|
144
144
|
} else {
|
|
145
|
-
const pdfStr = await
|
|
145
|
+
const pdfStr = await renderPDF(ocrDownload, minValue, maxValue, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
|
|
146
146
|
opt.overlayOpacity / 100);
|
|
147
147
|
|
|
148
148
|
// The PDF is still run through muPDF, even though in eBook mode no background layer is added.
|
package/js/export/exportPDF.js
CHANGED
|
@@ -31,7 +31,7 @@ import ocr from '../objects/ocrObjects.js';
|
|
|
31
31
|
*
|
|
32
32
|
* A valid PDF will be created if an empty array is provided for `hocrArr`, as long as `maxpage` is set manually.
|
|
33
33
|
*/
|
|
34
|
-
export async function
|
|
34
|
+
export async function renderPDF(hocrArr, minpage = 0, maxpage = -1, textMode = 'ebook', rotateText = false, rotateBackground = false,
|
|
35
35
|
dimsLimit = { width: -1, height: -1 }, confThreshHigh = 85, confThreshMed = 75, proofOpacity = 0.8) {
|
|
36
36
|
if (!FontCont.raw) throw new Error('No fonts loaded.');
|
|
37
37
|
|
|
@@ -52,13 +52,8 @@ export async function hocrToPDF(hocrArr, minpage = 0, maxpage = -1, textMode = '
|
|
|
52
52
|
/** @type {Array<string>} */
|
|
53
53
|
const pdfFontObjStrArr = [];
|
|
54
54
|
let pdfFontsStr = '';
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
const familyObj = {
|
|
58
|
-
normal: useOpt && FontCont.opt?.[familyKey]?.normal ? FontCont.opt[familyKey].normal : FontCont.raw[familyKey].normal,
|
|
59
|
-
italic: useOpt && FontCont.opt?.[familyKey]?.italic ? FontCont.opt[familyKey].italic : FontCont.raw[familyKey].italic,
|
|
60
|
-
bold: useOpt && FontCont.opt?.[familyKey]?.bold ? FontCont.opt[familyKey].bold : FontCont.raw[familyKey].bold,
|
|
61
|
-
};
|
|
55
|
+
|
|
56
|
+
const addFamilyObj = async (familyKey, familyObj) => {
|
|
62
57
|
pdfFonts[familyKey] = {};
|
|
63
58
|
for (const [key, value] of Object.entries(familyObj)) {
|
|
64
59
|
const font = await value.opentype;
|
|
@@ -87,6 +82,22 @@ export async function hocrToPDF(hocrArr, minpage = 0, maxpage = -1, textMode = '
|
|
|
87
82
|
pdfFontsStr += `/F${String(fontI)} ${String(objectThis)} 0 R\n`;
|
|
88
83
|
fontI++;
|
|
89
84
|
}
|
|
85
|
+
};
|
|
86
|
+
|
|
87
|
+
for (const familyKeyI of Object.keys(FontCont.raw)) {
|
|
88
|
+
const useOpt = FontCont.useOptFamily(familyKeyI);
|
|
89
|
+
const familyObjI = {
|
|
90
|
+
normal: useOpt && FontCont.opt?.[familyKeyI]?.normal ? FontCont.opt[familyKeyI].normal : FontCont.raw[familyKeyI].normal,
|
|
91
|
+
italic: useOpt && FontCont.opt?.[familyKeyI]?.italic ? FontCont.opt[familyKeyI].italic : FontCont.raw[familyKeyI].italic,
|
|
92
|
+
bold: useOpt && FontCont.opt?.[familyKeyI]?.bold ? FontCont.opt[familyKeyI].bold : FontCont.raw[familyKeyI].bold,
|
|
93
|
+
};
|
|
94
|
+
await addFamilyObj(familyKeyI, familyObjI);
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
if (FontCont.doc) {
|
|
98
|
+
for (const familyKeyI of Object.keys(FontCont.doc)) {
|
|
99
|
+
await addFamilyObj(familyKeyI, FontCont.doc[familyKeyI]);
|
|
100
|
+
}
|
|
90
101
|
}
|
|
91
102
|
|
|
92
103
|
/** @type {?import('opentype.js').Font} */
|
|
@@ -308,13 +319,13 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
308
319
|
const { baseline } = lineObj;
|
|
309
320
|
const linebox = lineObj.bbox;
|
|
310
321
|
|
|
311
|
-
let
|
|
322
|
+
let wordJ = words[0];
|
|
312
323
|
|
|
313
324
|
let fillColor = '0 0 0 rg';
|
|
314
325
|
if (textMode === 'proof') {
|
|
315
|
-
if (
|
|
326
|
+
if (wordJ.conf > confThreshHigh) {
|
|
316
327
|
fillColor = '0 1 0.5 rg';
|
|
317
|
-
} else if (
|
|
328
|
+
} else if (wordJ.conf > confThreshMed) {
|
|
318
329
|
fillColor = '1 0.8 0 rg';
|
|
319
330
|
} else {
|
|
320
331
|
fillColor = '1 0 0 rg';
|
|
@@ -327,41 +338,41 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
327
338
|
|
|
328
339
|
textContentObjStr += `${fillColor}\n`;
|
|
329
340
|
|
|
330
|
-
let wordFont = FontCont.getWordFont(
|
|
341
|
+
let wordFont = FontCont.getWordFont(wordJ);
|
|
331
342
|
|
|
332
343
|
// The Chinese font is subset to only relevant characters, the others currently are not.
|
|
333
|
-
let wordFontOpentype = (
|
|
344
|
+
let wordFontOpentype = (wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype);
|
|
334
345
|
|
|
335
346
|
if (!wordFontOpentype) {
|
|
336
|
-
const fontNameMessage =
|
|
347
|
+
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
|
|
337
348
|
console.log(`Skipping word due to missing font (${fontNameMessage})`);
|
|
338
349
|
continue;
|
|
339
350
|
}
|
|
340
351
|
|
|
341
352
|
// let wordFontSize = calcWordFontSize(word);
|
|
342
353
|
|
|
343
|
-
const word0Metrics = calcWordMetrics(
|
|
354
|
+
const word0Metrics = calcWordMetrics(wordJ, angle);
|
|
344
355
|
|
|
345
356
|
let wordFontSize = word0Metrics.fontSize;
|
|
346
357
|
|
|
347
358
|
// Set font and font size
|
|
348
|
-
({ name: pdfFontCurrent, type: pdfFontTypeCurrent } =
|
|
359
|
+
({ name: pdfFontCurrent, type: pdfFontTypeCurrent } = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style]);
|
|
349
360
|
|
|
350
361
|
textContentObjStr += `${pdfFontCurrent} ${String(wordFontSize)} Tf\n`;
|
|
351
362
|
|
|
352
363
|
// Reset baseline to line baseline
|
|
353
364
|
textContentObjStr += '0 Ts\n';
|
|
354
365
|
|
|
355
|
-
const word0LeftBearing =
|
|
366
|
+
const word0LeftBearing = wordJ.visualCoords ? word0Metrics.leftSideBearing : 0;
|
|
356
367
|
|
|
357
368
|
let tz = 100;
|
|
358
|
-
if (
|
|
359
|
-
const wordWidthActual =
|
|
369
|
+
if (wordJ.dropcap) {
|
|
370
|
+
const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
|
|
360
371
|
tz = (wordWidthActual / word0Metrics.visualWidth) * 100;
|
|
361
372
|
}
|
|
362
373
|
|
|
363
374
|
// Move to next line
|
|
364
|
-
const lineLeftAdj =
|
|
375
|
+
const lineLeftAdj = wordJ.bbox.left - word0LeftBearing * (tz / 100) + angleAdjLine.x;
|
|
365
376
|
const lineTopAdj = linebox.bottom + baseline[1] + angleAdjLine.y;
|
|
366
377
|
|
|
367
378
|
if (rotateText) {
|
|
@@ -379,7 +390,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
379
390
|
let charSpacingLast = 0;
|
|
380
391
|
let spacingAdj = 0;
|
|
381
392
|
let kernSpacing = false;
|
|
382
|
-
let wordLast =
|
|
393
|
+
let wordLast = wordJ;
|
|
383
394
|
let wordFontOpentypeLast = wordFontOpentype;
|
|
384
395
|
let fontSizeLast = wordFontSize;
|
|
385
396
|
let tsCurrent = 0;
|
|
@@ -387,27 +398,27 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
387
398
|
let charLig = false;
|
|
388
399
|
|
|
389
400
|
for (let j = 0; j < words.length; j++) {
|
|
390
|
-
|
|
401
|
+
wordJ = words[j];
|
|
391
402
|
|
|
392
|
-
const wordMetrics = calcWordMetrics(
|
|
403
|
+
const wordMetrics = calcWordMetrics(wordJ, angle);
|
|
393
404
|
wordFontSize = wordMetrics.fontSize;
|
|
394
405
|
const charSpacing = wordMetrics.charSpacing;
|
|
395
406
|
const charArr = wordMetrics.charArr;
|
|
396
|
-
const wordLeftBearing =
|
|
407
|
+
const wordLeftBearing = wordJ.visualCoords ? wordMetrics.leftSideBearing : 0;
|
|
397
408
|
const kerningArr = wordMetrics.kerningArr;
|
|
398
409
|
|
|
399
|
-
wordFont = FontCont.getWordFont(
|
|
400
|
-
wordFontOpentype =
|
|
410
|
+
wordFont = FontCont.getWordFont(wordJ);
|
|
411
|
+
wordFontOpentype = wordJ.lang === 'chi_sim' ? fontChiSim : wordFont.opentype;
|
|
401
412
|
|
|
402
413
|
if (!wordFontOpentype) {
|
|
403
|
-
const fontNameMessage =
|
|
414
|
+
const fontNameMessage = wordJ.lang === 'chi_sim' ? 'chi_sim' : `${wordFont.family} (${wordJ.style})`;
|
|
404
415
|
console.log(`Skipping word due to missing font (${fontNameMessage})`);
|
|
405
416
|
continue;
|
|
406
417
|
}
|
|
407
418
|
|
|
408
419
|
fillColor = '0 0 0 rg';
|
|
409
420
|
if (textMode === 'proof') {
|
|
410
|
-
const wordConf =
|
|
421
|
+
const wordConf = wordJ.conf;
|
|
411
422
|
|
|
412
423
|
if (wordConf > confThreshHigh) {
|
|
413
424
|
fillColor = '0 1 0.5 rg';
|
|
@@ -417,34 +428,35 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
417
428
|
fillColor = '1 0 0 rg';
|
|
418
429
|
}
|
|
419
430
|
} else if (textMode === 'eval') {
|
|
420
|
-
fillColor =
|
|
431
|
+
fillColor = wordJ.matchTruth ? '0 1 0.5 rg' : '1 0 0 rg';
|
|
421
432
|
}
|
|
422
433
|
|
|
423
|
-
const angleAdjWord =
|
|
434
|
+
const angleAdjWord = wordJ.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 };
|
|
424
435
|
const angleAdjWordX = (rotateBackground && Math.abs(angle ?? 0) > 0.05) ? angleAdjWord.x : 0;
|
|
425
436
|
|
|
426
|
-
// TODO: Test whether the math here is correct for drop caps.
|
|
427
437
|
let ts = 0;
|
|
428
|
-
if (
|
|
429
|
-
ts = (linebox.bottom + baseline[1] + angleAdjLine.y) - (
|
|
430
|
-
|
|
431
|
-
|
|
438
|
+
if (wordJ.sup || wordJ.dropcap) {
|
|
439
|
+
ts = (linebox.bottom + baseline[1] + angleAdjLine.y) - (wordJ.bbox.bottom + angleAdjLine.y + angleAdjWord.y);
|
|
440
|
+
if (!wordJ.visualCoords) {
|
|
441
|
+
const fontDesc = wordFont.opentype.descender / wordFont.opentype.unitsPerEm * wordMetrics.fontSize;
|
|
442
|
+
ts -= fontDesc;
|
|
443
|
+
}
|
|
432
444
|
} else {
|
|
433
445
|
ts = 0;
|
|
434
446
|
}
|
|
435
447
|
|
|
436
448
|
// TODO: This probably fails for Chinese, rethink.
|
|
437
449
|
tz = 100;
|
|
438
|
-
if (
|
|
439
|
-
const wordWidthActual =
|
|
450
|
+
if (wordJ.dropcap) {
|
|
451
|
+
const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left;
|
|
440
452
|
tz = (wordWidthActual / wordMetrics.visualWidth) * 100;
|
|
441
453
|
}
|
|
442
454
|
|
|
443
455
|
// const pdfFont = word.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFontFamily][word.style];
|
|
444
|
-
const { name: pdfFont, type: pdfFontType } =
|
|
456
|
+
const { name: pdfFont, type: pdfFontType } = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][wordJ.style];
|
|
445
457
|
|
|
446
|
-
const wordWidthAdj = (
|
|
447
|
-
const wordSpaceAdj = (
|
|
458
|
+
const wordWidthAdj = (wordJ.bbox.right - wordJ.bbox.left) / cosAngle;
|
|
459
|
+
const wordSpaceAdj = (wordJ.bbox.left - wordBoxLast.right) / cosAngle;
|
|
448
460
|
|
|
449
461
|
// Add space character between words
|
|
450
462
|
if (j > 0 && !kernSpacing) {
|
|
@@ -468,13 +480,13 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
468
480
|
}
|
|
469
481
|
kernSpacing = false;
|
|
470
482
|
|
|
471
|
-
wordBoxLast =
|
|
483
|
+
wordBoxLast = wordJ.bbox;
|
|
472
484
|
|
|
473
485
|
// In general, we assume that (given our adjustments to character spacing) the rendered word has the same width as the image of that word.
|
|
474
486
|
// However, this assumption does not hold for single-character words, as there is no space between characters to adjust.
|
|
475
487
|
// Therefore, we calculate the difference between the rendered and actual word and apply an adjustment to the width of the next space.
|
|
476
488
|
// (This does not apply to drop caps as those have horizontal scaling applied to exactly match the image.)
|
|
477
|
-
if (charArr.length === 1 && !
|
|
489
|
+
if (charArr.length === 1 && !wordJ.dropcap) {
|
|
478
490
|
const wordLastGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
|
|
479
491
|
const wordLastGlyphMetrics = wordLastGlyph.getMetrics();
|
|
480
492
|
const lastCharWidth = (wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth) * (wordFontSize / wordFontOpentype.unitsPerEm);
|
|
@@ -485,7 +497,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
485
497
|
|
|
486
498
|
textContentObjStr += ' ] TJ\n';
|
|
487
499
|
|
|
488
|
-
const fontSize =
|
|
500
|
+
const fontSize = wordJ.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
489
501
|
if (pdfFont !== pdfFontCurrent || fontSize !== fontSizeLast) {
|
|
490
502
|
textContentObjStr += `${pdfFont} ${String(fontSize)} Tf\n`;
|
|
491
503
|
pdfFontCurrent = pdfFont;
|
|
@@ -512,23 +524,23 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
512
524
|
// Non-ASCII and special characters are encoded/escaped using winEncodingLookup
|
|
513
525
|
for (let k = 0; k < charArr.length; k++) {
|
|
514
526
|
const letterSrc = charArr[k];
|
|
515
|
-
const letter =
|
|
516
|
-
const fontSizeLetter =
|
|
527
|
+
const letter = wordJ.smallCaps ? charArr[k].toUpperCase() : charArr[k];
|
|
528
|
+
const fontSizeLetter = wordJ.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize;
|
|
517
529
|
|
|
518
530
|
const letterEnc = pdfFontTypeCurrent === 0 ? wordFontOpentype.charToGlyphIndex(letter)?.toString(16).padStart(4, '0') : winEncodingLookup[letter];
|
|
519
531
|
if (letterEnc) {
|
|
520
532
|
let kern = (kerningArr[k] || 0) * (-1000 / fontSizeLetter);
|
|
521
533
|
|
|
522
|
-
if (
|
|
534
|
+
if (wordJ.lang === 'chi_sim' && j + 1 < words.length && words[j + 1].lang === 'chi_sim') {
|
|
523
535
|
kernSpacing = true;
|
|
524
536
|
const wordNext = words[j + 1];
|
|
525
|
-
const wordSpaceNextAdj = (wordNext.bbox.left -
|
|
537
|
+
const wordSpaceNextAdj = (wordNext.bbox.left - wordJ.bbox.right) / cosAngle;
|
|
526
538
|
// const wordSpaceNextAdj = wordNext.bbox.left - wordBox.right;
|
|
527
539
|
|
|
528
540
|
const wordGlyphMetrics = wordFontOpentype.charToGlyph(charArr.at(-1)).getMetrics();
|
|
529
541
|
const wordNextGlyphMetrics = wordFontOpentype.charToGlyph(wordNext.text.substr(0, 1)).getMetrics();
|
|
530
542
|
|
|
531
|
-
const wordRightBearing =
|
|
543
|
+
const wordRightBearing = wordJ.visualCoords ? wordGlyphMetrics.rightSideBearing * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
|
|
532
544
|
|
|
533
545
|
const wordNextLeftBearing = wordNext.visualCoords ? wordNextGlyphMetrics.xMin * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
|
|
534
546
|
|
|
@@ -581,7 +593,7 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
|
|
|
581
593
|
}
|
|
582
594
|
}
|
|
583
595
|
|
|
584
|
-
wordLast =
|
|
596
|
+
wordLast = wordJ;
|
|
585
597
|
wordRightBearingLast = wordLast.visualCoords ? wordMetrics.rightSideBearing : 0;
|
|
586
598
|
wordFontOpentypeLast = wordFontOpentype;
|
|
587
599
|
charSpacingLast = charSpacing;
|
|
@@ -11,8 +11,10 @@ import { assignParagraphs } from '../utils/reflowPars.js';
|
|
|
11
11
|
* @param {number} maxpage - The last page to include in the document.
|
|
12
12
|
* @param {boolean} reflowText - Remove line breaks within what appears to be the same paragraph.
|
|
13
13
|
* @param {boolean} docxMode - Create XML for a word document rather than plain text.
|
|
14
|
+
* @param {?Array<string>} wordIds - An array of word IDs to include in the document.
|
|
15
|
+
* If omitted, all words are included.
|
|
14
16
|
*/
|
|
15
|
-
export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false) {
|
|
17
|
+
export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false, wordIds = null) {
|
|
16
18
|
let textStr = '';
|
|
17
19
|
|
|
18
20
|
if (maxpage === -1) maxpage = ocrCurrent.length - 1;
|
|
@@ -48,6 +50,8 @@ export function renderText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = f
|
|
|
48
50
|
const wordObj = lineObj.words[i];
|
|
49
51
|
if (!wordObj) continue;
|
|
50
52
|
|
|
53
|
+
if (wordIds && !wordIds.includes(wordObj.id)) continue;
|
|
54
|
+
|
|
51
55
|
if (docxMode) {
|
|
52
56
|
let fontStyle = '';
|
|
53
57
|
if (wordObj.style === 'italic') {
|
package/js/extractPDFText.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
+
import { ocrAll, ocrAllRaw } from './containers/dataContainer.js';
|
|
1
2
|
import { ImageCache } from './containers/imageContainer.js';
|
|
2
|
-
import {
|
|
3
|
-
import { ocrAllRaw, ocrAll } from './containers/dataContainer.js';
|
|
3
|
+
import { convertOCR } from './recognizeConvert.js';
|
|
4
4
|
|
|
5
5
|
/**
|
|
6
6
|
* Extract raw text content from currently loaded PDF.
|
|
@@ -21,7 +21,7 @@ const extractInternalPDFTextRaw = async () => {
|
|
|
21
21
|
};
|
|
22
22
|
|
|
23
23
|
const stextArr = /** @type {Array<string>} */ ([]);
|
|
24
|
-
const pageDPI = ImageCache.pdfDims300.map((x) => 300 *
|
|
24
|
+
const pageDPI = ImageCache.pdfDims300.map((x) => 300 * Math.min(x.width, 3500) / x.width);
|
|
25
25
|
const resArr = pageDPI.map(async (x, i) => {
|
|
26
26
|
// While using `pageTextJSON` would save some parsing, unfortunately that format only includes line-level granularity.
|
|
27
27
|
// The XML format is the only built-in mupdf format that includes character-level granularity.
|
|
@@ -53,7 +53,7 @@ const extractInternalPDFTextRaw = async () => {
|
|
|
53
53
|
// (1) The total number of letters is at least 100 per page on average.
|
|
54
54
|
// (2) The number of pages with text (`pageCountTotalText`) is at least half of the total number of pages.
|
|
55
55
|
} else if (pdfContentStats.letterCountTotal >= ImageCache.pageCount * 100
|
|
56
|
-
&& pdfContentStats.
|
|
56
|
+
&& pdfContentStats.pageCountTotalText >= ImageCache.pageCount / 2) {
|
|
57
57
|
type = 'ocr';
|
|
58
58
|
// Otherwise, the PDF is considered image-native.
|
|
59
59
|
// This includes both literally image-only PDFs, as well as PDFs that have invalid encodings or other issues that prevent valid text extraction.
|
|
@@ -102,7 +102,7 @@ export const extractInternalPDFText = async (options = {}) => {
|
|
|
102
102
|
const format = 'stext';
|
|
103
103
|
|
|
104
104
|
// Process HOCR using web worker, reading from file first if that has not been done already
|
|
105
|
-
await
|
|
105
|
+
await convertOCR(ocrAllRaw.active, true, format, 'pdf', false);
|
|
106
106
|
|
|
107
107
|
res.content = ocrAll.pdf;
|
|
108
108
|
|
package/js/fontContainerMain.js
CHANGED
|
@@ -183,16 +183,19 @@ export async function enableFontOpt(enableOpt, forceOpt) {
|
|
|
183
183
|
* Set `loadRaw` to `true` or `false` to force the raw fonts to be loaded or not loaded, respectively.
|
|
184
184
|
* @param {boolean} [params.loadOpt] - By default, optimized fonts are loaded if they have not been loaded before.
|
|
185
185
|
* Set `loadOpt` to `true` or `false` to force the optimized fonts to be loaded or not loaded, respectively.
|
|
186
|
+
* @param {boolean} [params.loadDoc] - By default, fonts extracted from PDF documents are loaded if they have not been loaded before.
|
|
187
|
+
* Set `loadDoc` to `true` or `false` to force the document fonts to be loaded or not loaded, respectively.
|
|
186
188
|
* @param {boolean} [params.updateProps]
|
|
187
189
|
*/
|
|
188
190
|
export async function updateFontContWorkerMain(params = {}) {
|
|
189
|
-
const loadRaw = params.loadRaw === true || (params.loadRaw !== false && FontCont.raw && !gs.
|
|
190
|
-
const loadOpt = params.loadOpt === true || (params.loadOpt !== false && FontCont.opt && !gs.
|
|
191
|
+
const loadRaw = params.loadRaw === true || (params.loadRaw !== false && FontCont.raw && !gs.loadedBuiltInFontsRawWorker);
|
|
192
|
+
const loadOpt = params.loadOpt === true || (params.loadOpt !== false && FontCont.opt && !gs.loadedBuiltInFontsOptWorker);
|
|
193
|
+
const loadDoc = params.loadDoc === true || (params.loadDoc !== false && FontCont.doc && !gs.loadedBuiltInFontsDocWorker);
|
|
191
194
|
|
|
192
195
|
// If the active font data is not already loaded, load it now.
|
|
193
196
|
// This assumes that only one version of the raw/optimized fonts ever exists--
|
|
194
197
|
// it does not check whether the current optimized font changed since it was last loaded.
|
|
195
|
-
for (const [type, load] of [['raw', loadRaw], ['opt', loadOpt]]) {
|
|
198
|
+
for (const [type, load] of [['raw', loadRaw], ['opt', loadOpt], ['doc', loadDoc]]) {
|
|
196
199
|
if (!load) continue;
|
|
197
200
|
|
|
198
201
|
const resArr = [];
|
|
@@ -214,9 +217,11 @@ export async function updateFontContWorkerMain(params = {}) {
|
|
|
214
217
|
|
|
215
218
|
// TODO: consider the race condition when `setBuiltInFontsWorkers` is called multiple times quickly and `loadFontsWorker` is still running.
|
|
216
219
|
if (type === 'opt') {
|
|
217
|
-
gs.
|
|
218
|
-
} else {
|
|
219
|
-
gs.
|
|
220
|
+
gs.loadedBuiltInFontsOptWorker = true;
|
|
221
|
+
} else if (type === 'raw') {
|
|
222
|
+
gs.loadedBuiltInFontsRawWorker = true;
|
|
223
|
+
} else if (type === 'doc') {
|
|
224
|
+
gs.loadedBuiltInFontsDocWorker = true;
|
|
220
225
|
}
|
|
221
226
|
}
|
|
222
227
|
await Promise.all(resArr);
|
|
@@ -321,8 +326,6 @@ export function setDefaultFontAuto(fontMetricsObj) {
|
|
|
321
326
|
* @param {Object.<string, FontMetricsFamily>} fontMetricsObj
|
|
322
327
|
*/
|
|
323
328
|
export async function optimizeFontContainerFamily(fontFamily, fontMetricsObj) {
|
|
324
|
-
if (!gs.scheduler) throw new Error('GeneralScheduler must be defined before this function can run.');
|
|
325
|
-
|
|
326
329
|
// When we have metrics for individual font families, those are used to optimize the appropriate fonts.
|
|
327
330
|
// Otherwise, the "default" metric is applied to whatever font the user has selected as the default font.
|
|
328
331
|
const multiFontMode = checkMultiFontMode(fontMetricsObj);
|
|
@@ -342,7 +345,7 @@ export async function optimizeFontContainerFamily(fontFamily, fontMetricsObj) {
|
|
|
342
345
|
}
|
|
343
346
|
|
|
344
347
|
const metricsNormal = fontMetricsObj[fontMetricsType][fontFamily.normal.style];
|
|
345
|
-
const normalOptFont = gs.
|
|
348
|
+
const normalOptFont = gs.optimizeFont({ fontData: fontFamily.normal.src, fontMetricsObj: metricsNormal, style: fontFamily.normal.style })
|
|
346
349
|
.then(async (x) => {
|
|
347
350
|
const font = await loadOpentype(x.fontData, x.kerningPairs);
|
|
348
351
|
return new FontContainerFont(fontFamily.normal.family, fontFamily.normal.style, x.fontData, true, font);
|
|
@@ -352,7 +355,7 @@ export async function optimizeFontContainerFamily(fontFamily, fontMetricsObj) {
|
|
|
352
355
|
/** @type {?FontContainerFont|Promise<FontContainerFont>} */
|
|
353
356
|
let italicOptFont = null;
|
|
354
357
|
if (metricsItalic && metricsItalic.obs >= 200) {
|
|
355
|
-
italicOptFont = gs.
|
|
358
|
+
italicOptFont = gs.optimizeFont({ fontData: fontFamily.italic.src, fontMetricsObj: metricsItalic, style: fontFamily.italic.style })
|
|
356
359
|
.then(async (x) => {
|
|
357
360
|
const font = await loadOpentype(x.fontData, x.kerningPairs);
|
|
358
361
|
return new FontContainerFont(fontFamily.italic.family, fontFamily.italic.style, x.fontData, true, font);
|