scribe.js-ocr 0.7.2 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/cli.js +12 -1
- package/cli/detectPDFType.js +13 -20
- package/cli/extract.js +4 -2
- package/cli/scribe.js +9 -1
- package/fonts/all/Carlito-BoldItalic.woff +0 -0
- package/fonts/all/Century-BoldItalic.woff +0 -0
- package/fonts/all/Garamond-BoldItalic.woff +0 -0
- package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/all/Palatino-BoldItalic.woff +0 -0
- package/fonts/latin/Carlito-BoldItalic.woff +0 -0
- package/fonts/latin/Century-BoldItalic.woff +0 -0
- package/fonts/latin/Garamond-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/latin/Palatino-BoldItalic.woff +0 -0
- package/js/containers/app.js +1 -1
- package/js/containers/fontContainer.js +42 -40
- package/js/export/export.js +1 -1
- package/js/export/writeHocr.js +15 -13
- package/js/export/writeHtml.js +45 -29
- package/js/export/writePdf.js +52 -14
- package/js/export/writePdfFonts.js +11 -9
- package/js/export/writeTabular.js +2 -2
- package/js/export/writeText.js +10 -6
- package/js/extractTables.js +5 -5
- package/js/fontContainerMain.js +50 -7
- package/js/fontStatistics.js +18 -13
- package/js/fontSupp.js +20 -20
- package/js/global.d.ts +17 -0
- package/js/import/convertPageAbbyy.js +47 -25
- package/js/import/convertPageBlocks.js +2 -2
- package/js/import/convertPageHocr.js +10 -20
- package/js/import/convertPageShared.js +13 -9
- package/js/import/convertPageStext.js +66 -31
- package/js/objects/ocrObjects.js +13 -19
- package/js/utils/fontUtils.js +11 -11
- package/js/utils/miscUtils.js +16 -0
- package/js/worker/compareOCRModule.js +13 -16
- package/js/worker/optimizeFontModule.js +4 -4
- package/mupdf/libmupdf.js +123 -17
- package/mupdf/libmupdf.wasm +0 -0
- package/package.json +1 -1
package/cli/cli.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { detectPDFType } from './detectPDFType.js';
|
|
1
2
|
import { extract } from './extract.js';
|
|
2
3
|
import {
|
|
3
4
|
check,
|
|
@@ -36,7 +37,7 @@ export const evalInternalCLI = async (pdfFile, ocrFile, options) => {
|
|
|
36
37
|
* @param {string} pdfFile - Path to PDF file.
|
|
37
38
|
* @param {?string} [outputDir='.'] - Output directory.
|
|
38
39
|
* @param {Object} [options]
|
|
39
|
-
* @param {
|
|
40
|
+
* @param {"pdf" | "hocr" | "docx" | "xlsx" | "txt" | "text" | "html"} [options.format]
|
|
40
41
|
* @param {boolean} [options.reflow]
|
|
41
42
|
*/
|
|
42
43
|
export const extractCLI = async (pdfFile, outputDir, options) => {
|
|
@@ -44,6 +45,16 @@ export const extractCLI = async (pdfFile, outputDir, options) => {
|
|
|
44
45
|
process.exitCode = 0;
|
|
45
46
|
};
|
|
46
47
|
|
|
48
|
+
/**
 * CLI handler for the `type` command.
 * Delegates to `detectPDFType` and, once it resolves, marks the process as successful.
 *
 * @param {string} pdfFile - Path to PDF file.
 * @param {string} [outputPath] - Output file path (forwarded unchanged to `detectPDFType`).
 * @returns {Promise<void>}
 */
export const detectPDFTypeCLI = async (pdfFile, outputPath) => {
  // Wait for detection to finish before reporting success.
  await detectPDFType(pdfFile, outputPath);

  // Set the exit code rather than calling `process.exit`.
  process.exitCode = 0;
};
|
|
57
|
+
|
|
47
58
|
/**
|
|
48
59
|
*
|
|
49
60
|
* @param {string} pdfFile - Path to PDF file.
|
package/cli/detectPDFType.js
CHANGED
|
@@ -1,20 +1,17 @@
|
|
|
1
|
-
// Code for adding visualization to OCR output
|
|
2
|
-
|
|
3
1
|
import fs from 'fs';
|
|
4
|
-
import
|
|
5
|
-
import Worker from 'web-worker';
|
|
6
|
-
import { initMuPDFWorker } from '../mupdf/mupdf-async.js';
|
|
7
|
-
|
|
8
|
-
globalThis.Worker = Worker;
|
|
9
|
-
globalThis.require = createRequire(import.meta.url);
|
|
2
|
+
import scribe from '../scribe.js';
|
|
10
3
|
|
|
11
|
-
|
|
4
|
+
/**
|
|
5
|
+
*
|
|
6
|
+
* @param {string} pdfFile - Path to PDF file.
|
|
7
|
+
* @param {string} [outputPath] - Output file path.
|
|
8
|
+
* If provided, the text will be extracted and saved to this path.
|
|
9
|
+
*/
|
|
10
|
+
export const detectPDFType = async (pdfFile, outputPath) => {
|
|
11
|
+
const mupdfScheduler = await scribe.data.image.getMuPDFScheduler(1);
|
|
12
|
+
const w = mupdfScheduler.workers[0];
|
|
12
13
|
|
|
13
|
-
|
|
14
|
-
const w = await initMuPDFWorker();
|
|
15
|
-
const fileData = await fs.readFileSync(args[0]);
|
|
16
|
-
|
|
17
|
-
const outputPath = args[1];
|
|
14
|
+
const fileData = await fs.readFileSync(pdfFile);
|
|
18
15
|
|
|
19
16
|
const pdfDoc = await w.openDocument(fileData, 'file.pdf');
|
|
20
17
|
w.pdfDoc = pdfDoc;
|
|
@@ -32,10 +29,6 @@ async function main() {
|
|
|
32
29
|
|
|
33
30
|
console.log('PDF Type:', type);
|
|
34
31
|
|
|
35
|
-
|
|
36
|
-
w.terminate();
|
|
37
|
-
|
|
38
|
-
process.exitCode = 0;
|
|
39
|
-
}
|
|
32
|
+
mupdfScheduler.scheduler.terminate();
|
|
40
33
|
|
|
41
|
-
|
|
34
|
+
};
|
package/cli/extract.js
CHANGED
|
@@ -7,7 +7,7 @@ import scribe from '../scribe.js';
|
|
|
7
7
|
* @param {string} pdfFile - Path to PDF file.
|
|
8
8
|
* @param {?string} [output='.'] - Output file or directory.
|
|
9
9
|
* @param {Object} [options]
|
|
10
|
-
* @param {
|
|
10
|
+
* @param {Parameters<typeof scribe.download>[0]} [options.format]
|
|
11
11
|
* @param {boolean} [options.reflow]
|
|
12
12
|
*/
|
|
13
13
|
export const extract = async (pdfFile, output, options) => {
|
|
@@ -18,7 +18,9 @@ export const extract = async (pdfFile, output, options) => {
|
|
|
18
18
|
const outputFile = outputDir === output ? `${path.basename(pdfFile).replace(/\.\w{1,5}$/i, `.${format}`)}` : path.basename(output);
|
|
19
19
|
const outputPath = `${outputDir}/${outputFile}`;
|
|
20
20
|
|
|
21
|
-
scribe.
|
|
21
|
+
scribe.opt.reflow = true;
|
|
22
|
+
scribe.opt.extractText = true;
|
|
23
|
+
|
|
22
24
|
await scribe.init();
|
|
23
25
|
await scribe.importFiles([pdfFile]);
|
|
24
26
|
|
package/cli/scribe.js
CHANGED
|
@@ -4,6 +4,7 @@ import {
|
|
|
4
4
|
checkCLI,
|
|
5
5
|
confCLI,
|
|
6
6
|
debugCLI,
|
|
7
|
+
detectPDFTypeCLI,
|
|
7
8
|
evalInternalCLI, extractCLI, overlayCLI, recognizeCLI,
|
|
8
9
|
} from './cli.js';
|
|
9
10
|
|
|
@@ -35,7 +36,7 @@ program
|
|
|
35
36
|
.command('extract')
|
|
36
37
|
.argument('<pdf_file>', 'Input PDF file.')
|
|
37
38
|
.argument('[output]', 'Output directory or file to save results.', '.')
|
|
38
|
-
.addOption(new Option('-f, --format <ext>', 'Output format.').choices(['txt']).default('txt'))
|
|
39
|
+
.addOption(new Option('-f, --format <ext>', 'Output format.').choices(['pdf', 'hocr', 'docx', 'xlsx', 'txt', 'text', 'html']).default('txt'))
|
|
39
40
|
.option('-r, --reflow', 'Reflow text by combining lines into paragraphs.')
|
|
40
41
|
.description('Extract text from PDF file and save in requested format.')
|
|
41
42
|
.action(extractCLI);
|
|
@@ -61,6 +62,13 @@ program
|
|
|
61
62
|
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
|
|
62
63
|
.action(recognizeCLI);
|
|
63
64
|
|
|
65
|
+
program
|
|
66
|
+
.command('type')
|
|
67
|
+
.argument('<pdf_file>', 'Input PDF file.')
|
|
68
|
+
.argument('[output]', 'Output file path to save text.')
|
|
69
|
+
.description('Detect PDF file type (\'Text native\', \'Image + OCR text\', or \'Image native\').')
|
|
70
|
+
.action(detectPDFTypeCLI);
|
|
71
|
+
|
|
64
72
|
program
|
|
65
73
|
.command('debug')
|
|
66
74
|
.argument('<pdf_file>', 'Input PDF file.')
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/js/containers/app.js
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
// Node.js case
|
|
7
7
|
import opentype from '../../lib/opentype.module.js';
|
|
8
|
-
import { determineSansSerif } from '../utils/miscUtils.js';
|
|
8
|
+
import { determineSansSerif, getStyleLookup } from '../utils/miscUtils.js';
|
|
9
9
|
import { ca } from '../canvasAdapter.js';
|
|
10
10
|
|
|
11
11
|
if (typeof process === 'object') {
|
|
@@ -104,26 +104,26 @@ export function loadFontFace(fontFamily, fontStyle, fontWeight, src) {
|
|
|
104
104
|
* Load font from source and return a FontContainerFont object.
|
|
105
105
|
* This function is used to load the Chinese font.
|
|
106
106
|
* @param {string} family
|
|
107
|
-
* @param {
|
|
107
|
+
* @param {StyleLookup} styleLookup
|
|
108
108
|
* @param {("sans"|"serif")} type
|
|
109
109
|
* @param {ArrayBuffer} src
|
|
110
110
|
* @param {boolean} opt
|
|
111
111
|
*
|
|
112
112
|
*/
|
|
113
|
-
export async function loadFont(family,
|
|
113
|
+
export async function loadFont(family, styleLookup, type, src, opt) {
|
|
114
114
|
const fontObj = await loadOpentype(src);
|
|
115
|
-
return new FontContainerFont(family,
|
|
115
|
+
return new FontContainerFont(family, styleLookup, src, opt, fontObj);
|
|
116
116
|
}
|
|
117
117
|
|
|
118
118
|
/**
|
|
119
119
|
*
|
|
120
120
|
* @param {string} family
|
|
121
|
-
* @param {
|
|
121
|
+
* @param {StyleLookup} styleLookup
|
|
122
122
|
* @param {ArrayBuffer} src
|
|
123
123
|
* @param {boolean} opt
|
|
124
124
|
 * @param {opentype.Font} opentypeObj - Kerning pairs to re-apply
|
|
125
125
|
* @property {string} family -
|
|
126
|
-
* @property {
|
|
126
|
+
* @property {StyleLookup} style -
|
|
127
127
|
* @property {ArrayBuffer} src
|
|
128
128
|
* @property {opentype.Font} opentype -
|
|
129
129
|
* @property {string} fontFaceName -
|
|
@@ -135,7 +135,7 @@ export async function loadFont(family, style, type, src, opt) {
|
|
|
135
135
|
 * First, it is not necessary. Setting the font on a canvas (the only reason loading a `FontFace` is needed) is done by referring to `fontFaceName` and `fontFaceStyle`.
|
|
136
136
|
* Second, it results in errors being thrown when used in Node.js, as `FontFace` will be undefined in this case.
|
|
137
137
|
*/
|
|
138
|
-
export function FontContainerFont(family,
|
|
138
|
+
export function FontContainerFont(family, styleLookup, src, opt, opentypeObj) {
|
|
139
139
|
// As FontFace objects are included in the document FontFaceSet object,
|
|
140
140
|
// they need to all have unique names.
|
|
141
141
|
let fontFaceName = family;
|
|
@@ -143,8 +143,8 @@ export function FontContainerFont(family, style, src, opt, opentypeObj) {
|
|
|
143
143
|
|
|
144
144
|
/** @type {string} */
|
|
145
145
|
this.family = family;
|
|
146
|
-
/** @type {
|
|
147
|
-
this.style =
|
|
146
|
+
/** @type {StyleLookup} */
|
|
147
|
+
this.style = styleLookup;
|
|
148
148
|
/** @type {boolean} */
|
|
149
149
|
this.opt = opt;
|
|
150
150
|
/** @type {ArrayBuffer} */
|
|
@@ -154,9 +154,9 @@ export function FontContainerFont(family, style, src, opt, opentypeObj) {
|
|
|
154
154
|
/** @type {string} */
|
|
155
155
|
this.fontFaceName = fontFaceName;
|
|
156
156
|
/** @type {('normal'|'italic')} */
|
|
157
|
-
this.fontFaceStyle = this.style
|
|
157
|
+
this.fontFaceStyle = ['italic', 'boldItalic'].includes(this.style) ? 'italic' : 'normal';
|
|
158
158
|
/** @type {('normal'|'bold')} */
|
|
159
|
-
this.fontFaceWeight = this.style
|
|
159
|
+
this.fontFaceWeight = ['bold', 'boldItalic'].includes(this.style) ? 'bold' : 'normal';
|
|
160
160
|
/** @type {("sans"|"serif")} */
|
|
161
161
|
this.type = determineSansSerif(this.family) === 'SansDefault' ? 'sans' : 'serif';
|
|
162
162
|
this.smallCapsMult = 0.75;
|
|
@@ -185,27 +185,27 @@ export async function loadFontContainerFamily(family, src, opt = false) {
|
|
|
185
185
|
normal: null,
|
|
186
186
|
italic: null,
|
|
187
187
|
bold: null,
|
|
188
|
+
boldItalic: null,
|
|
188
189
|
};
|
|
189
190
|
|
|
190
191
|
/**
|
|
191
192
|
*
|
|
192
|
-
* @param {
|
|
193
|
+
* @param {StyleLookup} styleLookup
|
|
193
194
|
* @returns
|
|
194
195
|
*/
|
|
195
|
-
const loadType = (
|
|
196
|
-
const srcType = (src[
|
|
196
|
+
const loadType = (styleLookup) => new Promise((resolve) => {
|
|
197
|
+
const srcType = (src[styleLookup]);
|
|
197
198
|
if (!srcType) {
|
|
198
199
|
resolve(false);
|
|
199
200
|
return;
|
|
200
201
|
}
|
|
201
|
-
// const scrNormal = typeof srcType === 'string' ? getFontAbsPath(srcType) : srcType;
|
|
202
202
|
loadOpentype(srcType).then((font) => {
|
|
203
|
-
res[
|
|
203
|
+
res[styleLookup] = new FontContainerFont(family, styleLookup, srcType, opt, font);
|
|
204
204
|
resolve(true);
|
|
205
205
|
});
|
|
206
206
|
});
|
|
207
207
|
|
|
208
|
-
Promise.allSettled([loadType('normal'), loadType('italic'), loadType('bold')]);
|
|
208
|
+
Promise.allSettled([loadType('normal'), loadType('italic'), loadType('bold'), loadType('boldItalic')]);
|
|
209
209
|
|
|
210
210
|
return res;
|
|
211
211
|
}
|
|
@@ -300,11 +300,13 @@ export class FontCont {
|
|
|
300
300
|
|
|
301
301
|
const fontNameEmbedded = fontObj.names.postScriptName.en;
|
|
302
302
|
|
|
303
|
-
let
|
|
304
|
-
if (fontNameEmbedded.match(/
|
|
305
|
-
|
|
303
|
+
let styleLookup = /** @type {StyleLookup} */ ('normal');
|
|
304
|
+
if (fontNameEmbedded.match(/boldit|bdit/i)) {
|
|
305
|
+
styleLookup = 'boldItalic';
|
|
306
|
+
} else if (fontNameEmbedded.match(/italic/i)) {
|
|
307
|
+
styleLookup = 'italic';
|
|
306
308
|
} else if (fontNameEmbedded.match(/bold/i)) {
|
|
307
|
-
|
|
309
|
+
styleLookup = 'bold';
|
|
308
310
|
}
|
|
309
311
|
|
|
310
312
|
// mupdf makes changes to font names, so we need to do the same.
|
|
@@ -312,9 +314,9 @@ export class FontCont {
|
|
|
312
314
|
// Spaces are replaced with underscores.
|
|
313
315
|
const fontName = fontNameEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');
|
|
314
316
|
|
|
315
|
-
if (!FontCont.doc?.[fontName]?.[
|
|
317
|
+
if (!FontCont.doc?.[fontName]?.[styleLookup]) {
|
|
316
318
|
try {
|
|
317
|
-
const fontContainer = new FontContainerFont(fontName,
|
|
319
|
+
const fontContainer = new FontContainerFont(fontName, styleLookup, fontData, false, fontObj);
|
|
318
320
|
|
|
319
321
|
if (!FontCont.doc) {
|
|
320
322
|
FontCont.doc = {};
|
|
@@ -324,12 +326,12 @@ export class FontCont {
|
|
|
324
326
|
FontCont.doc[fontName] = {};
|
|
325
327
|
}
|
|
326
328
|
|
|
327
|
-
FontCont.doc[fontName][
|
|
329
|
+
FontCont.doc[fontName][styleLookup] = fontContainer;
|
|
328
330
|
} catch (error) {
|
|
329
|
-
console.error(`Error loading font ${fontName} ${
|
|
331
|
+
console.error(`Error loading font ${fontName} ${styleLookup}.`);
|
|
330
332
|
}
|
|
331
333
|
} else {
|
|
332
|
-
console.warn(`Font ${fontName} ${
|
|
334
|
+
console.warn(`Font ${fontName} ${styleLookup} already exists.`);
|
|
333
335
|
}
|
|
334
336
|
};
|
|
335
337
|
|
|
@@ -368,14 +370,17 @@ export class FontCont {
|
|
|
368
370
|
* Gets a font object. Unlike accessing the font containers directly,
|
|
369
371
|
* this method allows for special values 'Default', 'SansDefault', and 'SerifDefault' to be used.
|
|
370
372
|
*
|
|
371
|
-
* @param {
|
|
372
|
-
* @param {('normal'|'italic'|'bold'|string)} [style='normal']
|
|
373
|
+
* @param {Partial<Style>} style
|
|
373
374
|
* @param {string} [lang='eng']
|
|
374
375
|
* @returns {FontContainerFont}
|
|
375
376
|
*/
|
|
376
|
-
static getFont = (
|
|
377
|
-
|
|
378
|
-
|
|
377
|
+
static getFont = (style, lang = 'eng') => {
|
|
378
|
+
let family = style.font || FontCont.defaultFontName;
|
|
379
|
+
|
|
380
|
+
const styleLookup = getStyleLookup(style);
|
|
381
|
+
|
|
382
|
+
if (FontCont.doc?.[family]?.[styleLookup] && !FontCont.doc?.[family]?.[styleLookup]?.disable) {
|
|
383
|
+
return FontCont.doc[family][styleLookup];
|
|
379
384
|
}
|
|
380
385
|
|
|
381
386
|
if (lang === 'chi_sim') {
|
|
@@ -387,7 +392,7 @@ export class FontCont {
|
|
|
387
392
|
|
|
388
393
|
// Option 1: If we have access to the font, use it.
|
|
389
394
|
// Option 2: If we do not have access to the font, but it closely resembles a built-in font, use the built-in font.
|
|
390
|
-
if (!FontCont.raw?.[family]?.[
|
|
395
|
+
if (!FontCont.raw?.[family]?.[styleLookup]) {
|
|
391
396
|
if (/NimbusRom/i.test(family)) {
|
|
392
397
|
family = 'NimbusRoman';
|
|
393
398
|
} else if (/Times/i.test(family)) {
|
|
@@ -416,7 +421,7 @@ export class FontCont {
|
|
|
416
421
|
}
|
|
417
422
|
|
|
418
423
|
// Option 3: If the font still is not identified, use the default sans/serif font.
|
|
419
|
-
if (!FontCont.raw?.[family]?.[
|
|
424
|
+
if (!FontCont.raw?.[family]?.[styleLookup]) {
|
|
420
425
|
family = determineSansSerif(family);
|
|
421
426
|
}
|
|
422
427
|
|
|
@@ -427,10 +432,10 @@ export class FontCont {
|
|
|
427
432
|
if (family === 'SansDefault') family = FontCont.sansDefaultName;
|
|
428
433
|
|
|
429
434
|
/** @type {FontContainerFont} */
|
|
430
|
-
let fontRes = FontCont.raw?.[family]?.[
|
|
431
|
-
if (!fontRes) throw new Error(`Font container does not contain ${family} (${
|
|
435
|
+
let fontRes = FontCont.raw?.[family]?.[styleLookup];
|
|
436
|
+
if (!fontRes) throw new Error(`Font container does not contain ${family} (${styleLookup}).`);
|
|
432
437
|
|
|
433
|
-
const opt = FontCont.opt?.[family]?.[
|
|
438
|
+
const opt = FontCont.opt?.[family]?.[styleLookup];
|
|
434
439
|
const useOpt = FontCont.useOptFamily(family);
|
|
435
440
|
if (opt && useOpt) fontRes = opt;
|
|
436
441
|
|
|
@@ -441,10 +446,7 @@ export class FontCont {
|
|
|
441
446
|
*
|
|
442
447
|
* @param {OcrWord} word
|
|
443
448
|
*/
|
|
444
|
-
static getWordFont = (word) =>
|
|
445
|
-
const wordFontFamily = word.font || FontCont.defaultFontName;
|
|
446
|
-
return FontCont.getFont(wordFontFamily, word.style, word.lang);
|
|
447
|
-
};
|
|
449
|
+
static getWordFont = (word) => FontCont.getFont(word.style, word.lang);
|
|
448
450
|
|
|
449
451
|
/**
|
|
450
452
|
* Reset font container to original state but do not unload default resources.
|
package/js/export/export.js
CHANGED
|
@@ -207,7 +207,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
207
207
|
/**
|
|
208
208
|
* Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
|
|
209
209
|
* @public
|
|
210
|
-
* @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} format
|
|
210
|
+
* @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'|'html'} format
|
|
211
211
|
* @param {string} fileName
|
|
212
212
|
* @param {number} [minPage=0] - First page to export.
|
|
213
213
|
* @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page.
|
package/js/export/writeHocr.js
CHANGED
|
@@ -75,38 +75,40 @@ export function writeHocr(ocrData, minValue, maxValue) {
|
|
|
75
75
|
hocrOut += `bbox ${Math.round(wordObj.bbox.left)} ${Math.round(wordObj.bbox.top)} ${Math.round(wordObj.bbox.right)} ${Math.round(wordObj.bbox.bottom)}`;
|
|
76
76
|
hocrOut += `;x_wconf ${wordObj.conf}`;
|
|
77
77
|
|
|
78
|
-
if (wordObj.font && wordObj.font !== 'Default') {
|
|
79
|
-
hocrOut += `;x_font ${wordObj.font}`;
|
|
78
|
+
if (wordObj.style.font && wordObj.style.font !== 'Default') {
|
|
79
|
+
hocrOut += `;x_font ${wordObj.style.font}`;
|
|
80
80
|
}
|
|
81
81
|
|
|
82
|
-
if (wordObj.size) {
|
|
83
|
-
hocrOut += `;x_fsize ${wordObj.size}`;
|
|
82
|
+
if (wordObj.style.size) {
|
|
83
|
+
hocrOut += `;x_fsize ${wordObj.style.size}`;
|
|
84
84
|
}
|
|
85
85
|
|
|
86
86
|
hocrOut += "'";
|
|
87
87
|
|
|
88
88
|
// Tesseract HOCR specifies default language for a paragraph in the "ocr_par" element,
|
|
89
89
|
// however as ScribeOCR does not currently have a paragraph object, every word must have its language specified.
|
|
90
|
-
hocrOut += ` lang='${wordObj.lang}'`;
|
|
90
|
+
if (wordObj.lang) hocrOut += ` lang='${wordObj.lang}'`;
|
|
91
91
|
|
|
92
92
|
// TODO: Why are we representing font family and style using the `style` HTML element here?
|
|
93
93
|
// This is not how Tesseract does things, and our own parsing script does not appear to be written to re-import it properly.
|
|
94
94
|
// Add "style" attribute (if applicable)
|
|
95
|
-
if (
|
|
95
|
+
if (wordObj.style.bold || wordObj.style.italic || wordObj.style.smallCaps || (wordObj.style.font && wordObj.style.font !== 'Default')) {
|
|
96
96
|
hocrOut += ' style=\'';
|
|
97
97
|
|
|
98
|
-
if (wordObj.style
|
|
98
|
+
if (wordObj.style.italic) {
|
|
99
99
|
hocrOut += 'font-style:italic;';
|
|
100
|
-
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
if (wordObj.style.bold) {
|
|
101
103
|
hocrOut += 'font-weight:bold;';
|
|
102
104
|
}
|
|
103
105
|
|
|
104
|
-
if (wordObj.smallCaps) {
|
|
106
|
+
if (wordObj.style.smallCaps) {
|
|
105
107
|
hocrOut += 'font-variant:small-caps;';
|
|
106
108
|
}
|
|
107
109
|
|
|
108
|
-
if (wordObj.font && wordObj.font !== 'Default') {
|
|
109
|
-
hocrOut += `font-family:${wordObj.font}`;
|
|
110
|
+
if (wordObj.style.font && wordObj.style.font !== 'Default') {
|
|
111
|
+
hocrOut += `font-family:${wordObj.style.font}`;
|
|
110
112
|
}
|
|
111
113
|
|
|
112
114
|
hocrOut += '\'>';
|
|
@@ -115,9 +117,9 @@ export function writeHocr(ocrData, minValue, maxValue) {
|
|
|
115
117
|
}
|
|
116
118
|
|
|
117
119
|
// Add word text, along with any formatting that uses nested elements rather than attributes
|
|
118
|
-
if (wordObj.sup) {
|
|
120
|
+
if (wordObj.style.sup) {
|
|
119
121
|
hocrOut += `<sup>${ocr.escapeXml(wordObj.text)}</sup>`;
|
|
120
|
-
} else if (wordObj.dropcap) {
|
|
122
|
+
} else if (wordObj.style.dropcap) {
|
|
121
123
|
hocrOut += `<span class='ocr_dropcap'>${ocr.escapeXml(wordObj.text)}</span>`;
|
|
122
124
|
} else {
|
|
123
125
|
hocrOut += ocr.escapeXml(wordObj.text);
|
package/js/export/writeHtml.js
CHANGED
|
@@ -5,6 +5,31 @@ import { assignParagraphs } from '../utils/reflowPars.js';
|
|
|
5
5
|
import { pageMetricsArr } from '../containers/dataContainer.js';
|
|
6
6
|
import ocr from '../objects/ocrObjects.js';
|
|
7
7
|
|
|
8
|
+
/**
 * Calculate the font bounding-box metrics for a given font and font size.
 * This is used to get metrics that match `ctx.measureText`, but without requiring a canvas.
 *
 * Both returned values are distances from the baseline, in the same units as `fontSize`,
 * rounded to the nearest integer: `fontBoundingBoxAscent` is measured upwards and
 * `fontBoundingBoxDescent` is measured downwards.
 *
 * @param {FontContainerFont} fontI - Font whose `opentype.tables.os2` and `opentype.unitsPerEm` are read.
 * @param {number} fontSize - Font size to scale the metrics to.
 * @returns {{fontBoundingBoxAscent: number, fontBoundingBoxDescent: number}}
 */
const calcFontMetrics = (fontI, fontSize) => {
  const { os2 } = fontI.opentype.tables;
  // Conversion factor from font design units to the requested font size.
  const scale = fontSize / fontI.opentype.unitsPerEm;

  // Bit 7: Use_Typo_Metrics (1 = Yes)
  // eslint-disable-next-line no-bitwise
  const useTypoMetrics = ((os2.fsSelection >> 7) & 1) === 1;

  if (useTypoMetrics) {
    return {
      fontBoundingBoxAscent: Math.round(os2.sTypoAscender * scale),
      // `sTypoDescender` is a signed value that is conventionally negative (below the baseline),
      // whereas `usWinDescent` is an unsigned distance. It is negated so that both branches
      // return a positive descent and `ascent + descent` is the full height.
      fontBoundingBoxDescent: Math.round(-os2.sTypoDescender * scale),
    };
  }

  return {
    fontBoundingBoxAscent: Math.round(os2.usWinAscent * scale),
    fontBoundingBoxDescent: Math.round(os2.usWinDescent * scale),
  };
};
|
|
32
|
+
|
|
8
33
|
/**
|
|
9
34
|
*
|
|
10
35
|
* @param {string} text
|
|
@@ -33,18 +58,11 @@ const makeSmallCapsDivs = (text, fontSizeHTMLSmallCaps) => {
|
|
|
33
58
|
* If omitted, all words are included.
|
|
34
59
|
*/
|
|
35
60
|
export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, removeMargins = false, wordIds = null) {
|
|
36
|
-
if (!(typeof process === 'undefined')) {
|
|
37
|
-
throw new Error('HTML exports are not supported in Node.js');
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
const canvas = new OffscreenCanvas(1, 1);
|
|
41
|
-
const ctx = /** @type {OffscreenCanvasRenderingContext2D} */ (canvas.getContext('2d'));
|
|
42
|
-
|
|
43
61
|
const fontsUsed = new Set();
|
|
44
62
|
|
|
45
63
|
const pad = 5;
|
|
46
64
|
|
|
47
|
-
let bodyStr = '<body
|
|
65
|
+
let bodyStr = '<body>\n';
|
|
48
66
|
|
|
49
67
|
if (maxpage === -1) maxpage = ocrCurrent.length - 1;
|
|
50
68
|
|
|
@@ -71,7 +89,7 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
71
89
|
}
|
|
72
90
|
}
|
|
73
91
|
|
|
74
|
-
bodyStr +=
|
|
92
|
+
bodyStr += ` <div class="scribe-page" id="page${g}" style="position:absolute;top:${top}px;">\n`;
|
|
75
93
|
if (removeMargins) {
|
|
76
94
|
top += Math.min((maxBottom - minTop) + 200, pageMetricsArr[g].dims.height + 10);
|
|
77
95
|
} else {
|
|
@@ -130,9 +148,7 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
130
148
|
|
|
131
149
|
const fontSizeHTML = fontSize * scale;
|
|
132
150
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
const metrics = ctx.measureText(wordStr);
|
|
151
|
+
const metrics = calcFontMetrics(fontI, fontSizeHTML);
|
|
136
152
|
|
|
137
153
|
const fontSizeHTMLSmallCaps = fontSize * scale * fontI.smallCapsMult;
|
|
138
154
|
|
|
@@ -159,7 +175,7 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
159
175
|
// Therefore, we handle small caps by making all text print as uppercase using the `text-transform` CSS property,
|
|
160
176
|
// and then wrapping each letter in a span with a smaller font size.
|
|
161
177
|
let innerHTML;
|
|
162
|
-
if (wordObj.smallCaps) {
|
|
178
|
+
if (wordObj.style.smallCaps) {
|
|
163
179
|
styleStr += 'text-transform:uppercase;';
|
|
164
180
|
innerHTML = makeSmallCapsDivs(wordStr, fontSizeHTMLSmallCaps);
|
|
165
181
|
} else {
|
|
@@ -174,29 +190,29 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
174
190
|
// Line height must match the height of the font bounding box for the font metrics to be accurate.
|
|
175
191
|
styleStr += `line-height:${metrics.fontBoundingBoxAscent + metrics.fontBoundingBoxDescent}px;`;
|
|
176
192
|
|
|
177
|
-
bodyStr +=
|
|
193
|
+
bodyStr += ` <span class="scribe-word" id="${wordObj.id}" style="${styleStr}">${innerHTML}</span>`;
|
|
178
194
|
}
|
|
179
195
|
}
|
|
180
196
|
|
|
181
|
-
bodyStr += '</div
|
|
197
|
+
bodyStr += '\n </div>\n';
|
|
182
198
|
|
|
183
199
|
opt.progressHandler({ n: g, type: 'export', info: { } });
|
|
184
200
|
}
|
|
185
201
|
|
|
186
|
-
let styleStr = '<style
|
|
202
|
+
let styleStr = '<style>\n .scribe-word {\n';
|
|
187
203
|
|
|
188
|
-
styleStr += 'position:absolute
|
|
189
|
-
styleStr += `padding-left:${pad}px
|
|
190
|
-
styleStr += `padding-right:${pad}px
|
|
191
|
-
styleStr += 'z-index:1
|
|
192
|
-
styleStr += 'white-space:nowrap
|
|
204
|
+
styleStr += ' position:absolute;\n';
|
|
205
|
+
styleStr += ` padding-left:${pad}px;\n`;
|
|
206
|
+
styleStr += ` padding-right:${pad}px;\n`;
|
|
207
|
+
styleStr += ' z-index:1;\n';
|
|
208
|
+
styleStr += ' white-space:nowrap;\n';
|
|
193
209
|
if (opt.kerning) {
|
|
194
|
-
styleStr += 'font-kerning:normal
|
|
210
|
+
styleStr += ' font-kerning:normal;\n';
|
|
195
211
|
} else {
|
|
196
|
-
styleStr += 'font-kerning:none
|
|
212
|
+
styleStr += ' font-kerning:none;\n';
|
|
197
213
|
}
|
|
198
214
|
|
|
199
|
-
styleStr += '}';
|
|
215
|
+
styleStr += ' }\n';
|
|
200
216
|
|
|
201
217
|
for (const fontI of fontsUsed) {
|
|
202
218
|
const cdnPath = 'https://cdn.jsdelivr.net/npm/scribe.js-ocr@0.7.1/fonts/all/';
|
|
@@ -205,19 +221,19 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
205
221
|
const fontName = `${fontI.family}-${styleTitleCase}.woff`;
|
|
206
222
|
const fontPath = cdnPath + fontName;
|
|
207
223
|
|
|
208
|
-
styleStr +=
|
|
224
|
+
styleStr += ` @font-face {
|
|
209
225
|
font-family: '${fontI.fontFaceName}';
|
|
210
226
|
font-style: ${fontI.fontFaceStyle};
|
|
211
227
|
font-weight: ${fontI.fontFaceWeight};
|
|
212
228
|
src: url('${fontPath}');
|
|
213
|
-
}\n`;
|
|
229
|
+
}\n`;
|
|
214
230
|
}
|
|
215
231
|
|
|
216
|
-
styleStr += '</style
|
|
232
|
+
styleStr += '</style>\n';
|
|
217
233
|
|
|
218
|
-
bodyStr += '</body
|
|
234
|
+
bodyStr += '</body>\n';
|
|
219
235
|
|
|
220
|
-
const htmlStr = `<html
|
|
236
|
+
const htmlStr = `<html>\n<head>\n${styleStr}</head>\n${bodyStr}</html>`;
|
|
221
237
|
|
|
222
238
|
return htmlStr;
|
|
223
239
|
}
|