scribe.js-ocr 0.7.3 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/fonts/all/Carlito-BoldItalic.woff +0 -0
- package/fonts/all/Century-BoldItalic.woff +0 -0
- package/fonts/all/Garamond-BoldItalic.woff +0 -0
- package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/all/Palatino-BoldItalic.woff +0 -0
- package/fonts/latin/Carlito-BoldItalic.woff +0 -0
- package/fonts/latin/Century-BoldItalic.woff +0 -0
- package/fonts/latin/Garamond-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
- package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
- package/fonts/latin/Palatino-BoldItalic.woff +0 -0
- package/js/containers/app.js +1 -1
- package/js/containers/fontContainer.js +42 -40
- package/js/export/writeHocr.js +15 -13
- package/js/export/writeHtml.js +1 -1
- package/js/export/writePdf.js +52 -14
- package/js/export/writePdfFonts.js +11 -9
- package/js/export/writeTabular.js +2 -2
- package/js/export/writeText.js +10 -6
- package/js/extractTables.js +5 -5
- package/js/fontContainerMain.js +50 -7
- package/js/fontStatistics.js +18 -13
- package/js/fontSupp.js +20 -20
- package/js/global.d.ts +17 -0
- package/js/import/convertPageAbbyy.js +47 -25
- package/js/import/convertPageBlocks.js +2 -2
- package/js/import/convertPageHocr.js +10 -20
- package/js/import/convertPageShared.js +13 -9
- package/js/import/convertPageStext.js +66 -31
- package/js/objects/ocrObjects.js +13 -19
- package/js/utils/fontUtils.js +11 -11
- package/js/utils/miscUtils.js +16 -0
- package/js/worker/compareOCRModule.js +13 -16
- package/js/worker/optimizeFontModule.js +4 -4
- package/mupdf/libmupdf.js +123 -17
- package/mupdf/libmupdf.wasm +0 -0
- package/package.json +1 -1
package/js/export/writeText.js
CHANGED
|
@@ -54,17 +54,21 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
54
54
|
|
|
55
55
|
if (docxMode) {
|
|
56
56
|
let fontStyle = '';
|
|
57
|
-
if (wordObj.style
|
|
57
|
+
if (wordObj.style.italic) {
|
|
58
58
|
fontStyle += '<w:i/>';
|
|
59
|
-
} else if (wordObj.style
|
|
59
|
+
} else if (wordObj.style.bold) {
|
|
60
60
|
fontStyle += '<w:b/>';
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
-
if (wordObj.smallCaps) {
|
|
63
|
+
if (wordObj.style.smallCaps) {
|
|
64
64
|
fontStyle += '<w:smallCaps/>';
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
-
if (wordObj.
|
|
67
|
+
if (wordObj.style.underline) {
|
|
68
|
+
fontStyle += '<w:u w:val="single"/>';
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
if (wordObj.style.sup) {
|
|
68
72
|
fontStyle += '<w:vertAlign w:val="superscript"/>';
|
|
69
73
|
}
|
|
70
74
|
|
|
@@ -79,7 +83,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
79
83
|
} else if (supPrev) {
|
|
80
84
|
textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve"> `;
|
|
81
85
|
// If this word is a superscript, no space is added between words.
|
|
82
|
-
} else if (wordObj.sup && i > 0) {
|
|
86
|
+
} else if (wordObj.style.sup && i > 0) {
|
|
83
87
|
textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
84
88
|
} else {
|
|
85
89
|
textStr = `${textStr} </w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
@@ -89,7 +93,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
89
93
|
}
|
|
90
94
|
|
|
91
95
|
fontStylePrev = fontStyle;
|
|
92
|
-
supPrev = wordObj.sup;
|
|
96
|
+
supPrev = wordObj.style.sup;
|
|
93
97
|
} else if (newLine) {
|
|
94
98
|
textStr = `${textStr}\n`;
|
|
95
99
|
} else if (h > 0 || g > 0 || i > 0) {
|
package/js/extractTables.js
CHANGED
|
@@ -22,11 +22,11 @@ export function extractTableContent(pageObj, layoutObj) {
|
|
|
22
22
|
|
|
23
23
|
// TODO: This currently creates junk rows with only punctuation, as those bounding boxes are so small they often do not overlap with other lines.
|
|
24
24
|
/**
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
25
|
+
* Extracts words from a page that are within the bounding boxes of the table, organized into arrays of rows and columns.
|
|
26
|
+
* The output is in the form of a 3D array, where the first dimension is the row, the second dimension is the column, and the third dimension is the word.
|
|
27
|
+
* @param {OcrPage} pageObj
|
|
28
|
+
* @param {Array<import('./objects/layoutObjects.js').LayoutBoxBase>} boxes
|
|
29
|
+
*/
|
|
30
30
|
export function extractSingleTableContent(pageObj, boxes) {
|
|
31
31
|
/** @type {Array<OcrWord>} */
|
|
32
32
|
const wordArr = [];
|
package/js/fontContainerMain.js
CHANGED
|
@@ -24,103 +24,145 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
|
|
|
24
24
|
let /** @type {Promise<ArrayBuffer>} */carlitoNormal;
|
|
25
25
|
let /** @type {Promise<ArrayBuffer>} */carlitoItalic;
|
|
26
26
|
let /** @type {Promise<ArrayBuffer>} */carlitoBold;
|
|
27
|
+
let /** @type {Promise<ArrayBuffer>} */carlitoBoldItalic;
|
|
27
28
|
let /** @type {Promise<ArrayBuffer>} */centuryNormal;
|
|
28
29
|
let /** @type {Promise<ArrayBuffer>} */centuryItalic;
|
|
29
30
|
let /** @type {Promise<ArrayBuffer>} */centuryBold;
|
|
31
|
+
let /** @type {Promise<ArrayBuffer>} */centuryBoldItalic;
|
|
30
32
|
let /** @type {Promise<ArrayBuffer>} */garamondNormal;
|
|
31
33
|
let /** @type {Promise<ArrayBuffer>} */garamondItalic;
|
|
32
34
|
let /** @type {Promise<ArrayBuffer>} */garamondBold;
|
|
35
|
+
let /** @type {Promise<ArrayBuffer>} */garamondBoldItalic;
|
|
33
36
|
let /** @type {Promise<ArrayBuffer>} */palatinoNormal;
|
|
34
37
|
let /** @type {Promise<ArrayBuffer>} */palatinoItalic;
|
|
35
38
|
let /** @type {Promise<ArrayBuffer>} */palatinoBold;
|
|
39
|
+
let /** @type {Promise<ArrayBuffer>} */palatinoBoldItalic;
|
|
36
40
|
let /** @type {Promise<ArrayBuffer>} */nimbusRomanNormal;
|
|
37
41
|
let /** @type {Promise<ArrayBuffer>} */nimbusRomanItalic;
|
|
38
42
|
let /** @type {Promise<ArrayBuffer>} */nimbusRomanBold;
|
|
43
|
+
let /** @type {Promise<ArrayBuffer>} */nimbusRomanBoldItalic;
|
|
39
44
|
let /** @type {Promise<ArrayBuffer>} */nimbusSansNormal;
|
|
40
45
|
let /** @type {Promise<ArrayBuffer>} */nimbusSansItalic;
|
|
41
46
|
let /** @type {Promise<ArrayBuffer>} */nimbusSansBold;
|
|
47
|
+
let /** @type {Promise<ArrayBuffer>} */nimbusSansBoldItalic;
|
|
42
48
|
let /** @type {Promise<ArrayBuffer>} */nimbusMonoNormal;
|
|
43
49
|
let /** @type {Promise<ArrayBuffer>} */nimbusMonoItalic;
|
|
44
50
|
let /** @type {Promise<ArrayBuffer>} */nimbusMonoBold;
|
|
51
|
+
let /** @type {Promise<ArrayBuffer>} */nimbusMonoBoldItalic;
|
|
45
52
|
if (typeof process === 'undefined') {
|
|
46
53
|
if (glyphSet === 'latin') {
|
|
47
54
|
carlitoNormal = fetch(new URL('../fonts/latin/Carlito-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
48
55
|
carlitoItalic = fetch(new URL('../fonts/latin/Carlito-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
49
56
|
carlitoBold = fetch(new URL('../fonts/latin/Carlito-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
57
|
+
carlitoBoldItalic = fetch(new URL('../fonts/latin/Carlito-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
50
58
|
centuryNormal = fetch(new URL('../fonts/latin/Century-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
51
59
|
centuryItalic = fetch(new URL('../fonts/latin/Century-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
52
60
|
centuryBold = fetch(new URL('../fonts/latin/Century-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
61
|
+
centuryBoldItalic = fetch(new URL('../fonts/latin/Century-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
53
62
|
garamondNormal = fetch(new URL('../fonts/latin/Garamond-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
54
63
|
garamondItalic = fetch(new URL('../fonts/latin/Garamond-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
55
64
|
garamondBold = fetch(new URL('../fonts/latin/Garamond-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
65
|
+
garamondBoldItalic = fetch(new URL('../fonts/latin/Garamond-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
56
66
|
palatinoNormal = fetch(new URL('../fonts/latin/Palatino-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
57
67
|
palatinoItalic = fetch(new URL('../fonts/latin/Palatino-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
58
68
|
palatinoBold = fetch(new URL('../fonts/latin/Palatino-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
69
|
+
palatinoBoldItalic = fetch(new URL('../fonts/latin/Palatino-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
59
70
|
nimbusRomanNormal = fetch(new URL('../fonts/latin/NimbusRoman-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
60
71
|
nimbusRomanItalic = fetch(new URL('../fonts/latin/NimbusRoman-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
61
72
|
nimbusRomanBold = fetch(new URL('../fonts/latin/NimbusRoman-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
73
|
+
nimbusRomanBoldItalic = fetch(new URL('../fonts/latin/NimbusRoman-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
62
74
|
nimbusSansNormal = fetch(new URL('../fonts/latin/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
63
75
|
nimbusSansItalic = fetch(new URL('../fonts/latin/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
64
76
|
nimbusSansBold = fetch(new URL('../fonts/latin/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
77
|
+
nimbusSansBoldItalic = fetch(new URL('../fonts/latin/NimbusSans-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
65
78
|
nimbusMonoNormal = fetch(new URL('../fonts/latin/NimbusMono-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
66
79
|
nimbusMonoItalic = fetch(new URL('../fonts/latin/NimbusMono-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
67
80
|
nimbusMonoBold = fetch(new URL('../fonts/latin/NimbusMono-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
81
|
+
nimbusMonoBoldItalic = fetch(new URL('../fonts/latin/NimbusMono-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
68
82
|
} else {
|
|
69
83
|
carlitoNormal = fetch(new URL('../fonts/all/Carlito-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
70
84
|
carlitoItalic = fetch(new URL('../fonts/all/Carlito-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
71
85
|
carlitoBold = fetch(new URL('../fonts/all/Carlito-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
86
|
+
carlitoBoldItalic = fetch(new URL('../fonts/all/Carlito-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
72
87
|
centuryNormal = fetch(new URL('../fonts/all/Century-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
73
88
|
centuryItalic = fetch(new URL('../fonts/all/Century-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
74
89
|
centuryBold = fetch(new URL('../fonts/all/Century-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
90
|
+
centuryBoldItalic = fetch(new URL('../fonts/all/Century-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
75
91
|
garamondNormal = fetch(new URL('../fonts/all/Garamond-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
76
92
|
garamondItalic = fetch(new URL('../fonts/all/Garamond-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
77
93
|
garamondBold = fetch(new URL('../fonts/all/Garamond-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
94
|
+
garamondBoldItalic = fetch(new URL('../fonts/all/Garamond-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
78
95
|
palatinoNormal = fetch(new URL('../fonts/all/Palatino-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
79
96
|
palatinoItalic = fetch(new URL('../fonts/all/Palatino-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
80
97
|
palatinoBold = fetch(new URL('../fonts/all/Palatino-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
98
|
+
palatinoBoldItalic = fetch(new URL('../fonts/all/Palatino-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
81
99
|
nimbusRomanNormal = fetch(new URL('../fonts/all/NimbusRoman-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
82
100
|
nimbusRomanItalic = fetch(new URL('../fonts/all/NimbusRoman-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
83
101
|
nimbusRomanBold = fetch(new URL('../fonts/all/NimbusRoman-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
102
|
+
nimbusRomanBoldItalic = fetch(new URL('../fonts/all/NimbusRoman-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
84
103
|
nimbusSansNormal = fetch(new URL('../fonts/all/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
85
104
|
nimbusSansItalic = fetch(new URL('../fonts/all/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
86
105
|
nimbusSansBold = fetch(new URL('../fonts/all/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
106
|
+
nimbusSansBoldItalic = fetch(new URL('../fonts/all/NimbusSans-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
87
107
|
nimbusMonoNormal = fetch(new URL('../fonts/all/NimbusMono-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
88
108
|
nimbusMonoItalic = fetch(new URL('../fonts/all/NimbusMono-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
89
109
|
nimbusMonoBold = fetch(new URL('../fonts/all/NimbusMono-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
110
|
+
nimbusMonoBoldItalic = fetch(new URL('../fonts/all/NimbusMono-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
|
|
90
111
|
}
|
|
91
112
|
} else {
|
|
92
113
|
const { readFile } = await import('fs/promises');
|
|
93
114
|
carlitoNormal = readFile(new URL('../fonts/all/Carlito-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
94
115
|
carlitoItalic = readFile(new URL('../fonts/all/Carlito-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
95
116
|
carlitoBold = readFile(new URL('../fonts/all/Carlito-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
117
|
+
carlitoBoldItalic = readFile(new URL('../fonts/all/Carlito-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
96
118
|
centuryNormal = readFile(new URL('../fonts/all/Century-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
97
119
|
centuryItalic = readFile(new URL('../fonts/all/Century-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
98
120
|
centuryBold = readFile(new URL('../fonts/all/Century-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
121
|
+
centuryBoldItalic = readFile(new URL('../fonts/all/Century-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
99
122
|
garamondNormal = readFile(new URL('../fonts/all/Garamond-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
100
123
|
garamondItalic = readFile(new URL('../fonts/all/Garamond-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
101
124
|
garamondBold = readFile(new URL('../fonts/all/Garamond-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
125
|
+
garamondBoldItalic = readFile(new URL('../fonts/all/Garamond-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
102
126
|
palatinoNormal = readFile(new URL('../fonts/all/Palatino-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
103
127
|
palatinoItalic = readFile(new URL('../fonts/all/Palatino-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
104
128
|
palatinoBold = readFile(new URL('../fonts/all/Palatino-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
129
|
+
palatinoBoldItalic = readFile(new URL('../fonts/all/Palatino-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
105
130
|
nimbusRomanNormal = readFile(new URL('../fonts/all/NimbusRoman-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
106
131
|
nimbusRomanItalic = readFile(new URL('../fonts/all/NimbusRoman-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
107
132
|
nimbusRomanBold = readFile(new URL('../fonts/all/NimbusRoman-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
133
|
+
nimbusRomanBoldItalic = readFile(new URL('../fonts/all/NimbusRoman-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
108
134
|
nimbusSansNormal = readFile(new URL('../fonts/all/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
109
135
|
nimbusSansItalic = readFile(new URL('../fonts/all/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
110
136
|
nimbusSansBold = readFile(new URL('../fonts/all/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
137
|
+
nimbusSansBoldItalic = readFile(new URL('../fonts/all/NimbusSans-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
111
138
|
nimbusMonoNormal = readFile(new URL('../fonts/all/NimbusMono-Regular.woff', import.meta.url)).then((res) => res.buffer);
|
|
112
139
|
nimbusMonoItalic = readFile(new URL('../fonts/all/NimbusMono-Italic.woff', import.meta.url)).then((res) => res.buffer);
|
|
113
140
|
nimbusMonoBold = readFile(new URL('../fonts/all/NimbusMono-Bold.woff', import.meta.url)).then((res) => res.buffer);
|
|
141
|
+
nimbusMonoBoldItalic = readFile(new URL('../fonts/all/NimbusMono-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
|
|
114
142
|
}
|
|
115
143
|
|
|
116
144
|
const srcObj = {
|
|
117
|
-
Carlito: {
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
145
|
+
Carlito: {
|
|
146
|
+
normal: await carlitoNormal, italic: await carlitoItalic, bold: await carlitoBold, boldItalic: await carlitoBoldItalic,
|
|
147
|
+
},
|
|
148
|
+
Century: {
|
|
149
|
+
normal: await centuryNormal, italic: await centuryItalic, bold: await centuryBold, boldItalic: await centuryBoldItalic,
|
|
150
|
+
},
|
|
151
|
+
Garamond: {
|
|
152
|
+
normal: await garamondNormal, italic: await garamondItalic, bold: await garamondBold, boldItalic: await garamondBoldItalic,
|
|
153
|
+
},
|
|
154
|
+
Palatino: {
|
|
155
|
+
normal: await palatinoNormal, italic: await palatinoItalic, bold: await palatinoBold, boldItalic: await palatinoBoldItalic,
|
|
156
|
+
},
|
|
157
|
+
NimbusRoman: {
|
|
158
|
+
normal: await nimbusRomanNormal, italic: await nimbusRomanItalic, bold: await nimbusRomanBold, boldItalic: await nimbusRomanBoldItalic,
|
|
159
|
+
},
|
|
160
|
+
NimbusSans: {
|
|
161
|
+
normal: await nimbusSansNormal, italic: await nimbusSansItalic, bold: await nimbusSansBold, boldItalic: await nimbusSansBoldItalic,
|
|
162
|
+
},
|
|
163
|
+
NimbusMono: {
|
|
164
|
+
normal: await nimbusMonoNormal, italic: await nimbusMonoItalic, bold: await nimbusMonoBold, boldItalic: await nimbusMonoBoldItalic,
|
|
165
|
+
},
|
|
124
166
|
};
|
|
125
167
|
|
|
126
168
|
FontCont.raw = await /** @type {FontContainer} */(/** @type {any} */(loadFontsFromSource(srcObj)));
|
|
@@ -217,6 +259,7 @@ export async function updateFontContWorkerMain(params = {}) {
|
|
|
217
259
|
};
|
|
218
260
|
if (value.italic) input.src[key].italic = value.italic.src;
|
|
219
261
|
if (value.bold) input.src[key].bold = value.bold.src;
|
|
262
|
+
if (value.boldItalic) input.src[key].boldItalic = value.boldItalic.src;
|
|
220
263
|
}
|
|
221
264
|
|
|
222
265
|
for (let i = 0; i < gs.schedulerInner.workers.length; i++) {
|
package/js/fontStatistics.js
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
import {
|
|
5
5
|
determineSansSerif,
|
|
6
|
+
getStyleLookup,
|
|
6
7
|
quantile,
|
|
7
8
|
replaceObjectProperties,
|
|
8
9
|
round6,
|
|
@@ -243,13 +244,13 @@ function calcFontMetricsPage(pageObj) {
|
|
|
243
244
|
|
|
244
245
|
for (const lineObj of pageObj.lines) {
|
|
245
246
|
for (const wordObj of lineObj.words) {
|
|
246
|
-
const wordFontFamily = determineSansSerif(wordObj.font) || 'Default';
|
|
247
|
+
const wordFontFamily = determineSansSerif(wordObj.style.font) || 'Default';
|
|
247
248
|
|
|
248
249
|
// This condition should not occur; however, it has in the past due to parsing bugs. Skip the word to avoid crashing the entire program if it occurs again.
|
|
249
250
|
if (wordObj.chars && wordObj.chars.length !== wordObj.text.length) continue;
|
|
250
251
|
|
|
251
252
|
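// Do not include low-confidence words, `chi_sim` words, superscripts, and small caps in statistics for font optimization. NOTE(review): the condition below does not test `style.dropcap` — confirm whether drop caps are excluded elsewhere.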
|
|
252
|
-
if (wordObj.conf < 80 || wordObj.lang === 'chi_sim' || wordObj.sup || wordObj.smallCaps) continue;
|
|
253
|
+
if (wordObj.conf < 80 || wordObj.lang === 'chi_sim' || wordObj.style.sup || wordObj.style.smallCaps) continue;
|
|
253
254
|
/** @type {Object.<string, FontMetricsRawFamily>} */
|
|
254
255
|
const fontMetricsRawLine = {};
|
|
255
256
|
|
|
@@ -275,14 +276,18 @@ function calcFontMetricsPage(pageObj) {
|
|
|
275
276
|
fontMetricsRawLine[wordFontFamily] = new FontMetricsRawFamily();
|
|
276
277
|
}
|
|
277
278
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
279
|
+
const styleLookup = getStyleLookup(wordObj.style);
|
|
280
|
+
|
|
281
|
+
if (!['normal', 'italic', 'bold'].includes(styleLookup)) continue;
|
|
282
|
+
|
|
283
|
+
if (!fontMetricsRawLine[wordFontFamily][styleLookup].width[charUnicode]) {
|
|
284
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].width[charUnicode] = [];
|
|
285
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].height[charUnicode] = [];
|
|
281
286
|
}
|
|
282
287
|
|
|
283
|
-
fontMetricsRawLine[wordFontFamily][
|
|
284
|
-
fontMetricsRawLine[wordFontFamily][
|
|
285
|
-
fontMetricsRawLine[wordFontFamily][
|
|
288
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].width[charUnicode].push(charWidth / charNorm);
|
|
289
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].height[charUnicode].push(charHeight / charNorm);
|
|
290
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].obs += 1;
|
|
286
291
|
|
|
287
292
|
if (k + 1 < wordObj.chars.length) {
|
|
288
293
|
const charObjNext = wordObj.chars[k + 1];
|
|
@@ -295,12 +300,12 @@ function calcFontMetricsPage(pageObj) {
|
|
|
295
300
|
if (trailingSpace + charWidthNext > 0) {
|
|
296
301
|
const bigramUnicode = `${charUnicode},${wordObj.chars[k + 1].text.charCodeAt(0)}`;
|
|
297
302
|
|
|
298
|
-
if (!fontMetricsRawLine[wordFontFamily][
|
|
299
|
-
fontMetricsRawLine[wordFontFamily][
|
|
300
|
-
fontMetricsRawLine[wordFontFamily][
|
|
303
|
+
if (!fontMetricsRawLine[wordFontFamily][styleLookup].kerning[bigramUnicode]) {
|
|
304
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].kerning[bigramUnicode] = [];
|
|
305
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].kerning2[bigramUnicode] = [];
|
|
301
306
|
}
|
|
302
|
-
fontMetricsRawLine[wordFontFamily][
|
|
303
|
-
fontMetricsRawLine[wordFontFamily][
|
|
307
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].kerning[bigramUnicode].push(trailingSpace / charNorm);
|
|
308
|
+
fontMetricsRawLine[wordFontFamily][styleLookup].kerning2[bigramUnicode].push((trailingSpace + charWidthNext) / charNorm);
|
|
304
309
|
}
|
|
305
310
|
}
|
|
306
311
|
}
|
package/js/fontSupp.js
CHANGED
|
@@ -44,7 +44,7 @@ const calcSuppFontInfoForWords = async (words) => {
|
|
|
44
44
|
const fontSizeArr = [];
|
|
45
45
|
for (const word of wordsRes) {
|
|
46
46
|
fontSizeArr.push(calcWordFontSize(word));
|
|
47
|
-
const sansSerif = determineSansSerif(word.font);
|
|
47
|
+
const sansSerif = determineSansSerif(word.style.font);
|
|
48
48
|
if (sansSerif !== 'Default') {
|
|
49
49
|
if (sansSerif === 'SansDefault') {
|
|
50
50
|
sansVotes++;
|
|
@@ -53,9 +53,9 @@ const calcSuppFontInfoForWords = async (words) => {
|
|
|
53
53
|
}
|
|
54
54
|
}
|
|
55
55
|
}
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
fontSizeMult = quantile(fontSizeArr, 0.5) / words[0].size;
|
|
56
|
+
|
|
57
|
+
if (words[0].style.size) {
|
|
58
|
+
fontSizeMult = quantile(fontSizeArr, 0.5) / words[0].style.size;
|
|
59
59
|
}
|
|
60
60
|
|
|
61
61
|
return { sansVotes, serifVotes, fontSizeMult };
|
|
@@ -83,36 +83,36 @@ export const calcSuppFontInfo = async (ocrArr) => {
|
|
|
83
83
|
let wordFontLast;
|
|
84
84
|
let wordFontSizeLast;
|
|
85
85
|
for (const word of line.words) {
|
|
86
|
-
if (word.font) {
|
|
87
|
-
if (skipFonts.has(word.font)) {
|
|
86
|
+
if (word.style.font) {
|
|
87
|
+
if (skipFonts.has(word.style.font)) {
|
|
88
88
|
continue;
|
|
89
89
|
// Printing words off screen is a common method of hiding text in PDFs.
|
|
90
90
|
} else if (word.bbox.left < 0 || word.bbox.top < 0 || word.bbox.right > page.dims.width || word.bbox.bottom > page.dims.height) {
|
|
91
91
|
continue;
|
|
92
|
-
} else if (!calcFonts.has(word.font)) {
|
|
93
|
-
const sansSerifUnknown = determineSansSerif(word.font) === 'Default';
|
|
92
|
+
} else if (!calcFonts.has(word.style.font)) {
|
|
93
|
+
const sansSerifUnknown = determineSansSerif(word.style.font) === 'Default';
|
|
94
94
|
if (sansSerifUnknown || !word.visualCoords) {
|
|
95
|
-
calcFonts.add(word.font);
|
|
95
|
+
calcFonts.add(word.style.font);
|
|
96
96
|
} else {
|
|
97
|
-
skipFonts.add(word.font);
|
|
97
|
+
skipFonts.add(word.style.font);
|
|
98
98
|
continue;
|
|
99
99
|
}
|
|
100
100
|
}
|
|
101
101
|
|
|
102
|
-
if (!fontExamples[word.font]) {
|
|
103
|
-
fontExamples[word.font] = [];
|
|
104
|
-
} else if (fontExamples[word.font].length > 3) {
|
|
102
|
+
if (!fontExamples[word.style.font]) {
|
|
103
|
+
fontExamples[word.style.font] = [];
|
|
104
|
+
} else if (fontExamples[word.style.font].length > 3) {
|
|
105
105
|
continue;
|
|
106
106
|
}
|
|
107
107
|
|
|
108
|
-
if (word.font !== wordFontLast || word.size !== wordFontSizeLast) {
|
|
109
|
-
fontExamples[word.font].push([word]);
|
|
108
|
+
if (word.style.font !== wordFontLast || word.style.size !== wordFontSizeLast) {
|
|
109
|
+
fontExamples[word.style.font].push([word]);
|
|
110
110
|
} else {
|
|
111
|
-
fontExamples[word.font][fontExamples[word.font].length - 1].push(word);
|
|
111
|
+
fontExamples[word.style.font][fontExamples[word.style.font].length - 1].push(word);
|
|
112
112
|
}
|
|
113
113
|
|
|
114
|
-
wordFontLast = word.font;
|
|
115
|
-
wordFontSizeLast = word.size;
|
|
114
|
+
wordFontLast = word.style.font;
|
|
115
|
+
wordFontSizeLast = word.style.size;
|
|
116
116
|
}
|
|
117
117
|
}
|
|
118
118
|
}
|
|
@@ -158,8 +158,8 @@ export const calcSuppFontInfo = async (ocrArr) => {
|
|
|
158
158
|
for (const page of ocrArr) {
|
|
159
159
|
for (const line of page.lines) {
|
|
160
160
|
for (const word of line.words) {
|
|
161
|
-
if (word.font && word.size && FontProps.sizeMult[word.font]) {
|
|
162
|
-
word.size = Math.round(word.size * FontProps.sizeMult[word.font] * 1000) / 1000;
|
|
161
|
+
if (word.style.font && word.style.size && FontProps.sizeMult[word.style.font]) {
|
|
162
|
+
word.style.size = Math.round(word.style.size * FontProps.sizeMult[word.style.font] * 1000) / 1000;
|
|
163
163
|
}
|
|
164
164
|
}
|
|
165
165
|
}
|
package/js/global.d.ts
CHANGED
|
@@ -1,5 +1,18 @@
|
|
|
1
1
|
declare global {
|
|
2
2
|
|
|
3
|
+
type Style = {
|
|
4
|
+
font: ?string;
|
|
5
|
+
size: ?number;
|
|
6
|
+
bold: boolean;
|
|
7
|
+
italic: boolean;
|
|
8
|
+
underline: boolean;
|
|
9
|
+
smallCaps: boolean;
|
|
10
|
+
sup: boolean;
|
|
11
|
+
dropcap: boolean;
|
|
12
|
+
};
|
|
13
|
+
|
|
14
|
+
type StyleLookup = ('normal'|'bold'|'italic'|'boldItalic');
|
|
15
|
+
|
|
3
16
|
// OCR objects
|
|
4
17
|
type OcrPage = import("./objects/ocrObjects.js").OcrPage;
|
|
5
18
|
type OcrLine = import("./objects/ocrObjects.js").OcrLine;
|
|
@@ -17,12 +30,14 @@ declare global {
|
|
|
17
30
|
normal: FontContainerFont;
|
|
18
31
|
italic: FontContainerFont;
|
|
19
32
|
bold: FontContainerFont;
|
|
33
|
+
boldItalic: FontContainerFont;
|
|
20
34
|
};
|
|
21
35
|
|
|
22
36
|
type FontContainerFamilyUpload = {
|
|
23
37
|
normal: FontContainerFont | null;
|
|
24
38
|
italic: FontContainerFont | null;
|
|
25
39
|
bold: FontContainerFont | null;
|
|
40
|
+
boldItalic: FontContainerFont | null;
|
|
26
41
|
};
|
|
27
42
|
|
|
28
43
|
type FontContainerFamily = FontContainerFamilyBuiltIn | FontContainerFamilyUpload;
|
|
@@ -42,12 +57,14 @@ declare global {
|
|
|
42
57
|
normal: ArrayBuffer;
|
|
43
58
|
italic: ArrayBuffer;
|
|
44
59
|
bold: ArrayBuffer;
|
|
60
|
+
boldItalic: ArrayBuffer;
|
|
45
61
|
};
|
|
46
62
|
|
|
47
63
|
type fontSrcUpload = {
|
|
48
64
|
normal: ArrayBuffer | null;
|
|
49
65
|
italic: ArrayBuffer | null;
|
|
50
66
|
bold: ArrayBuffer | null;
|
|
67
|
+
boldItalic: ArrayBuffer | null;
|
|
51
68
|
};
|
|
52
69
|
|
|
53
70
|
type opentypeFont = import("../lib/opentype.module.js").Font;
|
|
@@ -171,8 +171,18 @@ export async function convertPageAbbyy({ ocrStr, n }) {
|
|
|
171
171
|
/** @type {Array<Array<OcrChar>>} */
|
|
172
172
|
const charObjArrLine = Array(wordStrArr.length);
|
|
173
173
|
text = text.fill('');
|
|
174
|
-
|
|
175
|
-
|
|
174
|
+
|
|
175
|
+
/** @type {Array<boolean>} */
|
|
176
|
+
const italicArr = Array(wordStrArr.length).fill(false);
|
|
177
|
+
/** @type {Array<boolean>} */
|
|
178
|
+
const boldArr = Array(wordStrArr.length).fill(false);
|
|
179
|
+
/** @type {Array<boolean>} */
|
|
180
|
+
const underlineArr = Array(wordStrArr.length).fill(false);
|
|
181
|
+
/** @type {Array<boolean>} */
|
|
182
|
+
const supArr = Array(wordStrArr.length).fill(false);
|
|
183
|
+
/** @type {Array<boolean>} */
|
|
184
|
+
const dropcapArr = Array(wordStrArr.length).fill(false);
|
|
185
|
+
|
|
176
186
|
/** @type {Array<boolean>} */
|
|
177
187
|
const smallCapsArr = Array(wordStrArr.length).fill(false);
|
|
178
188
|
/** @type {Array<boolean>} */
|
|
@@ -184,28 +194,32 @@ export async function convertPageAbbyy({ ocrStr, n }) {
|
|
|
184
194
|
|
|
185
195
|
if (typeof (letterArr[0][1]) !== 'undefined') {
|
|
186
196
|
if (dropCap && i === 0) {
|
|
187
|
-
|
|
197
|
+
dropcapArr[i] = true;
|
|
188
198
|
} else if (/superscript=['"](1|true)/i.test(letterArr[0][1])) {
|
|
189
|
-
|
|
190
|
-
} else if (/italic=['"](1|true)/i.test(letterArr[0][1])) {
|
|
191
|
-
styleArr[i] = 'italic';
|
|
192
|
-
stylesLine.italic = true;
|
|
199
|
+
supArr[i] = true;
|
|
193
200
|
} else {
|
|
194
|
-
|
|
195
|
-
|
|
201
|
+
if (/italic=['"](1|true)/i.test(letterArr[0][1])) {
|
|
202
|
+
italicArr[i] = true;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
if (/bold=['"](1|true)/i.test(letterArr[0][1])) {
|
|
206
|
+
boldArr[i] = true;
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
if (/underline=['"](1|true)/i.test(letterArr[0][1])) {
|
|
210
|
+
underlineArr[i] = true;
|
|
211
|
+
}
|
|
196
212
|
}
|
|
197
213
|
|
|
198
214
|
if (/smallcaps=['"](1|true)/i.test(letterArr[0][1])) {
|
|
199
215
|
smallCapsArr[i] = true;
|
|
200
216
|
}
|
|
201
|
-
} else if (i > 0) {
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
smallCapsArr[i] = smallCapsArr[i - 1];
|
|
208
|
-
}
|
|
217
|
+
} else if (i > 0 && !dropcapArr[i - 1]) {
|
|
218
|
+
italicArr[i] = italicArr[i - 1];
|
|
219
|
+
boldArr[i] = boldArr[i - 1];
|
|
220
|
+
underlineArr[i] = underlineArr[i - 1];
|
|
221
|
+
supArr[i] = supArr[i - 1];
|
|
222
|
+
smallCapsArr[i] = smallCapsArr[i - 1];
|
|
209
223
|
}
|
|
210
224
|
|
|
211
225
|
// Abbyy will sometimes misidentify capital letters immediately following drop caps as small caps,
|
|
@@ -372,18 +386,26 @@ export async function convertPageAbbyy({ ocrStr, n }) {
|
|
|
372
386
|
|
|
373
387
|
console.assert(wordObj.chars.length === text[i].length, `Likely parsing error for word: ${id}. Number of letters in text does not match number of \`ocrChar\` objects.`);
|
|
374
388
|
|
|
375
|
-
if (
|
|
376
|
-
wordObj.style =
|
|
389
|
+
if (italicArr[i]) {
|
|
390
|
+
wordObj.style.italic = true;
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
if (boldArr[i]) {
|
|
394
|
+
wordObj.style.bold = true;
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
if (underlineArr[i]) {
|
|
398
|
+
wordObj.style.underline = true;
|
|
377
399
|
}
|
|
378
400
|
|
|
379
|
-
wordObj.smallCaps = smallCapsArr[i];
|
|
401
|
+
wordObj.style.smallCaps = smallCapsArr[i];
|
|
380
402
|
|
|
381
|
-
if (fontName) wordObj.font = fontName;
|
|
403
|
+
if (fontName) wordObj.style.font = fontName;
|
|
382
404
|
|
|
383
|
-
if (
|
|
384
|
-
wordObj.sup = true;
|
|
385
|
-
} else if (
|
|
386
|
-
wordObj.dropcap = true;
|
|
405
|
+
if (supArr[i]) {
|
|
406
|
+
wordObj.style.sup = true;
|
|
407
|
+
} else if (dropcapArr[i]) {
|
|
408
|
+
wordObj.style.dropcap = true;
|
|
387
409
|
}
|
|
388
410
|
|
|
389
411
|
lineObj.words.push(wordObj);
|
|
@@ -140,11 +140,11 @@ export async function convertPageBlocks({
|
|
|
140
140
|
// The `word` object has a `is_italic` property, but it is always false.
|
|
141
141
|
// Therefore, the font name is checked to determine if the word is italic.
|
|
142
142
|
// See: https://github.com/naptha/tesseract.js/issues/907
|
|
143
|
-
if (keepItalic && /italic/i.test(word.font_name)) wordObj.style =
|
|
143
|
+
if (keepItalic && /italic/i.test(word.font_name)) wordObj.style.italic = true;
|
|
144
144
|
|
|
145
145
|
// Our fork of Tesseract Legacy should be able to recognize fonts, so this information is included.
|
|
146
146
|
// The generic HOCR importer does not include font information, as this is assumed to be unreliable.
|
|
147
|
-
wordObj.font = word.font_name;
|
|
147
|
+
wordObj.style.font = word.font_name;
|
|
148
148
|
|
|
149
149
|
wordObj.chars = [];
|
|
150
150
|
for (let m = 0; m < word.symbols.length; m++) {
|
|
@@ -247,8 +247,8 @@ export async function convertPageHocr({
|
|
|
247
247
|
|
|
248
248
|
if (debugMode) wordObj.raw = match;
|
|
249
249
|
|
|
250
|
-
if (italic) wordObj.style =
|
|
251
|
-
if (fontName) wordObj.font = fontName;
|
|
250
|
+
if (italic) wordObj.style.italic = true;
|
|
251
|
+
if (fontName) wordObj.style.font = fontName;
|
|
252
252
|
|
|
253
253
|
wordObj.conf = wordConf;
|
|
254
254
|
|
|
@@ -302,19 +302,6 @@ export async function convertPageHocr({
|
|
|
302
302
|
|
|
303
303
|
const styleStr = match.match(/style=['"]([^'"]+)/)?.[1];
|
|
304
304
|
|
|
305
|
-
let smallCaps = false;
|
|
306
|
-
/** @type {('normal'|'italic'|'bold')} */
|
|
307
|
-
let fontStyle = 'normal';
|
|
308
|
-
if (styleStr && /italic/i.test(styleStr)) {
|
|
309
|
-
fontStyle = 'italic';
|
|
310
|
-
} else if (styleStr && /bold/i.test(styleStr)) {
|
|
311
|
-
fontStyle = 'bold';
|
|
312
|
-
}
|
|
313
|
-
|
|
314
|
-
if (styleStr && /small-caps/i.test(styleStr)) {
|
|
315
|
-
smallCaps = true;
|
|
316
|
-
}
|
|
317
|
-
|
|
318
305
|
const confMatch = titleStrWord.match(/(?:;|\s)x_wconf\s+(\d+)/)?.[1] || '0';
|
|
319
306
|
const wordConf = parseInt(confMatch) || 0;
|
|
320
307
|
|
|
@@ -327,16 +314,19 @@ export async function convertPageHocr({
|
|
|
327
314
|
const wordFontSizeStr = titleStrWord.match(/(?:;|\s)x_fsize\s+(\d+)/)?.[1];
|
|
328
315
|
if (wordFontSizeStr) {
|
|
329
316
|
const wordFontSize = parseInt(wordFontSizeStr);
|
|
330
|
-
if (wordFontSize) wordObj.size = wordFontSize;
|
|
317
|
+
if (wordFontSize) wordObj.style.size = wordFontSize;
|
|
331
318
|
}
|
|
332
319
|
}
|
|
333
320
|
|
|
334
|
-
|
|
335
|
-
|
|
321
|
+
if (styleStr) {
|
|
322
|
+
if (/italic/i.test(styleStr)) wordObj.style.italic = true;
|
|
323
|
+
if (/bold/i.test(styleStr)) wordObj.style.bold = true;
|
|
324
|
+
if (/small-caps/i.test(styleStr)) wordObj.style.smallCaps = true;
|
|
325
|
+
}
|
|
336
326
|
|
|
337
|
-
wordObj.sup =
|
|
327
|
+
if (wordSup) wordObj.style.sup = true;
|
|
338
328
|
|
|
339
|
-
wordObj.
|
|
329
|
+
if (fontName) wordObj.style.font = fontName;
|
|
340
330
|
|
|
341
331
|
wordObj.conf = wordConf;
|
|
342
332
|
|