scribe.js-ocr 0.7.3 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/fonts/all/Carlito-BoldItalic.woff +0 -0
  2. package/fonts/all/Century-BoldItalic.woff +0 -0
  3. package/fonts/all/Garamond-BoldItalic.woff +0 -0
  4. package/fonts/all/NimbusMono-BoldItalic.woff +0 -0
  5. package/fonts/all/NimbusRoman-BoldItalic.woff +0 -0
  6. package/fonts/all/NimbusSans-BoldItalic.woff +0 -0
  7. package/fonts/all/Palatino-BoldItalic.woff +0 -0
  8. package/fonts/latin/Carlito-BoldItalic.woff +0 -0
  9. package/fonts/latin/Century-BoldItalic.woff +0 -0
  10. package/fonts/latin/Garamond-BoldItalic.woff +0 -0
  11. package/fonts/latin/NimbusMono-BoldItalic.woff +0 -0
  12. package/fonts/latin/NimbusRoman-BoldItalic.woff +0 -0
  13. package/fonts/latin/NimbusSans-BoldItalic.woff +0 -0
  14. package/fonts/latin/Palatino-BoldItalic.woff +0 -0
  15. package/js/containers/app.js +1 -1
  16. package/js/containers/fontContainer.js +42 -40
  17. package/js/export/writeHocr.js +15 -13
  18. package/js/export/writeHtml.js +1 -1
  19. package/js/export/writePdf.js +52 -14
  20. package/js/export/writePdfFonts.js +11 -9
  21. package/js/export/writeTabular.js +2 -2
  22. package/js/export/writeText.js +10 -6
  23. package/js/extractTables.js +5 -5
  24. package/js/fontContainerMain.js +50 -7
  25. package/js/fontStatistics.js +18 -13
  26. package/js/fontSupp.js +20 -20
  27. package/js/global.d.ts +17 -0
  28. package/js/import/convertPageAbbyy.js +47 -25
  29. package/js/import/convertPageBlocks.js +2 -2
  30. package/js/import/convertPageHocr.js +10 -20
  31. package/js/import/convertPageShared.js +13 -9
  32. package/js/import/convertPageStext.js +66 -31
  33. package/js/objects/ocrObjects.js +13 -19
  34. package/js/utils/fontUtils.js +11 -11
  35. package/js/utils/miscUtils.js +16 -0
  36. package/js/worker/compareOCRModule.js +13 -16
  37. package/js/worker/optimizeFontModule.js +4 -4
  38. package/mupdf/libmupdf.js +123 -17
  39. package/mupdf/libmupdf.wasm +0 -0
  40. package/package.json +1 -1
@@ -54,17 +54,21 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
54
54
 
55
55
  if (docxMode) {
56
56
  let fontStyle = '';
57
- if (wordObj.style === 'italic') {
57
+ if (wordObj.style.italic) {
58
58
  fontStyle += '<w:i/>';
59
- } else if (wordObj.style === 'bold') {
59
+ } else if (wordObj.style.bold) {
60
60
  fontStyle += '<w:b/>';
61
61
  }
62
62
 
63
- if (wordObj.smallCaps) {
63
+ if (wordObj.style.smallCaps) {
64
64
  fontStyle += '<w:smallCaps/>';
65
65
  }
66
66
 
67
- if (wordObj.sup) {
67
+ if (wordObj.style.underline) {
68
+ fontStyle += '<w:u w:val="single"/>';
69
+ }
70
+
71
+ if (wordObj.style.sup) {
68
72
  fontStyle += '<w:vertAlign w:val="superscript"/>';
69
73
  }
70
74
 
@@ -79,7 +83,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
79
83
  } else if (supPrev) {
80
84
  textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve"> `;
81
85
  // If this word is a superscript, no space is added between words.
82
- } else if (wordObj.sup && i > 0) {
86
+ } else if (wordObj.style.sup && i > 0) {
83
87
  textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
84
88
  } else {
85
89
  textStr = `${textStr} </w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
@@ -89,7 +93,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
89
93
  }
90
94
 
91
95
  fontStylePrev = fontStyle;
92
- supPrev = wordObj.sup;
96
+ supPrev = wordObj.style.sup;
93
97
  } else if (newLine) {
94
98
  textStr = `${textStr}\n`;
95
99
  } else if (h > 0 || g > 0 || i > 0) {
@@ -22,11 +22,11 @@ export function extractTableContent(pageObj, layoutObj) {
22
22
 
23
23
  // TODO: This currently creates junk rows with only punctuation, as those bounding boxes are so small they often do not overlap with other lines.
24
24
  /**
25
- * Extracts words from a page that are within the bounding boxes of the table, organized into arrays of rows and columns.
26
- * The output is in the form of a 3D array, where the first dimension is the row, the second dimension is the column, and the third dimension is the word.
27
- * @param {OcrPage} pageObj
28
- * @param {Array<import('./objects/layoutObjects.js').LayoutBoxBase>} boxes
29
- */
25
+ * Extracts words from a page that are within the bounding boxes of the table, organized into arrays of rows and columns.
26
+ * The output is in the form of a 3D array, where the first dimension is the row, the second dimension is the column, and the third dimension is the word.
27
+ * @param {OcrPage} pageObj
28
+ * @param {Array<import('./objects/layoutObjects.js').LayoutBoxBase>} boxes
29
+ */
30
30
  export function extractSingleTableContent(pageObj, boxes) {
31
31
  /** @type {Array<OcrWord>} */
32
32
  const wordArr = [];
@@ -24,103 +24,145 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
24
24
  let /** @type {Promise<ArrayBuffer>} */carlitoNormal;
25
25
  let /** @type {Promise<ArrayBuffer>} */carlitoItalic;
26
26
  let /** @type {Promise<ArrayBuffer>} */carlitoBold;
27
+ let /** @type {Promise<ArrayBuffer>} */carlitoBoldItalic;
27
28
  let /** @type {Promise<ArrayBuffer>} */centuryNormal;
28
29
  let /** @type {Promise<ArrayBuffer>} */centuryItalic;
29
30
  let /** @type {Promise<ArrayBuffer>} */centuryBold;
31
+ let /** @type {Promise<ArrayBuffer>} */centuryBoldItalic;
30
32
  let /** @type {Promise<ArrayBuffer>} */garamondNormal;
31
33
  let /** @type {Promise<ArrayBuffer>} */garamondItalic;
32
34
  let /** @type {Promise<ArrayBuffer>} */garamondBold;
35
+ let /** @type {Promise<ArrayBuffer>} */garamondBoldItalic;
33
36
  let /** @type {Promise<ArrayBuffer>} */palatinoNormal;
34
37
  let /** @type {Promise<ArrayBuffer>} */palatinoItalic;
35
38
  let /** @type {Promise<ArrayBuffer>} */palatinoBold;
39
+ let /** @type {Promise<ArrayBuffer>} */palatinoBoldItalic;
36
40
  let /** @type {Promise<ArrayBuffer>} */nimbusRomanNormal;
37
41
  let /** @type {Promise<ArrayBuffer>} */nimbusRomanItalic;
38
42
  let /** @type {Promise<ArrayBuffer>} */nimbusRomanBold;
43
+ let /** @type {Promise<ArrayBuffer>} */nimbusRomanBoldItalic;
39
44
  let /** @type {Promise<ArrayBuffer>} */nimbusSansNormal;
40
45
  let /** @type {Promise<ArrayBuffer>} */nimbusSansItalic;
41
46
  let /** @type {Promise<ArrayBuffer>} */nimbusSansBold;
47
+ let /** @type {Promise<ArrayBuffer>} */nimbusSansBoldItalic;
42
48
  let /** @type {Promise<ArrayBuffer>} */nimbusMonoNormal;
43
49
  let /** @type {Promise<ArrayBuffer>} */nimbusMonoItalic;
44
50
  let /** @type {Promise<ArrayBuffer>} */nimbusMonoBold;
51
+ let /** @type {Promise<ArrayBuffer>} */nimbusMonoBoldItalic;
45
52
  if (typeof process === 'undefined') {
46
53
  if (glyphSet === 'latin') {
47
54
  carlitoNormal = fetch(new URL('../fonts/latin/Carlito-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
48
55
  carlitoItalic = fetch(new URL('../fonts/latin/Carlito-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
49
56
  carlitoBold = fetch(new URL('../fonts/latin/Carlito-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
57
+ carlitoBoldItalic = fetch(new URL('../fonts/latin/Carlito-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
50
58
  centuryNormal = fetch(new URL('../fonts/latin/Century-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
51
59
  centuryItalic = fetch(new URL('../fonts/latin/Century-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
52
60
  centuryBold = fetch(new URL('../fonts/latin/Century-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
61
+ centuryBoldItalic = fetch(new URL('../fonts/latin/Century-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
53
62
  garamondNormal = fetch(new URL('../fonts/latin/Garamond-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
54
63
  garamondItalic = fetch(new URL('../fonts/latin/Garamond-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
55
64
  garamondBold = fetch(new URL('../fonts/latin/Garamond-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
65
+ garamondBoldItalic = fetch(new URL('../fonts/latin/Garamond-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
56
66
  palatinoNormal = fetch(new URL('../fonts/latin/Palatino-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
57
67
  palatinoItalic = fetch(new URL('../fonts/latin/Palatino-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
58
68
  palatinoBold = fetch(new URL('../fonts/latin/Palatino-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
69
+ palatinoBoldItalic = fetch(new URL('../fonts/latin/Palatino-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
59
70
  nimbusRomanNormal = fetch(new URL('../fonts/latin/NimbusRoman-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
60
71
  nimbusRomanItalic = fetch(new URL('../fonts/latin/NimbusRoman-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
61
72
  nimbusRomanBold = fetch(new URL('../fonts/latin/NimbusRoman-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
73
+ nimbusRomanBoldItalic = fetch(new URL('../fonts/latin/NimbusRoman-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
62
74
  nimbusSansNormal = fetch(new URL('../fonts/latin/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
63
75
  nimbusSansItalic = fetch(new URL('../fonts/latin/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
64
76
  nimbusSansBold = fetch(new URL('../fonts/latin/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
77
+ nimbusSansBoldItalic = fetch(new URL('../fonts/latin/NimbusSans-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
65
78
  nimbusMonoNormal = fetch(new URL('../fonts/latin/NimbusMono-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
66
79
  nimbusMonoItalic = fetch(new URL('../fonts/latin/NimbusMono-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
67
80
  nimbusMonoBold = fetch(new URL('../fonts/latin/NimbusMono-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
81
+ nimbusMonoBoldItalic = fetch(new URL('../fonts/latin/NimbusMono-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
68
82
  } else {
69
83
  carlitoNormal = fetch(new URL('../fonts/all/Carlito-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
70
84
  carlitoItalic = fetch(new URL('../fonts/all/Carlito-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
71
85
  carlitoBold = fetch(new URL('../fonts/all/Carlito-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
86
+ carlitoBoldItalic = fetch(new URL('../fonts/all/Carlito-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
72
87
  centuryNormal = fetch(new URL('../fonts/all/Century-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
73
88
  centuryItalic = fetch(new URL('../fonts/all/Century-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
74
89
  centuryBold = fetch(new URL('../fonts/all/Century-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
90
+ centuryBoldItalic = fetch(new URL('../fonts/all/Century-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
75
91
  garamondNormal = fetch(new URL('../fonts/all/Garamond-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
76
92
  garamondItalic = fetch(new URL('../fonts/all/Garamond-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
77
93
  garamondBold = fetch(new URL('../fonts/all/Garamond-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
94
+ garamondBoldItalic = fetch(new URL('../fonts/all/Garamond-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
78
95
  palatinoNormal = fetch(new URL('../fonts/all/Palatino-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
79
96
  palatinoItalic = fetch(new URL('../fonts/all/Palatino-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
80
97
  palatinoBold = fetch(new URL('../fonts/all/Palatino-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
98
+ palatinoBoldItalic = fetch(new URL('../fonts/all/Palatino-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
81
99
  nimbusRomanNormal = fetch(new URL('../fonts/all/NimbusRoman-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
82
100
  nimbusRomanItalic = fetch(new URL('../fonts/all/NimbusRoman-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
83
101
  nimbusRomanBold = fetch(new URL('../fonts/all/NimbusRoman-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
102
+ nimbusRomanBoldItalic = fetch(new URL('../fonts/all/NimbusRoman-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
84
103
  nimbusSansNormal = fetch(new URL('../fonts/all/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
85
104
  nimbusSansItalic = fetch(new URL('../fonts/all/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
86
105
  nimbusSansBold = fetch(new URL('../fonts/all/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
106
+ nimbusSansBoldItalic = fetch(new URL('../fonts/all/NimbusSans-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
87
107
  nimbusMonoNormal = fetch(new URL('../fonts/all/NimbusMono-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
88
108
  nimbusMonoItalic = fetch(new URL('../fonts/all/NimbusMono-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
89
109
  nimbusMonoBold = fetch(new URL('../fonts/all/NimbusMono-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
110
+ nimbusMonoBoldItalic = fetch(new URL('../fonts/all/NimbusMono-BoldItalic.woff', import.meta.url)).then((res) => res.arrayBuffer());
90
111
  }
91
112
  } else {
92
113
  const { readFile } = await import('fs/promises');
93
114
  carlitoNormal = readFile(new URL('../fonts/all/Carlito-Regular.woff', import.meta.url)).then((res) => res.buffer);
94
115
  carlitoItalic = readFile(new URL('../fonts/all/Carlito-Italic.woff', import.meta.url)).then((res) => res.buffer);
95
116
  carlitoBold = readFile(new URL('../fonts/all/Carlito-Bold.woff', import.meta.url)).then((res) => res.buffer);
117
+ carlitoBoldItalic = readFile(new URL('../fonts/all/Carlito-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
96
118
  centuryNormal = readFile(new URL('../fonts/all/Century-Regular.woff', import.meta.url)).then((res) => res.buffer);
97
119
  centuryItalic = readFile(new URL('../fonts/all/Century-Italic.woff', import.meta.url)).then((res) => res.buffer);
98
120
  centuryBold = readFile(new URL('../fonts/all/Century-Bold.woff', import.meta.url)).then((res) => res.buffer);
121
+ centuryBoldItalic = readFile(new URL('../fonts/all/Century-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
99
122
  garamondNormal = readFile(new URL('../fonts/all/Garamond-Regular.woff', import.meta.url)).then((res) => res.buffer);
100
123
  garamondItalic = readFile(new URL('../fonts/all/Garamond-Italic.woff', import.meta.url)).then((res) => res.buffer);
101
124
  garamondBold = readFile(new URL('../fonts/all/Garamond-Bold.woff', import.meta.url)).then((res) => res.buffer);
125
+ garamondBoldItalic = readFile(new URL('../fonts/all/Garamond-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
102
126
  palatinoNormal = readFile(new URL('../fonts/all/Palatino-Regular.woff', import.meta.url)).then((res) => res.buffer);
103
127
  palatinoItalic = readFile(new URL('../fonts/all/Palatino-Italic.woff', import.meta.url)).then((res) => res.buffer);
104
128
  palatinoBold = readFile(new URL('../fonts/all/Palatino-Bold.woff', import.meta.url)).then((res) => res.buffer);
129
+ palatinoBoldItalic = readFile(new URL('../fonts/all/Palatino-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
105
130
  nimbusRomanNormal = readFile(new URL('../fonts/all/NimbusRoman-Regular.woff', import.meta.url)).then((res) => res.buffer);
106
131
  nimbusRomanItalic = readFile(new URL('../fonts/all/NimbusRoman-Italic.woff', import.meta.url)).then((res) => res.buffer);
107
132
  nimbusRomanBold = readFile(new URL('../fonts/all/NimbusRoman-Bold.woff', import.meta.url)).then((res) => res.buffer);
133
+ nimbusRomanBoldItalic = readFile(new URL('../fonts/all/NimbusRoman-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
108
134
  nimbusSansNormal = readFile(new URL('../fonts/all/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.buffer);
109
135
  nimbusSansItalic = readFile(new URL('../fonts/all/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.buffer);
110
136
  nimbusSansBold = readFile(new URL('../fonts/all/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.buffer);
137
+ nimbusSansBoldItalic = readFile(new URL('../fonts/all/NimbusSans-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
111
138
  nimbusMonoNormal = readFile(new URL('../fonts/all/NimbusMono-Regular.woff', import.meta.url)).then((res) => res.buffer);
112
139
  nimbusMonoItalic = readFile(new URL('../fonts/all/NimbusMono-Italic.woff', import.meta.url)).then((res) => res.buffer);
113
140
  nimbusMonoBold = readFile(new URL('../fonts/all/NimbusMono-Bold.woff', import.meta.url)).then((res) => res.buffer);
141
+ nimbusMonoBoldItalic = readFile(new URL('../fonts/all/NimbusMono-BoldItalic.woff', import.meta.url)).then((res) => res.buffer);
114
142
  }
115
143
 
116
144
  const srcObj = {
117
- Carlito: { normal: await carlitoNormal, italic: await carlitoItalic, bold: await carlitoBold },
118
- Century: { normal: await centuryNormal, italic: await centuryItalic, bold: await centuryBold },
119
- Garamond: { normal: await garamondNormal, italic: await garamondItalic, bold: await garamondBold },
120
- Palatino: { normal: await palatinoNormal, italic: await palatinoItalic, bold: await palatinoBold },
121
- NimbusRoman: { normal: await nimbusRomanNormal, italic: await nimbusRomanItalic, bold: await nimbusRomanBold },
122
- NimbusSans: { normal: await nimbusSansNormal, italic: await nimbusSansItalic, bold: await nimbusSansBold },
123
- NimbusMono: { normal: await nimbusMonoNormal, italic: await nimbusMonoItalic, bold: await nimbusMonoBold },
145
+ Carlito: {
146
+ normal: await carlitoNormal, italic: await carlitoItalic, bold: await carlitoBold, boldItalic: await carlitoBoldItalic,
147
+ },
148
+ Century: {
149
+ normal: await centuryNormal, italic: await centuryItalic, bold: await centuryBold, boldItalic: await centuryBoldItalic,
150
+ },
151
+ Garamond: {
152
+ normal: await garamondNormal, italic: await garamondItalic, bold: await garamondBold, boldItalic: await garamondBoldItalic,
153
+ },
154
+ Palatino: {
155
+ normal: await palatinoNormal, italic: await palatinoItalic, bold: await palatinoBold, boldItalic: await palatinoBoldItalic,
156
+ },
157
+ NimbusRoman: {
158
+ normal: await nimbusRomanNormal, italic: await nimbusRomanItalic, bold: await nimbusRomanBold, boldItalic: await nimbusRomanBoldItalic,
159
+ },
160
+ NimbusSans: {
161
+ normal: await nimbusSansNormal, italic: await nimbusSansItalic, bold: await nimbusSansBold, boldItalic: await nimbusSansBoldItalic,
162
+ },
163
+ NimbusMono: {
164
+ normal: await nimbusMonoNormal, italic: await nimbusMonoItalic, bold: await nimbusMonoBold, boldItalic: await nimbusMonoBoldItalic,
165
+ },
124
166
  };
125
167
 
126
168
  FontCont.raw = await /** @type {FontContainer} */(/** @type {any} */(loadFontsFromSource(srcObj)));
@@ -217,6 +259,7 @@ export async function updateFontContWorkerMain(params = {}) {
217
259
  };
218
260
  if (value.italic) input.src[key].italic = value.italic.src;
219
261
  if (value.bold) input.src[key].bold = value.bold.src;
262
+ if (value.boldItalic) input.src[key].boldItalic = value.boldItalic.src;
220
263
  }
221
264
 
222
265
  for (let i = 0; i < gs.schedulerInner.workers.length; i++) {
@@ -3,6 +3,7 @@
3
3
 
4
4
  import {
5
5
  determineSansSerif,
6
+ getStyleLookup,
6
7
  quantile,
7
8
  replaceObjectProperties,
8
9
  round6,
@@ -243,13 +244,13 @@ function calcFontMetricsPage(pageObj) {
243
244
 
244
245
  for (const lineObj of pageObj.lines) {
245
246
  for (const wordObj of lineObj.words) {
246
- const wordFontFamily = determineSansSerif(wordObj.font) || 'Default';
247
+ const wordFontFamily = determineSansSerif(wordObj.style.font) || 'Default';
247
248
 
248
249
  // This condition should not occur, however has in the past due to parsing bugs. Skipping to avoid entire program crashing if this occurs.
249
250
  if (wordObj.chars && wordObj.chars.length !== wordObj.text.length) continue;
250
251
 
251
252
  // Do not include superscripts, dropcaps, and low-confidence words in statistics for font optimization.
252
- if (wordObj.conf < 80 || wordObj.lang === 'chi_sim' || wordObj.sup || wordObj.smallCaps) continue;
253
+ if (wordObj.conf < 80 || wordObj.lang === 'chi_sim' || wordObj.style.sup || wordObj.style.smallCaps) continue;
253
254
  /** @type {Object.<string, FontMetricsRawFamily>} */
254
255
  const fontMetricsRawLine = {};
255
256
 
@@ -275,14 +276,18 @@ function calcFontMetricsPage(pageObj) {
275
276
  fontMetricsRawLine[wordFontFamily] = new FontMetricsRawFamily();
276
277
  }
277
278
 
278
- if (!fontMetricsRawLine[wordFontFamily][wordObj.style].width[charUnicode]) {
279
- fontMetricsRawLine[wordFontFamily][wordObj.style].width[charUnicode] = [];
280
- fontMetricsRawLine[wordFontFamily][wordObj.style].height[charUnicode] = [];
279
+ const styleLookup = getStyleLookup(wordObj.style);
280
+
281
+ if (!['normal', 'italic', 'bold'].includes(styleLookup)) continue;
282
+
283
+ if (!fontMetricsRawLine[wordFontFamily][styleLookup].width[charUnicode]) {
284
+ fontMetricsRawLine[wordFontFamily][styleLookup].width[charUnicode] = [];
285
+ fontMetricsRawLine[wordFontFamily][styleLookup].height[charUnicode] = [];
281
286
  }
282
287
 
283
- fontMetricsRawLine[wordFontFamily][wordObj.style].width[charUnicode].push(charWidth / charNorm);
284
- fontMetricsRawLine[wordFontFamily][wordObj.style].height[charUnicode].push(charHeight / charNorm);
285
- fontMetricsRawLine[wordFontFamily][wordObj.style].obs += 1;
288
+ fontMetricsRawLine[wordFontFamily][styleLookup].width[charUnicode].push(charWidth / charNorm);
289
+ fontMetricsRawLine[wordFontFamily][styleLookup].height[charUnicode].push(charHeight / charNorm);
290
+ fontMetricsRawLine[wordFontFamily][styleLookup].obs += 1;
286
291
 
287
292
  if (k + 1 < wordObj.chars.length) {
288
293
  const charObjNext = wordObj.chars[k + 1];
@@ -295,12 +300,12 @@ function calcFontMetricsPage(pageObj) {
295
300
  if (trailingSpace + charWidthNext > 0) {
296
301
  const bigramUnicode = `${charUnicode},${wordObj.chars[k + 1].text.charCodeAt(0)}`;
297
302
 
298
- if (!fontMetricsRawLine[wordFontFamily][wordObj.style].kerning[bigramUnicode]) {
299
- fontMetricsRawLine[wordFontFamily][wordObj.style].kerning[bigramUnicode] = [];
300
- fontMetricsRawLine[wordFontFamily][wordObj.style].kerning2[bigramUnicode] = [];
303
+ if (!fontMetricsRawLine[wordFontFamily][styleLookup].kerning[bigramUnicode]) {
304
+ fontMetricsRawLine[wordFontFamily][styleLookup].kerning[bigramUnicode] = [];
305
+ fontMetricsRawLine[wordFontFamily][styleLookup].kerning2[bigramUnicode] = [];
301
306
  }
302
- fontMetricsRawLine[wordFontFamily][wordObj.style].kerning[bigramUnicode].push(trailingSpace / charNorm);
303
- fontMetricsRawLine[wordFontFamily][wordObj.style].kerning2[bigramUnicode].push((trailingSpace + charWidthNext) / charNorm);
307
+ fontMetricsRawLine[wordFontFamily][styleLookup].kerning[bigramUnicode].push(trailingSpace / charNorm);
308
+ fontMetricsRawLine[wordFontFamily][styleLookup].kerning2[bigramUnicode].push((trailingSpace + charWidthNext) / charNorm);
304
309
  }
305
310
  }
306
311
  }
package/js/fontSupp.js CHANGED
@@ -44,7 +44,7 @@ const calcSuppFontInfoForWords = async (words) => {
44
44
  const fontSizeArr = [];
45
45
  for (const word of wordsRes) {
46
46
  fontSizeArr.push(calcWordFontSize(word));
47
- const sansSerif = determineSansSerif(word.font);
47
+ const sansSerif = determineSansSerif(word.style.font);
48
48
  if (sansSerif !== 'Default') {
49
49
  if (sansSerif === 'SansDefault') {
50
50
  sansVotes++;
@@ -53,9 +53,9 @@ const calcSuppFontInfoForWords = async (words) => {
53
53
  }
54
54
  }
55
55
  }
56
- if (words[0].size) {
57
- // @ts-ignore
58
- fontSizeMult = quantile(fontSizeArr, 0.5) / words[0].size;
56
+
57
+ if (words[0].style.size) {
58
+ fontSizeMult = quantile(fontSizeArr, 0.5) / words[0].style.size;
59
59
  }
60
60
 
61
61
  return { sansVotes, serifVotes, fontSizeMult };
@@ -83,36 +83,36 @@ export const calcSuppFontInfo = async (ocrArr) => {
83
83
  let wordFontLast;
84
84
  let wordFontSizeLast;
85
85
  for (const word of line.words) {
86
- if (word.font) {
87
- if (skipFonts.has(word.font)) {
86
+ if (word.style.font) {
87
+ if (skipFonts.has(word.style.font)) {
88
88
  continue;
89
89
  // Printing words off screen is a common method of hiding text in PDFs.
90
90
  } else if (word.bbox.left < 0 || word.bbox.top < 0 || word.bbox.right > page.dims.width || word.bbox.bottom > page.dims.height) {
91
91
  continue;
92
- } else if (!calcFonts.has(word.font)) {
93
- const sansSerifUnknown = determineSansSerif(word.font) === 'Default';
92
+ } else if (!calcFonts.has(word.style.font)) {
93
+ const sansSerifUnknown = determineSansSerif(word.style.font) === 'Default';
94
94
  if (sansSerifUnknown || !word.visualCoords) {
95
- calcFonts.add(word.font);
95
+ calcFonts.add(word.style.font);
96
96
  } else {
97
- skipFonts.add(word.font);
97
+ skipFonts.add(word.style.font);
98
98
  continue;
99
99
  }
100
100
  }
101
101
 
102
- if (!fontExamples[word.font]) {
103
- fontExamples[word.font] = [];
104
- } else if (fontExamples[word.font].length > 3) {
102
+ if (!fontExamples[word.style.font]) {
103
+ fontExamples[word.style.font] = [];
104
+ } else if (fontExamples[word.style.font].length > 3) {
105
105
  continue;
106
106
  }
107
107
 
108
- if (word.font !== wordFontLast || word.size !== wordFontSizeLast) {
109
- fontExamples[word.font].push([word]);
108
+ if (word.style.font !== wordFontLast || word.style.size !== wordFontSizeLast) {
109
+ fontExamples[word.style.font].push([word]);
110
110
  } else {
111
- fontExamples[word.font][fontExamples[word.font].length - 1].push(word);
111
+ fontExamples[word.style.font][fontExamples[word.style.font].length - 1].push(word);
112
112
  }
113
113
 
114
- wordFontLast = word.font;
115
- wordFontSizeLast = word.size;
114
+ wordFontLast = word.style.font;
115
+ wordFontSizeLast = word.style.size;
116
116
  }
117
117
  }
118
118
  }
@@ -158,8 +158,8 @@ export const calcSuppFontInfo = async (ocrArr) => {
158
158
  for (const page of ocrArr) {
159
159
  for (const line of page.lines) {
160
160
  for (const word of line.words) {
161
- if (word.font && word.size && FontProps.sizeMult[word.font]) {
162
- word.size = Math.round(word.size * FontProps.sizeMult[word.font] * 1000) / 1000;
161
+ if (word.style.font && word.style.size && FontProps.sizeMult[word.style.font]) {
162
+ word.style.size = Math.round(word.style.size * FontProps.sizeMult[word.style.font] * 1000) / 1000;
163
163
  }
164
164
  }
165
165
  }
package/js/global.d.ts CHANGED
@@ -1,5 +1,18 @@
1
1
  declare global {
2
2
 
3
+ type Style = {
4
+ font: ?string;
5
+ size: ?number;
6
+ bold: boolean;
7
+ italic: boolean;
8
+ underline: boolean;
9
+ smallCaps: boolean;
10
+ sup: boolean;
11
+ dropcap: boolean;
12
+ };
13
+
14
+ type StyleLookup = ('normal'|'bold'|'italic'|'boldItalic');
15
+
3
16
  // OCR objects
4
17
  type OcrPage = import("./objects/ocrObjects.js").OcrPage;
5
18
  type OcrLine = import("./objects/ocrObjects.js").OcrLine;
@@ -17,12 +30,14 @@ declare global {
17
30
  normal: FontContainerFont;
18
31
  italic: FontContainerFont;
19
32
  bold: FontContainerFont;
33
+ boldItalic: FontContainerFont;
20
34
  };
21
35
 
22
36
  type FontContainerFamilyUpload = {
23
37
  normal: FontContainerFont | null;
24
38
  italic: FontContainerFont | null;
25
39
  bold: FontContainerFont | null;
40
+ boldItalic: FontContainerFont | null;
26
41
  };
27
42
 
28
43
  type FontContainerFamily = FontContainerFamilyBuiltIn | FontContainerFamilyUpload;
@@ -42,12 +57,14 @@ declare global {
42
57
  normal: ArrayBuffer;
43
58
  italic: ArrayBuffer;
44
59
  bold: ArrayBuffer;
60
+ boldItalic: ArrayBuffer;
45
61
  };
46
62
 
47
63
  type fontSrcUpload = {
48
64
  normal: ArrayBuffer | null;
49
65
  italic: ArrayBuffer | null;
50
66
  bold: ArrayBuffer | null;
67
+ boldItalic: ArrayBuffer | null;
51
68
  };
52
69
 
53
70
  type opentypeFont = import("../lib/opentype.module.js").Font;
@@ -171,8 +171,18 @@ export async function convertPageAbbyy({ ocrStr, n }) {
171
171
  /** @type {Array<Array<OcrChar>>} */
172
172
  const charObjArrLine = Array(wordStrArr.length);
173
173
  text = text.fill('');
174
- let styleArr = Array(wordStrArr.length);
175
- styleArr = styleArr.fill('normal');
174
+
175
+ /** @type {Array<boolean>} */
176
+ const italicArr = Array(wordStrArr.length).fill(false);
177
+ /** @type {Array<boolean>} */
178
+ const boldArr = Array(wordStrArr.length).fill(false);
179
+ /** @type {Array<boolean>} */
180
+ const underlineArr = Array(wordStrArr.length).fill(false);
181
+ /** @type {Array<boolean>} */
182
+ const supArr = Array(wordStrArr.length).fill(false);
183
+ /** @type {Array<boolean>} */
184
+ const dropcapArr = Array(wordStrArr.length).fill(false);
185
+
176
186
  /** @type {Array<boolean>} */
177
187
  const smallCapsArr = Array(wordStrArr.length).fill(false);
178
188
  /** @type {Array<boolean>} */
@@ -184,28 +194,32 @@ export async function convertPageAbbyy({ ocrStr, n }) {
184
194
 
185
195
  if (typeof (letterArr[0][1]) !== 'undefined') {
186
196
  if (dropCap && i === 0) {
187
- styleArr[i] = 'dropcap';
197
+ dropcapArr[i] = true;
188
198
  } else if (/superscript=['"](1|true)/i.test(letterArr[0][1])) {
189
- styleArr[i] = 'sup';
190
- } else if (/italic=['"](1|true)/i.test(letterArr[0][1])) {
191
- styleArr[i] = 'italic';
192
- stylesLine.italic = true;
199
+ supArr[i] = true;
193
200
  } else {
194
- styleArr[i] = 'normal';
195
- stylesLine.normal = true;
201
+ if (/italic=['"](1|true)/i.test(letterArr[0][1])) {
202
+ italicArr[i] = true;
203
+ }
204
+
205
+ if (/bold=['"](1|true)/i.test(letterArr[0][1])) {
206
+ boldArr[i] = true;
207
+ }
208
+
209
+ if (/underline=['"](1|true)/i.test(letterArr[0][1])) {
210
+ underlineArr[i] = true;
211
+ }
196
212
  }
197
213
 
198
214
  if (/smallcaps=['"](1|true)/i.test(letterArr[0][1])) {
199
215
  smallCapsArr[i] = true;
200
216
  }
201
- } else if (i > 0) {
202
- if (styleArr[i - 1] === 'dropcap') {
203
- styleArr[i] = 'normal';
204
- smallCapsArr[i] = false;
205
- } else {
206
- styleArr[i] = styleArr[i - 1];
207
- smallCapsArr[i] = smallCapsArr[i - 1];
208
- }
217
+ } else if (i > 0 && !dropcapArr[i - 1]) {
218
+ italicArr[i] = italicArr[i - 1];
219
+ boldArr[i] = boldArr[i - 1];
220
+ underlineArr[i] = underlineArr[i - 1];
221
+ supArr[i] = supArr[i - 1];
222
+ smallCapsArr[i] = smallCapsArr[i - 1];
209
223
  }
210
224
 
211
225
  // Abbyy will sometimes misidentify capital letters immediately following drop caps as small caps,
@@ -372,18 +386,26 @@ export async function convertPageAbbyy({ ocrStr, n }) {
372
386
 
373
387
  console.assert(wordObj.chars.length === text[i].length, `Likely parsing error for word: ${id}. Number of letters in text does not match number of \`ocrChar\` objects.`);
374
388
 
375
- if (styleArr[i] === 'italic') {
376
- wordObj.style = 'italic';
389
+ if (italicArr[i]) {
390
+ wordObj.style.italic = true;
391
+ }
392
+
393
+ if (boldArr[i]) {
394
+ wordObj.style.bold = true;
395
+ }
396
+
397
+ if (underlineArr[i]) {
398
+ wordObj.style.underline = true;
377
399
  }
378
400
 
379
- wordObj.smallCaps = smallCapsArr[i];
401
+ wordObj.style.smallCaps = smallCapsArr[i];
380
402
 
381
- if (fontName) wordObj.font = fontName;
403
+ if (fontName) wordObj.style.font = fontName;
382
404
 
383
- if (styleArr[i] === 'sup') {
384
- wordObj.sup = true;
385
- } else if (styleArr[i] === 'dropcap') {
386
- wordObj.dropcap = true;
405
+ if (supArr[i]) {
406
+ wordObj.style.sup = true;
407
+ } else if (dropcapArr[i]) {
408
+ wordObj.style.dropcap = true;
387
409
  }
388
410
 
389
411
  lineObj.words.push(wordObj);
@@ -140,11 +140,11 @@ export async function convertPageBlocks({
140
140
  // The `word` object has a `is_italic` property, but it is always false.
141
141
  // Therefore, the font name is checked to determine if the word is italic.
142
142
  // See: https://github.com/naptha/tesseract.js/issues/907
143
- if (keepItalic && /italic/i.test(word.font_name)) wordObj.style = 'italic';
143
+ if (keepItalic && /italic/i.test(word.font_name)) wordObj.style.italic = true;
144
144
 
145
145
  // Our fork of Tesseract Legacy should be able to recognize fonts, so this information is included.
146
146
  // The generic HOCR importer does not include font information, as this is assumed to be unreliable.
147
- wordObj.font = word.font_name;
147
+ wordObj.style.font = word.font_name;
148
148
 
149
149
  wordObj.chars = [];
150
150
  for (let m = 0; m < word.symbols.length; m++) {
@@ -247,8 +247,8 @@ export async function convertPageHocr({
247
247
 
248
248
  if (debugMode) wordObj.raw = match;
249
249
 
250
- if (italic) wordObj.style = 'italic';
251
- if (fontName) wordObj.font = fontName;
250
+ if (italic) wordObj.style.italic = true;
251
+ if (fontName) wordObj.style.font = fontName;
252
252
 
253
253
  wordObj.conf = wordConf;
254
254
 
@@ -302,19 +302,6 @@ export async function convertPageHocr({
302
302
 
303
303
  const styleStr = match.match(/style=['"]([^'"]+)/)?.[1];
304
304
 
305
- let smallCaps = false;
306
- /** @type {('normal'|'italic'|'bold')} */
307
- let fontStyle = 'normal';
308
- if (styleStr && /italic/i.test(styleStr)) {
309
- fontStyle = 'italic';
310
- } else if (styleStr && /bold/i.test(styleStr)) {
311
- fontStyle = 'bold';
312
- }
313
-
314
- if (styleStr && /small-caps/i.test(styleStr)) {
315
- smallCaps = true;
316
- }
317
-
318
305
  const confMatch = titleStrWord.match(/(?:;|\s)x_wconf\s+(\d+)/)?.[1] || '0';
319
306
  const wordConf = parseInt(confMatch) || 0;
320
307
 
@@ -327,16 +314,19 @@ export async function convertPageHocr({
327
314
  const wordFontSizeStr = titleStrWord.match(/(?:;|\s)x_fsize\s+(\d+)/)?.[1];
328
315
  if (wordFontSizeStr) {
329
316
  const wordFontSize = parseInt(wordFontSizeStr);
330
- if (wordFontSize) wordObj.size = wordFontSize;
317
+ if (wordFontSize) wordObj.style.size = wordFontSize;
331
318
  }
332
319
  }
333
320
 
334
- wordObj.style = fontStyle;
335
- if (fontName) wordObj.font = fontName;
321
+ if (styleStr) {
322
+ if (/italic/i.test(styleStr)) wordObj.style.italic = true;
323
+ if (/bold/i.test(styleStr)) wordObj.style.bold = true;
324
+ if (/small-caps/i.test(styleStr)) wordObj.style.smallCaps = true;
325
+ }
336
326
 
337
- wordObj.sup = wordSup;
327
+ if (wordSup) wordObj.style.sup = true;
338
328
 
339
- wordObj.smallCaps = smallCaps;
329
+ if (fontName) wordObj.style.font = fontName;
340
330
 
341
331
  wordObj.conf = wordConf;
342
332