scribe.js-ocr 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/.eslintrc.json +6 -0
  2. package/fonts/all/Carlito-Bold.woff +0 -0
  3. package/fonts/all/Carlito-Italic.woff +0 -0
  4. package/fonts/all/Carlito-Regular.woff +0 -0
  5. package/fonts/all/{C059-Bold.woff → Century-Bold.woff} +0 -0
  6. package/fonts/all/{C059-Italic.woff → Century-Italic.woff} +0 -0
  7. package/fonts/all/{C059-Roman.woff → Century-Regular.woff} +0 -0
  8. package/fonts/all/{EBGaramond-Bold.woff → Garamond-Bold.woff} +0 -0
  9. package/fonts/all/{EBGaramond-Italic.woff → Garamond-Italic.woff} +0 -0
  10. package/fonts/all/{EBGaramond-Regular.woff → Garamond-Regular.woff} +0 -0
  11. package/fonts/all/{NimbusMonoPS-Bold.woff → NimbusMono-Bold.woff} +0 -0
  12. package/fonts/all/{NimbusMonoPS-Italic.woff → NimbusMono-Italic.woff} +0 -0
  13. package/fonts/all/{NimbusMonoPS-Regular.woff → NimbusMono-Regular.woff} +0 -0
  14. package/fonts/all/NimbusRoman-Bold.woff +0 -0
  15. package/fonts/all/NimbusRoman-Italic.woff +0 -0
  16. package/fonts/all/NimbusRoman-Regular.woff +0 -0
  17. package/fonts/all/NimbusSans-Bold.woff +0 -0
  18. package/fonts/all/NimbusSans-Italic.woff +0 -0
  19. package/fonts/all/NimbusSans-Regular.woff +0 -0
  20. package/fonts/all/{P052-Bold.woff → Palatino-Bold.woff} +0 -0
  21. package/fonts/all/{P052-Italic.woff → Palatino-Italic.woff} +0 -0
  22. package/fonts/all/{P052-Roman.woff → Palatino-Regular.woff} +0 -0
  23. package/fonts/latin/Carlito-Bold.woff +0 -0
  24. package/fonts/latin/Carlito-Italic.woff +0 -0
  25. package/fonts/latin/Carlito-Regular.woff +0 -0
  26. package/fonts/latin/{C059-Bold.woff → Century-Bold.woff} +0 -0
  27. package/fonts/latin/{C059-Italic.woff → Century-Italic.woff} +0 -0
  28. package/fonts/latin/{C059-Roman.woff → Century-Regular.woff} +0 -0
  29. package/fonts/latin/{EBGaramond-Bold.woff → Garamond-Bold.woff} +0 -0
  30. package/fonts/latin/{EBGaramond-Italic.woff → Garamond-Italic.woff} +0 -0
  31. package/fonts/latin/{EBGaramond-Regular.woff → Garamond-Regular.woff} +0 -0
  32. package/fonts/latin/{NimbusMonoPS-Bold.woff → NimbusMono-Bold.woff} +0 -0
  33. package/fonts/latin/{NimbusMonoPS-Italic.woff → NimbusMono-Italic.woff} +0 -0
  34. package/fonts/latin/{NimbusMonoPS-Regular.woff → NimbusMono-Regular.woff} +0 -0
  35. package/fonts/latin/NimbusRoman-Bold.woff +0 -0
  36. package/fonts/latin/NimbusRoman-Italic.woff +0 -0
  37. package/fonts/latin/NimbusRoman-Regular.woff +0 -0
  38. package/fonts/latin/NimbusSans-Bold.woff +0 -0
  39. package/fonts/latin/NimbusSans-Italic.woff +0 -0
  40. package/fonts/latin/NimbusSans-Regular.woff +0 -0
  41. package/fonts/latin/{P052-Bold.woff → Palatino-Bold.woff} +0 -0
  42. package/fonts/latin/{P052-Italic.woff → Palatino-Italic.woff} +0 -0
  43. package/fonts/latin/{P052-Roman.woff → Palatino-Regular.woff} +0 -0
  44. package/js/containers/app.js +2 -0
  45. package/js/containers/fontContainer.js +18 -12
  46. package/js/export/export.js +4 -1
  47. package/js/export/writeHtml.js +223 -0
  48. package/js/fontContainerMain.js +60 -60
  49. package/js/fontEval.js +9 -9
  50. package/js/fontSupp.js +7 -7
  51. package/js/generalWorkerMain.js +16 -16
  52. package/js/global.d.ts +1 -1
  53. package/js/import/convertPageHocr.js +6 -6
  54. package/js/import/importOCR.js +3 -1
  55. package/js/import/nodeAdapter.js +8 -8
  56. package/js/objects/imageObjects.js +9 -9
  57. package/js/objects/ocrObjects.js +3 -3
  58. package/js/utils/fontUtils.js +8 -8
  59. package/js/utils/imageUtils.js +9 -10
  60. package/js/utils/reflowPars.js +5 -5
  61. package/js/worker/compareOCRModule.js +29 -29
  62. package/js/worker/generalWorker.js +8 -8
  63. package/package.json +4 -3
  64. package/fonts/all_ttf/C059-Bold.ttf +0 -0
  65. package/fonts/all_ttf/C059-Italic.ttf +0 -0
  66. package/fonts/all_ttf/C059-Roman.ttf +0 -0
  67. package/fonts/all_ttf/Carlito-Bold.ttf +0 -0
  68. package/fonts/all_ttf/Carlito-Italic.ttf +0 -0
  69. package/fonts/all_ttf/Carlito-Regular.ttf +0 -0
  70. package/fonts/all_ttf/EBGaramond-Bold.ttf +0 -0
  71. package/fonts/all_ttf/EBGaramond-Italic.ttf +0 -0
  72. package/fonts/all_ttf/EBGaramond-Regular.ttf +0 -0
  73. package/fonts/all_ttf/NimbusMonoPS-Bold.ttf +0 -0
  74. package/fonts/all_ttf/NimbusMonoPS-Italic.ttf +0 -0
  75. package/fonts/all_ttf/NimbusMonoPS-Regular.ttf +0 -0
  76. package/fonts/all_ttf/NimbusRoman-Bold.ttf +0 -0
  77. package/fonts/all_ttf/NimbusRoman-Italic.ttf +0 -0
  78. package/fonts/all_ttf/NimbusRoman-Regular.ttf +0 -0
  79. package/fonts/all_ttf/NimbusSans-Bold.ttf +0 -0
  80. package/fonts/all_ttf/NimbusSans-Italic.ttf +0 -0
  81. package/fonts/all_ttf/NimbusSans-Regular.ttf +0 -0
  82. package/fonts/all_ttf/P052-Bold.ttf +0 -0
  83. package/fonts/all_ttf/P052-Italic.ttf +0 -0
  84. package/fonts/all_ttf/P052-Roman.ttf +0 -0
package/.eslintrc.json CHANGED
@@ -11,6 +11,9 @@
11
11
  "ecmaVersion": "latest",
12
12
  "sourceType": "module"
13
13
  },
14
+ "plugins": [
15
+ "jsdoc"
16
+ ],
14
17
  // "globals": {
15
18
  // "fabric": "writable"
16
19
  // },
@@ -20,6 +23,9 @@
20
23
  200,
21
24
  { "ignoreRegExpLiterals": true, "ignoreTemplateLiterals": true }
22
25
  ],
26
+
27
+ "jsdoc/check-alignment": 1,
28
+
23
29
  // This rule results in code being deleted.
24
30
  "no-unreachable": "off",
25
31
  // This edit allows for .js files (but not packages) to have an extension.
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
@@ -29,6 +29,8 @@ export class opt {
29
29
 
30
30
  static reflow = true;
31
31
 
32
+ static removeMargins = false;
33
+
32
34
  static pageBreaks = true;
33
35
 
34
36
  /** @type {("invis"|"ebook"|"eval"|"proof")} */
@@ -264,7 +264,7 @@ export class FontCont {
264
264
 
265
265
  static defaultFontName = 'SerifDefault';
266
266
 
267
- static serifDefaultName = 'NimbusRomNo9L';
267
+ static serifDefaultName = 'NimbusRoman';
268
268
 
269
269
  static sansDefaultName = 'NimbusSans';
270
270
 
@@ -365,14 +365,14 @@ export class FontCont {
365
365
  };
366
366
 
367
367
  /**
368
- * Gets a font object. Unlike accessing the font containers directly,
369
- * this method allows for special values 'Default', 'SansDefault', and 'SerifDefault' to be used.
370
- *
371
- * @param {('Default'|'SansDefault'|'SerifDefault'|string)} family - Font family name.
372
- * @param {('normal'|'italic'|'bold'|string)} [style='normal']
373
- * @param {string} [lang='eng']
374
- * @returns {FontContainerFont}
375
- */
368
+ * Gets a font object. Unlike accessing the font containers directly,
369
+ * this method allows for special values 'Default', 'SansDefault', and 'SerifDefault' to be used.
370
+ *
371
+ * @param {('Default'|'SansDefault'|'SerifDefault'|string)} family - Font family name.
372
+ * @param {('normal'|'italic'|'bold'|string)} [style='normal']
373
+ * @param {string} [lang='eng']
374
+ * @returns {FontContainerFont}
375
+ */
376
376
  static getFont = (family, style = 'normal', lang = 'eng') => {
377
377
  if (FontCont.doc?.[family]?.[style] && !FontCont.doc?.[family]?.[style]?.disable) {
378
378
  return FontCont.doc[family][style];
@@ -388,8 +388,12 @@ export class FontCont {
388
388
  // Option 1: If we have access to the font, use it.
389
389
  // Option 2: If we do not have access to the font, but it closely resembles a built-in font, use the built-in font.
390
390
  if (!FontCont.raw?.[family]?.[style]) {
391
- if (/Times/i.test(family)) {
392
- family = 'NimbusRomNo9L';
391
+ if (/NimbusRom/i.test(family)) {
392
+ family = 'NimbusRoman';
393
+ } else if (/Times/i.test(family)) {
394
+ family = 'NimbusRoman';
395
+ } else if (/NimbusSan/i.test(family)) {
396
+ family = 'NimbusSans';
393
397
  } else if (/Helvetica/i.test(family)) {
394
398
  family = 'NimbusSans';
395
399
  } else if (/Arial/i.test(family)) {
@@ -406,6 +410,8 @@ export class FontCont {
406
410
  family = 'Carlito';
407
411
  } else if (/Courier/i.test(family) && FontCont.enableCleanToNimbusMono) {
408
412
  family = 'NimbusMono';
413
+ } else if (/NimbusMono/i.test(family) && FontCont.enableCleanToNimbusMono) {
414
+ family = 'NimbusMono';
409
415
  }
410
416
  }
411
417
 
@@ -451,7 +457,7 @@ export class FontCont {
451
457
  FontCont.enableCleanToNimbusMono = false;
452
458
 
453
459
  FontCont.defaultFontName = 'SerifDefault';
454
- FontCont.serifDefaultName = 'NimbusRomNo9L';
460
+ FontCont.serifDefaultName = 'NimbusRoman';
455
461
  FontCont.sansDefaultName = 'NimbusSans';
456
462
  };
457
463
 
@@ -8,11 +8,12 @@ import { saveAs } from '../utils/miscUtils.js';
8
8
  import { writePdf } from './writePdf.js';
9
9
  import { writeHocr } from './writeHocr.js';
10
10
  import { writeText } from './writeText.js';
11
+ import { writeHtml } from './writeHtml.js';
11
12
 
12
13
  /**
13
14
  * Export active OCR data to specified format.
14
15
  * @public
15
- * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} [format='txt']
16
+ * @param {'pdf'|'hocr'|'docx'|'html'|'xlsx'|'txt'|'text'} [format='txt']
16
17
  * @param {number} [minPage=0] - First page to export.
17
18
  * @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page.
18
19
  * @returns {Promise<string|ArrayBuffer>}
@@ -183,6 +184,8 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
183
184
  }
184
185
  } else if (format === 'hocr') {
185
186
  content = writeHocr(ocrAll.active, minPage, maxPage);
187
+ } else if (format === 'html') {
188
+ content = writeHtml(ocrAll.active, minPage, maxPage, opt.reflow, opt.removeMargins);
186
189
  } else if (format === 'txt') {
187
190
  content = writeText(ocrDownload, minPage, maxPage, opt.reflow, false);
188
191
  // Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools.
@@ -0,0 +1,223 @@
1
+ import { FontCont } from '../containers/fontContainer.js';
2
+ import { opt } from '../containers/app.js';
3
+ import { calcWordMetrics } from '../utils/fontUtils.js';
4
+ import { assignParagraphs } from '../utils/reflowPars.js';
5
+ import { pageMetricsArr } from '../containers/dataContainer.js';
6
+ import ocr from '../objects/ocrObjects.js';
7
+
8
/**
 * Build the inner HTML for a word rendered in small caps.
 *
 * The text is split into alternating runs of ASCII lowercase letters (`a-z`) and everything else.
 * Every run is wrapped in a `<span class="input-sub">`; only the lowercase runs receive an explicit
 * `font-size`, so that (combined with `text-transform:uppercase` on the parent element) they
 * display as smaller capitals.
 *
 * NOTE(review): only ASCII `a-z` is treated as lowercase; accented lowercase letters fall into the
 * "everything else" runs and are therefore not shrunk. Confirm whether this is intended.
 *
 * @param {string} text - Raw (unescaped) word text.
 * @param {number} fontSizeHTMLSmallCaps - Font size, in px, applied to lowercase runs.
 * @returns {string} Concatenated `<span>` markup, or an empty string when `text` is empty.
 */
const makeSmallCapsDivs = (text, fontSizeHTMLSmallCaps) => {
  // The text is inserted into markup as element content, so characters that are significant
  // to the HTML parser must be replaced with entities. Escaping is done per run (after the
  // split) so that the letters inside an entity such as `&amp;` are never split into their own span.
  const escapeText = (str) => str
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');

  const textDivs0 = text.match(/([a-z]+)|([^a-z]+)/g);
  if (!textDivs0) return '';
  const textDivs = textDivs0.map((x) => {
    const lower = /[a-z]/.test(x);
    const styleStr = lower ? `style="font-size:${fontSizeHTMLSmallCaps}px"` : '';
    return `<span class="input-sub" ${styleStr}>${escapeText(x)}</span>`;
  });
  return textDivs.join('');
};
23
+
24
/**
 * Escape a value for inclusion in HTML element content or a double-quoted attribute value.
 *
 * @param {string} str
 * @returns {string}
 */
const escapeHtmlText = (str) => String(str)
  .replace(/&/g, '&amp;')
  .replace(/</g, '&lt;')
  .replace(/>/g, '&gt;')
  .replace(/"/g, '&quot;');

/**
 * Convert an array of ocrPage objects to an HTML document string.
 *
 * Each page becomes an absolutely positioned `<div class="scribe-page">`, and each word becomes
 * an absolutely positioned `<span class="scribe-word">` whose inline style carries its position,
 * font, and spacing. A `<style>` element with the shared word rules and one `@font-face` rule per
 * font used is placed in `<head>`.
 *
 * @param {Array<OcrPage>} ocrCurrent - Pages to export. Missing pages and pages with no lines are skipped.
 * @param {number} [minpage=0] - The first page to include in the document.
 * @param {number} [maxpage=-1] - The last page to include in the document (inclusive).
 *    -1 exports through the last page.
 * @param {boolean} [reflowText=false] - Remove line breaks within what appears to be the same paragraph.
 * @param {boolean} [removeMargins=false] - Remove the margins from the text.
 * @param {?Array<string>} [wordIds=null] - An array of word IDs to include in the document.
 *    If omitted, all words are included.
 * @returns {string} The complete HTML document.
 * @throws {Error} When a global `process` is defined (i.e. when run in Node.js).
 */
export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, removeMargins = false, wordIds = null) {
  if (!(typeof process === 'undefined')) {
    throw new Error('HTML exports are not supported in Node.js');
  }

  // The canvas is only used to measure text; nothing is drawn to it.
  const canvas = new OffscreenCanvas(1, 1);
  const ctx = /** @type {OffscreenCanvasRenderingContext2D} */ (canvas.getContext('2d'));

  const fontsUsed = new Set();

  // Membership is tested for every word, so build the lookup once.
  const wordIdSet = wordIds ? new Set(wordIds) : null;

  const pad = 5;

  // Scale and rotation are currently fixed; the derived values are loop-invariant.
  const scale = 1;
  const angle = 0;
  // Offsets that compensate for the horizontal padding applied to every `.scribe-word`.
  const topPadOffset = pad * Math.sin(angle * (Math.PI / 180));
  const leftPadOffset = pad * Math.cos(angle * (Math.PI / 180));

  let bodyStr = '<body>';

  if (maxpage === -1) maxpage = ocrCurrent.length - 1;

  let newLine = false;

  // Vertical offset (px) of the next page within the document.
  let top = 0;

  for (let g = minpage; g <= maxpage; g++) {
    if (!ocrCurrent[g] || ocrCurrent[g].lines.length === 0) continue;

    const pageObj = ocrCurrent[g];

    let minLeft = 0;
    let minTop = 0;
    let maxBottom = 0;
    if (removeMargins) {
      // Track minimums starting from Infinity rather than using 0 as an "unset" sentinel,
      // so that a word whose left/top coordinate is exactly 0 is not overwritten by a larger value.
      let leftMin = Infinity;
      let topMin = Infinity;
      const wordArr = ocr.getPageWords(pageObj);
      for (let h = 0; h < wordArr.length; h++) {
        const wordObj = wordArr[h];
        if (wordIdSet && !wordIdSet.has(wordObj.id)) continue;
        if (wordObj.bbox.left < leftMin) leftMin = wordObj.bbox.left;
        if (wordObj.bbox.top < topMin) topMin = wordObj.bbox.top;
        if (wordObj.bbox.bottom > maxBottom) maxBottom = wordObj.bbox.bottom;
      }
      // When no word was included, fall back to no offset.
      if (Number.isFinite(leftMin)) minLeft = leftMin;
      if (Number.isFinite(topMin)) minTop = topMin;
    }

    bodyStr += `<div class="scribe-page" id="page${g}" style="position:absolute;top:${top}px;">`;
    if (removeMargins) {
      // Text height plus a 200px allowance, capped at the full page height plus the 10px gap.
      top += Math.min((maxBottom - minTop) + 200, pageMetricsArr[g].dims.height + 10);
    } else {
      top += pageMetricsArr[g].dims.height + 10;
    }

    if (reflowText) {
      const pageAngle = pageMetricsArr[g].angle || 0;
      assignParagraphs(pageObj, pageAngle);
    }

    let parCurrent = pageObj.lines[0].par;

    for (let h = 0; h < pageObj.lines.length; h++) {
      const lineObj = pageObj.lines[h];

      if (reflowText) {
        // Start a new line at the top of every page after page 0 and whenever the paragraph changes.
        if ((g > 0 && h === 0) || lineObj.par !== parCurrent) newLine = true;
        parCurrent = lineObj.par;
      } else {
        newLine = true;
      }

      for (let i = 0; i < lineObj.words.length; i++) {
        const wordObj = lineObj.words[i];
        if (!wordObj) continue;

        if (wordIdSet && !wordIdSet.has(wordObj.id)) continue;

        if (newLine) {
          bodyStr += '\n';
        } else if (h > 0 || g > 0 || i > 0) {
          bodyStr += ' ';
        }

        newLine = false;

        const fontI = FontCont.getWordFont(wordObj);
        fontsUsed.add(fontI);

        const {
          charSpacing, leftSideBearing, fontSize, charArr,
        } = calcWordMetrics(wordObj);

        const wordStr = charArr.join('');

        const charSpacingHTML = charSpacing * scale;

        let x1 = wordObj.bbox.left - minLeft;
        const y1 = wordObj.line.bbox.bottom + wordObj.line.baseline[1] - minTop;

        if (wordObj.visualCoords) x1 -= leftSideBearing * scale;

        const fontSizeHTML = fontSize * scale;

        ctx.font = `${fontI.fontFaceStyle} ${fontI.fontFaceWeight} ${fontSizeHTML}px ${fontI.fontFaceName}`;

        const metrics = ctx.measureText(wordStr);

        const fontSizeHTMLSmallCaps = fontSize * scale * fontI.smallCapsMult;

        // Align with baseline
        const topHTML = Math.round((y1 - metrics.fontBoundingBoxAscent + fontSizeHTML * 0.6) * 1000) / 1000;

        let wordStyle = '';

        wordStyle += `left:${x1 - leftPadOffset}px;`;
        wordStyle += `top:${topHTML - topPadOffset}px;`;
        wordStyle += `font-size:${fontSizeHTML}px;`;
        wordStyle += `font-family:${fontI.fontFaceName};`;

        if (Math.abs(angle ?? 0) > 0.05) {
          wordStyle += `transform-origin:left ${y1 - topHTML}px;`;
          wordStyle += `transform:rotate(${angle}deg);`;
        }

        // We cannot make the text uppercase in the input field, as this would result in the text being saved as uppercase.
        // Additionally, while there is a small-caps CSS property, it does not allow for customizing the size of the small caps.
        // Therefore, we handle small caps by making all text print as uppercase using the `text-transform` CSS property,
        // and then wrapping each letter in a span with a smaller font size.
        let innerHTML;
        if (wordObj.smallCaps) {
          wordStyle += 'text-transform:uppercase;';
          // `makeSmallCapsDivs` receives the raw text because it splits it into letter runs;
          // escaping beforehand would break entities apart.
          innerHTML = makeSmallCapsDivs(wordStr, fontSizeHTMLSmallCaps);
        } else {
          // OCR text may contain `<`, `>` or `&`, which must not be interpreted as markup.
          innerHTML = escapeHtmlText(wordStr);
        }

        wordStyle += `letter-spacing:${charSpacingHTML}px;`;

        wordStyle += `font-weight:${fontI.fontFaceWeight};`;
        wordStyle += `font-style:${fontI.fontFaceStyle};`;

        // Line height must match the height of the font bounding box for the font metrics to be accurate.
        wordStyle += `line-height:${metrics.fontBoundingBoxAscent + metrics.fontBoundingBoxDescent}px;`;

        bodyStr += `<span class="scribe-word" id="${escapeHtmlText(wordObj.id)}" style="${wordStyle}">${innerHTML}</span>`;
      }
    }

    bodyStr += '</div>';

    opt.progressHandler({ n: g, type: 'export', info: { } });
  }

  let styleStr = '<style>.scribe-word {';

  styleStr += 'position:absolute;';
  styleStr += `padding-left:${pad}px;`;
  styleStr += `padding-right:${pad}px;`;
  styleStr += 'z-index:1;';
  styleStr += 'white-space:nowrap;';
  if (opt.kerning) {
    styleStr += 'font-kerning:normal;';
  } else {
    styleStr += 'font-kerning:none;';
  }

  styleStr += '}';

  // NOTE(review): the CDN path is pinned to a specific package version; confirm that version
  // ships the font file names generated below.
  const cdnPath = 'https://cdn.jsdelivr.net/npm/scribe.js-ocr@0.7.1/fonts/all/';

  for (const fontI of fontsUsed) {
    // Font files are named `<family>-<Style>.woff`, with the 'normal' style stored as 'Regular'.
    let styleTitleCase = fontI.style.charAt(0).toUpperCase() + fontI.style.slice(1).toLowerCase();
    if (styleTitleCase === 'Normal') styleTitleCase = 'Regular';
    const fontName = `${fontI.family}-${styleTitleCase}.woff`;
    const fontPath = cdnPath + fontName;

    styleStr += `@font-face {
    font-family: '${fontI.fontFaceName}';
    font-style: ${fontI.fontFaceStyle};
    font-weight: ${fontI.fontFaceWeight};
    src: url('${fontPath}');
  }\n`;
  }

  styleStr += '</style>';

  bodyStr += '</body>';

  const htmlStr = `<html><head>${styleStr}</head>${bodyStr}</html>`;

  return htmlStr;
}