scribe.js-ocr 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cli/cli.js CHANGED
@@ -11,13 +11,13 @@ export const confCLI = async (ocrFile) => {
11
11
  process.exitCode = 0;
12
12
  };
13
13
 
14
- export const checkCLI = async (pdfFile, ocrFile) => {
15
- await check(pdfFile, ocrFile);
14
+ export const checkCLI = async (pdfFile, ocrFile, options) => {
15
+ await check(pdfFile, ocrFile, options);
16
16
  process.exitCode = 0;
17
17
  };
18
18
 
19
- export const evalInternalCLI = async (pdfFile, ocrFile) => {
20
- const { evalMetrics } = await evalInternal(pdfFile, ocrFile);
19
+ export const evalInternalCLI = async (pdfFile, ocrFile, options) => {
20
+ const { evalMetrics } = await evalInternal(pdfFile, ocrFile, options);
21
21
 
22
22
  const ignoreExtra = true;
23
23
  let metricWER;
@@ -53,6 +53,7 @@ export const extractCLI = async (pdfFile, outputDir, options) => {
53
53
  * @param {boolean} [options.robust]
54
54
  * @param {boolean} [options.conf]
55
55
  * @param {boolean} [options.vis]
56
+ * @param {number} [options.workers]
56
57
  */
57
58
  export const overlayCLI = async (pdfFile, ocrFile, outputDir, options) => {
58
59
  options.overlayMode = options.vis ? 'proof' : 'invis';
package/cli/main.js CHANGED
@@ -21,9 +21,11 @@ scribe.opt.saveDebugImages = debugMode;
21
21
  * @param {boolean} [params.robustConfMode]
22
22
  * @param {boolean} [params.printConf]
23
23
  * @param {"eval" | "ebook" | "proof" | "invis"} [params.overlayMode]
24
- *
24
+ * @param {number} [params.workerN]
25
25
  */
26
26
  async function main(func, params) {
27
+ scribe.opt.workerN = params.workerN || null;
28
+
27
29
  await scribe.init({
28
30
  pdf: true,
29
31
  ocr: true,
@@ -118,16 +120,20 @@ export const conf = async (ocrFile) => (main('conf', { ocrFile }));
118
120
  *
119
121
  * @param {string} pdfFile - Path to PDF file.
120
122
  * @param {string} ocrFile
123
+ * @param {Object} options
124
+ * @param {number} [options.workers]
121
125
  */
122
- export const check = async (pdfFile, ocrFile) => (main('check', { pdfFile, ocrFile }));
126
+ export const check = async (pdfFile, ocrFile, options) => (main('check', { pdfFile, ocrFile, workerN: options?.workers }));
123
127
 
124
128
  /**
125
129
  * Evaluate internal OCR engine.
126
130
  *
127
131
  * @param {string} pdfFile - Path to PDF file.
128
132
  * @param {string} ocrFile - Path to OCR file containing ground truth.
133
+ * @param {Object} options
134
+ * @param {number} [options.workers]
129
135
  */
130
- export const evalInternal = async (pdfFile, ocrFile) => (main('eval', { pdfFile, ocrFile }));
136
+ export const evalInternal = async (pdfFile, ocrFile, options) => (main('eval', { pdfFile, ocrFile, workerN: options?.workers }));
131
137
 
132
138
  /**
133
139
  *
@@ -138,10 +144,10 @@ export const evalInternal = async (pdfFile, ocrFile) => (main('eval', { pdfFile,
138
144
  * @param {boolean} [options.robust]
139
145
  * @param {boolean} [options.conf]
140
146
  * @param {"eval" | "ebook" | "proof" | "invis"} [options.overlayMode]
141
- * @returns
147
+ * @param {number} [options.workers]
142
148
  */
143
149
  export const overlay = async (pdfFile, ocrFile, outputDir, options) => (main('overlay', {
144
- pdfFile, ocrFile, outputDir, robustConfMode: options?.robust || false, printConf: options?.conf || false, overlayMode: options?.overlayMode || 'invis',
150
+ pdfFile, ocrFile, outputDir, robustConfMode: options?.robust || false, printConf: options?.conf || false, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers,
145
151
  }));
146
152
 
147
153
  /**
@@ -149,9 +155,9 @@ export const overlay = async (pdfFile, ocrFile, outputDir, options) => (main('ov
149
155
  * @param {string} pdfFile - Path to PDF file.
150
156
  * @param {Object} options
151
157
  * @param {"eval" | "ebook" | "proof" | "invis"} [options.overlayMode]
152
- * @returns
158
+ * @param {number} [options.workers]
153
159
  */
154
- export const recognize = async (pdfFile, options) => (main('recognize', { pdfFile, overlayMode: options?.overlayMode || 'invis' }));
160
+ export const recognize = async (pdfFile, options) => (main('recognize', { pdfFile, overlayMode: options?.overlayMode || 'invis', workerN: options?.workers }));
155
161
 
156
162
  /**
157
163
  *
package/cli/scribe.js CHANGED
@@ -19,6 +19,7 @@ program
19
19
  .command('check')
20
20
  .argument('<pdf_file>', 'Input PDF file.')
21
21
  .argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
22
+ .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
22
23
  .description('Calculate confidence metric for OCR data by running Tesseract OCR and comparing results.')
23
24
  .action(checkCLI);
24
25
 
@@ -26,6 +27,7 @@ program
26
27
  .command('eval')
27
28
  .argument('<pdf_file>', 'Input PDF file.')
28
29
  .argument('<ocr_file>', 'Input OCR file. Accepts .hocr and Abbyy .xml (with character-level data enabled).')
30
+ .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
29
31
  .description('Evaluate internal OCR engine by recognizing document (provided PDF file), and comparing to ground truth (provided OCR file).')
30
32
  .action(evalInternalCLI);
31
33
 
@@ -46,6 +48,7 @@ program
46
48
  .option('-v, --vis', 'Print OCR text visibly over provided PDF file with colors coded by confidence.')
47
49
  .option('-c, --conf', 'Print average confidence metric for document.')
48
50
  .option('-r, --robust', 'Generate confidence metrics by running Tesseract OCR and comparing, rather than using confidence info in provided data.')
51
+ .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
49
52
  .description('Add OCR data to provided PDF file and save result as PDF.')
50
53
  .action(overlayCLI);
51
54
 
@@ -54,6 +57,7 @@ program
54
57
  .argument('<pdf_file>', 'Input PDF file.')
55
58
  .description('Recognize text in PDF file using internal OCR engine.')
56
59
  .option('-v, --vis', 'Print OCR text visibly over provided PDF file with colors coded by confidence.')
60
+ .option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
57
61
  .action(recognizeCLI);
58
62
 
59
63
  program
@@ -61,6 +61,18 @@ export class opt {
61
61
  static extractPDFFonts = false;
62
62
 
63
63
  static calcSuppFontInfo = false;
64
+
65
+ static usePDFTextSupp = true;
66
+
67
+ static usePDFTextMain = true;
68
+
69
+ /**
70
+ * Number of workers to use. Must be set prior to initialization.
71
+ * If set to `null` (default), the number of workers will be set up to 6 (browser) or 8 (node),
72
+ * if the system has enough resources.
73
+ * @type {?number}
74
+ */
75
+ static workerN = null;
64
76
  }
65
77
 
66
78
  export class inputData {
@@ -70,6 +82,9 @@ export class inputData {
70
82
  /** `true` if user uploaded pdf */
71
83
  static pdfMode = false;
72
84
 
85
+ /** @type {?('text'|'ocr'|'image')} */
86
+ static pdfType = null;
87
+
73
88
  /** `true` if user uploaded image files (.png, .jpeg) */
74
89
  static imageMode = false;
75
90
 
@@ -263,6 +263,13 @@ export class FontCont {
263
263
 
264
264
  static sansDefaultName = 'NimbusSans';
265
265
 
266
+ /**
267
+ * If `false`, 'Courier' will not be cleaned to Nimbus Mono.
268
+ * This setting is useful because Tesseract sometimes misidentifies fonts as Courier, and when Nimbus Mono is not the document default font, it is almost always incorrect.
269
+ * Even with this setting `false`, Nimbus Mono will still be used when the font is exactly 'NimbusMono' and Nimbus Mono can still be the document default font.
270
+ */
271
+ static enableCleanToNimbusMono = false;
272
+
266
273
  /** @type {?('latin'|'all')} */
267
274
  static glyphSet = null;
268
275
 
@@ -337,6 +344,8 @@ export class FontCont {
337
344
  family = 'Carlito';
338
345
  } else if (/Calibri/i.test(family)) {
339
346
  family = 'Carlito';
347
+ } else if (/Courier/i.test(family) && FontCont.enableCleanToNimbusMono) {
348
+ family = 'NimbusMono';
340
349
  }
341
350
  }
342
351
 
@@ -379,6 +388,8 @@ export class FontCont {
379
388
  FontCont.rawMetrics = null;
380
389
  FontCont.optMetrics = null;
381
390
 
391
+ FontCont.enableCleanToNimbusMono = false;
392
+
382
393
  FontCont.defaultFontName = 'SerifDefault';
383
394
  FontCont.serifDefaultName = 'NimbusRomNo9L';
384
395
  FontCont.sansDefaultName = 'NimbusSans';
@@ -159,9 +159,6 @@ export class ImageCache {
159
159
  image: false,
160
160
  };
161
161
 
162
- /** @type {?('text'|'ocr'|'image')} */
163
- static pdfType = null;
164
-
165
162
  static colorModeDefault = 'gray';
166
163
 
167
164
  /**
@@ -196,6 +193,9 @@ export class ImageCache {
196
193
 
197
194
  const workersPromiseArr = range(0, scheduler.workers.length - 1).map(async (x) => {
198
195
  const w = scheduler.workers[x];
196
+
197
+ if (w.pdfDoc) await w.freeDocument(w.pdfDoc);
198
+
199
199
  // The ArrayBuffer is transferred to the worker, so a new one must be created for each worker.
200
200
  // const fileData = await file.arrayBuffer();
201
201
  const fileDataCopy = fileData.slice(0);
@@ -143,6 +143,8 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
143
143
  doc1: pdfOverlay, minpage: minValue, maxpage: maxValue, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, humanReadable: opt.humanReadablePDF,
144
144
  });
145
145
  }
146
+
147
+ w.freeDocument(pdfOverlay);
146
148
  } else {
147
149
  const pdfStr = await writePdf(ocrDownload, minValue, maxValue, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
148
150
  opt.overlayOpacity / 100);
@@ -169,6 +171,8 @@ export async function exportData(format = 'txt', minValue = 0, maxValue = -1) {
169
171
  content = await w.write({
170
172
  doc1: pdf, minpage: minValue, maxpage: maxValue, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, humanReadable: opt.humanReadablePDF,
171
173
  });
174
+
175
+ w.freeDocument(pdf);
172
176
  }
173
177
  } else if (format === 'hocr') {
174
178
  content = writeHocr(ocrAll.active, minValue, maxValue);
@@ -534,10 +534,11 @@ async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle
534
534
  const wordSpaceNextAdj = (wordNext.bbox.left - wordJ.bbox.right) / cosAngle;
535
535
  // const wordSpaceNextAdj = wordNext.bbox.left - wordBox.right;
536
536
 
537
- const wordGlyphMetrics = wordFontOpentype.charToGlyph(charArr.at(-1)).getMetrics();
537
+ const wordGlyph = wordFontOpentype.charToGlyph(charArr.at(-1));
538
+ const wordGlyphMetrics = wordGlyph.getMetrics();
538
539
  const wordNextGlyphMetrics = wordFontOpentype.charToGlyph(wordNext.text.substr(0, 1)).getMetrics();
539
540
 
540
- const wordRightBearing = wordJ.visualCoords ? wordGlyphMetrics.rightSideBearing * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
541
+ const wordRightBearing = wordJ.visualCoords ? (wordGlyph.advanceWidth - wordGlyphMetrics.xMax) * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
541
542
 
542
543
  const wordNextLeftBearing = wordNext.visualCoords ? wordNextGlyphMetrics.xMin * (wordFontSize / wordFontOpentype.unitsPerEm) : 0;
543
544
 
@@ -1,3 +1,4 @@
1
+ import { inputData } from './containers/app.js';
1
2
  import { ocrAll, ocrAllRaw } from './containers/dataContainer.js';
2
3
  import { ImageCache } from './containers/imageContainer.js';
3
4
  import { convertOCR } from './recognizeConvert.js';
@@ -83,7 +84,7 @@ export const extractInternalPDFText = async (options = {}) => {
83
84
 
84
85
  const res = await extractInternalPDFTextRaw();
85
86
 
86
- ImageCache.pdfType = res.type;
87
+ inputData.pdfType = res.type;
87
88
  ocrAllRaw.pdf = res.contentRaw;
88
89
 
89
90
  if (!extractPDFTextImage && res.type === 'image') return res;
@@ -102,7 +103,7 @@ export const extractInternalPDFText = async (options = {}) => {
102
103
  const format = 'stext';
103
104
 
104
105
  // Process HOCR using web worker, reading from file first if that has not been done already
105
- await convertOCR(ocrAllRaw.active, true, format, 'pdf', false);
106
+ await convertOCR(ocrAllRaw.pdf, true, format, 'pdf', false);
106
107
 
107
108
  res.content = ocrAll.pdf;
108
109
 
@@ -39,6 +39,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
39
39
  let /** @type {Promise<ArrayBuffer>} */nimbusSansNormal;
40
40
  let /** @type {Promise<ArrayBuffer>} */nimbusSansItalic;
41
41
  let /** @type {Promise<ArrayBuffer>} */nimbusSansBold;
42
+ let /** @type {Promise<ArrayBuffer>} */nimbusMonoNormal;
43
+ let /** @type {Promise<ArrayBuffer>} */nimbusMonoItalic;
44
+ let /** @type {Promise<ArrayBuffer>} */nimbusMonoBold;
42
45
  if (typeof process === 'undefined') {
43
46
  if (glyphSet === 'latin') {
44
47
  carlitoNormal = fetch(new URL('../fonts/latin/Carlito-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
@@ -59,6 +62,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
59
62
  nimbusSansNormal = fetch(new URL('../fonts/latin/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
60
63
  nimbusSansItalic = fetch(new URL('../fonts/latin/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
61
64
  nimbusSansBold = fetch(new URL('../fonts/latin/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
65
+ nimbusMonoNormal = fetch(new URL('../fonts/latin/NimbusMonoPS-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
66
+ nimbusMonoItalic = fetch(new URL('../fonts/latin/NimbusMonoPS-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
67
+ nimbusMonoBold = fetch(new URL('../fonts/latin/NimbusMonoPS-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
62
68
  } else {
63
69
  carlitoNormal = fetch(new URL('../fonts/all/Carlito-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
64
70
  carlitoItalic = fetch(new URL('../fonts/all/Carlito-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
@@ -78,6 +84,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
78
84
  nimbusSansNormal = fetch(new URL('../fonts/all/NimbusSans-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
79
85
  nimbusSansItalic = fetch(new URL('../fonts/all/NimbusSans-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
80
86
  nimbusSansBold = fetch(new URL('../fonts/all/NimbusSans-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
87
+ nimbusMonoNormal = fetch(new URL('../fonts/all/NimbusMonoPS-Regular.woff', import.meta.url)).then((res) => res.arrayBuffer());
88
+ nimbusMonoItalic = fetch(new URL('../fonts/all/NimbusMonoPS-Italic.woff', import.meta.url)).then((res) => res.arrayBuffer());
89
+ nimbusMonoBold = fetch(new URL('../fonts/all/NimbusMonoPS-Bold.woff', import.meta.url)).then((res) => res.arrayBuffer());
81
90
  }
82
91
  } else {
83
92
  const { readFile } = await import('fs/promises');
@@ -99,6 +108,9 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
99
108
  nimbusSansNormal = readFile(new URL('../fonts/all_ttf/NimbusSans-Regular.ttf', import.meta.url)).then((res) => res.buffer);
100
109
  nimbusSansItalic = readFile(new URL('../fonts/all_ttf/NimbusSans-Italic.ttf', import.meta.url)).then((res) => res.buffer);
101
110
  nimbusSansBold = readFile(new URL('../fonts/all_ttf/NimbusSans-Bold.ttf', import.meta.url)).then((res) => res.buffer);
111
+ nimbusMonoNormal = readFile(new URL('../fonts/all_ttf/NimbusMonoPS-Regular.ttf', import.meta.url)).then((res) => res.buffer);
112
+ nimbusMonoItalic = readFile(new URL('../fonts/all_ttf/NimbusMonoPS-Italic.ttf', import.meta.url)).then((res) => res.buffer);
113
+ nimbusMonoBold = readFile(new URL('../fonts/all_ttf/NimbusMonoPS-Bold.ttf', import.meta.url)).then((res) => res.buffer);
102
114
  }
103
115
 
104
116
  const srcObj = {
@@ -108,6 +120,7 @@ export async function loadBuiltInFontsRaw(glyphSet = 'latin') {
108
120
  Palatino: { normal: await palatinoNormal, italic: await palatinoItalic, bold: await palatinoBold },
109
121
  NimbusRomNo9L: { normal: await nimbusRomNo9LNormal, italic: await nimbusRomNo9LItalic, bold: await nimbusRomNo9LBold },
110
122
  NimbusSans: { normal: await nimbusSansNormal, italic: await nimbusSansItalic, bold: await nimbusSansBold },
123
+ NimbusMono: { normal: await nimbusMonoNormal, italic: await nimbusMonoItalic, bold: await nimbusMonoBold },
111
124
  };
112
125
 
113
126
  FontCont.raw = await /** @type {FontContainer} */(/** @type {any} */(loadFontsFromSource(srcObj)));
@@ -256,7 +269,7 @@ export async function setUploadFontsWorker(scheduler) {
256
269
  /** @type {Object<string, fontSrcBuiltIn|fontSrcUpload>} */
257
270
  const fontsUpload = {};
258
271
  for (const [key, value] of Object.entries(FontCont.active)) {
259
- if (!['Carlito', 'Century', 'Garamond', 'Palatino', 'NimbusRomNo9L', 'NimbusSans'].includes(key)) {
272
+ if (!['Carlito', 'Century', 'Garamond', 'Palatino', 'NimbusRomNo9L', 'NimbusSans', 'NimbusMono'].includes(key)) {
260
273
  fontsUpload[key] = {
261
274
  normal: value?.normal?.src, italic: value?.italic?.src, bold: value?.bold?.src,
262
275
  };
@@ -381,8 +394,9 @@ export async function optimizeFontContainerAll(fontPrivate, fontMetricsObj) {
381
394
  const palatinoPromise = optimizeFontContainerFamily(fontPrivate.Palatino, fontMetricsObj);
382
395
  const nimbusRomNo9LPromise = optimizeFontContainerFamily(fontPrivate.NimbusRomNo9L, fontMetricsObj);
383
396
  const nimbusSansPromise = optimizeFontContainerFamily(fontPrivate.NimbusSans, fontMetricsObj);
397
+ const nimbusMonoPromise = optimizeFontContainerFamily(fontPrivate.NimbusMono, fontMetricsObj);
384
398
 
385
- const results = await Promise.all([carlitoPromise, centuryPromise, garamondPromise, palatinoPromise, nimbusRomNo9LPromise, nimbusSansPromise]);
399
+ const results = await Promise.all([carlitoPromise, centuryPromise, garamondPromise, palatinoPromise, nimbusRomNo9LPromise, nimbusSansPromise, nimbusMonoPromise]);
386
400
 
387
401
  if (results.every((x) => x === null)) return null;
388
402
 
@@ -393,5 +407,6 @@ export async function optimizeFontContainerAll(fontPrivate, fontMetricsObj) {
393
407
  Palatino: results[3],
394
408
  NimbusRomNo9L: results[4],
395
409
  NimbusSans: results[5],
410
+ NimbusMono: results[6],
396
411
  };
397
412
  }
package/js/fontEval.js CHANGED
@@ -50,6 +50,7 @@ export async function evaluateFonts(pageArr, opt) {
50
50
  const evalPalatino = !!(opt ? FontCont.opt?.Palatino : FontCont.raw?.Palatino);
51
51
  const evalGaramond = !!(opt ? FontCont.opt?.Garamond : FontCont.raw?.Garamond);
52
52
  const evalNimbusRomNo9L = !!(opt ? FontCont.opt?.NimbusRomNo9L : FontCont.raw?.NimbusRomNo9L);
53
+ const evalNimbusMono = !!(opt ? FontCont.opt?.NimbusMono : FontCont.raw?.NimbusMono);
53
54
 
54
55
  // The browser version runs in parallel using workers, however the Node.js version runs sequentially,
55
56
  // as the canvas package does not support workers, and trying to run in parallel causes problems.
@@ -63,6 +64,7 @@ export async function evaluateFonts(pageArr, opt) {
63
64
  palatino: evalPalatino ? evalPagesFont('Palatino', pageArr, opt) : null,
64
65
  garamond: evalGaramond ? evalPagesFont('Garamond', pageArr, opt) : null,
65
66
  nimbusRomNo9L: evalNimbusRomNo9L ? evalPagesFont('NimbusRomNo9L', pageArr, opt) : null,
67
+ nimbusMono: evalNimbusMono ? evalPagesFont('NimbusMono', pageArr, opt) : null,
66
68
  };
67
69
 
68
70
  fontMetricsTmp = {
@@ -72,6 +74,7 @@ export async function evaluateFonts(pageArr, opt) {
72
74
  palatino: await fontMetricsPromises.palatino,
73
75
  garamond: await fontMetricsPromises.garamond,
74
76
  nimbusRomNo9L: await fontMetricsPromises.nimbusRomNo9L,
77
+ nimbusMono: await fontMetricsPromises.nimbusMono,
75
78
  };
76
79
  } else {
77
80
  fontMetricsTmp = {
@@ -81,6 +84,7 @@ export async function evaluateFonts(pageArr, opt) {
81
84
  palatino: evalPalatino ? await evalPagesFont('Palatino', pageArr, opt) : null,
82
85
  garamond: evalGaramond ? await evalPagesFont('Garamond', pageArr, opt) : null,
83
86
  nimbusRomNo9L: evalNimbusRomNo9L ? await evalPagesFont('NimbusRomNo9L', pageArr, opt) : null,
87
+ nimbusMono: evalNimbusMono ? await evalPagesFont('NimbusMono', pageArr, opt) : null,
84
88
  };
85
89
  }
86
90
 
@@ -91,6 +95,7 @@ export async function evaluateFonts(pageArr, opt) {
91
95
  Palatino: fontMetricsTmp.palatino ? fontMetricsTmp.palatino.metricTotal / fontMetricsTmp.palatino.wordsTotal : null,
92
96
  Garamond: fontMetricsTmp.garamond ? fontMetricsTmp.garamond.metricTotal / fontMetricsTmp.garamond.wordsTotal : null,
93
97
  NimbusRomNo9L: fontMetricsTmp.nimbusRomNo9L ? fontMetricsTmp.nimbusRomNo9L.metricTotal / fontMetricsTmp.nimbusRomNo9L.wordsTotal : null,
98
+ NimbusMono: fontMetricsTmp.nimbusMono ? fontMetricsTmp.nimbusMono.metricTotal / fontMetricsTmp.nimbusMono.wordsTotal : null,
94
99
  };
95
100
 
96
101
  return fontMetrics;
@@ -106,7 +111,7 @@ const calcBestFonts = (fontMetrics) => {
106
111
 
107
112
  for (const [key, value] of Object.entries(fontMetrics)) {
108
113
  if (!['Carlito', 'NimbusSans'].includes(key)) continue;
109
- if (value < minValueSans) {
114
+ if (value && value < minValueSans) {
110
115
  minValueSans = value;
111
116
  minKeySans = key;
112
117
  }
@@ -116,8 +121,8 @@ const calcBestFonts = (fontMetrics) => {
116
121
  let minValueSerif = Number.MAX_VALUE;
117
122
 
118
123
  for (const [key, value] of Object.entries(fontMetrics)) {
119
- if (!['Century', 'Palatino', 'Garamond', 'NimbusRomNo9L'].includes(key)) continue;
120
- if (value < minValueSerif) {
124
+ if (!['Century', 'Palatino', 'Garamond', 'NimbusRomNo9L', 'NimbusMono'].includes(key)) continue;
125
+ if (value && value < minValueSerif) {
121
126
  minValueSerif = value;
122
127
  minKeySerif = key;
123
128
  }
package/js/fontSupp.js CHANGED
@@ -159,7 +159,7 @@ export const calcSuppFontInfo = async (ocrArr) => {
159
159
  for (const line of page.lines) {
160
160
  for (const word of line.words) {
161
161
  if (word.font && word.size && FontProps.sizeMult[word.font]) {
162
- word.size *= FontProps.sizeMult[word.font];
162
+ word.size = Math.round(word.size * FontProps.sizeMult[word.font] * 1000) / 1000;
163
163
  }
164
164
  }
165
165
  }
@@ -1,3 +1,5 @@
1
+ import { opt } from './containers/app.js';
2
+
1
3
  /**
2
4
  * Initializes a general worker and returns an object with methods controlled by the worker.
3
5
  * @returns {Promise} A promise that resolves to an object with control methods.
@@ -265,14 +267,14 @@ export class gs {
265
267
  gs.#resReady = resolve;
266
268
  });
267
269
 
268
- // Determine number of workers to use in the browser.
269
- // This is the minimum of:
270
- // 1. The number of cores
271
- // 3. 6 (browser-imposed memory limits make going higher than 6 problematic, even on hardware that could support it)
272
- // Node.js version only uses 1 worker.
273
- let workerN = 1;
274
- if (typeof process === 'undefined') {
270
+ let workerN;
271
+ if (opt.workerN) {
272
+ workerN = opt.workerN;
273
+ } else if (typeof process === 'undefined') {
275
274
  workerN = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 6);
275
+ } else {
276
+ const cpuN = Math.floor((await import('os')).cpus().length / 2);
277
+ workerN = Math.min(cpuN - 1, 8);
276
278
  }
277
279
 
278
280
  const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
package/js/global.d.ts CHANGED
@@ -34,6 +34,7 @@ declare global {
34
34
  Palatino: FontContainerFamilyBuiltIn;
35
35
  NimbusRomNo9L: FontContainerFamilyBuiltIn;
36
36
  NimbusSans: FontContainerFamilyBuiltIn;
37
+ NimbusMono: FontContainerFamilyBuiltIn;
37
38
  [key: string]: FontContainerFamily;
38
39
  };
39
40
 
@@ -50,6 +50,10 @@ export async function convertPageStext({ ocrStr, n }) {
50
50
  const xmlLinePreChar = xmlLine.match(/^[\s\S]*?(?=<char)/)?.[0];
51
51
  if (!xmlLinePreChar) return;
52
52
 
53
+ const dirStr = xmlLinePreChar.match(/dir=['"]([^'"]*)/)?.[1];
54
+ const dirSlopeStr = dirStr?.match(/[-\d.]+$/)?.[0];
55
+ const dirSlope = dirSlopeStr ? parseFloat(dirSlopeStr) : null;
56
+
53
57
  const xmlLineFormatting = xmlLinePreChar?.match(/<font[^>]+/)?.[0];
54
58
  const fontName = xmlLineFormatting?.match(/name=['"]([^'"]*)/)?.[1];
55
59
  const fontSizeStr = xmlLineFormatting?.match(/size=['"]([^'"]*)/)?.[1];
@@ -81,7 +85,7 @@ export async function convertPageStext({ ocrStr, n }) {
81
85
  /** @type {Array<Array<{left: number, top: number, right: number, bottom: number}>>} */
82
86
  const bboxes = [];
83
87
 
84
- const baselineSlopeArr = /** @type {Array<Number>} */ ([]);
88
+ let baselineFirstDone = false;
85
89
  const baselineFirst = /** @type {Array<Number>} */ ([]);
86
90
 
87
91
  let baselineCurrent = 0;
@@ -114,17 +118,72 @@ export async function convertPageStext({ ocrStr, n }) {
114
118
  /** @type {Array<boolean>} */
115
119
  const superArr = [];
116
120
 
117
- const wordLetterOrFontArr = /** @type {Array<Array<RegExpExecArray>>} */([]);
121
+ /**
122
+ * @typedef {Object} Point
123
+ * @property {number} x - The x coordinate.
124
+ * @property {number} y - The y coordinate.
125
+ */
126
+
127
+ /**
128
+ * @typedef {Object} Quad
129
+ * @property {Point} ul - Upper left corner.
130
+ * @property {Point} ur - Upper right corner.
131
+ * @property {Point} ll - Lower left corner.
132
+ * @property {Point} lr - Lower right corner.
133
+ */
134
+
135
+ /**
136
+ * @typedef {Object} StextChar
137
+ * @property {Quad} quad
138
+ * @property {Point} origin
139
+ * @property {string} text
140
+ */
141
+
142
+ /**
143
+ * @typedef {Object} StextFont
144
+ * @property {string} name
145
+ * @property {number} size
146
+ */
147
+
148
+ const wordCharOrFontArr = /** @type {Array<Array<StextChar|StextFont>>} */([]);
118
149
  for (let i = 0; i < wordStrArr.length; i++) {
119
150
  // Fonts can be changed at any point in the word string.
120
151
  // Sometimes the font is changed before a space character, and other times it is changed after the space character.
121
152
  // This regex splits the string into elements that contain either (1) a font change or (2) a character.
122
153
  // The "quad" attribute includes 8 numbers (x and y coordinates for all 4 corners); each one has its own capturing group, followed by groups for the origin `x`, the origin `y`, and the character `c`.
123
- const stextCharRegex = /(<font[^>]+>\s*)|<char quad=['"](\s*[\d.-]+)(\s*[\d.-]+)(?:\s*[\d.-]+)(?:\s*[\d.-]+)(?:\s*[\d.-]+)(?:\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)[^>]*?y=['"]([\d.-]+)['"][^>]*?c=['"]([^'"]+)['"]\s*\/>/ig;
124
- wordLetterOrFontArr[i] = [...wordStrArr[i].matchAll(stextCharRegex)];
154
+ const stextCharRegex = /(<font[^>]+>\s*)|<char quad=['"](\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)(\s*[\d.-]+)[^>]*?x=['"]([\d.-]+)[^>]*?y=['"]([\d.-]+)['"][^>]*?c=['"]([^'"]+)['"]\s*\/>/ig;
155
+
156
+ const stextMatches = [...wordStrArr[i].matchAll(stextCharRegex)];
157
+
158
+ wordCharOrFontArr[i] = [];
159
+ for (let j = 0; j < stextMatches.length; j++) {
160
+ const fontStr = stextMatches[j][1];
161
+ const fontNameStrI = fontStr?.match(/name=['"]([^'"]*)/)?.[1];
162
+ const fontSizeStrI = fontStr?.match(/size=['"]([^'"]*)/)?.[1];
163
+ if (fontNameStrI && fontSizeStrI) {
164
+ wordCharOrFontArr[i][j] = {
165
+ name: fontNameStrI,
166
+ size: parseFloat(fontSizeStrI),
167
+ };
168
+ continue;
169
+ }
170
+
171
+ const quad = {
172
+ ul: { x: parseFloat(stextMatches[j][2]), y: parseFloat(stextMatches[j][3]) },
173
+ ur: { x: parseFloat(stextMatches[j][4]), y: parseFloat(stextMatches[j][5]) },
174
+ ll: { x: parseFloat(stextMatches[j][6]), y: parseFloat(stextMatches[j][7]) },
175
+ lr: { x: parseFloat(stextMatches[j][8]), y: parseFloat(stextMatches[j][9]) },
176
+ };
177
+
178
+ wordCharOrFontArr[i][j] = {
179
+ quad,
180
+ origin: { x: parseFloat(stextMatches[j][10]), y: parseFloat(stextMatches[j][11]) },
181
+ text: stextMatches[j][12],
182
+ };
183
+ }
125
184
  }
126
185
 
127
- for (let i = 0; i < wordLetterOrFontArr.length; i++) {
186
+ for (let i = 0; i < wordCharOrFontArr.length; i++) {
128
187
  let textWordArr = [];
129
188
  let bboxesWordArr = [];
130
189
  let fontFamily = familyCurrent || fontFamilyLine || 'Default';
@@ -137,28 +196,38 @@ export async function convertPageStext({ ocrStr, n }) {
137
196
  let smallCapsWordAltTitleCaseAdj = false;
138
197
  let styleWord = 'normal';
139
198
 
140
- const letterOrFontArr = wordLetterOrFontArr[i];
141
-
142
- if (letterOrFontArr.length === 0) continue;
199
+ if (wordCharOrFontArr[i].length === 0) continue;
143
200
 
144
201
  let wordInit = false;
145
202
 
146
- for (let j = 0; j < letterOrFontArr.length; j++) {
147
- const fontStr = letterOrFontArr[j][1];
148
- const fontNameStrI = fontStr?.match(/name=['"]([^'"]*)/)?.[1];
149
- const fontSizeStrI = fontStr?.match(/size=['"]([^'"]*)/)?.[1];
150
- const baseline = parseFloat(letterOrFontArr[j][6]);
151
- if (fontNameStrI && fontSizeStrI) {
203
+ for (let j = 0; j < wordCharOrFontArr[i].length; j++) {
204
+ const charOrFont = wordCharOrFontArr[i][j];
205
+ if ('name' in charOrFont) {
152
206
  // While small caps can be printed using special "small caps" fonts, they can also be printed using a regular font with a size change.
153
207
  // This block of code detects small caps printed in title case by checking for a decrease in font size after the first letter.
154
208
  // TODO: This logic currently fails when:
155
209
  // (1) Runs of small caps include punctuation, which is printed at the full size (and therefore is counted as a size increase ending small caps).
156
210
  // (2) Runs of small caps that start with lower-case letters, which do not conform to the expectation that runs of small caps start with a capital letter.
157
211
  const sizePrevRaw = sizeCurrentRaw;
158
- sizeCurrentRaw = parseFloat(fontSizeStrI);
212
+ sizeCurrentRaw = charOrFont.size;
159
213
  const secondLetter = wordInit && textWordArr.length === 1 && /[A-Z]/.test(textWordArr[0]);
160
- const baselineNextLetter = parseFloat(letterOrFontArr[j + 1]?.[6]) || parseFloat(wordLetterOrFontArr[i + 1]?.[0]?.[6])
161
- || parseFloat(wordLetterOrFontArr[i + 1]?.[1]?.[6]) || parseFloat(wordLetterOrFontArr[i + 1]?.[2]?.[6]);
214
+
215
+ let baselineNextLetter;
216
+ const possibleNextLetter1 = wordCharOrFontArr[i][j + 1];
217
+ const possibleNextLetter2 = wordCharOrFontArr[i + 1]?.[0];
218
+ const possibleNextLetter3 = wordCharOrFontArr[i + 1]?.[1];
219
+ const possibleNextLetter4 = wordCharOrFontArr[i + 1]?.[2];
220
+
221
+ if (possibleNextLetter1 && 'origin' in possibleNextLetter1) {
222
+ baselineNextLetter = possibleNextLetter1.origin.y;
223
+ } else if (possibleNextLetter2 && 'origin' in possibleNextLetter2) {
224
+ baselineNextLetter = possibleNextLetter2.origin.y;
225
+ } else if (possibleNextLetter3 && 'origin' in possibleNextLetter3) {
226
+ baselineNextLetter = possibleNextLetter3.origin.y;
227
+ } else if (possibleNextLetter4 && 'origin' in possibleNextLetter4) {
228
+ baselineNextLetter = possibleNextLetter4.origin.y;
229
+ }
230
+
162
231
  const fontSizeMin = Math.min(sizeCurrentRaw, sizePrevRaw);
163
232
  const baselineDelta = (baselineNextLetter - baselineCurrent) / fontSizeMin;
164
233
  const sizeDelta = (sizeCurrentRaw - sizePrevRaw) / fontSizeMin;
@@ -177,7 +246,13 @@ export async function convertPageStext({ ocrStr, n }) {
177
246
  bboxes.push(bboxesWordArr);
178
247
  styleArr.push(styleWord);
179
248
  fontFamilyArr.push(fontFamily);
180
- fontSizeArr.push(fontSizeWord);
249
+
250
+ if (sizeDelta > 0) {
251
+ fontSizeArr.push(sizePrevRaw);
252
+ } else {
253
+ fontSizeArr.push(fontSizeWord);
254
+ }
255
+
181
256
  smallCapsArr.push(smallCapsWord);
182
257
  smallCapsAltArr.push(smallCapsWordAlt);
183
258
  smallCapsAltTitleCaseArr.push(smallCapsWordAltTitleCaseAdj);
@@ -187,21 +262,25 @@ export async function convertPageStext({ ocrStr, n }) {
187
262
  bboxesWordArr = [];
188
263
  }
189
264
 
190
- // If the first word was determined to be a superscript, reset `baselineFirst` to avoid skewing the slope calculation.
191
265
  if (sizeDelta > 0) {
192
- baselineFirst.length = 0;
193
- familyCurrent = fontNameStrI || familyCurrent;
266
+ // If the first word was determined to be a superscript, reset `baselineFirst` to avoid skewing the slope calculation.
267
+ if (!baselineFirstDone) baselineFirst.length = 0;
268
+ familyCurrent = charOrFont.name || familyCurrent;
194
269
  sizeCurrent = sizeCurrentRaw || sizeCurrent;
195
270
  fontSizeWord = sizeCurrent;
196
271
  fontFamily = familyCurrent;
197
272
  superArr[superArr.length - 1] = true;
198
- fontSizeArr[fontSizeArr.length - 1] = sizeCurrentRaw;
273
+ }
274
+
275
+ // If `baselineFirst` was set using a non-superscript word, mark it as done.
276
+ if (superArr.length > 0 && !superArr[superArr.length - 1] && baselineFirst.length > 0) {
277
+ baselineFirstDone = true;
199
278
  }
200
279
 
201
280
  superCurrent = sizeDelta < 0;
202
281
  } else {
203
282
  sizeCurrent = sizeCurrentRaw || sizeCurrent;
204
- familyCurrent = fontNameStrI || familyCurrent;
283
+ familyCurrent = charOrFont.name || familyCurrent;
205
284
  // Update current word only if this is before every letter in the word.
206
285
  if (textWordArr.length === 0) {
207
286
  fontSizeWord = sizeCurrent;
@@ -210,7 +289,7 @@ export async function convertPageStext({ ocrStr, n }) {
210
289
  // An increase in font size ends any small caps sequence.
211
290
  // A threshold is necessary because stext data has been observed to have small variations without a clear reason.
212
291
  // eslint-disable-next-line no-lonely-if
213
- if (Math.abs(sizeDelta) > 0.05) {
292
+ if (Number.isFinite(sizeDelta) && Math.abs(sizeDelta) > 0.05) {
214
293
  smallCapsCurrentAlt = false;
215
294
  if (textWordArr.length === 0) {
216
295
  superCurrent = false;
@@ -222,14 +301,14 @@ export async function convertPageStext({ ocrStr, n }) {
222
301
 
223
302
  // Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
224
303
  smallCapsCurrentAlt = smallCapsCurrentAlt ?? smallCapsAltArr[smallCapsAltArr.length - 1];
225
- smallCapsCurrent = /(small\W?cap)|(sc$)|(caps$)/i.test(fontNameStrI);
304
+ smallCapsCurrent = /(small\W?cap)|(sc$)|(caps$)/i.test(charOrFont.name);
226
305
  smallCapsWord = smallCapsCurrent;
227
306
 
228
- if (/italic/i.test(fontNameStrI) || /-\w*ital/i.test(fontNameStrI)) {
307
+ if (/italic/i.test(charOrFont.name) || /-\w*ital/i.test(charOrFont.name)) {
229
308
  // The word is already initialized, so we need to change the last element of the style array.
230
309
  // Label as `smallCapsAlt` rather than `smallCaps`, as we confirm the word is all caps before marking as `smallCaps`.
231
310
  styleCurrent = 'italic';
232
- } else if (/bold|black/i.test(fontNameStrI)) {
311
+ } else if (/bold|black/i.test(charOrFont.name)) {
233
312
  styleCurrent = 'bold';
234
313
  } else {
235
314
  styleCurrent = 'normal';
@@ -237,7 +316,7 @@ export async function convertPageStext({ ocrStr, n }) {
237
316
 
238
317
  continue;
239
318
  } else {
240
- baselineCurrent = baseline;
319
+ baselineCurrent = charOrFont.origin.y;
241
320
  }
242
321
 
243
322
  if (!wordInit) {
@@ -246,24 +325,22 @@ export async function convertPageStext({ ocrStr, n }) {
246
325
  }
247
326
 
248
327
  const bbox = {
249
- left: Math.round(parseFloat(letterOrFontArr[j][2])),
250
- top: Math.round(parseFloat(letterOrFontArr[j][3])),
251
- right: Math.round(parseFloat(letterOrFontArr[j][4])),
252
- bottom: Math.round(parseFloat(letterOrFontArr[j][5])),
328
+ left: Math.round(charOrFont.origin.x),
329
+ top: Math.round(Math.min(charOrFont.quad.ul.y, charOrFont.quad.ur.y)),
330
+ right: Math.round(charOrFont.origin.x + (charOrFont.quad.ur.x - charOrFont.quad.ul.x)),
331
+ bottom: Math.round(Math.max(charOrFont.quad.ll.y, charOrFont.quad.lr.y)),
253
332
  };
254
333
 
255
334
  if (!superCurrent) {
256
335
  if (baselineFirst.length === 0) {
257
- baselineFirst.push(bbox.left, baseline);
258
- } else {
259
- baselineSlopeArr.push((baseline - baselineFirst[1]) / (bbox.left - baselineFirst[0]));
336
+ baselineFirst.push(bbox.left, charOrFont.origin.y);
260
337
  }
261
338
  }
262
339
 
263
340
  // Small caps created by reducing font size can carry forward across multiple words.
264
341
  smallCapsCurrentAlt = smallCapsCurrentAlt ?? smallCapsAltArr[smallCapsAltArr.length - 1];
265
342
 
266
- textWordArr.push(letterOrFontArr[j][7]);
343
+ textWordArr.push(charOrFont.text);
267
344
 
268
345
  bboxesWordArr.push(bbox);
269
346
  }
@@ -288,13 +365,19 @@ export async function convertPageStext({ ocrStr, n }) {
288
365
  }
289
366
 
290
367
  superArr.push(superCurrent);
368
+ if (superCurrent) fontSizeArr[fontSizeArr.length - 1] = sizeCurrentRaw;
291
369
  }
292
370
 
293
371
  // Return if there are no letters in the line.
294
372
  // This commonly happens for "lines" that contain only space characters.
295
373
  if (bboxes.length === 0) return;
296
374
 
297
- const baselineSlope = quantile(baselineSlopeArr, 0.5) || 0;
375
+ let baselineSlope = 0;
376
+ if (dirSlope !== null) {
377
+ baselineSlope = dirSlope;
378
+ } else {
379
+ console.log('Unable to parse slope.');
380
+ }
298
381
 
299
382
  const lineBbox = {
300
383
  left: lineBoxArr[0], top: lineBoxArr[1], right: lineBoxArr[2], bottom: lineBoxArr[3],
@@ -427,7 +510,10 @@ export async function convertPageStext({ ocrStr, n }) {
427
510
 
428
511
  for (let i = 0; i < lineStrArr.length; i++) {
429
512
  const angle = convertLineStext(lineStrArr[i]);
430
- if (typeof angle === 'number' && !Number.isNaN(angle)) angleRisePage.push(angle);
513
+ // The `Math.abs(angle) < 0.3` condition avoids vertical text impacting the angle calculation.
514
+ // The page angle is intended to account for page skew, not different orientations (90/180/270 degrees).
515
+ // TODO: Eventually different orientations should be supported.
516
+ if (typeof angle === 'number' && !Number.isNaN(angle) && Math.abs(angle) < 0.3) angleRisePage.push(angle);
431
517
  }
432
518
 
433
519
  if (parLineArr.length === 0) return;
@@ -447,9 +447,12 @@ export async function importFiles(files, options = {}) {
447
447
  });
448
448
  } else if (inputData.pdfMode && (extractPDFTextNative || extractPDFTextOCR)) {
449
449
  await extractInternalPDFText({
450
- setActive: true, extractPDFTextNative, extractPDFTextOCR, extractPDFTextImage,
450
+ setActive: opt.usePDFTextMain, extractPDFTextNative, extractPDFTextOCR, extractPDFTextImage,
451
451
  });
452
- if (opt.calcSuppFontInfo) await calcSuppFontInfo(ocrAll.pdf);
452
+ if (opt.usePDFTextMain) {
453
+ if (inputData.pdfType === 'text') FontCont.enableCleanToNimbusMono = true;
454
+ if (opt.calcSuppFontInfo) await calcSuppFontInfo(ocrAll.pdf);
455
+ }
453
456
  }
454
457
  }
455
458
 
@@ -644,6 +644,37 @@ function cloneChar(char) {
644
644
  return charNew;
645
645
  }
646
646
 
647
+ /**
648
+ * Gets words that match the provided text.
649
+ * @param {string} text
650
+ * @param {OcrPage} ocrPage
651
+ */
652
+ function getMatchingWords(text, ocrPage) {
653
+ text = text.trim().toLowerCase();
654
+
655
+ if (!text) return [];
656
+ const textArr = text.split(' ');
657
+
658
+ const wordArr = ocr.getPageWords(ocrPage);
659
+
660
+ const matchArr = [];
661
+
662
+ for (let i = 0; i < wordArr.length - (textArr.length - 1); i++) {
663
+ const word = wordArr[i];
664
+
665
+ if (!word.text.toLowerCase().includes(textArr[0])) continue;
666
+
667
+ const candArr = wordArr.slice(i, i + textArr.length);
668
+ const candText = candArr.map((x) => x.text).join(' ').toLowerCase();
669
+
670
+ if (candText.toLowerCase().includes(text)) {
671
+ matchArr.push(...candArr);
672
+ }
673
+ }
674
+
675
+ return matchArr;
676
+ }
677
+
647
678
  /**
648
679
  * Gets word IDs that match the provided text.
649
680
  * @param {string} text
@@ -729,6 +760,7 @@ const ocr = {
729
760
  getPageWord,
730
761
  getPageWords,
731
762
  getDistinctChars,
763
+ getMatchingWords,
732
764
  getMatchingWordIds,
733
765
  getPageText,
734
766
  getParText,
@@ -529,14 +529,21 @@ export async function recognize(options = {}) {
529
529
  if (langs.includes('rus') || langs.includes('ukr') || langs.includes('ell')) fontPromiseArr.push(loadBuiltInFontsRaw('all'));
530
530
  await Promise.all(fontPromiseArr);
531
531
 
532
- /** @type {?OcrPage[]} */
533
- const existingOCR = ocrAll['User Upload'] || ocrAll.pdf;
532
+ let forceMainData = false;
533
+ let existingOCR;
534
+ if (ocrAll['User Upload']) {
535
+ existingOCR = ocrAll['User Upload'];
536
+ } else if (opt.usePDFTextSupp && ocrAll.pdf) {
537
+ existingOCR = ocrAll.pdf;
538
+ // If the PDF text is not the active data, it is assumed to be for supplemental purposes only.
539
+ forceMainData = ocrAll.pdf !== ocrAll.active;
540
+ }
534
541
 
535
542
  // A single Tesseract engine can be used (Legacy or LSTM) or the results from both can be used and combined.
536
543
  if (oemMode === 'legacy' || oemMode === 'lstm') {
537
544
  // Tesseract is used as the "main" data unless user-uploaded data exists and only the LSTM model is being run.
538
545
  // This is because Tesseract Legacy provides very strong metrics, and Abbyy often does not.
539
- await recognizeAllPages(oemMode === 'legacy', oemMode === 'lstm', !(oemMode === 'lstm' && !!existingOCR), langs, vanillaMode);
546
+ await recognizeAllPages(oemMode === 'legacy', oemMode === 'lstm', !existingOCR, langs, vanillaMode);
540
547
 
541
548
  // Metrics from the LSTM model are so inaccurate they are not worth using.
542
549
  if (oemMode === 'legacy') {
@@ -544,7 +551,7 @@ export async function recognize(options = {}) {
544
551
  await runFontOptimization(ocrAll['Tesseract Legacy']);
545
552
  }
546
553
  } else if (oemMode === 'combined') {
547
- await recognizeAllPages(true, true, true, langs, vanillaMode);
554
+ await recognizeAllPages(true, true, !existingOCR, langs, vanillaMode);
548
555
 
549
556
  if (opt.saveDebugImages) {
550
557
  DebugData.debugImg.Combined = new Array(ImageCache.pageCount);
@@ -653,9 +660,16 @@ export async function recognize(options = {}) {
653
660
  ignorePunct: opt.ignorePunct,
654
661
  confThreshHigh: opt.confThreshHigh,
655
662
  confThreshMed: opt.confThreshMed,
663
+ // If the existing data was invisible OCR text extracted from a PDF, it is assumed to not have accurate bounding boxes.
664
+ useBboxB: !forceMainData && existingOCR === ocrAll.pdf && inputData.pdfMode && !!inputData.pdfType && ['image', 'ocr'].includes(inputData.pdfType),
656
665
  };
657
666
 
658
- const res = await compareOCR(existingOCR, ocrAll['Tesseract Combined'], compOptions);
667
+ let res;
668
+ if (forceMainData) {
669
+ res = await compareOCR(ocrAll['Tesseract Combined'], existingOCR, compOptions);
670
+ } else {
671
+ res = await compareOCR(existingOCR, ocrAll['Tesseract Combined'], compOptions);
672
+ }
659
673
 
660
674
  if (DebugData.debugImg.Combined) DebugData.debugImg.Combined = res.debug;
661
675
 
@@ -237,8 +237,11 @@ export function calcWordMetrics(word, angle = 0) {
237
237
  const wordLastGlyphMetrics = fontOpentype.charToGlyph(charArr2.at(-1)).getMetrics();
238
238
  const wordFirstGlyphMetrics = fontOpentype.charToGlyph(charArr2[0]).getMetrics();
239
239
 
240
- let wordLeftBearing = wordFirstGlyphMetrics.leftSideBearing || 0;
241
- let wordRightBearing = wordLastGlyphMetrics.rightSideBearing || 0;
240
+ // The `leftSideBearing`/`rightSideBearing` numbers reported by Opentype.js are not accurate for mono-spaced fonts, so `xMin`/`xMax` are used instead.
241
+ let wordLeftBearing = wordFirstGlyphMetrics.xMin || 0;
242
+ let lastGlyphMax = wordLastGlyphMetrics.xMax || 0;
243
+ if (word.smallCaps && charArr2[charArr2.length - 1] !== charArr[charArr2.length - 1]) lastGlyphMax *= fontI.smallCapsMult;
244
+ let wordRightBearing = advanceArr[advanceArr.length - 1] - lastGlyphMax;
242
245
  if (word.smallCaps && charArr2[0] !== charArr[0]) wordLeftBearing *= fontI.smallCapsMult;
243
246
  if (word.smallCaps && charArr2[charArr2.length - 1] !== charArr[charArr2.length - 1]) wordRightBearing *= fontI.smallCapsMult;
244
247
 
@@ -290,6 +293,11 @@ export const calcWordFontSize = (word) => {
290
293
  if (word.visualCoords) {
291
294
  return getFontSize(fontOpentype, word.bbox.bottom - word.bbox.top, word.text);
292
295
  }
296
+ if (word.size) {
297
+ const mult = FontProps.sizeMult[font.family] || 1;
298
+ return word.size / mult;
299
+ }
300
+
293
301
  return (word.bbox.bottom - word.bbox.top) * (fontOpentype.unitsPerEm / (fontOpentype.ascender - fontOpentype.descender));
294
302
  }
295
303
 
@@ -379,7 +379,7 @@ export function replaceObjectProperties(obj, obj2 = {}) {
379
379
  // Fonts that should not be added (both Sans and Serif variants):
380
380
  // DejaVu
381
381
  const serifFonts = ['SerifDefault', 'Baskerville', 'Bookman', 'C059', 'Calibri', 'Cambria', 'Century', 'Courier', 'Garamond', 'Georgia',
382
- 'LucidaBright', 'Minion', 'Optima', 'P052', 'Palatino', 'Times'];
382
+ 'LucidaBright', 'Minion', 'NimbusMono', 'Optima', 'P052', 'Palatino', 'Times'];
383
383
  const sansFonts = ['SansDefault', 'Avenir', 'Arial', 'Calibri', 'Candara', 'Carlito', 'Comic', 'Franklin', 'Futura', 'Gotham',
384
384
  'Helvetica', 'Impact', 'Interstate', 'Myriad', 'Tahoma', 'Trebuchet', 'Univers', 'Verdana'];
385
385
 
@@ -463,6 +463,7 @@ async function penalizeWord(wordObjs) {
463
463
  * rather than simply setting `compTruth`/`matchTruth`. Enabled when using recognition to update confidence metrics, but not when comparing to ground truth.
464
464
  * @param {boolean} [params.options.legacyLSTMComb] - Whether Tesseract Legacy and Tesseract LSTM are being combined, when `mode = 'comb'`.
465
465
  * When `legacyLSTMComb` is enabled, additional heuristics are applied that are based on specific behaviors of the Tesseract Legacy engine.
466
+ * @param {boolean} [params.options.useBboxB] - Use bounding boxes from `pageB` in combined output.
466
467
  * @param {string} [params.options.debugLabel]
467
468
  * @param {boolean} [params.options.evalConflicts] - Whether to evaluate word quality on conflicts. If `false` the text from `pageB` is always assumed correct.
468
469
  * This option is useful for combining the style from Tesseract Legacy with the text from Tesseract LSTM.
@@ -494,6 +495,7 @@ export async function compareOCRPageImp({
494
495
  const mode = options?.mode === undefined ? 'stats' : options?.mode;
495
496
  const editConf = options?.editConf === undefined ? false : options?.editConf;
496
497
  const legacyLSTMComb = options?.legacyLSTMComb === undefined ? false : options?.legacyLSTMComb;
498
+ const useBboxB = options?.useBboxB === undefined ? false : options?.useBboxB;
497
499
  const debugLabel = options?.debugLabel === undefined ? '' : options?.debugLabel;
498
500
  const evalConflicts = options?.evalConflicts === undefined ? true : options?.evalConflicts;
499
501
  const supplementComp = options?.supplementComp === undefined ? false : options?.supplementComp;
@@ -597,8 +599,13 @@ export async function compareOCRPageImp({
597
599
 
598
600
  const wordBoxACore = JSON.parse(JSON.stringify(wordBoxA));
599
601
 
600
- wordBoxACore.top = wordBoxA.top + Math.round(wordBoxAHeight * 0.1);
601
- wordBoxACore.bottom = wordBoxA.bottom - Math.round(wordBoxAHeight * 0.1);
602
+ if (wordA.visualCoords) {
603
+ wordBoxACore.top = wordBoxA.top + Math.round(wordBoxAHeight * 0.1);
604
+ wordBoxACore.bottom = wordBoxA.bottom - Math.round(wordBoxAHeight * 0.1);
605
+ } else {
606
+ wordBoxACore.top = wordBoxA.top + Math.round(wordBoxAHeight * 0.25);
607
+ wordBoxACore.bottom = wordBoxA.bottom - Math.round(wordBoxAHeight * 0.25);
608
+ }
602
609
 
603
610
  for (let l = minWordB; l < lineB.words.length; l++) {
604
611
  const wordB = lineB.words[l];
@@ -612,8 +619,13 @@ export async function compareOCRPageImp({
612
619
 
613
620
  const wordBoxBCore = JSON.parse(JSON.stringify(wordBoxB));
614
621
 
615
- wordBoxBCore.top = wordBoxB.top + Math.round(wordBoxBHeight * 0.1);
616
- wordBoxBCore.bottom = wordBoxB.bottom - Math.round(wordBoxBHeight * 0.1);
622
+ if (wordB.visualCoords) {
623
+ wordBoxBCore.top = wordBoxB.top + Math.round(wordBoxBHeight * 0.1);
624
+ wordBoxBCore.bottom = wordBoxB.bottom - Math.round(wordBoxBHeight * 0.1);
625
+ } else {
626
+ wordBoxBCore.top = wordBoxB.top + Math.round(wordBoxBHeight * 0.25);
627
+ wordBoxBCore.bottom = wordBoxB.bottom - Math.round(wordBoxBHeight * 0.25);
628
+ }
617
629
 
618
630
  // If left of word A is past right of word B, move to next word B
619
631
  if (wordBoxACore.left > wordBoxBCore.right) {
@@ -660,6 +672,11 @@ export async function compareOCRPageImp({
660
672
  if (mode === 'comb') wordA.conf = 100;
661
673
  hocrACorrect[wordA.id] = 1;
662
674
  hocrBCorrect[wordB.id] = 1;
675
+ if (mode === 'comb' && useBboxB) {
676
+ wordA.bbox = structuredClone(wordB.bbox);
677
+ wordA.visualCoords = true;
678
+ wordA.chars = structuredClone(wordB.chars);
679
+ }
663
680
  } else if (mode === 'comb') {
664
681
  wordA.conf = 0;
665
682
  wordA.matchTruth = false;
@@ -101,8 +101,11 @@ const calculateKerningPairs = (font, fontMetricsObj, xHeight, style) => {
101
101
  const indexFirst = font.charToGlyphIndex(charFirst);
102
102
  const indexSecond = font.charToGlyphIndex(charSecond);
103
103
 
104
- const metricsFirst = font.glyphs.glyphs[indexFirst].getMetrics();
105
- const metricsSecond = font.glyphs.glyphs[indexSecond].getMetrics();
104
+ const glyphFirst = font.glyphs.glyphs[indexFirst];
105
+ const glyphSecond = font.glyphs.glyphs[indexSecond];
106
+
107
+ const metricsFirst = glyphFirst.getMetrics();
108
+ const metricsSecond = glyphSecond.getMetrics();
106
109
 
107
110
  const fontKern1 = Math.round(value * xHeight);
108
111
  let spaceTarget = fontKern1;
@@ -119,7 +122,7 @@ const calculateKerningPairs = (font, fontMetricsObj, xHeight, style) => {
119
122
  }
120
123
 
121
124
  // Calculate current space between these 2 glyphs (without kerning adjustments)
122
- const spaceCurrent = metricsFirst.rightSideBearing + metricsSecond.leftSideBearing;
125
+ const spaceCurrent = (glyphFirst.advanceWidth - metricsFirst.xMax) + metricsSecond.xMin;
123
126
 
124
127
  // Calculate kerning adjustment needed
125
128
  let fontKern = spaceTarget - spaceCurrent;
Binary file
@@ -90,7 +90,7 @@ export async function initMuPDFWorker() {
90
90
  return function (...args) {
91
91
  return new Promise((resolve, reject) => {
92
92
  // Add the PDF as the first argument for most functions
93
- if (!['openDocument', 'cleanFile'].includes(func)) {
93
+ if (!['openDocument', 'cleanFile', 'freeDocument'].includes(func)) {
94
94
  // Remove job number (appended by Tesseract scheduler function)
95
95
  // args = args.slice(0,-1)
96
96
 
@@ -165,6 +165,8 @@ mupdf.pageText = function (doc, {
165
165
 
166
166
  const content = Module.UTF8ToString(dataPtr);
167
167
 
168
+ Module._free(dataPtr);
169
+
168
170
  return {
169
171
  letterCountTotal,
170
172
  letterCountVis,
@@ -464,7 +466,7 @@ const handleMessage = (data) => {
464
466
  } catch (error) {
465
467
  parentPort.postMessage(['ERROR', id, { name: error.name, message: error.message }]);
466
468
  }
467
- }
469
+ };
468
470
 
469
471
  if (typeof process === 'undefined') {
470
472
  onmessage = (event) => handleMessage(event.data);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "scribe.js-ocr",
3
- "version": "0.3.0",
3
+ "version": "0.4.0",
4
4
  "description": "High-quality OCR and text extraction for images and PDFs.",
5
5
  "main": "scribe.js",
6
6
  "directories": {
package/scribe.js CHANGED
@@ -94,7 +94,7 @@ const extractText = async (files, langs = ['eng'], outputFormat = 'txt', options
94
94
  init({ ocr: true, font: true });
95
95
  await importFiles(files, { extractPDFTextNative: skipRecPDFTextNative, extractPDFTextOCR: skipRecPDFTextOCR });
96
96
  if (!inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode) throw new Error('No relevant files to process.');
97
- const skipRecPDF = inputData.pdfMode && (ImageCache.pdfType === 'text' && skipRecPDFTextNative || ImageCache.pdfType === 'ocr' && skipRecPDFTextOCR);
97
+ const skipRecPDF = inputData.pdfMode && (inputData.pdfType === 'text' && skipRecPDFTextNative || inputData.pdfType === 'ocr' && skipRecPDFTextOCR);
98
98
  const skipRecOCR = inputData.xmlMode[0] && !inputData.imageMode && !inputData.pdfMode;
99
99
  if (!skipRecPDF && !skipRecOCR) await recognize({ langs });
100
100
  return exportData(outputFormat);