scribe.js-ocr 0.9.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.gitmodules CHANGED
@@ -3,4 +3,4 @@
3
3
  url = https://github.com/scribeocr/scrollview-web.git
4
4
  [submodule "cloud-adapters"]
5
5
  path = cloud-adapters
6
- url = git@github.com:scribeocr/cloud-adapters.git
6
+ url = https://github.com/scribeocr/cloud-adapters.git
@@ -142,10 +142,23 @@ export class ImageCache {
142
142
  * Initializes the MuPDF scheduler.
143
143
  * This is separate from the function that loads the file (`#loadFileMuPDFScheduler`),
144
144
  * as the scheduler starts loading ahead of the file being available for performance reasons.
145
- * @param {number} numWorkers
146
- * @returns
145
+ * @param {number} [numWorkers]
147
146
  */
148
- static #initMuPDFScheduler = async (numWorkers = 3) => {
147
+ static #initMuPDFScheduler = async (numWorkers) => {
148
+ // If `numWorkers` is not specified, use up to 3 workers based on hardware concurrency
149
+ // and the global `opt.workerN` setting.
150
+ if (!numWorkers) {
151
+ if (typeof process === 'undefined') {
152
+ numWorkers = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 3);
153
+ } else {
154
+ const cpuN = Math.floor((await import('node:os')).cpus().length / 2);
155
+ numWorkers = Math.max(Math.min(cpuN - 1, 3), 1);
156
+ }
157
+ if (opt.workerN && opt.workerN < numWorkers) {
158
+ numWorkers = opt.workerN;
159
+ }
160
+ }
161
+
149
162
  const Tesseract = typeof process === 'undefined' ? (await import('../../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
150
163
  const scheduler = await Tesseract.createScheduler();
151
164
  const workersPromiseArr = range(1, numWorkers).map(async () => {
@@ -357,10 +370,9 @@ export class ImageCache {
357
370
 
358
371
  /**
359
372
  * Gets the MuPDF scheduler if it exists, otherwise creates a new one.
360
- * @param {number} [numWorkers=3] - Number of workers to create.
361
- * @returns
373
+ * @param {number} [numWorkers] - Number of workers to create.
362
374
  */
363
- static getMuPDFScheduler = async (numWorkers = 3) => {
375
+ static getMuPDFScheduler = async (numWorkers) => {
364
376
  if (ImageCache.muPDFScheduler) return ImageCache.muPDFScheduler;
365
377
  ImageCache.muPDFScheduler = ImageCache.#initMuPDFScheduler(numWorkers);
366
378
  return ImageCache.muPDFScheduler;
@@ -372,7 +384,7 @@ export class ImageCache {
372
384
  * @param {Boolean} [skipText=false] - Whether to skip native text when rendering PDF to image.
373
385
  */
374
386
  static openMainPDF = async (fileData, skipText = false) => {
375
- const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
387
+ const muPDFScheduler = await ImageCache.getMuPDFScheduler();
376
388
 
377
389
  await ImageCache.#loadFileMuPDFScheduler(fileData);
378
390
 
@@ -254,7 +254,6 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
254
254
  minpage: minPage,
255
255
  maxpage: maxPage,
256
256
  reflowText: opt.reflow,
257
- docxMode: false,
258
257
  });
259
258
  // Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools.
260
259
  // @ts-ignore
@@ -1,9 +1,122 @@
1
1
  import { documentEnd, documentStart, docxStrings } from './resources/docxFiles.js';
2
2
 
3
- import { writeText } from './writeText.js';
4
-
5
3
  import { opt } from '../containers/app.js';
6
4
 
5
+ import { assignParagraphs } from '../utils/reflowPars.js';
6
+
7
+ import { pageMetricsAll } from '../containers/dataContainer.js';
8
+ import ocr from '../objects/ocrObjects.js';
9
+
10
+ /**
11
+ * Convert an array of ocrPage objects to XML for a Word document.
12
+ *
13
+ * @param {Object} params
14
+ * @param {Array<OcrPage>} params.ocrCurrent -
15
+ * @param {number} [params.minpage=0] - The first page to include in the document.
16
+ * @param {number} [params.maxpage=-1] - The last page to include in the document.
17
+ * @param {boolean} [params.reflowText=false] - Remove line breaks within what appears to be the same paragraph.
18
+ * @param {?Array<string>} [params.wordIds=null] - An array of word IDs to include in the document.
19
+ * If omitted, all words are included.
20
+ */
21
+ export function writeDocxContent({
22
+ ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, wordIds = null,
23
+ }) {
24
+ let textStr = '';
25
+
26
+ if (maxpage === -1) maxpage = ocrCurrent.length - 1;
27
+
28
+ let newLine = false;
29
+
30
+ for (let g = minpage; g <= maxpage; g++) {
31
+ if (!ocrCurrent[g] || ocrCurrent[g].lines.length === 0) continue;
32
+
33
+ const pageObj = ocrCurrent[g];
34
+
35
+ // Do not overwrite paragraphs from Abbyy or Textract.
36
+ if (reflowText && (!pageObj.textSource || !['textract', 'abbyy'].includes(pageObj.textSource))) {
37
+ const angle = pageMetricsAll[g].angle || 0;
38
+ assignParagraphs(pageObj, angle);
39
+ }
40
+
41
+ let parCurrent = pageObj.lines[0].par;
42
+
43
+ let fontStylePrev = '';
44
+ let supPrev = false;
45
+
46
+ for (let h = 0; h < pageObj.lines.length; h++) {
47
+ const lineObj = pageObj.lines[h];
48
+
49
+ if (reflowText) {
50
+ if (g > 0 && h === 0 || lineObj.par !== parCurrent) newLine = true;
51
+ parCurrent = lineObj.par;
52
+ } else {
53
+ newLine = true;
54
+ }
55
+
56
+ for (let i = 0; i < lineObj.words.length; i++) {
57
+ const wordObj = lineObj.words[i];
58
+ if (!wordObj) continue;
59
+
60
+ if (wordIds && !wordIds.includes(wordObj.id)) continue;
61
+
62
+ let fontStyle = '';
63
+ if (wordObj.style.italic) {
64
+ fontStyle += '<w:i/>';
65
+ } else if (wordObj.style.bold) {
66
+ fontStyle += '<w:b/>';
67
+ }
68
+
69
+ if (wordObj.style.smallCaps) {
70
+ fontStyle += '<w:smallCaps/>';
71
+ }
72
+
73
+ if (wordObj.style.underline) {
74
+ fontStyle += '<w:u w:val="single"/>';
75
+ }
76
+
77
+ if (wordObj.style.sup) {
78
+ fontStyle += '<w:vertAlign w:val="superscript"/>';
79
+ }
80
+
81
+ if (newLine || fontStyle !== fontStylePrev || (h === 0 && g === 0 && i === 0)) {
82
+ const styleStr = fontStyle === '' ? '' : `<w:rPr>${fontStyle}</w:rPr>`;
83
+
84
+ if (h === 0 && g === 0 && i === 0) {
85
+ textStr = `${textStr}<w:p><w:r>${styleStr}<w:t xml:space="preserve">`;
86
+ } else if (newLine) {
87
+ textStr = `${textStr}</w:t></w:r></w:p><w:p><w:r>${styleStr}<w:t xml:space="preserve">`;
88
+ // If the previous word was a superscript, the space is added when switching back to normal text.
89
+ } else if (supPrev) {
90
+ textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve"> `;
91
+ // If this word is a superscript, no space is added between words.
92
+ } else if (wordObj.style.sup && i > 0) {
93
+ textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
94
+ } else {
95
+ textStr = `${textStr} </w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
96
+ }
97
+ } else {
98
+ textStr += ' ';
99
+ }
100
+
101
+ fontStylePrev = fontStyle;
102
+ supPrev = wordObj.style.sup;
103
+
104
+ newLine = false;
105
+
106
+ // DOCX is an XML format, so any escaped XML characters need to continue being escaped.
107
+ // TODO: Figure out how to properly export superscripts to Word
108
+ textStr += ocr.escapeXml(wordObj.text);
109
+ }
110
+ }
111
+ opt.progressHandler({ n: g, type: 'export', info: { } });
112
+ }
113
+
114
+ // Add final closing tags
115
+ if (textStr) textStr += '</w:t></w:r></w:p>';
116
+
117
+ return textStr;
118
+ }
119
+
7
120
  /**
8
121
  * Create a Word document from an array of ocrPage objects.
9
122
  *
@@ -20,12 +133,11 @@ export async function writeDocx({ hocrCurrent, minpage = 0, maxpage = -1 }) {
20
133
  const zipFileWriter = new Uint8ArrayWriter();
21
134
  const zipWriter = new ZipWriter(zipFileWriter);
22
135
 
23
- const textReader = new TextReader(documentStart + writeText({
136
+ const textReader = new TextReader(documentStart + writeDocxContent({
24
137
  ocrCurrent: hocrCurrent,
25
138
  minpage,
26
139
  maxpage,
27
140
  reflowText: opt.reflow,
28
- docxMode: true,
29
141
  }) + documentEnd);
30
142
  await zipWriter.add('word/document.xml', textReader);
31
143
 
@@ -1,21 +1,21 @@
1
1
  import { opt } from '../containers/app.js';
2
2
  import { pageMetricsAll } from '../containers/dataContainer.js';
3
- import ocr from '../objects/ocrObjects.js';
4
3
  import { assignParagraphs } from '../utils/reflowPars.js';
5
4
 
6
5
  /**
7
- * Convert an array of ocrPage objects to plain text, or XML for a Word document.
6
+ * Convert an array of ocrPage objects to plain text.
8
7
  *
9
8
  * @param {Object} params
10
9
  * @param {Array<OcrPage>} params.ocrCurrent -
11
10
  * @param {number} [params.minpage=0] - The first page to include in the document.
12
11
  * @param {number} [params.maxpage=-1] - The last page to include in the document.
13
12
  * @param {boolean} [params.reflowText=false] - Remove line breaks within what appears to be the same paragraph.
14
- * @param {boolean} [params.docxMode=false] - Create XML for a word document rather than plain text.
15
13
  * @param {?Array<string>} [params.wordIds=null] - An array of word IDs to include in the document.
16
14
  * If omitted, all words are included.
17
15
  */
18
- export function writeText({ ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false, wordIds = null }) {
16
+ export function writeText({
17
+ ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, wordIds = null,
18
+ }) {
19
19
  let textStr = '';
20
20
 
21
21
  if (maxpage === -1) maxpage = ocrCurrent.length - 1;
@@ -35,9 +35,6 @@ export function writeText({ ocrCurrent, minpage = 0, maxpage = -1, reflowText =
35
35
 
36
36
  let parCurrent = pageObj.lines[0].par;
37
37
 
38
- let fontStylePrev = '';
39
- let supPrev = false;
40
-
41
38
  for (let h = 0; h < pageObj.lines.length; h++) {
42
39
  const lineObj = pageObj.lines[h];
43
40
 
@@ -54,49 +51,7 @@ export function writeText({ ocrCurrent, minpage = 0, maxpage = -1, reflowText =
54
51
 
55
52
  if (wordIds && !wordIds.includes(wordObj.id)) continue;
56
53
 
57
- if (docxMode) {
58
- let fontStyle = '';
59
- if (wordObj.style.italic) {
60
- fontStyle += '<w:i/>';
61
- } else if (wordObj.style.bold) {
62
- fontStyle += '<w:b/>';
63
- }
64
-
65
- if (wordObj.style.smallCaps) {
66
- fontStyle += '<w:smallCaps/>';
67
- }
68
-
69
- if (wordObj.style.underline) {
70
- fontStyle += '<w:u w:val="single"/>';
71
- }
72
-
73
- if (wordObj.style.sup) {
74
- fontStyle += '<w:vertAlign w:val="superscript"/>';
75
- }
76
-
77
- if (newLine || fontStyle !== fontStylePrev || (h === 0 && g === 0 && i === 0)) {
78
- const styleStr = fontStyle === '' ? '' : `<w:rPr>${fontStyle}</w:rPr>`;
79
-
80
- if (h === 0 && g === 0 && i === 0) {
81
- textStr = `${textStr}<w:p><w:r>${styleStr}<w:t xml:space="preserve">`;
82
- } else if (newLine) {
83
- textStr = `${textStr}</w:t></w:r></w:p><w:p><w:r>${styleStr}<w:t xml:space="preserve">`;
84
- // If the previous word was a superscript, the space is added switching back to normal text.
85
- } else if (supPrev) {
86
- textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve"> `;
87
- // If this word is a superscript, no space is added between words.
88
- } else if (wordObj.style.sup && i > 0) {
89
- textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
90
- } else {
91
- textStr = `${textStr} </w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
92
- }
93
- } else {
94
- textStr += ' ';
95
- }
96
-
97
- fontStylePrev = fontStyle;
98
- supPrev = wordObj.style.sup;
99
- } else if (newLine) {
54
+ if (newLine) {
100
55
  textStr = `${textStr}\n`;
101
56
  } else if (h > 0 || g > 0 || i > 0) {
102
57
  textStr = `${textStr} `;
@@ -104,20 +59,11 @@ export function writeText({ ocrCurrent, minpage = 0, maxpage = -1, reflowText =
104
59
 
105
60
  newLine = false;
106
61
 
107
- // DOCX is an XML format, so any escaped XML characters need to continue being escaped.
108
- if (docxMode) {
109
- // TODO: Figure out how to properly export superscripts to Word
110
- textStr += ocr.escapeXml(wordObj.text);
111
- } else {
112
- textStr += wordObj.text;
113
- }
62
+ textStr += wordObj.text;
114
63
  }
115
64
  }
116
65
  opt.progressHandler({ n: g, type: 'export', info: { } });
117
66
  }
118
67
 
119
- // Add final closing tags
120
- if (docxMode && textStr) textStr += '</w:t></w:r></w:p>';
121
-
122
68
  return textStr;
123
69
  }
@@ -8,7 +8,7 @@ import { convertOCR } from './recognizeConvert.js';
8
8
  * Reports whether PDF is text-native, contains invisible OCR text, or is image-only.
9
9
  */
10
10
  const extractInternalPDFTextRaw = async () => {
11
- const muPDFScheduler = await ImageCache.getMuPDFScheduler(3);
11
+ const muPDFScheduler = await ImageCache.getMuPDFScheduler();
12
12
 
13
13
  const pdfContentStats = {
14
14
  /** Total number of letters in the source PDF. */
@@ -293,7 +293,7 @@ export class gs {
293
293
  workerN = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 6);
294
294
  } else {
295
295
  const cpuN = Math.floor((await import('node:os')).cpus().length / 2);
296
- workerN = Math.min(cpuN - 1, 8);
296
+ workerN = Math.max(Math.min(cpuN - 1, 8), 1);
297
297
  }
298
298
 
299
299
  const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
@@ -345,6 +345,11 @@ export async function importFiles(files) {
345
345
 
346
346
  format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format);
347
347
 
348
+ // The text import function requires built-in fonts to be loaded.
349
+ if (format === 'text') {
350
+ await loadBuiltInFontsRaw();
351
+ }
352
+
348
353
  ocrAllRaw.active = ocrData.hocrRaw;
349
354
  // Subset OCR data to avoid uncaught error that occurs when there are more pages of OCR data than image data.
350
355
  // While this should be rare, it appears to be fairly common with Archive.org documents.
@@ -95,7 +95,7 @@ export async function importOCRFiles(ocrFilesAll) {
95
95
  if (singleHOCRMode) {
96
96
  const hocrStrAll = await readOcrFile(ocrFilesAll[0]);
97
97
 
98
- format = detectOcrFormat(hocrStrAll);
98
+ format = detectOcrFormat(hocrStrAll, ocrFilesAll[0]?.name?.split('.').pop());
99
99
 
100
100
  if (!format) {
101
101
  console.error(ocrFilesAll[0]);
@@ -134,7 +134,7 @@ export async function importOCRFiles(ocrFilesAll) {
134
134
  // Check whether input is Abbyy XML using the first file
135
135
  const hocrStrFirst = await readOcrFile(ocrFilesAll[0]);
136
136
 
137
- format = detectOcrFormat(hocrStrFirst);
137
+ format = detectOcrFormat(hocrStrFirst, ocrFilesAll[0]?.name?.split('.').pop());
138
138
 
139
139
  if (!format) {
140
140
  console.error(ocrFilesAll[0]);
@@ -323,7 +323,8 @@ export function countSubstringOccurrences(string, subString, allowOverlapping, c
323
323
  export const saveAs = async (content, fileName) => {
324
324
  if (typeof process !== 'undefined') {
325
325
  const { promises: fsPromises } = await import('node:fs');
326
- await fsPromises.writeFile(fileName, content);
326
+ const buffer = content instanceof ArrayBuffer ? Buffer.from(content) : content;
327
+ await fsPromises.writeFile(fileName, buffer);
327
328
  return;
328
329
  }
329
330
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "scribe.js-ocr",
3
- "version": "0.9.1",
3
+ "version": "0.9.2",
4
4
  "description": "High-quality OCR and text extraction for images and PDFs.",
5
5
  "main": "scribe.js",
6
6
  "directories": {