scribe.js-ocr 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitmodules +1 -1
- package/js/containers/imageContainer.js +19 -7
- package/js/export/export.js +0 -1
- package/js/export/writeDocx.js +116 -4
- package/js/export/writeText.js +6 -60
- package/js/extractPDFText.js +1 -1
- package/js/generalWorkerMain.js +1 -1
- package/js/import/import.js +5 -0
- package/js/import/importOCR.js +2 -2
- package/js/utils/miscUtils.js +2 -1
- package/package.json +1 -1
package/.gitmodules
CHANGED
package/js/containers/imageContainer.js
CHANGED
|
@@ -142,10 +142,23 @@ export class ImageCache {
|
|
|
142
142
|
* Initializes the MuPDF scheduler.
|
|
143
143
|
* This is separate from the function that loads the file (`#loadFileMuPDFScheduler`),
|
|
144
144
|
* as the scheduler starts loading ahead of the file being available for performance reasons.
|
|
145
|
-
* @param {number} numWorkers
|
|
146
|
-
* @returns
|
|
145
|
+
* @param {number} [numWorkers]
|
|
147
146
|
*/
|
|
148
|
-
static #initMuPDFScheduler = async (numWorkers
|
|
147
|
+
static #initMuPDFScheduler = async (numWorkers) => {
|
|
148
|
+
// If `numWorkers` is not specified, use up to 3 workers based on hardware concurrency
|
|
149
|
+
// and the global `opt.workerN` setting.
|
|
150
|
+
if (!numWorkers) {
|
|
151
|
+
if (typeof process === 'undefined') {
|
|
152
|
+
numWorkers = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 3);
|
|
153
|
+
} else {
|
|
154
|
+
const cpuN = Math.floor((await import('node:os')).cpus().length / 2);
|
|
155
|
+
numWorkers = Math.max(Math.min(cpuN - 1, 3), 1);
|
|
156
|
+
}
|
|
157
|
+
if (opt.workerN && opt.workerN < numWorkers) {
|
|
158
|
+
numWorkers = opt.workerN;
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
149
162
|
const Tesseract = typeof process === 'undefined' ? (await import('../../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
|
|
150
163
|
const scheduler = await Tesseract.createScheduler();
|
|
151
164
|
const workersPromiseArr = range(1, numWorkers).map(async () => {
|
|
@@ -357,10 +370,9 @@ export class ImageCache {
|
|
|
357
370
|
|
|
358
371
|
/**
|
|
359
372
|
* Gets the MuPDF scheduler if it exists, otherwise creates a new one.
|
|
360
|
-
* @param {number} [numWorkers
|
|
361
|
-
* @returns
|
|
373
|
+
* @param {number} [numWorkers] - Number of workers to create.
|
|
362
374
|
*/
|
|
363
|
-
static getMuPDFScheduler = async (numWorkers
|
|
375
|
+
static getMuPDFScheduler = async (numWorkers) => {
|
|
364
376
|
if (ImageCache.muPDFScheduler) return ImageCache.muPDFScheduler;
|
|
365
377
|
ImageCache.muPDFScheduler = ImageCache.#initMuPDFScheduler(numWorkers);
|
|
366
378
|
return ImageCache.muPDFScheduler;
|
|
@@ -372,7 +384,7 @@ export class ImageCache {
|
|
|
372
384
|
* @param {Boolean} [skipText=false] - Whether to skip native text when rendering PDF to image.
|
|
373
385
|
*/
|
|
374
386
|
static openMainPDF = async (fileData, skipText = false) => {
|
|
375
|
-
const muPDFScheduler = await ImageCache.getMuPDFScheduler(
|
|
387
|
+
const muPDFScheduler = await ImageCache.getMuPDFScheduler();
|
|
376
388
|
|
|
377
389
|
await ImageCache.#loadFileMuPDFScheduler(fileData);
|
|
378
390
|
|
package/js/export/export.js
CHANGED
|
@@ -254,7 +254,6 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
254
254
|
minpage: minPage,
|
|
255
255
|
maxpage: maxPage,
|
|
256
256
|
reflowText: opt.reflow,
|
|
257
|
-
docxMode: false,
|
|
258
257
|
});
|
|
259
258
|
// Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools.
|
|
260
259
|
// @ts-ignore
|
package/js/export/writeDocx.js
CHANGED
|
@@ -1,9 +1,122 @@
|
|
|
1
1
|
import { documentEnd, documentStart, docxStrings } from './resources/docxFiles.js';
|
|
2
2
|
|
|
3
|
-
import { writeText } from './writeText.js';
|
|
4
|
-
|
|
5
3
|
import { opt } from '../containers/app.js';
|
|
6
4
|
|
|
5
|
+
import { assignParagraphs } from '../utils/reflowPars.js';
|
|
6
|
+
|
|
7
|
+
import { pageMetricsAll } from '../containers/dataContainer.js';
|
|
8
|
+
import ocr from '../objects/ocrObjects.js';
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Convert an array of ocrPage objects to XML for a Word document.
|
|
12
|
+
*
|
|
13
|
+
* @param {Object} params
|
|
14
|
+
* @param {Array<OcrPage>} params.ocrCurrent - Array of ocrPage objects to convert.
|
|
15
|
+
* @param {number} [params.minpage=0] - The first page to include in the document.
|
|
16
|
+
* @param {number} [params.maxpage=-1] - The last page to include in the document.
|
|
17
|
+
* @param {boolean} [params.reflowText=false] - Remove line breaks within what appears to be the same paragraph.
|
|
18
|
+
* @param {?Array<string>} [params.wordIds=null] - An array of word IDs to include in the document.
|
|
19
|
+
* If omitted, all words are included.
|
|
20
|
+
*/
|
|
21
|
+
export function writeDocxContent({
|
|
22
|
+
ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, wordIds = null,
|
|
23
|
+
}) {
|
|
24
|
+
let textStr = '';
|
|
25
|
+
|
|
26
|
+
if (maxpage === -1) maxpage = ocrCurrent.length - 1;
|
|
27
|
+
|
|
28
|
+
let newLine = false;
|
|
29
|
+
|
|
30
|
+
for (let g = minpage; g <= maxpage; g++) {
|
|
31
|
+
if (!ocrCurrent[g] || ocrCurrent[g].lines.length === 0) continue;
|
|
32
|
+
|
|
33
|
+
const pageObj = ocrCurrent[g];
|
|
34
|
+
|
|
35
|
+
// Do not overwrite paragraphs from Abbyy or Textract.
|
|
36
|
+
if (reflowText && (!pageObj.textSource || !['textract', 'abbyy'].includes(pageObj.textSource))) {
|
|
37
|
+
const angle = pageMetricsAll[g].angle || 0;
|
|
38
|
+
assignParagraphs(pageObj, angle);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
let parCurrent = pageObj.lines[0].par;
|
|
42
|
+
|
|
43
|
+
let fontStylePrev = '';
|
|
44
|
+
let supPrev = false;
|
|
45
|
+
|
|
46
|
+
for (let h = 0; h < pageObj.lines.length; h++) {
|
|
47
|
+
const lineObj = pageObj.lines[h];
|
|
48
|
+
|
|
49
|
+
if (reflowText) {
|
|
50
|
+
if (g > 0 && h === 0 || lineObj.par !== parCurrent) newLine = true;
|
|
51
|
+
parCurrent = lineObj.par;
|
|
52
|
+
} else {
|
|
53
|
+
newLine = true;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
for (let i = 0; i < lineObj.words.length; i++) {
|
|
57
|
+
const wordObj = lineObj.words[i];
|
|
58
|
+
if (!wordObj) continue;
|
|
59
|
+
|
|
60
|
+
if (wordIds && !wordIds.includes(wordObj.id)) continue;
|
|
61
|
+
|
|
62
|
+
let fontStyle = '';
|
|
63
|
+
if (wordObj.style.italic) {
|
|
64
|
+
fontStyle += '<w:i/>';
|
|
65
|
+
} else if (wordObj.style.bold) {
|
|
66
|
+
fontStyle += '<w:b/>';
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
if (wordObj.style.smallCaps) {
|
|
70
|
+
fontStyle += '<w:smallCaps/>';
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
if (wordObj.style.underline) {
|
|
74
|
+
fontStyle += '<w:u w:val="single"/>';
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
if (wordObj.style.sup) {
|
|
78
|
+
fontStyle += '<w:vertAlign w:val="superscript"/>';
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
if (newLine || fontStyle !== fontStylePrev || (h === 0 && g === 0 && i === 0)) {
|
|
82
|
+
const styleStr = fontStyle === '' ? '' : `<w:rPr>${fontStyle}</w:rPr>`;
|
|
83
|
+
|
|
84
|
+
if (h === 0 && g === 0 && i === 0) {
|
|
85
|
+
textStr = `${textStr}<w:p><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
86
|
+
} else if (newLine) {
|
|
87
|
+
textStr = `${textStr}</w:t></w:r></w:p><w:p><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
88
|
+
// If the previous word was a superscript, the space is added when switching back to normal text.
|
|
89
|
+
} else if (supPrev) {
|
|
90
|
+
textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve"> `;
|
|
91
|
+
// If this word is a superscript, no space is added between words.
|
|
92
|
+
} else if (wordObj.style.sup && i > 0) {
|
|
93
|
+
textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
94
|
+
} else {
|
|
95
|
+
textStr = `${textStr} </w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
96
|
+
}
|
|
97
|
+
} else {
|
|
98
|
+
textStr += ' ';
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
fontStylePrev = fontStyle;
|
|
102
|
+
supPrev = wordObj.style.sup;
|
|
103
|
+
|
|
104
|
+
newLine = false;
|
|
105
|
+
|
|
106
|
+
// DOCX is an XML format, so any escaped XML characters need to continue being escaped.
|
|
107
|
+
// TODO: Figure out how to properly export superscripts to Word
|
|
108
|
+
textStr += ocr.escapeXml(wordObj.text);
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
opt.progressHandler({ n: g, type: 'export', info: { } });
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// Add final closing tags
|
|
115
|
+
if (textStr) textStr += '</w:t></w:r></w:p>';
|
|
116
|
+
|
|
117
|
+
return textStr;
|
|
118
|
+
}
|
|
119
|
+
|
|
7
120
|
/**
|
|
8
121
|
* Create a Word document from an array of ocrPage objects.
|
|
9
122
|
*
|
|
@@ -20,12 +133,11 @@ export async function writeDocx({ hocrCurrent, minpage = 0, maxpage = -1 }) {
|
|
|
20
133
|
const zipFileWriter = new Uint8ArrayWriter();
|
|
21
134
|
const zipWriter = new ZipWriter(zipFileWriter);
|
|
22
135
|
|
|
23
|
-
const textReader = new TextReader(documentStart + writeText({
|
|
136
|
+
const textReader = new TextReader(documentStart + writeDocxContent({
|
|
24
137
|
ocrCurrent: hocrCurrent,
|
|
25
138
|
minpage,
|
|
26
139
|
maxpage,
|
|
27
140
|
reflowText: opt.reflow,
|
|
28
|
-
docxMode: true,
|
|
29
141
|
}) + documentEnd);
|
|
30
142
|
await zipWriter.add('word/document.xml', textReader);
|
|
31
143
|
|
package/js/export/writeText.js
CHANGED
|
@@ -1,21 +1,21 @@
|
|
|
1
1
|
import { opt } from '../containers/app.js';
|
|
2
2
|
import { pageMetricsAll } from '../containers/dataContainer.js';
|
|
3
|
-
import ocr from '../objects/ocrObjects.js';
|
|
4
3
|
import { assignParagraphs } from '../utils/reflowPars.js';
|
|
5
4
|
|
|
6
5
|
/**
|
|
7
|
-
* Convert an array of ocrPage objects to plain text
|
|
6
|
+
* Convert an array of ocrPage objects to plain text.
|
|
8
7
|
*
|
|
9
8
|
* @param {Object} params
|
|
10
9
|
* @param {Array<OcrPage>} params.ocrCurrent -
|
|
11
10
|
* @param {number} [params.minpage=0] - The first page to include in the document.
|
|
12
11
|
* @param {number} [params.maxpage=-1] - The last page to include in the document.
|
|
13
12
|
* @param {boolean} [params.reflowText=false] - Remove line breaks within what appears to be the same paragraph.
|
|
14
|
-
* @param {boolean} [params.docxMode=false] - Create XML for a word document rather than plain text.
|
|
15
13
|
* @param {?Array<string>} [params.wordIds=null] - An array of word IDs to include in the document.
|
|
16
14
|
* If omitted, all words are included.
|
|
17
15
|
*/
|
|
18
|
-
export function writeText({
|
|
16
|
+
export function writeText({
|
|
17
|
+
ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, wordIds = null,
|
|
18
|
+
}) {
|
|
19
19
|
let textStr = '';
|
|
20
20
|
|
|
21
21
|
if (maxpage === -1) maxpage = ocrCurrent.length - 1;
|
|
@@ -35,9 +35,6 @@ export function writeText({ ocrCurrent, minpage = 0, maxpage = -1, reflowText =
|
|
|
35
35
|
|
|
36
36
|
let parCurrent = pageObj.lines[0].par;
|
|
37
37
|
|
|
38
|
-
let fontStylePrev = '';
|
|
39
|
-
let supPrev = false;
|
|
40
|
-
|
|
41
38
|
for (let h = 0; h < pageObj.lines.length; h++) {
|
|
42
39
|
const lineObj = pageObj.lines[h];
|
|
43
40
|
|
|
@@ -54,49 +51,7 @@ export function writeText({ ocrCurrent, minpage = 0, maxpage = -1, reflowText =
|
|
|
54
51
|
|
|
55
52
|
if (wordIds && !wordIds.includes(wordObj.id)) continue;
|
|
56
53
|
|
|
57
|
-
if (docxMode) {
|
|
58
|
-
let fontStyle = '';
|
|
59
|
-
if (wordObj.style.italic) {
|
|
60
|
-
fontStyle += '<w:i/>';
|
|
61
|
-
} else if (wordObj.style.bold) {
|
|
62
|
-
fontStyle += '<w:b/>';
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
if (wordObj.style.smallCaps) {
|
|
66
|
-
fontStyle += '<w:smallCaps/>';
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
if (wordObj.style.underline) {
|
|
70
|
-
fontStyle += '<w:u w:val="single"/>';
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
if (wordObj.style.sup) {
|
|
74
|
-
fontStyle += '<w:vertAlign w:val="superscript"/>';
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
if (newLine || fontStyle !== fontStylePrev || (h === 0 && g === 0 && i === 0)) {
|
|
78
|
-
const styleStr = fontStyle === '' ? '' : `<w:rPr>${fontStyle}</w:rPr>`;
|
|
79
|
-
|
|
80
|
-
if (h === 0 && g === 0 && i === 0) {
|
|
81
|
-
textStr = `${textStr}<w:p><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
82
|
-
} else if (newLine) {
|
|
83
|
-
textStr = `${textStr}</w:t></w:r></w:p><w:p><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
84
|
-
// If the previous word was a superscript, the space is added switching back to normal text.
|
|
85
|
-
} else if (supPrev) {
|
|
86
|
-
textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve"> `;
|
|
87
|
-
// If this word is a superscript, no space is added between words.
|
|
88
|
-
} else if (wordObj.style.sup && i > 0) {
|
|
89
|
-
textStr = `${textStr}</w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
90
|
-
} else {
|
|
91
|
-
textStr = `${textStr} </w:t></w:r><w:r>${styleStr}<w:t xml:space="preserve">`;
|
|
92
|
-
}
|
|
93
|
-
} else {
|
|
94
|
-
textStr += ' ';
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
fontStylePrev = fontStyle;
|
|
98
|
-
supPrev = wordObj.style.sup;
|
|
99
|
-
} else if (newLine) {
|
|
54
|
+
if (newLine) {
|
|
100
55
|
textStr = `${textStr}\n`;
|
|
101
56
|
} else if (h > 0 || g > 0 || i > 0) {
|
|
102
57
|
textStr = `${textStr} `;
|
|
@@ -104,20 +59,11 @@ export function writeText({ ocrCurrent, minpage = 0, maxpage = -1, reflowText =
|
|
|
104
59
|
|
|
105
60
|
newLine = false;
|
|
106
61
|
|
|
107
|
-
|
|
108
|
-
if (docxMode) {
|
|
109
|
-
// TODO: Figure out how to properly export superscripts to Word
|
|
110
|
-
textStr += ocr.escapeXml(wordObj.text);
|
|
111
|
-
} else {
|
|
112
|
-
textStr += wordObj.text;
|
|
113
|
-
}
|
|
62
|
+
textStr += wordObj.text;
|
|
114
63
|
}
|
|
115
64
|
}
|
|
116
65
|
opt.progressHandler({ n: g, type: 'export', info: { } });
|
|
117
66
|
}
|
|
118
67
|
|
|
119
|
-
// Add final closing tags
|
|
120
|
-
if (docxMode && textStr) textStr += '</w:t></w:r></w:p>';
|
|
121
|
-
|
|
122
68
|
return textStr;
|
|
123
69
|
}
|
package/js/extractPDFText.js
CHANGED
|
@@ -8,7 +8,7 @@ import { convertOCR } from './recognizeConvert.js';
|
|
|
8
8
|
* Reports whether PDF is text-native, contains invisible OCR text, or is image-only.
|
|
9
9
|
*/
|
|
10
10
|
const extractInternalPDFTextRaw = async () => {
|
|
11
|
-
const muPDFScheduler = await ImageCache.getMuPDFScheduler(
|
|
11
|
+
const muPDFScheduler = await ImageCache.getMuPDFScheduler();
|
|
12
12
|
|
|
13
13
|
const pdfContentStats = {
|
|
14
14
|
/** Total number of letters in the source PDF. */
|
package/js/generalWorkerMain.js
CHANGED
|
@@ -293,7 +293,7 @@ export class gs {
|
|
|
293
293
|
workerN = Math.min(Math.round((globalThis.navigator.hardwareConcurrency || 8) / 2), 6);
|
|
294
294
|
} else {
|
|
295
295
|
const cpuN = Math.floor((await import('node:os')).cpus().length / 2);
|
|
296
|
-
workerN = Math.min(cpuN - 1, 8);
|
|
296
|
+
workerN = Math.max(Math.min(cpuN - 1, 8), 1);
|
|
297
297
|
}
|
|
298
298
|
|
|
299
299
|
const Tesseract = typeof process === 'undefined' ? (await import('../tess/tesseract.esm.min.js')).default : await import('@scribe.js/tesseract.js');
|
package/js/import/import.js
CHANGED
|
@@ -345,6 +345,11 @@ export async function importFiles(files) {
|
|
|
345
345
|
|
|
346
346
|
format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format);
|
|
347
347
|
|
|
348
|
+
// The text import function requires built-in fonts to be loaded.
|
|
349
|
+
if (format === 'text') {
|
|
350
|
+
await loadBuiltInFontsRaw();
|
|
351
|
+
}
|
|
352
|
+
|
|
348
353
|
ocrAllRaw.active = ocrData.hocrRaw;
|
|
349
354
|
// Subset OCR data to avoid uncaught error that occurs when there are more pages of OCR data than image data.
|
|
350
355
|
// While this should be rare, it appears to be fairly common with Archive.org documents.
|
package/js/import/importOCR.js
CHANGED
|
@@ -95,7 +95,7 @@ export async function importOCRFiles(ocrFilesAll) {
|
|
|
95
95
|
if (singleHOCRMode) {
|
|
96
96
|
const hocrStrAll = await readOcrFile(ocrFilesAll[0]);
|
|
97
97
|
|
|
98
|
-
format = detectOcrFormat(hocrStrAll);
|
|
98
|
+
format = detectOcrFormat(hocrStrAll, ocrFilesAll[0]?.name?.split('.').pop());
|
|
99
99
|
|
|
100
100
|
if (!format) {
|
|
101
101
|
console.error(ocrFilesAll[0]);
|
|
@@ -134,7 +134,7 @@ export async function importOCRFiles(ocrFilesAll) {
|
|
|
134
134
|
// Check whether input is Abbyy XML using the first file
|
|
135
135
|
const hocrStrFirst = await readOcrFile(ocrFilesAll[0]);
|
|
136
136
|
|
|
137
|
-
format = detectOcrFormat(hocrStrFirst);
|
|
137
|
+
format = detectOcrFormat(hocrStrFirst, ocrFilesAll[0]?.name?.split('.').pop());
|
|
138
138
|
|
|
139
139
|
if (!format) {
|
|
140
140
|
console.error(ocrFilesAll[0]);
|
package/js/utils/miscUtils.js
CHANGED
|
@@ -323,7 +323,8 @@ export function countSubstringOccurrences(string, subString, allowOverlapping, c
|
|
|
323
323
|
export const saveAs = async (content, fileName) => {
|
|
324
324
|
if (typeof process !== 'undefined') {
|
|
325
325
|
const { promises: fsPromises } = await import('node:fs');
|
|
326
|
-
|
|
326
|
+
const buffer = content instanceof ArrayBuffer ? Buffer.from(content) : content;
|
|
327
|
+
await fsPromises.writeFile(fileName, buffer);
|
|
327
328
|
return;
|
|
328
329
|
}
|
|
329
330
|
|