scribe.js-ocr 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/cli.js +12 -1
- package/cli/detectPDFType.js +13 -20
- package/cli/extract.js +4 -2
- package/cli/scribe.js +9 -1
- package/js/export/export.js +1 -1
- package/js/export/writeHtml.js +44 -28
- package/package.json +1 -1
package/cli/cli.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { detectPDFType } from './detectPDFType.js';
|
|
1
2
|
import { extract } from './extract.js';
|
|
2
3
|
import {
|
|
3
4
|
check,
|
|
@@ -36,7 +37,7 @@ export const evalInternalCLI = async (pdfFile, ocrFile, options) => {
|
|
|
36
37
|
* @param {string} pdfFile - Path to PDF file.
|
|
37
38
|
* @param {?string} [outputDir='.'] - Output directory.
|
|
38
39
|
* @param {Object} [options]
|
|
39
|
-
* @param {
|
|
40
|
+
* @param {"pdf" | "hocr" | "docx" | "xlsx" | "txt" | "text" | "html"} [options.format]
|
|
40
41
|
* @param {boolean} [options.reflow]
|
|
41
42
|
*/
|
|
42
43
|
export const extractCLI = async (pdfFile, outputDir, options) => {
|
|
@@ -44,6 +45,16 @@ export const extractCLI = async (pdfFile, outputDir, options) => {
|
|
|
44
45
|
process.exitCode = 0;
|
|
45
46
|
};
|
|
46
47
|
|
|
48
|
+
/**
|
|
49
|
+
*
|
|
50
|
+
* @param {string} pdfFile - Path to PDF file.
|
|
51
|
+
* @param {string} [outputPath] - Output file path.
|
|
52
|
+
*/
|
|
53
|
+
export const detectPDFTypeCLI = async (pdfFile, outputPath) => {
|
|
54
|
+
await detectPDFType(pdfFile, outputPath);
|
|
55
|
+
process.exitCode = 0;
|
|
56
|
+
};
|
|
57
|
+
|
|
47
58
|
/**
|
|
48
59
|
*
|
|
49
60
|
* @param {string} pdfFile - Path to PDF file.
|
package/cli/detectPDFType.js
CHANGED
|
@@ -1,20 +1,17 @@
|
|
|
1
|
-
// Code for adding visualization to OCR output
|
|
2
|
-
|
|
3
1
|
import fs from 'fs';
|
|
4
|
-
import
|
|
5
|
-
import Worker from 'web-worker';
|
|
6
|
-
import { initMuPDFWorker } from '../mupdf/mupdf-async.js';
|
|
7
|
-
|
|
8
|
-
globalThis.Worker = Worker;
|
|
9
|
-
globalThis.require = createRequire(import.meta.url);
|
|
2
|
+
import scribe from '../scribe.js';
|
|
10
3
|
|
|
11
|
-
|
|
4
|
+
/**
|
|
5
|
+
*
|
|
6
|
+
* @param {string} pdfFile - Path to PDF file.
|
|
7
|
+
* @param {string} [outputPath] - Output file path.
|
|
8
|
+
* If provided, the text will be extracted and saved to this path.
|
|
9
|
+
*/
|
|
10
|
+
export const detectPDFType = async (pdfFile, outputPath) => {
|
|
11
|
+
const mupdfScheduler = await scribe.data.image.getMuPDFScheduler(1);
|
|
12
|
+
const w = mupdfScheduler.workers[0];
|
|
12
13
|
|
|
13
|
-
|
|
14
|
-
const w = await initMuPDFWorker();
|
|
15
|
-
const fileData = await fs.readFileSync(args[0]);
|
|
16
|
-
|
|
17
|
-
const outputPath = args[1];
|
|
14
|
+
const fileData = await fs.readFileSync(pdfFile);
|
|
18
15
|
|
|
19
16
|
const pdfDoc = await w.openDocument(fileData, 'file.pdf');
|
|
20
17
|
w.pdfDoc = pdfDoc;
|
|
@@ -32,10 +29,6 @@ async function main() {
|
|
|
32
29
|
|
|
33
30
|
console.log('PDF Type:', type);
|
|
34
31
|
|
|
35
|
-
|
|
36
|
-
w.terminate();
|
|
37
|
-
|
|
38
|
-
process.exitCode = 0;
|
|
39
|
-
}
|
|
32
|
+
mupdfScheduler.scheduler.terminate();
|
|
40
33
|
|
|
41
|
-
|
|
34
|
+
};
|
package/cli/extract.js
CHANGED
|
@@ -7,7 +7,7 @@ import scribe from '../scribe.js';
|
|
|
7
7
|
* @param {string} pdfFile - Path to PDF file.
|
|
8
8
|
* @param {?string} [output='.'] - Output file or directory.
|
|
9
9
|
* @param {Object} [options]
|
|
10
|
-
* @param {
|
|
10
|
+
* @param {Parameters<typeof scribe.download>[0]} [options.format]
|
|
11
11
|
* @param {boolean} [options.reflow]
|
|
12
12
|
*/
|
|
13
13
|
export const extract = async (pdfFile, output, options) => {
|
|
@@ -18,7 +18,9 @@ export const extract = async (pdfFile, output, options) => {
|
|
|
18
18
|
const outputFile = outputDir === output ? `${path.basename(pdfFile).replace(/\.\w{1,5}$/i, `.${format}`)}` : path.basename(output);
|
|
19
19
|
const outputPath = `${outputDir}/${outputFile}`;
|
|
20
20
|
|
|
21
|
-
scribe.
|
|
21
|
+
scribe.opt.reflow = true;
|
|
22
|
+
scribe.opt.extractText = true;
|
|
23
|
+
|
|
22
24
|
await scribe.init();
|
|
23
25
|
await scribe.importFiles([pdfFile]);
|
|
24
26
|
|
package/cli/scribe.js
CHANGED
|
@@ -4,6 +4,7 @@ import {
|
|
|
4
4
|
checkCLI,
|
|
5
5
|
confCLI,
|
|
6
6
|
debugCLI,
|
|
7
|
+
detectPDFTypeCLI,
|
|
7
8
|
evalInternalCLI, extractCLI, overlayCLI, recognizeCLI,
|
|
8
9
|
} from './cli.js';
|
|
9
10
|
|
|
@@ -35,7 +36,7 @@ program
|
|
|
35
36
|
.command('extract')
|
|
36
37
|
.argument('<pdf_file>', 'Input PDF file.')
|
|
37
38
|
.argument('[output]', 'Output directory or file to save results.', '.')
|
|
38
|
-
.addOption(new Option('-f, --format <ext>', 'Output format.').choices(['txt']).default('txt'))
|
|
39
|
+
.addOption(new Option('-f, --format <ext>', 'Output format.').choices(['pdf', 'hocr', 'docx', 'xlsx', 'txt', 'text', 'html']).default('txt'))
|
|
39
40
|
.option('-r, --reflow', 'Reflow text by combining lines into paragraphs.')
|
|
40
41
|
.description('Extract text from PDF file and save in requested format.')
|
|
41
42
|
.action(extractCLI);
|
|
@@ -61,6 +62,13 @@ program
|
|
|
61
62
|
.option('-w, --workers <number>', 'Number of workers to use. Default is up to 8.')
|
|
62
63
|
.action(recognizeCLI);
|
|
63
64
|
|
|
65
|
+
program
|
|
66
|
+
.command('type')
|
|
67
|
+
.argument('<pdf_file>', 'Input PDF file.')
|
|
68
|
+
.argument('[output]', 'Output file path to save text.')
|
|
69
|
+
.description('Detect PDF file type (\'Text native\', \'Image + OCR text\', or \'Image native\').')
|
|
70
|
+
.action(detectPDFTypeCLI);
|
|
71
|
+
|
|
64
72
|
program
|
|
65
73
|
.command('debug')
|
|
66
74
|
.argument('<pdf_file>', 'Input PDF file.')
|
package/js/export/export.js
CHANGED
|
@@ -207,7 +207,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
|
|
|
207
207
|
/**
|
|
208
208
|
* Runs `exportData` and saves the result as a download (browser) or local file (Node.js).
|
|
209
209
|
* @public
|
|
210
|
-
* @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'} format
|
|
210
|
+
* @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'|'html'} format
|
|
211
211
|
* @param {string} fileName
|
|
212
212
|
* @param {number} [minPage=0] - First page to export.
|
|
213
213
|
* @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page.
|
package/js/export/writeHtml.js
CHANGED
|
@@ -5,6 +5,31 @@ import { assignParagraphs } from '../utils/reflowPars.js';
|
|
|
5
5
|
import { pageMetricsArr } from '../containers/dataContainer.js';
|
|
6
6
|
import ocr from '../objects/ocrObjects.js';
|
|
7
7
|
|
|
8
|
+
/**
|
|
9
|
+
* Calculate the font metrics for a given font and font size.
|
|
10
|
+
* This is used to get metrics that match `ctx.measureText`, but without requiring a canvas.
|
|
11
|
+
* @param {FontContainerFont} fontI
|
|
12
|
+
* @param {number} fontSize
|
|
13
|
+
*/
|
|
14
|
+
const calcFontMetrics = (fontI, fontSize) => {
|
|
15
|
+
const os2 = fontI.opentype.tables.os2;
|
|
16
|
+
const unitsPerEm = fontI.opentype.unitsPerEm;
|
|
17
|
+
|
|
18
|
+
// Bit 7: Use_Typo_Metrics (1 = Yes)
|
|
19
|
+
// eslint-disable-next-line no-bitwise
|
|
20
|
+
if (os2.fsSelection >> 7 & 1) {
|
|
21
|
+
return {
|
|
22
|
+
fontBoundingBoxAscent: Math.round(os2.sTypoAscender * (fontSize / unitsPerEm)),
|
|
23
|
+
fontBoundingBoxDescent: Math.round(os2.sTypoDescender * (fontSize / unitsPerEm)),
|
|
24
|
+
};
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
return {
|
|
28
|
+
fontBoundingBoxAscent: Math.round(os2.usWinAscent * (fontSize / unitsPerEm)),
|
|
29
|
+
fontBoundingBoxDescent: Math.round(os2.usWinDescent * (fontSize / unitsPerEm)),
|
|
30
|
+
};
|
|
31
|
+
};
|
|
32
|
+
|
|
8
33
|
/**
|
|
9
34
|
*
|
|
10
35
|
* @param {string} text
|
|
@@ -33,18 +58,11 @@ const makeSmallCapsDivs = (text, fontSizeHTMLSmallCaps) => {
|
|
|
33
58
|
* If omitted, all words are included.
|
|
34
59
|
*/
|
|
35
60
|
export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, removeMargins = false, wordIds = null) {
|
|
36
|
-
if (!(typeof process === 'undefined')) {
|
|
37
|
-
throw new Error('HTML exports are not supported in Node.js');
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
const canvas = new OffscreenCanvas(1, 1);
|
|
41
|
-
const ctx = /** @type {OffscreenCanvasRenderingContext2D} */ (canvas.getContext('2d'));
|
|
42
|
-
|
|
43
61
|
const fontsUsed = new Set();
|
|
44
62
|
|
|
45
63
|
const pad = 5;
|
|
46
64
|
|
|
47
|
-
let bodyStr = '<body
|
|
65
|
+
let bodyStr = '<body>\n';
|
|
48
66
|
|
|
49
67
|
if (maxpage === -1) maxpage = ocrCurrent.length - 1;
|
|
50
68
|
|
|
@@ -71,7 +89,7 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
71
89
|
}
|
|
72
90
|
}
|
|
73
91
|
|
|
74
|
-
bodyStr +=
|
|
92
|
+
bodyStr += ` <div class="scribe-page" id="page${g}" style="position:absolute;top:${top}px;">\n`;
|
|
75
93
|
if (removeMargins) {
|
|
76
94
|
top += Math.min((maxBottom - minTop) + 200, pageMetricsArr[g].dims.height + 10);
|
|
77
95
|
} else {
|
|
@@ -130,9 +148,7 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
130
148
|
|
|
131
149
|
const fontSizeHTML = fontSize * scale;
|
|
132
150
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
const metrics = ctx.measureText(wordStr);
|
|
151
|
+
const metrics = calcFontMetrics(fontI, fontSizeHTML);
|
|
136
152
|
|
|
137
153
|
const fontSizeHTMLSmallCaps = fontSize * scale * fontI.smallCapsMult;
|
|
138
154
|
|
|
@@ -174,29 +190,29 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
174
190
|
// Line height must match the height of the font bounding box for the font metrics to be accurate.
|
|
175
191
|
styleStr += `line-height:${metrics.fontBoundingBoxAscent + metrics.fontBoundingBoxDescent}px;`;
|
|
176
192
|
|
|
177
|
-
bodyStr +=
|
|
193
|
+
bodyStr += ` <span class="scribe-word" id="${wordObj.id}" style="${styleStr}">${innerHTML}</span>`;
|
|
178
194
|
}
|
|
179
195
|
}
|
|
180
196
|
|
|
181
|
-
bodyStr += '</div
|
|
197
|
+
bodyStr += '\n </div>\n';
|
|
182
198
|
|
|
183
199
|
opt.progressHandler({ n: g, type: 'export', info: { } });
|
|
184
200
|
}
|
|
185
201
|
|
|
186
|
-
let styleStr = '<style
|
|
202
|
+
let styleStr = '<style>\n .scribe-word {\n';
|
|
187
203
|
|
|
188
|
-
styleStr += 'position:absolute
|
|
189
|
-
styleStr += `padding-left:${pad}px
|
|
190
|
-
styleStr += `padding-right:${pad}px
|
|
191
|
-
styleStr += 'z-index:1
|
|
192
|
-
styleStr += 'white-space:nowrap
|
|
204
|
+
styleStr += ' position:absolute;\n';
|
|
205
|
+
styleStr += ` padding-left:${pad}px;\n`;
|
|
206
|
+
styleStr += ` padding-right:${pad}px;\n`;
|
|
207
|
+
styleStr += ' z-index:1;\n';
|
|
208
|
+
styleStr += ' white-space:nowrap;\n';
|
|
193
209
|
if (opt.kerning) {
|
|
194
|
-
styleStr += 'font-kerning:normal
|
|
210
|
+
styleStr += ' font-kerning:normal;\n';
|
|
195
211
|
} else {
|
|
196
|
-
styleStr += 'font-kerning:none
|
|
212
|
+
styleStr += ' font-kerning:none;\n';
|
|
197
213
|
}
|
|
198
214
|
|
|
199
|
-
styleStr += '}';
|
|
215
|
+
styleStr += ' }\n';
|
|
200
216
|
|
|
201
217
|
for (const fontI of fontsUsed) {
|
|
202
218
|
const cdnPath = 'https://cdn.jsdelivr.net/npm/scribe.js-ocr@0.7.1/fonts/all/';
|
|
@@ -205,19 +221,19 @@ export function writeHtml(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa
|
|
|
205
221
|
const fontName = `${fontI.family}-${styleTitleCase}.woff`;
|
|
206
222
|
const fontPath = cdnPath + fontName;
|
|
207
223
|
|
|
208
|
-
styleStr +=
|
|
224
|
+
styleStr += ` @font-face {
|
|
209
225
|
font-family: '${fontI.fontFaceName}';
|
|
210
226
|
font-style: ${fontI.fontFaceStyle};
|
|
211
227
|
font-weight: ${fontI.fontFaceWeight};
|
|
212
228
|
src: url('${fontPath}');
|
|
213
|
-
}\n`;
|
|
229
|
+
}\n`;
|
|
214
230
|
}
|
|
215
231
|
|
|
216
|
-
styleStr += '</style
|
|
232
|
+
styleStr += '</style>\n';
|
|
217
233
|
|
|
218
|
-
bodyStr += '</body
|
|
234
|
+
bodyStr += '</body>\n';
|
|
219
235
|
|
|
220
|
-
const htmlStr = `<html
|
|
236
|
+
const htmlStr = `<html>\n<head>\n${styleStr}</head>\n${bodyStr}</html>`;
|
|
221
237
|
|
|
222
238
|
return htmlStr;
|
|
223
239
|
}
|