scribe.js-ocr 0.2.4 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -2
- package/cli/main.js +12 -46
- package/js/clear.js +3 -3
- package/js/containers/app.js +11 -2
- package/js/containers/dataContainer.js +0 -6
- package/js/containers/fontContainer.js +139 -97
- package/js/containers/imageContainer.js +20 -84
- package/js/debug.js +34 -0
- package/js/export/exportPDF.js +52 -57
- package/js/export/exportRenderHOCR.js +5 -5
- package/js/fontContainerMain.js +95 -108
- package/js/fontEval.js +83 -111
- package/js/generalWorkerMain.js +28 -3
- package/js/global.d.ts +3 -0
- package/js/import/convertPageBlocks.js +9 -0
- package/js/import/convertPageShared.js +13 -7
- package/js/import/import.js +15 -13
- package/js/objects/imageObjects.js +97 -0
- package/js/objects/ocrObjects.js +53 -1
- package/js/recognizeConvert.js +8 -4
- package/js/utils/fontUtils.js +5 -5
- package/js/utils/miscUtils.js +7 -2
- package/js/worker/compareOCRModule.js +279 -81
- package/js/worker/generalWorker.js +98 -28
- package/js/worker/renderWordCanvas.js +14 -29
- package/package.json +1 -1
- package/scribe.js +77 -5
package/README.md
CHANGED
|
@@ -17,7 +17,7 @@ Install from `npm` by running the following:
|
|
|
17
17
|
npm i scribe.js-ocr
|
|
18
18
|
```
|
|
19
19
|
|
|
20
|
-
Scribe.js is written in JavaScript using ESM, so can be imported directly from browser or Node.js JavaScript code.
|
|
20
|
+
Scribe.js is written in JavaScript using ESM, so it can be imported directly from browser or Node.js JavaScript code without a build step.
|
|
21
21
|
```js
|
|
22
22
|
// Import statement in browser:
|
|
23
23
|
import scribe from 'node_modules/scribe.js-ocr/scribe.js';
|
|
@@ -31,6 +31,15 @@ scribe.extractText(['https://tesseract.projectnaptha.com/img/eng_bw.png'])
|
|
|
31
31
|
|
|
32
32
|
When using Scribe.js in the browser, all files must be served from the same origin as the file importing Scribe.js. This means that importing Scribe.js from a CDN will not work. There is no UMD version.
|
|
33
33
|
|
|
34
|
+
# Templates
|
|
35
|
+
The following are template repos showing how Scribe.js can be used within various frameworks/build systems.
|
|
36
|
+
|
|
37
|
+
- Browser with ESM (no build): https://github.com/scribeocr/scribe.js-example-esm-browser
|
|
38
|
+
- Browser with Webpack 5: https://github.com/scribeocr/scribe.js-example-webpack5
|
|
39
|
+
- Browser with Vue.js v2: https://github.com/scribeocr/scribe.js-example-vue2
|
|
40
|
+
|
|
41
|
+
Contributions are appreciated. If you are using Scribe.js within a framework not listed above, consider creating a basic example repo and adding it to this list with a PR, especially if non-obvious steps were required.
|
|
42
|
+
|
|
34
43
|
# Scribe.js vs. Tesseract.js
|
|
35
44
|
Considering whether Scribe.js or Tesseract.js is better for your project? Read [this article](./docs/scribe_vs_tesseract.md).
|
|
36
45
|
|
|
@@ -40,6 +49,15 @@ Considering whether Scribe.js or Tesseract.js is better for your project? Read
|
|
|
40
49
|
- [Scribe.js vs. Tesseract.js Comparison](./docs/scribe_vs_tesseract.md)
|
|
41
50
|
- [API](./docs/API.md)
|
|
42
51
|
|
|
52
|
+
## Projects and Examples
|
|
53
|
+
The following are examples and projects built using Scribe.js. Additional examples can be found in the [examples](https://github.com/scribeocr/scribe.js/tree/master/examples) directory.
|
|
54
|
+
|
|
55
|
+
- Projects
|
|
56
|
+
- Scribe OCR: officially supported GUI front-end for Scribe.js
|
|
57
|
+
- Site at [scribeocr.com](https://scribeocr.com/), repo at [github.com/scribeocr/scribeocr](https://github.com/scribeocr/scribeocr)
|
|
58
|
+
|
|
59
|
+
If you have a project or example repo that uses Scribe.js, feel free to add it to this list using a pull request. Examples submitted should be well documented such that new users can run them; projects should be functional and actively maintained.
|
|
60
|
+
|
|
43
61
|
# Contributing
|
|
44
62
|
To work on a local copy, simply clone with `--recurse-submodules` and install. Please run the automated tests before making a PR.
|
|
45
63
|
```sh
|
|
@@ -55,4 +73,4 @@ npm i
|
|
|
55
73
|
|
|
56
74
|
## Run automated tests before making PR
|
|
57
75
|
npm run test
|
|
58
|
-
```
|
|
76
|
+
```
|
package/cli/main.js
CHANGED
|
@@ -7,48 +7,10 @@ import path from 'path';
|
|
|
7
7
|
import { tmpUnique } from '../js/worker/compareOCRModule.js';
|
|
8
8
|
import scribe from '../scribe.js';
|
|
9
9
|
|
|
10
|
-
// When `debugMode` is enabled:
|
|
11
|
-
// (1) Comparison images are saved as .png files.
|
|
12
|
-
// (2) Comparison logs are saved as .txt files.
|
|
13
|
-
// (3) All OCR data is dumped as .hocr files.
|
|
14
10
|
const debugMode = false;
|
|
15
11
|
|
|
16
12
|
scribe.opt.saveDebugImages = debugMode;
|
|
17
13
|
|
|
18
|
-
/** @type {import('canvas').CanvasRenderingContext2D} */
|
|
19
|
-
let ctxDebug;
|
|
20
|
-
if (debugMode) {
|
|
21
|
-
const { createCanvas } = await import('canvas');
|
|
22
|
-
const canvasAlt = createCanvas(200, 200);
|
|
23
|
-
ctxDebug = canvasAlt.getContext('2d');
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
const debugDir = `${__dirname}/../../dev/debug/`;
|
|
27
|
-
|
|
28
|
-
/**
|
|
29
|
-
*
|
|
30
|
-
* @param {import('canvas').CanvasRenderingContext2D} ctx
|
|
31
|
-
* @param {Array<Array<CompDebugNode>>} compDebugArrArr
|
|
32
|
-
* @param {string} filePath
|
|
33
|
-
*/
|
|
34
|
-
async function writeDebugImages(ctx, compDebugArrArr, filePath) {
|
|
35
|
-
await scribe.utils.drawDebugImages({ ctx, compDebugArrArr, context: 'node' });
|
|
36
|
-
const buffer0 = ctx.canvas.toBuffer('image/png');
|
|
37
|
-
fs.writeFileSync(filePath, buffer0);
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
async function dumpDebugImagesAll() {
|
|
41
|
-
if (!scribe.data.debug.debugImg.Combined || scribe.data.debug.debugImg.Combined.length === 0) {
|
|
42
|
-
console.log('No debug images to dump.');
|
|
43
|
-
return;
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
for (let i = 0; i < scribe.data.debug.debugImg.Combined.length; i++) {
|
|
47
|
-
const filePath = `${debugDir}legacy_lstm_comp_${i}.png`;
|
|
48
|
-
await writeDebugImages(ctxDebug, [scribe.data.debug.debugImg.Combined[i]], filePath);
|
|
49
|
-
}
|
|
50
|
-
}
|
|
51
|
-
|
|
52
14
|
/**
|
|
53
15
|
* @param {string} func
|
|
54
16
|
* @param {Object} params
|
|
@@ -75,15 +37,16 @@ async function main(func, params) {
|
|
|
75
37
|
|
|
76
38
|
const output = {};
|
|
77
39
|
|
|
78
|
-
const debugComp = false;
|
|
79
|
-
|
|
80
40
|
const files = [];
|
|
81
41
|
if (params.pdfFile) files.push(params.pdfFile);
|
|
82
42
|
if (params.ocrFile) files.push(params.ocrFile);
|
|
83
43
|
await scribe.importFiles(files);
|
|
84
44
|
|
|
85
45
|
const backgroundArg = params.pdfFile;
|
|
86
|
-
const
|
|
46
|
+
const backgroundStem = backgroundArg ? path.basename(backgroundArg).replace(/\.\w{1,5}$/i, '') : undefined;
|
|
47
|
+
const ocrStem = params.ocrFile ? path.basename(params.ocrFile).replace(/\.\w{1,5}$/i, '') : undefined;
|
|
48
|
+
const outputStem = backgroundStem || ocrStem || 'output';
|
|
49
|
+
|
|
87
50
|
const outputDir = params.outputDir || '.';
|
|
88
51
|
|
|
89
52
|
if (outputDir) fs.mkdirSync(outputDir, { recursive: true });
|
|
@@ -126,12 +89,15 @@ async function main(func, params) {
|
|
|
126
89
|
await scribe.download('pdf', outputPath);
|
|
127
90
|
}
|
|
128
91
|
|
|
129
|
-
if (
|
|
130
|
-
const
|
|
131
|
-
|
|
132
|
-
|
|
92
|
+
if (debugMode) {
|
|
93
|
+
const debugDir = `${outputDir}/${outputStem}_debug`;
|
|
94
|
+
fs.mkdirSync(debugDir, { recursive: true });
|
|
95
|
+
const outputPathCsv = `${debugDir}/_debug.csv`;
|
|
96
|
+
scribe.utils.writeDebugCsv(scribe.data.ocr.active, outputPathCsv);
|
|
133
97
|
|
|
134
|
-
|
|
98
|
+
scribe.utils.dumpDebugImages(debugDir);
|
|
99
|
+
scribe.utils.dumpHOCR(debugDir);
|
|
100
|
+
}
|
|
135
101
|
|
|
136
102
|
// Delete temp directory with fonts
|
|
137
103
|
await tmpUnique.delete();
|
package/js/clear.js
CHANGED
|
@@ -8,12 +8,12 @@ import {
|
|
|
8
8
|
ocrAllRaw,
|
|
9
9
|
pageMetricsArr,
|
|
10
10
|
} from './containers/dataContainer.js';
|
|
11
|
-
import {
|
|
11
|
+
import { FontCont } from './containers/fontContainer.js';
|
|
12
12
|
import { ImageCache } from './containers/imageContainer.js';
|
|
13
13
|
import { replaceObjectProperties } from './utils/miscUtils.js';
|
|
14
14
|
|
|
15
15
|
export function clearData() {
|
|
16
|
-
inputData.
|
|
16
|
+
inputData.clear();
|
|
17
17
|
replaceObjectProperties(ocrAll, { active: [] });
|
|
18
18
|
replaceObjectProperties(ocrAllRaw, { active: [] });
|
|
19
19
|
layoutRegions.pages.length = 0;
|
|
@@ -23,5 +23,5 @@ export function clearData() {
|
|
|
23
23
|
ImageCache.clear();
|
|
24
24
|
// Clear optimized font data and reset fontAll to raw data.
|
|
25
25
|
replaceObjectProperties(fontMetricsObj);
|
|
26
|
-
|
|
26
|
+
FontCont.clear();
|
|
27
27
|
}
|
package/js/containers/app.js
CHANGED
|
@@ -5,8 +5,6 @@ export class opt {
|
|
|
5
5
|
|
|
6
6
|
static extractText = false;
|
|
7
7
|
|
|
8
|
-
static enableOpt = false;
|
|
9
|
-
|
|
10
8
|
static enableUpscale = false;
|
|
11
9
|
|
|
12
10
|
static ignorePunct = false;
|
|
@@ -82,4 +80,15 @@ export class inputData {
|
|
|
82
80
|
static defaultDownloadFileName = '';
|
|
83
81
|
|
|
84
82
|
static pageCount = 0;
|
|
83
|
+
|
|
84
|
+
static clear = () => {
|
|
85
|
+
inputData.xmlMode.length = 0;
|
|
86
|
+
inputData.pdfMode = false;
|
|
87
|
+
inputData.imageMode = false;
|
|
88
|
+
inputData.resumeMode = false;
|
|
89
|
+
inputData.evalMode = false;
|
|
90
|
+
inputData.inputFileNames = [];
|
|
91
|
+
inputData.defaultDownloadFileName = '';
|
|
92
|
+
inputData.pageCount = 0;
|
|
93
|
+
};
|
|
85
94
|
}
|
|
@@ -79,12 +79,6 @@ export const pageMetricsArr = [];
|
|
|
79
79
|
export class DebugData {
|
|
80
80
|
/** @type {{[key: string]: Array<Array<CompDebugBrowser|CompDebugNode>> | undefined}} */
|
|
81
81
|
static debugImg = {};
|
|
82
|
-
|
|
83
|
-
/** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
|
|
84
|
-
static evalRaw;
|
|
85
|
-
|
|
86
|
-
/** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
|
|
87
|
-
static evalOpt;
|
|
88
82
|
}
|
|
89
83
|
|
|
90
84
|
/** @type {Array<Awaited<ReturnType<typeof import('../../scrollview-web/scrollview/ScrollView.js').ScrollView.prototype.getAll>>>} */
|
|
@@ -221,117 +221,159 @@ export async function loadFontsFromSource(srcObj, opt = false) {
|
|
|
221
221
|
// FontCont must contain no font data when initialized, and no data should be defined in this file.
|
|
222
222
|
// This is because this file is run both from the main thread and workers, and fonts are defined different ways in each.
|
|
223
223
|
// In the main thread, "raw" fonts are loaded from fetch requests, however in workers they are loaded from the main thread.
|
|
224
|
-
class FontCont {
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
* @returns {FontContainer}
|
|
255
|
-
*/
|
|
256
|
-
this.getContainer = (container) => {
|
|
257
|
-
const fontRes = this[container];
|
|
258
|
-
if (!fontRes) throw new Error(`${container} font container does not exist.`);
|
|
259
|
-
return fontRes;
|
|
260
|
-
};
|
|
224
|
+
export class FontCont {
|
|
225
|
+
/** @type {?FontContainer} */
|
|
226
|
+
static raw = null;
|
|
227
|
+
|
|
228
|
+
/** @type {?FontContainer} */
|
|
229
|
+
static opt = null;
|
|
230
|
+
|
|
231
|
+
/** @type {?FontContainer} */
|
|
232
|
+
static export = null;
|
|
233
|
+
|
|
234
|
+
static supp = {
|
|
235
|
+
/** @type {?FontContainerFont} */
|
|
236
|
+
chi_sim: null,
|
|
237
|
+
};
|
|
238
|
+
|
|
239
|
+
/** Optimized fonts will be used when believed to improve quality. */
|
|
240
|
+
static enableOpt = false;
|
|
241
|
+
|
|
242
|
+
/** Optimized fonts will always be used when they exist, even if believed to reduce quality. */
|
|
243
|
+
static forceOpt = false;
|
|
244
|
+
|
|
245
|
+
/** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
|
|
246
|
+
static rawMetrics = null;
|
|
247
|
+
|
|
248
|
+
/** @type {?Awaited<ReturnType<import('../fontEval.js').evaluateFonts>>} */
|
|
249
|
+
static optMetrics = null;
|
|
250
|
+
|
|
251
|
+
static defaultFontName = 'SerifDefault';
|
|
252
|
+
|
|
253
|
+
static serifDefaultName = 'NimbusRomNo9L';
|
|
261
254
|
|
|
262
|
-
|
|
255
|
+
static sansDefaultName = 'NimbusSans';
|
|
256
|
+
|
|
257
|
+
/** @type {?('latin'|'all')} */
|
|
258
|
+
static glyphSet = null;
|
|
259
|
+
|
|
260
|
+
/**
|
|
261
|
+
* Decide whether to use the optimized version of a font family.
|
|
262
|
+
* Note that even when this function returns `true`, an optimized version may not exist for every style of the family.
|
|
263
|
+
* @param {string} family - Font family name.
|
|
264
|
+
*/
|
|
265
|
+
static useOptFamily = (family) => {
|
|
266
|
+
const raw = FontCont.raw?.[family]?.normal;
|
|
267
|
+
if (!raw) return false;
|
|
268
|
+
const opt = FontCont.opt?.[family]?.normal;
|
|
269
|
+
if (opt && FontCont.forceOpt) {
|
|
270
|
+
return true;
|
|
271
|
+
// If optimized fonts are enabled (but not forced), the optimized version of a font will be used if:
|
|
272
|
+
// (1) The optimized version exists
|
|
273
|
+
// (2) The optimized version has a better metric (so quality should improve).
|
|
274
|
+
// (3) The optimized version of the default sans/serif font also has a better metric.
|
|
275
|
+
// This last condition avoids font optimization being enabled in the UI when it only improves an unused font.
|
|
276
|
+
} if (opt && FontCont.enableOpt) {
|
|
277
|
+
const defaultFamily = raw.type === 'serif' ? FontCont.serifDefaultName : FontCont.sansDefaultName;
|
|
278
|
+
|
|
279
|
+
const rawMetricDefault = FontCont.rawMetrics?.[defaultFamily];
|
|
280
|
+
const optMetricDefault = FontCont.optMetrics?.[defaultFamily];
|
|
281
|
+
|
|
282
|
+
const rawMetric = FontCont.rawMetrics?.[family];
|
|
283
|
+
const optMetric = FontCont.optMetrics?.[family];
|
|
284
|
+
if (rawMetric && optMetric && optMetric < rawMetric && optMetricDefault < rawMetricDefault) {
|
|
285
|
+
return true;
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
return false;
|
|
289
|
+
};
|
|
290
|
+
|
|
291
|
+
/**
|
|
263
292
|
* Gets a font object. Unlike accessing the font containers directly,
|
|
264
293
|
* this method allows for special values 'Default', 'SansDefault', and 'SerifDefault' to be used.
|
|
265
294
|
*
|
|
266
295
|
* @param {('Default'|'SansDefault'|'SerifDefault'|string)} family - Font family name.
|
|
267
296
|
* @param {('normal'|'italic'|'bold'|string)} [style='normal']
|
|
268
297
|
* @param {string} [lang='eng']
|
|
269
|
-
* @param {('raw'|'opt'|'active'|'optInitial')} [container='active']
|
|
270
298
|
* @returns {FontContainerFont}
|
|
271
299
|
*/
|
|
272
|
-
|
|
273
|
-
|
|
300
|
+
static getFont = (family, style = 'normal', lang = 'eng') => {
|
|
301
|
+
if (lang === 'chi_sim') {
|
|
302
|
+
if (!FontCont.supp.chi_sim) throw new Error('chi_sim font does not exist.');
|
|
303
|
+
return FontCont.supp.chi_sim;
|
|
304
|
+
}
|
|
274
305
|
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
306
|
+
if (!FontCont.raw) throw new Error('Raw fonts not yet initialized.');
|
|
307
|
+
|
|
308
|
+
// Option 1: If we have access to the font, use it.
|
|
309
|
+
// Option 2: If we do not have access to the font, but it closely resembles a built-in font, use the built-in font.
|
|
310
|
+
if (!FontCont.raw?.[family]?.[style]) {
|
|
311
|
+
if (/Times/i.test(family)) {
|
|
312
|
+
family = 'NimbusRomNo9L';
|
|
313
|
+
} else if (/Helvetica/i.test(family)) {
|
|
314
|
+
family = 'NimbusSans';
|
|
315
|
+
} else if (/Arial/i.test(family)) {
|
|
316
|
+
family = 'NimbusSans';
|
|
317
|
+
} else if (/Century/i.test(family)) {
|
|
318
|
+
family = 'Century';
|
|
319
|
+
} else if (/Palatino/i.test(family)) {
|
|
320
|
+
family = 'Palatino';
|
|
321
|
+
} else if (/Garamond/i.test(family)) {
|
|
322
|
+
family = 'Garamond';
|
|
323
|
+
} else if (/Carlito/i.test(family)) {
|
|
324
|
+
family = 'Carlito';
|
|
325
|
+
} else if (/Calibri/i.test(family)) {
|
|
326
|
+
family = 'Carlito';
|
|
278
327
|
}
|
|
328
|
+
}
|
|
279
329
|
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
family = 'NimbusRomNo9L';
|
|
285
|
-
} else if (/Helvetica/i.test(family)) {
|
|
286
|
-
family = 'NimbusSans';
|
|
287
|
-
} else if (/Arial/i.test(family)) {
|
|
288
|
-
family = 'NimbusSans';
|
|
289
|
-
} else if (/Century/i.test(family)) {
|
|
290
|
-
family = 'Century';
|
|
291
|
-
} else if (/Palatino/i.test(family)) {
|
|
292
|
-
family = 'Palatino';
|
|
293
|
-
} else if (/Garamond/i.test(family)) {
|
|
294
|
-
family = 'Garamond';
|
|
295
|
-
} else if (/Carlito/i.test(family)) {
|
|
296
|
-
family = 'Carlito';
|
|
297
|
-
} else if (/Calibri/i.test(family)) {
|
|
298
|
-
family = 'Carlito';
|
|
299
|
-
}
|
|
300
|
-
}
|
|
330
|
+
// Option 3: If the font still is not identified, use the default sans/serif font.
|
|
331
|
+
if (!FontCont.raw?.[family]?.[style]) {
|
|
332
|
+
family = determineSansSerif(family);
|
|
333
|
+
}
|
|
301
334
|
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
family = determineSansSerif(family);
|
|
305
|
-
}
|
|
335
|
+
// This needs to come first as `defaultFontName` maps to either 'SerifDefault' or 'SansDefault'.
|
|
336
|
+
if (family === 'Default') family = FontCont.defaultFontName;
|
|
306
337
|
|
|
307
|
-
|
|
308
|
-
|
|
338
|
+
if (family === 'SerifDefault') family = FontCont.serifDefaultName;
|
|
339
|
+
if (family === 'SansDefault') family = FontCont.sansDefaultName;
|
|
309
340
|
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
if (!fontRes) throw new Error(`Font container does not contain ${family} (${style}).`);
|
|
314
|
-
return fontRes;
|
|
315
|
-
};
|
|
341
|
+
/** @type {FontContainerFont} */
|
|
342
|
+
let fontRes = FontCont.raw?.[family]?.[style];
|
|
343
|
+
if (!fontRes) throw new Error(`Font container does not contain ${family} (${style}).`);
|
|
316
344
|
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
* @param {('raw'|'opt'|'active'|'optInitial')} [container='active']
|
|
321
|
-
*/
|
|
322
|
-
this.getWordFont = (word, container = 'active') => {
|
|
323
|
-
const wordFontFamily = word.font || fontAll.defaultFontName;
|
|
324
|
-
return this.getFont(wordFontFamily, word.style, word.lang, container);
|
|
325
|
-
};
|
|
326
|
-
|
|
327
|
-
this.clear = () => {
|
|
328
|
-
this.active = this.raw;
|
|
329
|
-
this.optInitial = null;
|
|
330
|
-
this.opt = null;
|
|
331
|
-
this.loadedBuiltInRawWorker = false;
|
|
332
|
-
this.loadedBuiltInOptWorker = false;
|
|
333
|
-
};
|
|
334
|
-
}
|
|
335
|
-
}
|
|
345
|
+
const opt = FontCont.opt?.[family]?.[style];
|
|
346
|
+
const useOpt = FontCont.useOptFamily(family);
|
|
347
|
+
if (opt && useOpt) fontRes = opt;
|
|
336
348
|
|
|
337
|
-
|
|
349
|
+
return fontRes;
|
|
350
|
+
};
|
|
351
|
+
|
|
352
|
+
/**
|
|
353
|
+
*
|
|
354
|
+
* @param {OcrWord} word
|
|
355
|
+
*/
|
|
356
|
+
static getWordFont = (word) => {
|
|
357
|
+
const wordFontFamily = word.font || FontCont.defaultFontName;
|
|
358
|
+
return FontCont.getFont(wordFontFamily, word.style, word.lang);
|
|
359
|
+
};
|
|
360
|
+
|
|
361
|
+
/**
|
|
362
|
+
* Reset font container to original state but do not unload default resources.
|
|
363
|
+
*/
|
|
364
|
+
static clear = () => {
|
|
365
|
+
FontCont.opt = null;
|
|
366
|
+
FontCont.rawMetrics = null;
|
|
367
|
+
FontCont.optMetrics = null;
|
|
368
|
+
|
|
369
|
+
FontCont.defaultFontName = 'SerifDefault';
|
|
370
|
+
FontCont.serifDefaultName = 'NimbusRomNo9L';
|
|
371
|
+
FontCont.sansDefaultName = 'NimbusSans';
|
|
372
|
+
};
|
|
373
|
+
|
|
374
|
+
static terminate = () => {
|
|
375
|
+
FontCont.clear();
|
|
376
|
+
FontCont.raw = null;
|
|
377
|
+
FontCont.glyphSet = null;
|
|
378
|
+
};
|
|
379
|
+
}
|
|
@@ -4,86 +4,21 @@ import {
|
|
|
4
4
|
|
|
5
5
|
import { initMuPDFWorker } from '../../mupdf/mupdf-async.js';
|
|
6
6
|
|
|
7
|
-
import { getImageBitmap
|
|
7
|
+
import { getImageBitmap } from '../utils/imageUtils.js';
|
|
8
8
|
|
|
9
9
|
import { setUploadFontsWorker } from '../fontContainerMain.js';
|
|
10
10
|
import { pageMetricsArr } from './dataContainer.js';
|
|
11
11
|
import {
|
|
12
|
+
FontCont,
|
|
12
13
|
FontContainerFont,
|
|
13
|
-
fontAll,
|
|
14
14
|
loadOpentype,
|
|
15
15
|
} from './fontContainer.js';
|
|
16
16
|
|
|
17
17
|
import { gs } from '../generalWorkerMain.js';
|
|
18
|
+
import { imageUtils } from '../objects/imageObjects.js';
|
|
18
19
|
import { determineSansSerif, range } from '../utils/miscUtils.js';
|
|
19
20
|
import { opt } from './app.js';
|
|
20
21
|
|
|
21
|
-
/**
|
|
22
|
-
*
|
|
23
|
-
* @param {ImageWrapper} img
|
|
24
|
-
* @returns
|
|
25
|
-
*/
|
|
26
|
-
const getDims = async (img) => {
|
|
27
|
-
if (!img._dims) {
|
|
28
|
-
if (img.format === 'jpeg') {
|
|
29
|
-
img._dims = getJpegDimensions(img.src);
|
|
30
|
-
} else {
|
|
31
|
-
img._dims = getPngDimensions(img.src);
|
|
32
|
-
}
|
|
33
|
-
}
|
|
34
|
-
return img._dims;
|
|
35
|
-
};
|
|
36
|
-
|
|
37
|
-
/**
|
|
38
|
-
* Checks whether existing transformations need to be undone by re-rendering raw image.
|
|
39
|
-
* When an existing image has an unwanted tranformation, it is re-rendered from the original source,
|
|
40
|
-
* rather than attempting to unrotate/downscale/etc. the transformed image.
|
|
41
|
-
*
|
|
42
|
-
* @param {(ImageWrapper|ImageProperties)} img
|
|
43
|
-
* @param {?ImagePropertiesRequest|ImageWrapper} [props]
|
|
44
|
-
* @returns
|
|
45
|
-
*/
|
|
46
|
-
const requiresUndo = (img, props) => {
|
|
47
|
-
if (!props) return false;
|
|
48
|
-
if (img.rotated && props.rotated === false) return true;
|
|
49
|
-
if (img.upscaled && props.upscaled === false) return true;
|
|
50
|
-
// This condition should only apply to PDFs.
|
|
51
|
-
if (img.colorMode === 'color' && props.colorMode === 'gray' || img.colorMode === 'gray' && props.colorMode === 'color') return true;
|
|
52
|
-
return false;
|
|
53
|
-
};
|
|
54
|
-
|
|
55
|
-
/**
|
|
56
|
-
* Whether the image properties are compatible with the requested properties.
|
|
57
|
-
* @param {ImageWrapper|ImageProperties} img
|
|
58
|
-
* @param {?ImagePropertiesRequest|ImageWrapper} [props]
|
|
59
|
-
*/
|
|
60
|
-
const compatible = (img, props) => {
|
|
61
|
-
if (!props) return true;
|
|
62
|
-
if (props.rotated === false && img.rotated === true) {
|
|
63
|
-
// Requests to unrotate an image are always respected, even if the angle is very close to 0.
|
|
64
|
-
// This is because the intent may be to restore the raw user-uploaded image for an export, which should always be possible.
|
|
65
|
-
return false;
|
|
66
|
-
} if (props.rotated === true && img.rotated === false) {
|
|
67
|
-
// An unrotated image is considered compatible with a rotated request if the angle is very close to 0.
|
|
68
|
-
if (Math.abs(pageMetricsArr[img.n].angle || 0) > 0.05) {
|
|
69
|
-
return false;
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
if (props.upscaled === true && img.upscaled === false || props.upscaled === false && img.upscaled === true) return false;
|
|
74
|
-
|
|
75
|
-
// The value 'native' is used for images uploaded from the user, and is essentially a default value.
|
|
76
|
-
// These cannot be considered incompatible with any color mode as the color of user-uploaded images is never edited (binarization aside).
|
|
77
|
-
if (props.colorMode && props.colorMode !== img.colorMode && img.colorMode !== 'native' && img.colorMode !== 'native') return false;
|
|
78
|
-
return true;
|
|
79
|
-
};
|
|
80
|
-
|
|
81
|
-
export const imageUtils = {
|
|
82
|
-
getDims,
|
|
83
|
-
requiresUndo,
|
|
84
|
-
compatible,
|
|
85
|
-
};
|
|
86
|
-
|
|
87
22
|
let skipTextMode = false;
|
|
88
23
|
|
|
89
24
|
export class MuPDFScheduler {
|
|
@@ -306,7 +241,7 @@ export class ImageCache {
|
|
|
306
241
|
/**
|
|
307
242
|
* @param {ImageWrapper} inputImage
|
|
308
243
|
* @param {number} n - Page number
|
|
309
|
-
* @param {
|
|
244
|
+
* @param {ImagePropertiesRequest} [props] - Image properties needed.
|
|
310
245
|
* Image properties should only be defined if needed, as they can require the image to be re-rendered.
|
|
311
246
|
* @param {boolean} [saveNativeImage=true] - Whether the native image should be saved.
|
|
312
247
|
*/
|
|
@@ -350,20 +285,22 @@ export class ImageCache {
|
|
|
350
285
|
|
|
351
286
|
/**
|
|
352
287
|
* @param {number} n - Page number
|
|
353
|
-
* @param {
|
|
288
|
+
* @param {ImagePropertiesRequest} [props] - Image properties needed.
|
|
354
289
|
* Image properties should only be defined if needed, as they can require the image to be re-rendered.
|
|
355
290
|
* @param {boolean} [nativeOnly=true]
|
|
356
291
|
*/
|
|
357
292
|
static getImages = (n, props, nativeOnly = true) => {
|
|
358
|
-
const
|
|
359
|
-
|
|
293
|
+
const significantRotation = Math.abs(pageMetricsArr[n].angle || 0) > 0.05;
|
|
294
|
+
|
|
295
|
+
const newNative = !ImageCache.native[n] || !imageUtils.compatible(ImageCache.nativeProps[n], props, significantRotation);
|
|
296
|
+
const newBinary = !nativeOnly && (!ImageCache.binary[n] || !imageUtils.compatible(ImageCache.binaryProps[n], props, significantRotation));
|
|
360
297
|
|
|
361
298
|
if (newNative || newBinary) {
|
|
362
299
|
const renderRaw = !ImageCache.native[n] || imageUtils.requiresUndo(ImageCache.nativeProps[n], props);
|
|
363
300
|
const propsRaw = {
|
|
364
301
|
colorMode: opt.colorMode, rotated: false, upscaled: false, n,
|
|
365
302
|
};
|
|
366
|
-
const renderTransform = newBinary || !imageUtils.compatible(propsRaw, props);
|
|
303
|
+
const renderTransform = newBinary || !imageUtils.compatible(propsRaw, props, significantRotation);
|
|
367
304
|
|
|
368
305
|
const propsNew = renderRaw ? propsRaw : JSON.parse(JSON.stringify(ImageCache.nativeProps[n]));
|
|
369
306
|
propsNew.colorMode = props?.colorMode || propsNew.colorMode;
|
|
@@ -387,7 +324,6 @@ export class ImageCache {
|
|
|
387
324
|
if (renderTransform) {
|
|
388
325
|
return ImageCache.transformImage(img1, n, props, true);
|
|
389
326
|
}
|
|
390
|
-
console.assert(nativeOnly, 'Binary should not be null when binary is needed');
|
|
391
327
|
return { native: img1, binary: null };
|
|
392
328
|
})();
|
|
393
329
|
|
|
@@ -400,20 +336,20 @@ export class ImageCache {
|
|
|
400
336
|
|
|
401
337
|
/**
|
|
402
338
|
* @param {number} n
|
|
403
|
-
* @param {
|
|
339
|
+
* @param {ImagePropertiesRequest} [props]
|
|
404
340
|
*/
|
|
405
341
|
static getNative = async (n, props) => ImageCache.getImages(n, props, true).native;
|
|
406
342
|
|
|
407
343
|
/**
|
|
408
344
|
* @param {number} n
|
|
409
|
-
* @param {
|
|
345
|
+
* @param {ImagePropertiesRequest} [props]
|
|
410
346
|
*/
|
|
411
347
|
static getBinary = async (n, props) => ImageCache.getImages(n, props, false).binary;
|
|
412
348
|
|
|
413
349
|
/**
|
|
414
350
|
*
|
|
415
351
|
* @param {number} n - Page number
|
|
416
|
-
* @param {
|
|
352
|
+
* @param {ImagePropertiesRequest} [props] - Image properties needed.
|
|
417
353
|
* Image properties should only be defined if needed, as they can require the image to be re-rendered.
|
|
418
354
|
*/
|
|
419
355
|
static getNativeBitmap = async (n, props) => {
|
|
@@ -431,7 +367,7 @@ export class ImageCache {
|
|
|
431
367
|
/**
|
|
432
368
|
*
|
|
433
369
|
* @param {number} n - Page number
|
|
434
|
-
* @param {
|
|
370
|
+
* @param {ImagePropertiesRequest} [props] - Image properties needed.
|
|
435
371
|
* Image properties should only be defined if needed, as they can require the image to be re-rendered.
|
|
436
372
|
*/
|
|
437
373
|
static getBinaryBitmap = async (n, props) => {
|
|
@@ -454,9 +390,9 @@ export class ImageCache {
|
|
|
454
390
|
* @param {number} min - Min page to render.
|
|
455
391
|
* @param {number} max - Max page to render.
|
|
456
392
|
* @param {boolean} binary - Whether to render binary images.
|
|
457
|
-
* @param {
|
|
393
|
+
* @param {ImagePropertiesRequest} [props]
|
|
458
394
|
*/
|
|
459
|
-
static preRenderRange = async (min, max, binary, props
|
|
395
|
+
static preRenderRange = async (min, max, binary, props) => {
|
|
460
396
|
const pagesArr = range(min, max);
|
|
461
397
|
if (binary) {
|
|
462
398
|
await Promise.all(pagesArr.map((n) => ImageCache.getBinary(n, props).then(() => {
|
|
@@ -624,12 +560,12 @@ export class ImageCache {
|
|
|
624
560
|
// mupdf replaces spaces with underscores in font names.
|
|
625
561
|
const fontName = fontFamilyEmbedded.replace(/[^+]+\+/g, '').replace(/\s/g, '_');
|
|
626
562
|
|
|
627
|
-
if (!
|
|
628
|
-
|
|
563
|
+
if (!FontCont.raw[fontName]) {
|
|
564
|
+
FontCont.raw[fontName] = {};
|
|
629
565
|
}
|
|
630
566
|
|
|
631
|
-
if (!
|
|
632
|
-
|
|
567
|
+
if (!FontCont.raw[fontName][fontStyle]) {
|
|
568
|
+
FontCont.raw[fontName][fontStyle] = new FontContainerFont(fontName, fontStyle, src, false, fontObj);
|
|
633
569
|
}
|
|
634
570
|
}
|
|
635
571
|
|