@luii/node-tesseract-ocr 2.0.13 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -3
- package/README.md +547 -153
- package/binding-options.js +4 -0
- package/dist/cjs/index.cjs +144 -18
- package/dist/cjs/index.d.ts +6 -859
- package/dist/cjs/types.d.ts +1272 -0
- package/dist/cjs/types.js +17 -0
- package/dist/cjs/utils.d.ts +1 -0
- package/dist/cjs/utils.js +38 -0
- package/dist/esm/index.d.ts +6 -859
- package/dist/esm/index.mjs +129 -14
- package/dist/esm/types.d.ts +1272 -0
- package/dist/esm/types.js +16 -0
- package/dist/esm/utils.d.ts +1 -0
- package/dist/esm/utils.js +25 -0
- package/package.json +15 -10
- package/prebuilds/node-tesseract-ocr-darwin-arm64/node-napi-v10.node +0 -0
- package/prebuilds/node-tesseract-ocr-linux-x64/node-napi-v10.node +0 -0
- package/src/commands.hpp +656 -88
- package/src/tesseract_wrapper.cpp +643 -187
- package/src/tesseract_wrapper.hpp +27 -4
- package/src/worker_thread.cpp +146 -2
- package/src/worker_thread.hpp +4 -1
package/dist/cjs/index.cjs
CHANGED
|
@@ -23,8 +23,20 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
|
|
|
23
23
|
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
24
24
|
});
|
|
25
25
|
};
|
|
26
|
+
/**
 * CommonJS/ES-module interop helper.
 *
 * Hands back `mod` untouched when it is flagged with `__esModule`; otherwise
 * wraps it so the value is reachable through a `default` property. An already
 * installed `this.__importDefault` takes precedence over this fallback.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    // Flagged modules already expose their own `default`, so pass them through.
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
|
26
29
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
27
30
|
exports.NativeTesseract = exports.Tesseract = exports.LogLevels = exports.PageSegmentationModes = exports.OcrEngineModes = exports.Language = void 0;
|
|
31
|
+
const node_fs_1 = require("node:fs");
|
|
32
|
+
const promises_1 = require("node:fs/promises");
|
|
33
|
+
const node_os_1 = __importDefault(require("node:os"));
|
|
34
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
35
|
+
const node_stream_1 = require("node:stream");
|
|
36
|
+
const promises_2 = require("node:stream/promises");
|
|
37
|
+
const node_zlib_1 = require("node:zlib");
|
|
38
|
+
const proper_lockfile_1 = require("proper-lockfile");
|
|
39
|
+
const utils_1 = require("./utils");
|
|
28
40
|
/**
|
|
29
41
|
* All available languages for tesseract
|
|
30
42
|
* @readonly
|
|
@@ -239,14 +251,15 @@ exports.LogLevels = {
|
|
|
239
251
|
FATAL: "50000",
|
|
240
252
|
OFF: "2147483647",
|
|
241
253
|
};
|
|
242
|
-
const
|
|
243
|
-
const
|
|
244
|
-
const
|
|
245
|
-
const
|
|
246
|
-
const
|
|
254
|
+
/**
 * Builds the jsDelivr base URL (with trailing slash) of the `4.0.0_best_int`
 * data directory inside the `@tesseract.js-data/<lang>` package.
 */
const TESSDATA4_BEST = (lang) => {
    const packageUrl = `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}`;
    return `${packageUrl}/4.0.0_best_int/`;
};
|
|
255
|
+
/**
 * Builds the jsDelivr base URL (with trailing slash) of the `4.0.0` data
 * directory inside the `@tesseract.js-data/<lang>` package.
 */
const TESSDATA4 = (lang) => {
    const packageUrl = `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}`;
    return `${packageUrl}/4.0.0/`;
};
|
|
256
|
+
// Fallback traineddata cache location: <home>/.cache/node-tesseract-ocr/tessdata.
const DEFAULT_CACHE_DIR = node_path_1.default.join(...[
    node_os_1.default.homedir(),
    ".cache",
    "node-tesseract-ocr",
    "tessdata",
]);
|
|
257
|
+
// Two directory levels above this compiled file; used as the package root
// when the binding options file is found there.
const rootFromSource = node_path_1.default.resolve(__dirname, "../../");
const bindingOptionsFromSource = node_path_1.default.resolve(rootFromSource, "binding-options.js");
// Probe the filesystem exactly once. Both values below must come from the same
// answer; two separate existsSync() calls could disagree if the file appeared
// or vanished in between, pairing one root with the other root's options file.
const hasBindingOptionsAtSourceRoot = (0, node_fs_1.existsSync)(bindingOptionsFromSource);
// Fall back to the current working directory when the options file is not
// located relative to this file.
const bindingOptionsPath = hasBindingOptionsAtSourceRoot
    ? bindingOptionsFromSource
    : node_path_1.default.resolve(process.cwd(), "binding-options.js");
const prebuildRoot = hasBindingOptionsAtSourceRoot
    ? rootFromSource
    : process.cwd();
// Load the native addon through pkg-prebuilds and pick its `Tesseract` export.
const { Tesseract: NativeTesseract } = require("pkg-prebuilds")(prebuildRoot, require(bindingOptionsPath));
|
|
@@ -254,24 +267,137 @@ exports.NativeTesseract = NativeTesseract;
|
|
|
254
267
|
/**
 * JavaScript-side wrapper around the native `NativeTesseract` binding.
 *
 * On top of the native class it adds:
 *  - a `document` property grouping the bound page-processing methods, and
 *  - an `init()` that can make sure the required `.traineddata` files exist
 *    (copying them from a cache directory or downloading them) before the
 *    native `init` runs.
 */
class Tesseract extends NativeTesseract {
    constructor() {
        super();
        // The methods are bound so they keep working when called through
        // `instance.document.*` or after being destructured.
        Object.defineProperty(this, "document", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: {
                begin: this.beginProcessPages.bind(this),
                addPage: this.addProcessPage.bind(this),
                finish: this.finishProcessPages.bind(this),
                abort: this.abortProcessPages.bind(this),
                status: this.getProcessPagesStatus.bind(this),
            }
        });
    }
    /**
     * Fills in option defaults, optionally ensures traineddata for every
     * requested language plus OSD, then delegates to the native `init`.
     *
     * Defaults applied when a field is `null`/`undefined`:
     *  - `langs`: `[]`
     *  - `ensureTraineddata`: `true`
     *  - `cachePath`: `DEFAULT_CACHE_DIR`
     *  - `dataPath`: `process.env.TESSDATA_PREFIX`, else `cachePath`
     *
     * The caller's `options` object is never modified; defaults are applied to
     * a shallow copy, and that copy is what the native `init` receives.
     *
     * @param {object} [options] Initialisation options (see defaults above).
     * @returns {Promise<*>} Whatever the native `init` resolves to.
     * @throws Propagates any error from `ensureTrainingData` or the native `init`.
     */
    init() {
        const _super = Object.create(null, {
            init: { get: () => super.init }
        });
        return __awaiter(this, arguments, void 0, function* (options = {}) {
            // Shallow copy: a caller reusing one options object across calls
            // must not get our resolved defaults baked into it.
            const settings = Object.assign({}, options);
            if (settings.langs == null) {
                settings.langs = [];
            }
            if (settings.ensureTraineddata == null) {
                settings.ensureTraineddata = true;
            }
            if (settings.cachePath == null) {
                settings.cachePath = DEFAULT_CACHE_DIR;
            }
            if (settings.dataPath == null) {
                const envDataPath = process.env.TESSDATA_PREFIX;
                settings.dataPath = envDataPath != null ? envDataPath : settings.cachePath;
            }
            if (settings.progressCallback == null) {
                settings.progressCallback = undefined;
            }
            const cachePath = node_path_1.default.resolve(settings.cachePath);
            const dataPath = node_path_1.default.resolve(settings.dataPath);
            if (settings.ensureTraineddata) {
                // OSD data is always ensured in addition to the requested
                // languages; the Set keeps a language from being handled twice
                // when the caller lists it more than once (or lists OSD).
                const requested = new Set([...settings.langs, exports.Language.osd]);
                for (const lang of requested) {
                    if (!lang) {
                        continue;
                    }
                    const downloadBaseUrl = settings.oem === exports.OcrEngineModes.OEM_LSTM_ONLY
                        ? TESSDATA4_BEST(lang)
                        : TESSDATA4(lang);
                    yield this.ensureTrainingData({ lang, dataPath, cachePath, downloadBaseUrl }, settings.progressCallback);
                }
            }
            return _super.init.call(this, settings);
        });
    }
    /**
     * Makes sure `<dataPath>/<lang>.traineddata` exists and passes
     * `isValidTraineddata`, creating it if necessary.
     *
     * Resolution order (the same before and after taking the lock):
     *  1. the destination file is already valid -> nothing to do;
     *  2. a valid copy exists in `cachePath` -> copy it over;
     *  3. otherwise download `<lang>.traineddata.gz` from `downloadBaseUrl`
     *     and gunzip it.
     *
     * Every write goes to a uniquely named sibling file that is then renamed
     * onto the destination, so the destination path never holds a partially
     * written file.
     *
     * @param {{lang: string, dataPath: string, cachePath: string, downloadBaseUrl: string}} _a
     *   Target language, destination directory, cache directory and the base
     *   URL the `.gz` file name is resolved against.
     * @param {Function} [progressCallback_1] Called per downloaded chunk with
     *   `{ lang, url, downloadedBytes, totalBytes, percent }`; `totalBytes`
     *   and `percent` are `undefined` when no usable `content-length` header
     *   was sent.
     * @returns {Promise<string>} Path of the destination traineddata file.
     * @throws {Error} When the HTTP response is not OK or has no body; lock,
     *   filesystem and stream errors propagate unchanged.
     */
    ensureTrainingData(_a, progressCallback_1) {
        return __awaiter(this, arguments, void 0, function* ({ lang, dataPath, cachePath, downloadBaseUrl }, progressCallback) {
            const traineddataPath = node_path_1.default.join(dataPath, `${lang}.traineddata`);
            const cacheTraineddataPath = node_path_1.default.join(cachePath, `${lang}.traineddata`);
            // Fast path: an already valid destination is left alone. Checking
            // the cache first would rewrite the destination on every call,
            // without the lock, while other users may be reading it.
            if (yield (0, utils_1.isValidTraineddata)(traineddataPath)) {
                return traineddataPath;
            }
            yield (0, promises_1.mkdir)(dataPath, { recursive: true });
            // The destination does not exist yet, hence `realpath: false` and
            // an explicit lockfile path next to it.
            const release = yield (0, proper_lockfile_1.lock)(traineddataPath, {
                lockfilePath: `${traineddataPath}.lock`,
                stale: 10 * 60 * 1000,
                update: 30 * 1000,
                realpath: false,
                retries: { retries: 50, minTimeout: 200, maxTimeout: 2000 },
            });
            try {
                // Someone else may have produced the file while we were
                // waiting for the lock.
                if (yield (0, utils_1.isValidTraineddata)(traineddataPath)) {
                    return traineddataPath;
                }
                // Stage in the destination directory (not os.tmpdir()): the
                // final rename then stays on one filesystem, so it cannot fail
                // with EXDEV and replaces the destination in a single step.
                const stagingPath = [
                    traineddataPath,
                    process.pid,
                    Date.now(),
                    Math.random().toString(36).slice(2),
                    "tmp",
                ].join(".");
                try {
                    if (traineddataPath !== cacheTraineddataPath &&
                        (yield (0, utils_1.isValidTraineddata)(cacheTraineddataPath))) {
                        yield (0, promises_1.copyFile)(cacheTraineddataPath, stagingPath);
                    }
                    else {
                        const url = new URL(`${lang}.traineddata.gz`, downloadBaseUrl).toString();
                        const response = yield fetch(url);
                        if (!response.ok || !response.body) {
                            if (response.body) {
                                // Discard the unread body; a failure here must
                                // not hide the HTTP error reported below.
                                yield response.body.cancel().catch(() => undefined);
                            }
                            throw new Error(`Failed to download traineddata for ${lang}: ${response.status} ${response.statusText}`);
                        }
                        // A missing, non-numeric or zero content-length means
                        // "unknown size"; it must not yield NaN/Infinity percent.
                        const declaredLength = Number(response.headers.get("content-length"));
                        const totalBytes = Number.isFinite(declaredLength) && declaredLength > 0
                            ? declaredLength
                            : undefined;
                        let downloadedBytes = 0;
                        // Pass-through stage that only observes chunk sizes.
                        const progressStream = new node_stream_1.Transform({
                            transform(chunk, _encoding, callback) {
                                if (progressCallback) {
                                    downloadedBytes += chunk.length;
                                    progressCallback({
                                        lang,
                                        url,
                                        downloadedBytes,
                                        totalBytes,
                                        percent: totalBytes === undefined
                                            ? undefined
                                            : (downloadedBytes / totalBytes) * 100,
                                    });
                                }
                                callback(null, chunk);
                            },
                        });
                        yield (0, promises_2.pipeline)(node_stream_1.Readable.fromWeb(response.body), progressStream, (0, node_zlib_1.createGunzip)(), (0, node_fs_1.createWriteStream)(stagingPath));
                    }
                    yield (0, promises_1.rename)(stagingPath, traineddataPath);
                }
                catch (error) {
                    // Never leave a half-written staging file behind.
                    yield (0, promises_1.rm)(stagingPath, { force: true });
                    throw error;
                }
                return traineddataPath;
            }
            finally {
                yield release();
            }
        });
    }
}
|
|
276
402
|
exports.Tesseract = Tesseract;
|
|
277
403
|
exports.default = Tesseract;
|