@luii/node-tesseract-ocr 2.0.13 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
// New 4-line file added in this release (hunk -0,0 +1,4). It exports a plain
// options object with a package `name` and the list of supported
// `napi_versions`. NOTE(review): presumably this is the `binding-options.js`
// that the main module passes to pkg-prebuilds — confirm against the package layout.
+ module.exports = {
2
+ name: "node-tesseract-ocr",
3
+ napi_versions: [10],
4
+ };
@@ -23,8 +23,20 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
23
23
  step((generator = generator.apply(thisArg, _arguments || [])).next());
24
24
  });
25
25
  };
26
// Default-import interop helper. Reuses `this.__importDefault` when one is
// already defined; otherwise returns the module unchanged if it is flagged
// `__esModule`, or wraps it as `{ default: mod }` so `x.default` always works.
+ var __importDefault = (this && this.__importDefault) || function (mod) {
27
+ return (mod && mod.__esModule) ? mod : { "default": mod };
28
+ };
26
29
  Object.defineProperty(exports, "__esModule", { value: true });
27
30
  exports.NativeTesseract = exports.Tesseract = exports.LogLevels = exports.PageSegmentationModes = exports.OcrEngineModes = exports.Language = void 0;
31
+ const node_fs_1 = require("node:fs");
32
+ const promises_1 = require("node:fs/promises");
33
+ const node_os_1 = __importDefault(require("node:os"));
34
+ const node_path_1 = __importDefault(require("node:path"));
35
+ const node_stream_1 = require("node:stream");
36
+ const promises_2 = require("node:stream/promises");
37
+ const node_zlib_1 = require("node:zlib");
38
+ const proper_lockfile_1 = require("proper-lockfile");
39
+ const utils_1 = require("./utils");
28
40
  /**
29
41
  * All available languages for tesseract
30
42
  * @readonly
@@ -239,14 +251,15 @@ exports.LogLevels = {
239
251
  FATAL: "50000",
240
252
  OFF: "2147483647",
241
253
  };
242
- const fs = require("node:fs");
243
- const path = require("node:path");
244
- const rootFromSource = path.resolve(__dirname, "../../");
245
- const bindingOptionsFromSource = path.resolve(rootFromSource, "binding-options.js");
246
- const bindingOptionsPath = fs.existsSync(bindingOptionsFromSource)
254
// Base URL (jsDelivr CDN) of the "4.0.0_best_int" traineddata package for `lang`.
+ const TESSDATA4_BEST = (lang) => `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0_best_int/`;
255
// Base URL (jsDelivr CDN) of the "4.0.0" traineddata package for `lang`.
+ const TESSDATA4 = (lang) => `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0/`;
256
// Default traineddata cache: <home>/.cache/node-tesseract-ocr/tessdata.
+ const DEFAULT_CACHE_DIR = node_path_1.default.join(node_os_1.default.homedir(), ".cache", "node-tesseract-ocr", "tessdata");
257
// Directory two levels above this module's directory.
+ const rootFromSource = node_path_1.default.resolve(__dirname, "../../");
258
+ const bindingOptionsFromSource = node_path_1.default.resolve(rootFromSource, "binding-options.js");
259
// Use binding-options.js under rootFromSource when it exists; otherwise fall
// back to a binding-options.js in the current working directory.
+ const bindingOptionsPath = (0, node_fs_1.existsSync)(bindingOptionsFromSource)
247
260
  ? bindingOptionsFromSource
248
- : path.resolve(process.cwd(), "binding-options.js");
249
- const prebuildRoot = fs.existsSync(bindingOptionsFromSource)
261
+ : node_path_1.default.resolve(process.cwd(), "binding-options.js");
262
// The prebuild root follows the same rule: rootFromSource if the options file
// was found there, else the current working directory.
+ const prebuildRoot = (0, node_fs_1.existsSync)(bindingOptionsFromSource)
250
263
  ? rootFromSource
251
264
  : process.cwd();
252
265
  // Load the native `Tesseract` class through pkg-prebuilds, given the prebuild
  // root and the options object exported by the resolved binding-options.js.
  const { Tesseract: NativeTesseract } = require("pkg-prebuilds")(prebuildRoot, require(bindingOptionsPath));
@@ -254,24 +267,137 @@ exports.NativeTesseract = NativeTesseract;
254
267
  class Tesseract extends NativeTesseract {
255
268
  constructor() {
256
269
  super();
270
+ Object.defineProperty(this, "document", {
271
+ enumerable: true,
272
+ configurable: true,
273
+ writable: true,
274
+ value: {
275
+ begin: this.beginProcessPages.bind(this),
276
+ addPage: this.addProcessPage.bind(this),
277
+ finish: this.finishProcessPages.bind(this),
278
+ abort: this.abortProcessPages.bind(this),
279
+ status: this.getProcessPagesStatus.bind(this),
280
+ }
281
+ });
257
282
  }
258
- init(options) {
283
+ init() {
259
284
  const _super = Object.create(null, {
260
285
  init: { get: () => super.init }
261
286
  });
262
- return __awaiter(this, void 0, void 0, function* () {
263
- // scan train data for any files
264
- // check whether the requested langs are available/cached
265
- // if not
266
- // fetch traineddata from cdn
267
- // - add .lock file to downloaded file (while downloading, so other instances
268
- // can wait on it and dont have to download again)
269
- // - place into tesseract standard folder
270
- // if available
271
- // just go on with the init function of the native addon
287
+ return __awaiter(this, arguments, void 0, function* (options = {}) {
288
+ var _a, _b, _c, _d, _e, _f;
289
+ (_a = options.langs) !== null && _a !== void 0 ? _a : (options.langs = []);
290
+ (_b = options.ensureTraineddata) !== null && _b !== void 0 ? _b : (options.ensureTraineddata = true);
291
+ (_c = options.cachePath) !== null && _c !== void 0 ? _c : (options.cachePath = DEFAULT_CACHE_DIR);
292
+ (_d = options.dataPath) !== null && _d !== void 0 ? _d : (options.dataPath = (_e = process.env.TESSDATA_PREFIX) !== null && _e !== void 0 ? _e : options.cachePath);
293
+ (_f = options.progressCallback) !== null && _f !== void 0 ? _f : (options.progressCallback = undefined);
294
+ const cachePath = node_path_1.default.resolve(options.cachePath);
295
+ const dataPath = node_path_1.default.resolve(options.dataPath);
296
+ if (options.ensureTraineddata) {
297
+ for (const lang of [...options.langs, exports.Language.osd]) {
298
+ const downloadBaseUrl = options.oem === exports.OcrEngineModes.OEM_LSTM_ONLY
299
+ ? TESSDATA4_BEST(lang)
300
+ : TESSDATA4(lang);
301
+ lang &&
302
+ (yield this.ensureTrainingData({ lang, dataPath, cachePath, downloadBaseUrl }, options.progressCallback));
303
+ }
304
+ }
272
305
  return _super.init.call(this, options);
273
306
  });
274
307
  }
308
+ ensureTrainingData(_a, progressCallback_1) {
309
+ return __awaiter(this, arguments, void 0, function* ({ lang, dataPath, cachePath, downloadBaseUrl }, progressCallback) {
310
+ const traineddataPath = node_path_1.default.join(dataPath, `${lang}.traineddata`);
311
+ const cacheTraineddataPath = node_path_1.default.join(cachePath, `${lang}.traineddata`);
312
+ if (yield (0, utils_1.isValidTraineddata)(cacheTraineddataPath)) {
313
+ if (traineddataPath !== cacheTraineddataPath) {
314
+ yield (0, promises_1.mkdir)(dataPath, { recursive: true });
315
+ yield (0, promises_1.copyFile)(cacheTraineddataPath, traineddataPath);
316
+ }
317
+ return traineddataPath;
318
+ }
319
+ if (yield (0, utils_1.isValidTraineddata)(traineddataPath)) {
320
+ return traineddataPath;
321
+ }
322
+ yield (0, promises_1.mkdir)(dataPath, { recursive: true });
323
+ const release = yield (0, proper_lockfile_1.lock)(traineddataPath, {
324
+ lockfilePath: `${traineddataPath}.lock`,
325
+ stale: 10 * 60 * 1000,
326
+ update: 30 * 1000,
327
+ realpath: false,
328
+ retries: { retries: 50, minTimeout: 200, maxTimeout: 2000 },
329
+ });
330
+ try {
331
+ if (yield (0, utils_1.isValidTraineddata)(traineddataPath)) {
332
+ return traineddataPath;
333
+ }
334
+ if (traineddataPath !== cacheTraineddataPath &&
335
+ (yield (0, utils_1.isValidTraineddata)(cacheTraineddataPath))) {
336
+ yield (0, promises_1.copyFile)(cacheTraineddataPath, traineddataPath);
337
+ return traineddataPath;
338
+ }
339
+ const url = new URL(`${lang}.traineddata.gz`, downloadBaseUrl).toString();
340
+ const response = yield fetch(url);
341
+ if (!response.ok || !response.body) {
342
+ throw new Error(`Failed to download traineddata for ${lang}: ${response.status} ${response.statusText}`);
343
+ }
344
+ const tmpPath = node_path_1.default.join(node_os_1.default.tmpdir(), [
345
+ "node-tesseract-ocr",
346
+ lang,
347
+ "traineddata",
348
+ process.pid,
349
+ Date.now(),
350
+ Math.random().toString(36).slice(2),
351
+ ].join("-"));
352
+ const totalBytesHeader = response.headers.get("content-length");
353
+ const totalBytes = totalBytesHeader
354
+ ? Number(totalBytesHeader)
355
+ : undefined;
356
+ let downloadedBytes = 0;
357
+ const progressStream = new node_stream_1.Transform({
358
+ transform(chunk, _, callback) {
359
+ if (progressCallback) {
360
+ downloadedBytes += chunk.length;
361
+ const percent = typeof totalBytes === "number" && Number.isFinite(totalBytes)
362
+ ? (downloadedBytes / totalBytes) * 100
363
+ : undefined;
364
+ progressCallback({
365
+ lang,
366
+ url,
367
+ downloadedBytes,
368
+ totalBytes: Number.isFinite(totalBytes) ? totalBytes : undefined,
369
+ percent,
370
+ });
371
+ }
372
+ callback(null, chunk);
373
+ },
374
+ });
375
+ try {
376
+ yield (0, promises_2.pipeline)(node_stream_1.Readable.fromWeb(response.body), progressStream, (0, node_zlib_1.createGunzip)(), (0, node_fs_1.createWriteStream)(tmpPath));
377
+ try {
378
+ yield (0, promises_1.rename)(tmpPath, traineddataPath);
379
+ }
380
+ catch (error) {
381
+ if (error.code === "EXDEV") {
382
+ yield (0, promises_1.copyFile)(tmpPath, traineddataPath);
383
+ yield (0, promises_1.rm)(tmpPath, { force: true });
384
+ }
385
+ else {
386
+ throw error;
387
+ }
388
+ }
389
+ }
390
+ catch (error) {
391
+ yield (0, promises_1.rm)(tmpPath, { force: true });
392
+ throw error;
393
+ }
394
+ return traineddataPath;
395
+ }
396
+ finally {
397
+ yield release();
398
+ }
399
+ });
400
+ }
275
401
  }
276
402
  exports.Tesseract = Tesseract;
277
403
  exports.default = Tesseract;