@luii/node-tesseract-ocr 2.0.13 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,15 @@
13
13
  * or implied. See the License for the specific language governing
14
14
  * permissions and limitations under the License.
15
15
  */
16
+ import { existsSync, createWriteStream } from "node:fs";
17
+ import { mkdir, rename, rm, copyFile } from "node:fs/promises";
18
+ import os from "node:os";
19
+ import path from "node:path";
20
+ import { Readable, Transform } from "node:stream";
21
+ import { pipeline } from "node:stream/promises";
22
+ import { createGunzip } from "node:zlib";
23
+ import { lock } from "proper-lockfile";
24
+ import { isValidTraineddata } from "./utils";
16
25
  /**
17
26
  * All available languages for tesseract
18
27
  * @readonly
@@ -227,33 +236,139 @@ export const LogLevels = {
227
236
  FATAL: "50000",
228
237
  OFF: "2147483647",
229
238
  };
230
- const fs = require("node:fs");
231
- const path = require("node:path");
239
/**
 * Builds the jsDelivr base URL (with trailing slash) of the `4.0.0_best_int`
 * directory inside the `@tesseract.js-data/<lang>` npm package.
 *
 * @param {string} lang Language code interpolated into the package name.
 * @returns {string} Base URL that file names can be resolved against.
 */
const TESSDATA4_BEST = (lang) => {
  return `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0_best_int/`;
};

/**
 * Builds the jsDelivr base URL (with trailing slash) of the `4.0.0`
 * directory inside the `@tesseract.js-data/<lang>` npm package.
 *
 * @param {string} lang Language code interpolated into the package name.
 * @returns {string} Base URL that file names can be resolved against.
 */
const TESSDATA4 = (lang) => {
  return `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0/`;
};

/**
 * Default traineddata cache location:
 * `<home>/.cache/node-tesseract-ocr/tessdata`.
 */
const DEFAULT_CACHE_DIR = path.join(
  os.homedir(),
  ".cache",
  "node-tesseract-ocr",
  "tessdata",
);
232
242
  const rootFromSource = path.resolve(__dirname, "../../");
233
243
  const bindingOptionsFromSource = path.resolve(rootFromSource, "binding-options.js");
234
- const bindingOptionsPath = fs.existsSync(bindingOptionsFromSource)
244
+ const bindingOptionsPath = existsSync(bindingOptionsFromSource)
235
245
  ? bindingOptionsFromSource
236
246
  : path.resolve(process.cwd(), "binding-options.js");
237
- const prebuildRoot = fs.existsSync(bindingOptionsFromSource)
247
+ const prebuildRoot = existsSync(bindingOptionsFromSource)
238
248
  ? rootFromSource
239
249
  : process.cwd();
240
250
  const { Tesseract: NativeTesseract } = require("pkg-prebuilds")(prebuildRoot, require(bindingOptionsPath));
241
251
  class Tesseract extends NativeTesseract {
252
+ document = {
253
+ begin: this.beginProcessPages.bind(this),
254
+ addPage: this.addProcessPage.bind(this),
255
+ finish: this.finishProcessPages.bind(this),
256
+ abort: this.abortProcessPages.bind(this),
257
+ status: this.getProcessPagesStatus.bind(this),
258
+ };
242
259
  constructor() {
243
260
  super();
244
261
  }
245
- async init(options) {
246
- // scan train data for any files
247
- // check whether the requested langs are available/cached
248
- // if not
249
- // fetch traineddata from cdn
250
- // - add .lock file to downloaded file (while downloading, so other instances
251
- // can wait on it and dont have to download again)
252
- // - place into tesseract standard folder
253
- // if available
254
- // just go on with the init function of the native addon
262
+ async init(options = {}) {
263
+ options.langs ??= [];
264
+ options.ensureTraineddata ??= true;
265
+ options.cachePath ??= DEFAULT_CACHE_DIR;
266
+ options.dataPath ??= process.env.TESSDATA_PREFIX ?? options.cachePath;
267
+ options.progressCallback ??= undefined;
268
+ const cachePath = path.resolve(options.cachePath);
269
+ const dataPath = path.resolve(options.dataPath);
270
+ if (options.ensureTraineddata) {
271
+ for (const lang of [...options.langs, Language.osd]) {
272
+ const downloadBaseUrl = options.oem === OcrEngineModes.OEM_LSTM_ONLY
273
+ ? TESSDATA4_BEST(lang)
274
+ : TESSDATA4(lang);
275
+ lang &&
276
+ (await this.ensureTrainingData({ lang, dataPath, cachePath, downloadBaseUrl }, options.progressCallback));
277
+ }
278
+ }
255
279
  return super.init(options);
256
280
  }
281
+ async ensureTrainingData({ lang, dataPath, cachePath, downloadBaseUrl }, progressCallback) {
282
+ const traineddataPath = path.join(dataPath, `${lang}.traineddata`);
283
+ const cacheTraineddataPath = path.join(cachePath, `${lang}.traineddata`);
284
+ if (await isValidTraineddata(cacheTraineddataPath)) {
285
+ if (traineddataPath !== cacheTraineddataPath) {
286
+ await mkdir(dataPath, { recursive: true });
287
+ await copyFile(cacheTraineddataPath, traineddataPath);
288
+ }
289
+ return traineddataPath;
290
+ }
291
+ if (await isValidTraineddata(traineddataPath)) {
292
+ return traineddataPath;
293
+ }
294
+ await mkdir(dataPath, { recursive: true });
295
+ const release = await lock(traineddataPath, {
296
+ lockfilePath: `${traineddataPath}.lock`,
297
+ stale: 10 * 60 * 1000,
298
+ update: 30 * 1000,
299
+ realpath: false,
300
+ retries: { retries: 50, minTimeout: 200, maxTimeout: 2000 },
301
+ });
302
+ try {
303
+ if (await isValidTraineddata(traineddataPath)) {
304
+ return traineddataPath;
305
+ }
306
+ if (traineddataPath !== cacheTraineddataPath &&
307
+ (await isValidTraineddata(cacheTraineddataPath))) {
308
+ await copyFile(cacheTraineddataPath, traineddataPath);
309
+ return traineddataPath;
310
+ }
311
+ const url = new URL(`${lang}.traineddata.gz`, downloadBaseUrl).toString();
312
+ const response = await fetch(url);
313
+ if (!response.ok || !response.body) {
314
+ throw new Error(`Failed to download traineddata for ${lang}: ${response.status} ${response.statusText}`);
315
+ }
316
+ const tmpPath = path.join(os.tmpdir(), [
317
+ "node-tesseract-ocr",
318
+ lang,
319
+ "traineddata",
320
+ process.pid,
321
+ Date.now(),
322
+ Math.random().toString(36).slice(2),
323
+ ].join("-"));
324
+ const totalBytesHeader = response.headers.get("content-length");
325
+ const totalBytes = totalBytesHeader
326
+ ? Number(totalBytesHeader)
327
+ : undefined;
328
+ let downloadedBytes = 0;
329
+ const progressStream = new Transform({
330
+ transform(chunk, _, callback) {
331
+ if (progressCallback) {
332
+ downloadedBytes += chunk.length;
333
+ const percent = typeof totalBytes === "number" && Number.isFinite(totalBytes)
334
+ ? (downloadedBytes / totalBytes) * 100
335
+ : undefined;
336
+ progressCallback({
337
+ lang,
338
+ url,
339
+ downloadedBytes,
340
+ totalBytes: Number.isFinite(totalBytes) ? totalBytes : undefined,
341
+ percent,
342
+ });
343
+ }
344
+ callback(null, chunk);
345
+ },
346
+ });
347
+ try {
348
+ await pipeline(Readable.fromWeb(response.body), progressStream, createGunzip(), createWriteStream(tmpPath));
349
+ try {
350
+ await rename(tmpPath, traineddataPath);
351
+ }
352
+ catch (error) {
353
+ if (error.code === "EXDEV") {
354
+ await copyFile(tmpPath, traineddataPath);
355
+ await rm(tmpPath, { force: true });
356
+ }
357
+ else {
358
+ throw error;
359
+ }
360
+ }
361
+ }
362
+ catch (error) {
363
+ await rm(tmpPath, { force: true });
364
+ throw error;
365
+ }
366
+ return traineddataPath;
367
+ }
368
+ finally {
369
+ await release();
370
+ }
371
+ }
257
372
  }
258
373
  export { Tesseract, NativeTesseract };
259
374
  export default Tesseract;