@luii/node-tesseract-ocr 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -16,7 +16,6 @@ Native C++ addon for Node.js that exposes Tesseract OCR (`libtesseract-dev`) to
16
16
  - [Enums](#enums)
17
17
  - [Types](#types)
18
18
  - [Tesseract API](#tesseract-api)
19
- - [Example](#example)
20
19
  - [License](#license)
21
20
  - [Special Thanks](#special-thanks)
22
21
 
@@ -25,6 +24,7 @@ Native C++ addon for Node.js that exposes Tesseract OCR (`libtesseract-dev`) to
25
24
  - Native bindings to Tesseract (prebuilds via `pkg-prebuilds`)
26
25
  - Access to Tesseract enums and configuration from TypeScript
27
26
  - Progress callback and multiple output formats
27
+ - Lazy download of missing traineddata (configurable)
28
28
 
29
29
  ## Prerequisites
30
30
 
@@ -33,7 +33,7 @@ Native C++ addon for Node.js that exposes Tesseract OCR (`libtesseract-dev`) to
33
33
  - c++ build toolchain (e.g. build-essentials)
34
34
  - libtesseract-dev
35
35
  - libleptonica-dev
36
- - Tesseract training data (eng, deu, ...)
36
+ - Tesseract training data (eng, deu, ...) or let the library handle that
37
37
 
38
38
  > See [Install](#install)
39
39
 
@@ -59,7 +59,9 @@ Install additional languages as needed, for example:
59
59
  sudo apt install -y tesseract-ocr-deu tesseract-ocr-eng tesseract-ocr-jpn
60
60
  ```
61
61
 
62
- If you install traineddata files manually, make sure `NODE_TESSERACT_DATAPATH` points to the directory that contains them (for example `/usr/share/tesseract-ocr/5/tessdata`).
62
+ If you install traineddata files manually, make sure `TESSDATA_PREFIX` points to the directory that contains them (for example `/usr/share/tessdata`).
63
+
64
+ If traineddata is missing, this package will download it lazily during `init` by default. You can control this behavior via `ensureTraineddata`, `cachePath`, and `dataPath`.
63
65
 
64
66
  ## Build
65
67
 
@@ -73,12 +75,14 @@ npm run build:release
73
75
 
74
76
  ## Start
75
77
 
76
- Set `NODE_TESSERACT_DATAPATH` to your traineddata directory (usually `/usr/share/tesseract-ocr/5/tessdata`).
78
+ Set `TESSDATA_PREFIX` to your traineddata directory (usually `/usr/share/tesseract-ocr/5/tessdata` or `/usr/share/tessdata`).
77
79
 
78
80
  ```sh
79
- env NODE_TESSERACT_DATAPATH=/usr/share/tesseract-ocr/5/tessdata node path/to/your/app.js
81
+ env TESSDATA_PREFIX=/usr/share/tessdata node path/to/your/app.js
80
82
  ```
81
83
 
84
+ If you prefer automatic downloads, you can skip setting `TESSDATA_PREFIX` and let the default cache directory handle traineddata on first use.
85
+
82
86
  ## Scripts
83
87
 
84
88
  ```bash
@@ -86,9 +90,6 @@ env NODE_TESSERACT_DATAPATH=/usr/share/tesseract-ocr/5/tessdata node path/to/you
86
90
  npm run build:debug
87
91
  npm run build:release
88
92
 
89
- # Build precompiled binaries for distribution
90
- npm run prebuild
91
-
92
93
  # Run the JS example (builds debug first)
93
94
  npm run example:recognize
94
95
 
@@ -100,8 +101,73 @@ npm run test:js:watch
100
101
 
101
102
  ## Examples
102
103
 
104
+ ### Run Included Example
105
+
103
106
  ```sh
104
- env NODE_TESSERACT_DATAPATH=/usr/share/tesseract-ocr/5/tessdata npm run example:recognize
107
+ env TESSDATA_PREFIX=/usr/share/tessdata npm run example:recognize
108
+ ```
109
+
110
+ ### Basic OCR (Local Traineddata)
111
+
112
+ You can find a similar example in the `examples/` folder of the project.
113
+
114
+ ```ts
115
+ import fs from "node:fs";
116
+ import Tesseract, { OcrEngineModes } from "node-tesseract-ocr";
117
+
118
+ process.env.TESSDATA_PREFIX = "/usr/share/tessdata/";
119
+
120
+ async function main() {
121
+ const tesseract = new Tesseract();
122
+ await tesseract.init({
123
+ langs: ["eng"],
124
+ });
125
+
126
+ const buffer = fs.readFileSync("example1.png");
127
+ await tesseract.setImage(buffer);
128
+ await tesseract.recognize((info) => {
129
+ console.log(`Progress: ${info.percent}%`);
130
+ });
131
+
132
+ const text = await tesseract.getUTF8Text();
133
+ console.log(text);
134
+
135
+ await tesseract.end();
136
+ }
137
+
138
+ main().catch((err) => {
139
+ console.error(err);
140
+ process.exit(1);
141
+ });
142
+ ```
143
+
144
+ ### Lazy Traineddata Download (Default)
145
+
146
+ ```ts
147
+ import fs from "node:fs";
148
+ import Tesseract from "node-tesseract-ocr";
149
+
150
+ async function main() {
151
+ const tesseract = new Tesseract();
152
+ await tesseract.init({
153
+ langs: ["eng"],
154
+ ensureTraineddata: true,
155
+ dataPath: './tessdata-local'
156
+ });
157
+
158
+ const buffer = fs.readFileSync("example1.png");
159
+ await tesseract.setImage(buffer);
160
+ await tesseract.recognize();
161
+ const text = await tesseract.getUTF8Text();
162
+ console.log(text);
163
+
164
+ await tesseract.end();
165
+ }
166
+
167
+ main().catch((err) => {
168
+ console.error(err);
169
+ process.exit(1);
170
+ });
105
171
  ```
106
172
 
107
173
  ## Public API
@@ -151,13 +217,17 @@ Full list of page segmentation modes from Tesseract.
151
217
 
152
218
  #### `TesseractInitOptions`
153
219
 
154
- | Field | Type | Optional | Default | Description |
155
- | ----------------------- | ----------------------------------------------------------------------------------------------------- | -------- | ----------- | --------------------------------------- |
156
- | `lang` | [`Language[]`](#availablelanguages) | Yes | `undefined` | Languages to load as an array. |
157
- | `oem` | [`OcrEngineMode`](#ocrenginemode) | Yes | `undefined` | OCR engine mode. |
158
- | `vars` | `Partial<Record<keyof ConfigurationVariables, ConfigurationVariables[keyof ConfigurationVariables]>>` | Yes | `undefined` | Variables to set. |
159
- | `configs` | `Array<string>` | Yes | `undefined` | Tesseract config files to apply. |
160
- | `setOnlyNonDebugParams` | `boolean` | Yes | `undefined` | If true, only non-debug params are set. |
220
+ | Field | Type | Optional | Default | Description |
221
+ | ----------------------- | ----------------------------------------------------------------------------------------------------- | -------- | -------------------------------------- | --------------------------------------- |
222
+ | `langs` | [`Language[]`](#availablelanguages) | Yes | `undefined` | Languages to load as an array. |
223
+ | `oem` | [`OcrEngineMode`](#ocrenginemode) | Yes | `undefined` | OCR engine mode. |
224
+ | `vars` | `Partial<Record<keyof ConfigurationVariables, ConfigurationVariables[keyof ConfigurationVariables]>>` | Yes | `undefined` | Variables to set. |
225
+ | `configs` | `Array<string>` | Yes | `undefined` | Tesseract config files to apply. |
226
+ | `setOnlyNonDebugParams` | `boolean` | Yes | `undefined` | If true, only non-debug params are set. |
227
+ | `ensureTraineddata` | `boolean` | Yes | `true` | Download missing traineddata lazily. |
228
+ | `cachePath` | `string` | Yes | `~/.cache/node-tesseract-ocr/tessdata` | Cache directory for downloads. |
229
+ | `dataPath` | `string` | Yes | `TESSDATA_PREFIX` or `cachePath` | Directory used by Tesseract for data. |
230
+ | `progressCallback` | `(info: TrainingDataDownloadProgress) => void` | Yes | `undefined` | Download progress callback. |
161
231
 
162
232
  #### `TesseractSetRectangleOptions`
163
233
 
@@ -464,39 +534,6 @@ Ends the instance.
464
534
  end(): Promise<void>
465
535
  ```
466
536
 
467
- ## Example
468
-
469
- You can find a similar example in the `examples/` folder of the project
470
-
471
- ```ts
472
- import fs from "node:fs";
473
- import Tesseract, { OcrEngineModes } from "node-tesseract-ocr";
474
-
475
- async function main() {
476
- const tesseract = new Tesseract();
477
- await tesseract.init({
478
- lang: ["eng"],
479
- oem: OcrEngineModes.OEM_LSTM_ONLY,
480
- });
481
-
482
- const buffer = fs.readFileSync("example1.png");
483
- await tesseract.setImage(buffer);
484
- await tesseract.recognize((info) => {
485
- console.log(`Progress: ${info.percent}%`);
486
- });
487
-
488
- const text = await tesseract.getUTF8Text();
489
- console.log(text);
490
-
491
- await tesseract.end();
492
- }
493
-
494
- main().catch((err) => {
495
- console.error(err);
496
- process.exit(1);
497
- });
498
- ```
499
-
500
537
  ## License
501
538
 
502
539
  Apache-2.0. See [`LICENSE.md`](/LICENSE.md) for full terms.
@@ -23,6 +23,9 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
23
23
  step((generator = generator.apply(thisArg, _arguments || [])).next());
24
24
  });
25
25
  };
26
+ var __importDefault = (this && this.__importDefault) || function (mod) {
27
+ return (mod && mod.__esModule) ? mod : { "default": mod };
28
+ };
26
29
  Object.defineProperty(exports, "__esModule", { value: true });
27
30
  exports.NativeTesseract = exports.Tesseract = exports.LogLevels = exports.PageSegmentationModes = exports.OcrEngineModes = exports.Language = void 0;
28
31
  /**
@@ -239,14 +242,24 @@ exports.LogLevels = {
239
242
  FATAL: "50000",
240
243
  OFF: "2147483647",
241
244
  };
242
- const fs = require("node:fs");
243
- const path = require("node:path");
244
- const rootFromSource = path.resolve(__dirname, "../../");
245
- const bindingOptionsFromSource = path.resolve(rootFromSource, "binding-options.js");
246
- const bindingOptionsPath = fs.existsSync(bindingOptionsFromSource)
245
+ const node_fs_1 = require("node:fs");
246
+ const promises_1 = require("node:fs/promises");
247
+ const node_os_1 = __importDefault(require("node:os"));
248
+ const node_path_1 = __importDefault(require("node:path"));
249
+ const node_stream_1 = require("node:stream");
250
+ const promises_2 = require("node:stream/promises");
251
+ const node_zlib_1 = require("node:zlib");
252
+ const proper_lockfile_1 = require("proper-lockfile");
253
+ const utils_1 = require("./utils");
254
+ const TESSDATA4_BEST = (lang) => `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0_best_int/`;
255
+ const TESSDATA4 = (lang) => `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0/`;
256
+ const DEFAULT_CACHE_DIR = node_path_1.default.join(node_os_1.default.homedir(), ".cache", "node-tesseract-ocr", "tessdata");
257
+ const rootFromSource = node_path_1.default.resolve(__dirname, "../../");
258
+ const bindingOptionsFromSource = node_path_1.default.resolve(rootFromSource, "binding-options.js");
259
+ const bindingOptionsPath = (0, node_fs_1.existsSync)(bindingOptionsFromSource)
247
260
  ? bindingOptionsFromSource
248
- : path.resolve(process.cwd(), "binding-options.js");
249
- const prebuildRoot = fs.existsSync(bindingOptionsFromSource)
261
+ : node_path_1.default.resolve(process.cwd(), "binding-options.js");
262
+ const prebuildRoot = (0, node_fs_1.existsSync)(bindingOptionsFromSource)
250
263
  ? rootFromSource
251
264
  : process.cwd();
252
265
  const { Tesseract: NativeTesseract } = require("pkg-prebuilds")(prebuildRoot, require(bindingOptionsPath));
@@ -255,23 +268,124 @@ class Tesseract extends NativeTesseract {
255
268
  constructor() {
256
269
  super();
257
270
  }
258
- init(options) {
271
+ init() {
259
272
  const _super = Object.create(null, {
260
273
  init: { get: () => super.init }
261
274
  });
262
- return __awaiter(this, void 0, void 0, function* () {
263
- // scan train data for any files
264
- // check whether the requested langs are available/cached
265
- // if not
266
- // fetch traineddata from cdn
267
- // - add .lock file to downloaded file (while downloading, so other instances
268
- // can wait on it and dont have to download again)
269
- // - place into tesseract standard folder
270
- // if available
271
- // just go on with the init function of the native addon
275
+ return __awaiter(this, arguments, void 0, function* (options = {}) {
276
+ var _a, _b, _c, _d, _e, _f;
277
+ (_a = options.langs) !== null && _a !== void 0 ? _a : (options.langs = []);
278
+ (_b = options.ensureTraineddata) !== null && _b !== void 0 ? _b : (options.ensureTraineddata = true);
279
+ (_c = options.cachePath) !== null && _c !== void 0 ? _c : (options.cachePath = DEFAULT_CACHE_DIR);
280
+ (_d = options.dataPath) !== null && _d !== void 0 ? _d : (options.dataPath = (_e = process.env.TESSDATA_PREFIX) !== null && _e !== void 0 ? _e : options.cachePath);
281
+ (_f = options.progressCallback) !== null && _f !== void 0 ? _f : (options.progressCallback = undefined);
282
+ const cachePath = node_path_1.default.resolve(options.cachePath);
283
+ const dataPath = node_path_1.default.resolve(options.dataPath);
284
+ if (options.ensureTraineddata) {
285
+ for (const lang of [...options.langs, exports.Language.osd]) {
286
+ const downloadBaseUrl = options.oem === exports.OcrEngineModes.OEM_LSTM_ONLY
287
+ ? TESSDATA4_BEST(lang)
288
+ : TESSDATA4(lang);
289
+ lang &&
290
+ (yield this.ensureTrainingData({ lang, dataPath, cachePath, downloadBaseUrl }, options.progressCallback));
291
+ }
292
+ }
272
293
  return _super.init.call(this, options);
273
294
  });
274
295
  }
296
+ ensureTrainingData(_a, progressCallback_1) {
297
+ return __awaiter(this, arguments, void 0, function* ({ lang, dataPath, cachePath, downloadBaseUrl }, progressCallback) {
298
+ const traineddataPath = node_path_1.default.join(dataPath, `${lang}.traineddata`);
299
+ const cacheTraineddataPath = node_path_1.default.join(cachePath, `${lang}.traineddata`);
300
+ if (yield (0, utils_1.isValidTraineddata)(cacheTraineddataPath)) {
301
+ if (traineddataPath !== cacheTraineddataPath) {
302
+ yield (0, promises_1.mkdir)(dataPath, { recursive: true });
303
+ yield (0, promises_1.copyFile)(cacheTraineddataPath, traineddataPath);
304
+ }
305
+ return traineddataPath;
306
+ }
307
+ if (yield (0, utils_1.isValidTraineddata)(traineddataPath)) {
308
+ return traineddataPath;
309
+ }
310
+ yield (0, promises_1.mkdir)(dataPath, { recursive: true });
311
+ const release = yield (0, proper_lockfile_1.lock)(traineddataPath, {
312
+ lockfilePath: `${traineddataPath}.lock`,
313
+ stale: 10 * 60 * 1000,
314
+ update: 30 * 1000,
315
+ realpath: false,
316
+ retries: { retries: 50, minTimeout: 200, maxTimeout: 2000 },
317
+ });
318
+ try {
319
+ if (yield (0, utils_1.isValidTraineddata)(traineddataPath)) {
320
+ return traineddataPath;
321
+ }
322
+ if (traineddataPath !== cacheTraineddataPath &&
323
+ (yield (0, utils_1.isValidTraineddata)(cacheTraineddataPath))) {
324
+ yield (0, promises_1.copyFile)(cacheTraineddataPath, traineddataPath);
325
+ return traineddataPath;
326
+ }
327
+ const url = new URL(`${lang}.traineddata.gz`, downloadBaseUrl).toString();
328
+ const response = yield fetch(url);
329
+ if (!response.ok || !response.body) {
330
+ throw new Error(`Failed to download traineddata for ${lang}: ${response.status} ${response.statusText}`);
331
+ }
332
+ const tmpPath = node_path_1.default.join(node_os_1.default.tmpdir(), [
333
+ "node-tesseract-ocr",
334
+ lang,
335
+ "traineddata",
336
+ process.pid,
337
+ Date.now(),
338
+ Math.random().toString(36).slice(2),
339
+ ].join("-"));
340
+ const totalBytesHeader = response.headers.get("content-length");
341
+ const totalBytes = totalBytesHeader
342
+ ? Number(totalBytesHeader)
343
+ : undefined;
344
+ let downloadedBytes = 0;
345
+ const progressStream = new node_stream_1.Transform({
346
+ transform(chunk, _, callback) {
347
+ if (progressCallback) {
348
+ downloadedBytes += chunk.length;
349
+ const percent = typeof totalBytes === "number" && Number.isFinite(totalBytes)
350
+ ? (downloadedBytes / totalBytes) * 100
351
+ : undefined;
352
+ progressCallback({
353
+ lang,
354
+ url,
355
+ downloadedBytes,
356
+ totalBytes: Number.isFinite(totalBytes) ? totalBytes : undefined,
357
+ percent,
358
+ });
359
+ }
360
+ callback(null, chunk);
361
+ },
362
+ });
363
+ try {
364
+ yield (0, promises_2.pipeline)(node_stream_1.Readable.fromWeb(response.body), progressStream, (0, node_zlib_1.createGunzip)(), (0, node_fs_1.createWriteStream)(tmpPath));
365
+ try {
366
+ yield (0, promises_1.rename)(tmpPath, traineddataPath);
367
+ }
368
+ catch (error) {
369
+ if (error.code === "EXDEV") {
370
+ yield (0, promises_1.copyFile)(tmpPath, traineddataPath);
371
+ yield (0, promises_1.rm)(tmpPath, { force: true });
372
+ }
373
+ else {
374
+ throw error;
375
+ }
376
+ }
377
+ }
378
+ catch (error) {
379
+ yield (0, promises_1.rm)(tmpPath, { force: true });
380
+ throw error;
381
+ }
382
+ return traineddataPath;
383
+ }
384
+ finally {
385
+ yield release();
386
+ }
387
+ });
388
+ }
275
389
  }
276
390
  exports.Tesseract = Tesseract;
277
391
  exports.default = Tesseract;
@@ -839,20 +839,81 @@ export interface TesseractInitOptions {
839
839
  /**
840
840
  * Its generally safer to use as few languages as possible.
841
841
  * The more languages Tesseract needs to load the longer it takes to recognize a image.
842
- * @public
842
+ * The OSD Language will always be loaded to support orientation and script detection
843
+ * IMPORTANT: if you specify more than one language here (e.g.: `deu, eng` for example)
844
+ * tesseract will try to recognize german and english in the same image.
845
+ * Originally tesseract itself accepts it as `deu+eng`, but since this
846
+ * makes typing very hard to near impossible, it's safer to just accept an
847
+ * array with the languages it should look for.
848
+ * When talking about "hard typing/impossible typing" its because typescript
849
+ * itself cannot create recursive types, and chaining template types
850
+ * (e.g.: `${Language}+${Language}+...`) stretches out the compilation time
851
+ * to an unacceptable amount
852
+ *
853
+ * @default [Language.osd]
843
854
  */
844
- lang?: Language[];
855
+ langs?: Language[];
856
+ /**
857
+ * Specify where the trainingdata is located
858
+ * Besides the datapath in general it is versioned to the
859
+ * version of tesseract
860
+ * @default '~/.cache/node-tesseract-ocr/tessdata'
861
+ */
862
+ cachePath?: string;
863
+ /**
864
+ * Explicit datapath for traineddata. Takes precedence over
865
+ * the `TESSDATA_PREFIX` environment variable.
866
+ */
867
+ dataPath?: string;
868
+ /**
869
+ * This will be called for every language that was specified in `langs`,
870
+ * it allows the user to be flexible about the training data's location
871
+ * Or if he needs to specify his own location for certain languages/custom languages
872
+ * IMPORTANT: Ensures that trainingdata will be downloaded from the following cdn
873
+ * in case they dont exist
874
+ * OEM_LSTM_ONLY => https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0_best_int
875
+ * NON OEM_LSTM_ONLY => https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0
876
+ * NOTE: Tesseract 5.x.x still uses the 4.x.x trainingdata
877
+ *
878
+ * @default true
879
+ */
880
+ ensureTraineddata?: boolean;
881
+ /**
882
+ * Optional progress callback for traineddata downloads.
883
+ */
884
+ progressCallback?: (info: TrainingDataDownloadProgress) => void;
845
885
  /**
846
886
  * OCR Engine Modes
847
887
  * The engine mode cannot be changed after creating the instance
848
888
  * If another mode is needed, its advised to create a new instance.
889
+ * @default OEM_DEFAULT
849
890
  * @throws {Error} Will throw an error when oem mode is below 0 or over 3
850
891
  */
851
892
  oem?: OcrEngineMode;
893
+ /**
894
+ * Controls if only non debug parameters will be set upon initialization
895
+ * @default false
896
+ */
852
897
  setOnlyNonDebugParams?: boolean;
898
+ /**
899
+ * Array of paths that point to their corresponding config files
900
+ * usually located in the `dataPath` location alongside the training data
901
+ */
853
902
  configs?: Array<string>;
903
+ /**
904
+ * Record of parameters that should be set upon initialization
905
+ * Consult the original documentation of tesseract on which variables
906
+ * can actually be set
907
+ */
854
908
  vars?: Partial<Record<keyof ConfigurationVariables, ConfigurationVariables[keyof ConfigurationVariables]>>;
855
909
  }
910
+ export interface TrainingDataDownloadProgress {
911
+ lang: Language;
912
+ url: string;
913
+ downloadedBytes: number;
914
+ totalBytes?: number;
915
+ percent?: number;
916
+ }
856
917
  export interface TesseractSetRectangleOptions {
857
918
  top: number;
858
919
  left: number;
@@ -913,6 +974,13 @@ export interface DetectOrientationScriptResult {
913
974
  */
914
975
  scriptConfidence: number;
915
976
  }
977
+ export type EnsureTrainedDataOptions = {
978
+ lang: Language;
979
+ cachePath: string;
980
+ dataPath: string;
981
+ downloadBaseUrl: string;
982
+ progressCallback?: (info: TrainingDataDownloadProgress) => void;
983
+ };
916
984
  export interface TesseractInstance {
917
985
  /**
918
986
  * Initialize the engine with the given options.
@@ -1063,7 +1131,8 @@ export type TesseractConstructor = new () => TesseractInstance;
1063
1131
  declare const NativeTesseract: TesseractConstructor;
1064
1132
  declare class Tesseract extends NativeTesseract {
1065
1133
  constructor();
1066
- init(options: TesseractInitOptions): Promise<void>;
1134
+ init(options?: TesseractInitOptions): Promise<void>;
1135
+ ensureTrainingData({ lang, dataPath, cachePath, downloadBaseUrl }: EnsureTrainedDataOptions, progressCallback?: (info: TrainingDataDownloadProgress) => void): Promise<string>;
1067
1136
  }
1068
1137
  export { Tesseract, NativeTesseract };
1069
1138
  export default Tesseract;
@@ -0,0 +1 @@
1
+ export declare const isValidTraineddata: (filePath: string) => Promise<boolean>;
@@ -0,0 +1,23 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.isValidTraineddata = void 0;
13
+ const promises_1 = require("node:fs/promises");
14
+ const isValidTraineddata = (filePath) => __awaiter(void 0, void 0, void 0, function* () {
15
+ try {
16
+ const info = yield (0, promises_1.stat)(filePath);
17
+ return info.isFile() && info.size > 0;
18
+ }
19
+ catch (_a) {
20
+ return false;
21
+ }
22
+ });
23
+ exports.isValidTraineddata = isValidTraineddata;
@@ -839,20 +839,81 @@ export interface TesseractInitOptions {
839
839
  /**
840
840
  * Its generally safer to use as few languages as possible.
841
841
  * The more languages Tesseract needs to load the longer it takes to recognize a image.
842
- * @public
842
+ * The OSD Language will always be loaded to support orientation and script detection
843
+ * IMPORTANT: if you specify more than one language here (e.g.: `deu, eng` for example)
844
+ * tesseract will try to recognize german and english in the same image.
845
+ * Originally tesseract itself accepts it as `deu+eng`, but since this
846
+ * makes typing very hard to near impossible, it's safer to just accept an
847
+ * array with the languages it should look for.
848
+ * When talking about "hard typing/impossible typing" its because typescript
849
+ * itself cannot create recursive types, and chaining template types
850
+ * (e.g.: `${Language}+${Language}+...`) stretches out the compilation time
851
+ * to an unacceptable amount
852
+ *
853
+ * @default [Language.osd]
843
854
  */
844
- lang?: Language[];
855
+ langs?: Language[];
856
+ /**
857
+ * Specify where the trainingdata is located
858
+ * Besides the datapath in general it is versioned to the
859
+ * version of tesseract
860
+ * @default '~/.cache/node-tesseract-ocr/tessdata'
861
+ */
862
+ cachePath?: string;
863
+ /**
864
+ * Explicit datapath for traineddata. Takes precedence over
865
+ * the `TESSDATA_PREFIX` environment variable.
866
+ */
867
+ dataPath?: string;
868
+ /**
869
+ * This will be called for every language that was specified in `langs`,
870
+ * it allows the user to be flexible about the training data's location
871
+ * Or if he needs to specify his own location for certain languages/custom languages
872
+ * IMPORTANT: Ensures that trainingdata will be downloaded from the following cdn
873
+ * in case they dont exist
874
+ * OEM_LSTM_ONLY => https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0_best_int
875
+ * NON OEM_LSTM_ONLY => https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0
876
+ * NOTE: Tesseract 5.x.x still uses the 4.x.x trainingdata
877
+ *
878
+ * @default true
879
+ */
880
+ ensureTraineddata?: boolean;
881
+ /**
882
+ * Optional progress callback for traineddata downloads.
883
+ */
884
+ progressCallback?: (info: TrainingDataDownloadProgress) => void;
845
885
  /**
846
886
  * OCR Engine Modes
847
887
  * The engine mode cannot be changed after creating the instance
848
888
  * If another mode is needed, its advised to create a new instance.
889
+ * @default OEM_DEFAULT
849
890
  * @throws {Error} Will throw an error when oem mode is below 0 or over 3
850
891
  */
851
892
  oem?: OcrEngineMode;
893
+ /**
894
+ * Controls if only non debug parameters will be set upon initialization
895
+ * @default false
896
+ */
852
897
  setOnlyNonDebugParams?: boolean;
898
+ /**
899
+ * Array of paths that point to their corresponding config files
900
+ * usually located in the `dataPath` location alongside the training data
901
+ */
853
902
  configs?: Array<string>;
903
+ /**
904
+ * Record of parameters that should be set upon initialization
905
+ * Consult the original documentation of tesseract on which variables
906
+ * can actually be set
907
+ */
854
908
  vars?: Partial<Record<keyof ConfigurationVariables, ConfigurationVariables[keyof ConfigurationVariables]>>;
855
909
  }
910
+ export interface TrainingDataDownloadProgress {
911
+ lang: Language;
912
+ url: string;
913
+ downloadedBytes: number;
914
+ totalBytes?: number;
915
+ percent?: number;
916
+ }
856
917
  export interface TesseractSetRectangleOptions {
857
918
  top: number;
858
919
  left: number;
@@ -913,6 +974,13 @@ export interface DetectOrientationScriptResult {
913
974
  */
914
975
  scriptConfidence: number;
915
976
  }
977
+ export type EnsureTrainedDataOptions = {
978
+ lang: Language;
979
+ cachePath: string;
980
+ dataPath: string;
981
+ downloadBaseUrl: string;
982
+ progressCallback?: (info: TrainingDataDownloadProgress) => void;
983
+ };
916
984
  export interface TesseractInstance {
917
985
  /**
918
986
  * Initialize the engine with the given options.
@@ -1063,7 +1131,8 @@ export type TesseractConstructor = new () => TesseractInstance;
1063
1131
  declare const NativeTesseract: TesseractConstructor;
1064
1132
  declare class Tesseract extends NativeTesseract {
1065
1133
  constructor();
1066
- init(options: TesseractInitOptions): Promise<void>;
1134
+ init(options?: TesseractInitOptions): Promise<void>;
1135
+ ensureTrainingData({ lang, dataPath, cachePath, downloadBaseUrl }: EnsureTrainedDataOptions, progressCallback?: (info: TrainingDataDownloadProgress) => void): Promise<string>;
1067
1136
  }
1068
1137
  export { Tesseract, NativeTesseract };
1069
1138
  export default Tesseract;
@@ -227,14 +227,24 @@ export const LogLevels = {
227
227
  FATAL: "50000",
228
228
  OFF: "2147483647",
229
229
  };
230
- const fs = require("node:fs");
231
- const path = require("node:path");
230
+ import { existsSync, createWriteStream } from "node:fs";
231
+ import { mkdir, rename, rm, copyFile } from "node:fs/promises";
232
+ import os from "node:os";
233
+ import path from "node:path";
234
+ import { Readable, Transform } from "node:stream";
235
+ import { pipeline } from "node:stream/promises";
236
+ import { createGunzip } from "node:zlib";
237
+ import { lock } from "proper-lockfile";
238
+ import { isValidTraineddata } from "./utils";
239
+ const TESSDATA4_BEST = (lang) => `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0_best_int/`;
240
+ const TESSDATA4 = (lang) => `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0/`;
241
+ const DEFAULT_CACHE_DIR = path.join(os.homedir(), ".cache", "node-tesseract-ocr", "tessdata");
232
242
  const rootFromSource = path.resolve(__dirname, "../../");
233
243
  const bindingOptionsFromSource = path.resolve(rootFromSource, "binding-options.js");
234
- const bindingOptionsPath = fs.existsSync(bindingOptionsFromSource)
244
+ const bindingOptionsPath = existsSync(bindingOptionsFromSource)
235
245
  ? bindingOptionsFromSource
236
246
  : path.resolve(process.cwd(), "binding-options.js");
237
- const prebuildRoot = fs.existsSync(bindingOptionsFromSource)
247
+ const prebuildRoot = existsSync(bindingOptionsFromSource)
238
248
  ? rootFromSource
239
249
  : process.cwd();
240
250
  const { Tesseract: NativeTesseract } = require("pkg-prebuilds")(prebuildRoot, require(bindingOptionsPath));
@@ -242,18 +252,116 @@ class Tesseract extends NativeTesseract {
242
252
  constructor() {
243
253
  super();
244
254
  }
245
- async init(options) {
246
- // scan train data for any files
247
- // check whether the requested langs are available/cached
248
- // if not
249
- // fetch traineddata from cdn
250
- // - add .lock file to downloaded file (while downloading, so other instances
251
- // can wait on it and dont have to download again)
252
- // - place into tesseract standard folder
253
- // if available
254
- // just go on with the init function of the native addon
255
+ async init(options = {}) {
256
+ options.langs ??= [];
257
+ options.ensureTraineddata ??= true;
258
+ options.cachePath ??= DEFAULT_CACHE_DIR;
259
+ options.dataPath ??= process.env.TESSDATA_PREFIX ?? options.cachePath;
260
+ options.progressCallback ??= undefined;
261
+ const cachePath = path.resolve(options.cachePath);
262
+ const dataPath = path.resolve(options.dataPath);
263
+ if (options.ensureTraineddata) {
264
+ for (const lang of [...options.langs, Language.osd]) {
265
+ const downloadBaseUrl = options.oem === OcrEngineModes.OEM_LSTM_ONLY
266
+ ? TESSDATA4_BEST(lang)
267
+ : TESSDATA4(lang);
268
+ lang &&
269
+ (await this.ensureTrainingData({ lang, dataPath, cachePath, downloadBaseUrl }, options.progressCallback));
270
+ }
271
+ }
255
272
  return super.init(options);
256
273
  }
274
+ async ensureTrainingData({ lang, dataPath, cachePath, downloadBaseUrl }, progressCallback) {
275
+ const traineddataPath = path.join(dataPath, `${lang}.traineddata`);
276
+ const cacheTraineddataPath = path.join(cachePath, `${lang}.traineddata`);
277
+ if (await isValidTraineddata(cacheTraineddataPath)) {
278
+ if (traineddataPath !== cacheTraineddataPath) {
279
+ await mkdir(dataPath, { recursive: true });
280
+ await copyFile(cacheTraineddataPath, traineddataPath);
281
+ }
282
+ return traineddataPath;
283
+ }
284
+ if (await isValidTraineddata(traineddataPath)) {
285
+ return traineddataPath;
286
+ }
287
+ await mkdir(dataPath, { recursive: true });
288
+ const release = await lock(traineddataPath, {
289
+ lockfilePath: `${traineddataPath}.lock`,
290
+ stale: 10 * 60 * 1000,
291
+ update: 30 * 1000,
292
+ realpath: false,
293
+ retries: { retries: 50, minTimeout: 200, maxTimeout: 2000 },
294
+ });
295
+ try {
296
+ if (await isValidTraineddata(traineddataPath)) {
297
+ return traineddataPath;
298
+ }
299
+ if (traineddataPath !== cacheTraineddataPath &&
300
+ (await isValidTraineddata(cacheTraineddataPath))) {
301
+ await copyFile(cacheTraineddataPath, traineddataPath);
302
+ return traineddataPath;
303
+ }
304
+ const url = new URL(`${lang}.traineddata.gz`, downloadBaseUrl).toString();
305
+ const response = await fetch(url);
306
+ if (!response.ok || !response.body) {
307
+ throw new Error(`Failed to download traineddata for ${lang}: ${response.status} ${response.statusText}`);
308
+ }
309
+ const tmpPath = path.join(os.tmpdir(), [
310
+ "node-tesseract-ocr",
311
+ lang,
312
+ "traineddata",
313
+ process.pid,
314
+ Date.now(),
315
+ Math.random().toString(36).slice(2),
316
+ ].join("-"));
317
+ const totalBytesHeader = response.headers.get("content-length");
318
+ const totalBytes = totalBytesHeader
319
+ ? Number(totalBytesHeader)
320
+ : undefined;
321
+ let downloadedBytes = 0;
322
+ const progressStream = new Transform({
323
+ transform(chunk, _, callback) {
324
+ if (progressCallback) {
325
+ downloadedBytes += chunk.length;
326
+ const percent = typeof totalBytes === "number" && Number.isFinite(totalBytes)
327
+ ? (downloadedBytes / totalBytes) * 100
328
+ : undefined;
329
+ progressCallback({
330
+ lang,
331
+ url,
332
+ downloadedBytes,
333
+ totalBytes: Number.isFinite(totalBytes) ? totalBytes : undefined,
334
+ percent,
335
+ });
336
+ }
337
+ callback(null, chunk);
338
+ },
339
+ });
340
+ try {
341
+ await pipeline(Readable.fromWeb(response.body), progressStream, createGunzip(), createWriteStream(tmpPath));
342
+ try {
343
+ await rename(tmpPath, traineddataPath);
344
+ }
345
+ catch (error) {
346
+ if (error.code === "EXDEV") {
347
+ await copyFile(tmpPath, traineddataPath);
348
+ await rm(tmpPath, { force: true });
349
+ }
350
+ else {
351
+ throw error;
352
+ }
353
+ }
354
+ }
355
+ catch (error) {
356
+ await rm(tmpPath, { force: true });
357
+ throw error;
358
+ }
359
+ return traineddataPath;
360
+ }
361
+ finally {
362
+ await release();
363
+ }
364
+ }
257
365
  }
258
366
  export { Tesseract, NativeTesseract };
259
367
  export default Tesseract;
@@ -0,0 +1 @@
1
+ export declare const isValidTraineddata: (filePath: string) => Promise<boolean>; // Type declaration: accepts a file path and resolves to a boolean validity flag.
@@ -0,0 +1,10 @@
1
+ import { stat } from "node:fs/promises";
2
+ export const isValidTraineddata = async (filePath) => { // Resolves to true only when filePath is a regular, non-empty file; never rejects.
3
+ try {
4
+ const info = await stat(filePath); // stat() rejects when the path cannot be stat'ed (e.g. it does not exist); handled by the catch below.
5
+ return info.isFile() && info.size > 0; // Only file type and size are checked; the file's contents are not inspected.
6
+ }
7
+ catch { // Any stat() failure is reported as "not valid" instead of being propagated to the caller.
8
+ return false;
9
+ }
10
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@luii/node-tesseract-ocr",
3
- "version": "2.0.0",
3
+ "version": "2.1.0",
4
4
  "private": false,
5
5
  "binary": {
6
6
  "napi_versions": [
@@ -63,15 +63,17 @@
63
63
  "LICENSE.md"
64
64
  ],
65
65
  "devDependencies": {
66
- "vitest": "^2.1.9",
67
- "@types/node": "^22.0.0",
68
- "typescript": "^5.6.0"
69
- },
70
- "dependencies": {
71
- "cmake-js": "^7.4.0",
66
+ "cmake-js": "^8.0.0",
72
67
  "node-addon-api": "^8.5.0",
68
+ "@types/node": "^25.1.0",
69
+ "@types/proper-lockfile": "^4.1.4",
73
70
  "dotenv": "^16.4.5",
74
- "pkg-prebuilds": "^1.0.0"
71
+ "typescript": "^5.6.0",
72
+ "vitest": "^4.0.18"
73
+ },
74
+ "dependencies": {
75
+ "pkg-prebuilds": "^1.0.0",
76
+ "proper-lockfile": "^4.1.2"
75
77
  },
76
78
  "exports": {
77
79
  "require": {
package/src/commands.hpp CHANGED
@@ -18,7 +18,6 @@
18
18
 
19
19
  #include "monitor.hpp"
20
20
  #include "utils.hpp"
21
- #include <iostream>
22
21
  #include <memory>
23
22
  #include <napi.h>
24
23
  #include <optional>
@@ -94,16 +94,29 @@ Napi::Value TesseractWrapper::Init(const Napi::CallbackInfo &info) {
94
94
  auto options = info[0].As<Napi::Object>();
95
95
  CommandInit command{};
96
96
 
97
- const Napi::Value langOption = options.Get("lang");
98
- if (!langOption.IsUndefined()) {
99
- if (!langOption.IsArray()) {
97
+ const Napi::Value dataPathOption = options.Get("dataPath");
98
+ if (!dataPathOption.IsUndefined()) {
99
+ if (!dataPathOption.IsString()) {
100
100
  deferred.Reject(
101
- Napi::TypeError::New(env, "Option 'lang' must be a array of strings")
101
+ Napi::TypeError::New(env, "Option 'dataPath' must be a string")
102
102
  .Value());
103
103
  return deferred.Promise();
104
104
  }
105
105
 
106
- Napi::Array languages = langOption.As<Napi::Array>();
106
+ Napi::String dataPath = dataPathOption.As<Napi::String>();
107
+ command.data_path = dataPath.Utf8Value();
108
+ }
109
+
110
+ const Napi::Value langsOption = options.Get("langs");
111
+ if (!langsOption.IsUndefined()) {
112
+ if (!langsOption.IsArray()) {
113
+ deferred.Reject(
114
+ Napi::TypeError::New(env, "Option 'langs' must be a array of strings")
115
+ .Value());
116
+ return deferred.Promise();
117
+ }
118
+
119
+ Napi::Array languages = langsOption.As<Napi::Array>();
107
120
  std::string language;
108
121
 
109
122
  for (uint32_t i = 0; i < languages.Length(); ++i) {
@@ -65,6 +65,4 @@ private:
65
65
 
66
66
  Napi::Env _env;
67
67
  WorkerThread _worker_thread;
68
-
69
- const std::string _data_path = std::getenv("TESSDATA_PREFIX");
70
68
  };