@luii/node-tesseract-ocr 2.0.13 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -3
- package/README.md +547 -153
- package/binding-options.js +4 -0
- package/dist/cjs/index.cjs +144 -18
- package/dist/cjs/index.d.ts +6 -859
- package/dist/cjs/types.d.ts +1272 -0
- package/dist/cjs/types.js +17 -0
- package/dist/cjs/utils.d.ts +1 -0
- package/dist/cjs/utils.js +38 -0
- package/dist/esm/index.d.ts +6 -859
- package/dist/esm/index.mjs +129 -14
- package/dist/esm/types.d.ts +1272 -0
- package/dist/esm/types.js +16 -0
- package/dist/esm/utils.d.ts +1 -0
- package/dist/esm/utils.js +25 -0
- package/package.json +15 -10
- package/prebuilds/node-tesseract-ocr-darwin-arm64/node-napi-v10.node +0 -0
- package/prebuilds/node-tesseract-ocr-linux-x64/node-napi-v10.node +0 -0
- package/src/commands.hpp +656 -88
- package/src/tesseract_wrapper.cpp +643 -187
- package/src/tesseract_wrapper.hpp +27 -4
- package/src/worker_thread.cpp +146 -2
- package/src/worker_thread.hpp +4 -1
package/dist/esm/index.mjs
CHANGED
|
@@ -13,6 +13,15 @@
|
|
|
13
13
|
* or implied. See the License for the specific language governing
|
|
14
14
|
* permissions and limitations under the License.
|
|
15
15
|
*/
|
|
16
|
+
import { existsSync, createWriteStream } from "node:fs";
|
|
17
|
+
import { mkdir, rename, rm, copyFile } from "node:fs/promises";
|
|
18
|
+
import os from "node:os";
|
|
19
|
+
import path from "node:path";
|
|
20
|
+
import { Readable, Transform } from "node:stream";
|
|
21
|
+
import { pipeline } from "node:stream/promises";
|
|
22
|
+
import { createGunzip } from "node:zlib";
|
|
23
|
+
import { lock } from "proper-lockfile";
|
|
24
|
+
import { isValidTraineddata } from "./utils";
|
|
16
25
|
/**
|
|
17
26
|
* All available languages for tesseract
|
|
18
27
|
* @readonly
|
|
@@ -227,33 +236,139 @@ export const LogLevels = {
|
|
|
227
236
|
FATAL: "50000",
|
|
228
237
|
OFF: "2147483647",
|
|
229
238
|
};
|
|
230
|
-
const
|
|
231
|
-
const
|
|
239
|
+
/**
 * Builds the jsDelivr base URL of the `4.0.0_best_int` traineddata directory
 * published in the `@tesseract.js-data/<lang>` package.
 *
 * @param {string} lang - Language code used as the package name segment.
 * @returns {string} Base URL, ending in a slash.
 */
const TESSDATA4_BEST = (lang) => {
    return `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0_best_int/`;
};

/**
 * Builds the jsDelivr base URL of the `4.0.0` traineddata directory
 * published in the `@tesseract.js-data/<lang>` package.
 *
 * @param {string} lang - Language code used as the package name segment.
 * @returns {string} Base URL, ending in a slash.
 */
const TESSDATA4 = (lang) => {
    return `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0/`;
};

/** Default traineddata cache directory, located under the user's home directory. */
const DEFAULT_CACHE_DIR = path.join(
    ...[os.homedir(), ".cache", "node-tesseract-ocr", "tessdata"],
);
|
|
232
242
|
// Locate the package root relative to this compiled file and the
// binding-options.js that is expected to sit next to it.
const rootFromSource = path.resolve(__dirname, "../../");
const bindingOptionsFromSource = path.resolve(rootFromSource, "binding-options.js");

// When binding-options.js is not found at the package root, fall back to the
// current working directory for both the options file and the prebuild lookup.
const [bindingOptionsPath, prebuildRoot] = existsSync(bindingOptionsFromSource)
    ? [bindingOptionsFromSource, rootFromSource]
    : [path.resolve(process.cwd(), "binding-options.js"), process.cwd()];

// Load the native addon through pkg-prebuilds using the resolved options.
const { Tesseract: NativeTesseract } = require("pkg-prebuilds")(
    prebuildRoot,
    require(bindingOptionsPath),
);
|
|
241
251
|
/**
 * Wrapper around the native Tesseract binding that makes sure the required
 * `.traineddata` files exist (copying them from a cache or downloading them)
 * before the native engine is initialised.
 */
class Tesseract extends NativeTesseract {
    /**
     * Multi-page processing API. Each entry is one of the `*ProcessPages`
     * methods reached through `this` (not defined in this class body),
     * bound to this instance.
     */
    document = {
        begin: this.beginProcessPages.bind(this),
        addPage: this.addProcessPage.bind(this),
        finish: this.finishProcessPages.bind(this),
        abort: this.abortProcessPages.bind(this),
        status: this.getProcessPagesStatus.bind(this),
    };

    constructor() {
        super();
    }

    /**
     * Resolves option defaults, optionally provisions traineddata for every
     * requested language plus `Language.osd`, then initialises the native engine.
     *
     * The caller's `options` object is never modified: defaults are applied to a
     * shallow copy, so frozen or shared configuration objects are safe to pass.
     *
     * @param {object} [options] - Initialisation options.
     * @param {string[]} [options.langs=[]] - Languages to provision.
     * @param {boolean} [options.ensureTraineddata=true] - Whether to provision traineddata.
     * @param {string} [options.cachePath] - Cache directory; defaults to `DEFAULT_CACHE_DIR`.
     * @param {string} [options.dataPath] - Directory holding the traineddata; defaults to
     *   `process.env.TESSDATA_PREFIX`, then to `cachePath`.
     * @param {Function} [options.progressCallback] - Receives download progress objects.
     * @param {*} [options.oem] - When equal to `OcrEngineModes.OEM_LSTM_ONLY` the
     *   `4.0.0_best_int` data set is downloaded, otherwise the `4.0.0` one.
     * @returns {Promise<*>} Whatever the native `init` resolves to.
     */
    async init(options = {}) {
        // Work on a copy: assigning defaults onto the caller's object throws for
        // frozen objects and leaks resolved paths into reused configuration.
        const settings = { ...options };
        settings.langs ??= [];
        settings.ensureTraineddata ??= true;
        settings.cachePath ??= DEFAULT_CACHE_DIR;
        settings.dataPath ??= process.env.TESSDATA_PREFIX ?? settings.cachePath;
        settings.progressCallback ??= undefined;

        const cachePath = path.resolve(settings.cachePath);
        const dataPath = path.resolve(settings.dataPath);

        if (settings.ensureTraineddata) {
            for (const lang of [...settings.langs, Language.osd]) {
                if (!lang) {
                    continue;
                }
                const downloadBaseUrl = settings.oem === OcrEngineModes.OEM_LSTM_ONLY
                    ? TESSDATA4_BEST(lang)
                    : TESSDATA4(lang);
                await this.ensureTrainingData(
                    { lang, dataPath, cachePath, downloadBaseUrl },
                    settings.progressCallback,
                );
            }
        }
        return super.init(settings);
    }

    /**
     * Makes sure `<dataPath>/<lang>.traineddata` exists and passes
     * `isValidTraineddata`, using (in order) the cache copy, the existing file,
     * or a gzip download from `downloadBaseUrl`.
     *
     * @param {object} target
     * @param {string} target.lang - Language code.
     * @param {string} target.dataPath - Directory the traineddata must end up in.
     * @param {string} target.cachePath - Directory checked for a cached copy.
     * @param {string} target.downloadBaseUrl - Base URL `<lang>.traineddata.gz` is resolved against.
     * @param {Function} [progressCallback] - Called per downloaded chunk with
     *   `{ lang, url, downloadedBytes, totalBytes, percent }`; `totalBytes` and
     *   `percent` are `undefined` when the server reports no usable size.
     * @returns {Promise<string>} Path of the traineddata file inside `dataPath`.
     * @throws {Error} When the download response is not OK or has no body; file
     *   system, locking and stream errors are propagated unchanged.
     */
    async ensureTrainingData({ lang, dataPath, cachePath, downloadBaseUrl }, progressCallback) {
        const traineddataPath = path.join(dataPath, `${lang}.traineddata`);
        const cacheTraineddataPath = path.join(cachePath, `${lang}.traineddata`);

        // A valid cache entry takes precedence and is (re)copied into dataPath.
        // NOTE(review): this copy runs without the lock and overwrites an existing
        // file in dataPath on every call - kept as-is, confirm it is intended.
        if (await isValidTraineddata(cacheTraineddataPath)) {
            if (traineddataPath !== cacheTraineddataPath) {
                await mkdir(dataPath, { recursive: true });
                await copyFile(cacheTraineddataPath, traineddataPath);
            }
            return traineddataPath;
        }
        if (await isValidTraineddata(traineddataPath)) {
            return traineddataPath;
        }

        await mkdir(dataPath, { recursive: true });
        const release = await lock(traineddataPath, {
            lockfilePath: `${traineddataPath}.lock`,
            stale: 10 * 60 * 1000,
            update: 30 * 1000,
            realpath: false,
            retries: { retries: 50, minTimeout: 200, maxTimeout: 2000 },
        });
        try {
            // Re-check under the lock: another process may have finished meanwhile.
            if (await isValidTraineddata(traineddataPath)) {
                return traineddataPath;
            }
            if (traineddataPath !== cacheTraineddataPath &&
                (await isValidTraineddata(cacheTraineddataPath))) {
                await Tesseract.#copyIntoPlace(cacheTraineddataPath, traineddataPath);
                return traineddataPath;
            }

            const url = new URL(`${lang}.traineddata.gz`, downloadBaseUrl).toString();
            const response = await fetch(url);
            if (!response.ok || !response.body) {
                // Best-effort: release the connection of the rejected response;
                // the download error below is the one the caller needs to see.
                await response.body?.cancel().catch(() => { });
                throw new Error(`Failed to download traineddata for ${lang}: ${response.status} ${response.statusText}`);
            }

            const tmpPath = path.join(os.tmpdir(), [
                "node-tesseract-ocr",
                lang,
                "traineddata",
                process.pid,
                Date.now(),
                Math.random().toString(36).slice(2),
            ].join("-"));

            // Only a positive, finite content-length is usable as a total;
            // anything else would turn `percent` into NaN or Infinity.
            const totalBytesHeader = response.headers.get("content-length");
            const declaredBytes = totalBytesHeader
                ? Number(totalBytesHeader)
                : undefined;
            const totalBytes = Number.isFinite(declaredBytes) && declaredBytes > 0
                ? declaredBytes
                : undefined;

            let downloadedBytes = 0;
            const progressStream = new Transform({
                transform(chunk, _, callback) {
                    if (progressCallback) {
                        downloadedBytes += chunk.length;
                        const percent = totalBytes === undefined
                            ? undefined
                            : Math.min(100, (downloadedBytes / totalBytes) * 100);
                        progressCallback({
                            lang,
                            url,
                            downloadedBytes,
                            totalBytes,
                            percent,
                        });
                    }
                    callback(null, chunk);
                },
            });

            try {
                await pipeline(Readable.fromWeb(response.body), progressStream, createGunzip(), createWriteStream(tmpPath));
                try {
                    await rename(tmpPath, traineddataPath);
                }
                catch (error) {
                    if (error.code !== "EXDEV") {
                        throw error;
                    }
                    // The temp dir is on another file system: stage next to the
                    // destination so the final step is still a single rename.
                    await Tesseract.#copyIntoPlace(tmpPath, traineddataPath);
                    await rm(tmpPath, { force: true });
                }
            }
            catch (error) {
                await rm(tmpPath, { force: true });
                throw error;
            }
            return traineddataPath;
        }
        finally {
            await release();
        }
    }

    /**
     * Copies `sourcePath` to a staging file beside `targetPath` and renames it
     * into place, so readers never observe a partially written target.
     * The staging file is removed again if either step fails.
     *
     * @param {string} sourcePath - File to copy.
     * @param {string} targetPath - Final destination path.
     * @returns {Promise<void>}
     */
    static async #copyIntoPlace(sourcePath, targetPath) {
        const stagingPath = `${targetPath}.${process.pid}.${Date.now()}.partial`;
        try {
            await copyFile(sourcePath, stagingPath);
            await rename(stagingPath, targetPath);
        }
        catch (error) {
            await rm(stagingPath, { force: true });
            throw error;
        }
    }
}
|
|
258
373
|
export { Tesseract, NativeTesseract };
|
|
259
374
|
export default Tesseract;
|