@luii/node-tesseract-ocr 1.0.19 → 2.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +45 -0
- package/README.md +457 -85
- package/dist/cjs/index.cjs +272 -16
- package/dist/cjs/index.d.ts +1069 -0
- package/dist/esm/index.d.ts +1069 -0
- package/dist/esm/index.mjs +257 -16
- package/package.json +32 -26
- package/prebuilds/node-tesseract-ocr-darwin-arm64/node-napi-v10.node +0 -0
- package/prebuilds/node-tesseract-ocr-linux-x64/node-napi-v10.node +0 -0
- package/src/addon.cpp +9 -24
- package/src/commands.hpp +489 -0
- package/src/monitor.hpp +81 -0
- package/src/tesseract_wrapper.cpp +714 -0
- package/src/tesseract_wrapper.hpp +70 -0
- package/src/utils.hpp +8 -0
- package/src/worker_thread.cpp +141 -0
- package/src/worker_thread.hpp +79 -0
- package/binding.gyp +0 -60
- package/dist/index.d.ts +0 -349
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
- package/src/handle.cpp +0 -174
- package/src/handle.h +0 -57
- package/src/ocr_result.cpp +0 -99
- package/src/ocr_result.h +0 -47
- package/src/ocr_worker.cpp +0 -191
- package/src/ocr_worker.h +0 -67
package/dist/esm/index.mjs
CHANGED
|
@@ -1,18 +1,259 @@
|
|
|
1
1
|
/*
|
|
2
|
-
* Copyright 2025 Philipp Czarnetzki
|
|
3
|
-
*
|
|
4
|
-
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
* you may not use this file except in compliance with the License.
|
|
6
|
-
* You may obtain a copy of the License at
|
|
7
|
-
*
|
|
8
|
-
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
*
|
|
10
|
-
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
13
|
-
* or implied. See the License for the specific language governing
|
|
14
|
-
* permissions and limitations under the License.
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
|
|
2
|
+
* Copyright 2025 Philipp Czarnetzki
|
|
3
|
+
*
|
|
4
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
* you may not use this file except in compliance with the License.
|
|
6
|
+
* You may obtain a copy of the License at
|
|
7
|
+
*
|
|
8
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
*
|
|
10
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
13
|
+
* or implied. See the License for the specific language governing
|
|
14
|
+
* permissions and limitations under the License.
|
|
15
|
+
*/
|
|
16
|
+
/**
|
|
17
|
+
* All available languages for tesseract
|
|
18
|
+
* @readonly
|
|
19
|
+
* @enum {string}
|
|
20
|
+
*/
|
|
21
|
+
export const Language = {
|
|
22
|
+
afr: "afr",
|
|
23
|
+
amh: "amh",
|
|
24
|
+
ara: "ara",
|
|
25
|
+
asm: "asm",
|
|
26
|
+
aze: "aze",
|
|
27
|
+
aze_cyrl: "aze_cyrl",
|
|
28
|
+
bel: "bel",
|
|
29
|
+
ben: "ben",
|
|
30
|
+
bod: "bod",
|
|
31
|
+
bos: "bos",
|
|
32
|
+
bre: "bre",
|
|
33
|
+
bul: "bul",
|
|
34
|
+
cat: "cat",
|
|
35
|
+
ceb: "ceb",
|
|
36
|
+
ces: "ces",
|
|
37
|
+
chi_sim: "chi_sim",
|
|
38
|
+
chi_tra: "chi_tra",
|
|
39
|
+
chr: "chr",
|
|
40
|
+
cos: "cos",
|
|
41
|
+
cym: "cym",
|
|
42
|
+
dan: "dan",
|
|
43
|
+
deu: "deu",
|
|
44
|
+
deu_latf: "deu_latf",
|
|
45
|
+
div: "div",
|
|
46
|
+
dzo: "dzo",
|
|
47
|
+
ell: "ell",
|
|
48
|
+
eng: "eng",
|
|
49
|
+
enm: "enm",
|
|
50
|
+
epo: "epo",
|
|
51
|
+
equ: "equ",
|
|
52
|
+
est: "est",
|
|
53
|
+
eus: "eus",
|
|
54
|
+
fao: "fao",
|
|
55
|
+
fas: "fas",
|
|
56
|
+
fil: "fil",
|
|
57
|
+
fin: "fin",
|
|
58
|
+
fra: "fra",
|
|
59
|
+
frm: "frm",
|
|
60
|
+
fry: "fry",
|
|
61
|
+
gla: "gla",
|
|
62
|
+
gle: "gle",
|
|
63
|
+
glg: "glg",
|
|
64
|
+
grc: "grc",
|
|
65
|
+
guj: "guj",
|
|
66
|
+
hat: "hat",
|
|
67
|
+
heb: "heb",
|
|
68
|
+
hin: "hin",
|
|
69
|
+
hrv: "hrv",
|
|
70
|
+
hun: "hun",
|
|
71
|
+
hye: "hye",
|
|
72
|
+
iku: "iku",
|
|
73
|
+
ind: "ind",
|
|
74
|
+
isl: "isl",
|
|
75
|
+
ita: "ita",
|
|
76
|
+
ita_old: "ita_old",
|
|
77
|
+
jav: "jav",
|
|
78
|
+
jpn: "jpn",
|
|
79
|
+
kan: "kan",
|
|
80
|
+
kat: "kat",
|
|
81
|
+
kat_old: "kat_old",
|
|
82
|
+
kaz: "kaz",
|
|
83
|
+
khm: "khm",
|
|
84
|
+
kir: "kir",
|
|
85
|
+
kmr: "kmr",
|
|
86
|
+
kor: "kor",
|
|
87
|
+
kor_vert: "kor_vert",
|
|
88
|
+
kur: "kur",
|
|
89
|
+
lao: "lao",
|
|
90
|
+
lat: "lat",
|
|
91
|
+
lav: "lav",
|
|
92
|
+
lit: "lit",
|
|
93
|
+
ltz: "ltz",
|
|
94
|
+
mal: "mal",
|
|
95
|
+
mar: "mar",
|
|
96
|
+
mkd: "mkd",
|
|
97
|
+
mlt: "mlt",
|
|
98
|
+
mon: "mon",
|
|
99
|
+
mri: "mri",
|
|
100
|
+
msa: "msa",
|
|
101
|
+
mya: "mya",
|
|
102
|
+
nep: "nep",
|
|
103
|
+
nld: "nld",
|
|
104
|
+
nor: "nor",
|
|
105
|
+
oci: "oci",
|
|
106
|
+
ori: "ori",
|
|
107
|
+
osd: "osd",
|
|
108
|
+
pan: "pan",
|
|
109
|
+
pol: "pol",
|
|
110
|
+
por: "por",
|
|
111
|
+
pus: "pus",
|
|
112
|
+
que: "que",
|
|
113
|
+
ron: "ron",
|
|
114
|
+
rus: "rus",
|
|
115
|
+
san: "san",
|
|
116
|
+
sin: "sin",
|
|
117
|
+
slk: "slk",
|
|
118
|
+
slv: "slv",
|
|
119
|
+
snd: "snd",
|
|
120
|
+
spa: "spa",
|
|
121
|
+
spa_old: "spa_old",
|
|
122
|
+
sqi: "sqi",
|
|
123
|
+
srp: "srp",
|
|
124
|
+
srp_latn: "srp_latn",
|
|
125
|
+
sun: "sun",
|
|
126
|
+
swa: "swa",
|
|
127
|
+
swe: "swe",
|
|
128
|
+
syr: "syr",
|
|
129
|
+
tam: "tam",
|
|
130
|
+
tat: "tat",
|
|
131
|
+
tel: "tel",
|
|
132
|
+
tgk: "tgk",
|
|
133
|
+
tha: "tha",
|
|
134
|
+
tir: "tir",
|
|
135
|
+
ton: "ton",
|
|
136
|
+
tur: "tur",
|
|
137
|
+
uig: "uig",
|
|
138
|
+
ukr: "ukr",
|
|
139
|
+
urd: "urd",
|
|
140
|
+
uzb: "uzb",
|
|
141
|
+
uzb_cyrl: "uzb_cyrl",
|
|
142
|
+
vie: "vie",
|
|
143
|
+
yid: "yid",
|
|
144
|
+
yor: "yor",
|
|
145
|
+
};
|
|
146
|
+
/**
|
|
147
|
+
* When Tesseract/Cube is initialized we can choose to instantiate/load/run
|
|
148
|
+
* only the Tesseract part, only the Cube part or both along with the combiner.
|
|
149
|
+
* The preference of which engine to use is stored in tessedit_ocr_engine_mode.
|
|
150
|
+
* @readonly
|
|
151
|
+
* @enum {number}
|
|
152
|
+
*/
|
|
153
|
+
export const OcrEngineModes = {
|
|
154
|
+
/**
|
|
155
|
+
* Run Tesseract only - fastest
|
|
156
|
+
* @deprecated
|
|
157
|
+
* @type {number}
|
|
158
|
+
*/
|
|
159
|
+
OEM_TESSERACT_ONLY: 0,
|
|
160
|
+
/**
|
|
161
|
+
* Run just the LSTM line recognizer.
|
|
162
|
+
* @type {number}
|
|
163
|
+
*/
|
|
164
|
+
OEM_LSTM_ONLY: 1,
|
|
165
|
+
/**
|
|
166
|
+
* Run the LSTM recognizer, but allow fallback
|
|
167
|
+
* to Tesseract when things get difficult.
|
|
168
|
+
* @deprecated
|
|
169
|
+
* @type {number}
|
|
170
|
+
*/
|
|
171
|
+
OEM_TESSERACT_LSTM_COMBINED: 2,
|
|
172
|
+
/**
|
|
173
|
+
* Specify this mode when calling init(),
|
|
174
|
+
* to indicate that any of the above modes
|
|
175
|
+
* should be automatically inferred from the
|
|
176
|
+
* variables in the language-specific config,
|
|
177
|
+
* command-line configs, or if not specified
|
|
178
|
+
* in any of the above should be set to the
|
|
179
|
+
* default OEM_TESSERACT_ONLY.
|
|
180
|
+
* @type {number}
|
|
181
|
+
* @default
|
|
182
|
+
*/
|
|
183
|
+
OEM_DEFAULT: 3,
|
|
184
|
+
};
|
|
185
|
+
/**
|
|
186
|
+
* Possible modes for page layout analysis.
|
|
187
|
+
* @readonly
|
|
188
|
+
* @enum {number}
|
|
189
|
+
*/
|
|
190
|
+
export const PageSegmentationModes = {
|
|
191
|
+
// Orientation and script detection only.
|
|
192
|
+
PSM_OSD_ONLY: 0,
|
|
193
|
+
// Automatic page segmentation with orientation and script detection. (OSD)
|
|
194
|
+
PSM_AUTO_OSD: 1,
|
|
195
|
+
// Automatic page segmentation, but no OSD or OCR.
|
|
196
|
+
PSM_AUTO_ONLY: 2,
|
|
197
|
+
// Fully automatic page segmentation, but no OSD.
|
|
198
|
+
PSM_AUTO: 3,
|
|
199
|
+
// Assume a single column of text of variable sizes.
|
|
200
|
+
PSM_SINGLE_COLUMN: 4,
|
|
201
|
+
// Assume a single uniform block of vertically aligned text.
|
|
202
|
+
PSM_SINGLE_BLOCK_VERT_TEXT: 5,
|
|
203
|
+
// Assume a single uniform block of text. (Default.)
|
|
204
|
+
PSM_SINGLE_BLOCK: 6,
|
|
205
|
+
// Treat the image as a single text line.
|
|
206
|
+
PSM_SINGLE_LINE: 7,
|
|
207
|
+
// Treat the image as a single word.
|
|
208
|
+
PSM_SINGLE_WORD: 8,
|
|
209
|
+
// Treat the image as a single word in a circle.
|
|
210
|
+
PSM_CIRCLE_WORD: 9,
|
|
211
|
+
// Treat the image as a single character.
|
|
212
|
+
PSM_SINGLE_CHAR: 10,
|
|
213
|
+
// Find as much text as possible in no particular order.
|
|
214
|
+
PSM_SPARSE_TEXT: 11,
|
|
215
|
+
// Sparse text with orientation and script detection (OSD).
|
|
216
|
+
PSM_SPARSE_TEXT_OSD: 12,
|
|
217
|
+
// Treat the image as a single text line, bypassing hacks that are Tesseract-specific.
|
|
218
|
+
PSM_RAW_LINE: 13,
|
|
219
|
+
};
|
|
220
|
+
export const LogLevels = {
|
|
221
|
+
ALL: "-2147483648",
|
|
222
|
+
TRACE: "5000",
|
|
223
|
+
DEBUG: "10000",
|
|
224
|
+
INFO: "20000",
|
|
225
|
+
WARN: "30000",
|
|
226
|
+
ERROR: "40000",
|
|
227
|
+
FATAL: "50000",
|
|
228
|
+
OFF: "2147483647",
|
|
229
|
+
};
|
|
230
|
+
const fs = require("node:fs");
|
|
231
|
+
const path = require("node:path");
|
|
232
|
+
const rootFromSource = path.resolve(__dirname, "../../");
|
|
233
|
+
const bindingOptionsFromSource = path.resolve(rootFromSource, "binding-options.js");
|
|
234
|
+
const bindingOptionsPath = fs.existsSync(bindingOptionsFromSource)
|
|
235
|
+
? bindingOptionsFromSource
|
|
236
|
+
: path.resolve(process.cwd(), "binding-options.js");
|
|
237
|
+
const prebuildRoot = fs.existsSync(bindingOptionsFromSource)
|
|
238
|
+
? rootFromSource
|
|
239
|
+
: process.cwd();
|
|
240
|
+
const { Tesseract: NativeTesseract } = require("pkg-prebuilds")(prebuildRoot, require(bindingOptionsPath));
|
|
241
|
+
class Tesseract extends NativeTesseract {
|
|
242
|
+
constructor() {
|
|
243
|
+
super();
|
|
244
|
+
}
|
|
245
|
+
async init(options) {
|
|
246
|
+
// scan train data for any files
|
|
247
|
+
// check whether the requested langs are available/cached
|
|
248
|
+
// if not
|
|
249
|
+
// fetch traineddata from cdn
|
|
250
|
+
// - add .lock file to downloaded file (while downloading, so other instances
|
|
251
|
+
// can wait on it and don't have to download again)
|
|
252
|
+
// - place into tesseract standard folder
|
|
253
|
+
// if available
|
|
254
|
+
// just go on with the init function of the native addon
|
|
255
|
+
return super.init(options);
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
export { Tesseract, NativeTesseract };
|
|
18
259
|
export default Tesseract;
|
package/package.json
CHANGED
|
@@ -1,10 +1,17 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@luii/node-tesseract-ocr",
|
|
3
|
-
"version": "1.0.19",
|
|
3
|
+
"version": "2.0.13",
|
|
4
4
|
"private": false,
|
|
5
|
-
"
|
|
5
|
+
"binary": {
|
|
6
|
+
"napi_versions": [
|
|
7
|
+
10
|
|
8
|
+
]
|
|
9
|
+
},
|
|
6
10
|
"main": "dist/cjs/index.cjs",
|
|
7
11
|
"module": "dist/esm/index.mjs",
|
|
12
|
+
"engines": {
|
|
13
|
+
"node": ">=22.14.0"
|
|
14
|
+
},
|
|
8
15
|
"types": "dist/index.d.ts",
|
|
9
16
|
"homepage": "https://github.com/luii/node-tesseract-ocr",
|
|
10
17
|
"repository": {
|
|
@@ -29,52 +36,51 @@
|
|
|
29
36
|
"ocr",
|
|
30
37
|
"tesseract",
|
|
31
38
|
"leptonica",
|
|
32
|
-
"
|
|
39
|
+
"cmake-js",
|
|
33
40
|
"node-addon-api"
|
|
34
41
|
],
|
|
35
42
|
"publishConfig": {
|
|
36
43
|
"access": "public"
|
|
37
44
|
},
|
|
38
45
|
"scripts": {
|
|
39
|
-
"install": "
|
|
40
|
-
"
|
|
41
|
-
"build": "npm run build:
|
|
42
|
-
"build:
|
|
43
|
-
"
|
|
44
|
-
"
|
|
45
|
-
"
|
|
46
|
-
"
|
|
47
|
-
"build:esm": "tsc -p tsconfig.esm.json && mv dist/esm/index.js dist/esm/index.mjs",
|
|
48
|
-
"build:debug": "rm -rf dist && mkdir dist && npm run build:addon:debug && npm run build:cjs && npm run build:esm",
|
|
49
|
-
"build:release": "rm -rf dist && mkdir dist && npm run build:addon:release && npm run build:cjs && npm run build:esm",
|
|
50
|
-
"build:examples": "npm run build && npx tsc -p tsconfig.examples.json",
|
|
51
|
-
"example:recognize": "npm run build:examples && node dist/examples/recognize.js"
|
|
46
|
+
"install": "cmake-js compile",
|
|
47
|
+
"build:ts": "tsc -p tsconfig.cjs.json && tsc -p tsconfig.esm.json && mkdir -p dist && mv dist/cjs/index.js dist/cjs/index.cjs && mv dist/esm/index.js dist/esm/index.mjs",
|
|
48
|
+
"build:debug": "cmake-js compile --debug && npm run build:ts",
|
|
49
|
+
"build:release": "cmake-js compile --release && npm run build:ts",
|
|
50
|
+
"example:recognize": "npm run build:debug && tsc -p tsconfig.examples.json && node -r dotenv/config dist/examples/recognize.js dotenv_config_path=.env.local",
|
|
51
|
+
"test:cpp": "cmake-js compile --release && ./build/release/node-tesseract-ocr-tests",
|
|
52
|
+
"test:js": "vitest run",
|
|
53
|
+
"test:js:watch": "vitest"
|
|
52
54
|
},
|
|
53
55
|
"files": [
|
|
54
56
|
"dist/**",
|
|
55
57
|
"prebuilds/**",
|
|
56
58
|
"src/**",
|
|
57
|
-
"build/
|
|
59
|
+
"build/release/*.node",
|
|
58
60
|
"package.json",
|
|
59
|
-
"
|
|
61
|
+
"CMakeLists.txt",
|
|
60
62
|
"README.md",
|
|
61
63
|
"LICENSE.md"
|
|
62
64
|
],
|
|
63
65
|
"devDependencies": {
|
|
66
|
+
"vitest": "^2.1.9",
|
|
64
67
|
"@types/node": "^22.0.0",
|
|
65
|
-
"node-addon-api": "^8.5.0",
|
|
66
|
-
"prebuildify": "^5.0.0",
|
|
67
|
-
"node-gyp": "^10.0.0",
|
|
68
68
|
"typescript": "^5.6.0"
|
|
69
69
|
},
|
|
70
70
|
"dependencies": {
|
|
71
|
-
"
|
|
71
|
+
"cmake-js": "^7.4.0",
|
|
72
|
+
"node-addon-api": "^8.5.0",
|
|
73
|
+
"dotenv": "^16.4.5",
|
|
74
|
+
"pkg-prebuilds": "^1.0.0"
|
|
72
75
|
},
|
|
73
76
|
"exports": {
|
|
74
|
-
"
|
|
75
|
-
"
|
|
76
|
-
"
|
|
77
|
-
|
|
77
|
+
"require": {
|
|
78
|
+
"types": "./dist/cjs/index.d.ts",
|
|
79
|
+
"default": "./dist/cjs/index.cjs"
|
|
80
|
+
},
|
|
81
|
+
"import": {
|
|
82
|
+
"types": "./dist/esm/index.d.ts",
|
|
83
|
+
"default": "./dist/esm/index.mjs"
|
|
78
84
|
}
|
|
79
85
|
}
|
|
80
86
|
}
|
|
Binary file
|
|
Binary file
|
package/src/addon.cpp
CHANGED
|
@@ -1,24 +1,9 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
13
|
-
* or implied. See the License for the specific language governing
|
|
14
|
-
* permissions and limitations under the License.
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
#include "handle.h"
|
|
18
|
-
#include <napi.h>
|
|
19
|
-
|
|
20
|
-
Napi::Object InitAll(Napi::Env env, Napi::Object exports) {
|
|
21
|
-
return Handle::GetClass(env, exports);
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
NODE_API_MODULE(hello, InitAll)
|
|
1
|
+
|
|
2
|
+
#include "tesseract_wrapper.hpp"
|
|
3
|
+
#include <napi.h>
|
|
4
|
+
|
|
5
|
+
Napi::Object Init(Napi::Env env, Napi::Object exports) {
|
|
6
|
+
return TesseractWrapper::InitAddon(env, exports);
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
NODE_API_MODULE(NODE_GYP_MODULE_NAME, Init)
|