@luii/node-tesseract-ocr 1.0.19 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +45 -0
- package/README.md +457 -85
- package/dist/cjs/index.cjs +272 -16
- package/dist/cjs/index.d.ts +1069 -0
- package/dist/esm/index.d.ts +1069 -0
- package/dist/esm/index.mjs +257 -16
- package/package.json +32 -26
- package/prebuilds/node-tesseract-ocr-darwin-arm64/node-napi-v10.node +0 -0
- package/prebuilds/node-tesseract-ocr-linux-x64/node-napi-v10.node +0 -0
- package/src/addon.cpp +9 -24
- package/src/commands.hpp +489 -0
- package/src/monitor.hpp +81 -0
- package/src/tesseract_wrapper.cpp +714 -0
- package/src/tesseract_wrapper.hpp +70 -0
- package/src/utils.hpp +8 -0
- package/src/worker_thread.cpp +141 -0
- package/src/worker_thread.hpp +79 -0
- package/binding.gyp +0 -60
- package/dist/index.d.ts +0 -349
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
- package/src/handle.cpp +0 -174
- package/src/handle.h +0 -57
- package/src/ocr_result.cpp +0 -99
- package/src/ocr_result.h +0 -47
- package/src/ocr_worker.cpp +0 -191
- package/src/ocr_worker.h +0 -67
|
Binary file
|
|
Binary file
|
package/src/handle.cpp
DELETED
|
@@ -1,174 +0,0 @@
|
|
|
1
|
-
/*
|
|
2
|
-
* Copyright 2025 Philipp Czarnetzki
|
|
3
|
-
*
|
|
4
|
-
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
* you may not use this file except in compliance with the License.
|
|
6
|
-
* You may obtain a copy of the License at
|
|
7
|
-
*
|
|
8
|
-
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
*
|
|
10
|
-
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
13
|
-
* or implied. See the License for the specific language governing
|
|
14
|
-
* permissions and limitations under the License.
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
#include "handle.h"
|
|
18
|
-
#include "napi.h"
|
|
19
|
-
#include "ocr_worker.h"
|
|
20
|
-
#include <tesseract/pageiterator.h>
|
|
21
|
-
#include <tesseract/publictypes.h>
|
|
22
|
-
|
|
23
|
-
// Defines the JavaScript class `Tesseract` and publishes it on `exports`.
//
// The class exposes a single instance method, `recognize`. A persistent
// reference to the constructor function is stored as the environment's
// instance data.
//
// @param env     Environment in which the class is defined.
// @param exports Object that receives the `Tesseract` property.
// @return The `exports` object that was passed in.
Napi::Object Handle::GetClass(Napi::Env env, Napi::Object exports) {
  Napi::Function tesseractClass = DefineClass(
      env, "Tesseract", {InstanceMethod("recognize", &Handle::Recognize)});

  exports.Set("Tesseract", tesseractClass);

  // Allocated on the heap because the pointer is handed to SetInstanceData.
  // NOTE(review): presumably node-addon-api's default finalizer deletes it
  // when the environment is torn down - confirm against the library docs.
  auto *constructorRef =
      new Napi::FunctionReference(Napi::Persistent(tesseractClass));
  env.SetInstanceData<Napi::FunctionReference>(constructorRef);

  return exports;
}
|
|
35
|
-
|
|
36
|
-
// Builds a handle from an optional options object.
//
// Options are read only when exactly one argument is passed and it is an
// object. Each option is applied only when it has the expected JavaScript
// type; otherwise the member keeps its default:
//   - `skipOcr`    boolean
//   - `dataPath`   string
//   - `lang`       string
//   - `engineMode` number, must lie in [0, tesseract::OEM_COUNT)
//   - `psm`        number, must lie in [0, tesseract::PSM_COUNT)
//
// An out-of-range `engineMode` or `psm` raises a JavaScript TypeError and
// leaves the corresponding member at its default.
Handle::Handle(const Napi::CallbackInfo &info)
    : Napi::ObjectWrap<Handle>(info) {

  Napi::Env env = info.Env();

  if (info.Length() != 1 || !info[0].IsObject()) {
    return;
  }

  auto ctorOptions = info[0].As<Napi::Object>();

  const Napi::Value skipOcrOption = ctorOptions.Get("skipOcr");
  if (skipOcrOption.IsBoolean()) {
    skipOcr_ = skipOcrOption.As<Napi::Boolean>().Value();
  }

  const Napi::Value dataPathOption = ctorOptions.Get("dataPath");
  if (dataPathOption.IsString()) {
    dataPath_ = dataPathOption.As<Napi::String>().Utf8Value();
  }

  const Napi::Value langOption = ctorOptions.Get("lang");
  if (langOption.IsString()) {
    lang_ = langOption.As<Napi::String>().Utf8Value();
  }

  // The requested modes stay plain integers until they have been validated.
  // Converting an arbitrary integer to the enum type first would make the
  // range checks below operate on an already out-of-range enum value.
  int32_t requestedEngineMode = static_cast<int32_t>(oemMode_);
  const Napi::Value engineModeOption = ctorOptions.Get("engineMode");
  if (engineModeOption.IsNumber()) {
    requestedEngineMode = engineModeOption.As<Napi::Number>().Int32Value();
  }

  int32_t requestedPsm = static_cast<int32_t>(psm_);
  const Napi::Value psmOption = ctorOptions.Get("psm");
  if (psmOption.IsNumber()) {
    requestedPsm = psmOption.As<Napi::Number>().Int32Value();
  }

  if (requestedEngineMode < 0 ||
      requestedEngineMode >= static_cast<int32_t>(tesseract::OEM_COUNT)) {
    Napi::TypeError::New(env, "Unsupported OCR Engine Mode")
        .ThrowAsJavaScriptException();
    return;
  }

  if (requestedPsm < 0 ||
      requestedPsm >= static_cast<int32_t>(tesseract::PSM_COUNT)) {
    Napi::TypeError::New(env, "Unsupported Page Segmentation Mode")
        .ThrowAsJavaScriptException();
    return;
  }

  oemMode_ = static_cast<tesseract::OcrEngineMode>(requestedEngineMode);
  psm_ = static_cast<tesseract::PageSegMode>(requestedPsm);
}
|
|
84
|
-
|
|
85
|
-
// No explicit cleanup: the members release whatever they hold themselves.
Handle::~Handle() = default;
|
|
86
|
-
|
|
87
|
-
std::unique_ptr<tesseract::TessBaseAPI> Handle::CreateApi() {
|
|
88
|
-
auto api = std::make_unique<tesseract::TessBaseAPI>();
|
|
89
|
-
if (skipOcr_) {
|
|
90
|
-
api->InitForAnalysePage();
|
|
91
|
-
} else {
|
|
92
|
-
if (api->Init(dataPath_.c_str(), lang_.c_str(), oemMode_) == -1) {
|
|
93
|
-
api->End();
|
|
94
|
-
return nullptr;
|
|
95
|
-
}
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
api->SetPageSegMode(static_cast<tesseract::PageSegMode>(psm_));
|
|
99
|
-
return api;
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
// Starts an asynchronous OCR run on an encoded image buffer.
//
// JavaScript signature: recognize(buffer, options?)
//   - `buffer`  Buffer holding the image data (required).
//   - `options` optional object; `options.progressChanged`, when present,
//               must be a function.
//
// @return A promise that the queued OCRWorker settles. The promise is
//         rejected right away when the handle was created with `skipOcr`
//         or when the first argument is not a Buffer.
// A `progressChanged` value that is present but not a function raises a
// TypeError and no work is queued.
Napi::Value Handle::Recognize(const Napi::CallbackInfo &info) {
  const Napi::Env env = info.Env();
  const Napi::Promise::Deferred deffered = Napi::Promise::Deferred::New(env);

  if (skipOcr_) {
    deffered.Reject(Napi::Error::New(env, "OCR not available when handle was "
                                          "created with `skipOcr` turned on")
                        .Value());
    return deffered.Promise();
  }

  if (info.Length() <= 0 || !info[0].IsBuffer()) {
    deffered.Reject(
        Napi::TypeError::New(env, "Expected image buffer to be of type Buffer")
            .Value());
    return deffered.Promise();
  }

  Napi::Function progressCallback = Napi::Function();
  if (info.Length() == 2 && info[1].IsObject()) {
    const Napi::Object recognizeOptions = info[1].As<Napi::Object>();
    const Napi::Value progressChangedOption =
        recognizeOptions.Get("progressChanged");

    if (progressChangedOption.IsFunction()) {
      progressCallback = progressChangedOption.As<Napi::Function>();
    } else if (!progressChangedOption.IsUndefined()) {
      // Only a value that is present with the wrong type is an error; an
      // options object without `progressChanged` is valid. Return at once so
      // that no worker is queued while the exception is pending.
      Napi::TypeError::New(
          env, "Expected `progressChanged` callback to be a function")
          .ThrowAsJavaScriptException();
      return env.Undefined();
    }
  }

  auto imageBuffer = info[0].As<Napi::Buffer<uint8_t>>();

  // The worker is allocated with `new` and only queued here.
  // NOTE(review): presumably node-addon-api deletes a queued async worker
  // once it has completed - confirm against the library documentation.
  auto *pWorker = new OCRWorker(this, info.This().As<Napi::Object>(),
                                imageBuffer, deffered, progressCallback);
  pWorker->Queue();

  return deffered.Promise();
}
|
|
143
|
-
|
|
144
|
-
// Napi::Value Handle::AnalyzeLayout(const Napi::CallbackInfo &info) {
|
|
145
|
-
// const Napi::Env env = info.Env();
|
|
146
|
-
// ArgParser args(info);
|
|
147
|
-
//
|
|
148
|
-
// const Napi::Promise::Deferred deffered = Napi::Promise::Deferred::New(env);
|
|
149
|
-
//
|
|
150
|
-
// if (!skipOcr_) {
|
|
151
|
-
// deffered.Reject(
|
|
152
|
-
// Napi::Error::New(
|
|
153
|
-
// env, "Page analysis not available unless `skipOcr` is turned on")
|
|
154
|
-
// .Value());
|
|
155
|
-
//
|
|
156
|
-
// return deffered.Promise();
|
|
157
|
-
// }
|
|
158
|
-
//
|
|
159
|
-
// if (info.Length() < 1 || !info[0].IsBoolean()) {
|
|
160
|
-
// deffered.Reject(Napi::TypeError::New(
|
|
161
|
-
// info.Env(), "Expected first argument to be a
|
|
162
|
-
// boolean") .Value());
|
|
163
|
-
// return deffered.Promise();
|
|
164
|
-
// }
|
|
165
|
-
//
|
|
166
|
-
// bool merge_similar_words = info[0].As<Napi::Boolean>().Value();
|
|
167
|
-
//
|
|
168
|
-
// api_->SetImage();
|
|
169
|
-
//
|
|
170
|
-
// tesseract::PageIterator *iterator =
|
|
171
|
-
// api_->AnalyseLayout(merge_similar_words); return;
|
|
172
|
-
// }
|
|
173
|
-
|
|
174
|
-
// Gives callers access to this handle's mutex.
//
// @return Reference to the `mutex_` member.
std::mutex &Handle::Mutex() {
  return mutex_;
}
|
package/src/handle.h
DELETED
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
/*
|
|
2
|
-
* Copyright 2025 Philipp Czarnetzki
|
|
3
|
-
*
|
|
4
|
-
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
* you may not use this file except in compliance with the License.
|
|
6
|
-
* You may obtain a copy of the License at
|
|
7
|
-
*
|
|
8
|
-
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
*
|
|
10
|
-
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
13
|
-
* or implied. See the License for the specific language governing
|
|
14
|
-
* permissions and limitations under the License.
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
#ifndef HANDLE_H
|
|
18
|
-
#define HANDLE_H
|
|
19
|
-
|
|
20
|
-
#include "napi.h"
|
|
21
|
-
#include <cstdint>
|
|
22
|
-
#include <cstdlib>
|
|
23
|
-
#include <memory>
|
|
24
|
-
#include <mutex>
|
|
25
|
-
#include <string>
|
|
26
|
-
#include <tesseract/baseapi.h>
|
|
27
|
-
#include <tesseract/ocrclass.h>
|
|
28
|
-
#include <tesseract/publictypes.h>
|
|
29
|
-
|
|
30
|
-
using Napi::CallbackInfo;
|
|
31
|
-
|
|
32
|
-
// JavaScript-visible wrapper that stores the configuration used to create
// Tesseract engine instances.
class Handle : public Napi::ObjectWrap<Handle> {
public:
  // Defines the class on `exports` and returns `exports`.
  static Napi::Object GetClass(Napi::Env env, Napi::Object exports);

  // Reads the optional options object passed from JavaScript.
  Handle(const Napi::CallbackInfo &info);
  ~Handle();

  // Creates a new engine configured from the stored options.
  std::unique_ptr<tesseract::TessBaseAPI> CreateApi();

  // Returns the mutex owned by this handle.
  std::mutex &Mutex();

  // NOTE(review): no definition of Monitor() is visible in handle.cpp.
  tesseract::ETEXT_DESC *Monitor();

private:
  // Reads NODE_TESSERACT_DATAPATH from the environment.
  //
  // std::getenv returns a null pointer when the variable is not set, and
  // constructing a std::string from a null pointer is undefined behaviour,
  // so an unset variable maps to the empty string.
  static std::string DefaultDataPath() {
    const char *fromEnvironment = std::getenv("NODE_TESSERACT_DATAPATH");
    return fromEnvironment != nullptr ? std::string(fromEnvironment)
                                      : std::string();
  }

  // When true, engines are set up for page analysis instead of OCR.
  bool skipOcr_ = false;

  // Engine configuration; each value can be overridden by a ctor option.
  std::string dataPath_ = DefaultDataPath();
  std::string lang_ = "eng";
  tesseract::OcrEngineMode oemMode_ = tesseract::OEM_DEFAULT;
  tesseract::PageSegMode psm_ = tesseract::PSM_SINGLE_BLOCK;

  // NOTE(review): api_ is referenced only by commented-out code in
  // handle.cpp.
  std::unique_ptr<tesseract::TessBaseAPI> api_;
  std::mutex mutex_;

  // Instance method exposed to JavaScript as `recognize`.
  Napi::Value Recognize(const CallbackInfo &info);

  // NOTE(review): only a commented-out definition is visible in handle.cpp.
  Napi::Value AnalyzeLayout(const CallbackInfo &info);
};
|
|
56
|
-
|
|
57
|
-
#endif // HANDLE_H
|
package/src/ocr_result.cpp
DELETED
|
@@ -1,99 +0,0 @@
|
|
|
1
|
-
/*
|
|
2
|
-
* Copyright 2025 Philipp Czarnetzki
|
|
3
|
-
*
|
|
4
|
-
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
* you may not use this file except in compliance with the License.
|
|
6
|
-
* You may obtain a copy of the License at
|
|
7
|
-
*
|
|
8
|
-
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
*
|
|
10
|
-
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
13
|
-
* or implied. See the License for the specific language governing
|
|
14
|
-
* permissions and limitations under the License.
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
#include "ocr_result.h"
|
|
18
|
-
#include "napi.h"
|
|
19
|
-
#include <mutex>
|
|
20
|
-
#include <tesseract/baseapi.h>
|
|
21
|
-
#include <tesseract/ocrclass.h>
|
|
22
|
-
|
|
23
|
-
// Defines the JavaScript class `OCRResult` with one accessor method per
// output format: getText, getHOCR, getTSV and getALTO.
//
// @param env Environment in which the class is defined.
// @return The constructor function of the newly defined class.
Napi::Function OCRResult::GetClass(Napi::Env env) {
  const auto textMethod = InstanceMethod("getText", &OCRResult::GetText);
  const auto hocrMethod = InstanceMethod("getHOCR", &OCRResult::GetHOCR);
  const auto tsvMethod = InstanceMethod("getTSV", &OCRResult::GetTSV);
  const auto altoMethod = InstanceMethod("getALTO", &OCRResult::GetALTO);

  return DefineClass(env, "OCRResult",
                     {textMethod, hocrMethod, tsvMethod, altoMethod});
}
|
|
32
|
-
|
|
33
|
-
// Stores up to four string arguments, in the order text, hOCR, TSV, ALTO.
// A missing or non-string argument leaves the matching member empty.
OCRResult::OCRResult(const Napi::CallbackInfo &info)
    : Napi::ObjectWrap<OCRResult>(info), handle_(nullptr) {
  // Destination members, listed in positional-argument order.
  std::string *const destinations[] = {&text_, &hocr_, &tsv_, &alto_};

  size_t position = 0;
  for (std::string *destination : destinations) {
    if (position < info.Length() && info[position].IsString()) {
      *destination = info[position].As<Napi::String>().Utf8Value();
    }
    ++position;
  }
}
|
|
49
|
-
|
|
50
|
-
// Creates a JavaScript `OCRResult` object holding the four output strings.
//
// @param env  Environment in which the object is created.
// @param text Plain-text output.
// @param hocr hOCR output.
// @param tsv  TSV output.
// @param alto ALTO output.
// @return The new object, escaped from the local handle scope.
Napi::Object OCRResult::NewInstance(Napi::Env env, const std::string &text,
                                    const std::string &hocr,
                                    const std::string &tsv,
                                    const std::string &alto) {
  Napi::EscapableHandleScope scope(env);

  // The class is defined anew through GetClass() on every call.
  Napi::Function resultClass = OCRResult::GetClass(env);

  const Napi::String textArg = Napi::String::New(env, text);
  const Napi::String hocrArg = Napi::String::New(env, hocr);
  const Napi::String tsvArg = Napi::String::New(env, tsv);
  const Napi::String altoArg = Napi::String::New(env, alto);

  Napi::Object instance =
      resultClass.New({textArg, hocrArg, tsvArg, altoArg});

  return scope.Escape(instance).As<Napi::Object>();
}
|
|
63
|
-
|
|
64
|
-
// Requests cancellation through the owning handle's progress monitor.
//
// Installs a cancel callback that always answers "cancel", marks
// `cancel_this` with a non-null value and invokes the callback once.
// Does nothing when there is no handle or the handle has no monitor.
//
// NOTE(review): OCRWorker::Execute holds the same handle mutex for the whole
// Recognize() call, so this would block until recognition has finished.
// This method is also not registered in OCRResult::GetClass.
void OCRResult::Cancel(const Napi::CallbackInfo &info) {
  // The constructor initialises handle_ to nullptr; never dereference it
  // without checking.
  if (handle_ == nullptr) {
    return;
  }

  std::lock_guard<std::mutex> lock(handle_->Mutex());

  tesseract::ETEXT_DESC *monitor = handle_->Monitor();
  if (monitor == nullptr) {
    return;
  }

  monitor->cancel = [](void *, int) -> bool { return true; };
  // Any non-null marker will do; the callback above ignores its arguments.
  monitor->cancel_this =
      reinterpret_cast<void *>(static_cast<std::uintptr_t>(1));
  monitor->cancel(monitor->cancel_this, monitor->count);
}
|
|
80
|
-
|
|
81
|
-
// JavaScript `getText()`: returns the stored plain-text output as a string.
Napi::Value OCRResult::GetText(const Napi::CallbackInfo &info) {
  return Napi::String::New(info.Env(), text_);
}
|
|
85
|
-
|
|
86
|
-
// JavaScript `getHOCR()`: returns the stored hOCR output as a string.
Napi::Value OCRResult::GetHOCR(const Napi::CallbackInfo &info) {
  return Napi::String::New(info.Env(), hocr_);
}
|
|
90
|
-
|
|
91
|
-
// JavaScript `getTSV()`: returns the stored TSV output as a string.
Napi::Value OCRResult::GetTSV(const Napi::CallbackInfo &info) {
  return Napi::String::New(info.Env(), tsv_);
}
|
|
95
|
-
|
|
96
|
-
// JavaScript `getALTO()`: returns the stored ALTO output as a string.
Napi::Value OCRResult::GetALTO(const Napi::CallbackInfo &info) {
  return Napi::String::New(info.Env(), alto_);
}
|
package/src/ocr_result.h
DELETED
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
/*
|
|
2
|
-
* Copyright 2025 Philipp Czarnetzki
|
|
3
|
-
*
|
|
4
|
-
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
* you may not use this file except in compliance with the License.
|
|
6
|
-
* You may obtain a copy of the License at
|
|
7
|
-
*
|
|
8
|
-
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
*
|
|
10
|
-
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
13
|
-
* or implied. See the License for the specific language governing
|
|
14
|
-
* permissions and limitations under the License.
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
#ifndef OCRRESULT_H
|
|
18
|
-
#define OCRRESULT_H
|
|
19
|
-
|
|
20
|
-
#include "handle.h"
|
|
21
|
-
#include <napi.h>
|
|
22
|
-
|
|
23
|
-
// JavaScript-visible result object holding the output of one OCR run in
// four formats: plain text, hOCR, TSV and ALTO.
class OCRResult : public Napi::ObjectWrap<OCRResult> {

public:
  // Accepts up to four string arguments: text, hOCR, TSV, ALTO.
  OCRResult(const Napi::CallbackInfo &info);

  // Defines the JavaScript class and returns its constructor.
  static Napi::Function GetClass(Napi::Env env);

  // Builds a JavaScript instance from the four output strings.
  static Napi::Object NewInstance(Napi::Env env, const std::string &text,
                                  const std::string &hocr,
                                  const std::string &tsv,
                                  const std::string &alto);

private:
  // Cancellation hook that works through the owning handle's monitor.
  void Cancel(const Napi::CallbackInfo &info);

  // Accessors exposed to JavaScript, one per output format.
  Napi::Value GetText(const Napi::CallbackInfo &info);
  Napi::Value GetHOCR(const Napi::CallbackInfo &info);
  Napi::Value GetTSV(const Napi::CallbackInfo &info);
  Napi::Value GetALTO(const Napi::CallbackInfo &info);

  // Owning handle; the constructor sets it to nullptr.
  Handle *handle_;

  // Stored outputs.
  std::string text_;
  std::string hocr_;
  std::string tsv_;
  std::string alto_;
};
|
|
46
|
-
|
|
47
|
-
#endif // OCRRESULT_H
|
package/src/ocr_worker.cpp
DELETED
|
@@ -1,191 +0,0 @@
|
|
|
1
|
-
/*
|
|
2
|
-
* Copyright 2025 Philipp Czarnetzki
|
|
3
|
-
*
|
|
4
|
-
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
* you may not use this file except in compliance with the License.
|
|
6
|
-
* You may obtain a copy of the License at
|
|
7
|
-
*
|
|
8
|
-
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
*
|
|
10
|
-
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
13
|
-
* or implied. See the License for the specific language governing
|
|
14
|
-
* permissions and limitations under the License.
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
#include "ocr_worker.h"
|
|
18
|
-
#include "napi.h"
|
|
19
|
-
#include "ocr_result.h"
|
|
20
|
-
#include <cstddef>
|
|
21
|
-
#include <cstdio>
|
|
22
|
-
#include <tesseract/baseapi.h>
|
|
23
|
-
#include <tesseract/ocrclass.h>
|
|
24
|
-
|
|
25
|
-
OCRWorker::OCRWorker(Handle *handle, Napi::Object handleObject,
|
|
26
|
-
Napi::Buffer<uint8_t> buffer,
|
|
27
|
-
Napi::Promise::Deferred deffered,
|
|
28
|
-
Napi::Function &progressCallback)
|
|
29
|
-
: Napi::AsyncProgressWorker<ProgressPayload>{handle->Env(), "OCRWorker"},
|
|
30
|
-
handle_(handle), data_(buffer.Data()), length_(buffer.Length()),
|
|
31
|
-
deffered_{deffered} {
|
|
32
|
-
|
|
33
|
-
this->progressCallback_.Reset(progressCallback, 1);
|
|
34
|
-
|
|
35
|
-
// Hold a persistent reference to the JS Buffer.
|
|
36
|
-
// So its memory isn't freed while the async worker is running.
|
|
37
|
-
this->bufferRef_.Reset(buffer, 1);
|
|
38
|
-
// Hold a persistent reference to the JS Handle object.
|
|
39
|
-
// So it isn't GC'd while this worker is active.
|
|
40
|
-
this->handleRef_.Reset(handleObject, 1);
|
|
41
|
-
|
|
42
|
-
// allocate and zero-init the monitor to avoid uninitialized fields
|
|
43
|
-
monitor_ = new tesseract::ETEXT_DESC();
|
|
44
|
-
|
|
45
|
-
monitor_->progress_callback2 = [](tesseract::ETEXT_DESC *monitor, int left,
|
|
46
|
-
int right, int top, int bottom) -> bool {
|
|
47
|
-
ProgressPayload payload{monitor->more_to_come,
|
|
48
|
-
monitor->progress,
|
|
49
|
-
monitor->ocr_alive,
|
|
50
|
-
top,
|
|
51
|
-
right,
|
|
52
|
-
bottom,
|
|
53
|
-
left};
|
|
54
|
-
auto *executionProgress =
|
|
55
|
-
static_cast<OCRWorker::ExecutionProgress *>(monitor->cancel_this);
|
|
56
|
-
if (executionProgress == nullptr) {
|
|
57
|
-
return false;
|
|
58
|
-
}
|
|
59
|
-
executionProgress->Send(&payload, 1);
|
|
60
|
-
return false;
|
|
61
|
-
};
|
|
62
|
-
monitor_->ocr_alive = 1;
|
|
63
|
-
};
|
|
64
|
-
|
|
65
|
-
// Drops the buffer reference, frees the progress monitor and drops the
// handle reference, in that order.
OCRWorker::~OCRWorker() {
  const bool holdsBuffer = !bufferRef_.IsEmpty();
  if (holdsBuffer) {
    bufferRef_.Reset();
  }

  if (monitor_ != nullptr) {
    delete monitor_;
    monitor_ = nullptr;
  }

  const bool holdsHandle = !handleRef_.IsEmpty();
  if (holdsHandle) {
    handleRef_.Reset();
  }
}
|
|
80
|
-
|
|
81
|
-
void OCRWorker::Execute(const ExecutionProgress &executionProgress) {
|
|
82
|
-
|
|
83
|
-
Pix *pPix = pixReadMem(data_, length_);
|
|
84
|
-
|
|
85
|
-
if (!pPix) {
|
|
86
|
-
SetError("Could not read image from buffer");
|
|
87
|
-
return;
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
// determine image depth and normalize to 8-bit grayscale (if needed)
|
|
91
|
-
int depth = pixGetDepth(pPix);
|
|
92
|
-
if (depth != 8) {
|
|
93
|
-
Pix *pGray = pixConvertTo8(pPix, 0); /* 0 = no colormap */
|
|
94
|
-
if (pGray) {
|
|
95
|
-
pixDestroy(&pPix);
|
|
96
|
-
pPix = pGray;
|
|
97
|
-
}
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
std::unique_ptr<tesseract::TessBaseAPI> localApi;
|
|
101
|
-
{
|
|
102
|
-
std::lock_guard<std::mutex> lock(handle_->Mutex());
|
|
103
|
-
|
|
104
|
-
// Create a thread-local TessBaseAPI to avoid sharing across threads;
|
|
105
|
-
localApi = handle_->CreateApi();
|
|
106
|
-
|
|
107
|
-
if (!localApi) {
|
|
108
|
-
pixDestroy(&pPix);
|
|
109
|
-
SetError("Could not initialize thread-local Tesseract API");
|
|
110
|
-
return;
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
localApi->SetImage(pPix);
|
|
114
|
-
monitor_->cancel_this = (void *)&executionProgress;
|
|
115
|
-
|
|
116
|
-
if (localApi->Recognize(monitor_) != 0) {
|
|
117
|
-
pixDestroy(&pPix);
|
|
118
|
-
monitor_->cancel_this = nullptr;
|
|
119
|
-
SetError("Recognize failed");
|
|
120
|
-
return;
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
// retrieve results from the local API
|
|
125
|
-
{
|
|
126
|
-
char *t = nullptr;
|
|
127
|
-
t = localApi->GetUTF8Text();
|
|
128
|
-
if (t) {
|
|
129
|
-
resultText_.assign(t);
|
|
130
|
-
delete[] t;
|
|
131
|
-
}
|
|
132
|
-
|
|
133
|
-
t = localApi->GetHOCRText(0);
|
|
134
|
-
if (t) {
|
|
135
|
-
resultHOCR_.assign(t);
|
|
136
|
-
delete[] t;
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
t = localApi->GetTSVText(0);
|
|
140
|
-
if (t) {
|
|
141
|
-
resultTSV_.assign(t);
|
|
142
|
-
delete[] t;
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
t = localApi->GetAltoText(0);
|
|
146
|
-
if (t) {
|
|
147
|
-
resultALTO_.assign(t);
|
|
148
|
-
delete[] t;
|
|
149
|
-
}
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
monitor_->cancel_this = nullptr;
|
|
153
|
-
pixDestroy(&pPix);
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
void OCRWorker::OnOK() {
|
|
157
|
-
Napi::Env env = Env();
|
|
158
|
-
Napi::HandleScope scope(env);
|
|
159
|
-
|
|
160
|
-
Napi::Object resultObj = OCRResult::NewInstance(env, resultText_, resultHOCR_,
|
|
161
|
-
resultTSV_, resultALTO_);
|
|
162
|
-
// Attach a reference to the Handle JS object on the result.
|
|
163
|
-
// So the Handle stays alive as long as the OCRResult is reachable from JS.
|
|
164
|
-
if (!handleRef_.IsEmpty()) {
|
|
165
|
-
resultObj.Set("_handle", handleRef_.Value());
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
this->deffered_.Resolve(resultObj);
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
// Failure path: rejects the promise with the reported error value.
void OCRWorker::OnError(const Napi::Error &error) {
  Napi::HandleScope scope(Env());
  deffered_.Reject(error.Value());
}
|
|
175
|
-
|
|
176
|
-
void OCRWorker::OnProgress(const ProgressPayload *payload, size_t count) {
|
|
177
|
-
Napi::HandleScope scope(Env());
|
|
178
|
-
Napi::Object progress = Napi::Object::New(Env());
|
|
179
|
-
|
|
180
|
-
progress.Set("percent", payload->percent);
|
|
181
|
-
progress.Set("progress", payload->progress);
|
|
182
|
-
progress.Set("ocr_alive", payload->ocr_alive);
|
|
183
|
-
progress.Set("top", payload->top);
|
|
184
|
-
progress.Set("right", payload->right);
|
|
185
|
-
progress.Set("bottom", payload->bottom);
|
|
186
|
-
progress.Set("left", payload->left);
|
|
187
|
-
|
|
188
|
-
if (!this->progressCallback_.IsEmpty()) {
|
|
189
|
-
progressCallback_.Call(Env().Undefined(), {progress});
|
|
190
|
-
}
|
|
191
|
-
}
|
package/src/ocr_worker.h
DELETED
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
/*
|
|
2
|
-
* Copyright 2025 Philipp Czarnetzki
|
|
3
|
-
*
|
|
4
|
-
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
* you may not use this file except in compliance with the License.
|
|
6
|
-
* You may obtain a copy of the License at
|
|
7
|
-
*
|
|
8
|
-
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
*
|
|
10
|
-
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
13
|
-
* or implied. See the License for the specific language governing
|
|
14
|
-
* permissions and limitations under the License.
|
|
15
|
-
*/
|
|
16
|
-
|
|
17
|
-
#ifndef OCRWORKER_H
|
|
18
|
-
#define OCRWORKER_H
|
|
19
|
-
|
|
20
|
-
#include "handle.h"
|
|
21
|
-
#include "leptonica/allheaders.h"
|
|
22
|
-
#include "napi.h"
|
|
23
|
-
#include <cstddef>
|
|
24
|
-
#include <memory>
|
|
25
|
-
#include <tesseract/baseapi.h>
|
|
26
|
-
#include <tesseract/ocrclass.h>
|
|
27
|
-
|
|
28
|
-
// Plain snapshot of recognition progress passed from the worker thread to
// OnProgress(). Field order matters: the payload is brace-initialised
// positionally.
struct ProgressPayload {
  // Status values copied from the Tesseract monitor.
  // NOTE(review): the worker fills `percent` from the monitor's
  // `more_to_come` field - confirm that this is intended.
  int percent;
  int progress;
  int ocr_alive;

  // Box coordinates reported with the progress notification.
  int top;
  int right;
  int bottom;
  int left;
};
|
|
37
|
-
|
|
38
|
-
// Asynchronous worker that recognises one image buffer and settles a
// promise with the result, reporting progress along the way.
class OCRWorker : public Napi::AsyncProgressWorker<ProgressPayload> {
public:
  // See ocr_worker.cpp for the meaning of each argument.
  OCRWorker(Handle *handle, Napi::Object handleObject,
            Napi::Buffer<uint8_t> buffer, Napi::Promise::Deferred deffered,
            Napi::Function &progressCallback);
  ~OCRWorker();

protected:
  // Performs the recognition and fills the result strings.
  void Execute(const ExecutionProgress &executionProgress) override;

  // Completion and progress hooks.
  void OnOK() override;
  void OnError(const Napi::Error &error) override;
  void OnProgress(const ProgressPayload *payload, size_t count) override;

private:
  // Native handle and a reference to its JavaScript object.
  Handle *handle_;
  Napi::Reference<Napi::Object> handleRef_;

  // Image bytes: pointer and length taken from the buffer, plus a reference
  // to the buffer itself.
  uint8_t *data_;
  size_t length_;
  Napi::Reference<Napi::Buffer<uint8_t>> bufferRef_;

  // Promise settled in OnOK() / OnError().
  Napi::Promise::Deferred deffered_;

  // Progress monitor; allocated in the constructor, freed in the destructor.
  tesseract::ETEXT_DESC *monitor_ = nullptr;

  // Optional JavaScript progress callback.
  Napi::FunctionReference progressCallback_;

  // Outputs collected by Execute().
  std::string resultText_;
  std::string resultHOCR_;
  std::string resultTSV_;
  std::string resultALTO_;
};
|
|
66
|
-
|
|
67
|
-
#endif // OCRWORKER_H
|