pdf-oxide-fips 0.3.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE-APACHE +176 -0
- package/LICENSE-MIT +25 -0
- package/README.md +218 -0
- package/lib/builders/annotation-builder.d.ts +198 -0
- package/lib/builders/annotation-builder.js +317 -0
- package/lib/builders/conversion-options-builder.d.ts +106 -0
- package/lib/builders/conversion-options-builder.js +214 -0
- package/lib/builders/document-builder.d.ts +381 -0
- package/lib/builders/document-builder.js +770 -0
- package/lib/builders/index.d.ts +13 -0
- package/lib/builders/index.js +13 -0
- package/lib/builders/metadata-builder.d.ts +201 -0
- package/lib/builders/metadata-builder.js +285 -0
- package/lib/builders/pdf-builder.d.ts +216 -0
- package/lib/builders/pdf-builder.js +350 -0
- package/lib/builders/search-options-builder.d.ts +73 -0
- package/lib/builders/search-options-builder.js +129 -0
- package/lib/builders/streaming-table.d.ts +64 -0
- package/lib/builders/streaming-table.js +140 -0
- package/lib/document-editor-manager.d.ts +139 -0
- package/lib/document-editor-manager.js +256 -0
- package/lib/document-editor.d.ts +124 -0
- package/lib/document-editor.js +318 -0
- package/lib/errors.d.ts +382 -0
- package/lib/errors.js +1115 -0
- package/lib/form-field-manager.d.ts +299 -0
- package/lib/form-field-manager.js +568 -0
- package/lib/hybrid-ml-manager.d.ts +142 -0
- package/lib/hybrid-ml-manager.js +208 -0
- package/lib/index.d.ts +205 -0
- package/lib/index.js +693 -0
- package/lib/managers/accessibility-manager.d.ts +148 -0
- package/lib/managers/accessibility-manager.js +234 -0
- package/lib/managers/annotation-manager.d.ts +219 -0
- package/lib/managers/annotation-manager.js +359 -0
- package/lib/managers/barcode-manager.d.ts +82 -0
- package/lib/managers/barcode-manager.js +263 -0
- package/lib/managers/batch-manager.d.ts +185 -0
- package/lib/managers/batch-manager.js +385 -0
- package/lib/managers/cache-manager.d.ts +181 -0
- package/lib/managers/cache-manager.js +384 -0
- package/lib/managers/compliance-manager.d.ts +103 -0
- package/lib/managers/compliance-manager.js +453 -0
- package/lib/managers/content-manager.d.ts +120 -0
- package/lib/managers/content-manager.js +294 -0
- package/lib/managers/document-utility-manager.d.ts +369 -0
- package/lib/managers/document-utility-manager.js +730 -0
- package/lib/managers/dom-pdf-creator.d.ts +104 -0
- package/lib/managers/dom-pdf-creator.js +299 -0
- package/lib/managers/editing-manager.d.ts +248 -0
- package/lib/managers/editing-manager.js +387 -0
- package/lib/managers/enterprise-manager.d.ts +192 -0
- package/lib/managers/enterprise-manager.js +307 -0
- package/lib/managers/extended-managers.d.ts +122 -0
- package/lib/managers/extended-managers.js +664 -0
- package/lib/managers/extraction-manager.d.ts +246 -0
- package/lib/managers/extraction-manager.js +482 -0
- package/lib/managers/final-utilities.d.ts +127 -0
- package/lib/managers/final-utilities.js +657 -0
- package/lib/managers/hybrid-ml-advanced.d.ts +136 -0
- package/lib/managers/hybrid-ml-advanced.js +722 -0
- package/lib/managers/index.d.ts +64 -0
- package/lib/managers/index.js +69 -0
- package/lib/managers/layer-manager.d.ts +203 -0
- package/lib/managers/layer-manager.js +401 -0
- package/lib/managers/metadata-manager.d.ts +148 -0
- package/lib/managers/metadata-manager.js +280 -0
- package/lib/managers/ocr-manager.d.ts +194 -0
- package/lib/managers/ocr-manager.js +582 -0
- package/lib/managers/optimization-manager.d.ts +102 -0
- package/lib/managers/optimization-manager.js +213 -0
- package/lib/managers/outline-manager.d.ts +101 -0
- package/lib/managers/outline-manager.js +169 -0
- package/lib/managers/page-manager.d.ts +142 -0
- package/lib/managers/page-manager.js +235 -0
- package/lib/managers/pattern-detection.d.ts +169 -0
- package/lib/managers/pattern-detection.js +322 -0
- package/lib/managers/rendering-manager.d.ts +353 -0
- package/lib/managers/rendering-manager.js +679 -0
- package/lib/managers/search-manager.d.ts +235 -0
- package/lib/managers/search-manager.js +329 -0
- package/lib/managers/security-manager.d.ts +161 -0
- package/lib/managers/security-manager.js +292 -0
- package/lib/managers/signature-manager.d.ts +738 -0
- package/lib/managers/signature-manager.js +1509 -0
- package/lib/managers/streams.d.ts +262 -0
- package/lib/managers/streams.js +477 -0
- package/lib/managers/xfa-manager.d.ts +227 -0
- package/lib/managers/xfa-manager.js +539 -0
- package/lib/native-loader.d.ts +7 -0
- package/lib/native-loader.js +62 -0
- package/lib/native.d.ts +16 -0
- package/lib/native.js +69 -0
- package/lib/pdf-creator-manager.d.ts +200 -0
- package/lib/pdf-creator-manager.js +381 -0
- package/lib/properties.d.ts +79 -0
- package/lib/properties.js +454 -0
- package/lib/result-accessors-manager.d.ts +346 -0
- package/lib/result-accessors-manager.js +706 -0
- package/lib/thumbnail-manager.d.ts +121 -0
- package/lib/thumbnail-manager.js +205 -0
- package/lib/timestamp.d.ts +54 -0
- package/lib/timestamp.js +115 -0
- package/lib/tsa-client.d.ts +44 -0
- package/lib/tsa-client.js +67 -0
- package/lib/types/common.d.ts +189 -0
- package/lib/types/common.js +17 -0
- package/lib/types/document-types.d.ts +352 -0
- package/lib/types/document-types.js +82 -0
- package/lib/types/index.d.ts +5 -0
- package/lib/types/index.js +5 -0
- package/lib/types/manager-types.d.ts +179 -0
- package/lib/types/manager-types.js +100 -0
- package/lib/types/native-bindings.d.ts +439 -0
- package/lib/types/native-bindings.js +7 -0
- package/lib/workers/index.d.ts +6 -0
- package/lib/workers/index.js +5 -0
- package/lib/workers/pool.d.ts +64 -0
- package/lib/workers/pool.js +192 -0
- package/lib/workers/worker.d.ts +5 -0
- package/lib/workers/worker.js +99 -0
- package/package.json +79 -0
- package/prebuilds/darwin-arm64/pdf_oxide.node +0 -0
- package/prebuilds/darwin-x64/pdf_oxide.node +0 -0
- package/prebuilds/linux-arm64/pdf_oxide.node +0 -0
- package/prebuilds/linux-x64/pdf_oxide.node +0 -0
- package/prebuilds/win32-x64/pdf_oxide.node +0 -0
|
@@ -0,0 +1,582 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OcrManager - Canonical OCR Manager (merged from 3 implementations)
|
|
3
|
+
*
|
|
4
|
+
* Consolidates:
|
|
5
|
+
* - src/ocr-manager.ts (simple API with setLanguage, extractText, analyzePage)
|
|
6
|
+
* - src/managers/ocr-compliance-cache.ts OCRManager (engine lifecycle)
|
|
7
|
+
* - src/managers/ocr-manager-typed.ts OCRManager (full TypeScript, FFI-wired)
|
|
8
|
+
*
|
|
9
|
+
* Provides optical character recognition operations with complete type safety,
|
|
10
|
+
* proper error handling, and full FFI integration.
|
|
11
|
+
*/
|
|
12
|
+
import { promises as fs } from 'fs';
|
|
13
|
+
import { dirname } from 'path';
|
|
14
|
+
import { BaseManager, OcrLanguage, } from '../types/manager-types.js';
|
|
15
|
+
// Re-export types for convenience
|
|
16
|
+
export { OcrLanguage };
|
|
17
|
+
/**
|
|
18
|
+
* OCR detection modes for accuracy/speed tradeoff
|
|
19
|
+
*/
|
|
20
|
+
export var OcrDetectionMode;
|
|
21
|
+
(function (OcrDetectionMode) {
|
|
22
|
+
OcrDetectionMode["Accurate"] = "accurate";
|
|
23
|
+
OcrDetectionMode["Fast"] = "fast";
|
|
24
|
+
OcrDetectionMode["Balanced"] = "balanced";
|
|
25
|
+
})(OcrDetectionMode || (OcrDetectionMode = {}));
|
|
26
|
+
/**
 * OcrManager - OCR operations for a single document.
 *
 * Wraps the OCR-related calls exposed by the bound `document` object
 * (engine creation/destruction, page recognition, confidence, region
 * detection, preprocessing) and adds:
 * - event emission after each completed operation
 * - operation/error accounting via `recordOperation()` / `recordError()`
 * - result caching for the legacy helpers (extractText, analyzePage,
 *   analyzeDocument, extractSpans, isAvailable, getVersion)
 *
 * Calls on `document` use optional chaining, so when `document` is
 * null/undefined they resolve to the method's empty/false fallback.
 *
 * NOTE(review): `recordOperation`, `recordError`, `emit`, `getCached`,
 * `setCached`, `invalidateCache`, `cache` and `removeAllListeners` are not
 * defined in this file; presumably they come from BaseManager - confirm.
 */
export class OcrManager extends BaseManager {
    /**
     * @param document - Document object the OCR calls are delegated to.
     * @param options - Passed through unchanged to the base manager.
     */
    constructor(document, options) {
        super(document, options);
        this.ocrEngine = null;
        this.currentLanguage = OcrLanguage.ENGLISH;
        this.preprocessingType = 'auto';
        try {
            // NOTE(review): this file is an ES module (it uses import/export),
            // where a bare `require` is normally not defined. If so, this
            // always lands in the catch and `native` stays null, which makes
            // extractSpans()/getVersion()/isAvailable() return their
            // fallbacks. Confirm against the package's native loader.
            this.native = require('../../index.node');
        }
        catch {
            // Native bindings are optional; null means "not available".
            this.native = null;
        }
    }

    // ==========================================================================
    // Engine Lifecycle
    // ==========================================================================

    /**
     * Creates the OCR engine through `document.createOcrEngine` unless one
     * already exists.
     *
     * If an engine is already held, the arguments are ignored and `true` is
     * returned immediately. Emits `ocr-engine-initialized` on creation.
     *
     * @returns `true` when an engine is available after the call, else `false`.
     * @throws Rethrows any error from the document call after recording it.
     */
    async initializeEngine(detectionThreshold = 0.5, recognitionThreshold = 0.5, maxSideLen = 960, useGpu = false, gpuDeviceId = 0) {
        try {
            this.recordOperation();
            if (this.ocrEngine) {
                return true;
            }
            this.ocrEngine = await this.document?.createOcrEngine(detectionThreshold, recognitionThreshold, maxSideLen, useGpu, gpuDeviceId);
            if (!this.ocrEngine) {
                return false;
            }
            this.emit('ocr-engine-initialized', {
                useGpu,
                gpuDeviceId,
                detectionThreshold,
                recognitionThreshold,
            });
            return true;
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    /**
     * Destroys the held OCR engine, if any, and emits `ocr-engine-destroyed`.
     * Does nothing when no engine is held.
     *
     * @throws Rethrows any error from the document call after recording it;
     *         in that case the engine reference is left in place.
     */
    async destroyOcrEngine() {
        try {
            this.recordOperation();
            if (!this.ocrEngine) {
                return;
            }
            await this.document?.destroyOcrEngine(this.ocrEngine);
            this.ocrEngine = null;
            this.emit('ocr-engine-destroyed', { timestamp: Date.now() });
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    // ==========================================================================
    // Core Recognition
    // ==========================================================================

    /**
     * Asks the document whether a page needs OCR.
     *
     * @returns The document's answer, or `false` when it is falsy/unavailable.
     * @throws Rethrows any error from the document call after recording it.
     */
    async pageNeedsOcr(pageIndex) {
        try {
            this.recordOperation();
            const needsOcr = await this.document?.pageNeedsOcr(pageIndex);
            return needsOcr || false;
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    /**
     * Runs text recognition on one page with the held engine and emits
     * `page-recognized`.
     *
     * @returns The recognized text, or `''` when the document returns nothing.
     * @throws Error when the engine has not been initialized; also rethrows
     *         any error from the document call. Both are recorded first.
     */
    async recognizePage(pageIndex) {
        try {
            this.recordOperation();
            if (!this.ocrEngine) {
                throw new Error('OCR engine not initialized. Call initializeEngine() first.');
            }
            const text = await this.document?.recognizePage(pageIndex, this.ocrEngine);
            this.emit('page-recognized', {
                pageIndex,
                textLength: text?.length || 0,
                timestamp: Date.now(),
            });
            return text || '';
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    /**
     * Reads the OCR confidence for a page from the document.
     *
     * @returns The confidence, or `0` when no engine is held or the document
     *          returns a falsy value.
     * @throws Rethrows any error from the document call after recording it.
     */
    async getOcrConfidence(pageIndex) {
        try {
            this.recordOperation();
            if (!this.ocrEngine) {
                return 0;
            }
            const confidence = await this.document?.getOcrConfidence(pageIndex);
            return confidence || 0;
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    /**
     * Detects text regions on a page with the held engine.
     *
     * @returns The regions reported by the document, or `[]` when no engine is
     *          held or nothing is returned.
     * @throws Rethrows any error from the document call after recording it.
     */
    async detectTextRegions(pageIndex) {
        try {
            this.recordOperation();
            if (!this.ocrEngine) {
                return [];
            }
            const regions = await this.document?.detectTextRegions(pageIndex, this.ocrEngine);
            return regions || [];
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    // ==========================================================================
    // Language Configuration
    // ==========================================================================

    /**
     * Sets the recognition language on the held engine via the document.
     *
     * On success the manager's current language is updated (falling back to
     * `OcrLanguage.ENGLISH` for a falsy `language`) and `language-changed`
     * is emitted.
     *
     * @returns `true` when the document reported success, else `false`.
     * @throws Error when the engine has not been initialized; also rethrows
     *         any error from the document call. Both are recorded first.
     */
    async setOcrLanguage(language) {
        try {
            this.recordOperation();
            if (!this.ocrEngine) {
                throw new Error('OCR engine not initialized');
            }
            const accepted = await this.document?.setOcrLanguage(this.ocrEngine, language);
            if (accepted) {
                this.currentLanguage = language || OcrLanguage.ENGLISH;
                this.emit('language-changed', {
                    language,
                    timestamp: Date.now(),
                });
            }
            return !!accepted;
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    /**
     * Legacy synchronous language setter.
     *
     * Unlike setOcrLanguage(), this only updates the manager's own state: it
     * stores `language`, invalidates cache entries under `'ocr'` and emits
     * `languageChanged`. It does not call into the document or the engine.
     */
    setLanguage(language) {
        this.currentLanguage = language;
        this.invalidateCache('ocr');
        this.emit('languageChanged', language);
    }

    /**
     * @returns The language currently stored on the manager.
     */
    getLanguage() {
        return this.currentLanguage;
    }

    /**
     * Lists the available OCR languages.
     *
     * @returns The document's list, or every value of `OcrLanguage` when the
     *          document returns a falsy value.
     * @throws Rethrows any error from the document call after recording it.
     */
    async getAvailableLanguages() {
        try {
            this.recordOperation();
            const reported = await this.document?.getAvailableLanguages();
            return reported || Object.values(OcrLanguage);
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    // ==========================================================================
    // Processing & Export
    // ==========================================================================

    /**
     * Asks the document to preprocess a page before OCR.
     *
     * The chosen `preprocessingType` is remembered and `page-preprocessed`
     * is emitted whether or not the document reports success.
     *
     * @returns `true` when the document returned a truthy result.
     * @throws Rethrows any error from the document call after recording it.
     */
    async preprocessPage(pageIndex, preprocessingType = 'auto') {
        try {
            this.recordOperation();
            const outcome = await this.document?.preprocessPage(pageIndex, preprocessingType);
            this.preprocessingType = preprocessingType;
            this.emit('page-preprocessed', {
                pageIndex,
                type: preprocessingType,
                timestamp: Date.now(),
            });
            return !!outcome;
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    /**
     * Recognizes a page and writes the text to `filePath`, creating parent
     * directories as needed. Emits `text-exported`.
     *
     * @param format - `'json'` writes `{ pageIndex, text, timestamp }`,
     *                 `'xml'` writes one `<line>` element per text line,
     *                 anything else writes the raw text.
     * @returns `true` once the file has been written.
     * @throws Rethrows recognition or file-system errors after recording them.
     */
    async exportOcrText(pageIndex, filePath, format = 'txt') {
        try {
            this.recordOperation();
            const text = await this.recognizePage(pageIndex);
            await fs.mkdir(dirname(filePath), { recursive: true });
            let content;
            switch (format) {
                case 'json':
                    content = JSON.stringify({ pageIndex, text, timestamp: Date.now() }, null, 2);
                    break;
                case 'xml': {
                    // OCR text is arbitrary; escape it so characters such as
                    // '&' or '<' cannot produce malformed XML.
                    const lineElements = text
                        .split('\n')
                        .map((line) => ` <line>${escapeXml(line)}</line>`)
                        .join('\n');
                    content = `<?xml version="1.0"?>\n<page index="${escapeXml(String(pageIndex))}">\n${lineElements}\n</page>`;
                    break;
                }
                default:
                    content = text;
            }
            await fs.writeFile(filePath, content, 'utf8');
            this.emit('text-exported', {
                pageIndex,
                filePath,
                format,
                // Length in UTF-16 code units, not bytes on disk.
                size: content.length,
                timestamp: Date.now(),
            });
            return true;
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    // ==========================================================================
    // Statistics & Batch
    // ==========================================================================

    /**
     * Collects recognized text, confidence and region count for one page.
     *
     * @returns `{ pageIndex, text, confidence, regionCount }`.
     * @throws Rethrows any error from the underlying calls after recording it.
     */
    async getOcrStatistics(pageIndex) {
        try {
            this.recordOperation();
            const text = await this.recognizePage(pageIndex);
            const confidence = await this.getOcrConfidence(pageIndex);
            const regions = await this.detectTextRegions(pageIndex);
            return {
                pageIndex,
                text,
                confidence,
                regionCount: regions.length,
            };
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    /**
     * Recognizes every page from `startPage` to `endPage` (inclusive), one
     * after another, and emits `batch-recognized`.
     *
     * The first failing page aborts the batch.
     *
     * @returns Map of page index to recognized text.
     * @throws Rethrows the first recognition error after recording it.
     */
    async batchRecognizePages(startPage, endPage) {
        try {
            this.recordOperation();
            const textByPage = new Map();
            let totalCharacters = 0;
            for (let page = startPage; page <= endPage; page++) {
                const text = await this.recognizePage(page);
                textByPage.set(page, text);
                totalCharacters += text.length;
            }
            this.emit('batch-recognized', {
                startPage,
                endPage,
                // Pages actually recognized; never negative for an inverted range.
                pageCount: textByPage.size,
                totalCharacters,
                timestamp: Date.now(),
            });
            return textByPage;
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    /**
     * Runs OCR over a page range and aggregates statistics.
     *
     * Pages that fail are tolerated: they are counted as failed and excluded
     * from the confidence average, so one bad page neither aborts the range
     * nor drags the average towards zero. The individual errors are already
     * recorded by the per-page methods that raised them.
     *
     * Emits `page-range-extracted` (including `failedPages`) and stores the
     * result in the cache under `ocr-batch:<start>-<end>`.
     *
     * @param skipNonScanned - When true, pages for which pageNeedsOcr() is
     *                         false are skipped and counted in `skippedPages`.
     * @returns `{ startPage, endPage, totalPages, totalSpans,
     *          averageConfidence, skippedPages }`.
     * @throws Error when the engine has not been initialized.
     */
    async extractPageRange(startPage, endPage, skipNonScanned = true) {
        try {
            this.recordOperation();
            if (!this.ocrEngine) {
                throw new Error('OCR engine not initialized');
            }
            let totalSpans = 0;
            let confidenceSum = 0;
            let skippedPages = 0;
            let recognizedPages = 0;
            let failedPages = 0;
            for (let pageIdx = startPage; pageIdx <= endPage; pageIdx++) {
                try {
                    if (skipNonScanned && !(await this.pageNeedsOcr(pageIdx))) {
                        skippedPages++;
                        continue;
                    }
                    const text = await this.recognizePage(pageIdx);
                    const confidence = await this.getOcrConfidence(pageIdx);
                    const regions = await this.detectTextRegions(pageIdx);
                    // A page with text but no detected regions still counts as one span.
                    totalSpans += Math.max(regions.length, text ? 1 : 0);
                    confidenceSum += confidence;
                    recognizedPages++;
                }
                catch {
                    // Best effort: keep going, but remember the failure so it
                    // does not count as a processed page.
                    failedPages++;
                }
            }
            const averageConfidence = recognizedPages > 0 ? confidenceSum / recognizedPages : 0;
            const result = {
                startPage,
                endPage,
                // Clamp so an inverted range reports 0 rather than a negative count.
                totalPages: Math.max(0, endPage - startPage + 1),
                totalSpans,
                averageConfidence,
                skippedPages,
            };
            this.emit('page-range-extracted', {
                ...result,
                failedPages,
                timestamp: Date.now(),
            });
            this.setCached(`ocr-batch:${startPage}-${endPage}`, result);
            return result;
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    // ==========================================================================
    // Engine Status & Configuration
    // ==========================================================================

    /**
     * Reports the engine status.
     *
     * @returns `'not_initialized'` when no engine is held; otherwise the
     *          document's status, or `'unknown'` when that is falsy.
     * @throws Rethrows any error from the document call after recording it.
     */
    async getEngineStatus() {
        try {
            this.recordOperation();
            if (!this.ocrEngine) {
                return 'not_initialized';
            }
            const status = await this.document?.getEngineStatus(this.ocrEngine);
            return status || 'unknown';
        }
        catch (error) {
            this.recordError(toError(error));
            throw error;
        }
    }

    /**
     * @returns Snapshot of the manager's own settings: current language,
     *          last preprocessing type and whether an engine is held.
     */
    getConfiguration() {
        return {
            language: this.currentLanguage,
            preprocessingType: this.preprocessingType,
            engineInitialized: !!this.ocrEngine,
        };
    }

    // ==========================================================================
    // Legacy helpers (cached)
    // ==========================================================================

    /**
     * Returns the page text via `document.extractText`, cached per page and
     * language. Emits `textExtracted` on a cache miss.
     *
     * This does not use the OCR engine. `config` is accepted for
     * compatibility and is not used.
     *
     * @returns The extracted text, or `''` when the document has no
     *          `extractText` or returns a falsy value.
     */
    async extractText(pageIndex, config) {
        const cacheKey = `ocr:text:${pageIndex}:${this.currentLanguage}`;
        const cached = this.getCached(cacheKey);
        if (cached !== undefined) {
            return cached;
        }
        const text = this.document?.extractText
            ? this.document.extractText(pageIndex) || ''
            : '';
        this.setCached(cacheKey, text);
        this.emit('textExtracted', pageIndex, text.length);
        return text;
    }

    /**
     * Heuristically analyzes a page from its extractable text, cached per
     * page and language. Emits `pageAnalyzed` on a cache miss.
     *
     * A page is flagged as needing OCR when its trimmed text is shorter than
     * 10 characters; confidence is a fixed 0.0 (needs OCR) or 0.95.
     * `spanCount` is the number of whitespace-separated tokens (0 for an
     * empty page). `config` is accepted for compatibility and is not used.
     *
     * @returns `{ pageIndex, needsOcr, confidence, spanCount, text }`.
     */
    async analyzePage(pageIndex, config) {
        const cacheKey = `ocr:analysis:${pageIndex}:${this.currentLanguage}`;
        const cached = this.getCached(cacheKey);
        if (cached !== undefined) {
            return cached;
        }
        let text = '';
        let needsOcr = false;
        if (this.document?.extractText) {
            text = this.document.extractText(pageIndex) || '';
            needsOcr = !text || text.trim().length < 10;
        }
        const trimmed = text.trim();
        const result = {
            pageIndex,
            needsOcr,
            confidence: needsOcr ? 0.0 : 0.95,
            // ''.split(' ') yields [''], so guard the empty case explicitly.
            spanCount: trimmed ? trimmed.split(/\s+/).length : 0,
            text,
        };
        this.setCached(cacheKey, result);
        this.emit('pageAnalyzed', pageIndex, result);
        return result;
    }

    /**
     * Runs analyzePage() over every page (`document.pageCount`), cached per
     * language. Emits `pageProcessed` after each page and `documentAnalyzed`
     * at the end.
     *
     * @returns Array of per-page analysis results, in page order.
     */
    async analyzeDocument(config) {
        const cacheKey = `ocr:document:${this.currentLanguage}`;
        const cached = this.getCached(cacheKey);
        if (cached !== undefined) {
            return cached;
        }
        const analyses = [];
        const pageCount = this.document?.pageCount || 0;
        for (let page = 0; page < pageCount; page++) {
            analyses.push(await this.analyzePage(page, config));
            this.emit('pageProcessed', page + 1, pageCount);
        }
        this.setCached(cacheKey, analyses);
        this.emit('documentAnalyzed', analyses.length);
        return analyses;
    }

    /**
     * Returns the spans for a page from the native binding's
     * `extract_spans`, cached per page and language. Emits `spansExtracted`
     * on a cache miss.
     *
     * Each entry returned by the binding is parsed as JSON. Any failure
     * (call or parse) yields an empty list. `config` is accepted for
     * compatibility and is not used.
     *
     * @returns Array of parsed spans; `[]` when the binding is unavailable.
     */
    async extractSpans(pageIndex, config) {
        const cacheKey = `ocr:spans:${pageIndex}:${this.currentLanguage}`;
        const cached = this.getCached(cacheKey);
        if (cached !== undefined) {
            return cached;
        }
        let spans = [];
        if (this.native?.extract_spans) {
            try {
                const serialized = this.native.extract_spans(pageIndex) ?? [];
                spans = serialized.length > 0 ? serialized.map((json) => JSON.parse(json)) : [];
            }
            catch {
                spans = [];
            }
        }
        this.setCached(cacheKey, spans);
        this.emit('spansExtracted', { page: pageIndex, count: spans.length });
        return spans;
    }

    /**
     * @returns `true` when the native binding was loaded by the constructor.
     *          The answer is cached.
     */
    async isAvailable() {
        const cacheKey = 'ocr:available';
        const cached = this.getCached(cacheKey);
        if (cached !== undefined) {
            return cached;
        }
        const available = Boolean(this.native);
        this.setCached(cacheKey, available);
        return available;
    }

    /**
     * @returns The version reported by the native binding's
     *          `get_ocr_version`, or `'0.0.0'` when it is unavailable, throws
     *          or returns null/undefined. The answer is cached.
     */
    async getVersion() {
        const cacheKey = 'ocr:version';
        const cached = this.getCached(cacheKey);
        if (cached !== undefined) {
            return cached;
        }
        const fallback = '0.0.0';
        let version = fallback;
        if (this.native?.get_ocr_version) {
            try {
                version = this.native.get_ocr_version() ?? fallback;
            }
            catch {
                version = fallback;
            }
        }
        this.setCached(cacheKey, version);
        return version;
    }

    // ==========================================================================
    // Cache Operations
    // ==========================================================================

    /**
     * Invalidates the whole result cache and emits `cacheCleared`.
     */
    clearCache() {
        this.invalidateCache();
        this.emit('cacheCleared');
    }

    /**
     * @returns `{ cacheSize, entries }` - entry count and the list of keys
     *          currently in `this.cache`.
     */
    getCacheStats() {
        return {
            cacheSize: this.cache.size,
            entries: Array.from(this.cache.keys()),
        };
    }

    // ==========================================================================
    // Cleanup
    // ==========================================================================

    /**
     * Tears the manager down: destroys the engine, invalidates the cache,
     * removes all listeners and clears `initialized`.
     *
     * Never throws; failures are logged with console.error. Cache and
     * listener cleanup runs even when destroying the engine fails.
     */
    async destroy() {
        try {
            await this.destroyOcrEngine();
        }
        catch (error) {
            console.error('Error during OCR manager cleanup:', error);
        }
        try {
            this.invalidateCache();
            this.removeAllListeners();
            this.initialized = false;
        }
        catch (error) {
            console.error('Error during OCR manager cleanup:', error);
        }
    }
}

/** Normalizes any thrown value to an Error instance for recordError(). */
function toError(value) {
    return value instanceof Error ? value : new Error(String(value));
}

/** Escapes the five XML special characters so `value` is safe as text or attribute content. */
function escapeXml(value) {
    const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&apos;' };
    return value.replace(/[&<>"']/g, (ch) => entities[ch]);
}

/** @deprecated Use OcrManager instead */
export const OCRManager = OcrManager;
export default OcrManager;
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OptimizationManager - PDF Optimization Operations
|
|
3
|
+
*
|
|
4
|
+
* Provides document optimization capabilities including:
|
|
5
|
+
* - Font subsetting
|
|
6
|
+
* - Image downsampling
|
|
7
|
+
* - Object deduplication
|
|
8
|
+
* - Full optimization pipeline
|
|
9
|
+
*
|
|
10
|
+
* @since 1.0.0
|
|
11
|
+
*/
|
|
12
|
+
import { EventEmitter } from 'events';
|
|
13
|
+
/**
 * Outcome reported by each optimization method of OptimizationManager.
 */
export interface OptimizationResult {
    /** True when the optimization completed successfully. */
    readonly success: boolean;
    /** Number of bytes removed by the operation. */
    readonly bytesSaved: number;
    /** Document size in bytes before optimization. */
    readonly originalSize: number;
    /** Document size in bytes after optimization. */
    readonly optimizedSize: number;
    /** Compression ratio, in the range 0.0 - 1.0. */
    readonly compressionRatio: number;
}
|
|
28
|
+
/**
 * Manager for PDF optimization operations.
 *
 * Exposes font subsetting, image downsampling, object deduplication and a
 * combined pipeline, each resolving to an {@link OptimizationResult}.
 *
 * @example
 * ```typescript
 * const optimizer = new OptimizationManager(document);
 *
 * // Remove unused glyphs from embedded fonts
 * const fontResult = await optimizer.subsetFonts();
 * console.log(`Font subsetting saved ${fontResult.bytesSaved} bytes`);
 *
 * // Reduce the resolution of large images
 * const imageResult = await optimizer.downsampleImages(150, 80);
 *
 * // Everything in one call
 * const fullResult = await optimizer.optimizeFull(150, 80);
 * console.log(`Total savings: ${fullResult.bytesSaved} bytes`);
 * ```
 */
export declare class OptimizationManager extends EventEmitter {
    private document;
    private native;
    constructor(document: any);
    /**
     * Subsets every embedded font in the document.
     *
     * Unused glyphs are dropped from embedded fonts, which shrinks the file
     * while keeping the characters that are actually used visually intact.
     *
     * @returns Optimization result with bytes saved
     * @throws OptimizationException if the operation fails
     */
    subsetFonts(): Promise<OptimizationResult>;
    /**
     * Downsamples the document's images to reduce file size.
     *
     * @param dpi - Target resolution in dots per inch (default: 150)
     * @param quality - JPEG quality used when recompressing (1-100, default: 80)
     * @returns Optimization result with bytes saved
     * @throws OptimizationException if the operation fails
     */
    downsampleImages(dpi?: number, quality?: number): Promise<OptimizationResult>;
    /**
     * Deduplicates identical objects in the document.
     *
     * Fonts, images and other resources that occur more than once are
     * detected and merged.
     *
     * @returns Optimization result with bytes saved
     * @throws OptimizationException if the operation fails
     */
    deduplicate(): Promise<OptimizationResult>;
    /**
     * Runs the full optimization pipeline.
     *
     * Font subsetting, image downsampling and object deduplication are
     * combined into one operation for maximum size reduction.
     *
     * @param dpi - Target image resolution in dots per inch (default: 150)
     * @param quality - JPEG quality used when recompressing (1-100, default: 80)
     * @returns Optimization result with total bytes saved
     * @throws OptimizationException if the operation fails
     */
    optimizeFull(dpi?: number, quality?: number): Promise<OptimizationResult>;
    private parseOptimizationResult;
    private freeOptimizationResult;
    /**
     * Releases the resources held by this manager.
     */
    destroy(): void;
}
export default OptimizationManager;
|