pdf-oxide 0.3.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +218 -0
- package/binding.gyp +35 -0
- package/package.json +78 -0
- package/src/builders/annotation-builder.ts +367 -0
- package/src/builders/conversion-options-builder.ts +257 -0
- package/src/builders/index.ts +12 -0
- package/src/builders/metadata-builder.ts +317 -0
- package/src/builders/pdf-builder.ts +386 -0
- package/src/builders/search-options-builder.ts +151 -0
- package/src/document-editor-manager.ts +318 -0
- package/src/errors.ts +1629 -0
- package/src/form-field-manager.ts +666 -0
- package/src/hybrid-ml-manager.ts +283 -0
- package/src/index.ts +453 -0
- package/src/managers/accessibility-manager.ts +338 -0
- package/src/managers/annotation-manager.ts +439 -0
- package/src/managers/barcode-manager.ts +235 -0
- package/src/managers/batch-manager.ts +533 -0
- package/src/managers/cache-manager.ts +486 -0
- package/src/managers/compliance-manager.ts +375 -0
- package/src/managers/content-manager.ts +339 -0
- package/src/managers/document-utility-manager.ts +922 -0
- package/src/managers/dom-pdf-creator.ts +365 -0
- package/src/managers/editing-manager.ts +514 -0
- package/src/managers/enterprise-manager.ts +478 -0
- package/src/managers/extended-managers.ts +437 -0
- package/src/managers/extraction-manager.ts +583 -0
- package/src/managers/final-utilities.ts +429 -0
- package/src/managers/hybrid-ml-advanced.ts +479 -0
- package/src/managers/index.ts +239 -0
- package/src/managers/layer-manager.ts +500 -0
- package/src/managers/metadata-manager.ts +303 -0
- package/src/managers/ocr-manager.ts +756 -0
- package/src/managers/optimization-manager.ts +262 -0
- package/src/managers/outline-manager.ts +196 -0
- package/src/managers/page-manager.ts +289 -0
- package/src/managers/pattern-detection.ts +440 -0
- package/src/managers/rendering-manager.ts +863 -0
- package/src/managers/search-manager.ts +385 -0
- package/src/managers/security-manager.ts +345 -0
- package/src/managers/signature-manager.ts +1664 -0
- package/src/managers/streams.ts +618 -0
- package/src/managers/xfa-manager.ts +500 -0
- package/src/pdf-creator-manager.ts +494 -0
- package/src/properties.ts +522 -0
- package/src/result-accessors-manager.ts +867 -0
- package/src/tests/advanced-features.test.ts +414 -0
- package/src/tests/advanced.test.ts +266 -0
- package/src/tests/extended-managers.test.ts +316 -0
- package/src/tests/final-utilities.test.ts +455 -0
- package/src/tests/foundation.test.ts +315 -0
- package/src/tests/high-demand.test.ts +257 -0
- package/src/tests/specialized.test.ts +97 -0
- package/src/thumbnail-manager.ts +272 -0
- package/src/types/common.ts +142 -0
- package/src/types/document-types.ts +457 -0
- package/src/types/index.ts +6 -0
- package/src/types/manager-types.ts +284 -0
- package/src/types/native-bindings.ts +517 -0
- package/src/workers/index.ts +7 -0
- package/src/workers/pool.ts +274 -0
- package/src/workers/worker.ts +131 -0
|
@@ -0,0 +1,756 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OcrManager - Canonical OCR Manager (merged from 3 implementations)
|
|
3
|
+
*
|
|
4
|
+
* Consolidates:
|
|
5
|
+
* - src/ocr-manager.ts (simple API with setLanguage, extractText, analyzePage)
|
|
6
|
+
* - src/managers/ocr-compliance-cache.ts OCRManager (engine lifecycle)
|
|
7
|
+
* - src/managers/ocr-manager-typed.ts OCRManager (full TypeScript, FFI-wired)
|
|
8
|
+
*
|
|
9
|
+
* Provides optical character recognition operations with complete type safety,
|
|
10
|
+
* proper error handling, and full FFI integration.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import {
|
|
14
|
+
BaseManager,
|
|
15
|
+
OcrLanguage,
|
|
16
|
+
OcrResult,
|
|
17
|
+
OcrBatchResult,
|
|
18
|
+
TextRegion,
|
|
19
|
+
PdfDocumentHandle,
|
|
20
|
+
ManagerOptions,
|
|
21
|
+
} from '../types/manager-types.js';
|
|
22
|
+
import { promises as fs } from 'fs';
|
|
23
|
+
import { dirname } from 'path';
|
|
24
|
+
|
|
25
|
+
// Re-export types for convenience
|
|
26
|
+
export { OcrLanguage };
|
|
27
|
+
export type { OcrResult, OcrBatchResult, TextRegion };
|
|
28
|
+
|
|
29
|
+
/**
 * Presets that trade OCR detection accuracy against speed.
 *
 * Each member is a string enum value equal to its lower-cased name.
 */
export enum OcrDetectionMode {
  /** Preset named for accuracy. */
  Accurate = 'accurate',
  /** Preset named for speed. */
  Fast = 'fast',
  /** Preset named for a middle ground between the other two. */
  Balanced = 'balanced',
}
|
|
37
|
+
|
|
38
|
+
/**
 * Optional settings bag accepted by the OCR convenience methods.
 *
 * Every field is optional. The threshold, size and GPU fields share their
 * names with the positional parameters of `OcrManager.initializeEngine`.
 *
 * NOTE(review): the methods in this file that accept an `OcrConfig`
 * (`extractText`, `analyzePage`, `analyzeDocument`, `extractSpans`) do not
 * read any of its fields — confirm whether that is intentional.
 */
export interface OcrConfig {
  // --- recognition -------------------------------------------------------
  /** Language to recognize. */
  language?: OcrLanguage;
  /** Accuracy/speed preset. */
  detectionMode?: OcrDetectionMode;

  // --- engine tuning -----------------------------------------------------
  /** Detection threshold; presumably a 0..1 score — TODO confirm. */
  detectionThreshold?: number;
  /** Recognition threshold; presumably a 0..1 score — TODO confirm. */
  recognitionThreshold?: number;
  /** Maximum side length; presumably in pixels — TODO confirm. */
  maxSideLen?: number;

  // --- hardware ----------------------------------------------------------
  /** Whether to request GPU execution. */
  useGpu?: boolean;
  /** GPU device identifier. */
  gpuDeviceId?: number;
}
|
|
50
|
+
|
|
51
|
+
/**
 * One recognized run of text together with its bounding box and score.
 *
 * Coordinate origin and units are not established in this file —
 * TODO confirm against the native binding.
 */
export interface OcrSpan {
  /** Recognized text of the span. */
  text: string;
  /** Confidence reported for this span. */
  confidence: number;
  /** Number of characters in the span. */
  charCount: number;

  /** Horizontal position of the bounding box. */
  x: number;
  /** Vertical position of the bounding box. */
  y: number;
  /** Bounding-box width. */
  width: number;
  /** Bounding-box height. */
  height: number;
}
|
|
63
|
+
|
|
64
|
+
/**
 * Per-page summary produced by `OcrManager.analyzePage`.
 */
export interface OcrPageAnalysis {
  /** Index of the analyzed page, as passed by the caller. */
  pageIndex: number;
  /** Text obtained for the page (empty string when none was obtained). */
  text: string;
  /** Count of text pieces found on the page. */
  spanCount: number;
  /** Whether the page looks like it needs OCR. */
  needsOcr: boolean;
  /** Confidence value associated with the analysis. */
  confidence: number;
}
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Canonical OcrManager - Comprehensive OCR with full TypeScript support
|
|
77
|
+
*
|
|
78
|
+
* Features:
|
|
79
|
+
* - Full text recognition with confidence scoring
|
|
80
|
+
* - Batch page processing with skip optimization
|
|
81
|
+
* - Text region detection with coordinates
|
|
82
|
+
* - Multi-language support
|
|
83
|
+
* - Comprehensive event emission
|
|
84
|
+
* - Automatic resource cleanup
|
|
85
|
+
* - Legacy API compatibility (setLanguage, extractText, analyzePage, etc.)
|
|
86
|
+
*/
|
|
87
|
+
export class OcrManager extends BaseManager<PdfDocumentHandle> {
|
|
88
|
+
private ocrEngine: unknown | null = null;
|
|
89
|
+
private currentLanguage: OcrLanguage = OcrLanguage.ENGLISH;
|
|
90
|
+
private preprocessingType: string = 'auto';
|
|
91
|
+
private native: any;
|
|
92
|
+
|
|
93
|
+
/**
 * Creates a manager bound to `document` and attempts to load the native addon.
 *
 * A failure to load `../../index.node` is tolerated: `native` is left as
 * `null` and the methods that consult it fall back to their defaults.
 */
constructor(document: PdfDocumentHandle, options?: ManagerOptions) {
  super(document, options);

  let binding: any = null;
  try {
    binding = require('../../index.node');
  } catch {
    binding = null;
  }
  this.native = binding;
}
|
|
101
|
+
|
|
102
|
+
// ==========================================================================
|
|
103
|
+
// Engine Lifecycle (from typed version)
|
|
104
|
+
// ==========================================================================
|
|
105
|
+
|
|
106
|
+
/**
 * Creates the OCR engine through the document handle, unless one already exists.
 *
 * Emits `ocr-engine-initialized` when a new engine is created.
 *
 * @param detectionThreshold Forwarded to `createOcrEngine` (default 0.5).
 * @param recognitionThreshold Forwarded to `createOcrEngine` (default 0.5).
 * @param maxSideLen Forwarded to `createOcrEngine` (default 960).
 * @param useGpu Forwarded to `createOcrEngine` (default false).
 * @param gpuDeviceId Forwarded to `createOcrEngine` (default 0).
 * @returns `true` when an engine is available after the call, `false` when
 *   the document produced no engine.
 * @throws Rethrows any error raised while creating the engine, after recording it.
 */
async initializeEngine(
  detectionThreshold: number = 0.5,
  recognitionThreshold: number = 0.5,
  maxSideLen: number = 960,
  useGpu: boolean = false,
  gpuDeviceId: number = 0
): Promise<boolean> {
  try {
    this.recordOperation();

    // Repeated calls reuse the engine created earlier; the new arguments
    // are ignored in that case.
    if (!this.ocrEngine) {
      const doc = this.document as any;
      this.ocrEngine = await doc?.createOcrEngine(
        detectionThreshold,
        recognitionThreshold,
        maxSideLen,
        useGpu,
        gpuDeviceId
      );

      if (!this.ocrEngine) {
        return false;
      }

      const details = { useGpu, gpuDeviceId, detectionThreshold, recognitionThreshold };
      this.emit('ocr-engine-initialized', details);
    }

    return true;
  } catch (error) {
    const failure = error instanceof Error ? error : new Error(String(error));
    this.recordError(failure);
    throw error;
  }
}
|
|
147
|
+
|
|
148
|
+
/**
 * Releases the OCR engine through the document handle, if one exists.
 *
 * Clears the stored engine reference and emits `ocr-engine-destroyed` once
 * the document call has completed. Does nothing when no engine is held.
 *
 * @throws Rethrows any error raised by the document, after recording it; the
 *   engine reference is left in place in that case.
 */
async destroyOcrEngine(): Promise<void> {
  try {
    this.recordOperation();

    const engine = this.ocrEngine;
    if (!engine) {
      return;
    }

    await (this.document as any)?.destroyOcrEngine(engine);
    this.ocrEngine = null;
    this.emit('ocr-engine-destroyed', { timestamp: Date.now() });
  } catch (error) {
    const failure = error instanceof Error ? error : new Error(String(error));
    this.recordError(failure);
    throw error;
  }
}
|
|
165
|
+
|
|
166
|
+
// ==========================================================================
|
|
167
|
+
// Core Recognition (from typed version)
|
|
168
|
+
// ==========================================================================
|
|
169
|
+
|
|
170
|
+
/**
 * Asks the document whether the given page needs OCR.
 *
 * @param pageIndex Page to query.
 * @returns The document's answer, or `false` when the answer is falsy or no
 *   document is attached.
 * @throws Rethrows any error raised by the document, after recording it.
 */
async pageNeedsOcr(pageIndex: number): Promise<boolean> {
  try {
    this.recordOperation();

    const answer = await (this.document as any)?.pageNeedsOcr(pageIndex);
    return answer || false;
  } catch (error) {
    const failure = error instanceof Error ? error : new Error(String(error));
    this.recordError(failure);
    throw error;
  }
}
|
|
182
|
+
|
|
183
|
+
/**
 * Runs OCR on one page using the initialized engine.
 *
 * Emits `page-recognized` with the page index, text length and a timestamp.
 *
 * @param pageIndex Page to recognize.
 * @returns The recognized text, or an empty string when nothing was returned.
 * @throws Error when the engine has not been initialized; also rethrows any
 *   error raised by the document. Both are recorded first.
 */
async recognizePage(pageIndex: number): Promise<string> {
  try {
    this.recordOperation();

    if (!this.ocrEngine) {
      throw new Error('OCR engine not initialized. Call initializeEngine() first.');
    }

    const doc = this.document as any;
    const recognized = await doc?.recognizePage(pageIndex, this.ocrEngine);

    const textLength = recognized?.length || 0;
    this.emit('page-recognized', { pageIndex, textLength, timestamp: Date.now() });

    return recognized || '';
  } catch (error) {
    const failure = error instanceof Error ? error : new Error(String(error));
    this.recordError(failure);
    throw error;
  }
}
|
|
211
|
+
|
|
212
|
+
/**
 * Fetches the OCR confidence the document reports for a page.
 *
 * @param pageIndex Page to query.
 * @returns The reported confidence, or `0` when the engine is not
 *   initialized or the document reports a falsy value.
 * @throws Rethrows any error raised by the document, after recording it.
 */
async getOcrConfidence(pageIndex: number): Promise<number> {
  try {
    this.recordOperation();

    // Without an engine there is nothing to score; this path does not throw.
    if (!this.ocrEngine) {
      return 0;
    }

    const score = await (this.document as any)?.getOcrConfidence(pageIndex);
    return score || 0;
  } catch (error) {
    const failure = error instanceof Error ? error : new Error(String(error));
    this.recordError(failure);
    throw error;
  }
}
|
|
229
|
+
|
|
230
|
+
/**
 * Asks the document for the text regions detected on a page.
 *
 * @param pageIndex Page to inspect.
 * @returns The regions reported by the document, or an empty array when the
 *   engine is not initialized or nothing was reported.
 * @throws Rethrows any error raised by the document, after recording it.
 */
async detectTextRegions(pageIndex: number): Promise<TextRegion[]> {
  try {
    this.recordOperation();

    // Without an engine this returns an empty list rather than throwing.
    if (!this.ocrEngine) {
      return [];
    }

    const doc = this.document as any;
    const found = await doc?.detectTextRegions(pageIndex, this.ocrEngine);
    return found || [];
  } catch (error) {
    const failure = error instanceof Error ? error : new Error(String(error));
    this.recordError(failure);
    throw error;
  }
}
|
|
252
|
+
|
|
253
|
+
// ==========================================================================
|
|
254
|
+
// Language Configuration (from typed + root versions)
|
|
255
|
+
// ==========================================================================
|
|
256
|
+
|
|
257
|
+
/**
 * Applies a recognition language to the initialized engine via the document.
 *
 * On success the locally tracked language is updated and `language-changed`
 * is emitted; on a falsy result nothing is changed.
 *
 * @param language Language to apply.
 * @returns `true` when the document reported success, otherwise `false`.
 * @throws Error when the engine has not been initialized; also rethrows any
 *   error raised by the document. Both are recorded first.
 */
async setOcrLanguage(language: OcrLanguage | string): Promise<boolean> {
  try {
    this.recordOperation();

    if (!this.ocrEngine) {
      throw new Error('OCR engine not initialized');
    }

    const doc = this.document as any;
    const applied = await doc?.setOcrLanguage(this.ocrEngine, language);
    if (!applied) {
      return false;
    }

    // An empty language string falls back to English for local tracking.
    this.currentLanguage = (language as OcrLanguage) || OcrLanguage.ENGLISH;
    this.emit('language-changed', { language, timestamp: Date.now() });
    return true;
  } catch (error) {
    const failure = error instanceof Error ? error : new Error(String(error));
    this.recordError(failure);
    throw error;
  }
}
|
|
287
|
+
|
|
288
|
+
/**
 * Updates the locally tracked OCR language (legacy root-level API).
 *
 * This only changes manager state: it stores the language, calls
 * `invalidateCache('ocr')` and emits `languageChanged`. It does not call the
 * document or the engine; use `setOcrLanguage` for that.
 *
 * @param language Language to track for subsequent cache keys.
 */
setLanguage(language: OcrLanguage): void {
  this.currentLanguage = language;

  this.invalidateCache('ocr');

  this.emit('languageChanged', language);
}
|
|
297
|
+
|
|
298
|
+
/**
 * Returns the language this manager currently tracks (legacy root-level API).
 *
 * @returns The value last stored by `setLanguage` or a successful
 *   `setOcrLanguage`; `OcrLanguage.ENGLISH` until then.
 */
getLanguage(): OcrLanguage {
  const { currentLanguage } = this;
  return currentLanguage;
}
|
|
305
|
+
|
|
306
|
+
/**
 * Lists the OCR languages the document reports as available.
 *
 * @returns The document's list, or every `OcrLanguage` enum value when the
 *   document reports nothing.
 * @throws Rethrows any error raised by the document, after recording it.
 */
async getAvailableLanguages(): Promise<OcrLanguage[]> {
  try {
    this.recordOperation();

    const reported = await (this.document as any)?.getAvailableLanguages();
    return reported || Object.values(OcrLanguage);
  } catch (error) {
    const failure = error instanceof Error ? error : new Error(String(error));
    this.recordError(failure);
    throw error;
  }
}
|
|
323
|
+
|
|
324
|
+
// ==========================================================================
|
|
325
|
+
// Processing & Export (from typed version)
|
|
326
|
+
// ==========================================================================
|
|
327
|
+
|
|
328
|
+
/**
 * Asks the document to preprocess a page ahead of OCR.
 *
 * NOTE(review): the stored preprocessing type is updated and
 * `page-preprocessed` is emitted even when the document reports a falsy
 * result — confirm whether that is intended.
 *
 * @param pageIndex Page to preprocess.
 * @param preprocessingType Preprocessing to request (default `'auto'`).
 * @returns `true` when the document reported a truthy result.
 * @throws Rethrows any error raised by the document, after recording it.
 */
async preprocessPage(
  pageIndex: number,
  preprocessingType: string = 'auto'
): Promise<boolean> {
  try {
    this.recordOperation();

    const doc = this.document as any;
    const outcome = await doc?.preprocessPage(pageIndex, preprocessingType);

    this.preprocessingType = preprocessingType;
    const payload = { pageIndex, type: preprocessingType, timestamp: Date.now() };
    this.emit('page-preprocessed', payload);

    return Boolean(outcome);
  } catch (error) {
    const failure = error instanceof Error ? error : new Error(String(error));
    this.recordError(failure);
    throw error;
  }
}
|
|
356
|
+
|
|
357
|
+
/**
 * Recognizes one page and writes the text to a file.
 *
 * The parent directory of `filePath` is created if needed. Emits
 * `text-exported` after the file has been written.
 *
 * Formats:
 * - `txt`: the recognized text as-is.
 * - `json`: `{ pageIndex, text, timestamp }`, pretty-printed.
 * - `xml`: one `<line>` element per text line inside a `<page>` element.
 *   Markup characters in the text are escaped so the output stays well-formed.
 *
 * @param pageIndex Page to recognize.
 * @param filePath Destination file path.
 * @param format Output format (default `'txt'`).
 * @returns `true` once the file has been written.
 * @throws Rethrows recognition and file-system errors, after recording them.
 */
async exportOcrText(
  pageIndex: number,
  filePath: string,
  format: 'txt' | 'json' | 'xml' = 'txt'
): Promise<boolean> {
  // OCR output is arbitrary text: `&`, `<` and `>` must be escaped before
  // being placed in element content. `&` goes first so the entities
  // produced by the later replacements are not escaped a second time.
  const escapeXmlText = (value: string): string =>
    value.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');

  try {
    this.recordOperation();

    const text = await this.recognizePage(pageIndex);

    await fs.mkdir(dirname(filePath), { recursive: true });

    let content: string;
    switch (format) {
      case 'json':
        content = JSON.stringify(
          { pageIndex, text, timestamp: Date.now() },
          null,
          2
        );
        break;
      case 'xml': {
        const lineElements = text
          .split('\n')
          .map(line => ` <line>${escapeXmlText(line)}</line>`)
          .join('\n');
        content = `<?xml version="1.0"?>\n<page index="${pageIndex}">\n${lineElements}\n</page>`;
        break;
      }
      default:
        content = text;
    }

    await fs.writeFile(filePath, content, 'utf8');

    this.emit('text-exported', {
      pageIndex,
      filePath,
      format,
      // Length in UTF-16 code units, not bytes on disk.
      size: content.length,
      timestamp: Date.now(),
    });

    return true;
  } catch (error) {
    this.recordError(error instanceof Error ? error : new Error(String(error)));
    throw error;
  }
}
|
|
404
|
+
|
|
405
|
+
// ==========================================================================
|
|
406
|
+
// Statistics & Batch (from typed version)
|
|
407
|
+
// ==========================================================================
|
|
408
|
+
|
|
409
|
+
/**
 * Collects text, confidence and region count for one page.
 *
 * The three lookups run one after another: `recognizePage`,
 * `getOcrConfidence`, then `detectTextRegions`.
 *
 * @param pageIndex Page to summarize.
 * @returns The page index, recognized text, confidence and number of regions.
 * @throws Rethrows any error from the underlying lookups, after recording it.
 */
async getOcrStatistics(pageIndex: number): Promise<OcrResult> {
  try {
    this.recordOperation();

    const recognizedText = await this.recognizePage(pageIndex);
    const pageConfidence = await this.getOcrConfidence(pageIndex);
    const detectedRegions = await this.detectTextRegions(pageIndex);

    const stats: OcrResult = {
      pageIndex,
      text: recognizedText,
      confidence: pageConfidence,
      regionCount: detectedRegions.length,
    };
    return stats;
  } catch (error) {
    const failure = error instanceof Error ? error : new Error(String(error));
    this.recordError(failure);
    throw error;
  }
}
|
|
431
|
+
|
|
432
|
+
/**
 * Recognizes every page from `startPage` to `endPage` (both inclusive), one
 * at a time.
 *
 * Emits `batch-recognized` with the range, the number of pages recognized
 * and the total character count. An inverted range (`startPage > endPage`)
 * recognizes nothing and reports a page count of 0.
 *
 * @param startPage First page to recognize.
 * @param endPage Last page to recognize.
 * @returns Map from page index to recognized text.
 * @throws Rethrows the first recognition error, after recording it; pages
 *   recognized before the failure are not returned.
 */
async batchRecognizePages(
  startPage: number,
  endPage: number
): Promise<Map<number, string>> {
  try {
    this.recordOperation();

    const results = new Map<number, string>();
    let totalCharacters = 0;

    for (let i = startPage; i <= endPage; i++) {
      const text = await this.recognizePage(i);
      results.set(i, text);
      totalCharacters += text.length;
    }

    this.emit('batch-recognized', {
      startPage,
      endPage,
      // Count what was actually recognized; `endPage - startPage + 1` goes
      // negative for an inverted range and fractional for non-integer bounds.
      pageCount: results.size,
      totalCharacters,
      timestamp: Date.now(),
    });

    return results;
  } catch (error) {
    this.recordError(error instanceof Error ? error : new Error(String(error)));
    throw error;
  }
}
|
|
463
|
+
|
|
464
|
+
/**
 * Runs OCR over a page range and returns aggregate statistics.
 *
 * For each page from `startPage` to `endPage` (inclusive) this optionally
 * skips pages that do not need OCR, then recognizes the page and gathers
 * its confidence and region count. A page that fails is left out of the
 * aggregates and the loop continues with the next page.
 *
 * Emits `page-range-extracted` and stores the result under the cache key
 * `ocr-batch:<startPage>-<endPage>`.
 *
 * @param startPage First page of the range.
 * @param endPage Last page of the range.
 * @param skipNonScanned When true (default), pages for which `pageNeedsOcr`
 *   returns false are counted as skipped instead of recognized.
 * @returns Totals for the range. `averageConfidence` is the mean over pages
 *   that were recognized successfully, or 0 when there were none.
 * @throws Error when the engine has not been initialized (recorded first).
 */
async extractPageRange(
  startPage: number,
  endPage: number,
  skipNonScanned: boolean = true
): Promise<OcrBatchResult> {
  try {
    this.recordOperation();

    if (!this.ocrEngine) {
      throw new Error('OCR engine not initialized');
    }

    let totalSpans = 0;
    let confidenceSum = 0;
    let skippedPages = 0;
    let recognizedPages = 0;

    for (let pageIdx = startPage; pageIdx <= endPage; pageIdx++) {
      try {
        if (skipNonScanned && !(await this.pageNeedsOcr(pageIdx))) {
          skippedPages++;
          continue;
        }

        const text = await this.recognizePage(pageIdx);
        const confidence = await this.getOcrConfidence(pageIdx);
        const regions = await this.detectTextRegions(pageIdx);

        // A page with text but no reported regions still counts as one span.
        totalSpans += Math.max(regions.length, text ? 1 : 0);
        confidenceSum += confidence;
        recognizedPages++;
      } catch {
        // Best effort: the failing helper has already passed the error to
        // recordError, so the page is simply left out of the aggregates.
        continue;
      }
    }

    // Divide by the pages that actually contributed a confidence value.
    // Dividing by every non-skipped page would let failed pages deflate the
    // mean.
    const avgConfidence = recognizedPages > 0 ? confidenceSum / recognizedPages : 0;

    // Clamp so an inverted range reports 0 pages instead of a negative count.
    const totalPages = Math.max(0, endPage - startPage + 1);

    const result: OcrBatchResult = {
      startPage,
      endPage,
      totalPages,
      totalSpans,
      averageConfidence: avgConfidence,
      skippedPages,
    };

    this.emit('page-range-extracted', {
      ...result,
      timestamp: Date.now(),
    });

    this.setCached(`ocr-batch:${startPage}-${endPage}`, result);

    return result;
  } catch (error) {
    this.recordError(error instanceof Error ? error : new Error(String(error)));
    throw error;
  }
}
|
|
528
|
+
|
|
529
|
+
// ==========================================================================
|
|
530
|
+
// Engine Status & Configuration (from typed version)
|
|
531
|
+
// ==========================================================================
|
|
532
|
+
|
|
533
|
+
/**
 * Reports the status of the OCR engine.
 *
 * @returns `'not_initialized'` when no engine exists; otherwise the status
 *   string reported by the document, or `'unknown'` when it reports nothing.
 * @throws Rethrows any error raised by the document, after recording it.
 */
async getEngineStatus(): Promise<string> {
  try {
    this.recordOperation();

    const engine = this.ocrEngine;
    if (!engine) {
      return 'not_initialized';
    }

    const status = await (this.document as any)?.getEngineStatus(engine);
    return status || 'unknown';
  } catch (error) {
    const failure = error instanceof Error ? error : new Error(String(error));
    this.recordError(failure);
    throw error;
  }
}
|
|
550
|
+
|
|
551
|
+
/**
 * Returns a snapshot of the manager's locally tracked OCR settings.
 *
 * @returns The tracked language, the preprocessing type last passed to
 *   `preprocessPage` (initially `'auto'`), and whether an engine is held.
 */
getConfiguration(): {
  language: OcrLanguage;
  preprocessingType: string;
  engineInitialized: boolean;
} {
  const language = this.currentLanguage;
  const preprocessingType = this.preprocessingType;
  const engineInitialized = Boolean(this.ocrEngine);

  return { language, preprocessingType, engineInitialized };
}
|
|
565
|
+
|
|
566
|
+
// ==========================================================================
|
|
567
|
+
// Methods from root-level OCRManager
|
|
568
|
+
// ==========================================================================
|
|
569
|
+
|
|
570
|
+
/**
 * Returns the text of a page via the document's own `extractText`
 * (legacy root-level API).
 *
 * This does not use the OCR engine; use `recognizePage` for OCR. Results
 * are cached per page and tracked language. Emits `textExtracted` with the
 * page index and text length.
 *
 * @param pageIndex Page to read.
 * @param config Accepted for API compatibility; not read by this method.
 * @returns The extracted text, or an empty string when the document has no
 *   `extractText` or returns nothing.
 */
async extractText(pageIndex: number, config?: OcrConfig): Promise<string> {
  const cacheKey = `ocr:text:${pageIndex}:${this.currentLanguage}`;
  const cached = this.getCached<string>(cacheKey);
  if (cached !== undefined) {
    return cached;
  }

  let result = '';
  const doc = this.document as any;
  if (doc?.extractText) {
    // Awaiting leaves a plain string unchanged. If the binding returns a
    // Promise, this stops the Promise itself from being cached and stops
    // the event below from reporting an undefined length.
    result = (await doc.extractText(pageIndex)) || '';
  }
  this.setCached(cacheKey, result);
  this.emit('textExtracted', pageIndex, result.length);
  return result;
}
|
|
589
|
+
|
|
590
|
+
/**
 * Produces a text-based analysis of one page (legacy root-level API).
 *
 * Reads the page's text through the document's `extractText` and derives:
 * - `needsOcr`: true when the trimmed text has fewer than 10 characters;
 * - `confidence`: 0.0 when OCR is needed, otherwise 0.95;
 * - `spanCount`: the number of whitespace-separated words (0 for blank text).
 *
 * Results are cached per page and tracked language. Emits `pageAnalyzed`.
 *
 * NOTE(review): when the document has no `extractText`, the result reports
 * `needsOcr: false` with confidence 0.95 and empty text — confirm intended.
 *
 * @param pageIndex Page to analyze.
 * @param config Accepted for API compatibility; not read by this method.
 * @returns The analysis for the page.
 */
async analyzePage(pageIndex: number, config?: OcrConfig): Promise<OcrPageAnalysis> {
  const cacheKey = `ocr:analysis:${pageIndex}:${this.currentLanguage}`;
  const cached = this.getCached<OcrPageAnalysis>(cacheKey);
  if (cached !== undefined) {
    return cached;
  }

  let text = '';
  let needsOcr = false;

  const doc = this.document as any;
  if (doc?.extractText) {
    // Awaiting leaves a plain string unchanged and unwraps a Promise.
    text = (await doc.extractText(pageIndex)) || '';
    needsOcr = text.trim().length < 10;
  }

  // `''.split(' ')` yields `['']`, so the old `split(' ').length || 0`
  // reported one span for an empty page. Count real words instead.
  const trimmed = text.trim();
  const spanCount = trimmed ? trimmed.split(/\s+/).length : 0;

  const result: OcrPageAnalysis = {
    pageIndex,
    needsOcr,
    confidence: needsOcr ? 0.0 : 0.95,
    spanCount,
    text,
  };
  this.setCached(cacheKey, result);
  this.emit('pageAnalyzed', pageIndex, result);
  return result;
}
|
|
620
|
+
|
|
621
|
+
/**
 * Runs `analyzePage` over every page of the document (legacy root-level API).
 *
 * Pages are processed in order, one at a time. After each page
 * `pageProcessed` is emitted with the 1-based progress and the page count;
 * `documentAnalyzed` is emitted at the end. The full result list is cached
 * per tracked language.
 *
 * @param config Passed through to `analyzePage`.
 * @returns One analysis per page; empty when the document reports no pages.
 */
async analyzeDocument(config?: OcrConfig): Promise<OcrPageAnalysis[]> {
  const cacheKey = `ocr:document:${this.currentLanguage}`;
  const hit = this.getCached<OcrPageAnalysis[]>(cacheKey);
  if (hit !== undefined) {
    return hit;
  }

  const pageCount = (this.document as any)?.pageCount || 0;
  const analyses: OcrPageAnalysis[] = [];

  let pageIdx = 0;
  while (pageIdx < pageCount) {
    analyses.push(await this.analyzePage(pageIdx, config));
    pageIdx += 1;
    this.emit('pageProcessed', pageIdx, pageCount);
  }

  this.setCached(cacheKey, analyses);
  this.emit('documentAnalyzed', analyses.length);
  return analyses;
}
|
|
645
|
+
|
|
646
|
+
/**
 * Returns the text spans of a page from the native addon
 * (legacy root-level API).
 *
 * Calls the addon's `extract_spans` and JSON-parses each returned entry.
 * Results are cached per page and tracked language. Emits `spansExtracted`
 * with the page index and span count.
 *
 * A native or parse failure is recorded and yields an empty array. The
 * empty array is not cached in that case, so a later call can retry.
 *
 * @param pageIndex Page to read.
 * @param config Accepted for API compatibility; not read by this method.
 * @returns The parsed spans; empty when the addon is unavailable, returns
 *   nothing, or fails.
 */
async extractSpans(pageIndex: number, config?: OcrConfig): Promise<OcrSpan[]> {
  const cacheKey = `ocr:spans:${pageIndex}:${this.currentLanguage}`;
  const cached = this.getCached<OcrSpan[]>(cacheKey);
  if (cached !== undefined) {
    return cached;
  }

  let spans: OcrSpan[] = [];
  let failed = false;
  if (this.native?.extract_spans) {
    try {
      const spansJson = this.native.extract_spans(pageIndex) ?? [];
      // NOTE(review): parsed entries are trusted to match OcrSpan; no
      // shape validation is performed here.
      spans = spansJson.length > 0 ? spansJson.map((json: string) => JSON.parse(json)) : [];
    } catch (error) {
      // Keep the best-effort contract (no throw), but record the failure
      // instead of discarding it silently.
      this.recordError(error instanceof Error ? error : new Error(String(error)));
      failed = true;
      spans = [];
    }
  }

  // Caching a failure would turn a transient error into a permanent empty
  // result for this page.
  if (!failed) {
    this.setCached(cacheKey, spans);
  }
  this.emit('spansExtracted', { page: pageIndex, count: spans.length });
  return spans;
}
|
|
671
|
+
|
|
672
|
+
/**
 * Reports whether the native addon was loaded (legacy root-level API).
 *
 * The answer is cached under `ocr:available`.
 *
 * @returns `true` when the addon loaded in the constructor, otherwise `false`.
 */
async isAvailable(): Promise<boolean> {
  const cacheKey = 'ocr:available';

  const hit = this.getCached<boolean>(cacheKey);
  if (hit !== undefined) {
    return hit;
  }

  const available = Boolean(this.native);
  this.setCached(cacheKey, available);
  return available;
}
|
|
687
|
+
|
|
688
|
+
/**
 * Returns the OCR engine version reported by the native addon
 * (legacy root-level API).
 *
 * The answer is cached under `ocr:version`.
 *
 * @returns The addon's version string, or `'0.0.0'` when the addon is
 *   missing, has no `get_ocr_version`, returns nothing, or throws.
 */
async getVersion(): Promise<string> {
  const cacheKey = 'ocr:version';

  const hit = this.getCached<string>(cacheKey);
  if (hit !== undefined) {
    return hit;
  }

  let version = '0.0.0';
  if (this.native?.get_ocr_version) {
    try {
      const reported = this.native.get_ocr_version();
      version = reported ?? '0.0.0';
    } catch {
      // Best effort: a failing addon call falls back to the placeholder.
      version = '0.0.0';
    }
  }

  this.setCached(cacheKey, version);
  return version;
}
|
|
711
|
+
|
|
712
|
+
// ==========================================================================
|
|
713
|
+
// Cache Operations (from root-level OCRManager)
|
|
714
|
+
// ==========================================================================
|
|
715
|
+
|
|
716
|
+
/**
 * Drops cached results by calling `invalidateCache()` with no argument, then
 * emits `cacheCleared`.
 */
clearCache(): void {
  this.invalidateCache();

  this.emit('cacheCleared');
}
|
|
723
|
+
|
|
724
|
+
/**
 * Describes the current contents of the result cache.
 *
 * @returns An object with `cacheSize` (number of entries) and `entries`
 *   (the cache keys).
 */
getCacheStats(): Record<string, any> {
  const store = this.cache;
  const entries = Array.from(store.keys());

  return { cacheSize: store.size, entries };
}
|
|
733
|
+
|
|
734
|
+
// ==========================================================================
|
|
735
|
+
// Cleanup
|
|
736
|
+
// ==========================================================================
|
|
737
|
+
|
|
738
|
+
/**
 * Tears the manager down: releases the OCR engine, drops cached results,
 * removes all event listeners and marks the manager as uninitialized.
 *
 * The local cleanup runs even when releasing the engine fails, so a failing
 * engine cannot leave listeners and cache entries behind. Errors are logged
 * with `console.error` and never rethrown.
 */
async destroy(): Promise<void> {
  try {
    try {
      await this.destroyOcrEngine();
    } finally {
      // Always release local resources, whatever happened to the engine.
      this.invalidateCache();
      this.removeAllListeners();
      this.initialized = false;
    }
  } catch (error) {
    console.error('Error during OCR manager cleanup:', error);
  }
}
|
|
751
|
+
}
|
|
752
|
+
|
|
753
|
+
/**
 * Legacy value alias for {@link OcrManager}, kept so `new OCRManager(...)`
 * continues to work.
 *
 * @deprecated Use OcrManager instead
 */
export const OCRManager = OcrManager;

/**
 * Legacy type alias matching the value alias above, so `OCRManager` can also
 * be used in type positions (for example `let m: OCRManager`).
 *
 * @deprecated Use OcrManager instead
 */
export type OCRManager = OcrManager;
|
|
755
|
+
|
|
756
|
+
export default OcrManager;
|