pdf-oxide 0.3.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/README.md +218 -0
  2. package/binding.gyp +35 -0
  3. package/package.json +78 -0
  4. package/src/builders/annotation-builder.ts +367 -0
  5. package/src/builders/conversion-options-builder.ts +257 -0
  6. package/src/builders/index.ts +12 -0
  7. package/src/builders/metadata-builder.ts +317 -0
  8. package/src/builders/pdf-builder.ts +386 -0
  9. package/src/builders/search-options-builder.ts +151 -0
  10. package/src/document-editor-manager.ts +318 -0
  11. package/src/errors.ts +1629 -0
  12. package/src/form-field-manager.ts +666 -0
  13. package/src/hybrid-ml-manager.ts +283 -0
  14. package/src/index.ts +453 -0
  15. package/src/managers/accessibility-manager.ts +338 -0
  16. package/src/managers/annotation-manager.ts +439 -0
  17. package/src/managers/barcode-manager.ts +235 -0
  18. package/src/managers/batch-manager.ts +533 -0
  19. package/src/managers/cache-manager.ts +486 -0
  20. package/src/managers/compliance-manager.ts +375 -0
  21. package/src/managers/content-manager.ts +339 -0
  22. package/src/managers/document-utility-manager.ts +922 -0
  23. package/src/managers/dom-pdf-creator.ts +365 -0
  24. package/src/managers/editing-manager.ts +514 -0
  25. package/src/managers/enterprise-manager.ts +478 -0
  26. package/src/managers/extended-managers.ts +437 -0
  27. package/src/managers/extraction-manager.ts +583 -0
  28. package/src/managers/final-utilities.ts +429 -0
  29. package/src/managers/hybrid-ml-advanced.ts +479 -0
  30. package/src/managers/index.ts +239 -0
  31. package/src/managers/layer-manager.ts +500 -0
  32. package/src/managers/metadata-manager.ts +303 -0
  33. package/src/managers/ocr-manager.ts +756 -0
  34. package/src/managers/optimization-manager.ts +262 -0
  35. package/src/managers/outline-manager.ts +196 -0
  36. package/src/managers/page-manager.ts +289 -0
  37. package/src/managers/pattern-detection.ts +440 -0
  38. package/src/managers/rendering-manager.ts +863 -0
  39. package/src/managers/search-manager.ts +385 -0
  40. package/src/managers/security-manager.ts +345 -0
  41. package/src/managers/signature-manager.ts +1664 -0
  42. package/src/managers/streams.ts +618 -0
  43. package/src/managers/xfa-manager.ts +500 -0
  44. package/src/pdf-creator-manager.ts +494 -0
  45. package/src/properties.ts +522 -0
  46. package/src/result-accessors-manager.ts +867 -0
  47. package/src/tests/advanced-features.test.ts +414 -0
  48. package/src/tests/advanced.test.ts +266 -0
  49. package/src/tests/extended-managers.test.ts +316 -0
  50. package/src/tests/final-utilities.test.ts +455 -0
  51. package/src/tests/foundation.test.ts +315 -0
  52. package/src/tests/high-demand.test.ts +257 -0
  53. package/src/tests/specialized.test.ts +97 -0
  54. package/src/thumbnail-manager.ts +272 -0
  55. package/src/types/common.ts +142 -0
  56. package/src/types/document-types.ts +457 -0
  57. package/src/types/index.ts +6 -0
  58. package/src/types/manager-types.ts +284 -0
  59. package/src/types/native-bindings.ts +517 -0
  60. package/src/workers/index.ts +7 -0
  61. package/src/workers/pool.ts +274 -0
  62. package/src/workers/worker.ts +131 -0
@@ -0,0 +1,756 @@
1
+ /**
2
+ * OcrManager - Canonical OCR Manager (merged from 3 implementations)
3
+ *
4
+ * Consolidates:
5
+ * - src/ocr-manager.ts (simple API with setLanguage, extractText, analyzePage)
6
+ * - src/managers/ocr-compliance-cache.ts OCRManager (engine lifecycle)
7
+ * - src/managers/ocr-manager-typed.ts OCRManager (full TypeScript, FFI-wired)
8
+ *
9
+ * Provides optical character recognition operations with complete type safety,
10
+ * proper error handling, and full FFI integration.
11
+ */
12
+
13
+ import {
14
+ BaseManager,
15
+ OcrLanguage,
16
+ OcrResult,
17
+ OcrBatchResult,
18
+ TextRegion,
19
+ PdfDocumentHandle,
20
+ ManagerOptions,
21
+ } from '../types/manager-types.js';
22
+ import { promises as fs } from 'fs';
23
+ import { dirname } from 'path';
24
+
25
+ // Re-export types for convenience
26
+ export { OcrLanguage };
27
+ export type { OcrResult, OcrBatchResult, TextRegion };
28
+
29
/**
 * Detection presets a caller can request through `OcrConfig.detectionMode`
 * to trade recognition accuracy against speed.
 *
 * The members are string-valued, so the values are stable when serialized.
 */
export enum OcrDetectionMode {
  /** Serialized as `'accurate'`. */
  Accurate = 'accurate',
  /** Serialized as `'fast'`. */
  Fast = 'fast',
  /** Serialized as `'balanced'`. */
  Balanced = 'balanced',
}
37
+
38
/**
 * Optional settings accepted by the OCR convenience methods
 * (`extractText`, `analyzePage`, `analyzeDocument`, `extractSpans`).
 *
 * Every field is optional.
 * NOTE(review): the numeric/GPU fields share their names with the
 * parameters of `OcrManager.initializeEngine`; presumably they carry the
 * same meaning — confirm before relying on it.
 */
export interface OcrConfig {
  /** Language to recognize. */
  language?: OcrLanguage;

  /** Accuracy/speed preset. */
  detectionMode?: OcrDetectionMode;

  /** Threshold applied to text detection. */
  detectionThreshold?: number;

  /** Threshold applied to text recognition. */
  recognitionThreshold?: number;

  /** Upper bound on the image side length (presumably in pixels — TODO confirm). */
  maxSideLen?: number;

  /** Whether a GPU should be used. */
  useGpu?: boolean;

  /** Which GPU device to use when `useGpu` is set. */
  gpuDeviceId?: number;
}
50
+
51
/**
 * One recognized run of text together with its bounding box and a
 * confidence value.
 *
 * NOTE(review): the coordinate system and units of the box are not defined
 * in this file — confirm against the native `extract_spans` output.
 */
export interface OcrSpan {
  /** The recognized text. */
  text: string;

  /** Confidence reported for this span. */
  confidence: number;

  /** Horizontal position of the bounding box. */
  x: number;

  /** Vertical position of the bounding box. */
  y: number;

  /** Bounding-box width. */
  width: number;

  /** Bounding-box height. */
  height: number;

  /** Character count for the span (presumably of `text` — TODO confirm). */
  charCount: number;
}
63
+
64
/**
 * Per-page summary produced by `OcrManager.analyzePage` and collected by
 * `OcrManager.analyzeDocument`.
 */
export interface OcrPageAnalysis {
  /** Index of the analyzed page, as passed to `analyzePage`. */
  pageIndex: number;

  /** Whether the page looks like it needs OCR. */
  needsOcr: boolean;

  /** Confidence value attached to the analysis. */
  confidence: number;

  /** Number of spans counted on the page. */
  spanCount: number;

  /** Text obtained for the page. */
  text: string;
}
74
+
75
/**
 * The subset of the optional native addon that this manager calls.
 * Both functions are optional because the code probes for them before use.
 */
interface NativeOcrBindings {
  /** Returns one JSON-encoded span per array element. */
  extract_spans?: (pageIndex: number) => string[] | null | undefined;
  /** Returns the OCR engine version string. */
  get_ocr_version?: () => string | null | undefined;
}

/** Normalizes any thrown value into an `Error` so it can be recorded. */
function toError(value: unknown): Error {
  return value instanceof Error ? value : new Error(String(value));
}

/** Escapes the characters that would otherwise be read as XML markup. */
function escapeXml(text: string): string {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}

/**
 * Canonical OcrManager - OCR operations on top of a PDF document handle.
 *
 * Two groups of methods live here:
 * - Engine-backed methods (`initializeEngine`, `recognizePage`,
 *   `detectTextRegions`, `extractPageRange`, ...) which forward to methods
 *   on the document handle and record operations/errors on the way.
 * - Legacy convenience methods (`setLanguage`, `extractText`, `analyzePage`,
 *   `analyzeDocument`, `extractSpans`, `isAvailable`, `getVersion`) which
 *   use the manager cache and, for spans/version, the optional native addon.
 *
 * Events are emitted for the main state changes and results; see each
 * method for the event name and payload.
 */
export class OcrManager extends BaseManager<PdfDocumentHandle> {
  /** Opaque engine handle returned by the document; falsy until initialized. */
  private ocrEngine: unknown = null;
  private currentLanguage: OcrLanguage = OcrLanguage.ENGLISH;
  private preprocessingType = 'auto';
  /** Native addon, or `null` when it could not be loaded. */
  private native: NativeOcrBindings | null;

  /**
   * @param document - Document handle all OCR calls are forwarded to.
   * @param options - Passed through to `BaseManager` unchanged.
   */
  constructor(document: PdfDocumentHandle, options?: ManagerOptions) {
    super(document, options);
    try {
      // Best effort: a missing addon only disables extractSpans/getVersion.
      // NOTE(review): if this file is emitted as an ES module, `require` is
      // undefined and this always falls through to `null` — confirm the
      // module format of the build.
      this.native = require('../../index.node');
    } catch {
      this.native = null;
    }
  }

  /**
   * Runs `operation` with the bookkeeping every engine-backed method shares:
   * the call is counted via `recordOperation()`, and any failure is passed
   * to `recordError()` before being rethrown unchanged.
   */
  private async runRecorded<T>(operation: () => Promise<T>): Promise<T> {
    try {
      this.recordOperation();
      return await operation();
    } catch (error) {
      this.recordError(toError(error));
      throw error;
    }
  }

  // ==========================================================================
  // Engine Lifecycle
  // ==========================================================================

  /**
   * Creates the OCR engine through the document handle, unless one already
   * exists. Emits `ocr-engine-initialized` when a new engine was created.
   *
   * @returns `true` when an engine is available after the call, `false` when
   *   the document returned no engine.
   * @throws Whatever the document's `createOcrEngine` throws.
   */
  async initializeEngine(
    detectionThreshold: number = 0.5,
    recognitionThreshold: number = 0.5,
    maxSideLen: number = 960,
    useGpu: boolean = false,
    gpuDeviceId: number = 0
  ): Promise<boolean> {
    return this.runRecorded(async () => {
      if (this.ocrEngine) {
        return true;
      }

      const doc = this.document as any;
      this.ocrEngine = await doc?.createOcrEngine(
        detectionThreshold,
        recognitionThreshold,
        maxSideLen,
        useGpu,
        gpuDeviceId
      );

      if (!this.ocrEngine) {
        return false;
      }

      this.emit('ocr-engine-initialized', {
        useGpu,
        gpuDeviceId,
        detectionThreshold,
        recognitionThreshold,
      });
      return true;
    });
  }

  /**
   * Destroys the OCR engine through the document handle and clears the local
   * handle. Does nothing when no engine exists. Emits `ocr-engine-destroyed`.
   *
   * @throws Whatever the document's `destroyOcrEngine` throws; the local
   *   handle is kept in that case.
   */
  async destroyOcrEngine(): Promise<void> {
    return this.runRecorded(async () => {
      if (!this.ocrEngine) {
        return;
      }

      const doc = this.document as any;
      await doc?.destroyOcrEngine(this.ocrEngine);
      this.ocrEngine = null;
      this.emit('ocr-engine-destroyed', { timestamp: Date.now() });
    });
  }

  // ==========================================================================
  // Core Recognition
  // ==========================================================================

  /**
   * Asks the document whether a page needs OCR.
   *
   * @returns The document's answer, or `false` when it returns nothing.
   */
  async pageNeedsOcr(pageIndex: number): Promise<boolean> {
    return this.runRecorded(async () => {
      const doc = this.document as any;
      return (await doc?.pageNeedsOcr(pageIndex)) || false;
    });
  }

  /**
   * Recognizes the text of a page with the initialized engine and emits
   * `page-recognized` with the text length.
   *
   * @returns The recognized text, or `''` when the document returns nothing.
   * @throws Error when the engine has not been initialized.
   */
  async recognizePage(pageIndex: number): Promise<string> {
    return this.runRecorded(async () => {
      if (!this.ocrEngine) {
        throw new Error('OCR engine not initialized. Call initializeEngine() first.');
      }

      const doc = this.document as any;
      const text = await doc?.recognizePage(pageIndex, this.ocrEngine);

      this.emit('page-recognized', {
        pageIndex,
        textLength: text?.length || 0,
        timestamp: Date.now(),
      });

      return text || '';
    });
  }

  /**
   * Gets the OCR confidence the document reports for a page.
   *
   * @returns The reported confidence, or `0` when no engine is initialized
   *   or the document returns nothing.
   */
  async getOcrConfidence(pageIndex: number): Promise<number> {
    return this.runRecorded(async () => {
      if (!this.ocrEngine) {
        return 0;
      }

      const doc = this.document as any;
      return (await doc?.getOcrConfidence(pageIndex)) || 0;
    });
  }

  /**
   * Detects text regions on a page with the initialized engine.
   *
   * @returns The regions from the document, or `[]` when no engine is
   *   initialized or the document returns nothing.
   */
  async detectTextRegions(pageIndex: number): Promise<TextRegion[]> {
    return this.runRecorded(async () => {
      if (!this.ocrEngine) {
        return [];
      }

      const doc = this.document as any;
      const regions = await doc?.detectTextRegions(pageIndex, this.ocrEngine);
      return regions || [];
    });
  }

  // ==========================================================================
  // Language Configuration
  // ==========================================================================

  /**
   * Sets the recognition language on the engine. When the document reports
   * success, the manager's current language is updated and
   * `language-changed` is emitted.
   *
   * @returns `true` when the document reported success.
   * @throws Error when the engine has not been initialized.
   */
  async setOcrLanguage(language: OcrLanguage | string): Promise<boolean> {
    return this.runRecorded(async () => {
      if (!this.ocrEngine) {
        throw new Error('OCR engine not initialized');
      }

      const doc = this.document as any;
      const result = await doc?.setOcrLanguage(this.ocrEngine, language);

      if (result) {
        this.currentLanguage = (language as OcrLanguage) || OcrLanguage.ENGLISH;
        this.emit('language-changed', {
          language,
          timestamp: Date.now(),
        });
      }

      return !!result;
    });
  }

  /**
   * Sets the language used by the legacy convenience methods.
   *
   * Unlike `setOcrLanguage`, this only updates local state: it does not call
   * the engine. It invalidates the `'ocr'` cache entries and emits
   * `languageChanged`.
   */
  setLanguage(language: OcrLanguage): void {
    this.currentLanguage = language;
    this.invalidateCache('ocr');
    this.emit('languageChanged', language);
  }

  /**
   * Gets the language last stored by `setLanguage` or a successful
   * `setOcrLanguage` (initially `OcrLanguage.ENGLISH`).
   */
  getLanguage(): OcrLanguage {
    return this.currentLanguage;
  }

  /**
   * Gets the languages the document reports as available.
   *
   * @returns The document's list, or every `OcrLanguage` value when the
   *   document returns nothing.
   */
  async getAvailableLanguages(): Promise<OcrLanguage[]> {
    return this.runRecorded(async () => {
      const doc = this.document as any;
      const languages =
        (await doc?.getAvailableLanguages()) || Object.values(OcrLanguage);
      return languages;
    });
  }

  // ==========================================================================
  // Processing & Export
  // ==========================================================================

  /**
   * Asks the document to preprocess a page before OCR, remembers the
   * preprocessing type and emits `page-preprocessed`.
   *
   * NOTE(review): the type is stored and the event is emitted even when the
   * document reports failure; only the return value reflects the outcome.
   *
   * @returns `true` when the document returned a truthy result.
   */
  async preprocessPage(
    pageIndex: number,
    preprocessingType: string = 'auto'
  ): Promise<boolean> {
    return this.runRecorded(async () => {
      const doc = this.document as any;
      const result = await doc?.preprocessPage(pageIndex, preprocessingType);

      this.preprocessingType = preprocessingType;
      this.emit('page-preprocessed', {
        pageIndex,
        type: preprocessingType,
        timestamp: Date.now(),
      });

      return !!result;
    });
  }

  /**
   * Recognizes a page and writes the text to `filePath`, creating the parent
   * directory when needed. Emits `text-exported`.
   *
   * @param format - `'txt'` writes the raw text, `'json'` an object with
   *   `pageIndex`, `text` and `timestamp`, `'xml'` one `<line>` element per
   *   text line.
   * @returns `true` once the file has been written.
   * @throws Errors from `recognizePage` or from the file system.
   */
  async exportOcrText(
    pageIndex: number,
    filePath: string,
    format: 'txt' | 'json' | 'xml' = 'txt'
  ): Promise<boolean> {
    return this.runRecorded(async () => {
      const text = await this.recognizePage(pageIndex);

      await fs.mkdir(dirname(filePath), { recursive: true });

      let content: string;
      switch (format) {
        case 'json':
          content = JSON.stringify(
            { pageIndex, text, timestamp: Date.now() },
            null,
            2
          );
          break;
        case 'xml': {
          // Recognized text is arbitrary; escape it so characters such as
          // '&' and '<' cannot produce a malformed document.
          const lines = text
            .split('\n')
            .map(line => ` <line>${escapeXml(line)}</line>`)
            .join('\n');
          content = `<?xml version="1.0"?>\n<page index="${pageIndex}">\n${lines}\n</page>`;
          break;
        }
        default:
          content = text;
      }

      await fs.writeFile(filePath, content, 'utf8');

      this.emit('text-exported', {
        pageIndex,
        filePath,
        format,
        size: content.length,
        timestamp: Date.now(),
      });

      return true;
    });
  }

  // ==========================================================================
  // Statistics & Batch
  // ==========================================================================

  /**
   * Collects text, confidence and region count for one page.
   *
   * @throws Errors from `recognizePage`, `getOcrConfidence` or
   *   `detectTextRegions`.
   */
  async getOcrStatistics(pageIndex: number): Promise<OcrResult> {
    return this.runRecorded(async () => {
      const text = await this.recognizePage(pageIndex);
      const confidence = await this.getOcrConfidence(pageIndex);
      const regions = await this.detectTextRegions(pageIndex);

      return {
        pageIndex,
        text,
        confidence,
        regionCount: regions.length,
      };
    });
  }

  /**
   * Recognizes every page from `startPage` to `endPage` (both inclusive),
   * one after another, and emits `batch-recognized`.
   *
   * @returns Map from page index to recognized text; empty when
   *   `endPage < startPage`.
   * @throws The first error raised by `recognizePage`; pages after it are
   *   not processed.
   */
  async batchRecognizePages(
    startPage: number,
    endPage: number
  ): Promise<Map<number, string>> {
    return this.runRecorded(async () => {
      const results = new Map<number, string>();

      for (let page = startPage; page <= endPage; page++) {
        results.set(page, await this.recognizePage(page));
      }

      let totalCharacters = 0;
      for (const text of results.values()) {
        totalCharacters += text.length;
      }

      this.emit('batch-recognized', {
        startPage,
        endPage,
        // One entry per page in the range; stays 0 (not negative) when the
        // range is inverted.
        pageCount: results.size,
        totalCharacters,
        timestamp: Date.now(),
      });

      return results;
    });
  }

  /**
   * Runs OCR over `startPage`..`endPage` (both inclusive) and returns
   * aggregated statistics. Emits `page-range-extracted` and stores the
   * result in the cache under `ocr-batch:<start>-<end>`.
   *
   * Per-page failures are tolerated: the page is left out of the totals and
   * of the confidence average. The per-page methods have already passed the
   * error to `recordError()` by then.
   *
   * @param skipNonScanned - When `true`, pages for which `pageNeedsOcr`
   *   returns `false` are counted in `skippedPages` and not recognized.
   * @throws Error when the engine has not been initialized.
   */
  async extractPageRange(
    startPage: number,
    endPage: number,
    skipNonScanned: boolean = true
  ): Promise<OcrBatchResult> {
    return this.runRecorded(async () => {
      if (!this.ocrEngine) {
        throw new Error('OCR engine not initialized');
      }

      let totalSpans = 0;
      let confidenceSum = 0;
      let skippedPages = 0;
      let processedPages = 0;
      let failedPages = 0;

      for (let pageIdx = startPage; pageIdx <= endPage; pageIdx++) {
        try {
          if (skipNonScanned && !(await this.pageNeedsOcr(pageIdx))) {
            skippedPages++;
            continue;
          }

          const text = await this.recognizePage(pageIdx);
          const confidence = await this.getOcrConfidence(pageIdx);
          const regions = await this.detectTextRegions(pageIdx);

          // A page with text but no detected regions still counts as one span.
          totalSpans += Math.max(regions.length, text ? 1 : 0);
          confidenceSum += confidence;
          processedPages++;
        } catch {
          failedPages++;
        }
      }

      // Average only over pages that were actually recognized; failed pages
      // contributed nothing to confidenceSum and must not dilute the mean.
      const averageConfidence =
        processedPages > 0 ? confidenceSum / processedPages : 0;

      const result: OcrBatchResult = {
        startPage,
        endPage,
        totalPages: Math.max(0, endPage - startPage + 1),
        totalSpans,
        averageConfidence,
        skippedPages,
      };

      this.emit('page-range-extracted', {
        ...result,
        failedPages,
        timestamp: Date.now(),
      });

      this.setCached(`ocr-batch:${startPage}-${endPage}`, result);

      return result;
    });
  }

  // ==========================================================================
  // Engine Status & Configuration
  // ==========================================================================

  /**
   * Gets the engine status string.
   *
   * @returns `'not_initialized'` when no engine exists, otherwise the
   *   document's status or `'unknown'` when it returns nothing.
   */
  async getEngineStatus(): Promise<string> {
    return this.runRecorded(async () => {
      if (!this.ocrEngine) {
        return 'not_initialized';
      }

      const doc = this.document as any;
      return (await doc?.getEngineStatus(this.ocrEngine)) || 'unknown';
    });
  }

  /**
   * Gets a snapshot of the manager's local configuration: current language,
   * last preprocessing type and whether an engine handle is held.
   */
  getConfiguration(): {
    language: OcrLanguage;
    preprocessingType: string;
    engineInitialized: boolean;
  } {
    return {
      language: this.currentLanguage,
      preprocessingType: this.preprocessingType,
      engineInitialized: !!this.ocrEngine,
    };
  }

  // ==========================================================================
  // Legacy convenience methods (from root-level OCRManager)
  // ==========================================================================

  /**
   * Gets the text of a page through the document's own `extractText`
   * method. The OCR engine is not involved. Results are cached per page and
   * current language. Emits `textExtracted`.
   *
   * @param config - Accepted for API compatibility; currently unused.
   * @returns The extracted text, or `''` when the document has no
   *   `extractText` method or returns nothing.
   */
  async extractText(pageIndex: number, config?: OcrConfig): Promise<string> {
    const cacheKey = `ocr:text:${pageIndex}:${this.currentLanguage}`;
    const cached = this.getCached<string>(cacheKey);
    if (cached !== undefined) {
      return cached;
    }

    const doc = this.document as any;
    let result = '';
    if (doc?.extractText) {
      // Awaited so a Promise-returning binding yields its text rather than
      // the Promise object; a plain string passes through unchanged.
      result = (await doc.extractText(pageIndex)) || '';
    }
    this.setCached(cacheKey, result);
    this.emit('textExtracted', pageIndex, result.length);
    return result;
  }

  /**
   * Analyzes one page from its extractable text. Results are cached per
   * page and current language. Emits `pageAnalyzed`.
   *
   * Heuristics used:
   * - `needsOcr` is `true` when the trimmed text has fewer than 10
   *   characters; it stays `false` when the document has no `extractText`.
   * - `confidence` is `0.0` when `needsOcr`, otherwise `0.95`.
   * - `spanCount` is the number of whitespace-separated tokens (`0` for
   *   empty text).
   *
   * @param config - Accepted for API compatibility; currently unused.
   */
  async analyzePage(pageIndex: number, config?: OcrConfig): Promise<OcrPageAnalysis> {
    const cacheKey = `ocr:analysis:${pageIndex}:${this.currentLanguage}`;
    const cached = this.getCached<OcrPageAnalysis>(cacheKey);
    if (cached !== undefined) {
      return cached;
    }

    const doc = this.document as any;
    let text = '';
    let needsOcr = false;

    if (doc?.extractText) {
      text = (await doc.extractText(pageIndex)) || '';
      needsOcr = !text || text.trim().length < 10;
    }

    const result: OcrPageAnalysis = {
      pageIndex,
      needsOcr,
      confidence: needsOcr ? 0.0 : 0.95,
      // ''.split(' ') yields [''], so count non-empty tokens instead.
      spanCount: text.split(/\s+/).filter(Boolean).length,
      text,
    };
    this.setCached(cacheKey, result);
    this.emit('pageAnalyzed', pageIndex, result);
    return result;
  }

  /**
   * Runs `analyzePage` over every page of the document, in order. Emits
   * `pageProcessed` after each page and `documentAnalyzed` at the end. The
   * list is cached per current language.
   *
   * @param config - Forwarded to `analyzePage`.
   * @returns One analysis per page; empty when the document reports no
   *   `pageCount`.
   */
  async analyzeDocument(config?: OcrConfig): Promise<OcrPageAnalysis[]> {
    const cacheKey = `ocr:document:${this.currentLanguage}`;
    const cached = this.getCached<OcrPageAnalysis[]>(cacheKey);
    if (cached !== undefined) {
      return cached;
    }

    const pageCount = (this.document as any)?.pageCount || 0;
    const results: OcrPageAnalysis[] = [];

    for (let page = 0; page < pageCount; page++) {
      results.push(await this.analyzePage(page, config));
      this.emit('pageProcessed', page + 1, pageCount);
    }

    this.setCached(cacheKey, results);
    this.emit('documentAnalyzed', results.length);
    return results;
  }

  /**
   * Gets the text spans of a page from the native addon's `extract_spans`,
   * which returns one JSON string per span. Results are cached per page and
   * current language. Emits `spansExtracted`.
   *
   * @param config - Accepted for API compatibility; currently unused.
   * @returns The parsed spans, or `[]` when the addon is unavailable, the
   *   call fails, or any entry is not valid JSON.
   */
  async extractSpans(pageIndex: number, config?: OcrConfig): Promise<OcrSpan[]> {
    const cacheKey = `ocr:spans:${pageIndex}:${this.currentLanguage}`;
    const cached = this.getCached<OcrSpan[]>(cacheKey);
    if (cached !== undefined) {
      return cached;
    }

    let spans: OcrSpan[] = [];
    if (this.native?.extract_spans) {
      try {
        const serialized = this.native.extract_spans(pageIndex) ?? [];
        spans = serialized.map((json: string) => JSON.parse(json));
      } catch {
        // Best effort: a native or parse failure yields no spans.
        spans = [];
      }
    }

    this.setCached(cacheKey, spans);
    this.emit('spansExtracted', { page: pageIndex, count: spans.length });
    return spans;
  }

  /**
   * Reports whether the native addon was loaded in the constructor. The
   * answer is cached.
   */
  async isAvailable(): Promise<boolean> {
    const cacheKey = 'ocr:available';
    const cached = this.getCached<boolean>(cacheKey);
    if (cached !== undefined) {
      return cached;
    }

    const available = Boolean(this.native);
    this.setCached(cacheKey, available);
    return available;
  }

  /**
   * Gets the OCR engine version from the native addon's `get_ocr_version`.
   * The answer is cached.
   *
   * @returns The reported version, or `'0.0.0'` when the addon is
   *   unavailable, returns nothing, or throws.
   */
  async getVersion(): Promise<string> {
    const cacheKey = 'ocr:version';
    const cached = this.getCached<string>(cacheKey);
    if (cached !== undefined) {
      return cached;
    }

    let version = '0.0.0';
    if (this.native?.get_ocr_version) {
      try {
        version = this.native.get_ocr_version() ?? '0.0.0';
      } catch {
        version = '0.0.0';
      }
    }

    this.setCached(cacheKey, version);
    return version;
  }

  // ==========================================================================
  // Cache Operations
  // ==========================================================================

  /**
   * Invalidates the whole manager cache and emits `cacheCleared`.
   */
  clearCache(): void {
    this.invalidateCache();
    this.emit('cacheCleared');
  }

  /**
   * Gets the number of cache entries and their keys.
   */
  getCacheStats(): Record<string, any> {
    return {
      cacheSize: this.cache.size,
      entries: Array.from(this.cache.keys()),
    };
  }

  // ==========================================================================
  // Cleanup
  // ==========================================================================

  /**
   * Tears the manager down: destroys the engine, clears the cache, removes
   * all listeners and marks the manager as not initialized.
   *
   * Never rejects; failures are logged with `console.error`. A failing
   * engine teardown no longer prevents the remaining cleanup steps.
   */
  async destroy(): Promise<void> {
    try {
      await this.destroyOcrEngine();
    } catch (error) {
      console.error('Error during OCR manager cleanup:', error);
    }

    try {
      // Drop the handle even if teardown failed; the manager is unusable
      // after destroy() and must not report an initialized engine.
      this.ocrEngine = null;
      this.invalidateCache();
      this.removeAllListeners();
      this.initialized = false;
    } catch (error) {
      console.error('Error during OCR manager cleanup:', error);
    }
  }
}
752
+
753
/**
 * Legacy name for the `OcrManager` class (value export, usable with `new`).
 *
 * @deprecated Use OcrManager instead
 */
export const OCRManager = OcrManager;

/**
 * Legacy name for the `OcrManager` instance type, so existing annotations
 * such as `let m: OCRManager` keep compiling. A `const` alias alone only
 * exports the value, not the type.
 *
 * @deprecated Use OcrManager instead
 */
export type OCRManager = OcrManager;

export default OcrManager;