natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +188 -82
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +132 -16
  19. natural_pdf/core/pdf.py +486 -71
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +238 -111
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.32.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
@@ -32,11 +32,11 @@ class TextElement(Element):
32
32
  obj["object_type"] = "text"
33
33
 
34
34
  super().__init__(obj, page)
35
-
35
+
36
36
  # Memory optimization: Store character indices instead of full dictionaries
37
37
  # This reduces memory usage by ~50% by avoiding character data duplication
38
38
  self._char_indices = obj.pop("_char_indices", [])
39
-
39
+
40
40
  # Backward compatibility: Keep _char_dicts for existing code
41
41
  # But prefer _char_indices when available to save memory
42
42
  self._char_dicts = obj.pop("_char_dicts", [])
@@ -44,20 +44,20 @@ class TextElement(Element):
44
44
  @property
45
45
  def chars(self):
46
46
  """Get constituent character elements efficiently.
47
-
47
+
48
48
  Uses character indices when available to avoid memory duplication,
49
49
  falls back to _char_dicts for backward compatibility.
50
50
  """
51
51
  if self._char_indices:
52
52
  # Memory-efficient approach: access characters by index
53
- if hasattr(self.page, '_element_mgr'):
54
- char_elements = self.page._element_mgr.get_elements('chars')
53
+ if hasattr(self.page, "_element_mgr"):
54
+ char_elements = self.page._element_mgr.get_elements("chars")
55
55
  return [char_elements[i] for i in self._char_indices if i < len(char_elements)]
56
-
56
+
57
57
  # Backward compatibility: convert _char_dicts to TextElement objects
58
58
  if self._char_dicts:
59
59
  return [TextElement(char_dict, self.page) for char_dict in self._char_dicts]
60
-
60
+
61
61
  return []
62
62
 
63
63
  @property
@@ -75,12 +75,12 @@ class TextElement(Element):
75
75
  try:
76
76
  # If using memory-efficient character indices, update the referenced chars
77
77
  if hasattr(self, "_char_indices") and self._char_indices:
78
- if hasattr(self.page, '_element_mgr'):
79
- char_elements = self.page._element_mgr.get_elements('chars')
78
+ if hasattr(self.page, "_element_mgr"):
79
+ char_elements = self.page._element_mgr.get_elements("chars")
80
80
  for idx, char_idx in enumerate(self._char_indices):
81
81
  if char_idx < len(char_elements) and idx < len(value):
82
82
  char_elements[char_idx].text = value[idx]
83
-
83
+
84
84
  # Legacy _char_dicts synchronization for backward compatibility
85
85
  elif hasattr(self, "_char_dicts") and isinstance(self._char_dicts, list):
86
86
  if not self._char_dicts:
@@ -121,6 +121,7 @@ class TextElement(Element):
121
121
  except Exception as sync_err: # pragma: no cover
122
122
  # Keep failures silent but logged; better to have outdated chars than crash.
123
123
  import logging
124
+
124
125
  logger = logging.getLogger(__name__)
125
126
  logger.debug(f"TextElement: Failed to sync char data after text update: {sync_err}")
126
127
 
@@ -379,7 +380,9 @@ class TextElement(Element):
379
380
  @property
380
381
  def underline(self) -> bool:
381
382
  """True if element is underlined."""
382
- return bool(self._obj.get("underline") or self.metadata.get("decoration", {}).get("underline"))
383
+ return bool(
384
+ self._obj.get("underline") or self.metadata.get("decoration", {}).get("underline")
385
+ )
383
386
 
384
387
  # -----------------------------
385
388
  # Highlight decoration
@@ -397,7 +400,9 @@ class TextElement(Element):
397
400
  @property
398
401
  def highlight_color(self):
399
402
  """Return RGB(A) tuple of highlight colour if stored."""
400
- return self._obj.get("highlight_color") or self.metadata.get("decoration", {}).get("highlight_color")
403
+ return self._obj.get("highlight_color") or self.metadata.get("decoration", {}).get(
404
+ "highlight_color"
405
+ )
401
406
 
402
407
  def __repr__(self) -> str:
403
408
  """String representation of the text element."""
@@ -489,6 +494,7 @@ class TextElement(Element):
489
494
 
490
495
  try:
491
496
  from bidi.algorithm import get_display # type: ignore
497
+
492
498
  from natural_pdf.utils.bidi_mirror import mirror_brackets
493
499
 
494
500
  # Convert from logical order to visual order
@@ -1,16 +1,19 @@
1
1
  from .base import FinetuneExporter
2
2
 
3
+
3
4
  # Lazy import for PaddleOCRRecognitionExporter to avoid heavy paddle dependencies at module level
4
5
  def _get_paddleocr_exporter():
5
6
  """Lazy import for PaddleOCRRecognitionExporter."""
6
7
  from .paddleocr import PaddleOCRRecognitionExporter
7
-
8
+
8
9
  return PaddleOCRRecognitionExporter
9
10
 
11
+
10
12
  # Make PaddleOCRRecognitionExporter available through attribute access
11
13
  def __getattr__(name):
12
14
  if name == "PaddleOCRRecognitionExporter":
13
15
  return _get_paddleocr_exporter()
14
16
  raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
15
17
 
18
+
16
19
  __all__ = ["FinetuneExporter", "PaddleOCRRecognitionExporter"]
@@ -2,9 +2,9 @@
2
2
  Module for exporting original PDF pages without modification.
3
3
  """
4
4
 
5
+ import io
5
6
  import logging
6
7
  import os
7
- import io
8
8
  import urllib.request
9
9
  from pathlib import Path
10
10
  from typing import TYPE_CHECKING, List, Set, Union
@@ -103,11 +103,17 @@ def create_original_pdf(
103
103
  source_handle = pikepdf.Pdf.open(first_page_pdf_path)
104
104
  else:
105
105
  # Fallback: attempt to open from in-memory bytes stored on PDF object
106
- if first_page_pdf_obj is not None and hasattr(first_page_pdf_obj, "_original_bytes") and first_page_pdf_obj._original_bytes:
106
+ if (
107
+ first_page_pdf_obj is not None
108
+ and hasattr(first_page_pdf_obj, "_original_bytes")
109
+ and first_page_pdf_obj._original_bytes
110
+ ):
107
111
  source_handle = pikepdf.Pdf.open(io.BytesIO(first_page_pdf_obj._original_bytes))
108
112
  else:
109
113
  # Attempt to download bytes directly if path looks like URL
110
- if isinstance(first_page_pdf_path, str) and first_page_pdf_path.startswith(("http://", "https://")):
114
+ if isinstance(first_page_pdf_path, str) and first_page_pdf_path.startswith(
115
+ ("http://", "https://")
116
+ ):
111
117
  try:
112
118
  with urllib.request.urlopen(first_page_pdf_path) as resp:
113
119
  data = resp.read()
@@ -117,7 +123,9 @@ def create_original_pdf(
117
123
  f"Source PDF bytes not available and download failed for {first_page_pdf_path}: {dl_err}"
118
124
  )
119
125
  else:
120
- raise FileNotFoundError(f"Source PDF bytes not available for {first_page_pdf_path}")
126
+ raise FileNotFoundError(
127
+ f"Source PDF bytes not available for {first_page_pdf_path}"
128
+ )
121
129
 
122
130
  with source_handle as source_pikepdf_doc:
123
131
  target_pikepdf_doc = pikepdf.Pdf.new()
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  from abc import ABC, abstractmethod
3
- from typing import TYPE_CHECKING, Any, Optional, Type, Sequence
3
+ from typing import TYPE_CHECKING, Any, Optional, Sequence, Type
4
4
 
5
5
  from pydantic import BaseModel, Field, create_model
6
6
 
@@ -16,9 +16,54 @@ DEFAULT_STRUCTURED_KEY = "structured" # Define default key
16
16
 
17
17
 
18
18
  class ExtractionMixin(ABC):
19
- """
20
- Mixin class providing structured data extraction capabilities to elements.
21
- Assumes the inheriting class has `extract_text(**kwargs)` and `to_image(**kwargs)` methods.
19
+ """Mixin class providing structured data extraction capabilities to elements.
20
+
21
+ This mixin adds AI-powered structured data extraction functionality to pages,
22
+ regions, and elements, enabling extraction of specific data fields using
23
+ Pydantic schemas and large language models. It supports both text-based and
24
+ vision-based extraction modes.
25
+
26
+ The mixin integrates with the StructuredDataManager to handle LLM interactions
27
+ and provides schema validation using Pydantic models. Extracted data is
28
+ automatically validated against the provided schema and stored with
29
+ confidence metrics and metadata.
30
+
31
+ Extraction modes:
32
+ - Text-based: Uses extracted text content for LLM processing
33
+ - Vision-based: Uses rendered images for multimodal LLM analysis
34
+ - Automatic: Selects best mode based on content and model capabilities
35
+
36
+ Host class requirements:
37
+ - Must implement extract_text(**kwargs) -> str
38
+ - Must implement to_image(**kwargs) -> PIL.Image
39
+ - Must have access to StructuredDataManager (usually via parent PDF)
40
+
41
+ Example:
42
+ ```python
43
+ from pydantic import BaseModel
44
+
45
+ class InvoiceData(BaseModel):
46
+ invoice_number: str
47
+ total_amount: float
48
+ due_date: str
49
+ vendor_name: str
50
+
51
+ pdf = npdf.PDF("invoice.pdf")
52
+ page = pdf.pages[0]
53
+
54
+ # Extract structured data
55
+ invoice = page.extract_structured_data(InvoiceData)
56
+ print(f"Invoice {invoice.data.invoice_number}: ${invoice.data.total_amount}")
57
+
58
+ # Region-specific extraction
59
+ header_region = page.find('text:contains("Invoice")').above()
60
+ header_data = header_region.extract_structured_data(InvoiceData)
61
+ ```
62
+
63
+ Note:
64
+ Structured extraction requires a compatible LLM to be configured in the
65
+ StructuredDataManager. Results include confidence scores and validation
66
+ metadata for quality assessment.
22
67
  """
23
68
 
24
69
  def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
@@ -386,10 +431,13 @@ class ExtractionMixin(ABC):
386
431
  question_map = question_map or {}
387
432
 
388
433
  try:
389
- from natural_pdf.qa.document_qa import get_qa_engine
390
- from natural_pdf.extraction.result import StructuredDataResult
391
- from pydantic import Field as _Field, create_model
392
434
  import re
435
+
436
+ from pydantic import Field as _Field
437
+ from pydantic import create_model
438
+
439
+ from natural_pdf.extraction.result import StructuredDataResult
440
+ from natural_pdf.qa.document_qa import get_qa_engine
393
441
  except ImportError as exc:
394
442
  raise RuntimeError(
395
443
  "Document-QA dependencies missing. Install with `pip install natural-pdf[ai]`."
@@ -424,7 +472,9 @@ class ExtractionMixin(ABC):
424
472
  question = question_map[display_name]
425
473
  else:
426
474
  description = None
427
- if hasattr(field_obj, "field_info") and hasattr(field_obj.field_info, "description"):
475
+ if hasattr(field_obj, "field_info") and hasattr(
476
+ field_obj.field_info, "description"
477
+ ):
428
478
  description = field_obj.field_info.description
429
479
  elif hasattr(field_obj, "description"):
430
480
  description = field_obj.description
@@ -529,7 +579,11 @@ class ExtractionMixin(ABC):
529
579
  pdf_instance = self
530
580
  elif hasattr(self, "pdf") and hasattr(self.pdf, "get_manager"):
531
581
  pdf_instance = self.pdf
532
- elif hasattr(self, "page") and hasattr(self.page, "pdf") and hasattr(self.page.pdf, "get_manager"):
582
+ elif (
583
+ hasattr(self, "page")
584
+ and hasattr(self.page, "pdf")
585
+ and hasattr(self.page.pdf, "get_manager")
586
+ ):
533
587
  pdf_instance = self.page.pdf
534
588
  else:
535
589
  raise RuntimeError("Cannot access PDF manager to perform LLM extraction.")
@@ -542,7 +596,9 @@ class ExtractionMixin(ABC):
542
596
  layout_for_text = kwargs.pop("layout", True)
543
597
  content = self._get_extraction_content(using=using, layout=layout_for_text, **kwargs)
544
598
 
545
- if content is None or (using == "text" and isinstance(content, str) and not content.strip()):
599
+ if content is None or (
600
+ using == "text" and isinstance(content, str) and not content.strip()
601
+ ):
546
602
  result = StructuredDataResult(
547
603
  data=None,
548
604
  success=False,
@@ -1,5 +1,5 @@
1
- from typing import Any, Generic, Optional, TypeVar
2
1
  from collections.abc import Mapping
2
+ from typing import Any, Generic, Optional, TypeVar
3
3
 
4
4
  from pydantic import BaseModel, Field
5
5
 
natural_pdf/flows/flow.py CHANGED
@@ -14,10 +14,69 @@ logger = logging.getLogger(__name__)
14
14
 
15
15
 
16
16
  class Flow:
17
- """
18
- Defines a logical flow or sequence of physical Page or Region objects,
19
- specifying their arrangement and alignment to enable operations that
20
- span across these segments as if they were a continuous area.
17
+ """Defines a logical flow or sequence of physical Page or Region objects.
18
+
19
+ A Flow represents a continuous logical document structure that spans across
20
+ multiple pages or regions, enabling operations on content that flows across
21
+ boundaries. This is essential for handling multi-page tables, articles that
22
+ span columns, or any content that requires reading order across segments.
23
+
24
+ Flows specify arrangement (vertical/horizontal) and alignment rules to create
25
+ a unified coordinate system for element extraction and text processing. They
26
+ enable natural-pdf to treat fragmented content as a single continuous area
27
+ for analysis and extraction operations.
28
+
29
+ The Flow system is particularly useful for:
30
+ - Multi-page tables that break across page boundaries
31
+ - Multi-column articles with complex reading order
32
+ - Forms that span multiple pages
33
+ - Any content requiring logical continuation across segments
34
+
35
+ Attributes:
36
+ segments: List of Page or Region objects in flow order.
37
+ arrangement: Primary flow direction ('vertical' or 'horizontal').
38
+ alignment: Cross-axis alignment for segments of different sizes.
39
+ segment_gap: Virtual gap between segments in PDF points.
40
+
41
+ Example:
42
+ Multi-page table flow:
43
+ ```python
44
+ pdf = npdf.PDF("multi_page_table.pdf")
45
+
46
+ # Create flow for table spanning pages 2-4
47
+ table_flow = Flow(
48
+ segments=[pdf.pages[1], pdf.pages[2], pdf.pages[3]],
49
+ arrangement='vertical',
50
+ alignment='left',
51
+ segment_gap=10.0
52
+ )
53
+
54
+ # Extract table as if it were continuous
55
+ table_data = table_flow.extract_table()
56
+ text_content = table_flow.get_text()
57
+ ```
58
+
59
+ Multi-column article flow:
60
+ ```python
61
+ page = pdf.pages[0]
62
+ left_column = page.region(0, 0, 300, page.height)
63
+ right_column = page.region(320, 0, page.width, page.height)
64
+
65
+ # Create horizontal flow for columns
66
+ article_flow = Flow(
67
+ segments=[left_column, right_column],
68
+ arrangement='horizontal',
69
+ alignment='top'
70
+ )
71
+
72
+ # Read in proper order
73
+ article_text = article_flow.get_text()
74
+ ```
75
+
76
+ Note:
77
+ Flows create virtual coordinate systems that map element positions across
78
+ segments, enabling spatial navigation and element selection to work
79
+ seamlessly across boundaries.
21
80
  """
22
81
 
23
82
  def __init__(
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, Callable
2
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
3
3
 
4
4
  from pdfplumber.utils.geometry import objects_to_bbox # For calculating combined bbox
5
5
 
@@ -133,7 +133,7 @@ class FlowRegion:
133
133
  # This is a simplification; true layout-aware joining would be more complex.
134
134
  joiner = (
135
135
  "\n" if self.flow.arrangement == "vertical" else " "
136
- ) # TODO: Make this smarter, consider segment_gap
136
+ ) # TODO: Consider flow.segment_gap for proportional spacing between segments
137
137
  extracted = joiner.join(t for t in texts if t)
138
138
 
139
139
  if apply_exclusions: # Only cache if standard exclusion behavior
@@ -258,7 +258,7 @@ class FlowRegion:
258
258
  """
259
259
  Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
260
260
  If multiple pages are involved, they are stacked into a single image.
261
-
261
+
262
262
  Args:
263
263
  resolution: Resolution in DPI for page rendering. If None, uses global setting or defaults to 144 DPI.
264
264
  labels: Whether to include a legend for highlights.
@@ -270,7 +270,7 @@ class FlowRegion:
270
270
  stack_gap: Gap in pixels between stacked pages.
271
271
  stack_background_color: RGB background color for the stacked image.
272
272
  **kwargs: Additional arguments passed to the underlying rendering methods.
273
-
273
+
274
274
  Returns:
275
275
  PIL Image of the rendered pages with highlighted regions, or None if rendering fails.
276
276
  """
natural_pdf/ocr/engine.py CHANGED
@@ -12,7 +12,38 @@ logger = logging.getLogger(__name__)
12
12
 
13
13
 
14
14
  class TextRegion:
15
- """Standard representation of an OCR text region."""
15
+ """Standard representation of an OCR text region.
16
+
17
+ TextRegion provides a standardized format for representing text detected by
18
+ OCR engines, regardless of the underlying engine implementation. This ensures
19
+ consistent interfaces across different OCR backends (EasyOCR, Surya, PaddleOCR, etc.).
20
+
21
+ The class handles coordinate normalization and provides utilities for converting
22
+ between different coordinate formats (bounding boxes vs. polygons).
23
+
24
+ Attributes:
25
+ bbox: Bounding box coordinates as (x0, y0, x1, y1) tuple.
26
+ text: The recognized text content.
27
+ confidence: Confidence score from 0.0 (low) to 1.0 (high).
28
+ source: Source identifier, typically "ocr" or engine name.
29
+
30
+ Example:
31
+ ```python
32
+ # Create from bounding box
33
+ region = TextRegion(
34
+ bbox=(100, 200, 300, 250),
35
+ text="Hello World",
36
+ confidence=0.95
37
+ )
38
+
39
+ # Create from polygon coordinates
40
+ polygon = [[100, 200], [300, 200], [300, 250], [100, 250]]
41
+ region = TextRegion.from_polygon(polygon, "Hello World", 0.95)
42
+
43
+ # Convert to dictionary for processing
44
+ data = region.to_dict()
45
+ ```
46
+ """
16
47
 
17
48
  def __init__(
18
49
  self,
@@ -54,7 +85,57 @@ class TextRegion:
54
85
 
55
86
 
56
87
  class OCREngine(ABC):
57
- """Abstract Base Class for OCR engines."""
88
+ """Abstract base class for OCR engines.
89
+
90
+ This class defines the standard interface that all OCR engines must implement
91
+ in natural-pdf. It provides a consistent API for text recognition regardless
92
+ of the underlying OCR technology (EasyOCR, Surya, PaddleOCR, DocTR, etc.).
93
+
94
+ The base class handles common functionality like model caching, parameter
95
+ validation, and result standardization, while concrete implementations
96
+ provide engine-specific processing logic.
97
+
98
+ Subclasses must implement:
99
+ - process_single_image(): Core OCR processing for a single image
100
+ - is_available(): Check if the engine dependencies are installed
101
+ - get_supported_languages(): Return list of supported language codes
102
+
103
+ Class Attributes:
104
+ DEFAULT_MIN_CONFIDENCE: Default confidence threshold (0.2).
105
+ DEFAULT_LANGUAGES: Default language list (["en"]).
106
+ DEFAULT_DEVICE: Default processing device ("cpu").
107
+
108
+ Attributes:
109
+ logger: Logger instance for the specific engine.
110
+ _model: Cached model instance for the engine.
111
+ _initialized: Whether the engine has been initialized.
112
+ _reader_cache: Cache for initialized models/readers.
113
+
114
+ Example:
115
+ Implementing a custom OCR engine:
116
+ ```python
117
+ class MyOCREngine(OCREngine):
118
+ @classmethod
119
+ def is_available(cls) -> bool:
120
+ try:
121
+ import my_ocr_library
122
+ return True
123
+ except ImportError:
124
+ return False
125
+
126
+ def process_single_image(self, image, languages, min_confidence,
127
+ device, detect_only, options):
128
+ # Implement OCR processing
129
+ return text_regions
130
+ ```
131
+
132
+ Using an OCR engine:
133
+ ```python
134
+ if EasyOCREngine.is_available():
135
+ engine = EasyOCREngine()
136
+ results = engine.process_image(image, languages=['en', 'es'])
137
+ ```
138
+ """
58
139
 
59
140
  # Default values as class constants
60
141
  DEFAULT_MIN_CONFIDENCE = 0.2
@@ -11,6 +11,7 @@ from .ocr_options import BaseOCROptions, PaddleOCROptions
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
+
14
15
  class PaddleOCREngine(OCREngine):
15
16
  """PaddleOCR engine implementation."""
16
17
 
@@ -147,8 +148,8 @@ class PaddleOCREngine(OCREngine):
147
148
 
148
149
  # --- RESTORE: Language/version support check logic ---
149
150
  user_specified_model = (
150
- getattr(paddle_options, "text_recognition_model_name", None) is not None or
151
- getattr(paddle_options, "text_detection_model_name", None) is not None
151
+ getattr(paddle_options, "text_recognition_model_name", None) is not None
152
+ or getattr(paddle_options, "text_detection_model_name", None) is not None
152
153
  )
153
154
  if user_specified_model and user_ocr_version:
154
155
  if primary_lang not in self.SUPPORT_MATRIX.get(user_ocr_version, set()):
@@ -169,7 +170,7 @@ class PaddleOCREngine(OCREngine):
169
170
  user_ocr_version,
170
171
  )
171
172
  final_ocr_version = None # Reset to find a compatible version
172
-
173
+
173
174
  # If no version was specified or the specified one was incompatible, find the best fit.
174
175
  if not final_ocr_version:
175
176
  found_compatible = False
@@ -269,7 +270,6 @@ class PaddleOCREngine(OCREngine):
269
270
  if value is not None:
270
271
  ocr_config[arg] = value
271
272
 
272
-
273
273
  try:
274
274
  # The new API uses PaddleOCR as a pipeline object.
275
275
  self._model = paddleocr.PaddleOCR(**ocr_config)
@@ -350,7 +350,7 @@ class PaddleOCREngine(OCREngine):
350
350
  # This code converts any numpy array to a list before passing to _standardize_bbox,
351
351
  # which handles both rectangle and polygon formats robustly.
352
352
  box = rec_boxes[i]
353
- if hasattr(box, 'tolist'):
353
+ if hasattr(box, "tolist"):
354
354
  box = box.tolist()
355
355
  bbox = self._standardize_bbox(box)
356
356
  if detect_only:
@@ -32,7 +32,8 @@ class OCRFactory:
32
32
  return SuryaOCREngine(**kwargs)
33
33
  except ImportError:
34
34
  raise ImportError(
35
- "Surya engine requires additional dependencies. " "Install with: npdf install surya"
35
+ "Surya engine requires additional dependencies. "
36
+ "Install with: npdf install surya"
36
37
  )
37
38
  elif engine_type == "easyocr":
38
39
  try:
@@ -11,6 +11,7 @@ from PIL import Image
11
11
  from .engine import OCREngine
12
12
  from .engine_doctr import DoctrOCREngine
13
13
  from .engine_easyocr import EasyOCREngine
14
+
14
15
  # Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level
15
16
  # from .engine_paddle import PaddleOCREngine
16
17
  from .engine_surya import SuryaOCREngine
@@ -33,12 +34,16 @@ class OCRManager:
33
34
  def _get_paddle_engine_class():
34
35
  """Lazy import for PaddleOCREngine to avoid heavy paddle dependencies at module level."""
35
36
  from .engine_paddle import PaddleOCREngine
37
+
36
38
  return PaddleOCREngine
37
39
 
38
40
  # Registry mapping engine names to classes and default options
39
41
  ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
40
42
  "easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
41
- "paddle": {"class": lambda: OCRManager._get_paddle_engine_class(), "options_class": PaddleOCROptions},
43
+ "paddle": {
44
+ "class": lambda: OCRManager._get_paddle_engine_class(),
45
+ "options_class": PaddleOCROptions,
46
+ },
42
47
  "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},
43
48
  "doctr": {"class": DoctrOCREngine, "options_class": DoctrOCROptions},
44
49
  # Add other engines here
@@ -85,7 +90,10 @@ class OCRManager:
85
90
  )
86
91
  engine_class_or_factory = self.ENGINE_REGISTRY[engine_name]["class"]
87
92
  # Handle lazy loading - if it's a lambda function, call it to get the actual class
88
- if callable(engine_class_or_factory) and getattr(engine_class_or_factory, '__name__', '') == '<lambda>':
93
+ if (
94
+ callable(engine_class_or_factory)
95
+ and getattr(engine_class_or_factory, "__name__", "") == "<lambda>"
96
+ ):
89
97
  engine_class = engine_class_or_factory()
90
98
  else:
91
99
  engine_class = engine_class_or_factory
@@ -283,7 +291,10 @@ class OCRManager:
283
291
  # Temporarily instantiate to check availability without caching
284
292
  engine_class_or_factory = registry_entry["class"]
285
293
  # Handle lazy loading - if it's a lambda function, call it to get the actual class
286
- if callable(engine_class_or_factory) and getattr(engine_class_or_factory, '__name__', '') == '<lambda>':
294
+ if (
295
+ callable(engine_class_or_factory)
296
+ and getattr(engine_class_or_factory, "__name__", "") == "<lambda>"
297
+ ):
287
298
  engine_class = engine_class_or_factory()
288
299
  else:
289
300
  engine_class = engine_class_or_factory
@@ -299,49 +310,49 @@ class OCRManager:
299
310
  def cleanup_engine(self, engine_name: Optional[str] = None) -> int:
300
311
  """
301
312
  Cleanup OCR engine instances to free memory.
302
-
313
+
303
314
  Args:
304
315
  engine_name: Specific engine to cleanup, or None to cleanup all engines
305
-
316
+
306
317
  Returns:
307
318
  Number of engines cleaned up
308
319
  """
309
320
  cleaned_count = 0
310
-
321
+
311
322
  if engine_name:
312
323
  # Cleanup specific engine
313
324
  engine_name = engine_name.lower()
314
325
  if engine_name in self._engine_instances:
315
326
  engine = self._engine_instances.pop(engine_name)
316
- if hasattr(engine, 'cleanup'):
327
+ if hasattr(engine, "cleanup"):
317
328
  try:
318
329
  engine.cleanup()
319
330
  except Exception as e:
320
331
  logger.debug(f"Engine {engine_name} cleanup method failed: {e}")
321
-
332
+
322
333
  # Clear associated locks
323
334
  self._engine_locks.pop(engine_name, None)
324
335
  self._engine_inference_locks.pop(engine_name, None)
325
-
336
+
326
337
  logger.info(f"Cleaned up OCR engine: {engine_name}")
327
338
  cleaned_count = 1
328
339
  else:
329
340
  # Cleanup all engines
330
341
  for name, engine in list(self._engine_instances.items()):
331
- if hasattr(engine, 'cleanup'):
342
+ if hasattr(engine, "cleanup"):
332
343
  try:
333
344
  engine.cleanup()
334
345
  except Exception as e:
335
346
  logger.debug(f"Engine {name} cleanup method failed: {e}")
336
-
347
+
337
348
  # Clear all caches
338
349
  engine_count = len(self._engine_instances)
339
350
  self._engine_instances.clear()
340
351
  self._engine_locks.clear()
341
352
  self._engine_inference_locks.clear()
342
-
353
+
343
354
  if engine_count > 0:
344
355
  logger.info(f"Cleaned up {engine_count} OCR engines")
345
356
  cleaned_count = engine_count
346
-
357
+
347
358
  return cleaned_count