natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/ocr/engine.py CHANGED
@@ -1,7 +1,8 @@
1
1
  # ocr_engine_base.py
2
2
  import logging
3
3
  from abc import ABC, abstractmethod
4
- from typing import Dict, List, Any, Optional, Tuple, Union
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
5
6
  from PIL import Image
6
7
 
7
8
  # Assuming ocr_options defines BaseOCROptions
@@ -9,35 +10,138 @@ from .ocr_options import BaseOCROptions
9
10
 
10
11
  logger = logging.getLogger(__name__)
11
12
 
13
+
14
+ class TextRegion:
15
+ """Standard representation of an OCR text region."""
16
+
17
+ def __init__(self, bbox: Tuple[float, float, float, float], text: str, confidence: float, source: str = "ocr"):
18
+ """
19
+ Initialize a text region.
20
+
21
+ Args:
22
+ bbox: Tuple of (x0, y0, x1, y1) coordinates
23
+ text: The recognized text
24
+ confidence: Confidence score (0.0-1.0)
25
+ source: Source of the text region (default: "ocr")
26
+ """
27
+ self.bbox = bbox
28
+ self.text = text
29
+ self.confidence = confidence
30
+ self.source = source
31
+
32
+ @classmethod
33
+ def from_polygon(cls, polygon: List[List[float]], text: str, confidence: float):
34
+ """Create from polygon coordinates [[x1,y1], [x2,y2], ...]"""
35
+ x_coords = [float(point[0]) for point in polygon]
36
+ y_coords = [float(point[1]) for point in polygon]
37
+ bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
38
+ return cls(bbox, text, confidence)
39
+
40
+ def to_dict(self) -> Dict[str, Any]:
41
+ """Convert to dictionary representation for compatibility."""
42
+ return {
43
+ "bbox": self.bbox,
44
+ "text": self.text,
45
+ "confidence": self.confidence,
46
+ "source": self.source
47
+ }
48
+
49
+
12
50
  class OCREngine(ABC):
13
51
  """Abstract Base Class for OCR engines."""
52
+
53
+ # Default values as class constants
54
+ DEFAULT_MIN_CONFIDENCE = 0.2
55
+ DEFAULT_LANGUAGES = ['en']
56
+ DEFAULT_DEVICE = 'cpu'
14
57
 
15
58
  def __init__(self):
16
59
  """Initializes the base OCR engine."""
17
60
  self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
18
61
  self.logger.info(f"Initializing {self.__class__.__name__}")
19
- self._reader_cache = {} # Cache for initialized models/readers
62
+ self._model = None
63
+ self._initialized = False
64
+ self._reader_cache = {} # Cache for initialized models/readers
20
65
 
21
- @abstractmethod
22
66
  def process_image(
23
67
  self,
24
- images: Union[Image.Image, List[Image.Image]], # Accept single or list
25
- options: BaseOCROptions
26
- ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Return single or list of lists
68
+ images: Union[Image.Image, List[Image.Image]],
69
+ languages: Optional[List[str]] = None,
70
+ min_confidence: Optional[float] = None,
71
+ device: Optional[str] = None,
72
+ detect_only: bool = False,
73
+ options: Optional[BaseOCROptions] = None,
74
+ ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
27
75
  """
28
- Processes a single image or a batch of images using the specific engine and options.
29
-
76
+ Process a single image or batch of images with OCR.
77
+
30
78
  Args:
31
- images: A single PIL Image or a list of PIL Images.
32
- options: An instance of a dataclass inheriting from BaseOCROptions
33
- containing configuration for this run.
34
-
79
+ images: A single PIL Image or a list of PIL Images
80
+ languages: List of languages to use (default: ['en'])
81
+ min_confidence: Minimum confidence threshold (default: 0.2)
82
+ device: Device to use for processing (default: 'cpu')
83
+ detect_only: Whether to only detect text regions without recognition
84
+ options: Engine-specific options
85
+
35
86
  Returns:
36
- If input is a single image: List of result dictionaries.
37
- If input is a list of images: List of lists of result dictionaries,
38
- corresponding to each input image.
39
- An empty list indicates failure for that image.
87
+ For a single image: List of text region dictionaries
88
+ For a batch: List of lists of text region dictionaries
40
89
  """
90
+ # Convert single image to batch format
91
+ single_image = not isinstance(images, list)
92
+ image_batch = [images] if single_image else images
93
+
94
+ # Use default values where parameters are not provided
95
+ effective_languages = languages or self.DEFAULT_LANGUAGES
96
+ effective_confidence = min_confidence if min_confidence is not None else self.DEFAULT_MIN_CONFIDENCE
97
+ effective_device = device or self.DEFAULT_DEVICE
98
+
99
+ # Ensure the model is initialized
100
+ self._ensure_initialized(effective_languages, effective_device, options)
101
+
102
+ # Process each image in the batch
103
+ results = []
104
+ for img in image_batch:
105
+ # Preprocess the image for the specific engine
106
+ processed_img = self._preprocess_image(img)
107
+
108
+ # Process the image with the engine-specific implementation
109
+ raw_results = self._process_single_image(processed_img, detect_only, options)
110
+
111
+ # Convert results to standardized format
112
+ text_regions = self._standardize_results(raw_results, effective_confidence, detect_only)
113
+
114
+ # Convert TextRegion objects to dictionaries for backward compatibility
115
+ region_dicts = [region.to_dict() for region in text_regions]
116
+ results.append(region_dicts)
117
+
118
+ # Return results in the appropriate format
119
+ return results[0] if single_image else results
120
+
121
+ def _ensure_initialized(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
122
+ """Ensure the model is initialized with the correct parameters."""
123
+ if not self._initialized:
124
+ self._initialize_model(languages, device, options)
125
+ self._initialized = True
126
+
127
+ @abstractmethod
128
+ def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
129
+ """Initialize the OCR model with the given parameters."""
130
+ raise NotImplementedError("Subclasses must implement this method")
131
+
132
+ @abstractmethod
133
+ def _preprocess_image(self, image: Image.Image) -> Any:
134
+ """Convert PIL Image to engine-specific format."""
135
+ raise NotImplementedError("Subclasses must implement this method")
136
+
137
+ @abstractmethod
138
+ def _process_single_image(self, image: Any, detect_only: bool, options: Optional[BaseOCROptions]) -> Any:
139
+ """Process a single image with the initialized model."""
140
+ raise NotImplementedError("Subclasses must implement this method")
141
+
142
+ @abstractmethod
143
+ def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
144
+ """Convert engine-specific results to standardized TextRegion objects."""
41
145
  raise NotImplementedError("Subclasses must implement this method")
42
146
 
43
147
  @abstractmethod
@@ -61,44 +165,44 @@ class OCREngine(ABC):
61
165
  Returns:
62
166
  A string cache key.
63
167
  """
64
- # Basic key includes languages and device
65
- lang_key = "-".join(sorted(options.languages))
66
- device_key = str(options.device).lower()
168
+ lang_key = "-".join(sorted(getattr(options, "languages", self.DEFAULT_LANGUAGES)))
169
+ device_key = str(getattr(options, "device", self.DEFAULT_DEVICE)).lower()
67
170
  return f"{self.__class__.__name__}_{lang_key}_{device_key}"
68
171
 
69
- def _standardize_bbox(self, bbox: Any) -> Optional[Tuple[float, float, float, float]]:
70
- """
71
- Helper to standardize bounding boxes to (x0, y0, x1, y1) format.
72
-
73
- Args:
74
- bbox: The bounding box in the engine's native format.
75
- Expected formats:
76
- - List/Tuple of 4 numbers: (x0, y0, x1, y1)
77
- - List of points: [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (polygon)
78
-
79
- Returns:
80
- Tuple[float, float, float, float] or None if conversion fails.
81
- """
82
- try:
83
- if isinstance(bbox, (list, tuple)) and len(bbox) == 4 and all(isinstance(n, (int, float)) for n in bbox):
84
- # Already in (x0, y0, x1, y1) format (or similar)
172
+ def _standardize_bbox(self, bbox: Any) -> Tuple[float, float, float, float]:
173
+ """Standardizes bounding boxes to (x0, y0, x1, y1) format. Raises ValueError if standardization fails."""
174
+ # Check if it's already in the correct tuple/list format
175
+ if (
176
+ isinstance(bbox, (list, tuple))
177
+ and len(bbox) == 4
178
+ and all(isinstance(n, (int, float)) for n in bbox)
179
+ ):
180
+ try:
85
181
  return tuple(float(c) for c in bbox[:4])
86
- elif isinstance(bbox, (list, tuple)) and len(bbox) > 0 and isinstance(bbox[0], (list, tuple)):
87
- # Polygon format [[x1,y1],[x2,y2],...]
182
+ except (ValueError, TypeError) as e:
183
+ raise ValueError(f"Invalid number format in bbox: {bbox}") from e
184
+
185
+ # Check if it's in polygon format [[x1,y1],[x2,y2],...]
186
+ elif (
187
+ isinstance(bbox, (list, tuple))
188
+ and len(bbox) > 0
189
+ and isinstance(bbox[0], (list, tuple))
190
+ and len(bbox[0]) == 2 # Ensure points are pairs
191
+ ):
192
+ try:
88
193
  x_coords = [float(point[0]) for point in bbox]
89
194
  y_coords = [float(point[1]) for point in bbox]
90
- x0 = min(x_coords)
91
- y0 = min(y_coords)
92
- x1 = max(x_coords)
93
- y1 = max(y_coords)
94
- return (x0, y0, x1, y1)
95
- except Exception as e:
96
- self.logger.warning(f"Could not standardize bounding box: {bbox}. Error: {e}")
97
- return None
195
+ if not x_coords or not y_coords: # Handle empty polygon case
196
+ raise ValueError("Empty polygon provided")
197
+ return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
198
+ except (ValueError, TypeError, IndexError) as e:
199
+ raise ValueError(f"Invalid polygon format or values: {bbox}") from e
200
+
201
+ # If it's neither format, raise an error
202
+ raise ValueError(f"Could not standardize bounding box from unexpected format: {bbox}")
98
203
 
99
204
  def __del__(self):
100
205
  """Cleanup resources when the engine is deleted."""
101
206
  self.logger.info(f"Cleaning up {self.__class__.__name__} resources.")
102
207
  # Clear reader cache to free up memory/GPU resources
103
208
  self._reader_cache.clear()
104
-
natural_pdf/ocr/engine_easyocr.py CHANGED
@@ -1,179 +1,175 @@
1
1
  # ocr_engine_easyocr.py
2
- import logging
3
2
  import importlib.util
4
- from typing import Dict, List, Any, Optional, Tuple, Union
3
+ import logging
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
5
6
  import numpy as np
6
7
  from PIL import Image
7
- import inspect # Used for dynamic parameter passing
8
8
 
9
- from .engine import OCREngine
10
- from .ocr_options import EasyOCROptions, BaseOCROptions
9
+ from .engine import OCREngine, TextRegion
10
+ from .ocr_options import BaseOCROptions, EasyOCROptions
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
+
14
15
  class EasyOCREngine(OCREngine):
15
16
  """EasyOCR engine implementation."""
16
17
 
17
18
  def __init__(self):
18
19
  super().__init__()
19
- self._easyocr = None # Lazy load easyocr module
20
-
21
- def _lazy_import_easyocr(self):
22
- """Imports easyocr only when needed."""
23
- if self._easyocr is None:
24
- if not self.is_available():
25
- raise ImportError("EasyOCR is not installed or available.")
26
- try:
27
- import easyocr
28
- self._easyocr = easyocr
29
- logger.info("EasyOCR module imported successfully.")
30
- except ImportError as e:
31
- logger.error(f"Failed to import EasyOCR: {e}")
32
- raise
33
- return self._easyocr
20
+ # No longer need _easyocr attribute
21
+ # self._easyocr = None
34
22
 
35
23
  def is_available(self) -> bool:
36
24
  """Check if EasyOCR is installed."""
37
25
  return importlib.util.find_spec("easyocr") is not None
38
26
 
39
- def _get_cache_key(self, options: EasyOCROptions) -> str:
40
- """Generate a more specific cache key for EasyOCR."""
41
- base_key = super()._get_cache_key(options)
42
- recog_key = options.recog_network
43
- detect_key = options.detect_network
44
- quantize_key = str(options.quantize)
45
- return f"{base_key}_{recog_key}_{detect_key}_{quantize_key}"
46
-
47
- def _get_reader(self, options: EasyOCROptions):
48
- """Get or initialize an EasyOCR reader based on options."""
49
- cache_key = self._get_cache_key(options)
50
- if cache_key in self._reader_cache:
51
- logger.debug(f"Using cached EasyOCR reader for key: {cache_key}")
52
- return self._reader_cache[cache_key]
53
-
54
- logger.info(f"Creating new EasyOCR reader for key: {cache_key}")
55
- easyocr = self._lazy_import_easyocr()
56
-
57
- constructor_sig = inspect.signature(easyocr.Reader.__init__)
58
- constructor_args = {}
59
- constructor_args['lang_list'] = options.languages
60
- constructor_args['gpu'] = 'cuda' in str(options.device).lower() or 'mps' in str(options.device).lower()
61
-
62
- for field_name, param in constructor_sig.parameters.items():
63
- if field_name in ['self', 'lang_list', 'gpu']: continue
64
- if hasattr(options, field_name):
65
- constructor_args[field_name] = getattr(options, field_name)
66
- elif field_name in options.extra_args:
67
- constructor_args[field_name] = options.extra_args[field_name]
68
-
69
- logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
27
+ def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
28
+ """Initialize the EasyOCR model."""
29
+ # Import directly here
70
30
  try:
71
- reader = easyocr.Reader(**constructor_args)
72
- self._reader_cache[cache_key] = reader
73
- logger.info("EasyOCR reader created successfully.")
74
- return reader
31
+ import easyocr
32
+ self.logger.info("EasyOCR module imported successfully.")
33
+ except ImportError as e:
34
+ self.logger.error(f"Failed to import EasyOCR: {e}")
35
+ raise
36
+
37
+ # Cast to EasyOCROptions if possible, otherwise use default
38
+ easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
39
+
40
+ # Prepare constructor arguments
41
+ use_gpu = "cuda" in device.lower() or "mps" in device.lower()
42
+
43
+ constructor_args = {
44
+ "lang_list": languages,
45
+ "gpu": use_gpu,
46
+ # Explicitly map relevant options
47
+ "model_storage_directory": easy_options.model_storage_directory,
48
+ "user_network_directory": easy_options.user_network_directory,
49
+ "recog_network": easy_options.recog_network,
50
+ "detect_network": easy_options.detect_network,
51
+ "download_enabled": easy_options.download_enabled,
52
+ "detector": easy_options.detector,
53
+ "recognizer": easy_options.recognizer,
54
+ "verbose": easy_options.verbose,
55
+ "quantize": easy_options.quantize,
56
+ "cudnn_benchmark": easy_options.cudnn_benchmark,
57
+ }
58
+
59
+ # Filter out None values, as EasyOCR expects non-None or default behaviour
60
+ constructor_args = {k: v for k, v in constructor_args.items() if v is not None}
61
+
62
+ self.logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
63
+
64
+ # Create the reader
65
+ try:
66
+ self._model = easyocr.Reader(**constructor_args)
67
+ self.logger.info("EasyOCR reader created successfully")
75
68
  except Exception as e:
76
- logger.error(f"Failed to create EasyOCR reader: {e}", exc_info=True)
69
+ self.logger.error(f"Failed to create EasyOCR reader: {e}")
77
70
  raise
78
71
 
79
- def _prepare_readtext_args(self, options: EasyOCROptions, reader) -> Dict[str, Any]:
80
- """Helper to prepare arguments for the readtext method."""
81
- readtext_sig = inspect.signature(reader.readtext)
72
+ def _preprocess_image(self, image: Image.Image) -> np.ndarray:
73
+ """Convert PIL Image to numpy array for EasyOCR."""
74
+ return np.array(image)
75
+
76
+ def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]) -> Any:
77
+ """Process a single image with EasyOCR."""
78
+ if self._model is None:
79
+ raise RuntimeError("EasyOCR model not initialized")
80
+
81
+ # Cast options to proper type if provided
82
+ easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
83
+
84
+ # Prepare readtext arguments (only needed if not detect_only)
82
85
  readtext_args = {}
83
- for field_name, param in readtext_sig.parameters.items():
84
- if field_name == 'image': continue
85
- if hasattr(options, field_name):
86
- readtext_args[field_name] = getattr(options, field_name)
87
- elif field_name in options.extra_args:
88
- readtext_args[field_name] = options.extra_args[field_name]
89
- logger.debug(f"EasyOCR readtext args: {readtext_args}")
90
- return readtext_args
91
-
92
- def _standardize_results(self, raw_results: List[Any], options: EasyOCROptions) -> List[Dict[str, Any]]:
93
- """Standardizes raw results from EasyOCR's readtext."""
94
- standardized_results = []
95
- min_confidence = options.min_confidence
96
-
86
+ if not detect_only:
87
+ for param in [
88
+ "detail", "paragraph", "min_size", "contrast_ths", "adjust_contrast",
89
+ "filter_ths", "text_threshold", "low_text", "link_threshold",
90
+ "canvas_size", "mag_ratio", "slope_ths", "ycenter_ths", "height_ths",
91
+ "width_ths", "y_ths", "x_ths", "add_margin", "output_format"
92
+ ]:
93
+ if hasattr(easy_options, param):
94
+ val = getattr(easy_options, param)
95
+ if val is not None:
96
+ readtext_args[param] = val
97
+
98
+ # Process differently based on detect_only flag
99
+ if detect_only:
100
+ # Returns tuple (horizontal_list, free_list)
101
+ # horizontal_list is a list containing one item: the list of boxes
102
+ # Each box is [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
103
+ bboxes_tuple = self._model.detect(image, **readtext_args) # Pass args here too? Check EasyOCR docs if needed.
104
+ if bboxes_tuple and isinstance(bboxes_tuple, tuple) and len(bboxes_tuple) > 0 and isinstance(bboxes_tuple[0], list):
105
+ return bboxes_tuple[0] # Return the list of polygons directly
106
+ else:
107
+ self.logger.warning(f"EasyOCR detect returned unexpected format: {bboxes_tuple}")
108
+ return [] # Return empty list on unexpected format
109
+ else:
110
+ return self._model.readtext(image, **readtext_args)
111
+
112
+ def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
113
+ """Convert EasyOCR results to standardized TextRegion objects."""
114
+ standardized_regions = []
115
+
116
+ if detect_only:
117
+ # In detect_only mode, raw_results is already a list of bounding boxes
118
+ # Each bbox is in [x_min, x_max, y_min, y_max] format
119
+ if isinstance(raw_results, list):
120
+ for detection in raw_results:
121
+ try:
122
+ if isinstance(detection, (list, tuple)) and len(detection) == 4:
123
+ x_min, x_max, y_min, y_max = detection
124
+ # Convert to standardized (x0, y0, x1, y1) format
125
+ try:
126
+ bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
127
+ standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
128
+ except (ValueError, TypeError) as e:
129
+ raise ValueError(f"Invalid number format in EasyOCR detect bbox: {detection}") from e
130
+ else:
131
+ raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
132
+ except ValueError as e:
133
+ # Re-raise any value errors from standardization or format checks
134
+ raise e
135
+ except Exception as e:
136
+ # Catch other potential processing errors
137
+ raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
138
+ else:
139
+ raise ValueError(f"Expected list of bounding boxes in detect_only mode, got: {raw_results}")
140
+
141
+ return standardized_regions
142
+
143
+ # Full OCR mode (readtext results)
97
144
  for detection in raw_results:
98
145
  try:
99
- if options.detail == 1 and isinstance(detection, (list, tuple)) and len(detection) >= 3:
100
- bbox_raw = detection[0]
146
+ # Detail mode (list/tuple result)
147
+ if isinstance(detection, (list, tuple)) and len(detection) >= 3:
148
+ bbox_raw = detection[0] # This is usually a polygon [[x1,y1],...]
101
149
  text = str(detection[1])
102
150
  confidence = float(detection[2])
103
-
151
+
104
152
  if confidence >= min_confidence:
105
- bbox = self._standardize_bbox(bbox_raw)
106
- if bbox:
107
- standardized_results.append({
108
- 'bbox': bbox, 'text': text, 'confidence': confidence, 'source': 'ocr'
109
- })
110
- else:
111
- logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
112
-
113
- elif options.detail == 0 and isinstance(detection, str):
114
- standardized_results.append({
115
- 'bbox': None, 'text': detection, 'confidence': 1.0, 'source': 'ocr'
116
- })
117
- except (IndexError, ValueError, TypeError) as e:
118
- logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
119
- continue
120
- return standardized_results
121
-
122
-
123
- def process_image(
124
- self,
125
- images: Union[Image.Image, List[Image.Image]],
126
- options: BaseOCROptions
127
- ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
128
- """Processes a single image or a batch of images with EasyOCR."""
129
-
130
- if not isinstance(options, EasyOCROptions):
131
- logger.warning("Received BaseOCROptions, expected EasyOCROptions. Using defaults.")
132
- # Create default EasyOCR options if base was passed, preserving base settings
133
- options = EasyOCROptions(
134
- languages=options.languages,
135
- min_confidence=options.min_confidence,
136
- device=options.device,
137
- extra_args=options.extra_args # Pass along any extra args
138
- )
139
-
140
- reader = self._get_reader(options)
141
- readtext_args = self._prepare_readtext_args(options, reader)
142
-
143
- # --- Handle single image or batch ---
144
- if isinstance(images, list):
145
- # --- Batch Processing (Iterative for EasyOCR) ---
146
- all_results = []
147
- logger.info(f"Processing batch of {len(images)} images with EasyOCR (iteratively)...")
148
- for i, img in enumerate(images):
149
- if not isinstance(img, Image.Image):
150
- logger.warning(f"Item at index {i} in batch is not a PIL Image. Skipping.")
151
- all_results.append([])
152
- continue
153
- img_array = np.array(img)
154
- try:
155
- logger.debug(f"Processing image {i+1}/{len(images)} in batch.")
156
- raw_results = reader.readtext(img_array, **readtext_args)
157
- standardized = self._standardize_results(raw_results, options)
158
- all_results.append(standardized)
159
- except Exception as e:
160
- logger.error(f"Error processing image {i+1} in EasyOCR batch: {e}", exc_info=True)
161
- all_results.append([]) # Append empty list for failed image
162
- logger.info(f"Finished processing batch with EasyOCR.")
163
- return all_results # Return List[List[Dict]]
164
-
165
- elif isinstance(images, Image.Image):
166
- # --- Single Image Processing ---
167
- logger.info("Processing single image with EasyOCR...")
168
- img_array = np.array(images)
169
- try:
170
- raw_results = reader.readtext(img_array, **readtext_args)
171
- standardized = self._standardize_results(raw_results, options)
172
- logger.info(f"Finished processing single image. Found {len(standardized)} results.")
173
- return standardized # Return List[Dict]
153
+ try:
154
+ # Use the standard helper for polygons
155
+ bbox = self._standardize_bbox(bbox_raw)
156
+ standardized_regions.append(TextRegion(bbox, text, confidence))
157
+ except ValueError as e:
158
+ raise ValueError(f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}") from e
159
+
160
+ # Simple mode (string result)
161
+ elif isinstance(detection, str):
162
+ if 0.0 >= min_confidence: # Always include if min_confidence is 0
163
+ standardized_regions.append(TextRegion((0, 0, 0, 0), detection, 1.0))
164
+ else:
165
+ # Handle unexpected format in OCR mode
166
+ raise ValueError(f"Invalid OCR detection format from EasyOCR readtext: {detection}")
167
+
168
+ except ValueError as e:
169
+ # Re-raise any value errors from standardization or format checks
170
+ raise e
174
171
  except Exception as e:
175
- logger.error(f"Error processing single image with EasyOCR: {e}", exc_info=True)
176
- return [] # Return empty list on failure
177
- else:
178
- raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
179
-
172
+ # Catch other potential processing errors
173
+ raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
174
+
175
+ return standardized_regions