natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/finetuning/index.md +176 -0
  6. docs/index.md +19 -0
  7. docs/ocr/index.md +63 -16
  8. docs/tutorials/01-loading-and-extraction.ipynb +411 -248
  9. docs/tutorials/02-finding-elements.ipynb +123 -46
  10. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  11. docs/tutorials/04-table-extraction.ipynb +17 -12
  12. docs/tutorials/05-excluding-content.ipynb +37 -32
  13. docs/tutorials/06-document-qa.ipynb +36 -31
  14. docs/tutorials/07-layout-analysis.ipynb +45 -40
  15. docs/tutorials/07-working-with-regions.ipynb +61 -60
  16. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  17. docs/tutorials/09-section-extraction.ipynb +160 -155
  18. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  19. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  20. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  21. docs/tutorials/12-ocr-integration.md +68 -106
  22. docs/tutorials/13-semantic-search.ipynb +641 -251
  23. natural_pdf/__init__.py +3 -0
  24. natural_pdf/analyzers/layout/gemini.py +63 -47
  25. natural_pdf/classification/manager.py +343 -0
  26. natural_pdf/classification/mixin.py +149 -0
  27. natural_pdf/classification/results.py +62 -0
  28. natural_pdf/collections/mixins.py +63 -0
  29. natural_pdf/collections/pdf_collection.py +326 -17
  30. natural_pdf/core/element_manager.py +73 -4
  31. natural_pdf/core/page.py +255 -83
  32. natural_pdf/core/pdf.py +385 -367
  33. natural_pdf/elements/base.py +1 -3
  34. natural_pdf/elements/collections.py +279 -49
  35. natural_pdf/elements/region.py +106 -21
  36. natural_pdf/elements/text.py +5 -2
  37. natural_pdf/exporters/__init__.py +4 -0
  38. natural_pdf/exporters/base.py +61 -0
  39. natural_pdf/exporters/paddleocr.py +345 -0
  40. natural_pdf/extraction/manager.py +134 -0
  41. natural_pdf/extraction/mixin.py +246 -0
  42. natural_pdf/extraction/result.py +37 -0
  43. natural_pdf/ocr/__init__.py +16 -8
  44. natural_pdf/ocr/engine.py +46 -30
  45. natural_pdf/ocr/engine_easyocr.py +86 -42
  46. natural_pdf/ocr/engine_paddle.py +39 -28
  47. natural_pdf/ocr/engine_surya.py +32 -16
  48. natural_pdf/ocr/ocr_factory.py +34 -23
  49. natural_pdf/ocr/ocr_manager.py +98 -34
  50. natural_pdf/ocr/ocr_options.py +38 -10
  51. natural_pdf/ocr/utils.py +59 -33
  52. natural_pdf/qa/document_qa.py +0 -4
  53. natural_pdf/selectors/parser.py +363 -238
  54. natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
  55. natural_pdf/utils/debug.py +4 -2
  56. natural_pdf/utils/identifiers.py +9 -5
  57. natural_pdf/utils/locks.py +8 -0
  58. natural_pdf/utils/packaging.py +172 -105
  59. natural_pdf/utils/text_extraction.py +96 -65
  60. natural_pdf/utils/tqdm_utils.py +43 -0
  61. natural_pdf/utils/visualization.py +1 -1
  62. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
  63. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
  64. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  65. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  66. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,134 @@
1
+ import logging
2
+ from typing import Any, Type, Optional
3
+ from pydantic import BaseModel
4
+ import io
5
+ import base64
6
+ from PIL import Image
7
+
8
+ from natural_pdf.extraction.result import StructuredDataResult
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class StructuredDataManager:
    """
    Manages the process of extracting structured data from elements using LLMs.

    This manager is typically accessed via `pdf.get_manager('structured_data')`.
    It is stateless and relies on parameters passed during method calls.
    """

    # Fallback model identifiers used by `extract` when the caller passes no `model`.
    DEFAULT_TEXT_MODEL = "gpt-4o-mini"
    DEFAULT_VISION_MODEL = "gpt-4o"

    def __init__(self):
        """Initializes the manager."""
        logger.info("Initialized StructuredDataManager.")

    def is_available(self) -> bool:
        """Checks if necessary dependencies are available."""
        try:
            import pydantic
            return True
        except ImportError:
            logger.warning("Pydantic is required for structured data extraction.")
            return False

    def _prepare_llm_messages(
        self,
        content: Any,
        prompt: Optional[str],
        using: str,
        schema: Type[BaseModel]
    ) -> list:
        """
        Prepares the message list for the LLM API call.

        Args:
            content: Text (anything `str()` can render) for using='text',
                or a PIL Image for using='vision'.
            prompt: System prompt; when falsy, a default prompt naming the
                schema class is used instead.
            using: Modality ('text' or 'vision').
            schema: Pydantic model class; only its name is used here.

        Returns:
            A list with one system message followed by one user message.

        Raises:
            TypeError: If using='vision' and content is not a PIL Image.
            ValueError: If `using` is neither 'text' nor 'vision'.
        """
        system_prompt = prompt or f"Extract the information corresponding to the fields in the {schema.__name__} schema. Respond only with the structured data."

        messages = [
            {"role": "system", "content": system_prompt}
        ]

        if using == 'text':
            messages.append({"role": "user", "content": str(content)})
        elif using == 'vision':
            if isinstance(content, Image.Image):
                # Encode the image as a base64 PNG data URL for the image_url content part.
                buffered = io.BytesIO()
                content.save(buffered, format="PNG")
                base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
                messages.append({
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Extract information from this image based on the schema."},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                        },
                    ],
                })
            else:
                raise TypeError(f"Content must be a PIL Image for using='vision', got {type(content)}")
        else:
            raise ValueError(f"Unsupported value for 'using': {using}")

        return messages

    def extract(
        self,
        content: Any,
        schema: Type[BaseModel],
        client: Any,
        prompt: Optional[str] = None,
        using: str = 'text',
        model: Optional[str] = None,
        **kwargs
    ) -> StructuredDataResult:
        """
        Extract structured data from content using an LLM.

        Args:
            content: Text string or Image object
            schema: Pydantic model class for the desired structure
            client: Initialized LLM client (e.g., OpenAI client)
            prompt: Optional user-provided instructions
            using: Modality ('text' or 'vision')
            model: Specific LLM model identifier
            **kwargs: Additional parameters for the LLM API call

        Returns:
            StructuredDataResult object. Errors raised by the API call are
            not propagated; they are reported through `success=False` and
            `error_message`.

        Raises:
            NotImplementedError: If using='vision' and content is a list
                with more than one item.
            TypeError: If using='vision' and content is not a PIL Image.
            ValueError: If `using` is neither 'text' nor 'vision'.
        """
        logger.debug(f"Extract request: using='{using}', schema='{schema.__name__}'")

        if isinstance(content, list) and using == 'vision':
            if len(content) == 1:
                # A one-item list is unwrapped and handled as a single image.
                content = content[0]
            elif len(content) > 1:
                logger.error("Vision extraction not supported for multi-page PDFs")
                raise NotImplementedError("Batch image extraction on multi-page PDF objects is not supported. Apply to individual pages or regions instead.")

        selected_model = model or (self.DEFAULT_VISION_MODEL if using == 'vision' else self.DEFAULT_TEXT_MODEL)
        # Message preparation sits outside the try block, so bad input raises
        # instead of being folded into a failed result.
        messages = self._prepare_llm_messages(content, prompt, using, schema)

        try:
            logger.debug(f"Extracting with model '{selected_model}'")
            completion = client.beta.chat.completions.parse(
                model=selected_model,
                messages=messages,
                response_format=schema,
                **kwargs
            )
            parsed_data = completion.choices[0].message.parsed
            # The result field is named `model_used`. Passing `model=` would be
            # ignored as an unknown field, leaving the identifier unrecorded.
            return StructuredDataResult(
                data=parsed_data,
                success=True,
                error_message=None,
                model_used=selected_model
            )
        except Exception as e:
            logger.error(f"Extraction failed: {str(e)}")
            return StructuredDataResult(
                data=None,
                success=False,
                error_message=str(e),
                model_used=selected_model
            )
@@ -0,0 +1,246 @@
1
+ import logging
2
+ from typing import TYPE_CHECKING, Any, Type, Optional
3
+ from abc import ABC, abstractmethod
4
+ from pydantic import BaseModel
5
+
6
+ # Avoid circular import
7
+ if TYPE_CHECKING:
8
+ from natural_pdf.extraction.result import StructuredDataResult
9
+ from natural_pdf.core.page import Page
10
+ from natural_pdf.elements.base import Element
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ DEFAULT_STRUCTURED_KEY = "default-structured" # Define default key
15
+
16
class ExtractionMixin(ABC):
    """
    Mixin class providing structured data extraction capabilities to elements.
    Assumes the inheriting class has `extract_text(**kwargs)` and `to_image(**kwargs)` methods.
    """

    def _get_extraction_content(self, using: str = 'text', **kwargs) -> Any:
        """
        Retrieves the content (text or image) for extraction.

        Any exception raised while retrieving content is logged and turned
        into a `None` return value rather than propagated.

        Args:
            using: 'text' or 'vision'
            **kwargs: Additional arguments passed to extract_text or to_image

        Returns:
            str: Extracted text if using='text'
            PIL.Image.Image: Rendered image if using='vision'
            None: If content cannot be retrieved
        """
        if not hasattr(self, 'extract_text') or not callable(self.extract_text):
            logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
            return None
        if not hasattr(self, 'to_image') or not callable(self.to_image):
            logger.error(f"ExtractionMixin requires 'to_image' method on {self!r}")
            return None

        try:
            if using == 'text':
                layout = kwargs.pop('layout', True)
                return self.extract_text(layout=layout, **kwargs)
            elif using == 'vision':
                # `layout` is a text-extraction option that `extract` always
                # supplies; discard it so it is not forwarded to `to_image`.
                kwargs.pop('layout', None)
                resolution = kwargs.pop('resolution', 72)
                include_highlights = kwargs.pop('include_highlights', False)
                labels = kwargs.pop('labels', False)
                return self.to_image(
                    resolution=resolution,
                    include_highlights=include_highlights,
                    labels=labels,
                    **kwargs
                )
            else:
                logger.error(f"Unsupported value for 'using': {using}")
                return None
        except Exception as e:
            logger.error(f"Error getting {using} content from {self!r}: {e}")
            return None

    def extract(
        self: Any,
        schema: Type[BaseModel],
        client: Any,
        analysis_key: str = DEFAULT_STRUCTURED_KEY,  # Default key
        prompt: Optional[str] = None,
        using: str = 'text',
        model: Optional[str] = None,
        overwrite: bool = False,
        **kwargs
    ) -> Any:
        """
        Extracts structured data according to the provided schema.

        Results are stored in the element's `analyses` dictionary. A failed
        extraction is stored as well (with `success=False`), not raised.

        Args:
            schema: Pydantic model class defining the desired structure
            client: Initialized LLM client
            analysis_key: Key to store the result under in `analyses`. Defaults to "default-structured".
            prompt: Optional user-provided prompt for the LLM
            using: Modality ('text' or 'vision')
            model: Optional specific LLM model identifier
            overwrite: If True, allow overwriting an existing result at `analysis_key`.
            **kwargs: Additional parameters for extraction. They are passed to
                content retrieval and to the manager's `extract`, except that
                `layout`, `resolution`, `include_highlights` and `labels` are
                used for content retrieval only.

        Returns:
            Self for method chaining

        Raises:
            ValueError: If `analysis_key` is empty, or already present in
                `analyses` while `overwrite` is False.
            RuntimeError: If no object exposing `get_manager` can be reached,
                or the structured data manager cannot be obtained or is
                unavailable.
        """
        if not analysis_key:
            raise ValueError("analysis_key cannot be empty for extract operation")

        # --- Overwrite Check --- #
        if not hasattr(self, 'analyses') or self.analyses is None:
            self.analyses = {}

        if analysis_key in self.analyses and not overwrite:
            raise ValueError(
                f"Analysis key '{analysis_key}' already exists in analyses. "
                f"Use overwrite=True to replace it. Available keys: {list(self.analyses.keys())}"
            )
        # --- End Overwrite Check --- #

        # Determine PDF instance to get manager
        pdf_instance = None

        if hasattr(self, 'get_manager') and callable(self.get_manager):
            # Handle case where self is the PDF instance itself
            pdf_instance = self
            logger.debug(f"Manager access via self ({type(self).__name__})")
        elif hasattr(self, 'pdf') and hasattr(self.pdf, 'get_manager') and callable(self.pdf.get_manager):
            # Handle Page or other elements with direct .pdf reference
            pdf_instance = self.pdf
            logger.debug(f"Manager access via self.pdf ({type(self).__name__})")
        elif hasattr(self, 'page') and hasattr(self.page, 'pdf') and hasattr(self.page.pdf, 'get_manager') and callable(self.page.pdf.get_manager):
            # Handle Region or other elements with .page.pdf reference
            pdf_instance = self.page.pdf
            logger.debug(f"Manager access via self.page.pdf ({type(self).__name__})")
        else:
            logger.error(f"Could not find get_manager on {type(self).__name__}, self.pdf, or self.page.pdf")
            raise RuntimeError(f"Cannot access PDF manager: {type(self).__name__} lacks necessary references")

        try:
            manager = pdf_instance.get_manager('structured_data')
        except Exception as e:
            # Chain the cause so the underlying failure stays in the traceback.
            raise RuntimeError(f"Failed to get StructuredDataManager: {e}") from e

        if not manager or not manager.is_available():
            raise RuntimeError("StructuredDataManager is not available")

        # Get content
        layout_for_text = kwargs.pop('layout', True)
        content = self._get_extraction_content(using=using, layout=layout_for_text, **kwargs)

        if content is None or (using == 'text' and isinstance(content, str) and not content.strip()):
            logger.warning(f"No content available for extraction (using='{using}') on {self!r}")
            # Import here to avoid circularity at module level
            from natural_pdf.extraction.result import StructuredDataResult
            # The result field is named `model_used`; a `model=` keyword would
            # not populate it.
            result = StructuredDataResult(
                data=None,
                success=False,
                error_message=f"No content available for extraction (using='{using}')",
                model_used=model  # Use model requested, even if failed
            )
        else:
            # Rendering options were already consumed when producing the
            # content; they are not parameters of the LLM call, so keep them
            # out of what the manager forwards to the client.
            render_only = ('resolution', 'include_highlights', 'labels')
            llm_kwargs = {k: v for k, v in kwargs.items() if k not in render_only}
            result = manager.extract(
                content=content,
                schema=schema,
                client=client,
                prompt=prompt,
                using=using,
                model=model,
                **llm_kwargs
            )

        # Store the result
        self.analyses[analysis_key] = result
        logger.info(f"Stored extraction result under key '{analysis_key}' (Success: {result.success})")

        return self

    def extracted(self, field_name: Optional[str] = None, analysis_key: Optional[str] = None) -> Any:
        """
        Convenience method to access results from structured data extraction.

        Args:
            field_name: The specific field to retrieve from the extracted data.
                        If None, returns the entire data object.
            analysis_key: The key under which the extraction result was stored in `analyses`.
                          If None, defaults to "default-structured".

        Returns:
            The requested field value, or the entire data object.

        Raises:
            KeyError: If the specified `analysis_key` is not found in `analyses`,
                or `field_name` is missing from dictionary data.
            TypeError: If the stored item is not a StructuredDataResult, or
                reading the field fails with an unexpected error.
            ValueError: If the stored result indicates a failed extraction or
                carries no data.
            AttributeError: If the element has no `analyses` attribute, or
                `field_name` is not an attribute of the extracted object.
        """
        target_key = analysis_key if analysis_key is not None else DEFAULT_STRUCTURED_KEY

        if not hasattr(self, 'analyses') or self.analyses is None:
            raise AttributeError(f"{type(self).__name__} object has no 'analyses' attribute yet.")

        if target_key not in self.analyses:
            available_keys = list(self.analyses.keys())
            raise KeyError(
                f"Extraction '{target_key}' not found in analyses. "
                f"Available extractions: {available_keys}"
            )

        # Import here to avoid circularity and allow type checking
        from natural_pdf.extraction.result import StructuredDataResult
        result: StructuredDataResult = self.analyses[target_key]

        if not isinstance(result, StructuredDataResult):
            logger.warning(f"Item found at key '{target_key}' is not a StructuredDataResult (type: {type(result)}). Cannot process.")
            raise TypeError(f"Expected a StructuredDataResult at key '{target_key}', found {type(result).__name__}")

        if not result.success:
            raise ValueError(
                f"Stored result for '{target_key}' indicates a failed extraction attempt. "
                f"Error: {result.error_message}"
            )

        if result.data is None:
            # This case might occur if success=True but data is somehow None
            raise ValueError(f"Extraction result for '{target_key}' has no data available, despite success flag.")

        if field_name is None:
            # Return the whole data object (Pydantic model instance or dict)
            return result.data

        # Try dictionary key access first, then attribute access
        if isinstance(result.data, dict):
            try:
                return result.data[field_name]
            except KeyError:
                available_keys = list(result.data.keys())
                raise KeyError(
                    f"Field/Key '{field_name}' not found in extracted dictionary "
                    f"for key '{target_key}'. Available keys: {available_keys}"
                )

        # Assume it's an object, try attribute access
        try:
            return getattr(result.data, field_name)
        except AttributeError:
            # Collect the available field names for the error message. The
            # field maps are read from the class rather than the instance.
            data_type = type(result.data)
            available_fields = []
            if hasattr(data_type, 'model_fields'):  # Pydantic v2
                available_fields = list(data_type.model_fields.keys())
            elif hasattr(data_type, '__fields__'):  # Pydantic v1
                available_fields = list(data_type.__fields__.keys())
            elif hasattr(result.data, '__dict__'):  # Fallback
                available_fields = list(result.data.__dict__.keys())

            raise AttributeError(
                f"Field/Attribute '{field_name}' not found on extracted object of type {type(result.data).__name__} "
                f"for key '{target_key}'. Available fields/attributes: {available_fields}"
            )
        except Exception as e:  # Catch other potential errors during getattr
            raise TypeError(f"Could not access field/attribute '{field_name}' on extracted data for key '{target_key}' (type: {type(result.data).__name__}). Error: {e}") from e
@@ -0,0 +1,37 @@
1
+ from typing import Optional, TypeVar, Generic, Any
2
+ from pydantic import BaseModel, Field
3
+
4
+ # Generic type for the Pydantic model used in the schema
5
+ T_Schema = TypeVar("T_Schema", bound=BaseModel)
6
+
7
+
8
class StructuredDataResult(BaseModel, Generic[T_Schema]):
    """
    Outcome of a single structured data extraction attempt.

    Carries the extracted payload together with a success flag and, for
    failed attempts, a description of the error. Only `success` is required;
    every other field defaults to None.
    """

    # Parsed payload; None when the attempt failed.
    data: Optional[T_Schema] = Field(default=None, description="Validated data model or None on failure")
    # Required flag: True when the extraction produced a result.
    success: bool = Field(..., description="Whether extraction succeeded")
    # Error text for failed attempts; None otherwise.
    error_message: Optional[str] = Field(default=None, description="Error details if extraction failed")
    # Unprocessed response from the language model, if the caller keeps it.
    raw_output: Optional[Any] = Field(default=None, description="Raw output from the language model")
    # Identifier of the model that handled the request. Callers must pass the
    # keyword `model_used`; no field named `model` is declared on this class.
    model_used: Optional[str] = Field(default=None, description="Identifier of the language model used")

    class Config:
        # Permit field values whose types are not pydantic-native.
        arbitrary_types_allowed = True
@@ -11,7 +11,13 @@ logger = logging.getLogger("natural_pdf.ocr")
11
11
 
12
12
  # Import the base classes that are always available
13
13
  from .engine import OCREngine
14
- from .ocr_options import OCROptions, BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions
14
+ from .ocr_options import (
15
+ OCROptions,
16
+ BaseOCROptions,
17
+ EasyOCROptions,
18
+ PaddleOCROptions,
19
+ SuryaOCROptions,
20
+ )
15
21
  from .ocr_manager import OCRManager
16
22
  from .ocr_factory import OCRFactory
17
23
 
@@ -22,13 +28,14 @@ __all__ = [
22
28
  "OCROptions",
23
29
  "BaseOCROptions",
24
30
  "EasyOCROptions",
25
- "PaddleOCROptions",
31
+ "PaddleOCROptions",
26
32
  "SuryaOCROptions",
27
33
  "OCRFactory",
28
34
  "get_engine",
29
- "list_available_engines"
35
+ "list_available_engines",
30
36
  ]
31
37
 
38
+
32
39
  def get_engine(engine_name=None, **kwargs):
33
40
  """
34
41
  Get OCR engine by name with graceful handling of missing dependencies.
@@ -40,27 +47,27 @@ def get_engine(engine_name=None, **kwargs):
40
47
 
41
48
  Returns:
42
49
  OCREngine instance
43
-
50
+
44
51
  Raises:
45
52
  ImportError: If the requested engine's dependencies aren't installed
46
53
  ValueError: If the engine_name is unknown
47
54
  """
48
55
  logger.debug(f"Initializing OCR engine: {engine_name or 'best available'}")
49
-
56
+
50
57
  try:
51
58
  if engine_name is None or engine_name == "default":
52
59
  # Use the factory to get the best available engine
53
60
  engine = OCRFactory.get_recommended_engine(**kwargs)
54
61
  logger.info(f"Using recommended OCR engine: {engine.__class__.__name__}")
55
62
  return engine
56
-
63
+
57
64
  # Use the factory to create a specific engine
58
65
  normalized_name = engine_name.lower()
59
66
  if normalized_name in ["easyocr", "paddle", "surya"]:
60
67
  return OCRFactory.create_engine(normalized_name, **kwargs)
61
68
  else:
62
69
  raise ValueError(f"Unknown OCR engine: {engine_name}")
63
-
70
+
64
71
  except ImportError as e:
65
72
  logger.error(f"OCR engine dependency error: {e}")
66
73
  raise
@@ -68,10 +75,11 @@ def get_engine(engine_name=None, **kwargs):
68
75
  logger.error(f"Error initializing OCR engine: {e}")
69
76
  raise
70
77
 
78
+
71
79
  def list_available_engines():
72
80
  """
73
81
  List all available OCR engines.
74
-
82
+
75
83
  Returns:
76
84
  Dict[str, bool]: Dictionary mapping engine names to availability status
77
85
  """
natural_pdf/ocr/engine.py CHANGED
@@ -13,11 +13,17 @@ logger = logging.getLogger(__name__)
13
13
 
14
14
  class TextRegion:
15
15
  """Standard representation of an OCR text region."""
16
-
17
- def __init__(self, bbox: Tuple[float, float, float, float], text: str, confidence: float, source: str = "ocr"):
16
+
17
+ def __init__(
18
+ self,
19
+ bbox: Tuple[float, float, float, float],
20
+ text: str,
21
+ confidence: float,
22
+ source: str = "ocr",
23
+ ):
18
24
  """
19
25
  Initialize a text region.
20
-
26
+
21
27
  Args:
22
28
  bbox: Tuple of (x0, y0, x1, y1) coordinates
23
29
  text: The recognized text
@@ -28,7 +34,7 @@ class TextRegion:
28
34
  self.text = text
29
35
  self.confidence = confidence
30
36
  self.source = source
31
-
37
+
32
38
  @classmethod
33
39
  def from_polygon(cls, polygon: List[List[float]], text: str, confidence: float):
34
40
  """Create from polygon coordinates [[x1,y1], [x2,y2], ...]"""
@@ -36,24 +42,24 @@ class TextRegion:
36
42
  y_coords = [float(point[1]) for point in polygon]
37
43
  bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
38
44
  return cls(bbox, text, confidence)
39
-
45
+
40
46
  def to_dict(self) -> Dict[str, Any]:
41
47
  """Convert to dictionary representation for compatibility."""
42
48
  return {
43
49
  "bbox": self.bbox,
44
50
  "text": self.text,
45
51
  "confidence": self.confidence,
46
- "source": self.source
52
+ "source": self.source,
47
53
  }
48
54
 
49
55
 
50
56
  class OCREngine(ABC):
51
57
  """Abstract Base Class for OCR engines."""
52
-
58
+
53
59
  # Default values as class constants
54
60
  DEFAULT_MIN_CONFIDENCE = 0.2
55
- DEFAULT_LANGUAGES = ['en']
56
- DEFAULT_DEVICE = 'cpu'
61
+ DEFAULT_LANGUAGES = ["en"]
62
+ DEFAULT_DEVICE = "cpu"
57
63
 
58
64
  def __init__(self):
59
65
  """Initializes the base OCR engine."""
@@ -74,7 +80,7 @@ class OCREngine(ABC):
74
80
  ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
75
81
  """
76
82
  Process a single image or batch of images with OCR.
77
-
83
+
78
84
  Args:
79
85
  images: A single PIL Image or a list of PIL Images
80
86
  languages: List of languages to use (default: ['en'])
@@ -82,7 +88,7 @@ class OCREngine(ABC):
82
88
  device: Device to use for processing (default: 'cpu')
83
89
  detect_only: Whether to only detect text regions without recognition
84
90
  options: Engine-specific options
85
-
91
+
86
92
  Returns:
87
93
  For a single image: List of text region dictionaries
88
94
  For a batch: List of lists of text region dictionaries
@@ -90,42 +96,48 @@ class OCREngine(ABC):
90
96
  # Convert single image to batch format
91
97
  single_image = not isinstance(images, list)
92
98
  image_batch = [images] if single_image else images
93
-
99
+
94
100
  # Use default values where parameters are not provided
95
101
  effective_languages = languages or self.DEFAULT_LANGUAGES
96
- effective_confidence = min_confidence if min_confidence is not None else self.DEFAULT_MIN_CONFIDENCE
102
+ effective_confidence = (
103
+ min_confidence if min_confidence is not None else self.DEFAULT_MIN_CONFIDENCE
104
+ )
97
105
  effective_device = device or self.DEFAULT_DEVICE
98
-
106
+
99
107
  # Ensure the model is initialized
100
108
  self._ensure_initialized(effective_languages, effective_device, options)
101
-
109
+
102
110
  # Process each image in the batch
103
111
  results = []
104
112
  for img in image_batch:
105
113
  # Preprocess the image for the specific engine
106
114
  processed_img = self._preprocess_image(img)
107
-
115
+
108
116
  # Process the image with the engine-specific implementation
109
117
  raw_results = self._process_single_image(processed_img, detect_only, options)
110
-
118
+
111
119
  # Convert results to standardized format
112
120
  text_regions = self._standardize_results(raw_results, effective_confidence, detect_only)
113
-
121
+
114
122
  # Convert TextRegion objects to dictionaries for backward compatibility
115
123
  region_dicts = [region.to_dict() for region in text_regions]
116
124
  results.append(region_dicts)
117
-
125
+
118
126
  # Return results in the appropriate format
119
127
  return results[0] if single_image else results
120
128
 
121
- def _ensure_initialized(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
129
+ def _ensure_initialized(
130
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
131
+ ):
122
132
  """Ensure the model is initialized with the correct parameters."""
123
133
  if not self._initialized:
124
134
  self._initialize_model(languages, device, options)
125
135
  self._initialized = True
126
-
136
+
127
137
  @abstractmethod
128
- def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
138
+ def _initialize_model(
139
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
140
+ ):
129
141
  """Initialize the OCR model with the given parameters."""
130
142
  raise NotImplementedError("Subclasses must implement this method")
131
143
 
@@ -133,14 +145,18 @@ class OCREngine(ABC):
133
145
  def _preprocess_image(self, image: Image.Image) -> Any:
134
146
  """Convert PIL Image to engine-specific format."""
135
147
  raise NotImplementedError("Subclasses must implement this method")
136
-
148
+
137
149
  @abstractmethod
138
- def _process_single_image(self, image: Any, detect_only: bool, options: Optional[BaseOCROptions]) -> Any:
150
+ def _process_single_image(
151
+ self, image: Any, detect_only: bool, options: Optional[BaseOCROptions]
152
+ ) -> Any:
139
153
  """Process a single image with the initialized model."""
140
154
  raise NotImplementedError("Subclasses must implement this method")
141
-
155
+
142
156
  @abstractmethod
143
- def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
157
+ def _standardize_results(
158
+ self, raw_results: Any, min_confidence: float, detect_only: bool
159
+ ) -> List[TextRegion]:
144
160
  """Convert engine-specific results to standardized TextRegion objects."""
145
161
  raise NotImplementedError("Subclasses must implement this method")
146
162
 
@@ -181,23 +197,23 @@ class OCREngine(ABC):
181
197
  return tuple(float(c) for c in bbox[:4])
182
198
  except (ValueError, TypeError) as e:
183
199
  raise ValueError(f"Invalid number format in bbox: {bbox}") from e
184
-
200
+
185
201
  # Check if it's in polygon format [[x1,y1],[x2,y2],...]
186
202
  elif (
187
203
  isinstance(bbox, (list, tuple))
188
204
  and len(bbox) > 0
189
205
  and isinstance(bbox[0], (list, tuple))
190
- and len(bbox[0]) == 2 # Ensure points are pairs
206
+ and len(bbox[0]) == 2 # Ensure points are pairs
191
207
  ):
192
208
  try:
193
209
  x_coords = [float(point[0]) for point in bbox]
194
210
  y_coords = [float(point[1]) for point in bbox]
195
- if not x_coords or not y_coords: # Handle empty polygon case
211
+ if not x_coords or not y_coords: # Handle empty polygon case
196
212
  raise ValueError("Empty polygon provided")
197
213
  return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
198
214
  except (ValueError, TypeError, IndexError) as e:
199
215
  raise ValueError(f"Invalid polygon format or values: {bbox}") from e
200
-
216
+
201
217
  # If it's neither format, raise an error
202
218
  raise ValueError(f"Could not standardize bounding box from unexpected format: {bbox}")
203
219