natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,204 +1,147 @@
1
1
  # ocr_engine_paddleocr.py
2
- import logging
3
2
  import importlib.util
4
- from typing import Dict, List, Any, Optional, Tuple, Union
3
+ import logging
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
5
6
  import numpy as np
6
7
  from PIL import Image
7
- import inspect # Used for dynamic parameter passing
8
8
 
9
- from .engine import OCREngine
10
- from .ocr_options import PaddleOCROptions, BaseOCROptions
9
+ from .engine import OCREngine, TextRegion
10
+ from .ocr_options import BaseOCROptions, PaddleOCROptions
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
14
+
14
15
  class PaddleOCREngine(OCREngine):
15
16
  """PaddleOCR engine implementation."""
16
17
 
17
- LANGUAGE_MAP = {
18
- 'en': 'en', 'zh': 'ch', 'zh-cn': 'ch', 'zh-tw': 'chinese_cht',
19
- 'ja': 'japan', 'ko': 'korean', 'th': 'thai', 'fr': 'french',
20
- 'de': 'german', 'ru': 'russian', 'ar': 'arabic', 'hi': 'hindi',
21
- 'vi': 'vietnam', 'fa': 'cyrillic', 'ur': 'cyrillic', 'rs': 'serbian',
22
- 'oc': 'latin', 'rsc': 'cyrillic', 'bg': 'bulgarian', 'uk': 'cyrillic',
23
- 'be': 'cyrillic', 'te': 'telugu', 'kn': 'kannada', 'ta': 'tamil',
24
- 'latin': 'latin', 'cyrillic': 'cyrillic', 'devanagari': 'devanagari',
25
- }
26
-
27
18
  def __init__(self):
28
19
  super().__init__()
29
- self._paddleocr = None
30
-
31
- def _lazy_import_paddleocr(self):
32
- """Imports paddleocr only when needed."""
33
- if self._paddleocr is None:
34
- if not self.is_available():
35
- raise ImportError("PaddleOCR or PaddlePaddle is not installed or available.")
36
- try:
37
- import paddle
38
- import paddleocr
39
- self._paddleocr = paddleocr
40
- logger.info("PaddleOCR module imported successfully.")
41
- except ImportError as e:
42
- logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
43
- raise
44
- return self._paddleocr
45
20
 
46
21
  def is_available(self) -> bool:
47
22
  """Check if PaddleOCR and paddlepaddle are installed."""
48
- paddle_installed = importlib.util.find_spec("paddle") is not None or \
49
- importlib.util.find_spec("paddlepaddle") is not None
23
+ paddle_installed = (
24
+ importlib.util.find_spec("paddle") is not None
25
+ or importlib.util.find_spec("paddlepaddle") is not None
26
+ )
50
27
  paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
51
28
  return paddle_installed and paddleocr_installed
52
29
 
53
- def _map_language(self, iso_lang: str) -> str:
54
- """Map ISO language code to PaddleOCR language code."""
55
- return self.LANGUAGE_MAP.get(iso_lang.lower(), 'en')
56
-
57
- def _get_cache_key(self, options: PaddleOCROptions) -> str:
58
- """Generate a more specific cache key for PaddleOCR."""
59
- base_key = super()._get_cache_key(options)
60
- primary_lang = self._map_language(options.languages[0]) if options.languages else 'en'
61
- angle_cls_key = str(options.use_angle_cls)
62
- precision_key = options.precision
63
- return f"{base_key}_{primary_lang}_{angle_cls_key}_{precision_key}"
64
-
65
- def _get_reader(self, options: PaddleOCROptions):
66
- """Get or initialize a PaddleOCR reader based on options."""
67
- cache_key = self._get_cache_key(options)
68
- if cache_key in self._reader_cache:
69
- logger.debug(f"Using cached PaddleOCR reader for key: {cache_key}")
70
- return self._reader_cache[cache_key]
71
-
72
- logger.info(f"Creating new PaddleOCR reader for key: {cache_key}")
73
- paddleocr = self._lazy_import_paddleocr()
74
-
75
- constructor_sig = inspect.signature(paddleocr.PaddleOCR.__init__)
76
- constructor_args = {}
77
- constructor_args['lang'] = self._map_language(options.languages[0]) if options.languages else 'en'
78
-
79
- for field_name, param in constructor_sig.parameters.items():
80
- if field_name in ['self', 'lang']: continue
81
- if field_name == 'use_gpu':
82
- constructor_args['use_gpu'] = options.use_gpu
83
- continue
84
- if hasattr(options, field_name):
85
- constructor_args[field_name] = getattr(options, field_name)
86
- elif field_name in options.extra_args:
87
- constructor_args[field_name] = options.extra_args[field_name]
88
-
89
- constructor_args.pop('device', None)
90
- logger.debug(f"PaddleOCR constructor args: {constructor_args}")
91
-
30
+ def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
31
+ """Initialize the PaddleOCR model."""
92
32
  try:
93
- show_log = constructor_args.get('show_log', False)
94
- original_log_level = logging.getLogger('ppocr').level
95
- if not show_log:
96
- logging.getLogger('ppocr').setLevel(logging.ERROR)
97
-
98
- reader = paddleocr.PaddleOCR(**constructor_args)
99
-
100
- if not show_log:
101
- logging.getLogger('ppocr').setLevel(original_log_level)
102
-
103
- self._reader_cache[cache_key] = reader
104
- logger.info("PaddleOCR reader created successfully.")
105
- return reader
33
+ import paddleocr
34
+ self.logger.info("PaddleOCR module imported successfully.")
35
+ except ImportError as e:
36
+ self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
37
+ raise
38
+
39
+ # Cast to PaddleOCROptions if possible
40
+ paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
41
+
42
+ # Determine parameters
43
+ primary_lang = languages[0] if languages else "en"
44
+ use_gpu = "cuda" in str(device).lower()
45
+
46
+ # Create constructor arguments
47
+ constructor_args = {
48
+ "lang": primary_lang,
49
+ "use_gpu": use_gpu,
50
+ "use_angle_cls": paddle_options.use_angle_cls,
51
+ "det": True,
52
+ "rec": True # We'll control recognition at process time
53
+ }
54
+
55
+ # Add optional parameters if available
56
+ for param in ["det_model_dir", "rec_model_dir", "cls_model_dir", "show_log", "use_onnx"]:
57
+ if hasattr(paddle_options, param):
58
+ val = getattr(paddle_options, param)
59
+ if val is not None:
60
+ constructor_args[param] = val
61
+
62
+ self.logger.debug(f"PaddleOCR constructor args: {constructor_args}")
63
+
64
+ # Create the model
65
+ try:
66
+ self._model = paddleocr.PaddleOCR(**constructor_args)
67
+ self.logger.info("PaddleOCR model created successfully")
106
68
  except Exception as e:
107
- logger.error(f"Failed to create PaddleOCR reader: {e}", exc_info=True)
69
+ self.logger.error(f"Failed to create PaddleOCR model: {e}")
108
70
  raise
109
71
 
110
- def _prepare_ocr_args(self, options: PaddleOCROptions) -> Dict[str, Any]:
111
- """Helper to prepare arguments for the ocr method (excluding image)."""
112
- ocr_args = {}
113
- # Determine 'cls' value based on options precedence
114
- ocr_args['cls'] = options.cls if options.cls is not None else options.use_angle_cls
115
- ocr_args['det'] = options.det
116
- ocr_args['rec'] = options.rec
117
- # Add extra args if needed (less common for ocr method itself)
118
- # for field_name in options.extra_args:
119
- # if field_name in ['cls', 'det', 'rec']: # Check against known ocr args
120
- # ocr_args[field_name] = options.extra_args[field_name]
121
- logger.debug(f"PaddleOCR ocr args (excluding image): {ocr_args}")
122
- return ocr_args
123
-
124
- def _standardize_results(self, raw_page_results: Optional[List[Any]], options: PaddleOCROptions) -> List[Dict[str, Any]]:
125
- """Standardizes raw results from a single page/image from PaddleOCR."""
126
- standardized_page = []
127
- if not raw_page_results: # Handle None or empty list
128
- return standardized_page
129
-
130
- min_confidence = options.min_confidence
131
- for detection in raw_page_results:
132
- try:
133
- if not isinstance(detection, (list, tuple)) or len(detection) < 2: continue
134
- bbox_raw = detection[0]
135
- text_confidence = detection[1]
136
- if not isinstance(text_confidence, tuple) or len(text_confidence) < 2: continue
137
-
138
- text = str(text_confidence[0])
139
- confidence = float(text_confidence[1])
140
-
141
- if confidence >= min_confidence:
142
- bbox = self._standardize_bbox(bbox_raw)
143
- if bbox:
144
- standardized_page.append({
145
- 'bbox': bbox, 'text': text, 'confidence': confidence, 'source': 'ocr'
146
- })
147
- else:
148
- logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
149
- except (IndexError, ValueError, TypeError) as e:
150
- logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
151
- continue
152
- return standardized_page
153
-
154
- def _pil_to_bgr(self, image: Image.Image) -> np.ndarray:
155
- """Converts PIL Image to BGR numpy array."""
156
- if image.mode == 'BGR': # Already BGR
157
- return np.array(image)
158
- img_rgb = image.convert('RGB')
72
+ def _preprocess_image(self, image: Image.Image) -> np.ndarray:
73
+ """Convert PIL Image to BGR numpy array for PaddleOCR."""
74
+ if image.mode == "BGR":
75
+ return np.array(image)
76
+ img_rgb = image.convert("RGB")
159
77
  img_array_rgb = np.array(img_rgb)
160
- img_array_bgr = img_array_rgb[:, :, ::-1] # Convert RGB to BGR
78
+ img_array_bgr = img_array_rgb[:, :, ::-1] # Convert RGB to BGR
161
79
  return img_array_bgr
162
80
 
81
+ def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]) -> Any:
82
+ """Process a single image with PaddleOCR."""
83
+ if self._model is None:
84
+ raise RuntimeError("PaddleOCR model not initialized")
85
+
86
+ # Prepare OCR arguments
87
+ ocr_args = {}
88
+ if options and isinstance(options, PaddleOCROptions):
89
+ ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
90
+ ocr_args["det"] = options.det
91
+ ocr_args["rec"] = not detect_only # Control recognition based on detect_only flag
92
+
93
+ # Run OCR
94
+ raw_results = self._model.ocr(image, **ocr_args)
95
+ return raw_results
163
96
 
164
- def process_image(
165
- self,
166
- images: Union[Image.Image, List[Image.Image]],
167
- options: BaseOCROptions
168
- ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
169
- """Processes a single image or a batch of images with PaddleOCR."""
170
-
171
- if not isinstance(options, PaddleOCROptions):
172
- logger.warning("Received BaseOCROptions, expected PaddleOCROptions. Using defaults.")
173
- options = PaddleOCROptions(
174
- languages=options.languages,
175
- min_confidence=options.min_confidence,
176
- device=options.device,
177
- extra_args=options.extra_args
178
- )
179
-
180
- reader = self._get_reader(options)
181
- ocr_args = self._prepare_ocr_args(options)
97
+ def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
98
+ """Convert PaddleOCR results to standardized TextRegion objects."""
99
+ standardized_regions = []
182
100
 
183
- # Helper function to process one image
184
- def process_one(img):
185
- try:
186
- img_array_bgr = self._pil_to_bgr(img)
187
- raw_results = reader.ocr(img_array_bgr, **ocr_args)
188
-
189
- page_results = []
190
- if raw_results and isinstance(raw_results, list) and len(raw_results) > 0:
191
- page_results = raw_results[0]
192
-
193
- return self._standardize_results(page_results, options)
194
- except Exception as e:
195
- logger.error(f"Error processing image with PaddleOCR: {e}")
196
- return []
101
+ if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
102
+ return standardized_regions
103
+
104
+ page_results = raw_results[0] if raw_results[0] is not None else []
105
+
106
+ for detection in page_results:
107
+ # Initialize text and confidence
108
+ text = None
109
+ confidence = None
110
+ bbox_raw = None
111
+
112
+ # Paddle always seems to return the tuple structure [bbox, (text, conf)]
113
+ # even if rec=False. We need to parse this structure regardless.
114
+ if len(detection) == 4: # Handle potential alternative format?
115
+ detection = [detection, ('', 1.0)] # Treat as bbox + dummy text/conf
116
+
117
+ if not isinstance(detection, (list, tuple)) or len(detection) < 2:
118
+ raise ValueError(f"Invalid detection format from PaddleOCR: {detection}")
119
+
120
+ bbox_raw = detection[0]
121
+ text_confidence = detection[1]
122
+
123
+ if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
124
+ # Even if detect_only, we expect the (text, conf) structure,
125
+ # it might just contain dummy values.
126
+ raise ValueError(f"Invalid text/confidence structure from PaddleOCR: {text_confidence}")
127
+
128
+ # Extract text/conf only if not detect_only
129
+ if not detect_only:
130
+ text = str(text_confidence[0])
131
+ confidence = float(text_confidence[1])
197
132
 
198
- # Handle single image or list of images
199
- if isinstance(images, Image.Image):
200
- return process_one(images)
201
- elif isinstance(images, list):
202
- return [process_one(img) for img in images]
203
- else:
204
- raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
133
+ # Standardize the bbox (always needed)
134
+ try:
135
+ bbox = self._standardize_bbox(bbox_raw)
136
+ except ValueError as e:
137
+ raise ValueError(f"Could not standardize bounding box from PaddleOCR: {bbox_raw}") from e
138
+
139
+ # Append based on mode
140
+ if detect_only:
141
+ # Append regardless of dummy confidence value, set text/conf to None
142
+ standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
143
+ elif confidence >= min_confidence:
144
+ # Only append if confidence meets threshold in full OCR mode
145
+ standardized_regions.append(TextRegion(bbox, text, confidence))
146
+
147
+ return standardized_regions
@@ -1,14 +1,14 @@
1
1
  # ocr_engine_surya.py
2
- import logging
3
2
  import importlib.util
4
- from typing import Dict, List, Any, Optional, Tuple, Union
3
+ import logging
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
5
6
  import numpy as np
6
7
  from PIL import Image
7
8
 
8
- from .engine import OCREngine
9
- from .ocr_options import SuryaOCROptions, BaseOCROptions
9
+ from .engine import OCREngine, TextRegion
10
+ from .ocr_options import BaseOCROptions, SuryaOCROptions
10
11
 
11
- logger = logging.getLogger(__name__)
12
12
 
13
13
  class SuryaOCREngine(OCREngine):
14
14
  """Surya OCR engine implementation."""
@@ -19,153 +19,90 @@ class SuryaOCREngine(OCREngine):
19
19
  self._detection_predictor = None
20
20
  self._surya_recognition = None
21
21
  self._surya_detection = None
22
- self._initialized = False
23
-
24
- def _lazy_load_predictors(self, options: SuryaOCROptions):
25
- """Initializes Surya predictors when first needed."""
26
- if self._initialized:
27
- return
28
22
 
23
+ def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
24
+ """Initialize Surya predictors."""
29
25
  if not self.is_available():
30
26
  raise ImportError("Surya OCR library is not installed or available.")
31
27
 
32
- try:
33
- from surya.recognition import RecognitionPredictor
34
- from surya.detection import DetectionPredictor
35
- self._surya_recognition = RecognitionPredictor
36
- self._surya_detection = DetectionPredictor
37
- logger.info("Surya modules imported successfully.")
38
-
39
- # --- Instantiate Predictors ---
40
- # Add arguments from options if Surya supports them
41
- # Example: device = options.device or 'cuda' if torch.cuda.is_available() else 'cpu'
42
- # predictor_args = {'device': options.device} # If applicable
43
- predictor_args = {} # Assuming parameterless init based on example
44
-
45
- logger.info("Instantiating Surya DetectionPredictor...")
46
- self._detection_predictor = self._surya_detection(**predictor_args)
47
- logger.info("Instantiating Surya RecognitionPredictor...")
48
- self._recognition_predictor = self._surya_recognition(**predictor_args)
49
-
50
- self._initialized = True
51
- logger.info("Surya predictors initialized.")
52
-
53
- except ImportError as e:
54
- logger.error(f"Failed to import Surya modules: {e}")
55
- raise
56
- except Exception as e:
57
- logger.error(f"Failed to initialize Surya predictors: {e}", exc_info=True)
58
- raise
28
+ # Store languages for use in _process_single_image
29
+ self._langs = languages
30
+
31
+ from surya.detection import DetectionPredictor
32
+ from surya.recognition import RecognitionPredictor
59
33
 
60
- def is_available(self) -> bool:
61
- """Check if the surya library is installed."""
62
- return importlib.util.find_spec("surya") is not None
34
+ self._surya_recognition = RecognitionPredictor
35
+ self._surya_detection = DetectionPredictor
36
+ self.logger.info("Surya modules imported successfully.")
63
37
 
64
- def _standardize_results(self, raw_ocr_result: Any, options: SuryaOCROptions) -> List[Dict[str, Any]]:
65
- """Standardizes raw results from a single image from Surya."""
66
- standardized_page = []
67
- min_confidence = options.min_confidence
38
+ predictor_args = {} # Configure if needed
68
39
 
69
- # Check if the result has the expected structure (OCRResult with text_lines)
70
- if not hasattr(raw_ocr_result, 'text_lines') or not isinstance(raw_ocr_result.text_lines, list):
71
- logger.warning(f"Unexpected Surya result format: {type(raw_ocr_result)}. Skipping.")
72
- return standardized_page
40
+ self.logger.info("Instantiating Surya DetectionPredictor...")
41
+ self._detection_predictor = self._surya_detection(**predictor_args)
42
+ self.logger.info("Instantiating Surya RecognitionPredictor...")
43
+ self._recognition_predictor = self._surya_recognition(**predictor_args)
44
+
45
+ self.logger.info("Surya predictors initialized.")
73
46
 
74
- for line in raw_ocr_result.text_lines:
75
- try:
76
- # Extract data from Surya's TextLine object
77
- text = line.text
78
- confidence = line.confidence
79
- # Surya provides both polygon and bbox, bbox is already (x0, y0, x1, y1)
80
- bbox_raw = line.bbox # Use bbox directly if available and correct format
47
+ def _preprocess_image(self, image: Image.Image) -> Image.Image:
48
+ """Surya uses PIL images directly, so just return the image."""
49
+ return image
81
50
 
82
- if confidence >= min_confidence:
83
- bbox = self._standardize_bbox(bbox_raw) # Validate/convert format
84
- if bbox:
85
- standardized_page.append({
86
- 'bbox': bbox,
87
- 'text': text,
88
- 'confidence': confidence,
89
- 'source': 'ocr'
90
- })
91
- else:
92
- # Try polygon if bbox failed standardization
93
- bbox_poly = self._standardize_bbox(line.polygon)
94
- if bbox_poly:
95
- standardized_page.append({
96
- 'bbox': bbox_poly, 'text': text, 'confidence': confidence, 'source': 'ocr'
97
- })
98
- else:
99
- logger.warning(f"Skipping Surya line due to invalid bbox/polygon: {line}")
100
-
101
- except (AttributeError, ValueError, TypeError) as e:
102
- logger.warning(f"Skipping invalid Surya TextLine format: {line}. Error: {e}")
103
- continue
104
- return standardized_page
105
-
106
- def process_image(
107
- self,
108
- images: Union[Image.Image, List[Image.Image]],
109
- options: BaseOCROptions
110
- ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
111
- """Processes a single image or a batch of images with Surya OCR."""
112
-
113
- if not isinstance(options, SuryaOCROptions):
114
- logger.warning("Received BaseOCROptions, expected SuryaOCROptions. Using defaults.")
115
- options = SuryaOCROptions(
116
- languages=options.languages,
117
- min_confidence=options.min_confidence,
118
- device=options.device,
119
- extra_args=options.extra_args
120
- )
121
-
122
- # Ensure predictors are loaded/initialized
123
- self._lazy_load_predictors(options)
51
+ def _process_single_image(self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]) -> Any:
52
+ """Process a single image with Surya OCR."""
124
53
  if not self._recognition_predictor or not self._detection_predictor:
125
- raise RuntimeError("Surya predictors could not be initialized.")
126
-
127
- # --- Prepare inputs for Surya ---
128
- is_batch = isinstance(images, list)
129
- input_images: List[Image.Image] = images if is_batch else [images]
130
- # Surya expects a list of language lists, one per image
131
- input_langs: List[List[str]] = [options.languages for _ in input_images]
132
-
133
- if not input_images:
134
- logger.warning("No images provided for Surya processing.")
135
- return [] if not is_batch else [[]]
136
-
137
- # --- Run Surya Prediction ---
138
- try:
139
- processing_mode = "batch" if is_batch else "single image"
140
- logger.info(f"Processing {processing_mode} ({len(input_images)} images) with Surya...")
141
- # Call Surya's predictor
142
- # It returns a list of OCRResult objects, one per input image
143
- predictions = self._recognition_predictor(
144
- images=input_images,
145
- langs=input_langs,
54
+ raise RuntimeError("Surya predictors are not initialized.")
55
+
56
+ # Store languages instance variable during initialization to use here
57
+ langs = [[lang] for lang in self._langs] if hasattr(self, '_langs') else [[self.DEFAULT_LANGUAGES[0]]]
58
+
59
+ # Surya expects lists of images, so we need to wrap our single image
60
+ if detect_only:
61
+ results = self._detection_predictor(images=[image])
62
+ else:
63
+ results = self._recognition_predictor(
64
+ images=[image],
65
+ langs=langs, # Use the languages set during initialization
146
66
  det_predictor=self._detection_predictor
147
67
  )
148
- logger.info(f"Surya prediction complete. Received {len(predictions)} results.")
149
-
150
- # --- Standardize Results ---
151
- if len(predictions) != len(input_images):
152
- logger.error(f"Surya result count ({len(predictions)}) does not match input count ({len(input_images)}). Returning empty results.")
153
- # Decide on error handling: raise error or return empty structure
154
- return [[] for _ in input_images] if is_batch else []
155
-
156
- all_standardized_results = [self._standardize_results(res, options) for res in predictions]
157
-
158
- if is_batch:
159
- return all_standardized_results # Return List[List[Dict]]
68
+
69
+ # Surya may return a list with one result per image or a single result object
70
+ # Return the result as-is and handle the extraction in _standardize_results
71
+ return results
72
+
73
+ def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
74
+ """Convert Surya results to standardized TextRegion objects."""
75
+ standardized_regions = []
76
+
77
+ raw_result = raw_results
78
+ if isinstance(raw_results, list) and len(raw_results) > 0:
79
+ raw_result = raw_results[0]
80
+
81
+ results = raw_result.text_lines if hasattr(raw_result, "text_lines") and not detect_only else raw_result.bboxes
82
+
83
+ for line in results:
84
+ # Always extract bbox first
85
+ try:
86
+ # Prioritize line.bbox, fallback to line.polygon
87
+ bbox_raw = line.bbox if hasattr(line, 'bbox') else getattr(line, 'polygon', None)
88
+ if bbox_raw is None:
89
+ raise ValueError("Missing bbox/polygon data")
90
+ bbox = self._standardize_bbox(bbox_raw)
91
+ except ValueError as e:
92
+ raise ValueError(f"Could not standardize bounding box from Surya result: {bbox_raw}") from e
93
+
94
+ if detect_only:
95
+ # For detect_only, text and confidence are None
96
+ standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
160
97
  else:
161
- return all_standardized_results[0] # Return List[Dict] for single image
162
-
163
- except Exception as e:
164
- logger.error(f"Error during Surya OCR processing: {e}", exc_info=True)
165
- # Return empty structure matching input type on failure
166
- return [[] for _ in input_images] if is_batch else []
167
-
168
- # Note: Caching is handled differently for Surya as predictors are stateful
169
- # and initialized once. The base class _reader_cache is not used here.
170
- # If predictors could be configured per-run, caching would need rethinking.
98
+ # For full OCR, extract text and confidence, then filter
99
+ text = line.text if hasattr(line, "text") else ""
100
+ confidence = line.confidence
101
+ if confidence >= min_confidence:
102
+ standardized_regions.append(TextRegion(bbox, text, confidence))
103
+
104
+ return standardized_regions
171
105
 
106
+ def is_available(self) -> bool:
107
+ """Check if the surya library is installed."""
108
+ return importlib.util.find_spec("surya") is not None