natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff shows the differences between two publicly available versions of a package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (132)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,22 +1,24 @@
1
1
  # ocr_engine_easyocr.py
2
- import logging
3
2
  import importlib.util
4
- from typing import Dict, List, Any, Optional, Tuple, Union
3
+ import inspect # Used for dynamic parameter passing
4
+ import logging
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
+
5
7
  import numpy as np
6
8
  from PIL import Image
7
- import inspect # Used for dynamic parameter passing
8
9
 
9
10
  from .engine import OCREngine
10
- from .ocr_options import EasyOCROptions, BaseOCROptions
11
+ from .ocr_options import BaseOCROptions, EasyOCROptions
11
12
 
12
13
  logger = logging.getLogger(__name__)
13
14
 
15
+
14
16
  class EasyOCREngine(OCREngine):
15
17
  """EasyOCR engine implementation."""
16
18
 
17
19
  def __init__(self):
18
20
  super().__init__()
19
- self._easyocr = None # Lazy load easyocr module
21
+ self._easyocr = None # Lazy load easyocr module
20
22
 
21
23
  def _lazy_import_easyocr(self):
22
24
  """Imports easyocr only when needed."""
@@ -25,6 +27,7 @@ class EasyOCREngine(OCREngine):
25
27
  raise ImportError("EasyOCR is not installed or available.")
26
28
  try:
27
29
  import easyocr
30
+
28
31
  self._easyocr = easyocr
29
32
  logger.info("EasyOCR module imported successfully.")
30
33
  except ImportError as e:
@@ -56,15 +59,18 @@ class EasyOCREngine(OCREngine):
56
59
 
57
60
  constructor_sig = inspect.signature(easyocr.Reader.__init__)
58
61
  constructor_args = {}
59
- constructor_args['lang_list'] = options.languages
60
- constructor_args['gpu'] = 'cuda' in str(options.device).lower() or 'mps' in str(options.device).lower()
62
+ constructor_args["lang_list"] = options.languages
63
+ constructor_args["gpu"] = (
64
+ "cuda" in str(options.device).lower() or "mps" in str(options.device).lower()
65
+ )
61
66
 
62
67
  for field_name, param in constructor_sig.parameters.items():
63
- if field_name in ['self', 'lang_list', 'gpu']: continue
68
+ if field_name in ["self", "lang_list", "gpu"]:
69
+ continue
64
70
  if hasattr(options, field_name):
65
- constructor_args[field_name] = getattr(options, field_name)
71
+ constructor_args[field_name] = getattr(options, field_name)
66
72
  elif field_name in options.extra_args:
67
- constructor_args[field_name] = options.extra_args[field_name]
73
+ constructor_args[field_name] = options.extra_args[field_name]
68
74
 
69
75
  logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
70
76
  try:
@@ -81,22 +87,29 @@ class EasyOCREngine(OCREngine):
81
87
  readtext_sig = inspect.signature(reader.readtext)
82
88
  readtext_args = {}
83
89
  for field_name, param in readtext_sig.parameters.items():
84
- if field_name == 'image': continue
85
- if hasattr(options, field_name):
86
- readtext_args[field_name] = getattr(options, field_name)
87
- elif field_name in options.extra_args:
88
- readtext_args[field_name] = options.extra_args[field_name]
90
+ if field_name == "image":
91
+ continue
92
+ if hasattr(options, field_name):
93
+ readtext_args[field_name] = getattr(options, field_name)
94
+ elif field_name in options.extra_args:
95
+ readtext_args[field_name] = options.extra_args[field_name]
89
96
  logger.debug(f"EasyOCR readtext args: {readtext_args}")
90
97
  return readtext_args
91
98
 
92
- def _standardize_results(self, raw_results: List[Any], options: EasyOCROptions) -> List[Dict[str, Any]]:
99
+ def _standardize_results(
100
+ self, raw_results: List[Any], options: EasyOCROptions
101
+ ) -> List[Dict[str, Any]]:
93
102
  """Standardizes raw results from EasyOCR's readtext."""
94
103
  standardized_results = []
95
104
  min_confidence = options.min_confidence
96
105
 
97
106
  for detection in raw_results:
98
107
  try:
99
- if options.detail == 1 and isinstance(detection, (list, tuple)) and len(detection) >= 3:
108
+ if (
109
+ options.detail == 1
110
+ and isinstance(detection, (list, tuple))
111
+ and len(detection) >= 3
112
+ ):
100
113
  bbox_raw = detection[0]
101
114
  text = str(detection[1])
102
115
  confidence = float(detection[2])
@@ -104,38 +117,40 @@ class EasyOCREngine(OCREngine):
104
117
  if confidence >= min_confidence:
105
118
  bbox = self._standardize_bbox(bbox_raw)
106
119
  if bbox:
107
- standardized_results.append({
108
- 'bbox': bbox, 'text': text, 'confidence': confidence, 'source': 'ocr'
109
- })
120
+ standardized_results.append(
121
+ {
122
+ "bbox": bbox,
123
+ "text": text,
124
+ "confidence": confidence,
125
+ "source": "ocr",
126
+ }
127
+ )
110
128
  else:
111
- logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
129
+ logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
112
130
 
113
131
  elif options.detail == 0 and isinstance(detection, str):
114
- standardized_results.append({
115
- 'bbox': None, 'text': detection, 'confidence': 1.0, 'source': 'ocr'
116
- })
132
+ standardized_results.append(
133
+ {"bbox": None, "text": detection, "confidence": 1.0, "source": "ocr"}
134
+ )
117
135
  except (IndexError, ValueError, TypeError) as e:
118
- logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
119
- continue
136
+ logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
137
+ continue
120
138
  return standardized_results
121
139
 
122
-
123
140
  def process_image(
124
- self,
125
- images: Union[Image.Image, List[Image.Image]],
126
- options: BaseOCROptions
141
+ self, images: Union[Image.Image, List[Image.Image]], options: BaseOCROptions
127
142
  ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
128
143
  """Processes a single image or a batch of images with EasyOCR."""
129
144
 
130
145
  if not isinstance(options, EasyOCROptions):
131
- logger.warning("Received BaseOCROptions, expected EasyOCROptions. Using defaults.")
132
- # Create default EasyOCR options if base was passed, preserving base settings
133
- options = EasyOCROptions(
134
- languages=options.languages,
135
- min_confidence=options.min_confidence,
136
- device=options.device,
137
- extra_args=options.extra_args # Pass along any extra args
138
- )
146
+ logger.warning("Received BaseOCROptions, expected EasyOCROptions. Using defaults.")
147
+ # Create default EasyOCR options if base was passed, preserving base settings
148
+ options = EasyOCROptions(
149
+ languages=options.languages,
150
+ min_confidence=options.min_confidence,
151
+ device=options.device,
152
+ extra_args=options.extra_args, # Pass along any extra args
153
+ )
139
154
 
140
155
  reader = self._get_reader(options)
141
156
  readtext_args = self._prepare_readtext_args(options, reader)
@@ -147,9 +162,9 @@ class EasyOCREngine(OCREngine):
147
162
  logger.info(f"Processing batch of {len(images)} images with EasyOCR (iteratively)...")
148
163
  for i, img in enumerate(images):
149
164
  if not isinstance(img, Image.Image):
150
- logger.warning(f"Item at index {i} in batch is not a PIL Image. Skipping.")
151
- all_results.append([])
152
- continue
165
+ logger.warning(f"Item at index {i} in batch is not a PIL Image. Skipping.")
166
+ all_results.append([])
167
+ continue
153
168
  img_array = np.array(img)
154
169
  try:
155
170
  logger.debug(f"Processing image {i+1}/{len(images)} in batch.")
@@ -157,10 +172,12 @@ class EasyOCREngine(OCREngine):
157
172
  standardized = self._standardize_results(raw_results, options)
158
173
  all_results.append(standardized)
159
174
  except Exception as e:
160
- logger.error(f"Error processing image {i+1} in EasyOCR batch: {e}", exc_info=True)
161
- all_results.append([]) # Append empty list for failed image
175
+ logger.error(
176
+ f"Error processing image {i+1} in EasyOCR batch: {e}", exc_info=True
177
+ )
178
+ all_results.append([]) # Append empty list for failed image
162
179
  logger.info(f"Finished processing batch with EasyOCR.")
163
- return all_results # Return List[List[Dict]]
180
+ return all_results # Return List[List[Dict]]
164
181
 
165
182
  elif isinstance(images, Image.Image):
166
183
  # --- Single Image Processing ---
@@ -170,10 +187,9 @@ class EasyOCREngine(OCREngine):
170
187
  raw_results = reader.readtext(img_array, **readtext_args)
171
188
  standardized = self._standardize_results(raw_results, options)
172
189
  logger.info(f"Finished processing single image. Found {len(standardized)} results.")
173
- return standardized # Return List[Dict]
190
+ return standardized # Return List[Dict]
174
191
  except Exception as e:
175
192
  logger.error(f"Error processing single image with EasyOCR: {e}", exc_info=True)
176
- return [] # Return empty list on failure
193
+ return [] # Return empty list on failure
177
194
  else:
178
195
  raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
179
-
@@ -1,27 +1,49 @@
1
1
  # ocr_engine_paddleocr.py
2
- import logging
3
2
  import importlib.util
4
- from typing import Dict, List, Any, Optional, Tuple, Union
3
+ import inspect # Used for dynamic parameter passing
4
+ import logging
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
+
5
7
  import numpy as np
6
8
  from PIL import Image
7
- import inspect # Used for dynamic parameter passing
8
9
 
9
10
  from .engine import OCREngine
10
- from .ocr_options import PaddleOCROptions, BaseOCROptions
11
+ from .ocr_options import BaseOCROptions, PaddleOCROptions
11
12
 
12
13
  logger = logging.getLogger(__name__)
13
14
 
15
+
14
16
  class PaddleOCREngine(OCREngine):
15
17
  """PaddleOCR engine implementation."""
16
18
 
17
- LANGUAGE_MAP = {
18
- 'en': 'en', 'zh': 'ch', 'zh-cn': 'ch', 'zh-tw': 'chinese_cht',
19
- 'ja': 'japan', 'ko': 'korean', 'th': 'thai', 'fr': 'french',
20
- 'de': 'german', 'ru': 'russian', 'ar': 'arabic', 'hi': 'hindi',
21
- 'vi': 'vietnam', 'fa': 'cyrillic', 'ur': 'cyrillic', 'rs': 'serbian',
22
- 'oc': 'latin', 'rsc': 'cyrillic', 'bg': 'bulgarian', 'uk': 'cyrillic',
23
- 'be': 'cyrillic', 'te': 'telugu', 'kn': 'kannada', 'ta': 'tamil',
24
- 'latin': 'latin', 'cyrillic': 'cyrillic', 'devanagari': 'devanagari',
19
+ LANGUAGE_MAP = {
20
+ "en": "en",
21
+ "zh": "ch",
22
+ "zh-cn": "ch",
23
+ "zh-tw": "chinese_cht",
24
+ "ja": "japan",
25
+ "ko": "korean",
26
+ "th": "thai",
27
+ "fr": "french",
28
+ "de": "german",
29
+ "ru": "russian",
30
+ "ar": "arabic",
31
+ "hi": "hindi",
32
+ "vi": "vietnam",
33
+ "fa": "cyrillic",
34
+ "ur": "cyrillic",
35
+ "rs": "serbian",
36
+ "oc": "latin",
37
+ "rsc": "cyrillic",
38
+ "bg": "bulgarian",
39
+ "uk": "cyrillic",
40
+ "be": "cyrillic",
41
+ "te": "telugu",
42
+ "kn": "kannada",
43
+ "ta": "tamil",
44
+ "latin": "latin",
45
+ "cyrillic": "cyrillic",
46
+ "devanagari": "devanagari",
25
47
  }
26
48
 
27
49
  def __init__(self):
@@ -36,6 +58,7 @@ class PaddleOCREngine(OCREngine):
36
58
  try:
37
59
  import paddle
38
60
  import paddleocr
61
+
39
62
  self._paddleocr = paddleocr
40
63
  logger.info("PaddleOCR module imported successfully.")
41
64
  except ImportError as e:
@@ -45,19 +68,21 @@ class PaddleOCREngine(OCREngine):
45
68
 
46
69
  def is_available(self) -> bool:
47
70
  """Check if PaddleOCR and paddlepaddle are installed."""
48
- paddle_installed = importlib.util.find_spec("paddle") is not None or \
49
- importlib.util.find_spec("paddlepaddle") is not None
71
+ paddle_installed = (
72
+ importlib.util.find_spec("paddle") is not None
73
+ or importlib.util.find_spec("paddlepaddle") is not None
74
+ )
50
75
  paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
51
76
  return paddle_installed and paddleocr_installed
52
77
 
53
78
  def _map_language(self, iso_lang: str) -> str:
54
79
  """Map ISO language code to PaddleOCR language code."""
55
- return self.LANGUAGE_MAP.get(iso_lang.lower(), 'en')
80
+ return self.LANGUAGE_MAP.get(iso_lang.lower(), "en")
56
81
 
57
82
  def _get_cache_key(self, options: PaddleOCROptions) -> str:
58
83
  """Generate a more specific cache key for PaddleOCR."""
59
84
  base_key = super()._get_cache_key(options)
60
- primary_lang = self._map_language(options.languages[0]) if options.languages else 'en'
85
+ primary_lang = self._map_language(options.languages[0]) if options.languages else "en"
61
86
  angle_cls_key = str(options.use_angle_cls)
62
87
  precision_key = options.precision
63
88
  return f"{base_key}_{primary_lang}_{angle_cls_key}_{precision_key}"
@@ -74,31 +99,34 @@ class PaddleOCREngine(OCREngine):
74
99
 
75
100
  constructor_sig = inspect.signature(paddleocr.PaddleOCR.__init__)
76
101
  constructor_args = {}
77
- constructor_args['lang'] = self._map_language(options.languages[0]) if options.languages else 'en'
102
+ constructor_args["lang"] = (
103
+ self._map_language(options.languages[0]) if options.languages else "en"
104
+ )
78
105
 
79
106
  for field_name, param in constructor_sig.parameters.items():
80
- if field_name in ['self', 'lang']: continue
81
- if field_name == 'use_gpu':
82
- constructor_args['use_gpu'] = options.use_gpu
83
- continue
107
+ if field_name in ["self", "lang"]:
108
+ continue
109
+ if field_name == "use_gpu":
110
+ constructor_args["use_gpu"] = options.use_gpu
111
+ continue
84
112
  if hasattr(options, field_name):
85
- constructor_args[field_name] = getattr(options, field_name)
113
+ constructor_args[field_name] = getattr(options, field_name)
86
114
  elif field_name in options.extra_args:
87
- constructor_args[field_name] = options.extra_args[field_name]
115
+ constructor_args[field_name] = options.extra_args[field_name]
88
116
 
89
- constructor_args.pop('device', None)
117
+ constructor_args.pop("device", None)
90
118
  logger.debug(f"PaddleOCR constructor args: {constructor_args}")
91
119
 
92
120
  try:
93
- show_log = constructor_args.get('show_log', False)
94
- original_log_level = logging.getLogger('ppocr').level
121
+ show_log = constructor_args.get("show_log", False)
122
+ original_log_level = logging.getLogger("ppocr").level
95
123
  if not show_log:
96
- logging.getLogger('ppocr').setLevel(logging.ERROR)
124
+ logging.getLogger("ppocr").setLevel(logging.ERROR)
97
125
 
98
126
  reader = paddleocr.PaddleOCR(**constructor_args)
99
127
 
100
128
  if not show_log:
101
- logging.getLogger('ppocr').setLevel(original_log_level)
129
+ logging.getLogger("ppocr").setLevel(original_log_level)
102
130
 
103
131
  self._reader_cache[cache_key] = reader
104
132
  logger.info("PaddleOCR reader created successfully.")
@@ -108,32 +136,36 @@ class PaddleOCREngine(OCREngine):
108
136
  raise
109
137
 
110
138
  def _prepare_ocr_args(self, options: PaddleOCROptions) -> Dict[str, Any]:
111
- """Helper to prepare arguments for the ocr method (excluding image)."""
112
- ocr_args = {}
113
- # Determine 'cls' value based on options precedence
114
- ocr_args['cls'] = options.cls if options.cls is not None else options.use_angle_cls
115
- ocr_args['det'] = options.det
116
- ocr_args['rec'] = options.rec
117
- # Add extra args if needed (less common for ocr method itself)
118
- # for field_name in options.extra_args:
119
- # if field_name in ['cls', 'det', 'rec']: # Check against known ocr args
120
- # ocr_args[field_name] = options.extra_args[field_name]
121
- logger.debug(f"PaddleOCR ocr args (excluding image): {ocr_args}")
122
- return ocr_args
123
-
124
- def _standardize_results(self, raw_page_results: Optional[List[Any]], options: PaddleOCROptions) -> List[Dict[str, Any]]:
139
+ """Helper to prepare arguments for the ocr method (excluding image)."""
140
+ ocr_args = {}
141
+ # Determine 'cls' value based on options precedence
142
+ ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
143
+ ocr_args["det"] = options.det
144
+ ocr_args["rec"] = options.rec
145
+ # Add extra args if needed (less common for ocr method itself)
146
+ # for field_name in options.extra_args:
147
+ # if field_name in ['cls', 'det', 'rec']: # Check against known ocr args
148
+ # ocr_args[field_name] = options.extra_args[field_name]
149
+ logger.debug(f"PaddleOCR ocr args (excluding image): {ocr_args}")
150
+ return ocr_args
151
+
152
+ def _standardize_results(
153
+ self, raw_page_results: Optional[List[Any]], options: PaddleOCROptions
154
+ ) -> List[Dict[str, Any]]:
125
155
  """Standardizes raw results from a single page/image from PaddleOCR."""
126
156
  standardized_page = []
127
- if not raw_page_results: # Handle None or empty list
157
+ if not raw_page_results: # Handle None or empty list
128
158
  return standardized_page
129
159
 
130
160
  min_confidence = options.min_confidence
131
161
  for detection in raw_page_results:
132
162
  try:
133
- if not isinstance(detection, (list, tuple)) or len(detection) < 2: continue
163
+ if not isinstance(detection, (list, tuple)) or len(detection) < 2:
164
+ continue
134
165
  bbox_raw = detection[0]
135
166
  text_confidence = detection[1]
136
- if not isinstance(text_confidence, tuple) or len(text_confidence) < 2: continue
167
+ if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
168
+ continue
137
169
 
138
170
  text = str(text_confidence[0])
139
171
  confidence = float(text_confidence[1])
@@ -141,55 +173,52 @@ class PaddleOCREngine(OCREngine):
141
173
  if confidence >= min_confidence:
142
174
  bbox = self._standardize_bbox(bbox_raw)
143
175
  if bbox:
144
- standardized_page.append({
145
- 'bbox': bbox, 'text': text, 'confidence': confidence, 'source': 'ocr'
146
- })
176
+ standardized_page.append(
177
+ {"bbox": bbox, "text": text, "confidence": confidence, "source": "ocr"}
178
+ )
147
179
  else:
148
180
  logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
149
181
  except (IndexError, ValueError, TypeError) as e:
150
- logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
151
- continue
182
+ logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
183
+ continue
152
184
  return standardized_page
153
185
 
154
186
  def _pil_to_bgr(self, image: Image.Image) -> np.ndarray:
155
187
  """Converts PIL Image to BGR numpy array."""
156
- if image.mode == 'BGR': # Already BGR
157
- return np.array(image)
158
- img_rgb = image.convert('RGB')
188
+ if image.mode == "BGR": # Already BGR
189
+ return np.array(image)
190
+ img_rgb = image.convert("RGB")
159
191
  img_array_rgb = np.array(img_rgb)
160
- img_array_bgr = img_array_rgb[:, :, ::-1] # Convert RGB to BGR
192
+ img_array_bgr = img_array_rgb[:, :, ::-1] # Convert RGB to BGR
161
193
  return img_array_bgr
162
194
 
163
-
164
195
  def process_image(
165
- self,
166
- images: Union[Image.Image, List[Image.Image]],
167
- options: BaseOCROptions
196
+ self, images: Union[Image.Image, List[Image.Image]], options: BaseOCROptions
168
197
  ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
169
198
  """Processes a single image or a batch of images with PaddleOCR."""
170
199
 
171
200
  if not isinstance(options, PaddleOCROptions):
172
- logger.warning("Received BaseOCROptions, expected PaddleOCROptions. Using defaults.")
173
- options = PaddleOCROptions(
174
- languages=options.languages,
175
- min_confidence=options.min_confidence,
176
- device=options.device,
177
- extra_args=options.extra_args
178
- )
201
+ logger.warning("Received BaseOCROptions, expected PaddleOCROptions. Using defaults.")
202
+ options = PaddleOCROptions(
203
+ languages=options.languages,
204
+ min_confidence=options.min_confidence,
205
+ device=options.device,
206
+ extra_args=options.extra_args,
207
+ )
179
208
 
180
209
  reader = self._get_reader(options)
181
210
  ocr_args = self._prepare_ocr_args(options)
182
-
211
+
183
212
  # Helper function to process one image
184
213
  def process_one(img):
185
214
  try:
186
215
  img_array_bgr = self._pil_to_bgr(img)
187
216
  raw_results = reader.ocr(img_array_bgr, **ocr_args)
188
-
217
+
189
218
  page_results = []
190
219
  if raw_results and isinstance(raw_results, list) and len(raw_results) > 0:
191
220
  page_results = raw_results[0]
192
-
221
+
193
222
  return self._standardize_results(page_results, options)
194
223
  except Exception as e:
195
224
  logger.error(f"Error processing image with PaddleOCR: {e}")