natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,15 +1,17 @@
1
1
  # ocr_engine_surya.py
2
- import logging
3
2
  import importlib.util
4
- from typing import Dict, List, Any, Optional, Tuple, Union
3
+ import logging
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
5
6
  import numpy as np
6
7
  from PIL import Image
7
8
 
8
9
  from .engine import OCREngine
9
- from .ocr_options import SuryaOCROptions, BaseOCROptions
10
+ from .ocr_options import BaseOCROptions, SuryaOCROptions
10
11
 
11
12
  logger = logging.getLogger(__name__)
12
13
 
14
+
13
15
  class SuryaOCREngine(OCREngine):
14
16
  """Surya OCR engine implementation."""
15
17
 
@@ -30,8 +32,9 @@ class SuryaOCREngine(OCREngine):
30
32
  raise ImportError("Surya OCR library is not installed or available.")
31
33
 
32
34
  try:
33
- from surya.recognition import RecognitionPredictor
34
35
  from surya.detection import DetectionPredictor
36
+ from surya.recognition import RecognitionPredictor
37
+
35
38
  self._surya_recognition = RecognitionPredictor
36
39
  self._surya_detection = DetectionPredictor
37
40
  logger.info("Surya modules imported successfully.")
@@ -40,7 +43,7 @@ class SuryaOCREngine(OCREngine):
40
43
  # Add arguments from options if Surya supports them
41
44
  # Example: device = options.device or 'cuda' if torch.cuda.is_available() else 'cpu'
42
45
  # predictor_args = {'device': options.device} # If applicable
43
- predictor_args = {} # Assuming parameterless init based on example
46
+ predictor_args = {} # Assuming parameterless init based on example
44
47
 
45
48
  logger.info("Instantiating Surya DetectionPredictor...")
46
49
  self._detection_predictor = self._surya_detection(**predictor_args)
@@ -61,13 +64,17 @@ class SuryaOCREngine(OCREngine):
61
64
  """Check if the surya library is installed."""
62
65
  return importlib.util.find_spec("surya") is not None
63
66
 
64
- def _standardize_results(self, raw_ocr_result: Any, options: SuryaOCROptions) -> List[Dict[str, Any]]:
67
+ def _standardize_results(
68
+ self, raw_ocr_result: Any, options: SuryaOCROptions
69
+ ) -> List[Dict[str, Any]]:
65
70
  """Standardizes raw results from a single image from Surya."""
66
71
  standardized_page = []
67
72
  min_confidence = options.min_confidence
68
73
 
69
74
  # Check if the result has the expected structure (OCRResult with text_lines)
70
- if not hasattr(raw_ocr_result, 'text_lines') or not isinstance(raw_ocr_result.text_lines, list):
75
+ if not hasattr(raw_ocr_result, "text_lines") or not isinstance(
76
+ raw_ocr_result.text_lines, list
77
+ ):
71
78
  logger.warning(f"Unexpected Surya result format: {type(raw_ocr_result)}. Skipping.")
72
79
  return standardized_page
73
80
 
@@ -77,52 +84,54 @@ class SuryaOCREngine(OCREngine):
77
84
  text = line.text
78
85
  confidence = line.confidence
79
86
  # Surya provides both polygon and bbox, bbox is already (x0, y0, x1, y1)
80
- bbox_raw = line.bbox # Use bbox directly if available and correct format
87
+ bbox_raw = line.bbox # Use bbox directly if available and correct format
81
88
 
82
89
  if confidence >= min_confidence:
83
- bbox = self._standardize_bbox(bbox_raw) # Validate/convert format
90
+ bbox = self._standardize_bbox(bbox_raw) # Validate/convert format
84
91
  if bbox:
85
- standardized_page.append({
86
- 'bbox': bbox,
87
- 'text': text,
88
- 'confidence': confidence,
89
- 'source': 'ocr'
90
- })
92
+ standardized_page.append(
93
+ {"bbox": bbox, "text": text, "confidence": confidence, "source": "ocr"}
94
+ )
91
95
  else:
92
96
  # Try polygon if bbox failed standardization
93
97
  bbox_poly = self._standardize_bbox(line.polygon)
94
98
  if bbox_poly:
95
- standardized_page.append({
96
- 'bbox': bbox_poly, 'text': text, 'confidence': confidence, 'source': 'ocr'
97
- })
99
+ standardized_page.append(
100
+ {
101
+ "bbox": bbox_poly,
102
+ "text": text,
103
+ "confidence": confidence,
104
+ "source": "ocr",
105
+ }
106
+ )
98
107
  else:
99
- logger.warning(f"Skipping Surya line due to invalid bbox/polygon: {line}")
108
+ logger.warning(
109
+ f"Skipping Surya line due to invalid bbox/polygon: {line}"
110
+ )
100
111
 
101
112
  except (AttributeError, ValueError, TypeError) as e:
102
- logger.warning(f"Skipping invalid Surya TextLine format: {line}. Error: {e}")
103
- continue
113
+ logger.warning(f"Skipping invalid Surya TextLine format: {line}. Error: {e}")
114
+ continue
104
115
  return standardized_page
105
116
 
106
117
  def process_image(
107
- self,
108
- images: Union[Image.Image, List[Image.Image]],
109
- options: BaseOCROptions
118
+ self, images: Union[Image.Image, List[Image.Image]], options: BaseOCROptions
110
119
  ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
111
120
  """Processes a single image or a batch of images with Surya OCR."""
112
121
 
113
122
  if not isinstance(options, SuryaOCROptions):
114
- logger.warning("Received BaseOCROptions, expected SuryaOCROptions. Using defaults.")
115
- options = SuryaOCROptions(
116
- languages=options.languages,
117
- min_confidence=options.min_confidence,
118
- device=options.device,
119
- extra_args=options.extra_args
120
- )
123
+ logger.warning("Received BaseOCROptions, expected SuryaOCROptions. Using defaults.")
124
+ options = SuryaOCROptions(
125
+ languages=options.languages,
126
+ min_confidence=options.min_confidence,
127
+ device=options.device,
128
+ extra_args=options.extra_args,
129
+ )
121
130
 
122
131
  # Ensure predictors are loaded/initialized
123
132
  self._lazy_load_predictors(options)
124
133
  if not self._recognition_predictor or not self._detection_predictor:
125
- raise RuntimeError("Surya predictors could not be initialized.")
134
+ raise RuntimeError("Surya predictors could not be initialized.")
126
135
 
127
136
  # --- Prepare inputs for Surya ---
128
137
  is_batch = isinstance(images, list)
@@ -131,8 +140,8 @@ class SuryaOCREngine(OCREngine):
131
140
  input_langs: List[List[str]] = [options.languages for _ in input_images]
132
141
 
133
142
  if not input_images:
134
- logger.warning("No images provided for Surya processing.")
135
- return [] if not is_batch else [[]]
143
+ logger.warning("No images provided for Surya processing.")
144
+ return [] if not is_batch else [[]]
136
145
 
137
146
  # --- Run Surya Prediction ---
138
147
  try:
@@ -141,24 +150,26 @@ class SuryaOCREngine(OCREngine):
141
150
  # Call Surya's predictor
142
151
  # It returns a list of OCRResult objects, one per input image
143
152
  predictions = self._recognition_predictor(
144
- images=input_images,
145
- langs=input_langs,
146
- det_predictor=self._detection_predictor
153
+ images=input_images, langs=input_langs, det_predictor=self._detection_predictor
147
154
  )
148
155
  logger.info(f"Surya prediction complete. Received {len(predictions)} results.")
149
156
 
150
157
  # --- Standardize Results ---
151
158
  if len(predictions) != len(input_images):
152
- logger.error(f"Surya result count ({len(predictions)}) does not match input count ({len(input_images)}). Returning empty results.")
153
- # Decide on error handling: raise error or return empty structure
154
- return [[] for _ in input_images] if is_batch else []
159
+ logger.error(
160
+ f"Surya result count ({len(predictions)}) does not match input count ({len(input_images)}). Returning empty results."
161
+ )
162
+ # Decide on error handling: raise error or return empty structure
163
+ return [[] for _ in input_images] if is_batch else []
155
164
 
156
- all_standardized_results = [self._standardize_results(res, options) for res in predictions]
165
+ all_standardized_results = [
166
+ self._standardize_results(res, options) for res in predictions
167
+ ]
157
168
 
158
169
  if is_batch:
159
- return all_standardized_results # Return List[List[Dict]]
170
+ return all_standardized_results # Return List[List[Dict]]
160
171
  else:
161
- return all_standardized_results[0] # Return List[Dict] for single image
172
+ return all_standardized_results[0] # Return List[Dict] for single image
162
173
 
163
174
  except Exception as e:
164
175
  logger.error(f"Error during Surya OCR processing: {e}", exc_info=True)
@@ -168,4 +179,3 @@ class SuryaOCREngine(OCREngine):
168
179
  # Note: Caching is handled differently for Surya as predictors are stateful
169
180
  # and initialized once. The base class _reader_cache is not used here.
170
181
  # If predictors could be configured per-run, caching would need rethinking.
171
-
@@ -1,68 +1,76 @@
1
1
  # ocr_manager.py
2
+ import copy # For deep copying options
2
3
  import logging
3
- from typing import Dict, List, Any, Optional, Union, Type
4
+ from typing import Any, Dict, List, Optional, Type, Union
5
+
4
6
  from PIL import Image
5
- import copy # For deep copying options
6
7
 
7
8
  # Import engine classes and options
8
9
  from .engine import OCREngine
9
10
  from .engine_easyocr import EasyOCREngine
10
11
  from .engine_paddle import PaddleOCREngine
11
- from .engine_surya import SuryaOCREngine # <-- Import Surya Engine
12
- from .ocr_options import (
13
- BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions, OCROptions # <-- Import Surya Options
14
- )
12
+ from .engine_surya import SuryaOCREngine # <-- Import Surya Engine
13
+ from .ocr_options import OCROptions # <-- Import Surya Options
14
+ from .ocr_options import BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions
15
15
 
16
16
  logger = logging.getLogger(__name__)
17
17
 
18
+
18
19
  class OCRManager:
19
20
  """Manages OCR engine selection, configuration, and execution."""
20
21
 
21
22
  # Registry mapping engine names to classes and default options
22
23
  ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
23
- 'easyocr': {'class': EasyOCREngine, 'options_class': EasyOCROptions},
24
- 'paddle': {'class': PaddleOCREngine, 'options_class': PaddleOCROptions},
25
- 'surya': {'class': SuryaOCREngine, 'options_class': SuryaOCROptions}, # <-- Add Surya
24
+ "easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
25
+ "paddle": {"class": PaddleOCREngine, "options_class": PaddleOCROptions},
26
+ "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions}, # <-- Add Surya
26
27
  # Add other engines here
27
28
  }
28
29
 
29
30
  # Define the limited set of kwargs allowed for the simple apply_ocr call
30
31
  SIMPLE_MODE_ALLOWED_KWARGS = {
31
- 'engine', 'languages', 'min_confidence', 'device'
32
+ "engine",
33
+ "languages",
34
+ "min_confidence",
35
+ "device",
32
36
  # Add image pre-processing args like 'resolution', 'width' if handled here
33
37
  }
34
38
 
35
39
  def __init__(self):
36
40
  """Initializes the OCR Manager."""
37
- self._engine_instances: Dict[str, OCREngine] = {} # Cache for engine instances
41
+ self._engine_instances: Dict[str, OCREngine] = {} # Cache for engine instances
38
42
  logger.info("OCRManager initialized.")
39
43
 
40
44
  def _get_engine_instance(self, engine_name: str) -> OCREngine:
41
45
  """Retrieves or creates an instance of the specified OCR engine."""
42
46
  engine_name = engine_name.lower()
43
47
  if engine_name not in self.ENGINE_REGISTRY:
44
- raise ValueError(f"Unknown OCR engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}")
48
+ raise ValueError(
49
+ f"Unknown OCR engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
50
+ )
45
51
 
46
52
  # Surya engine might manage its own predictor state, consider if caching instance is always right
47
53
  # For now, we cache the engine instance itself.
48
54
  if engine_name not in self._engine_instances:
49
55
  logger.info(f"Creating instance of engine: {engine_name}")
50
- engine_class = self.ENGINE_REGISTRY[engine_name]['class']
51
- engine_instance = engine_class() # Instantiate first
56
+ engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
57
+ engine_instance = engine_class() # Instantiate first
52
58
  if not engine_instance.is_available():
53
- # Check availability before storing
54
- raise RuntimeError(f"Engine '{engine_name}' is not available. Please check dependencies.")
55
- self._engine_instances[engine_name] = engine_instance # Store if available
59
+ # Check availability before storing
60
+ raise RuntimeError(
61
+ f"Engine '{engine_name}' is not available. Please check dependencies."
62
+ )
63
+ self._engine_instances[engine_name] = engine_instance # Store if available
56
64
 
57
65
  return self._engine_instances[engine_name]
58
66
 
59
67
  def apply_ocr(
60
68
  self,
61
- images: Union[Image.Image, List[Image.Image]], # Accept single or list
62
- engine: Optional[str] = 'easyocr', # Default engine
69
+ images: Union[Image.Image, List[Image.Image]], # Accept single or list
70
+ engine: Optional[str] = "easyocr", # Default engine
63
71
  options: Optional[OCROptions] = None,
64
- **kwargs
65
- ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Return single or list of lists
72
+ **kwargs,
73
+ ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Return single or list of lists
66
74
  """
67
75
  Applies OCR to a single image or a batch of images using either simple
68
76
  keyword arguments or an options object.
@@ -94,54 +102,62 @@ class OCRManager:
94
102
  # --- Validate input type ---
95
103
  is_batch = isinstance(images, list)
96
104
  if not is_batch and not isinstance(images, Image.Image):
97
- raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
105
+ raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
98
106
  # Allow engines to handle non-PIL images in list if they support it/log warnings
99
107
  # if is_batch and not all(isinstance(img, Image.Image) for img in images):
100
108
  # logger.warning("Batch may contain items that are not PIL Images.")
101
109
 
102
-
103
110
  # --- Determine Options and Engine ---
104
111
  if options is not None:
105
112
  # Advanced Mode
106
113
  logger.debug(f"Using advanced mode with options object: {type(options).__name__}")
107
- final_options = copy.deepcopy(options) # Prevent modification of original
114
+ final_options = copy.deepcopy(options) # Prevent modification of original
108
115
  found_engine = False
109
116
  for name, registry_entry in self.ENGINE_REGISTRY.items():
110
117
  # Check if options object is an instance of the registered options class
111
- if isinstance(options, registry_entry['options_class']):
118
+ if isinstance(options, registry_entry["options_class"]):
112
119
  selected_engine_name = name
113
120
  found_engine = True
114
121
  break
115
122
  if not found_engine:
116
- raise TypeError(f"Provided options object type '{type(options).__name__}' does not match any registered engine options.")
123
+ raise TypeError(
124
+ f"Provided options object type '{type(options).__name__}' does not match any registered engine options."
125
+ )
117
126
  if kwargs:
118
- logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored.")
127
+ logger.warning(
128
+ f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored."
129
+ )
119
130
  else:
120
131
  # Simple Mode
121
- selected_engine_name = engine.lower() if engine else 'easyocr' # Fallback default
122
- logger.debug(f"Using simple mode with engine: '{selected_engine_name}' and kwargs: {kwargs}")
132
+ selected_engine_name = engine.lower() if engine else "easyocr" # Fallback default
133
+ logger.debug(
134
+ f"Using simple mode with engine: '{selected_engine_name}' and kwargs: {kwargs}"
135
+ )
123
136
 
124
137
  if selected_engine_name not in self.ENGINE_REGISTRY:
125
- raise ValueError(f"Unknown OCR engine: '{selected_engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}")
138
+ raise ValueError(
139
+ f"Unknown OCR engine: '{selected_engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
140
+ )
126
141
 
127
142
  unexpected_kwargs = set(kwargs.keys()) - self.SIMPLE_MODE_ALLOWED_KWARGS
128
143
  if unexpected_kwargs:
129
- raise TypeError(f"Got unexpected keyword arguments in simple mode: {list(unexpected_kwargs)}. Use the 'options' parameter for detailed configuration.")
144
+ raise TypeError(
145
+ f"Got unexpected keyword arguments in simple mode: {list(unexpected_kwargs)}. Use the 'options' parameter for detailed configuration."
146
+ )
130
147
 
131
148
  # Get the *correct* options class for the selected engine
132
- options_class = self.ENGINE_REGISTRY[selected_engine_name]['options_class']
149
+ options_class = self.ENGINE_REGISTRY[selected_engine_name]["options_class"]
133
150
 
134
151
  # Create options instance using provided simple kwargs or defaults
135
152
  simple_args = {
136
- 'languages': kwargs.get('languages', ['en']),
137
- 'min_confidence': kwargs.get('min_confidence', 0.5),
138
- 'device': kwargs.get('device', 'cpu')
153
+ "languages": kwargs.get("languages", ["en"]),
154
+ "min_confidence": kwargs.get("min_confidence", 0.5),
155
+ "device": kwargs.get("device", "cpu"),
139
156
  # Note: 'extra_args' isn't populated in simple mode
140
157
  }
141
158
  final_options = options_class(**simple_args)
142
159
  logger.debug(f"Constructed options for simple mode: {final_options}")
143
160
 
144
-
145
161
  # --- Get Engine Instance and Process ---
146
162
  try:
147
163
  engine_instance = self._get_engine_instance(selected_engine_name)
@@ -153,39 +169,49 @@ class OCRManager:
153
169
 
154
170
  # Log result summary based on mode
155
171
  if is_batch:
156
- # Ensure results is a list before trying to get lengths
157
- if isinstance(results, list):
158
- num_results_per_image = [len(res_list) if isinstance(res_list, list) else -1 for res_list in results] # Handle potential errors returning non-lists
159
- logger.info(f"Processing complete. Found results per image: {num_results_per_image}")
160
- else:
161
- logger.error(f"Processing complete but received unexpected result type for batch: {type(results)}")
172
+ # Ensure results is a list before trying to get lengths
173
+ if isinstance(results, list):
174
+ num_results_per_image = [
175
+ len(res_list) if isinstance(res_list, list) else -1 for res_list in results
176
+ ] # Handle potential errors returning non-lists
177
+ logger.info(
178
+ f"Processing complete. Found results per image: {num_results_per_image}"
179
+ )
180
+ else:
181
+ logger.error(
182
+ f"Processing complete but received unexpected result type for batch: {type(results)}"
183
+ )
162
184
  else:
163
- # Ensure results is a list
164
- if isinstance(results, list):
165
- logger.info(f"Processing complete. Found {len(results)} results.")
166
- else:
167
- logger.error(f"Processing complete but received unexpected result type for single image: {type(results)}")
168
- return results # Return type matches input type due to engine logic
185
+ # Ensure results is a list
186
+ if isinstance(results, list):
187
+ logger.info(f"Processing complete. Found {len(results)} results.")
188
+ else:
189
+ logger.error(
190
+ f"Processing complete but received unexpected result type for single image: {type(results)}"
191
+ )
192
+ return results # Return type matches input type due to engine logic
169
193
 
170
194
  except (ImportError, RuntimeError, ValueError, TypeError) as e:
171
- logger.error(f"OCR processing failed for engine '{selected_engine_name}': {e}", exc_info=True)
172
- raise # Re-raise expected errors
195
+ logger.error(
196
+ f"OCR processing failed for engine '{selected_engine_name}': {e}", exc_info=True
197
+ )
198
+ raise # Re-raise expected errors
173
199
  except Exception as e:
174
- logger.error(f"An unexpected error occurred during OCR processing: {e}", exc_info=True)
175
- raise # Re-raise unexpected errors
176
-
200
+ logger.error(f"An unexpected error occurred during OCR processing: {e}", exc_info=True)
201
+ raise # Re-raise unexpected errors
177
202
 
178
203
  def get_available_engines(self) -> List[str]:
179
204
  """Returns a list of registered engine names that are currently available."""
180
205
  available = []
181
206
  for name, registry_entry in self.ENGINE_REGISTRY.items():
182
- try:
183
- # Temporarily instantiate to check availability without caching
184
- engine_class = registry_entry['class']
185
- if engine_class().is_available():
186
- available.append(name)
187
- except Exception as e:
188
- logger.debug(f"Engine '{name}' check failed: {e}") # Log check failures at debug level
189
- pass # Ignore engines that fail to instantiate or check
207
+ try:
208
+ # Temporarily instantiate to check availability without caching
209
+ engine_class = registry_entry["class"]
210
+ if engine_class().is_available():
211
+ available.append(name)
212
+ except Exception as e:
213
+ logger.debug(
214
+ f"Engine '{name}' check failed: {e}"
215
+ ) # Log check failures at debug level
216
+ pass # Ignore engines that fail to instantiate or check
190
217
  return available
191
-
@@ -1,30 +1,34 @@
1
1
  # ocr_options.py
2
2
  import logging
3
3
  from dataclasses import dataclass, field
4
- from typing import List, Optional, Dict, Any, Tuple, Union
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
5
 
6
6
  # Configure logging
7
7
  # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
8
8
  # logger = logging.getLogger(__name__)
9
9
  # Assume logger is configured elsewhere or remove if not needed globally
10
10
 
11
+
11
12
  # --- Base Options ---
12
13
  @dataclass
13
14
  class BaseOCROptions:
14
15
  """Base class for OCR engine options."""
15
- languages: List[str] = field(default_factory=lambda: ['en'])
16
+
17
+ languages: List[str] = field(default_factory=lambda: ["en"])
16
18
  min_confidence: float = 0.5
17
- device: Optional[str] = 'cpu' # Suggestion, actual device usage depends on engine impl.
19
+ device: Optional[str] = "cpu" # Suggestion, actual device usage depends on engine impl.
18
20
  extra_args: Dict[str, Any] = field(default_factory=dict)
19
21
 
22
+
20
23
  # --- EasyOCR Specific Options ---
21
24
  @dataclass
22
25
  class EasyOCROptions(BaseOCROptions):
23
26
  """Specific options for the EasyOCR engine."""
27
+
24
28
  model_storage_directory: Optional[str] = None
25
29
  user_network_directory: Optional[str] = None
26
- recog_network: str = 'english_g2'
27
- detect_network: str = 'craft'
30
+ recog_network: str = "english_g2"
31
+ detect_network: str = "craft"
28
32
  download_enabled: bool = True
29
33
  detector: bool = True
30
34
  recognizer: bool = True
@@ -32,7 +36,7 @@ class EasyOCROptions(BaseOCROptions):
32
36
  quantize: bool = True
33
37
  cudnn_benchmark: bool = False
34
38
  detail: int = 1
35
- decoder: str = 'greedy'
39
+ decoder: str = "greedy"
36
40
  beamWidth: int = 5
37
41
  batch_size: int = 1
38
42
  workers: int = 0
@@ -55,7 +59,7 @@ class EasyOCROptions(BaseOCROptions):
55
59
  y_ths: float = 0.5
56
60
  x_ths: float = 1.0
57
61
  add_margin: float = 0.1
58
- output_format: str = 'standard'
62
+ output_format: str = "standard"
59
63
 
60
64
  # def __post_init__(self):
61
65
  # logger.debug(f"Initialized EasyOCROptions: {self}")
@@ -65,13 +69,14 @@ class EasyOCROptions(BaseOCROptions):
65
69
  @dataclass
66
70
  class PaddleOCROptions(BaseOCROptions):
67
71
  """Specific options for the PaddleOCR engine."""
72
+
68
73
  use_angle_cls: bool = True
69
74
  use_gpu: Optional[bool] = None
70
75
  gpu_mem: int = 500
71
76
  ir_optim: bool = True
72
77
  use_tensorrt: bool = False
73
78
  min_subgraph_size: int = 15
74
- precision: str = 'fp32'
79
+ precision: str = "fp32"
75
80
  enable_mkldnn: bool = False
76
81
  cpu_threads: int = 10
77
82
  use_fp16: bool = False
@@ -91,16 +96,18 @@ class PaddleOCROptions(BaseOCROptions):
91
96
 
92
97
  def __post_init__(self):
93
98
  if self.use_gpu is None:
94
- if self.device and 'cuda' in self.device.lower():
99
+ if self.device and "cuda" in self.device.lower():
95
100
  self.use_gpu = True
96
101
  else:
97
102
  self.use_gpu = False
98
103
  # logger.debug(f"Initialized PaddleOCROptions: {self}")
99
104
 
105
+
100
106
  # --- Surya Specific Options ---
101
107
  @dataclass
102
108
  class SuryaOCROptions(BaseOCROptions):
103
109
  """Specific options for the Surya OCR engine."""
110
+
104
111
  # Currently, Surya example shows languages passed at prediction time.
105
112
  # Add fields here if Surya's RecognitionPredictor or DetectionPredictor
106
113
  # constructors accept relevant arguments (e.g., model paths, device settings).
@@ -111,4 +118,3 @@ class SuryaOCROptions(BaseOCROptions):
111
118
 
112
119
  # --- Union type for type hinting ---
113
120
  OCROptions = Union[EasyOCROptions, PaddleOCROptions, SuryaOCROptions, BaseOCROptions]
114
-
@@ -1,3 +1,3 @@
1
1
  from natural_pdf.qa.document_qa import DocumentQA, get_qa_engine
2
2
 
3
- __all__ = ["DocumentQA", "get_qa_engine"]
3
+ __all__ = ["DocumentQA", "get_qa_engine"]