natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +125 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +907 -513
- natural_pdf/core/pdf.py +385 -287
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +708 -508
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/ocr/engine_surya.py
CHANGED
@@ -1,15 +1,17 @@
|
|
1
1
|
# ocr_engine_surya.py
|
2
|
-
import logging
|
3
2
|
import importlib.util
|
4
|
-
|
3
|
+
import logging
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
|
+
|
5
6
|
import numpy as np
|
6
7
|
from PIL import Image
|
7
8
|
|
8
9
|
from .engine import OCREngine
|
9
|
-
from .ocr_options import
|
10
|
+
from .ocr_options import BaseOCROptions, SuryaOCROptions
|
10
11
|
|
11
12
|
logger = logging.getLogger(__name__)
|
12
13
|
|
14
|
+
|
13
15
|
class SuryaOCREngine(OCREngine):
|
14
16
|
"""Surya OCR engine implementation."""
|
15
17
|
|
@@ -30,8 +32,9 @@ class SuryaOCREngine(OCREngine):
|
|
30
32
|
raise ImportError("Surya OCR library is not installed or available.")
|
31
33
|
|
32
34
|
try:
|
33
|
-
from surya.recognition import RecognitionPredictor
|
34
35
|
from surya.detection import DetectionPredictor
|
36
|
+
from surya.recognition import RecognitionPredictor
|
37
|
+
|
35
38
|
self._surya_recognition = RecognitionPredictor
|
36
39
|
self._surya_detection = DetectionPredictor
|
37
40
|
logger.info("Surya modules imported successfully.")
|
@@ -40,7 +43,7 @@ class SuryaOCREngine(OCREngine):
|
|
40
43
|
# Add arguments from options if Surya supports them
|
41
44
|
# Example: device = options.device or 'cuda' if torch.cuda.is_available() else 'cpu'
|
42
45
|
# predictor_args = {'device': options.device} # If applicable
|
43
|
-
predictor_args = {}
|
46
|
+
predictor_args = {} # Assuming parameterless init based on example
|
44
47
|
|
45
48
|
logger.info("Instantiating Surya DetectionPredictor...")
|
46
49
|
self._detection_predictor = self._surya_detection(**predictor_args)
|
@@ -61,13 +64,17 @@ class SuryaOCREngine(OCREngine):
|
|
61
64
|
"""Check if the surya library is installed."""
|
62
65
|
return importlib.util.find_spec("surya") is not None
|
63
66
|
|
64
|
-
def _standardize_results(
|
67
|
+
def _standardize_results(
|
68
|
+
self, raw_ocr_result: Any, options: SuryaOCROptions
|
69
|
+
) -> List[Dict[str, Any]]:
|
65
70
|
"""Standardizes raw results from a single image from Surya."""
|
66
71
|
standardized_page = []
|
67
72
|
min_confidence = options.min_confidence
|
68
73
|
|
69
74
|
# Check if the result has the expected structure (OCRResult with text_lines)
|
70
|
-
if not hasattr(raw_ocr_result,
|
75
|
+
if not hasattr(raw_ocr_result, "text_lines") or not isinstance(
|
76
|
+
raw_ocr_result.text_lines, list
|
77
|
+
):
|
71
78
|
logger.warning(f"Unexpected Surya result format: {type(raw_ocr_result)}. Skipping.")
|
72
79
|
return standardized_page
|
73
80
|
|
@@ -77,52 +84,54 @@ class SuryaOCREngine(OCREngine):
|
|
77
84
|
text = line.text
|
78
85
|
confidence = line.confidence
|
79
86
|
# Surya provides both polygon and bbox, bbox is already (x0, y0, x1, y1)
|
80
|
-
bbox_raw = line.bbox
|
87
|
+
bbox_raw = line.bbox # Use bbox directly if available and correct format
|
81
88
|
|
82
89
|
if confidence >= min_confidence:
|
83
|
-
bbox = self._standardize_bbox(bbox_raw)
|
90
|
+
bbox = self._standardize_bbox(bbox_raw) # Validate/convert format
|
84
91
|
if bbox:
|
85
|
-
standardized_page.append(
|
86
|
-
|
87
|
-
|
88
|
-
'confidence': confidence,
|
89
|
-
'source': 'ocr'
|
90
|
-
})
|
92
|
+
standardized_page.append(
|
93
|
+
{"bbox": bbox, "text": text, "confidence": confidence, "source": "ocr"}
|
94
|
+
)
|
91
95
|
else:
|
92
96
|
# Try polygon if bbox failed standardization
|
93
97
|
bbox_poly = self._standardize_bbox(line.polygon)
|
94
98
|
if bbox_poly:
|
95
|
-
|
96
|
-
|
97
|
-
|
99
|
+
standardized_page.append(
|
100
|
+
{
|
101
|
+
"bbox": bbox_poly,
|
102
|
+
"text": text,
|
103
|
+
"confidence": confidence,
|
104
|
+
"source": "ocr",
|
105
|
+
}
|
106
|
+
)
|
98
107
|
else:
|
99
|
-
|
108
|
+
logger.warning(
|
109
|
+
f"Skipping Surya line due to invalid bbox/polygon: {line}"
|
110
|
+
)
|
100
111
|
|
101
112
|
except (AttributeError, ValueError, TypeError) as e:
|
102
|
-
|
103
|
-
|
113
|
+
logger.warning(f"Skipping invalid Surya TextLine format: {line}. Error: {e}")
|
114
|
+
continue
|
104
115
|
return standardized_page
|
105
116
|
|
106
117
|
def process_image(
|
107
|
-
self,
|
108
|
-
images: Union[Image.Image, List[Image.Image]],
|
109
|
-
options: BaseOCROptions
|
118
|
+
self, images: Union[Image.Image, List[Image.Image]], options: BaseOCROptions
|
110
119
|
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
|
111
120
|
"""Processes a single image or a batch of images with Surya OCR."""
|
112
121
|
|
113
122
|
if not isinstance(options, SuryaOCROptions):
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
123
|
+
logger.warning("Received BaseOCROptions, expected SuryaOCROptions. Using defaults.")
|
124
|
+
options = SuryaOCROptions(
|
125
|
+
languages=options.languages,
|
126
|
+
min_confidence=options.min_confidence,
|
127
|
+
device=options.device,
|
128
|
+
extra_args=options.extra_args,
|
129
|
+
)
|
121
130
|
|
122
131
|
# Ensure predictors are loaded/initialized
|
123
132
|
self._lazy_load_predictors(options)
|
124
133
|
if not self._recognition_predictor or not self._detection_predictor:
|
125
|
-
|
134
|
+
raise RuntimeError("Surya predictors could not be initialized.")
|
126
135
|
|
127
136
|
# --- Prepare inputs for Surya ---
|
128
137
|
is_batch = isinstance(images, list)
|
@@ -131,8 +140,8 @@ class SuryaOCREngine(OCREngine):
|
|
131
140
|
input_langs: List[List[str]] = [options.languages for _ in input_images]
|
132
141
|
|
133
142
|
if not input_images:
|
134
|
-
|
135
|
-
|
143
|
+
logger.warning("No images provided for Surya processing.")
|
144
|
+
return [] if not is_batch else [[]]
|
136
145
|
|
137
146
|
# --- Run Surya Prediction ---
|
138
147
|
try:
|
@@ -141,24 +150,26 @@ class SuryaOCREngine(OCREngine):
|
|
141
150
|
# Call Surya's predictor
|
142
151
|
# It returns a list of OCRResult objects, one per input image
|
143
152
|
predictions = self._recognition_predictor(
|
144
|
-
images=input_images,
|
145
|
-
langs=input_langs,
|
146
|
-
det_predictor=self._detection_predictor
|
153
|
+
images=input_images, langs=input_langs, det_predictor=self._detection_predictor
|
147
154
|
)
|
148
155
|
logger.info(f"Surya prediction complete. Received {len(predictions)} results.")
|
149
156
|
|
150
157
|
# --- Standardize Results ---
|
151
158
|
if len(predictions) != len(input_images):
|
152
|
-
|
153
|
-
|
154
|
-
|
159
|
+
logger.error(
|
160
|
+
f"Surya result count ({len(predictions)}) does not match input count ({len(input_images)}). Returning empty results."
|
161
|
+
)
|
162
|
+
# Decide on error handling: raise error or return empty structure
|
163
|
+
return [[] for _ in input_images] if is_batch else []
|
155
164
|
|
156
|
-
all_standardized_results = [
|
165
|
+
all_standardized_results = [
|
166
|
+
self._standardize_results(res, options) for res in predictions
|
167
|
+
]
|
157
168
|
|
158
169
|
if is_batch:
|
159
|
-
return all_standardized_results
|
170
|
+
return all_standardized_results # Return List[List[Dict]]
|
160
171
|
else:
|
161
|
-
return all_standardized_results[0]
|
172
|
+
return all_standardized_results[0] # Return List[Dict] for single image
|
162
173
|
|
163
174
|
except Exception as e:
|
164
175
|
logger.error(f"Error during Surya OCR processing: {e}", exc_info=True)
|
@@ -168,4 +179,3 @@ class SuryaOCREngine(OCREngine):
|
|
168
179
|
# Note: Caching is handled differently for Surya as predictors are stateful
|
169
180
|
# and initialized once. The base class _reader_cache is not used here.
|
170
181
|
# If predictors could be configured per-run, caching would need rethinking.
|
171
|
-
|
natural_pdf/ocr/ocr_manager.py
CHANGED
@@ -1,68 +1,76 @@
|
|
1
1
|
# ocr_manager.py
|
2
|
+
import copy # For deep copying options
|
2
3
|
import logging
|
3
|
-
from typing import Dict, List,
|
4
|
+
from typing import Any, Dict, List, Optional, Type, Union
|
5
|
+
|
4
6
|
from PIL import Image
|
5
|
-
import copy # For deep copying options
|
6
7
|
|
7
8
|
# Import engine classes and options
|
8
9
|
from .engine import OCREngine
|
9
10
|
from .engine_easyocr import EasyOCREngine
|
10
11
|
from .engine_paddle import PaddleOCREngine
|
11
|
-
from .engine_surya import SuryaOCREngine
|
12
|
-
from .ocr_options import
|
13
|
-
|
14
|
-
)
|
12
|
+
from .engine_surya import SuryaOCREngine # <-- Import Surya Engine
|
13
|
+
from .ocr_options import OCROptions # <-- Import Surya Options
|
14
|
+
from .ocr_options import BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions
|
15
15
|
|
16
16
|
logger = logging.getLogger(__name__)
|
17
17
|
|
18
|
+
|
18
19
|
class OCRManager:
|
19
20
|
"""Manages OCR engine selection, configuration, and execution."""
|
20
21
|
|
21
22
|
# Registry mapping engine names to classes and default options
|
22
23
|
ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
|
23
|
-
|
24
|
-
|
25
|
-
|
24
|
+
"easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
|
25
|
+
"paddle": {"class": PaddleOCREngine, "options_class": PaddleOCROptions},
|
26
|
+
"surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions}, # <-- Add Surya
|
26
27
|
# Add other engines here
|
27
28
|
}
|
28
29
|
|
29
30
|
# Define the limited set of kwargs allowed for the simple apply_ocr call
|
30
31
|
SIMPLE_MODE_ALLOWED_KWARGS = {
|
31
|
-
|
32
|
+
"engine",
|
33
|
+
"languages",
|
34
|
+
"min_confidence",
|
35
|
+
"device",
|
32
36
|
# Add image pre-processing args like 'resolution', 'width' if handled here
|
33
37
|
}
|
34
38
|
|
35
39
|
def __init__(self):
|
36
40
|
"""Initializes the OCR Manager."""
|
37
|
-
self._engine_instances: Dict[str, OCREngine] = {}
|
41
|
+
self._engine_instances: Dict[str, OCREngine] = {} # Cache for engine instances
|
38
42
|
logger.info("OCRManager initialized.")
|
39
43
|
|
40
44
|
def _get_engine_instance(self, engine_name: str) -> OCREngine:
|
41
45
|
"""Retrieves or creates an instance of the specified OCR engine."""
|
42
46
|
engine_name = engine_name.lower()
|
43
47
|
if engine_name not in self.ENGINE_REGISTRY:
|
44
|
-
raise ValueError(
|
48
|
+
raise ValueError(
|
49
|
+
f"Unknown OCR engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
|
50
|
+
)
|
45
51
|
|
46
52
|
# Surya engine might manage its own predictor state, consider if caching instance is always right
|
47
53
|
# For now, we cache the engine instance itself.
|
48
54
|
if engine_name not in self._engine_instances:
|
49
55
|
logger.info(f"Creating instance of engine: {engine_name}")
|
50
|
-
engine_class = self.ENGINE_REGISTRY[engine_name][
|
51
|
-
engine_instance = engine_class()
|
56
|
+
engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
|
57
|
+
engine_instance = engine_class() # Instantiate first
|
52
58
|
if not engine_instance.is_available():
|
53
|
-
|
54
|
-
|
55
|
-
|
59
|
+
# Check availability before storing
|
60
|
+
raise RuntimeError(
|
61
|
+
f"Engine '{engine_name}' is not available. Please check dependencies."
|
62
|
+
)
|
63
|
+
self._engine_instances[engine_name] = engine_instance # Store if available
|
56
64
|
|
57
65
|
return self._engine_instances[engine_name]
|
58
66
|
|
59
67
|
def apply_ocr(
|
60
68
|
self,
|
61
|
-
images: Union[Image.Image, List[Image.Image]],
|
62
|
-
engine: Optional[str] =
|
69
|
+
images: Union[Image.Image, List[Image.Image]], # Accept single or list
|
70
|
+
engine: Optional[str] = "easyocr", # Default engine
|
63
71
|
options: Optional[OCROptions] = None,
|
64
|
-
**kwargs
|
65
|
-
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
|
72
|
+
**kwargs,
|
73
|
+
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Return single or list of lists
|
66
74
|
"""
|
67
75
|
Applies OCR to a single image or a batch of images using either simple
|
68
76
|
keyword arguments or an options object.
|
@@ -94,54 +102,62 @@ class OCRManager:
|
|
94
102
|
# --- Validate input type ---
|
95
103
|
is_batch = isinstance(images, list)
|
96
104
|
if not is_batch and not isinstance(images, Image.Image):
|
97
|
-
|
105
|
+
raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
|
98
106
|
# Allow engines to handle non-PIL images in list if they support it/log warnings
|
99
107
|
# if is_batch and not all(isinstance(img, Image.Image) for img in images):
|
100
108
|
# logger.warning("Batch may contain items that are not PIL Images.")
|
101
109
|
|
102
|
-
|
103
110
|
# --- Determine Options and Engine ---
|
104
111
|
if options is not None:
|
105
112
|
# Advanced Mode
|
106
113
|
logger.debug(f"Using advanced mode with options object: {type(options).__name__}")
|
107
|
-
final_options = copy.deepcopy(options)
|
114
|
+
final_options = copy.deepcopy(options) # Prevent modification of original
|
108
115
|
found_engine = False
|
109
116
|
for name, registry_entry in self.ENGINE_REGISTRY.items():
|
110
117
|
# Check if options object is an instance of the registered options class
|
111
|
-
if isinstance(options, registry_entry[
|
118
|
+
if isinstance(options, registry_entry["options_class"]):
|
112
119
|
selected_engine_name = name
|
113
120
|
found_engine = True
|
114
121
|
break
|
115
122
|
if not found_engine:
|
116
|
-
|
123
|
+
raise TypeError(
|
124
|
+
f"Provided options object type '{type(options).__name__}' does not match any registered engine options."
|
125
|
+
)
|
117
126
|
if kwargs:
|
118
|
-
logger.warning(
|
127
|
+
logger.warning(
|
128
|
+
f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored."
|
129
|
+
)
|
119
130
|
else:
|
120
131
|
# Simple Mode
|
121
|
-
selected_engine_name = engine.lower() if engine else
|
122
|
-
logger.debug(
|
132
|
+
selected_engine_name = engine.lower() if engine else "easyocr" # Fallback default
|
133
|
+
logger.debug(
|
134
|
+
f"Using simple mode with engine: '{selected_engine_name}' and kwargs: {kwargs}"
|
135
|
+
)
|
123
136
|
|
124
137
|
if selected_engine_name not in self.ENGINE_REGISTRY:
|
125
|
-
|
138
|
+
raise ValueError(
|
139
|
+
f"Unknown OCR engine: '{selected_engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
|
140
|
+
)
|
126
141
|
|
127
142
|
unexpected_kwargs = set(kwargs.keys()) - self.SIMPLE_MODE_ALLOWED_KWARGS
|
128
143
|
if unexpected_kwargs:
|
129
|
-
raise TypeError(
|
144
|
+
raise TypeError(
|
145
|
+
f"Got unexpected keyword arguments in simple mode: {list(unexpected_kwargs)}. Use the 'options' parameter for detailed configuration."
|
146
|
+
)
|
130
147
|
|
131
148
|
# Get the *correct* options class for the selected engine
|
132
|
-
options_class = self.ENGINE_REGISTRY[selected_engine_name][
|
149
|
+
options_class = self.ENGINE_REGISTRY[selected_engine_name]["options_class"]
|
133
150
|
|
134
151
|
# Create options instance using provided simple kwargs or defaults
|
135
152
|
simple_args = {
|
136
|
-
|
137
|
-
|
138
|
-
|
153
|
+
"languages": kwargs.get("languages", ["en"]),
|
154
|
+
"min_confidence": kwargs.get("min_confidence", 0.5),
|
155
|
+
"device": kwargs.get("device", "cpu"),
|
139
156
|
# Note: 'extra_args' isn't populated in simple mode
|
140
157
|
}
|
141
158
|
final_options = options_class(**simple_args)
|
142
159
|
logger.debug(f"Constructed options for simple mode: {final_options}")
|
143
160
|
|
144
|
-
|
145
161
|
# --- Get Engine Instance and Process ---
|
146
162
|
try:
|
147
163
|
engine_instance = self._get_engine_instance(selected_engine_name)
|
@@ -153,39 +169,49 @@ class OCRManager:
|
|
153
169
|
|
154
170
|
# Log result summary based on mode
|
155
171
|
if is_batch:
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
172
|
+
# Ensure results is a list before trying to get lengths
|
173
|
+
if isinstance(results, list):
|
174
|
+
num_results_per_image = [
|
175
|
+
len(res_list) if isinstance(res_list, list) else -1 for res_list in results
|
176
|
+
] # Handle potential errors returning non-lists
|
177
|
+
logger.info(
|
178
|
+
f"Processing complete. Found results per image: {num_results_per_image}"
|
179
|
+
)
|
180
|
+
else:
|
181
|
+
logger.error(
|
182
|
+
f"Processing complete but received unexpected result type for batch: {type(results)}"
|
183
|
+
)
|
162
184
|
else:
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
185
|
+
# Ensure results is a list
|
186
|
+
if isinstance(results, list):
|
187
|
+
logger.info(f"Processing complete. Found {len(results)} results.")
|
188
|
+
else:
|
189
|
+
logger.error(
|
190
|
+
f"Processing complete but received unexpected result type for single image: {type(results)}"
|
191
|
+
)
|
192
|
+
return results # Return type matches input type due to engine logic
|
169
193
|
|
170
194
|
except (ImportError, RuntimeError, ValueError, TypeError) as e:
|
171
|
-
|
172
|
-
|
195
|
+
logger.error(
|
196
|
+
f"OCR processing failed for engine '{selected_engine_name}': {e}", exc_info=True
|
197
|
+
)
|
198
|
+
raise # Re-raise expected errors
|
173
199
|
except Exception as e:
|
174
|
-
|
175
|
-
|
176
|
-
|
200
|
+
logger.error(f"An unexpected error occurred during OCR processing: {e}", exc_info=True)
|
201
|
+
raise # Re-raise unexpected errors
|
177
202
|
|
178
203
|
def get_available_engines(self) -> List[str]:
|
179
204
|
"""Returns a list of registered engine names that are currently available."""
|
180
205
|
available = []
|
181
206
|
for name, registry_entry in self.ENGINE_REGISTRY.items():
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
207
|
+
try:
|
208
|
+
# Temporarily instantiate to check availability without caching
|
209
|
+
engine_class = registry_entry["class"]
|
210
|
+
if engine_class().is_available():
|
211
|
+
available.append(name)
|
212
|
+
except Exception as e:
|
213
|
+
logger.debug(
|
214
|
+
f"Engine '{name}' check failed: {e}"
|
215
|
+
) # Log check failures at debug level
|
216
|
+
pass # Ignore engines that fail to instantiate or check
|
190
217
|
return available
|
191
|
-
|
natural_pdf/ocr/ocr_options.py
CHANGED
@@ -1,30 +1,34 @@
|
|
1
1
|
# ocr_options.py
|
2
2
|
import logging
|
3
3
|
from dataclasses import dataclass, field
|
4
|
-
from typing import
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
5
|
|
6
6
|
# Configure logging
|
7
7
|
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
8
8
|
# logger = logging.getLogger(__name__)
|
9
9
|
# Assume logger is configured elsewhere or remove if not needed globally
|
10
10
|
|
11
|
+
|
11
12
|
# --- Base Options ---
|
12
13
|
@dataclass
|
13
14
|
class BaseOCROptions:
|
14
15
|
"""Base class for OCR engine options."""
|
15
|
-
|
16
|
+
|
17
|
+
languages: List[str] = field(default_factory=lambda: ["en"])
|
16
18
|
min_confidence: float = 0.5
|
17
|
-
device: Optional[str] =
|
19
|
+
device: Optional[str] = "cpu" # Suggestion, actual device usage depends on engine impl.
|
18
20
|
extra_args: Dict[str, Any] = field(default_factory=dict)
|
19
21
|
|
22
|
+
|
20
23
|
# --- EasyOCR Specific Options ---
|
21
24
|
@dataclass
|
22
25
|
class EasyOCROptions(BaseOCROptions):
|
23
26
|
"""Specific options for the EasyOCR engine."""
|
27
|
+
|
24
28
|
model_storage_directory: Optional[str] = None
|
25
29
|
user_network_directory: Optional[str] = None
|
26
|
-
recog_network: str =
|
27
|
-
detect_network: str =
|
30
|
+
recog_network: str = "english_g2"
|
31
|
+
detect_network: str = "craft"
|
28
32
|
download_enabled: bool = True
|
29
33
|
detector: bool = True
|
30
34
|
recognizer: bool = True
|
@@ -32,7 +36,7 @@ class EasyOCROptions(BaseOCROptions):
|
|
32
36
|
quantize: bool = True
|
33
37
|
cudnn_benchmark: bool = False
|
34
38
|
detail: int = 1
|
35
|
-
decoder: str =
|
39
|
+
decoder: str = "greedy"
|
36
40
|
beamWidth: int = 5
|
37
41
|
batch_size: int = 1
|
38
42
|
workers: int = 0
|
@@ -55,7 +59,7 @@ class EasyOCROptions(BaseOCROptions):
|
|
55
59
|
y_ths: float = 0.5
|
56
60
|
x_ths: float = 1.0
|
57
61
|
add_margin: float = 0.1
|
58
|
-
output_format: str =
|
62
|
+
output_format: str = "standard"
|
59
63
|
|
60
64
|
# def __post_init__(self):
|
61
65
|
# logger.debug(f"Initialized EasyOCROptions: {self}")
|
@@ -65,13 +69,14 @@ class EasyOCROptions(BaseOCROptions):
|
|
65
69
|
@dataclass
|
66
70
|
class PaddleOCROptions(BaseOCROptions):
|
67
71
|
"""Specific options for the PaddleOCR engine."""
|
72
|
+
|
68
73
|
use_angle_cls: bool = True
|
69
74
|
use_gpu: Optional[bool] = None
|
70
75
|
gpu_mem: int = 500
|
71
76
|
ir_optim: bool = True
|
72
77
|
use_tensorrt: bool = False
|
73
78
|
min_subgraph_size: int = 15
|
74
|
-
precision: str =
|
79
|
+
precision: str = "fp32"
|
75
80
|
enable_mkldnn: bool = False
|
76
81
|
cpu_threads: int = 10
|
77
82
|
use_fp16: bool = False
|
@@ -91,16 +96,18 @@ class PaddleOCROptions(BaseOCROptions):
|
|
91
96
|
|
92
97
|
def __post_init__(self):
|
93
98
|
if self.use_gpu is None:
|
94
|
-
if self.device and
|
99
|
+
if self.device and "cuda" in self.device.lower():
|
95
100
|
self.use_gpu = True
|
96
101
|
else:
|
97
102
|
self.use_gpu = False
|
98
103
|
# logger.debug(f"Initialized PaddleOCROptions: {self}")
|
99
104
|
|
105
|
+
|
100
106
|
# --- Surya Specific Options ---
|
101
107
|
@dataclass
|
102
108
|
class SuryaOCROptions(BaseOCROptions):
|
103
109
|
"""Specific options for the Surya OCR engine."""
|
110
|
+
|
104
111
|
# Currently, Surya example shows languages passed at prediction time.
|
105
112
|
# Add fields here if Surya's RecognitionPredictor or DetectionPredictor
|
106
113
|
# constructors accept relevant arguments (e.g., model paths, device settings).
|
@@ -111,4 +118,3 @@ class SuryaOCROptions(BaseOCROptions):
|
|
111
118
|
|
112
119
|
# --- Union type for type hinting ---
|
113
120
|
OCROptions = Union[EasyOCROptions, PaddleOCROptions, SuryaOCROptions, BaseOCROptions]
|
114
|
-
|
natural_pdf/qa/__init__.py
CHANGED