natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,158 @@
|
|
1
|
+
"""
|
2
|
+
Base OCR engine interface.
|
3
|
+
"""
|
4
|
+
import logging
|
5
|
+
from abc import ABC, abstractmethod
|
6
|
+
from typing import Dict, List, Any, Optional, Union
|
7
|
+
from PIL import Image
|
8
|
+
|
9
|
+
# Set up module logger
|
10
|
+
logger = logging.getLogger("natural_pdf.ocr.engine")
|
11
|
+
|
12
|
+
|
13
|
+
class OCREngine(ABC):
    """Abstract base class for OCR engines.

    Concrete engines implement :meth:`process_image` and
    :meth:`is_available`. :meth:`normalize_config` and :meth:`merge_configs`
    are shared helpers that turn the accepted configuration shorthands into a
    full configuration dictionary and combine two such dictionaries.
    """

    def __init__(self, **kwargs):
        """
        Initialize with engine-specific settings.

        Args:
            **kwargs: Engine-specific settings. The base class only logs
                them; subclasses decide what to do with them.
        """
        self.logger = logging.getLogger(f"natural_pdf.ocr.{self.__class__.__name__}")
        self.logger.debug(
            "Initializing %s with settings: %s", self.__class__.__name__, kwargs
        )

    # The image annotation is quoted so it is not evaluated when the class
    # body is executed.
    @abstractmethod
    def process_image(
        self, image: "Image.Image", config: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """
        Process an image and return standardized results.

        Args:
            image: PIL Image to process
            config: OCR configuration:
                - enabled: Whether OCR is enabled
                - languages: List of language codes (ISO format)
                - device: Device to use (e.g., 'cpu', 'cuda')
                - min_confidence: Threshold for result filtering
                - model_settings: Engine-specific settings

        Returns:
            List of standardized result dictionaries with:
                - 'bbox': (x0, y0, x1, y1) - Rectangle coordinates
                - 'text': Recognized text
                - 'confidence': Confidence score (0.0-1.0)

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError("Subclasses must implement this method")

    @abstractmethod
    def is_available(self) -> bool:
        """
        Check if this engine's dependencies are installed.

        Returns:
            True if the engine can be used, False otherwise. The base
            implementation returns False.
        """
        return False

    def normalize_config(
        self, config: Optional[Union[bool, str, List, Dict]] = None
    ) -> Dict[str, Any]:
        """
        Normalize OCR configuration from various formats.

        Args:
            config: OCR configuration in various formats:
                - None: OCR disabled
                - True / False: OCR enabled / disabled with defaults
                - "auto": Auto OCR mode ("enabled" is set to "auto")
                - any other string: treated as a single language code
                - ["en", "fr"]: Use these languages
                - {"languages": ["en"]}: Detailed configuration. When
                  "enabled" is not given, OCR is enabled only if a non-empty
                  "languages" or "model_settings" value is present.
                - any other type: OCR enabled with defaults

        Returns:
            A new configuration dictionary with the keys "enabled",
            "languages", "device", "min_confidence" and "model_settings"
            (plus any extra keys of a dict input). List and dict values
            taken from ``config`` are copied, so changing the result does
            not change the caller's objects.
        """
        logger.debug("Normalizing OCR config: %s", config)

        # Defaults; "enabled" starts as False and is switched on below
        # depending on the kind of config that was passed.
        result: Dict[str, Any] = {
            "enabled": False,
            "languages": ["en"],
            "device": "cpu",
            "min_confidence": 0.5,
            "model_settings": {},
        }

        if config is None:
            # None means "OCR disabled": keep the defaults untouched.
            pass
        elif isinstance(config, bool):
            # Checked before anything else so that False disables OCR
            # instead of reaching the enable-by-default fallback.
            result["enabled"] = config
        elif isinstance(config, str):
            if config.lower() == "auto":
                result["enabled"] = "auto"
            else:
                # Assume it's a language code
                result["enabled"] = True
                result["languages"] = [config]
        elif isinstance(config, list):
            # Assume it's a list of languages; copy it so the result does
            # not alias the caller's list.
            result["enabled"] = True
            result["languages"] = list(config)
        elif isinstance(config, dict):
            # Without an explicit "enabled", the presence of real settings
            # is taken as a request to run OCR.
            if "enabled" not in config and (
                config.get("languages") or config.get("model_settings")
            ):
                result["enabled"] = True

            result.update(config)

            if isinstance(result.get("languages"), list):
                result["languages"] = list(result["languages"])

            # Guarantee a usable, caller-independent model_settings dict.
            settings = result.get("model_settings")
            if settings is None:
                result["model_settings"] = {}
            elif isinstance(settings, dict):
                result["model_settings"] = dict(settings)
        else:
            # Fallback for unknown types - enable by default
            result["enabled"] = True

        logger.debug("Normalized OCR config: %s", result)
        return result

    def merge_configs(
        self, base_config: Dict[str, Any], override_config: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Merge OCR configurations, with override_config taking precedence.

        "model_settings" is merged key by key; every other top-level key of
        ``override_config`` replaces the value from ``base_config``. Neither
        input dictionary is modified.

        Args:
            base_config: Base configuration
            override_config: Configuration to override base with

        Returns:
            Merged configuration (a new dictionary)
        """
        logger.debug(
            "Merging OCR configs: base=%s, override=%s", base_config, override_config
        )
        result = dict(base_config)

        if "model_settings" in override_config:
            # Build a fresh dict: dict(base_config) is only a shallow copy,
            # so updating the nested dict in place would change base_config.
            merged_settings = dict(result.get("model_settings") or {})
            merged_settings.update(override_config["model_settings"] or {})
            result["model_settings"] = merged_settings

        for key, value in override_config.items():
            if key != "model_settings":  # Already handled above
                result[key] = value

        logger.debug("Merged OCR config result: %s", result)
        return result
|
@@ -0,0 +1,263 @@
|
|
1
|
+
"""
|
2
|
+
PaddleOCR engine implementation.
|
3
|
+
"""
|
4
|
+
import importlib.util
|
5
|
+
from typing import Dict, List, Any, Optional, Tuple
|
6
|
+
import numpy as np
|
7
|
+
from PIL import Image
|
8
|
+
|
9
|
+
from .engine import OCREngine
|
10
|
+
|
11
|
+
|
12
|
+
class PaddleOCREngine(OCREngine):
    """OCR engine backed by PaddleOCR.

    Readers (``paddleocr.PaddleOCR`` instances) are created lazily by
    :meth:`get_reader` and cached per effective configuration, because
    constructing one is the expensive part of running OCR.
    """

    # Language code mapping from ISO to PaddleOCR codes
    LANGUAGE_MAP = {
        'en': 'en',
        'zh': 'ch',
        'zh-cn': 'ch',
        'zh-tw': 'chinese_cht',
        'ja': 'japan',
        'ko': 'korean',
        'th': 'thai',
        'fr': 'french',
        'de': 'german',
        'ru': 'russian',
        'ar': 'arabic',
        'hi': 'hindi',
        'vi': 'vietnam',
        # Persian and Urdu are written in Arabic script, so they use the
        # Arabic recognition model rather than the Cyrillic one.
        'fa': 'arabic',
        'ur': 'arabic',
        'rs': 'serbian',
        'oc': 'latin',
        'rsc': 'cyrillic',
        'bg': 'bulgarian',
        'uk': 'cyrillic',
        'be': 'cyrillic',
        'te': 'telugu',
        'kn': 'kannada',
        'ta': 'tamil',
        'latin': 'latin',  # Direct mapping for some codes
        'cyrillic': 'cyrillic',
        'devanagari': 'devanagari',
    }

    def __init__(self, **kwargs):
        """
        Initialize PaddleOCR engine.

        Args:
            **kwargs: Engine-specific settings. They are stored and used as
                the starting keyword arguments of every reader created by
                :meth:`get_reader`.
        """
        super().__init__(**kwargs)
        self._readers = {}  # Cache for readers, keyed by configuration

        # Store initialization settings to use in model initialization
        self._init_settings = kwargs

    def is_available(self) -> bool:
        """
        Check if PaddleOCR is installed.

        Returns:
            True if both ``paddleocr`` and ``paddle`` can be imported,
            False otherwise
        """
        try:
            import paddleocr  # noqa: F401 - imported only to probe availability
            import paddle  # noqa: F401
        except ImportError:
            return False
        return True

    def map_language(self, language: str) -> str:
        """
        Map ISO language code to PaddleOCR language code.

        Args:
            language: ISO language code (e.g., 'en', 'zh-cn'); matched
                case-insensitively

        Returns:
            PaddleOCR language code (e.g., 'en', 'ch'). Codes that are not
            in LANGUAGE_MAP fall back to 'en'.
        """
        return self.LANGUAGE_MAP.get(language.lower(), 'en')

    def get_reader(self, config: Dict[str, Any]):
        """
        Get or initialize a PaddleOCR reader based on configuration.

        Args:
            config: OCR configuration. "languages" (only the first entry is
                used), "device" and "model_settings" are read.

        Returns:
            PaddleOCR reader instance

        Raises:
            ImportError: If the ``paddleocr`` package cannot be found.
        """
        # Get primary language from config and map it to PaddleOCR format
        languages = config.get("languages", ["en"])
        primary_lang = self.map_language(languages[0]) if languages else 'en'

        # NOTE(review): "device" only takes part in the cache key below; it
        # is not translated into a PaddleOCR argument here. Callers that
        # need GPU execution presumably pass the relevant flag through
        # model_settings - confirm.
        device = config.get("device", "cpu")

        # Keyword arguments for the reader: initialization settings first,
        # then the language, then per-call model_settings on top.
        reader_kwargs = dict(self._init_settings)
        reader_kwargs["lang"] = primary_lang
        reader_kwargs.update(config.get("model_settings") or {})

        # Key the cache on everything that shapes the reader, so a reader
        # built with other settings is never handed back. "cls" is left out
        # because process_image passes it to each ocr() call, so toggling it
        # does not need a second model instance.
        key_items = sorted(
            ((name, value) for name, value in reader_kwargs.items() if name != "cls"),
            key=lambda item: str(item[0]),
        )
        cache_key = f"paddleocr_{device}_{key_items!r}"

        # Return cached reader if available
        if cache_key in self._readers:
            return self._readers[cache_key]

        # Check if paddleocr is installed
        if not importlib.util.find_spec("paddleocr"):
            raise ImportError(
                "PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr"
            )

        # Imported here so the module can be loaded without PaddleOCR.
        import paddleocr

        reader = paddleocr.PaddleOCR(**reader_kwargs)

        # Cache reader
        self._readers[cache_key] = reader
        return reader

    def process_image(self, image: Image.Image, config: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """
        Process an image with PaddleOCR.

        Args:
            image: PIL Image to process. Any other object is handed to
                PaddleOCR unchanged.
            config: OCR configuration; normalized with ``normalize_config``.
                ``None`` is treated like an empty dict.

        Returns:
            List of standardized OCR results, each a dict with 'bbox'
            (x0, y0, x1, y1), 'text', 'confidence' and 'source' ('ocr').
            An empty list is returned when OCR is disabled, when PaddleOCR
            raises, or when nothing is detected.
        """
        config = self.normalize_config({} if config is None else config)

        # Skip if OCR is disabled
        if not config.get("enabled"):
            return []

        reader = self.get_reader(config)

        if isinstance(image, Image.Image):
            img_array = np.array(image)
            if img_array.ndim == 3 and img_array.shape[2] == 3:
                # PaddleOCR expects BGR, PIL delivers RGB. The reversed
                # slice is a negative-stride view, so make it contiguous.
                img_array = np.ascontiguousarray(img_array[:, :, ::-1])
        else:
            img_array = image

        # PaddleOCR result format:
        # [
        #     [
        #         [[x1, y1], [x2, y2], [x3, y3], [x4, y4]],  # Detection box
        #         ('text', confidence)                       # Recognition result
        #     ],
        #     # More results...
        # ]
        use_cls = (config.get("model_settings") or {}).get("cls", False)
        self.logger.debug("PaddleOCR: Running OCR with cls=%s", use_cls)
        try:
            result = reader.ocr(img_array, cls=use_cls)
        except Exception:
            # Best effort: an engine failure yields no text for this image
            # instead of aborting the caller.
            self.logger.exception("PaddleOCR error")
            return []

        # PaddleOCR might return None or an empty list if no text is detected
        if not isinstance(result, list) or not result:
            self.logger.debug("PaddleOCR: Got no results (%s)", type(result).__name__)
            return []

        # The result is either a list of pages or a single page. A single
        # image yields one page, so take the first when pages are present.
        page_result = result[0] if isinstance(result[0], list) else result
        self.logger.debug("PaddleOCR: Got %d results", len(page_result))

        min_confidence = config.get("min_confidence", 0.5)

        standardized_results = []
        for detection in page_result:
            entry = self._standardize_detection(detection, min_confidence)
            if entry is not None:
                standardized_results.append(entry)
        return standardized_results

    def _standardize_detection(self, detection, min_confidence) -> Optional[Dict[str, Any]]:
        """Convert one PaddleOCR detection to the standardized result dict.

        Returns None when the detection is malformed or its confidence is
        below ``min_confidence``.
        """
        if not isinstance(detection, (list, tuple)) or len(detection) < 2:
            return None

        try:
            polygon = detection[0]  # [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
            recognition = detection[1]  # ('text', confidence)

            if isinstance(recognition, (tuple, list)) and len(recognition) >= 2:
                text = recognition[0]
                confidence = float(recognition[1])
            else:
                # Fallback if the format is unexpected
                text = str(recognition)
                confidence = 1.0

            if confidence < min_confidence:
                return None

            # Convert polygon bbox to its enclosing rectangle (x0, y0, x1, y1)
            x_coords = [point[0] for point in polygon]
            y_coords = [point[1] for point in polygon]

            return {
                'bbox': (min(x_coords), min(y_coords), max(x_coords), max(y_coords)),
                'text': text,
                'confidence': confidence,
                'source': 'ocr',
            }
        except (TypeError, ValueError, IndexError, KeyError) as exc:
            self.logger.warning("Error processing PaddleOCR detection: %s", exc)
            return None

    def __del__(self):
        """Cleanup resources when the engine is deleted."""
        # __del__ also runs when __init__ failed part-way, in which case
        # the cache attribute may not exist yet.
        readers = getattr(self, "_readers", None)
        if readers is not None:
            # Clear reader cache to free up memory
            readers.clear()
|