natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,158 @@
1
+ """
2
+ Base OCR engine interface.
3
+ """
4
+ import logging
5
+ from abc import ABC, abstractmethod
6
+ from typing import Dict, List, Any, Optional, Union
7
+ from PIL import Image
8
+
9
# Module-level logger used by the configuration helpers on OCREngine.
logger = logging.getLogger("natural_pdf.ocr.engine")


class OCREngine(ABC):
    """Abstract base class for OCR backends.

    Subclasses implement :meth:`process_image` and :meth:`is_available`.
    The configuration helpers :meth:`normalize_config` and
    :meth:`merge_configs` are shared by all engines.
    """

    def __init__(self, **kwargs):
        """
        Initialize with engine-specific settings.

        Args:
            **kwargs: Engine-specific settings (only logged here; subclasses
                decide how to use them)
        """
        # Per-class logger so messages can be attributed to the concrete engine.
        self.logger = logging.getLogger(f"natural_pdf.ocr.{self.__class__.__name__}")
        self.logger.debug(f"Initializing {self.__class__.__name__} with settings: {kwargs}")

    @abstractmethod
    def process_image(self, image: "Image.Image", config: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """
        Process an image and return standardized results.

        Args:
            image: PIL Image to process
            config: OCR configuration:
                - enabled: Whether OCR is enabled
                - languages: List of language codes (ISO format)
                - device: Device to use (e.g., 'cpu', 'cuda')
                - min_confidence: Threshold for result filtering
                - model_settings: Engine-specific settings

        Returns:
            List of standardized result dictionaries with:
                - 'bbox': (x0, y0, x1, y1) - Rectangle coordinates
                - 'text': Recognized text
                - 'confidence': Confidence score (0.0-1.0)

        Raises:
            NotImplementedError: Always, in this base implementation
        """
        raise NotImplementedError("Subclasses must implement this method")

    @abstractmethod
    def is_available(self) -> bool:
        """
        Check if this engine's dependencies are installed.

        Returns:
            True if the engine can be used, False otherwise
            (the base implementation returns False)
        """
        return False

    def normalize_config(self, config: Optional[Union[bool, str, List, Dict]] = None) -> Dict[str, Any]:
        """
        Normalize OCR configuration from various formats.

        Args:
            config: OCR configuration in various formats:
                - None / False: OCR disabled
                - True: OCR enabled with defaults
                - "auto": Auto OCR mode ("enabled" is set to the string "auto")
                - "fr": Any other string is treated as a single language code
                - ["en", "fr"]: Use these languages
                - {"languages": ["en"]}: Detailed configuration; when "enabled"
                  is absent, OCR is enabled only if non-empty "languages" or
                  "model_settings" are given

        Returns:
            A new configuration dictionary with the keys "enabled",
            "languages", "device", "min_confidence" and "model_settings"
            (plus any extra keys supplied in a dict config)
        """
        logger.debug("Normalizing OCR config: %s", config)

        # Defaults; "enabled" stays False unless a branch below turns it on.
        result = {
            "enabled": False,
            "languages": ["en"],
            "device": "cpu",
            "min_confidence": 0.5,
            "model_settings": {},
        }

        if config is None or config is False:
            # Explicitly disabled. False used to fall through to the
            # "unknown type" fallback and switch OCR on.
            pass
        elif config is True:
            result["enabled"] = True
        elif isinstance(config, str):
            if config.lower() == "auto":
                result["enabled"] = "auto"
            else:
                # Assume it's a language code
                result["enabled"] = True
                result["languages"] = [config]
        elif isinstance(config, (list, tuple)):
            # Sequence of languages; copy so the caller's object is not aliased.
            result["enabled"] = True
            result["languages"] = list(config)
        elif isinstance(config, dict):
            if "enabled" not in config:
                # Enable implicitly only when meaningful settings were supplied.
                result["enabled"] = bool(
                    config.get("languages") or config.get("model_settings")
                )
            result.update(config)
            # Always expose model_settings as a private dict, even if the
            # caller passed None or a dict they keep mutating.
            result["model_settings"] = dict(result.get("model_settings") or {})
        else:
            # Fallback for unknown types - enable by default
            result["enabled"] = True

        logger.debug("Normalized OCR config: %s", result)
        return result

    def merge_configs(self, base_config: Dict[str, Any], override_config: Dict[str, Any]) -> Dict[str, Any]:
        """
        Merge OCR configurations, with override_config taking precedence.

        "model_settings" is merged key by key; every other top-level key in
        override_config replaces the value from base_config. Neither input
        dictionary (nor its "model_settings") is modified.

        Args:
            base_config: Base configuration
            override_config: Configuration to override base with

        Returns:
            Merged configuration
        """
        logger.debug("Merging OCR configs: base=%s, override=%s", base_config, override_config)
        result = base_config.copy()

        if "model_settings" in override_config:
            # Build a fresh dict: base_config.copy() is shallow, so updating
            # the nested dict in place would leak into base_config.
            merged_settings = dict(result.get("model_settings") or {})
            merged_settings.update(override_config["model_settings"] or {})
            result["model_settings"] = merged_settings
        elif isinstance(result.get("model_settings"), dict):
            # Detach from base_config so later edits to the result stay local.
            result["model_settings"] = dict(result["model_settings"])

        # Merge other top-level keys
        for key, value in override_config.items():
            if key != "model_settings":  # Already handled above
                result[key] = value

        logger.debug("Merged OCR config result: %s", result)
        return result
@@ -0,0 +1,263 @@
1
+ """
2
+ PaddleOCR engine implementation.
3
+ """
4
+ import importlib.util
5
+ from typing import Dict, List, Any, Optional, Tuple
6
+ import numpy as np
7
+ from PIL import Image
8
+
9
+ from .engine import OCREngine
10
+
11
+
12
class PaddleOCREngine(OCREngine):
    """OCR engine backed by PaddleOCR.

    ``paddleocr.PaddleOCR`` readers are created lazily by :meth:`get_reader`
    and cached on the instance, keyed by language, device and model settings.
    Diagnostics go through ``self.logger`` (set up by ``OCREngine.__init__``)
    rather than being printed to stdout.
    """

    # Language code mapping from ISO to PaddleOCR codes
    LANGUAGE_MAP = {
        'en': 'en',
        'zh': 'ch',
        'zh-cn': 'ch',
        'zh-tw': 'chinese_cht',
        'ja': 'japan',
        'ko': 'korean',
        'th': 'thai',
        'fr': 'french',
        'de': 'german',
        'ru': 'russian',
        'ar': 'arabic',
        'hi': 'hindi',
        'vi': 'vietnam',
        # Persian and Urdu are written in Arabic script, so they belong to the
        # Arabic recognition model (they were previously mapped to 'cyrillic').
        'fa': 'arabic',
        'ur': 'arabic',
        'rs': 'serbian',
        'oc': 'latin',
        'rsc': 'cyrillic',
        'bg': 'bulgarian',
        'uk': 'cyrillic',
        'be': 'cyrillic',
        'te': 'telugu',
        'kn': 'kannada',
        'ta': 'tamil',
        'latin': 'latin',  # Direct mapping for some codes
        'cyrillic': 'cyrillic',
        'devanagari': 'devanagari',
    }

    def __init__(self, **kwargs):
        """
        Initialize PaddleOCR engine.

        Args:
            **kwargs: Engine-specific settings; they are forwarded to every
                ``paddleocr.PaddleOCR`` reader created by :meth:`get_reader`
        """
        super().__init__(**kwargs)
        self._readers = {}  # Cache for readers, see get_reader()

        # Store initialization settings to use in model initialization
        self._init_settings = kwargs

    def is_available(self) -> bool:
        """
        Check if PaddleOCR is installed.

        Returns:
            True if both ``paddleocr`` and ``paddle`` can be imported,
            False otherwise
        """
        try:
            import paddleocr  # noqa: F401 - imported only to probe availability
            import paddle  # noqa: F401
            return True
        except ImportError:
            return False

    def map_language(self, language: str) -> str:
        """
        Map ISO language code to PaddleOCR language code.

        Lookup is case-insensitive. A value that is already one of the
        PaddleOCR codes used in ``LANGUAGE_MAP`` (e.g. 'ch', 'japan') is
        passed through unchanged; anything else falls back to 'en'.

        Args:
            language: ISO language code (e.g., 'en', 'zh-cn')

        Returns:
            PaddleOCR language code (e.g., 'en', 'ch')
        """
        code = language.lower()
        if code in self.LANGUAGE_MAP:
            return self.LANGUAGE_MAP[code]
        if code in self.LANGUAGE_MAP.values():
            # Caller already supplied a native PaddleOCR code.
            return code
        self.logger.warning(
            "Unknown OCR language %r for PaddleOCR; falling back to 'en'", language
        )
        return 'en'

    def get_reader(self, config: Dict[str, Any]):
        """
        Get or initialize a PaddleOCR reader based on configuration.

        Only the first entry of ``config["languages"]`` is used. The reader
        is constructed from the engine's init settings, the mapped language
        and ``config["model_settings"]`` (later entries win).

        Args:
            config: OCR configuration

        Returns:
            PaddleOCR reader instance

        Raises:
            ImportError: If the ``paddleocr`` package cannot be found
        """
        # Get primary language from config and map it to PaddleOCR format
        languages = config.get("languages", ["en"])
        primary_lang = self.map_language(languages[0]) if languages else 'en'

        # NOTE(review): the device is part of the cache key only; it is not
        # forwarded to PaddleOCR here - confirm whether that is intended.
        device = config.get("device", "cpu")

        model_settings = config.get("model_settings") or {}

        # Key the cache on every setting that reaches the constructor, so a
        # reader built with different model settings is never reused. "cls" is
        # skipped because process_image() passes it per call to reader.ocr().
        settings_key = tuple(sorted(
            (str(name), repr(value))
            for name, value in model_settings.items()
            if name != "cls"
        ))
        cache_key = (primary_lang, device, settings_key)

        # Return cached reader if available
        cached = self._readers.get(cache_key)
        if cached is not None:
            return cached

        # Check if paddleocr is installed
        if importlib.util.find_spec("paddleocr") is None:
            raise ImportError(
                "PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr"
            )

        # Imported lazily so the package stays usable without PaddleOCR.
        import paddleocr

        # Init settings first, then language, then per-config model settings.
        reader_kwargs = self._init_settings.copy()
        reader_kwargs["lang"] = primary_lang
        reader_kwargs.update(model_settings)

        reader = paddleocr.PaddleOCR(**reader_kwargs)

        # Cache reader
        self._readers[cache_key] = reader
        return reader

    def process_image(self, image: Image.Image, config: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """
        Process an image with PaddleOCR.

        Args:
            image: PIL Image to process (any other object is handed to
                PaddleOCR unchanged)
            config: OCR configuration; it is passed through
                ``normalize_config`` and nothing is processed unless the
                normalized config is enabled

        Returns:
            List of standardized OCR results, each a dict with 'bbox'
            (x0, y0, x1, y1), 'text', 'confidence' and 'source' ('ocr').
            An empty list is returned when OCR is disabled, when PaddleOCR
            raises, or when nothing passes the confidence threshold.
        """
        # Normalize config
        if config is None:
            config = {}
        config = self.normalize_config(config)

        # Skip if OCR is disabled
        if not config.get("enabled"):
            return []

        reader = self.get_reader(config)
        img_array = self._to_bgr_array(image)

        # Use cls parameter from model_settings
        use_cls = (config.get("model_settings") or {}).get("cls", False)
        self.logger.debug("PaddleOCR: Running OCR with cls=%s", use_cls)

        try:
            result = reader.ocr(img_array, cls=use_cls)
        except Exception:
            # Deliberate best-effort: an OCR failure yields no results instead
            # of aborting the caller, but the traceback is still logged.
            self.logger.exception("PaddleOCR error")
            return []

        self.logger.debug("PaddleOCR: Got result type %s", type(result))

        # PaddleOCR might return None (or an empty list) if no text is detected
        if not isinstance(result, (list, tuple)) or len(result) == 0:
            self.logger.debug("PaddleOCR: Got empty result")
            return []

        # PaddleOCR result format (per the upstream examples):
        # [
        #     [
        #         [[x1, y1], [x2, y2], [x3, y3], [x4, y4]],  # Detection box
        #         ('text', confidence)                       # Recognition result
        #     ],
        #     # More results...
        # ]
        # The result is either one such page or a list of pages. If the first
        # item already looks like a detection, treat result as the page itself;
        # otherwise use the first page (a single image produces one page).
        if self._is_detection(result[0]):
            page_result = result
        elif isinstance(result[0], (list, tuple)):
            page_result = result[0]
        else:
            # e.g. a None page entry when nothing was detected
            page_result = []
        self.logger.debug("PaddleOCR: Got %d results", len(page_result))

        # Apply minimum confidence threshold
        min_confidence = config.get("min_confidence", 0.5)

        # Convert to standardized format
        standardized_results = []
        for detection in page_result:
            try:
                entry = self._standardize_detection(detection, min_confidence)
            except (TypeError, ValueError, IndexError) as e:
                # Malformed detection: skip it but keep the rest of the page.
                self.logger.warning("Error processing PaddleOCR detection: %s", e)
                continue
            if entry is not None:
                standardized_results.append(entry)

        return standardized_results

    @staticmethod
    def _to_bgr_array(image):
        """Return image as a numpy array in the channel order PaddleOCR expects."""
        if not isinstance(image, Image.Image):
            return image

        # np.array() copies the pixel data, so the original image is preserved.
        img_array = np.array(image)
        if img_array.ndim == 3 and img_array.shape[2] == 3:
            # PaddleOCR expects BGR, PIL delivers RGB. The reversed slice is a
            # negative-stride view, so materialize it as a contiguous array.
            img_array = np.ascontiguousarray(img_array[:, :, ::-1])
        return img_array

    @staticmethod
    def _is_detection(item) -> bool:
        """Return True if item looks like ``[box, (text, confidence)]``."""
        if not isinstance(item, (list, tuple)) or len(item) < 2:
            return False
        recognition = item[1]
        return (
            isinstance(recognition, (list, tuple))
            and len(recognition) >= 2
            and isinstance(recognition[0], str)
        )

    @staticmethod
    def _standardize_detection(detection, min_confidence) -> Optional[Dict[str, Any]]:
        """Convert one PaddleOCR detection into the standardized result dict.

        Returns None when the detection does not have the expected structure
        or its confidence is below min_confidence.
        """
        if not isinstance(detection, (list, tuple)) or len(detection) < 2:
            return None

        bbox = detection[0]  # [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]
        text_confidence = detection[1]  # ('text', confidence)

        if isinstance(text_confidence, (list, tuple)) and len(text_confidence) >= 2:
            text = text_confidence[0]
            confidence = float(text_confidence[1])
        else:
            # Fallback if the format is unexpected
            text = str(text_confidence)
            confidence = 1.0

        # Skip if confidence is below threshold
        if confidence < min_confidence:
            return None

        # Convert polygon bbox to its axis-aligned rectangle (x0, y0, x1, y1)
        x_coords = [point[0] for point in bbox]
        y_coords = [point[1] for point in bbox]

        return {
            'bbox': (min(x_coords), min(y_coords), max(x_coords), max(y_coords)),
            'text': text,
            'confidence': confidence,
            'source': 'ocr',
        }

    def __del__(self):
        """Cleanup resources when the engine is deleted."""
        # __del__ also runs if __init__ failed before _readers was assigned,
        # so look the attribute up defensively.
        readers = getattr(self, "_readers", None)
        if readers is not None:
            # Clear reader cache to free up memory
            readers.clear()
@@ -0,0 +1,3 @@
1
+ from natural_pdf.qa.document_qa import DocumentQA, get_qa_engine
2
+
3
+ __all__ = ["DocumentQA", "get_qa_engine"]