natural-pdf 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/layout/layout_analyzer.py +133 -44
- natural_pdf/analyzers/layout/layout_manager.py +9 -6
- natural_pdf/analyzers/layout/layout_options.py +2 -4
- natural_pdf/analyzers/layout/surya.py +199 -91
- natural_pdf/elements/region.py +52 -25
- natural_pdf-0.1.2.dist-info/METADATA +124 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.2.dist-info}/RECORD +10 -10
- natural_pdf-0.1.1.dist-info/METADATA +0 -295
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.2.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.2.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,11 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import List, Dict, Any, Optional, Union
|
3
3
|
from PIL import Image
|
4
|
+
import copy
|
4
5
|
|
5
6
|
from natural_pdf.elements.region import Region
|
6
7
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
7
|
-
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
8
|
+
from natural_pdf.analyzers.layout.layout_options import LayoutOptions, TATRLayoutOptions, BaseLayoutOptions
|
8
9
|
|
9
10
|
logger = logging.getLogger(__name__)
|
10
11
|
|
@@ -36,20 +37,25 @@ class LayoutAnalyzer:
|
|
36
37
|
classes: Optional[List[str]] = None,
|
37
38
|
exclude_classes: Optional[List[str]] = None,
|
38
39
|
device: Optional[str] = None,
|
39
|
-
existing: str = "replace"
|
40
|
+
existing: str = "replace",
|
41
|
+
**kwargs
|
40
42
|
) -> List[Region]:
|
41
43
|
"""
|
42
44
|
Analyze the page layout using the configured LayoutManager.
|
43
45
|
|
46
|
+
This method constructs the final options object, including internal context,
|
47
|
+
and passes it to the LayoutManager.
|
48
|
+
|
44
49
|
Args:
|
45
|
-
engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None.
|
46
|
-
options: Specific LayoutOptions object for advanced configuration.
|
50
|
+
engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None and no options object given.
|
51
|
+
options: Specific LayoutOptions object for advanced configuration. If provided, simple args (confidence, etc.) are ignored.
|
47
52
|
confidence: Minimum confidence threshold (simple mode).
|
48
53
|
classes: Specific classes to detect (simple mode).
|
49
54
|
exclude_classes: Classes to exclude (simple mode).
|
50
55
|
device: Device for inference (simple mode).
|
51
56
|
existing: How to handle existing detected regions: 'replace' (default) or 'append'.
|
52
|
-
|
57
|
+
**kwargs: Additional engine-specific arguments (added to options.extra_args or used by constructor if options=None).
|
58
|
+
|
53
59
|
Returns:
|
54
60
|
List of created Region objects.
|
55
61
|
"""
|
@@ -57,72 +63,139 @@ class LayoutAnalyzer:
|
|
57
63
|
logger.error(f"Page {self._page.number}: LayoutManager not available. Cannot analyze layout.")
|
58
64
|
return []
|
59
65
|
|
60
|
-
logger.info(f"Page {self._page.number}: Analyzing layout (Engine: {engine or 'default'}, Options: {options is not None})...")
|
66
|
+
logger.info(f"Page {self._page.number}: Analyzing layout (Engine: {engine or 'default'}, Options provided: {options is not None})...")
|
61
67
|
|
62
|
-
# --- Render Page Image ---
|
63
|
-
logger.debug(f" Rendering page {self._page.number} to image for layout
|
68
|
+
# --- Render Page Image (Standard Resolution) ---
|
69
|
+
logger.debug(f" Rendering page {self._page.number} to image for initial layout detection...")
|
64
70
|
try:
|
65
|
-
|
66
|
-
layout_scale = getattr(self._page._parent, '_config', {}).get('layout_image_scale', 1.5) # ~108 DPI default
|
71
|
+
layout_scale = getattr(self._page._parent, '_config', {}).get('layout_image_scale', 1.5)
|
67
72
|
layout_resolution = layout_scale * 72
|
68
|
-
|
69
|
-
|
70
|
-
|
73
|
+
std_res_page_image = self._page.to_image(resolution=layout_resolution, include_highlights=False)
|
74
|
+
if not std_res_page_image:
|
75
|
+
raise ValueError("Initial page rendering returned None")
|
76
|
+
logger.debug(f" Initial rendered image size: {std_res_page_image.width}x{std_res_page_image.height}")
|
71
77
|
except Exception as e:
|
72
|
-
logger.error(f" Failed to render page
|
78
|
+
logger.error(f" Failed to render initial page image: {e}", exc_info=True)
|
73
79
|
return []
|
80
|
+
|
81
|
+
# --- Calculate Scaling Factors (Standard Res Image <-> PDF) ---
|
82
|
+
if std_res_page_image.width == 0 or std_res_page_image.height == 0:
|
83
|
+
logger.error(f"Page {self._page.number}: Invalid initial rendered image dimensions. Cannot scale results.")
|
84
|
+
return []
|
85
|
+
img_scale_x = self._page.width / std_res_page_image.width
|
86
|
+
img_scale_y = self._page.height / std_res_page_image.height
|
87
|
+
logger.debug(f" StdRes Image -> PDF Scaling: x={img_scale_x:.4f}, y={img_scale_y:.4f}")
|
74
88
|
|
75
|
-
# ---
|
76
|
-
|
77
|
-
|
78
|
-
if
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
89
|
+
# --- Construct Final Options Object ---
|
90
|
+
final_options: BaseLayoutOptions
|
91
|
+
|
92
|
+
if options is not None:
|
93
|
+
# User provided a complete options object, use it directly
|
94
|
+
logger.debug("Using user-provided options object.")
|
95
|
+
final_options = copy.deepcopy(options) # Copy to avoid modifying original user object
|
96
|
+
if kwargs:
|
97
|
+
logger.warning(f"Ignoring kwargs {list(kwargs.keys())} because a full options object was provided.")
|
98
|
+
# Infer engine from options type if engine arg wasn't provided
|
99
|
+
if engine is None:
|
100
|
+
for name, registry_entry in self._layout_manager.ENGINE_REGISTRY.items():
|
101
|
+
if isinstance(final_options, registry_entry['options_class']):
|
102
|
+
engine = name
|
103
|
+
logger.debug(f"Inferred engine '{engine}' from options type.")
|
104
|
+
break
|
105
|
+
if engine is None:
|
106
|
+
logger.warning("Could not infer engine from provided options object.")
|
107
|
+
else:
|
108
|
+
# Construct options from simple args (engine, confidence, classes, etc.)
|
109
|
+
logger.debug("Constructing options from simple arguments.")
|
110
|
+
selected_engine = engine or self._layout_manager.get_available_engines()[0] # Use provided or first available
|
111
|
+
engine_lower = selected_engine.lower()
|
112
|
+
registry = self._layout_manager.ENGINE_REGISTRY
|
113
|
+
|
114
|
+
if engine_lower not in registry:
|
115
|
+
raise ValueError(f"Unknown or unavailable engine: '{selected_engine}'. Available: {list(registry.keys())}")
|
116
|
+
|
117
|
+
options_class = registry[engine_lower]['options_class']
|
118
|
+
|
119
|
+
# Get base defaults
|
120
|
+
base_defaults = BaseLayoutOptions()
|
121
|
+
|
122
|
+
# Prepare args for constructor, prioritizing explicit args over defaults
|
123
|
+
constructor_args = {
|
124
|
+
'confidence': confidence if confidence is not None else base_defaults.confidence,
|
125
|
+
'classes': classes, # Pass None if not provided
|
126
|
+
'exclude_classes': exclude_classes, # Pass None if not provided
|
127
|
+
'device': device if device is not None else base_defaults.device,
|
128
|
+
'extra_args': kwargs # Pass other kwargs here
|
129
|
+
}
|
130
|
+
# Remove None values unless they are valid defaults (like classes=None)
|
131
|
+
# We can pass all to the dataclass constructor; it handles defaults
|
132
|
+
|
133
|
+
try:
|
134
|
+
final_options = options_class(**constructor_args)
|
135
|
+
logger.debug(f"Constructed options: {final_options}")
|
136
|
+
except TypeError as e:
|
137
|
+
logger.error(f"Failed to construct options object {options_class.__name__} with args {constructor_args}: {e}")
|
138
|
+
# Filter kwargs to only include fields defined in the specific options class? Complex.
|
139
|
+
# Re-raise for now, indicates programming error or invalid kwarg.
|
140
|
+
raise e
|
141
|
+
|
142
|
+
# --- Add Internal Context to extra_args (ALWAYS) ---
|
143
|
+
if not hasattr(final_options, 'extra_args') or final_options.extra_args is None:
|
144
|
+
final_options.extra_args = {}
|
145
|
+
final_options.extra_args['_page_ref'] = self._page
|
146
|
+
final_options.extra_args['_img_scale_x'] = img_scale_x
|
147
|
+
final_options.extra_args['_img_scale_y'] = img_scale_y
|
148
|
+
logger.debug(f"Added internal context to final_options.extra_args: {final_options.extra_args}")
|
149
|
+
|
150
|
+
# --- Call Layout Manager with the Final Options ---
|
151
|
+
logger.debug(f"Calling Layout Manager with final options object.")
|
84
152
|
try:
|
85
|
-
|
153
|
+
# Pass only image and the constructed options object
|
154
|
+
detections = self._layout_manager.analyze_layout(
|
155
|
+
image=std_res_page_image,
|
156
|
+
options=final_options
|
157
|
+
# No engine, confidence, classes etc. passed here directly
|
158
|
+
)
|
86
159
|
logger.info(f" Layout Manager returned {len(detections)} detections.")
|
87
160
|
except Exception as e:
|
88
161
|
logger.error(f" Layout analysis failed: {e}", exc_info=True)
|
89
162
|
return []
|
90
163
|
|
91
|
-
# --- Process Detections (Convert to Regions, Scale Coords) ---
|
92
|
-
# Calculate scale factor to convert from image back to PDF coordinates
|
93
|
-
if page_image.width == 0 or page_image.height == 0:
|
94
|
-
logger.error(f"Page {self._page.number}: Invalid rendered image dimensions ({page_image.width}x{page_image.height}). Cannot scale layout results.")
|
95
|
-
return []
|
96
|
-
scale_x = self._page.width / page_image.width
|
97
|
-
scale_y = self._page.height / page_image.height
|
98
|
-
logger.debug(f" Scaling factors: x={scale_x:.4f}, y={scale_y:.4f}")
|
99
|
-
|
164
|
+
# --- Process Detections (Convert to Regions, Scale Coords from Image to PDF) ---
|
100
165
|
layout_regions = []
|
101
166
|
docling_id_to_region = {} # For hierarchy if using Docling
|
102
167
|
|
103
168
|
for detection in detections:
|
104
169
|
try:
|
170
|
+
# bbox is relative to std_res_page_image
|
105
171
|
x_min, y_min, x_max, y_max = detection['bbox']
|
106
172
|
|
107
173
|
# Convert coordinates from image to PDF space
|
108
|
-
pdf_x0 = x_min *
|
109
|
-
pdf_y0 = y_min *
|
110
|
-
pdf_x1 = x_max *
|
111
|
-
pdf_y1 = y_max *
|
112
|
-
|
113
|
-
#
|
174
|
+
pdf_x0 = x_min * img_scale_x
|
175
|
+
pdf_y0 = y_min * img_scale_y
|
176
|
+
pdf_x1 = x_max * img_scale_x
|
177
|
+
pdf_y1 = y_max * img_scale_y
|
178
|
+
|
179
|
+
# Ensure PDF coords are valid
|
180
|
+
pdf_x0, pdf_x1 = min(pdf_x0, pdf_x1), max(pdf_x0, pdf_x1)
|
181
|
+
pdf_y0, pdf_y1 = min(pdf_y0, pdf_y1), max(pdf_y0, pdf_y1)
|
182
|
+
pdf_x0 = max(0, pdf_x0)
|
183
|
+
pdf_y0 = max(0, pdf_y0)
|
184
|
+
pdf_x1 = min(self._page.width, pdf_x1)
|
185
|
+
pdf_y1 = min(self._page.height, pdf_y1)
|
186
|
+
|
187
|
+
# Create a Region object with PDF coordinates
|
114
188
|
region = Region(self._page, (pdf_x0, pdf_y0, pdf_x1, pdf_y1))
|
115
|
-
region.region_type = detection.get('class', 'unknown')
|
116
|
-
region.normalized_type = detection.get('normalized_class', 'unknown')
|
189
|
+
region.region_type = detection.get('class', 'unknown')
|
190
|
+
region.normalized_type = detection.get('normalized_class', 'unknown')
|
117
191
|
region.confidence = detection.get('confidence', 0.0)
|
118
|
-
region.model = detection.get('model', engine or 'unknown')
|
192
|
+
region.model = detection.get('model', engine or 'unknown')
|
119
193
|
region.source = 'detected'
|
120
|
-
|
194
|
+
|
121
195
|
# Add extra info if available
|
122
196
|
if 'text' in detection: region.text_content = detection['text']
|
123
197
|
if 'docling_id' in detection: region.docling_id = detection['docling_id']
|
124
198
|
if 'parent_id' in detection: region.parent_id = detection['parent_id']
|
125
|
-
# Add other fields like polygon, position, row/col index if needed
|
126
199
|
|
127
200
|
layout_regions.append(region)
|
128
201
|
|
@@ -163,4 +236,20 @@ class LayoutAnalyzer:
|
|
163
236
|
self._page.detected_layout_regions = self._page._regions['detected']
|
164
237
|
logger.info(f"Layout analysis complete for page {self._page.number}.")
|
165
238
|
|
239
|
+
# --- Auto-create cells if requested by TATR options ---
|
240
|
+
if isinstance(final_options, TATRLayoutOptions) and final_options.create_cells:
|
241
|
+
logger.info(f" Option create_cells=True detected for TATR. Attempting cell creation...")
|
242
|
+
created_cell_count = 0
|
243
|
+
for region in layout_regions:
|
244
|
+
# Only attempt on regions identified as tables by the TATR model
|
245
|
+
if region.model == 'tatr' and region.region_type == 'table':
|
246
|
+
try:
|
247
|
+
# create_cells now modifies the page elements directly and returns self
|
248
|
+
region.create_cells()
|
249
|
+
# We could potentially count cells created here if needed,
|
250
|
+
# but the method logs its own count.
|
251
|
+
except Exception as cell_error:
|
252
|
+
logger.warning(f" Error calling create_cells for table region {region.bbox}: {cell_error}")
|
253
|
+
logger.info(f" Finished cell creation process triggered by options.")
|
254
|
+
|
166
255
|
return layout_regions
|
@@ -120,9 +120,10 @@ class LayoutManager:
|
|
120
120
|
|
121
121
|
# --- Determine Options and Engine ---
|
122
122
|
if options is not None:
|
123
|
-
# Advanced Mode
|
124
|
-
|
125
|
-
|
123
|
+
# Advanced Mode: An options object was provided directly (or constructed by LayoutAnalyzer)
|
124
|
+
# Use this object directly, do not deep copy or reconstruct.
|
125
|
+
logger.debug(f"LayoutManager: Using provided options object: {type(options).__name__}")
|
126
|
+
final_options = options # Use the provided object directly
|
126
127
|
found_engine = False
|
127
128
|
for name, registry_entry in self.ENGINE_REGISTRY.items():
|
128
129
|
if isinstance(options, registry_entry['options_class']):
|
@@ -131,12 +132,14 @@ class LayoutManager:
|
|
131
132
|
break
|
132
133
|
if not found_engine:
|
133
134
|
raise TypeError(f"Provided options object type '{type(options).__name__}' does not match any registered layout engine options.")
|
135
|
+
# Ignore simple kwargs if options object is present
|
134
136
|
if kwargs:
|
135
|
-
logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored.")
|
137
|
+
logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside an 'options' object and will be ignored.")
|
136
138
|
else:
|
137
|
-
# Simple Mode
|
139
|
+
# Simple Mode: No options object provided initially.
|
140
|
+
# Determine engine from kwargs or default, then construct options.
|
138
141
|
selected_engine_name = default_engine.lower()
|
139
|
-
logger.debug(f"LayoutManager: Using simple mode
|
142
|
+
logger.debug(f"LayoutManager: Using simple mode. Engine: '{selected_engine_name}', kwargs: {kwargs}")
|
140
143
|
|
141
144
|
if selected_engine_name not in self.ENGINE_REGISTRY:
|
142
145
|
raise ValueError(f"Unknown or unavailable layout engine: '{selected_engine_name}'. Available: {available_engines}")
|
@@ -34,7 +34,7 @@ class TATRLayoutOptions(BaseLayoutOptions):
|
|
34
34
|
max_detection_size: int = 800
|
35
35
|
max_structure_size: int = 1000
|
36
36
|
# Whether to create cell regions (can be slow)
|
37
|
-
create_cells: bool =
|
37
|
+
create_cells: bool = True
|
38
38
|
|
39
39
|
# --- Paddle Specific Options ---
|
40
40
|
@dataclass
|
@@ -51,10 +51,8 @@ class PaddleLayoutOptions(BaseLayoutOptions):
|
|
51
51
|
@dataclass
|
52
52
|
class SuryaLayoutOptions(BaseLayoutOptions):
|
53
53
|
"""Options specific to Surya layout detection."""
|
54
|
-
# Surya doesn't seem to have many config options based on the example,
|
55
|
-
# but we can add placeholders if needed. Device is handled by BaseLayoutOptions.
|
56
54
|
model_name: str = "default" # Placeholder if different models become available
|
57
|
-
|
55
|
+
recognize_table_structure: bool = True # Automatically run table structure recognition?
|
58
56
|
|
59
57
|
# --- Docling Specific Options ---
|
60
58
|
@dataclass
|
@@ -3,6 +3,7 @@ import logging
|
|
3
3
|
import importlib.util
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
|
+
import copy
|
6
7
|
from typing import List, Dict, Any, Optional, Tuple
|
7
8
|
from PIL import Image
|
8
9
|
|
@@ -11,20 +12,23 @@ from .layout_options import SuryaLayoutOptions, BaseLayoutOptions
|
|
11
12
|
|
12
13
|
logger = logging.getLogger(__name__)
|
13
14
|
|
14
|
-
# Check for
|
15
|
+
# Check for dependencies
|
15
16
|
surya_spec = importlib.util.find_spec("surya")
|
16
17
|
LayoutPredictor = None
|
18
|
+
TableRecPredictor = None
|
19
|
+
|
17
20
|
if surya_spec:
|
18
21
|
try:
|
19
22
|
from surya.layout import LayoutPredictor
|
23
|
+
from surya.table_rec import TableRecPredictor
|
20
24
|
except ImportError as e:
|
21
|
-
logger.warning(f"Could not import Surya dependencies: {e}")
|
25
|
+
logger.warning(f"Could not import Surya dependencies (layout and/or table_rec): {e}")
|
22
26
|
else:
|
23
27
|
logger.warning("surya not found. SuryaLayoutDetector will not be available.")
|
24
28
|
|
25
29
|
|
26
30
|
class SuryaLayoutDetector(LayoutDetector):
|
27
|
-
"""Document layout detector using Surya models."""
|
31
|
+
"""Document layout and table structure detector using Surya models."""
|
28
32
|
|
29
33
|
def __init__(self):
|
30
34
|
super().__init__()
|
@@ -32,120 +36,224 @@ class SuryaLayoutDetector(LayoutDetector):
|
|
32
36
|
'text', 'pageheader', 'pagefooter', 'sectionheader',
|
33
37
|
'table', 'tableofcontents', 'picture', 'caption',
|
34
38
|
'heading', 'title', 'list', 'listitem', 'code',
|
35
|
-
'textinlinemath', 'mathformula', 'form'
|
39
|
+
'textinlinemath', 'mathformula', 'form',
|
40
|
+
'table-row', 'table-column'
|
36
41
|
}
|
37
|
-
#
|
42
|
+
self._page_ref = None # To store page reference from options
|
38
43
|
|
39
44
|
def is_available(self) -> bool:
|
40
|
-
|
41
|
-
return LayoutPredictor is not None
|
45
|
+
return LayoutPredictor is not None and TableRecPredictor is not None
|
42
46
|
|
43
47
|
def _get_cache_key(self, options: BaseLayoutOptions) -> str:
|
44
|
-
"""Generate cache key based on model name and device."""
|
45
48
|
if not isinstance(options, SuryaLayoutOptions):
|
46
|
-
options = SuryaLayoutOptions(device=options.device)
|
47
|
-
|
49
|
+
options = SuryaLayoutOptions(device=options.device)
|
48
50
|
device_key = str(options.device).lower() if options.device else 'default_device'
|
49
|
-
# Include model_name if it affects loading, otherwise device might be enough
|
50
51
|
model_key = options.model_name
|
51
52
|
return f"{self.__class__.__name__}_{device_key}_{model_key}"
|
52
53
|
|
53
|
-
def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
|
54
|
-
"""Load the Surya LayoutPredictor model."""
|
54
|
+
def _load_model_from_options(self, options: BaseLayoutOptions) -> Dict[str, Any]:
|
55
55
|
if not self.is_available():
|
56
|
-
raise RuntimeError("Surya
|
57
|
-
|
56
|
+
raise RuntimeError("Surya dependencies (surya.layout and surya.table_rec) not installed.")
|
58
57
|
if not isinstance(options, SuryaLayoutOptions):
|
59
58
|
raise TypeError("Incorrect options type provided for Surya model loading.")
|
60
|
-
|
61
|
-
|
62
|
-
try:
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
predictor = LayoutPredictor(**predictor_args)
|
68
|
-
self.logger.info("Surya LayoutPredictor loaded.")
|
69
|
-
return predictor
|
59
|
+
self.logger.info(f"Loading Surya models (device={options.device})...")
|
60
|
+
models = {}
|
61
|
+
try:
|
62
|
+
models['layout'] = LayoutPredictor()
|
63
|
+
models['table_rec'] = TableRecPredictor()
|
64
|
+
self.logger.info("Surya LayoutPredictor and TableRecPredictor loaded.")
|
65
|
+
return models
|
70
66
|
except Exception as e:
|
71
|
-
self.logger.error(f"Failed to load Surya
|
67
|
+
self.logger.error(f"Failed to load Surya models: {e}", exc_info=True)
|
72
68
|
raise
|
69
|
+
|
70
|
+
def _expand_bbox(self, bbox: Tuple[float, float, float, float],
|
71
|
+
padding: int, max_width: int, max_height: int) -> Tuple[int, int, int, int]:
|
72
|
+
"""Expand bbox by padding, clamping to max dimensions."""
|
73
|
+
x0, y0, x1, y1 = bbox
|
74
|
+
x0 = max(0, int(x0 - padding))
|
75
|
+
y0 = max(0, int(y0 - padding))
|
76
|
+
x1 = min(max_width, int(x1 + padding))
|
77
|
+
y1 = min(max_height, int(y1 + padding))
|
78
|
+
return x0, y0, x1, y1
|
73
79
|
|
74
80
|
def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
|
75
|
-
"""Detect layout elements in an image using Surya."""
|
81
|
+
"""Detect layout elements and optionally table structure in an image using Surya."""
|
76
82
|
if not self.is_available():
|
77
|
-
raise RuntimeError("Surya
|
83
|
+
raise RuntimeError("Surya dependencies (layout and table_rec) not installed.")
|
78
84
|
|
79
85
|
if not isinstance(options, SuryaLayoutOptions):
|
80
86
|
self.logger.warning("Received BaseLayoutOptions, expected SuryaLayoutOptions. Using defaults.")
|
81
87
|
options = SuryaLayoutOptions(
|
82
88
|
confidence=options.confidence, classes=options.classes,
|
83
89
|
exclude_classes=options.exclude_classes, device=options.device,
|
84
|
-
extra_args=options.extra_args
|
90
|
+
extra_args=options.extra_args,
|
91
|
+
recognize_table_structure=True
|
85
92
|
)
|
93
|
+
|
94
|
+
# Extract page reference and scaling factors from extra_args (passed by LayoutAnalyzer)
|
95
|
+
self._page_ref = options.extra_args.get('_page_ref')
|
96
|
+
img_scale_x = options.extra_args.get('_img_scale_x')
|
97
|
+
img_scale_y = options.extra_args.get('_img_scale_y')
|
98
|
+
|
99
|
+
# We still need this check, otherwise later steps that need these vars will fail
|
100
|
+
can_do_table_rec = options.recognize_table_structure and self._page_ref and img_scale_x is not None and img_scale_y is not None
|
101
|
+
if options.recognize_table_structure and not can_do_table_rec:
|
102
|
+
logger.warning("Surya table recognition cannot proceed without page reference and scaling factors. Disabling.")
|
103
|
+
options.recognize_table_structure = False
|
86
104
|
|
87
|
-
|
88
|
-
if options.
|
89
|
-
|
90
|
-
|
91
|
-
# Get the cached/loaded predictor instance
|
92
|
-
layout_predictor = self._get_model(options)
|
93
|
-
|
94
|
-
# Surya predictor takes a list of images
|
95
|
-
input_image_list = [image.convert("RGB")] # Ensure RGB
|
96
|
-
|
97
|
-
detections = []
|
98
|
-
try:
|
99
|
-
self.logger.debug("Running Surya layout prediction...")
|
100
|
-
# Call the predictor (returns a list of LayoutResult objects)
|
101
|
-
layout_predictions = layout_predictor(input_image_list)
|
102
|
-
self.logger.debug(f"Surya prediction returned {len(layout_predictions)} results.")
|
103
|
-
|
104
|
-
if not layout_predictions:
|
105
|
-
self.logger.warning("Surya returned empty predictions list.")
|
106
|
-
return []
|
107
|
-
|
108
|
-
# Process results for the first (and only) image
|
109
|
-
prediction = layout_predictions[0] # LayoutResult object
|
110
|
-
|
111
|
-
# Prepare normalized class filters once
|
112
|
-
normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
|
113
|
-
normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
|
114
|
-
|
115
|
-
for layout_box in prediction.bboxes:
|
116
|
-
# Extract the class name and normalize it
|
117
|
-
class_name_orig = layout_box.label
|
118
|
-
normalized_class = self._normalize_class_name(class_name_orig)
|
119
|
-
score = float(layout_box.confidence)
|
120
|
-
|
121
|
-
# Apply confidence threshold
|
122
|
-
if score < options.confidence: continue
|
123
|
-
|
124
|
-
# Apply class filtering
|
125
|
-
if normalized_classes_req and normalized_class not in normalized_classes_req: continue
|
126
|
-
if normalized_class in normalized_classes_excl: continue
|
127
|
-
|
128
|
-
# Extract bbox coordinates (Surya provides [x_min, y_min, x_max, y_max])
|
129
|
-
x_min, y_min, x_max, y_max = map(float, layout_box.bbox)
|
130
|
-
|
131
|
-
# Add detection
|
132
|
-
detection_data = {
|
133
|
-
'bbox': (x_min, y_min, x_max, y_max),
|
134
|
-
'class': class_name_orig,
|
135
|
-
'confidence': score,
|
136
|
-
'normalized_class': normalized_class,
|
137
|
-
'source': 'layout',
|
138
|
-
'model': 'surya'
|
139
|
-
# Add polygon etc. if needed, check attributes on layout_box
|
140
|
-
# 'polygon': layout_box.polygon if hasattr(layout_box, 'polygon') else None,
|
141
|
-
}
|
142
|
-
detections.append(detection_data)
|
143
|
-
|
144
|
-
self.logger.info(f"Surya detected {len(detections)} layout elements matching criteria.")
|
105
|
+
# Validate classes
|
106
|
+
if options.classes: self.validate_classes(options.classes)
|
107
|
+
if options.exclude_classes: self.validate_classes(options.exclude_classes)
|
145
108
|
|
146
|
-
|
147
|
-
|
148
|
-
|
109
|
+
models = self._get_model(options)
|
110
|
+
layout_predictor = models['layout']
|
111
|
+
table_rec_predictor = models['table_rec']
|
112
|
+
|
113
|
+
input_image = image.convert("RGB")
|
114
|
+
input_image_list = [input_image]
|
115
|
+
|
116
|
+
initial_layout_detections = [] # Detections relative to input_image
|
117
|
+
tables_to_process = []
|
118
|
+
|
119
|
+
# --- Initial Layout Detection ---
|
120
|
+
self.logger.debug("Running Surya layout prediction...")
|
121
|
+
layout_predictions = layout_predictor(input_image_list)
|
122
|
+
self.logger.debug(f"Surya prediction returned {len(layout_predictions)} results.")
|
123
|
+
if not layout_predictions: return []
|
124
|
+
prediction = layout_predictions[0]
|
125
|
+
|
126
|
+
normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
|
127
|
+
normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
|
128
|
+
|
129
|
+
for layout_box in prediction.bboxes:
|
130
|
+
class_name_orig = layout_box.label
|
131
|
+
normalized_class = self._normalize_class_name(class_name_orig)
|
132
|
+
score = float(layout_box.confidence)
|
133
|
+
|
134
|
+
if score < options.confidence: continue
|
135
|
+
if normalized_classes_req and normalized_class not in normalized_classes_req: continue
|
136
|
+
if normalized_class in normalized_classes_excl: continue
|
137
|
+
|
138
|
+
x_min, y_min, x_max, y_max = map(float, layout_box.bbox)
|
139
|
+
detection_data = {
|
140
|
+
'bbox': (x_min, y_min, x_max, y_max),
|
141
|
+
'class': class_name_orig,
|
142
|
+
'confidence': score,
|
143
|
+
'normalized_class': normalized_class,
|
144
|
+
'source': 'layout',
|
145
|
+
'model': 'surya'
|
146
|
+
}
|
147
|
+
initial_layout_detections.append(detection_data)
|
148
|
+
|
149
|
+
if options.recognize_table_structure and normalized_class in ('table', 'tableofcontents'):
|
150
|
+
tables_to_process.append(detection_data)
|
151
|
+
|
152
|
+
self.logger.info(f"Surya initially detected {len(initial_layout_detections)} layout elements matching criteria.")
|
153
|
+
|
154
|
+
# --- Table Structure Recognition (Optional) ---
|
155
|
+
if not options.recognize_table_structure or not tables_to_process:
|
156
|
+
self.logger.debug("Skipping Surya table structure recognition (disabled or no tables found).")
|
157
|
+
return initial_layout_detections
|
158
|
+
|
159
|
+
self.logger.info(f"Attempting Surya table structure recognition for {len(tables_to_process)} tables...")
|
160
|
+
high_res_crops = []
|
161
|
+
pdf_offsets = [] # Store (pdf_x0, pdf_y0) for each crop
|
162
|
+
|
163
|
+
high_res_dpi = getattr(self._page_ref._parent, '_config', {}).get('surya_table_rec_dpi', 192)
|
164
|
+
bbox_padding = getattr(self._page_ref._parent, '_config', {}).get('surya_table_bbox_padding', 10)
|
165
|
+
pdf_to_highres_scale = high_res_dpi / 72.0
|
166
|
+
|
167
|
+
# Render high-res page ONCE
|
168
|
+
self.logger.debug(f"Rendering page {self._page_ref.number} at {high_res_dpi} DPI for table recognition...")
|
169
|
+
high_res_page_image = self._page_ref.to_image(resolution=high_res_dpi, include_highlights=False)
|
170
|
+
if not high_res_page_image:
|
171
|
+
raise RuntimeError(f"Failed to render page {self._page_ref.number} at high resolution.")
|
172
|
+
self.logger.debug(f" High-res image size: {high_res_page_image.width}x{high_res_page_image.height}")
|
173
|
+
|
174
|
+
for i, table_detection in enumerate(tables_to_process):
|
175
|
+
img_x0, img_y0, img_x1, img_y1 = table_detection['bbox']
|
176
|
+
|
177
|
+
# PDF coords
|
178
|
+
pdf_x0 = img_x0 * img_scale_x
|
179
|
+
pdf_y0 = img_y0 * img_scale_y
|
180
|
+
pdf_x1 = img_x1 * img_scale_x
|
181
|
+
pdf_y1 = img_y1 * img_scale_y
|
182
|
+
pdf_x0 = max(0, pdf_x0)
|
183
|
+
pdf_y0 = max(0, pdf_y0)
|
184
|
+
pdf_x1 = min(self._page_ref.width, pdf_x1)
|
185
|
+
pdf_y1 = min(self._page_ref.height, pdf_y1)
|
186
|
+
|
187
|
+
# High-res image coords
|
188
|
+
hr_x0 = pdf_x0 * pdf_to_highres_scale
|
189
|
+
hr_y0 = pdf_y0 * pdf_to_highres_scale
|
190
|
+
hr_x1 = pdf_x1 * pdf_to_highres_scale
|
191
|
+
hr_y1 = pdf_y1 * pdf_to_highres_scale
|
192
|
+
|
193
|
+
# Expand high-res bbox
|
194
|
+
hr_x0_exp, hr_y0_exp, hr_x1_exp, hr_y1_exp = self._expand_bbox(
|
195
|
+
(hr_x0, hr_y0, hr_x1, hr_y1),
|
196
|
+
padding=bbox_padding,
|
197
|
+
max_width=high_res_page_image.width,
|
198
|
+
max_height=high_res_page_image.height
|
199
|
+
)
|
200
|
+
|
201
|
+
crop = high_res_page_image.crop((hr_x0_exp, hr_y0_exp, hr_x1_exp, hr_y1_exp))
|
202
|
+
high_res_crops.append(crop)
|
203
|
+
pdf_offsets.append((pdf_x0, pdf_y0))
|
204
|
+
|
205
|
+
if not high_res_crops:
|
206
|
+
self.logger.info("No valid high-resolution table crops generated.")
|
207
|
+
return initial_layout_detections
|
208
|
+
|
209
|
+
structure_detections = [] # Detections relative to std_res input_image
|
210
|
+
|
211
|
+
# --- Run Table Recognition (will raise error on failure) ---
|
212
|
+
self.logger.debug(f"Running Surya table recognition on {len(high_res_crops)} high-res images...")
|
213
|
+
table_predictions = table_rec_predictor(high_res_crops)
|
214
|
+
self.logger.debug(f"Surya table recognition returned {len(table_predictions)} results.")
|
215
|
+
|
216
|
+
# --- Process Results ---
|
217
|
+
if len(table_predictions) != len(pdf_offsets):
|
218
|
+
# This case is less likely if predictor didn't error, but good sanity check
|
219
|
+
raise RuntimeError(f"Mismatch between table inputs ({len(pdf_offsets)}) and predictions ({len(table_predictions)}).")
|
220
|
+
|
221
|
+
for table_pred, (offset_pdf_x0, offset_pdf_y0) in zip(table_predictions, pdf_offsets):
|
222
|
+
# Process Rows
|
223
|
+
for row_box in table_pred.rows:
|
224
|
+
crop_rx0, crop_ry0, crop_rx1, crop_ry1 = map(float, row_box.bbox)
|
225
|
+
pdf_row_x0 = offset_pdf_x0 + crop_rx0 / pdf_to_highres_scale
|
226
|
+
pdf_row_y0 = offset_pdf_y0 + crop_ry0 / pdf_to_highres_scale
|
227
|
+
pdf_row_x1 = offset_pdf_x0 + crop_rx1 / pdf_to_highres_scale
|
228
|
+
pdf_row_y1 = offset_pdf_y0 + crop_ry1 / pdf_to_highres_scale
|
229
|
+
img_row_x0 = pdf_row_x0 / img_scale_x
|
230
|
+
img_row_y0 = pdf_row_y0 / img_scale_y
|
231
|
+
img_row_x1 = pdf_row_x1 / img_scale_x
|
232
|
+
img_row_y1 = pdf_row_y1 / img_scale_y
|
233
|
+
structure_detections.append({
|
234
|
+
'bbox': (img_row_x0, img_row_y0, img_row_x1, img_row_y1),
|
235
|
+
'class': 'table-row', 'confidence': 1.0, 'normalized_class': 'table-row',
|
236
|
+
'source': 'layout', 'model': 'surya'
|
237
|
+
})
|
238
|
+
|
239
|
+
# Process Columns
|
240
|
+
for col_box in table_pred.cols:
|
241
|
+
crop_cx0, crop_cy0, crop_cx1, crop_cy1 = map(float, col_box.bbox)
|
242
|
+
pdf_col_x0 = offset_pdf_x0 + crop_cx0 / pdf_to_highres_scale
|
243
|
+
pdf_col_y0 = offset_pdf_y0 + crop_cy0 / pdf_to_highres_scale
|
244
|
+
pdf_col_x1 = offset_pdf_x0 + crop_cx1 / pdf_to_highres_scale
|
245
|
+
pdf_col_y1 = offset_pdf_y0 + crop_cy1 / pdf_to_highres_scale
|
246
|
+
img_col_x0 = pdf_col_x0 / img_scale_x
|
247
|
+
img_col_y0 = pdf_col_y0 / img_scale_y
|
248
|
+
img_col_x1 = pdf_col_x1 / img_scale_x
|
249
|
+
img_col_y1 = pdf_col_y1 / img_scale_y
|
250
|
+
structure_detections.append({
|
251
|
+
'bbox': (img_col_x0, img_col_y0, img_col_x1, img_col_y1),
|
252
|
+
'class': 'table-column', 'confidence': 1.0, 'normalized_class': 'table-column',
|
253
|
+
'source': 'layout', 'model': 'surya'
|
254
|
+
})
|
255
|
+
|
256
|
+
self.logger.info(f"Added {len(structure_detections)} table structure elements.")
|
149
257
|
|
150
|
-
return
|
258
|
+
return initial_layout_detections + structure_detections
|
151
259
|
|
natural_pdf/elements/region.py
CHANGED
@@ -1514,48 +1514,75 @@ class Region(DirectionalMixin):
|
|
1514
1514
|
|
1515
1515
|
def create_cells(self):
|
1516
1516
|
"""
|
1517
|
-
Create cell regions for a
|
1517
|
+
Create cell regions for a detected table by intersecting its
|
1518
|
+
row and column regions, and add them to the page.
|
1518
1519
|
|
1520
|
+
Assumes child row and column regions are already present on the page.
|
1521
|
+
|
1519
1522
|
Returns:
|
1520
|
-
|
1523
|
+
Self for method chaining.
|
1521
1524
|
"""
|
1522
|
-
|
1523
|
-
|
1525
|
+
# Ensure this is called on a table region
|
1526
|
+
if self.region_type not in ('table', 'tableofcontents'): # Allow for ToC which might have structure
|
1527
|
+
raise ValueError(f"create_cells should be called on a 'table' or 'tableofcontents' region, not '{self.region_type}'")
|
1524
1528
|
|
1525
|
-
# Find rows and columns
|
1526
|
-
|
1527
|
-
|
1529
|
+
# Find rows and columns associated with this page
|
1530
|
+
# Remove the model-specific filter
|
1531
|
+
rows = self.page.find_all('region[type=table-row]')
|
1532
|
+
columns = self.page.find_all('region[type=table-column]')
|
1528
1533
|
|
1529
|
-
# Filter to only include those that overlap with this table
|
1534
|
+
# Filter to only include those that overlap with this table region
|
1530
1535
|
def is_in_table(element):
|
1531
|
-
|
1532
|
-
|
1533
|
-
return (
|
1534
|
-
|
1536
|
+
# Use a simple overlap check (more robust than just center point)
|
1537
|
+
# Check if element's bbox overlaps with self.bbox
|
1538
|
+
return (element.x0 < self.x1 and element.x1 > self.x0 and
|
1539
|
+
element.top < self.bottom and element.bottom > self.top)
|
1535
1540
|
|
1536
1541
|
table_rows = [r for r in rows if is_in_table(r)]
|
1537
1542
|
table_columns = [c for c in columns if is_in_table(c)]
|
1538
1543
|
|
1544
|
+
if not table_rows or not table_columns:
|
1545
|
+
self._page.logger.warning(f"Region {self.bbox}: Cannot create cells. No overlapping row or column regions found.")
|
1546
|
+
return self # Return self even if no cells created
|
1547
|
+
|
1539
1548
|
# Sort rows and columns
|
1540
1549
|
table_rows.sort(key=lambda r: r.top)
|
1541
1550
|
table_columns.sort(key=lambda c: c.x0)
|
1542
1551
|
|
1543
|
-
# Create cells
|
1544
|
-
|
1552
|
+
# Create cells and add them to the page's element manager
|
1553
|
+
created_count = 0
|
1545
1554
|
for row in table_rows:
|
1546
1555
|
for column in table_columns:
|
1547
|
-
#
|
1548
|
-
|
1549
|
-
|
1550
|
-
)
|
1551
|
-
|
1552
|
-
|
1553
|
-
cell
|
1554
|
-
|
1555
|
-
|
1556
|
-
|
1556
|
+
# Calculate intersection bbox for the cell
|
1557
|
+
cell_x0 = max(row.x0, column.x0)
|
1558
|
+
cell_y0 = max(row.top, column.top)
|
1559
|
+
cell_x1 = min(row.x1, column.x1)
|
1560
|
+
cell_y1 = min(row.bottom, column.bottom)
|
1561
|
+
|
1562
|
+
# Only create a cell if the intersection is valid (positive width/height)
|
1563
|
+
if cell_x1 > cell_x0 and cell_y1 > cell_y0:
|
1564
|
+
# Create cell region at the intersection
|
1565
|
+
cell = self.page.create_region(
|
1566
|
+
cell_x0, cell_y0, cell_x1, cell_y1
|
1567
|
+
)
|
1568
|
+
# Set metadata
|
1569
|
+
cell.source = 'derived'
|
1570
|
+
cell.region_type = 'table-cell' # Explicitly set type
|
1571
|
+
cell.normalized_type = 'table-cell' # And normalized type
|
1572
|
+
# Inherit model from the parent table region
|
1573
|
+
cell.model = self.model
|
1574
|
+
cell.parent_region = self # Link cell to parent table region
|
1575
|
+
|
1576
|
+
# Add the cell region to the page's element manager
|
1577
|
+
self.page._element_mgr.add_region(cell)
|
1578
|
+
created_count += 1
|
1557
1579
|
|
1558
|
-
|
1580
|
+
# Optional: Add created cells to the table region's children
|
1581
|
+
# self.child_regions.extend(cells_created_in_this_call) # Needs list management
|
1582
|
+
|
1583
|
+
self._page.logger.info(f"Region {self.bbox} (Model: {self.model}): Created and added {created_count} cell regions.")
|
1584
|
+
|
1585
|
+
return self # Return self for chaining
|
1559
1586
|
|
1560
1587
|
def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
|
1561
1588
|
"""
|
@@ -0,0 +1,124 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: natural-pdf
|
3
|
+
Version: 0.1.2
|
4
|
+
Summary: A more intuitive interface for working with PDFs
|
5
|
+
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
|
+
License-Expression: MIT
|
7
|
+
Project-URL: Homepage, https://github.com/jsoma/natural-pdf
|
8
|
+
Project-URL: Repository, https://github.com/jsoma/natural-pdf
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
10
|
+
Classifier: Operating System :: OS Independent
|
11
|
+
Requires-Python: >=3.7
|
12
|
+
Description-Content-Type: text/markdown
|
13
|
+
License-File: LICENSE
|
14
|
+
Requires-Dist: pdfplumber>=0.7.0
|
15
|
+
Requires-Dist: Pillow>=8.0.0
|
16
|
+
Requires-Dist: colour>=0.1.5
|
17
|
+
Requires-Dist: numpy>=1.20.0
|
18
|
+
Requires-Dist: urllib3>=1.26.0
|
19
|
+
Requires-Dist: torch>=2.0.0
|
20
|
+
Requires-Dist: torchvision>=0.15.0
|
21
|
+
Requires-Dist: transformers>=4.30.0
|
22
|
+
Requires-Dist: huggingface_hub>=0.19.0
|
23
|
+
Provides-Extra: interactive
|
24
|
+
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
|
25
|
+
Provides-Extra: easyocr
|
26
|
+
Requires-Dist: easyocr; extra == "easyocr"
|
27
|
+
Provides-Extra: paddle
|
28
|
+
Requires-Dist: paddlepaddle; extra == "paddle"
|
29
|
+
Requires-Dist: paddleocr; extra == "paddle"
|
30
|
+
Provides-Extra: layout-yolo
|
31
|
+
Requires-Dist: doclayout_yolo; extra == "layout-yolo"
|
32
|
+
Provides-Extra: surya
|
33
|
+
Requires-Dist: surya-ocr; extra == "surya"
|
34
|
+
Provides-Extra: qa
|
35
|
+
Provides-Extra: all
|
36
|
+
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "all"
|
37
|
+
Requires-Dist: easyocr; extra == "all"
|
38
|
+
Requires-Dist: paddlepaddle; extra == "all"
|
39
|
+
Requires-Dist: paddleocr; extra == "all"
|
40
|
+
Requires-Dist: doclayout_yolo; extra == "all"
|
41
|
+
Requires-Dist: surya-ocr; extra == "all"
|
42
|
+
Dynamic: license-file
|
43
|
+
|
44
|
+
# Natural PDF
|
45
|
+
|
46
|
+
A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
|
47
|
+
|
48
|
+
Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
|
49
|
+
|
50
|
+
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
|
51
|
+
- [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
|
52
|
+
|
53
|
+
<div style="max-width: 400px; margin: auto"><a href="sample-screen.png"><img src="sample-screen.png"></a></div>
|
54
|
+
|
55
|
+
## Installation
|
56
|
+
|
57
|
+
```bash
|
58
|
+
pip install natural-pdf
|
59
|
+
```
|
60
|
+
|
61
|
+
For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install extras:
|
62
|
+
|
63
|
+
```bash
|
64
|
+
# Example: Install with EasyOCR support
|
65
|
+
pip install natural-pdf[easyocr]
|
66
|
+
pip install natural-pdf[surya]
|
67
|
+
pip install natural-pdf[paddle]
|
68
|
+
|
69
|
+
# Example: Install with interactive viewer support
|
70
|
+
pip install natural-pdf[interactive]
|
71
|
+
|
72
|
+
# Install everything
|
73
|
+
pip install natural-pdf[all]
|
74
|
+
```
|
75
|
+
|
76
|
+
See the [installation guide](https://jsoma.github.io/natural-pdf/installation/) for more details on extras.
|
77
|
+
|
78
|
+
## Quick Start
|
79
|
+
|
80
|
+
```python
|
81
|
+
from natural_pdf import PDF
|
82
|
+
|
83
|
+
# Open a PDF
|
84
|
+
pdf = PDF('document.pdf')
|
85
|
+
page = pdf.pages[0]
|
86
|
+
|
87
|
+
# Find elements using CSS-like selectors
|
88
|
+
heading = page.find('text:contains("Summary"):bold')
|
89
|
+
|
90
|
+
# Extract content below the heading
|
91
|
+
content = heading.below().extract_text()
|
92
|
+
print("Content below Summary:", content[:100] + "...")
|
93
|
+
|
94
|
+
# Exclude headers/footers automatically (example)
|
95
|
+
# You might define these based on common text or position
|
96
|
+
page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
|
97
|
+
page.add_exclusion(page.find_all('line')[-1].below())
|
98
|
+
|
99
|
+
# Extract clean text from the page
|
100
|
+
clean_text = page.extract_text()
|
101
|
+
print("\nClean page text:", clean_text[:200] + "...")
|
102
|
+
|
103
|
+
# Highlight the heading and view the page
|
104
|
+
heading.highlight(color='red')
|
105
|
+
page.to_image()
|
106
|
+
```
|
107
|
+
|
108
|
+
And as a fun bonus, `page.viewer()` will provide an interactive method to explore the PDF.
|
109
|
+
|
110
|
+
## Key Features
|
111
|
+
|
112
|
+
Natural PDF offers a range of features for working with PDFs:
|
113
|
+
|
114
|
+
* **CSS-like Selectors:** Find elements using intuitive query strings (`page.find('text:bold')`).
|
115
|
+
* **Spatial Navigation:** Select content relative to other elements (`heading.below()`, `element.select_until(...)`).
|
116
|
+
* **Text & Table Extraction:** Get clean text or structured table data, automatically handling exclusions.
|
117
|
+
* **OCR Integration:** Extract text from scanned documents using engines like EasyOCR, PaddleOCR, or Surya.
|
118
|
+
* **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using AI models.
|
119
|
+
* **Document QA:** Ask natural language questions about your document's content.
|
120
|
+
* **Visual Debugging:** Highlight elements and use an interactive viewer or save images to understand your selections.
|
121
|
+
|
122
|
+
## Learn More
|
123
|
+
|
124
|
+
Dive deeper into the features and explore advanced usage in the [**Complete Documentation**](https://jsoma.github.io/natural-pdf).
|
@@ -6,11 +6,11 @@ natural_pdf/analyzers/utils.py,sha256=u5_FAUPmEG1ydPVuxpu7bVw507NB3WzisMNSUhsnuk
|
|
6
6
|
natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
|
7
7
|
natural_pdf/analyzers/layout/base.py,sha256=D6KHDsbVKzZWCfW4vt0khPC3TA9JzQD3cF4VtTSyf28,6752
|
8
8
|
natural_pdf/analyzers/layout/docling.py,sha256=iNeD10ZfolDVJNqayAUd0-Bs2tVr5INE7WK9c_Mll_8,11930
|
9
|
-
natural_pdf/analyzers/layout/layout_analyzer.py,sha256=
|
10
|
-
natural_pdf/analyzers/layout/layout_manager.py,sha256=
|
11
|
-
natural_pdf/analyzers/layout/layout_options.py,sha256=
|
9
|
+
natural_pdf/analyzers/layout/layout_analyzer.py,sha256=JJasXl7QEiP4DgAvf-zu1w7Uakdf8ypvITkpQ-OQDgA,13340
|
10
|
+
natural_pdf/analyzers/layout/layout_manager.py,sha256=6Zi9SBonpa0urWyeQBJnmxIL1hOn4xAx09ugkMrEhro,9555
|
11
|
+
natural_pdf/analyzers/layout/layout_options.py,sha256=EmvPEnDsVGMJkDNfn6ORLnX545gbmlo3kVcz4anVm5Q,3325
|
12
12
|
natural_pdf/analyzers/layout/paddle.py,sha256=QCasH_Z9UITX6wRGlE_HjmwkBuANz9Yyw5Yk7QvRVcw,12519
|
13
|
-
natural_pdf/analyzers/layout/surya.py,sha256=
|
13
|
+
natural_pdf/analyzers/layout/surya.py,sha256=Ibwo42TioJ-BZP3-2T13KCtH3kLSWQh7C9ZYuk1kUQo,12657
|
14
14
|
natural_pdf/analyzers/layout/tatr.py,sha256=H0Xygk9jA46-vlPleoal94cuDyz-LHTSxVb3e6gpmV8,11956
|
15
15
|
natural_pdf/analyzers/layout/yolo.py,sha256=NSQK3TcS1qN8D2MDxCvcwTpS_kvzGy3I2LepJDUceoQ,7699
|
16
16
|
natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
|
@@ -23,7 +23,7 @@ natural_pdf/elements/base.py,sha256=9SQ-O2qbQe9Avbf9JI-p6vWlyThZVch-p1yqXWSrBHw,
|
|
23
23
|
natural_pdf/elements/collections.py,sha256=RJf4cBZeLfCtfS0-SjzYFRCtbzYjWsgk3LrcTwJAYMs,62392
|
24
24
|
natural_pdf/elements/line.py,sha256=QvVdhf_K6rwJkq3q67JmgdZpDhrBgWuSMF-Q25malP4,4783
|
25
25
|
natural_pdf/elements/rect.py,sha256=dls9g-R213O78HvfAJMak3_eV14Zh654Zw7hqTTXxDQ,3949
|
26
|
-
natural_pdf/elements/region.py,sha256=
|
26
|
+
natural_pdf/elements/region.py,sha256=sfYWLn1nii7o7lqY_fTyJN2fd__Cg_9euGsZDQUQffA,74242
|
27
27
|
natural_pdf/elements/text.py,sha256=OAuy0ozaemj6yjMwhXPsJ76VZtRPeJbmrFTzpDJA2_U,11017
|
28
28
|
natural_pdf/ocr/__init__.py,sha256=mbUUsCfeU6yRsEqNn3I4Len-XY6FfjfKhTAoWDLA1f4,1943
|
29
29
|
natural_pdf/ocr/engine.py,sha256=xDnvhnm4Lr7d83ezglDqOtl9xfx74zOOTyYW-fZHQEQ,4183
|
@@ -45,8 +45,8 @@ natural_pdf/utils/visualization.py,sha256=14BM-K4ovDqHniNbxbP_y9KaEYNlkbpELGAv9_
|
|
45
45
|
natural_pdf/widgets/__init__.py,sha256=qckw3DjdVTsASPLJ8uUrGKg3MFhvzHndUpeNGlqwg6A,215
|
46
46
|
natural_pdf/widgets/viewer.py,sha256=h_amj_uvf-vRqEsFg4P00fgKxawLAd9jjC1ohUza4BY,37479
|
47
47
|
natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
|
48
|
-
natural_pdf-0.1.
|
49
|
-
natural_pdf-0.1.
|
50
|
-
natural_pdf-0.1.
|
51
|
-
natural_pdf-0.1.
|
52
|
-
natural_pdf-0.1.
|
48
|
+
natural_pdf-0.1.2.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
49
|
+
natural_pdf-0.1.2.dist-info/METADATA,sha256=NQQGLJQVgbbxkyj4UZW-wkmdQLfDGzu7U-UswwiojGU,4453
|
50
|
+
natural_pdf-0.1.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
51
|
+
natural_pdf-0.1.2.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
|
52
|
+
natural_pdf-0.1.2.dist-info/RECORD,,
|
@@ -1,295 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: natural-pdf
|
3
|
-
Version: 0.1.1
|
4
|
-
Summary: A more intuitive interface for working with PDFs
|
5
|
-
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
|
-
License-Expression: MIT
|
7
|
-
Project-URL: Homepage, https://github.com/jsoma/natural-pdf
|
8
|
-
Project-URL: Repository, https://github.com/jsoma/natural-pdf
|
9
|
-
Classifier: Programming Language :: Python :: 3
|
10
|
-
Classifier: Operating System :: OS Independent
|
11
|
-
Requires-Python: >=3.7
|
12
|
-
Description-Content-Type: text/markdown
|
13
|
-
License-File: LICENSE
|
14
|
-
Requires-Dist: pdfplumber>=0.7.0
|
15
|
-
Requires-Dist: Pillow>=8.0.0
|
16
|
-
Requires-Dist: colour>=0.1.5
|
17
|
-
Requires-Dist: numpy>=1.20.0
|
18
|
-
Requires-Dist: urllib3>=1.26.0
|
19
|
-
Requires-Dist: torch>=2.0.0
|
20
|
-
Requires-Dist: torchvision>=0.15.0
|
21
|
-
Requires-Dist: transformers>=4.30.0
|
22
|
-
Requires-Dist: huggingface_hub>=0.19.0
|
23
|
-
Provides-Extra: interactive
|
24
|
-
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
|
25
|
-
Provides-Extra: easyocr
|
26
|
-
Requires-Dist: easyocr; extra == "easyocr"
|
27
|
-
Provides-Extra: paddle
|
28
|
-
Requires-Dist: paddlepaddle; extra == "paddle"
|
29
|
-
Requires-Dist: paddleocr; extra == "paddle"
|
30
|
-
Provides-Extra: layout-yolo
|
31
|
-
Requires-Dist: doclayout_yolo; extra == "layout-yolo"
|
32
|
-
Provides-Extra: surya
|
33
|
-
Requires-Dist: surya-ocr; extra == "surya"
|
34
|
-
Provides-Extra: qa
|
35
|
-
Provides-Extra: all
|
36
|
-
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "all"
|
37
|
-
Requires-Dist: easyocr; extra == "all"
|
38
|
-
Requires-Dist: paddlepaddle; extra == "all"
|
39
|
-
Requires-Dist: paddleocr; extra == "all"
|
40
|
-
Requires-Dist: doclayout_yolo; extra == "all"
|
41
|
-
Requires-Dist: surya-ocr; extra == "all"
|
42
|
-
Dynamic: license-file
|
43
|
-
|
44
|
-
# Natural PDF
|
45
|
-
|
46
|
-
A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
|
47
|
-
|
48
|
-
Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
|
49
|
-
|
50
|
-
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
|
51
|
-
- [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
|
52
|
-
|
53
|
-
## Features
|
54
|
-
|
55
|
-
- **Fluent API** for chaining operations
|
56
|
-
- **CSS-like selectors** for finding elements
|
57
|
-
- **Spatial navigation** with intuitive methods like `above()`, `below()`, and `select_until()`
|
58
|
-
- **Element collections** for batch operations
|
59
|
-
- **Visual highlighting** for debugging (persistent highlights)
|
60
|
-
- **Interactive element viewer** for Jupyter environments (`.viewer()`)
|
61
|
-
- **Region visualization** with direct image extraction of specific regions
|
62
|
-
- **Text style analysis** for document structure
|
63
|
-
- **Exclusion zones** for headers, footers, and other areas to ignore
|
64
|
-
- **OCR integration** with multiple engines (EasyOCR, PaddleOCR, Surya)
|
65
|
-
- **Document layout analysis** for detecting document structure with ML models
|
66
|
-
- **Table extraction** with multiple detection methods
|
67
|
-
- **Structured logging** with configurable levels and handlers
|
68
|
-
|
69
|
-
## Installation
|
70
|
-
|
71
|
-
```bash
|
72
|
-
pip install natural-pdf
|
73
|
-
```
|
74
|
-
|
75
|
-
# Installs the core library along with required AI dependencies (PyTorch, Transformers)
|
76
|
-
```bash
|
77
|
-
# Install with support for specific OCR and layout engines
|
78
|
-
pip install natural-pdf[easyocr]
|
79
|
-
pip install natural-pdf[paddle]
|
80
|
-
pip install natural-pdf[surya]
|
81
|
-
pip install natural-pdf[layout_yolo]
|
82
|
-
|
83
|
-
# Install with support for the interactive Jupyter widget
|
84
|
-
pip install natural-pdf[interactive]
|
85
|
-
|
86
|
-
# Just install everything
|
87
|
-
pip install natural-pdf[all]
|
88
|
-
```
|
89
|
-
|
90
|
-
## Quick Start
|
91
|
-
|
92
|
-
```python
|
93
|
-
from natural_pdf import PDF
|
94
|
-
|
95
|
-
# Open a local PDF
|
96
|
-
pdf = PDF('document.pdf')
|
97
|
-
|
98
|
-
# Or open a PDF from a URL
|
99
|
-
pdf = PDF('https://example.com/document.pdf')
|
100
|
-
|
101
|
-
# Get the first page
|
102
|
-
page = pdf.pages[0]
|
103
|
-
|
104
|
-
# Find elements using CSS-like selectors
|
105
|
-
heading = page.find('text:contains("Summary"):bold')
|
106
|
-
|
107
|
-
# Extract content below the heading
|
108
|
-
content = heading.below().extract_text()
|
109
|
-
print(content)
|
110
|
-
|
111
|
-
# Exclude headers and footers
|
112
|
-
page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
|
113
|
-
page.add_exclusion(page.find_all('line')[-1].below())
|
114
|
-
|
115
|
-
# Extract clean text
|
116
|
-
clean_text = page.extract_text()
|
117
|
-
print(clean_text)
|
118
|
-
```
|
119
|
-
|
120
|
-
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
|
121
|
-
|
122
|
-
## Selectors
|
123
|
-
|
124
|
-
The library supports CSS-like selectors for finding elements:
|
125
|
-
|
126
|
-
```python
|
127
|
-
# Find text containing a specific string
|
128
|
-
element = page.find('text:contains("Revenue")')
|
129
|
-
|
130
|
-
# Find bold text with a specific font size
|
131
|
-
headings = page.find_all('text[size>=12]:bold')
|
132
|
-
|
133
|
-
# Find thick red lines
|
134
|
-
lines = page.find_all('line[width>=2][color~=(1,0,0)]')
|
135
|
-
```
|
136
|
-
|
137
|
-
## Spatial Navigation
|
138
|
-
|
139
|
-
Navigate through the document with intuitive spatial methods:
|
140
|
-
|
141
|
-
```python
|
142
|
-
# Get content below a heading
|
143
|
-
heading = page.find('text:contains("Introduction")')
|
144
|
-
content = heading.below().extract_text()
|
145
|
-
|
146
|
-
# Get content from one element to another
|
147
|
-
start = page.find('text:contains("Start")')
|
148
|
-
end = page.find('text:contains("End")')
|
149
|
-
region = start.select_until(end)
|
150
|
-
content = region.extract_text()
|
151
|
-
```
|
152
|
-
|
153
|
-
## Exclusion Zones
|
154
|
-
|
155
|
-
Exclude headers, footers, or other areas from extraction:
|
156
|
-
|
157
|
-
```python
|
158
|
-
# Page-level exclusion
|
159
|
-
page.add_exclusion(page.find('text:contains("Page")').above())
|
160
|
-
page.add_exclusion(page.find_all('line')[-1].below())
|
161
|
-
|
162
|
-
# PDF-level exclusion with lambdas
|
163
|
-
pdf.add_exclusion(
|
164
|
-
lambda page: page.find('text:contains("Header")').above(),
|
165
|
-
label="headers"
|
166
|
-
)
|
167
|
-
|
168
|
-
# Extract text with exclusions applied
|
169
|
-
text = pdf.extract_text()
|
170
|
-
|
171
|
-
# Extract from a specific region with exclusions
|
172
|
-
summary = page.find('text:contains("Summary")')
|
173
|
-
conclusion = page.find('text:contains("Conclusion")')
|
174
|
-
region = page.create_region(summary.x0, summary.top, conclusion.x1, conclusion.bottom)
|
175
|
-
region_text = region.extract_text(apply_exclusions=True) # Excludes headers/footers
|
176
|
-
|
177
|
-
# Disable exclusions for a specific extraction
|
178
|
-
full_text = page.extract_text(apply_exclusions=False)
|
179
|
-
```
|
180
|
-
|
181
|
-
Exclusions work efficiently with different region types:
|
182
|
-
- Regions without intersection with exclusion zones → exclusions ignored entirely
|
183
|
-
- Rectangular regions with header/footer exclusions → optimized cropping
|
184
|
-
- Complex regions with partial exclusions → advanced filtering with warning
|
185
|
-
|
186
|
-
## OCR Integration
|
187
|
-
|
188
|
-
Extract text from scanned documents using OCR, with support for multiple engines ([EasyOCR](https://www.jaided.ai/easyocr/), [PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html), [Surya](https://github.com/VikParuchuri/surya)):
|
189
|
-
|
190
|
-
```python
|
191
|
-
# Apply OCR using a specific engine (e.g., PaddleOCR)
|
192
|
-
ocr_elements = page.apply_ocr(engine='paddle', languages=['en', 'zh-cn'])
|
193
|
-
|
194
|
-
# Extract text (will use previously applied OCR results if available)
|
195
|
-
text = page.extract_text()
|
196
|
-
|
197
|
-
# Configure advanced engine options using Options classes
|
198
|
-
from natural_pdf.ocr import PaddleOCROptions
|
199
|
-
paddle_opts = PaddleOCROptions(languages=['en'], use_angle_cls=False, rec_batch_num=8)
|
200
|
-
ocr_elements = page.apply_ocr(engine='paddle', options=paddle_opts)
|
201
|
-
|
202
|
-
# Force OCR regardless of existing text
|
203
|
-
ocr_text = page.extract_text(ocr=True)
|
204
|
-
|
205
|
-
# Find OCR-detected text with high confidence
|
206
|
-
high_confidence = page.find_all('text[source=ocr][confidence>=0.8]')
|
207
|
-
|
208
|
-
# Visualize OCR results with color-coded confidence levels
|
209
|
-
for elem in page.find_all('text[source=ocr]'):
|
210
|
-
if elem.confidence >= 0.8:
|
211
|
-
color = (0, 1, 0, 0.3) # Green for high confidence
|
212
|
-
elif elem.confidence >= 0.5:
|
213
|
-
color = (1, 1, 0, 0.3) # Yellow for medium confidence
|
214
|
-
else:
|
215
|
-
color = (1, 0, 0, 0.3) # Red for low confidence
|
216
|
-
|
217
|
-
elem.highlight(color=color, label=f"OCR ({elem.confidence:.2f})")
|
218
|
-
page.save_image('ocr_results.png', labels=True)
|
219
|
-
```
|
220
|
-
|
221
|
-
## Logging
|
222
|
-
|
223
|
-
The library includes a structured logging system to provide visibility into its operations:
|
224
|
-
|
225
|
-
```python
|
226
|
-
import logging
|
227
|
-
from natural_pdf import PDF, configure_logging
|
228
|
-
|
229
|
-
# Configure logging with INFO level to console
|
230
|
-
configure_logging(level=logging.INFO)
|
231
|
-
|
232
|
-
# Or log to a file with DEBUG level
|
233
|
-
file_handler = logging.FileHandler("natural_pdf.log")
|
234
|
-
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
235
|
-
configure_logging(level=logging.DEBUG, handler=file_handler)
|
236
|
-
|
237
|
-
# Now operations will generate logs
|
238
|
-
pdf = PDF("document.pdf")
|
239
|
-
# Log: natural_pdf.core.pdf - INFO - Initializing PDF from document.pdf
|
240
|
-
|
241
|
-
# Run layout detection with verbose logging
|
242
|
-
regions = pdf.pages[0].analyze_layout(
|
243
|
-
model="paddle",
|
244
|
-
model_params={"verbose": True}
|
245
|
-
)
|
246
|
-
# Log: natural_pdf.analyzers.layout.paddle - INFO - Starting PaddleLayout detection...
|
247
|
-
# Log: natural_pdf.analyzers.layout.paddle - DEBUG - Parameters: confidence=0.2...
|
248
|
-
```
|
249
|
-
|
250
|
-
Logs follow a hierarchical structure matching the library's module organization:
|
251
|
-
- `natural_pdf.core` - Core PDF operations
|
252
|
-
- `natural_pdf.analyzers` - Layout analysis operations
|
253
|
-
- `natural_pdf.ocr` - OCR engine operations
|
254
|
-
|
255
|
-
## Document QA
|
256
|
-
|
257
|
-
Ask questions directly to your documents:
|
258
|
-
|
259
|
-
```python
|
260
|
-
# Ask questions about the document content
|
261
|
-
result = pdf.ask("What was the company's revenue in 2022?")
|
262
|
-
print(f"Answer: {result['answer']}")
|
263
|
-
print(f"Confidence: {result['confidence']:.2f}")
|
264
|
-
|
265
|
-
# Access more details in the result dictionary
|
266
|
-
result = pdf.ask("Who is the CEO?")
|
267
|
-
print(f"Answer: {result['answer']}")
|
268
|
-
print(f"Found on page: {result['page_num']}")
|
269
|
-
print(f"Source text: {result.get('source_text', 'N/A')}")
|
270
|
-
```
|
271
|
-
|
272
|
-
## More details
|
273
|
-
|
274
|
-
[Complete documentation here](https://jsoma.github.io/natural-pdf)
|
275
|
-
|
276
|
-
## Visual Debugging & Interactive Viewer
|
277
|
-
|
278
|
-
Use highlighting to understand element selection and analysis results. Add persistent highlights using `.highlight()` and view them with the interactive `.viewer()` or static `.save_image()`. You can also generate temporary previews of selected elements using `ElementCollection.show()`.
|
279
|
-
|
280
|
-
```python
|
281
|
-
# Highlight selected elements persistently
|
282
|
-
page.find_all('text:bold').highlight(label="Bold Text")
|
283
|
-
|
284
|
-
# Launch the interactive widget in Jupyter (shows persistent highlights)
|
285
|
-
# Requires: pip install natural-pdf[interactive]
|
286
|
-
page.viewer()
|
287
|
-
|
288
|
-
# Save a static image file with highlights and legend
|
289
|
-
page.save_image("highlighted_page.png", labels=True)
|
290
|
-
|
291
|
-
# Show a temporary preview image of specific elements, grouped by attribute
|
292
|
-
preview_image = page.find_all('region[type*=table]').show(group_by='type')
|
293
|
-
# In Jupyter, this image will display automatically
|
294
|
-
preview_image
|
295
|
-
```
|
File without changes
|
File without changes
|
File without changes
|