natural-pdf 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,11 @@
1
1
  import logging
2
2
  from typing import List, Dict, Any, Optional, Union
3
3
  from PIL import Image
4
+ import copy
4
5
 
5
6
  from natural_pdf.elements.region import Region
6
7
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
7
- from natural_pdf.analyzers.layout.layout_options import LayoutOptions
8
+ from natural_pdf.analyzers.layout.layout_options import LayoutOptions, TATRLayoutOptions, BaseLayoutOptions
8
9
 
9
10
  logger = logging.getLogger(__name__)
10
11
 
@@ -36,20 +37,25 @@ class LayoutAnalyzer:
36
37
  classes: Optional[List[str]] = None,
37
38
  exclude_classes: Optional[List[str]] = None,
38
39
  device: Optional[str] = None,
39
- existing: str = "replace"
40
+ existing: str = "replace",
41
+ **kwargs
40
42
  ) -> List[Region]:
41
43
  """
42
44
  Analyze the page layout using the configured LayoutManager.
43
45
 
46
+ This method constructs the final options object, including internal context,
47
+ and passes it to the LayoutManager.
48
+
44
49
  Args:
45
- engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None.
46
- options: Specific LayoutOptions object for advanced configuration.
50
+ engine: Name of the layout engine (e.g., 'yolo', 'tatr'). Uses manager's default if None and no options object given.
51
+ options: Specific LayoutOptions object for advanced configuration. If provided, simple args (confidence, etc.) are ignored.
47
52
  confidence: Minimum confidence threshold (simple mode).
48
53
  classes: Specific classes to detect (simple mode).
49
54
  exclude_classes: Classes to exclude (simple mode).
50
55
  device: Device for inference (simple mode).
51
56
  existing: How to handle existing detected regions: 'replace' (default) or 'append'.
52
-
57
+ **kwargs: Additional engine-specific arguments (added to options.extra_args or used by constructor if options=None).
58
+
53
59
  Returns:
54
60
  List of created Region objects.
55
61
  """
@@ -57,72 +63,139 @@ class LayoutAnalyzer:
57
63
  logger.error(f"Page {self._page.number}: LayoutManager not available. Cannot analyze layout.")
58
64
  return []
59
65
 
60
- logger.info(f"Page {self._page.number}: Analyzing layout (Engine: {engine or 'default'}, Options: {options is not None})...")
66
+ logger.info(f"Page {self._page.number}: Analyzing layout (Engine: {engine or 'default'}, Options provided: {options is not None})...")
61
67
 
62
- # --- Render Page Image ---
63
- logger.debug(f" Rendering page {self._page.number} to image for layout analysis...")
68
+ # --- Render Page Image (Standard Resolution) ---
69
+ logger.debug(f" Rendering page {self._page.number} to image for initial layout detection...")
64
70
  try:
65
- # Use a resolution suitable for layout analysis, potentially configurable
66
- layout_scale = getattr(self._page._parent, '_config', {}).get('layout_image_scale', 1.5) # ~108 DPI default
71
+ layout_scale = getattr(self._page._parent, '_config', {}).get('layout_image_scale', 1.5)
67
72
  layout_resolution = layout_scale * 72
68
- # Render without existing highlights to avoid interference
69
- page_image = self._page.to_image(resolution=layout_resolution, include_highlights=False)
70
- logger.debug(f" Rendered image size: {page_image.width}x{page_image.height}")
73
+ std_res_page_image = self._page.to_image(resolution=layout_resolution, include_highlights=False)
74
+ if not std_res_page_image:
75
+ raise ValueError("Initial page rendering returned None")
76
+ logger.debug(f" Initial rendered image size: {std_res_page_image.width}x{std_res_page_image.height}")
71
77
  except Exception as e:
72
- logger.error(f" Failed to render page {self._page.number} to image: {e}", exc_info=True)
78
+ logger.error(f" Failed to render initial page image: {e}", exc_info=True)
73
79
  return []
80
+
81
+ # --- Calculate Scaling Factors (Standard Res Image <-> PDF) ---
82
+ if std_res_page_image.width == 0 or std_res_page_image.height == 0:
83
+ logger.error(f"Page {self._page.number}: Invalid initial rendered image dimensions. Cannot scale results.")
84
+ return []
85
+ img_scale_x = self._page.width / std_res_page_image.width
86
+ img_scale_y = self._page.height / std_res_page_image.height
87
+ logger.debug(f" StdRes Image -> PDF Scaling: x={img_scale_x:.4f}, y={img_scale_y:.4f}")
74
88
 
75
- # --- Prepare Arguments for Layout Manager ---
76
- manager_args = {'image': page_image, 'options': options, 'engine': engine}
77
- if confidence is not None: manager_args['confidence'] = confidence
78
- if classes is not None: manager_args['classes'] = classes
79
- if exclude_classes is not None: manager_args['exclude_classes'] = exclude_classes
80
- if device is not None: manager_args['device'] = device
81
-
82
- # --- Call Layout Manager ---
83
- logger.debug(f" Calling Layout Manager...")
89
+ # --- Construct Final Options Object ---
90
+ final_options: BaseLayoutOptions
91
+
92
+ if options is not None:
93
+ # User provided a complete options object, use it directly
94
+ logger.debug("Using user-provided options object.")
95
+ final_options = copy.deepcopy(options) # Copy to avoid modifying original user object
96
+ if kwargs:
97
+ logger.warning(f"Ignoring kwargs {list(kwargs.keys())} because a full options object was provided.")
98
+ # Infer engine from options type if engine arg wasn't provided
99
+ if engine is None:
100
+ for name, registry_entry in self._layout_manager.ENGINE_REGISTRY.items():
101
+ if isinstance(final_options, registry_entry['options_class']):
102
+ engine = name
103
+ logger.debug(f"Inferred engine '{engine}' from options type.")
104
+ break
105
+ if engine is None:
106
+ logger.warning("Could not infer engine from provided options object.")
107
+ else:
108
+ # Construct options from simple args (engine, confidence, classes, etc.)
109
+ logger.debug("Constructing options from simple arguments.")
110
+ selected_engine = engine or self._layout_manager.get_available_engines()[0] # Use provided or first available
111
+ engine_lower = selected_engine.lower()
112
+ registry = self._layout_manager.ENGINE_REGISTRY
113
+
114
+ if engine_lower not in registry:
115
+ raise ValueError(f"Unknown or unavailable engine: '{selected_engine}'. Available: {list(registry.keys())}")
116
+
117
+ options_class = registry[engine_lower]['options_class']
118
+
119
+ # Get base defaults
120
+ base_defaults = BaseLayoutOptions()
121
+
122
+ # Prepare args for constructor, prioritizing explicit args over defaults
123
+ constructor_args = {
124
+ 'confidence': confidence if confidence is not None else base_defaults.confidence,
125
+ 'classes': classes, # Pass None if not provided
126
+ 'exclude_classes': exclude_classes, # Pass None if not provided
127
+ 'device': device if device is not None else base_defaults.device,
128
+ 'extra_args': kwargs # Pass other kwargs here
129
+ }
130
+ # Remove None values unless they are valid defaults (like classes=None)
131
+ # We can pass all to the dataclass constructor; it handles defaults
132
+
133
+ try:
134
+ final_options = options_class(**constructor_args)
135
+ logger.debug(f"Constructed options: {final_options}")
136
+ except TypeError as e:
137
+ logger.error(f"Failed to construct options object {options_class.__name__} with args {constructor_args}: {e}")
138
+ # Filter kwargs to only include fields defined in the specific options class? Complex.
139
+ # Re-raise for now, indicates programming error or invalid kwarg.
140
+ raise e
141
+
142
+ # --- Add Internal Context to extra_args (ALWAYS) ---
143
+ if not hasattr(final_options, 'extra_args') or final_options.extra_args is None:
144
+ final_options.extra_args = {}
145
+ final_options.extra_args['_page_ref'] = self._page
146
+ final_options.extra_args['_img_scale_x'] = img_scale_x
147
+ final_options.extra_args['_img_scale_y'] = img_scale_y
148
+ logger.debug(f"Added internal context to final_options.extra_args: {final_options.extra_args}")
149
+
150
+ # --- Call Layout Manager with the Final Options ---
151
+ logger.debug(f"Calling Layout Manager with final options object.")
84
152
  try:
85
- detections = self._layout_manager.analyze_layout(**manager_args)
153
+ # Pass only image and the constructed options object
154
+ detections = self._layout_manager.analyze_layout(
155
+ image=std_res_page_image,
156
+ options=final_options
157
+ # No engine, confidence, classes etc. passed here directly
158
+ )
86
159
  logger.info(f" Layout Manager returned {len(detections)} detections.")
87
160
  except Exception as e:
88
161
  logger.error(f" Layout analysis failed: {e}", exc_info=True)
89
162
  return []
90
163
 
91
- # --- Process Detections (Convert to Regions, Scale Coords) ---
92
- # Calculate scale factor to convert from image back to PDF coordinates
93
- if page_image.width == 0 or page_image.height == 0:
94
- logger.error(f"Page {self._page.number}: Invalid rendered image dimensions ({page_image.width}x{page_image.height}). Cannot scale layout results.")
95
- return []
96
- scale_x = self._page.width / page_image.width
97
- scale_y = self._page.height / page_image.height
98
- logger.debug(f" Scaling factors: x={scale_x:.4f}, y={scale_y:.4f}")
99
-
164
+ # --- Process Detections (Convert to Regions, Scale Coords from Image to PDF) ---
100
165
  layout_regions = []
101
166
  docling_id_to_region = {} # For hierarchy if using Docling
102
167
 
103
168
  for detection in detections:
104
169
  try:
170
+ # bbox is relative to std_res_page_image
105
171
  x_min, y_min, x_max, y_max = detection['bbox']
106
172
 
107
173
  # Convert coordinates from image to PDF space
108
- pdf_x0 = x_min * scale_x
109
- pdf_y0 = y_min * scale_y
110
- pdf_x1 = x_max * scale_x
111
- pdf_y1 = y_max * scale_y
112
-
113
- # Create a Region object
174
+ pdf_x0 = x_min * img_scale_x
175
+ pdf_y0 = y_min * img_scale_y
176
+ pdf_x1 = x_max * img_scale_x
177
+ pdf_y1 = y_max * img_scale_y
178
+
179
+ # Ensure PDF coords are valid
180
+ pdf_x0, pdf_x1 = min(pdf_x0, pdf_x1), max(pdf_x0, pdf_x1)
181
+ pdf_y0, pdf_y1 = min(pdf_y0, pdf_y1), max(pdf_y0, pdf_y1)
182
+ pdf_x0 = max(0, pdf_x0)
183
+ pdf_y0 = max(0, pdf_y0)
184
+ pdf_x1 = min(self._page.width, pdf_x1)
185
+ pdf_y1 = min(self._page.height, pdf_y1)
186
+
187
+ # Create a Region object with PDF coordinates
114
188
  region = Region(self._page, (pdf_x0, pdf_y0, pdf_x1, pdf_y1))
115
- region.region_type = detection.get('class', 'unknown') # Original class name
116
- region.normalized_type = detection.get('normalized_class', 'unknown') # Hyphenated name
189
+ region.region_type = detection.get('class', 'unknown')
190
+ region.normalized_type = detection.get('normalized_class', 'unknown')
117
191
  region.confidence = detection.get('confidence', 0.0)
118
- region.model = detection.get('model', engine or 'unknown') # Store model name
192
+ region.model = detection.get('model', engine or 'unknown')
119
193
  region.source = 'detected'
120
-
194
+
121
195
  # Add extra info if available
122
196
  if 'text' in detection: region.text_content = detection['text']
123
197
  if 'docling_id' in detection: region.docling_id = detection['docling_id']
124
198
  if 'parent_id' in detection: region.parent_id = detection['parent_id']
125
- # Add other fields like polygon, position, row/col index if needed
126
199
 
127
200
  layout_regions.append(region)
128
201
 
@@ -163,4 +236,20 @@ class LayoutAnalyzer:
163
236
  self._page.detected_layout_regions = self._page._regions['detected']
164
237
  logger.info(f"Layout analysis complete for page {self._page.number}.")
165
238
 
239
+ # --- Auto-create cells if requested by TATR options ---
240
+ if isinstance(final_options, TATRLayoutOptions) and final_options.create_cells:
241
+ logger.info(f" Option create_cells=True detected for TATR. Attempting cell creation...")
242
+ created_cell_count = 0
243
+ for region in layout_regions:
244
+ # Only attempt on regions identified as tables by the TATR model
245
+ if region.model == 'tatr' and region.region_type == 'table':
246
+ try:
247
+ # create_cells now modifies the page elements directly and returns self
248
+ region.create_cells()
249
+ # We could potentially count cells created here if needed,
250
+ # but the method logs its own count.
251
+ except Exception as cell_error:
252
+ logger.warning(f" Error calling create_cells for table region {region.bbox}: {cell_error}")
253
+ logger.info(f" Finished cell creation process triggered by options.")
254
+
166
255
  return layout_regions
@@ -120,9 +120,10 @@ class LayoutManager:
120
120
 
121
121
  # --- Determine Options and Engine ---
122
122
  if options is not None:
123
- # Advanced Mode
124
- logger.debug(f"LayoutManager: Using advanced mode with options object: {type(options).__name__}")
125
- final_options = copy.deepcopy(options) # Use copy
123
+ # Advanced Mode: An options object was provided directly (or constructed by LayoutAnalyzer)
124
+ # Use this object directly, do not deep copy or reconstruct.
125
+ logger.debug(f"LayoutManager: Using provided options object: {type(options).__name__}")
126
+ final_options = options # Use the provided object directly
126
127
  found_engine = False
127
128
  for name, registry_entry in self.ENGINE_REGISTRY.items():
128
129
  if isinstance(options, registry_entry['options_class']):
@@ -131,12 +132,14 @@ class LayoutManager:
131
132
  break
132
133
  if not found_engine:
133
134
  raise TypeError(f"Provided options object type '{type(options).__name__}' does not match any registered layout engine options.")
135
+ # Ignore simple kwargs if options object is present
134
136
  if kwargs:
135
- logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored.")
137
+ logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside an 'options' object and will be ignored.")
136
138
  else:
137
- # Simple Mode
139
+ # Simple Mode: No options object provided initially.
140
+ # Determine engine from kwargs or default, then construct options.
138
141
  selected_engine_name = default_engine.lower()
139
- logger.debug(f"LayoutManager: Using simple mode with engine: '{selected_engine_name}' and kwargs: {kwargs}")
142
+ logger.debug(f"LayoutManager: Using simple mode. Engine: '{selected_engine_name}', kwargs: {kwargs}")
140
143
 
141
144
  if selected_engine_name not in self.ENGINE_REGISTRY:
142
145
  raise ValueError(f"Unknown or unavailable layout engine: '{selected_engine_name}'. Available: {available_engines}")
@@ -34,7 +34,7 @@ class TATRLayoutOptions(BaseLayoutOptions):
34
34
  max_detection_size: int = 800
35
35
  max_structure_size: int = 1000
36
36
  # Whether to create cell regions (can be slow)
37
- create_cells: bool = False # Keep the flag for cell creation control
37
+ create_cells: bool = True
38
38
 
39
39
  # --- Paddle Specific Options ---
40
40
  @dataclass
@@ -51,10 +51,8 @@ class PaddleLayoutOptions(BaseLayoutOptions):
51
51
  @dataclass
52
52
  class SuryaLayoutOptions(BaseLayoutOptions):
53
53
  """Options specific to Surya layout detection."""
54
- # Surya doesn't seem to have many config options based on the example,
55
- # but we can add placeholders if needed. Device is handled by BaseLayoutOptions.
56
54
  model_name: str = "default" # Placeholder if different models become available
57
- verbose: bool = False # Verbose logging for the detector class
55
+ recognize_table_structure: bool = True # Automatically run table structure recognition?
58
56
 
59
57
  # --- Docling Specific Options ---
60
58
  @dataclass
@@ -3,6 +3,7 @@ import logging
3
3
  import importlib.util
4
4
  import os
5
5
  import tempfile
6
+ import copy
6
7
  from typing import List, Dict, Any, Optional, Tuple
7
8
  from PIL import Image
8
9
 
@@ -11,20 +12,23 @@ from .layout_options import SuryaLayoutOptions, BaseLayoutOptions
11
12
 
12
13
  logger = logging.getLogger(__name__)
13
14
 
14
- # Check for dependency
15
+ # Check for dependencies
15
16
  surya_spec = importlib.util.find_spec("surya")
16
17
  LayoutPredictor = None
18
+ TableRecPredictor = None
19
+
17
20
  if surya_spec:
18
21
  try:
19
22
  from surya.layout import LayoutPredictor
23
+ from surya.table_rec import TableRecPredictor
20
24
  except ImportError as e:
21
- logger.warning(f"Could not import Surya dependencies: {e}")
25
+ logger.warning(f"Could not import Surya dependencies (layout and/or table_rec): {e}")
22
26
  else:
23
27
  logger.warning("surya not found. SuryaLayoutDetector will not be available.")
24
28
 
25
29
 
26
30
  class SuryaLayoutDetector(LayoutDetector):
27
- """Document layout detector using Surya models."""
31
+ """Document layout and table structure detector using Surya models."""
28
32
 
29
33
  def __init__(self):
30
34
  super().__init__()
@@ -32,120 +36,224 @@ class SuryaLayoutDetector(LayoutDetector):
32
36
  'text', 'pageheader', 'pagefooter', 'sectionheader',
33
37
  'table', 'tableofcontents', 'picture', 'caption',
34
38
  'heading', 'title', 'list', 'listitem', 'code',
35
- 'textinlinemath', 'mathformula', 'form'
39
+ 'textinlinemath', 'mathformula', 'form',
40
+ 'table-row', 'table-column'
36
41
  }
37
- # Predictor instance is cached via _get_model
42
+ self._page_ref = None # To store page reference from options
38
43
 
39
44
  def is_available(self) -> bool:
40
- """Check if surya is installed."""
41
- return LayoutPredictor is not None
45
+ return LayoutPredictor is not None and TableRecPredictor is not None
42
46
 
43
47
  def _get_cache_key(self, options: BaseLayoutOptions) -> str:
44
- """Generate cache key based on model name and device."""
45
48
  if not isinstance(options, SuryaLayoutOptions):
46
- options = SuryaLayoutOptions(device=options.device) # Use base device
47
-
49
+ options = SuryaLayoutOptions(device=options.device)
48
50
  device_key = str(options.device).lower() if options.device else 'default_device'
49
- # Include model_name if it affects loading, otherwise device might be enough
50
51
  model_key = options.model_name
51
52
  return f"{self.__class__.__name__}_{device_key}_{model_key}"
52
53
 
53
- def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
54
- """Load the Surya LayoutPredictor model."""
54
+ def _load_model_from_options(self, options: BaseLayoutOptions) -> Dict[str, Any]:
55
55
  if not self.is_available():
56
- raise RuntimeError("Surya dependency (surya-ocr) not installed.")
57
-
56
+ raise RuntimeError("Surya dependencies (surya.layout and surya.table_rec) not installed.")
58
57
  if not isinstance(options, SuryaLayoutOptions):
59
58
  raise TypeError("Incorrect options type provided for Surya model loading.")
60
-
61
- self.logger.info(f"Loading Surya LayoutPredictor (device={options.device})...")
62
- try:
63
- # Pass device and potentially other init args from options.extra_args
64
- predictor_args = {'device': options.device} if options.device else {}
65
- predictor_args.update(options.extra_args) # Add any extra init args
66
-
67
- predictor = LayoutPredictor(**predictor_args)
68
- self.logger.info("Surya LayoutPredictor loaded.")
69
- return predictor
59
+ self.logger.info(f"Loading Surya models (device={options.device})...")
60
+ models = {}
61
+ try:
62
+ models['layout'] = LayoutPredictor()
63
+ models['table_rec'] = TableRecPredictor()
64
+ self.logger.info("Surya LayoutPredictor and TableRecPredictor loaded.")
65
+ return models
70
66
  except Exception as e:
71
- self.logger.error(f"Failed to load Surya LayoutPredictor: {e}", exc_info=True)
67
+ self.logger.error(f"Failed to load Surya models: {e}", exc_info=True)
72
68
  raise
69
+
70
+ def _expand_bbox(self, bbox: Tuple[float, float, float, float],
71
+ padding: int, max_width: int, max_height: int) -> Tuple[int, int, int, int]:
72
+ """Expand bbox by padding, clamping to max dimensions."""
73
+ x0, y0, x1, y1 = bbox
74
+ x0 = max(0, int(x0 - padding))
75
+ y0 = max(0, int(y0 - padding))
76
+ x1 = min(max_width, int(x1 + padding))
77
+ y1 = min(max_height, int(y1 + padding))
78
+ return x0, y0, x1, y1
73
79
 
74
80
  def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
75
- """Detect layout elements in an image using Surya."""
81
+ """Detect layout elements and optionally table structure in an image using Surya."""
76
82
  if not self.is_available():
77
- raise RuntimeError("Surya dependency (surya-ocr) not installed.")
83
+ raise RuntimeError("Surya dependencies (layout and table_rec) not installed.")
78
84
 
79
85
  if not isinstance(options, SuryaLayoutOptions):
80
86
  self.logger.warning("Received BaseLayoutOptions, expected SuryaLayoutOptions. Using defaults.")
81
87
  options = SuryaLayoutOptions(
82
88
  confidence=options.confidence, classes=options.classes,
83
89
  exclude_classes=options.exclude_classes, device=options.device,
84
- extra_args=options.extra_args
90
+ extra_args=options.extra_args,
91
+ recognize_table_structure=True
85
92
  )
93
+
94
+ # Extract page reference and scaling factors from extra_args (passed by LayoutAnalyzer)
95
+ self._page_ref = options.extra_args.get('_page_ref')
96
+ img_scale_x = options.extra_args.get('_img_scale_x')
97
+ img_scale_y = options.extra_args.get('_img_scale_y')
98
+
99
+ # We still need this check, otherwise later steps that need these vars will fail
100
+ can_do_table_rec = options.recognize_table_structure and self._page_ref and img_scale_x is not None and img_scale_y is not None
101
+ if options.recognize_table_structure and not can_do_table_rec:
102
+ logger.warning("Surya table recognition cannot proceed without page reference and scaling factors. Disabling.")
103
+ options.recognize_table_structure = False
86
104
 
87
- self.validate_classes(options.classes or [])
88
- if options.exclude_classes:
89
- self.validate_classes(options.exclude_classes)
90
-
91
- # Get the cached/loaded predictor instance
92
- layout_predictor = self._get_model(options)
93
-
94
- # Surya predictor takes a list of images
95
- input_image_list = [image.convert("RGB")] # Ensure RGB
96
-
97
- detections = []
98
- try:
99
- self.logger.debug("Running Surya layout prediction...")
100
- # Call the predictor (returns a list of LayoutResult objects)
101
- layout_predictions = layout_predictor(input_image_list)
102
- self.logger.debug(f"Surya prediction returned {len(layout_predictions)} results.")
103
-
104
- if not layout_predictions:
105
- self.logger.warning("Surya returned empty predictions list.")
106
- return []
107
-
108
- # Process results for the first (and only) image
109
- prediction = layout_predictions[0] # LayoutResult object
110
-
111
- # Prepare normalized class filters once
112
- normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
113
- normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
114
-
115
- for layout_box in prediction.bboxes:
116
- # Extract the class name and normalize it
117
- class_name_orig = layout_box.label
118
- normalized_class = self._normalize_class_name(class_name_orig)
119
- score = float(layout_box.confidence)
120
-
121
- # Apply confidence threshold
122
- if score < options.confidence: continue
123
-
124
- # Apply class filtering
125
- if normalized_classes_req and normalized_class not in normalized_classes_req: continue
126
- if normalized_class in normalized_classes_excl: continue
127
-
128
- # Extract bbox coordinates (Surya provides [x_min, y_min, x_max, y_max])
129
- x_min, y_min, x_max, y_max = map(float, layout_box.bbox)
130
-
131
- # Add detection
132
- detection_data = {
133
- 'bbox': (x_min, y_min, x_max, y_max),
134
- 'class': class_name_orig,
135
- 'confidence': score,
136
- 'normalized_class': normalized_class,
137
- 'source': 'layout',
138
- 'model': 'surya'
139
- # Add polygon etc. if needed, check attributes on layout_box
140
- # 'polygon': layout_box.polygon if hasattr(layout_box, 'polygon') else None,
141
- }
142
- detections.append(detection_data)
143
-
144
- self.logger.info(f"Surya detected {len(detections)} layout elements matching criteria.")
105
+ # Validate classes
106
+ if options.classes: self.validate_classes(options.classes)
107
+ if options.exclude_classes: self.validate_classes(options.exclude_classes)
145
108
 
146
- except Exception as e:
147
- self.logger.error(f"Error during Surya layout detection: {e}", exc_info=True)
148
- raise
109
+ models = self._get_model(options)
110
+ layout_predictor = models['layout']
111
+ table_rec_predictor = models['table_rec']
112
+
113
+ input_image = image.convert("RGB")
114
+ input_image_list = [input_image]
115
+
116
+ initial_layout_detections = [] # Detections relative to input_image
117
+ tables_to_process = []
118
+
119
+ # --- Initial Layout Detection ---
120
+ self.logger.debug("Running Surya layout prediction...")
121
+ layout_predictions = layout_predictor(input_image_list)
122
+ self.logger.debug(f"Surya prediction returned {len(layout_predictions)} results.")
123
+ if not layout_predictions: return []
124
+ prediction = layout_predictions[0]
125
+
126
+ normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
127
+ normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
128
+
129
+ for layout_box in prediction.bboxes:
130
+ class_name_orig = layout_box.label
131
+ normalized_class = self._normalize_class_name(class_name_orig)
132
+ score = float(layout_box.confidence)
133
+
134
+ if score < options.confidence: continue
135
+ if normalized_classes_req and normalized_class not in normalized_classes_req: continue
136
+ if normalized_class in normalized_classes_excl: continue
137
+
138
+ x_min, y_min, x_max, y_max = map(float, layout_box.bbox)
139
+ detection_data = {
140
+ 'bbox': (x_min, y_min, x_max, y_max),
141
+ 'class': class_name_orig,
142
+ 'confidence': score,
143
+ 'normalized_class': normalized_class,
144
+ 'source': 'layout',
145
+ 'model': 'surya'
146
+ }
147
+ initial_layout_detections.append(detection_data)
148
+
149
+ if options.recognize_table_structure and normalized_class in ('table', 'tableofcontents'):
150
+ tables_to_process.append(detection_data)
151
+
152
+ self.logger.info(f"Surya initially detected {len(initial_layout_detections)} layout elements matching criteria.")
153
+
154
+ # --- Table Structure Recognition (Optional) ---
155
+ if not options.recognize_table_structure or not tables_to_process:
156
+ self.logger.debug("Skipping Surya table structure recognition (disabled or no tables found).")
157
+ return initial_layout_detections
158
+
159
+ self.logger.info(f"Attempting Surya table structure recognition for {len(tables_to_process)} tables...")
160
+ high_res_crops = []
161
+ pdf_offsets = [] # Store (pdf_x0, pdf_y0) for each crop
162
+
163
+ high_res_dpi = getattr(self._page_ref._parent, '_config', {}).get('surya_table_rec_dpi', 192)
164
+ bbox_padding = getattr(self._page_ref._parent, '_config', {}).get('surya_table_bbox_padding', 10)
165
+ pdf_to_highres_scale = high_res_dpi / 72.0
166
+
167
+ # Render high-res page ONCE
168
+ self.logger.debug(f"Rendering page {self._page_ref.number} at {high_res_dpi} DPI for table recognition...")
169
+ high_res_page_image = self._page_ref.to_image(resolution=high_res_dpi, include_highlights=False)
170
+ if not high_res_page_image:
171
+ raise RuntimeError(f"Failed to render page {self._page_ref.number} at high resolution.")
172
+ self.logger.debug(f" High-res image size: {high_res_page_image.width}x{high_res_page_image.height}")
173
+
174
+ for i, table_detection in enumerate(tables_to_process):
175
+ img_x0, img_y0, img_x1, img_y1 = table_detection['bbox']
176
+
177
+ # PDF coords
178
+ pdf_x0 = img_x0 * img_scale_x
179
+ pdf_y0 = img_y0 * img_scale_y
180
+ pdf_x1 = img_x1 * img_scale_x
181
+ pdf_y1 = img_y1 * img_scale_y
182
+ pdf_x0 = max(0, pdf_x0)
183
+ pdf_y0 = max(0, pdf_y0)
184
+ pdf_x1 = min(self._page_ref.width, pdf_x1)
185
+ pdf_y1 = min(self._page_ref.height, pdf_y1)
186
+
187
+ # High-res image coords
188
+ hr_x0 = pdf_x0 * pdf_to_highres_scale
189
+ hr_y0 = pdf_y0 * pdf_to_highres_scale
190
+ hr_x1 = pdf_x1 * pdf_to_highres_scale
191
+ hr_y1 = pdf_y1 * pdf_to_highres_scale
192
+
193
+ # Expand high-res bbox
194
+ hr_x0_exp, hr_y0_exp, hr_x1_exp, hr_y1_exp = self._expand_bbox(
195
+ (hr_x0, hr_y0, hr_x1, hr_y1),
196
+ padding=bbox_padding,
197
+ max_width=high_res_page_image.width,
198
+ max_height=high_res_page_image.height
199
+ )
200
+
201
+ crop = high_res_page_image.crop((hr_x0_exp, hr_y0_exp, hr_x1_exp, hr_y1_exp))
202
+ high_res_crops.append(crop)
203
+ pdf_offsets.append((pdf_x0, pdf_y0))
204
+
205
+ if not high_res_crops:
206
+ self.logger.info("No valid high-resolution table crops generated.")
207
+ return initial_layout_detections
208
+
209
+ structure_detections = [] # Detections relative to std_res input_image
210
+
211
+ # --- Run Table Recognition (will raise error on failure) ---
212
+ self.logger.debug(f"Running Surya table recognition on {len(high_res_crops)} high-res images...")
213
+ table_predictions = table_rec_predictor(high_res_crops)
214
+ self.logger.debug(f"Surya table recognition returned {len(table_predictions)} results.")
215
+
216
+ # --- Process Results ---
217
+ if len(table_predictions) != len(pdf_offsets):
218
+ # This case is less likely if predictor didn't error, but good sanity check
219
+ raise RuntimeError(f"Mismatch between table inputs ({len(pdf_offsets)}) and predictions ({len(table_predictions)}).")
220
+
221
+ for table_pred, (offset_pdf_x0, offset_pdf_y0) in zip(table_predictions, pdf_offsets):
222
+ # Process Rows
223
+ for row_box in table_pred.rows:
224
+ crop_rx0, crop_ry0, crop_rx1, crop_ry1 = map(float, row_box.bbox)
225
+ pdf_row_x0 = offset_pdf_x0 + crop_rx0 / pdf_to_highres_scale
226
+ pdf_row_y0 = offset_pdf_y0 + crop_ry0 / pdf_to_highres_scale
227
+ pdf_row_x1 = offset_pdf_x0 + crop_rx1 / pdf_to_highres_scale
228
+ pdf_row_y1 = offset_pdf_y0 + crop_ry1 / pdf_to_highres_scale
229
+ img_row_x0 = pdf_row_x0 / img_scale_x
230
+ img_row_y0 = pdf_row_y0 / img_scale_y
231
+ img_row_x1 = pdf_row_x1 / img_scale_x
232
+ img_row_y1 = pdf_row_y1 / img_scale_y
233
+ structure_detections.append({
234
+ 'bbox': (img_row_x0, img_row_y0, img_row_x1, img_row_y1),
235
+ 'class': 'table-row', 'confidence': 1.0, 'normalized_class': 'table-row',
236
+ 'source': 'layout', 'model': 'surya'
237
+ })
238
+
239
+ # Process Columns
240
+ for col_box in table_pred.cols:
241
+ crop_cx0, crop_cy0, crop_cx1, crop_cy1 = map(float, col_box.bbox)
242
+ pdf_col_x0 = offset_pdf_x0 + crop_cx0 / pdf_to_highres_scale
243
+ pdf_col_y0 = offset_pdf_y0 + crop_cy0 / pdf_to_highres_scale
244
+ pdf_col_x1 = offset_pdf_x0 + crop_cx1 / pdf_to_highres_scale
245
+ pdf_col_y1 = offset_pdf_y0 + crop_cy1 / pdf_to_highres_scale
246
+ img_col_x0 = pdf_col_x0 / img_scale_x
247
+ img_col_y0 = pdf_col_y0 / img_scale_y
248
+ img_col_x1 = pdf_col_x1 / img_scale_x
249
+ img_col_y1 = pdf_col_y1 / img_scale_y
250
+ structure_detections.append({
251
+ 'bbox': (img_col_x0, img_col_y0, img_col_x1, img_col_y1),
252
+ 'class': 'table-column', 'confidence': 1.0, 'normalized_class': 'table-column',
253
+ 'source': 'layout', 'model': 'surya'
254
+ })
255
+
256
+ self.logger.info(f"Added {len(structure_detections)} table structure elements.")
149
257
 
150
- return detections
258
+ return initial_layout_detections + structure_detections
151
259
 
@@ -1514,48 +1514,75 @@ class Region(DirectionalMixin):
1514
1514
 
1515
1515
  def create_cells(self):
1516
1516
  """
1517
- Create cell regions for a TATR-detected table.
1517
+ Create cell regions for a detected table by intersecting its
1518
+ row and column regions, and add them to the page.
1518
1519
 
1520
+ Assumes child row and column regions are already present on the page.
1521
+
1519
1522
  Returns:
1520
- List of cell regions
1523
+ Self for method chaining.
1521
1524
  """
1522
- if not (self.region_type == 'table' and self.model == 'tatr'):
1523
- raise ValueError("Only works for TATR-detected table regions")
1525
+ # Ensure this is called on a table region
1526
+ if self.region_type not in ('table', 'tableofcontents'): # Allow for ToC which might have structure
1527
+ raise ValueError(f"create_cells should be called on a 'table' or 'tableofcontents' region, not '{self.region_type}'")
1524
1528
 
1525
- # Find rows and columns that belong to this table
1526
- rows = self.page.find_all(f'region[type=table-row][model=tatr]')
1527
- columns = self.page.find_all(f'region[type=table-column][model=tatr]')
1529
+ # Find rows and columns associated with this page
1530
+ # No model-specific filter is applied, so row/column regions from any layout model are considered
1531
+ rows = self.page.find_all('region[type=table-row]')
1532
+ columns = self.page.find_all('region[type=table-column]')
1528
1533
 
1529
- # Filter to only include those that overlap with this table
1534
+ # Filter to only include those that overlap with this table region
1530
1535
  def is_in_table(element):
1531
- element_center_x = (element.x0 + element.x1) / 2
1532
- element_center_y = (element.top + element.bottom) / 2
1533
- return (self.x0 <= element_center_x <= self.x1 and
1534
- self.top <= element_center_y <= self.bottom)
1536
+ # Use a simple overlap check (more robust than just center point)
1537
+ # Check if element's bbox overlaps with self.bbox
1538
+ return (element.x0 < self.x1 and element.x1 > self.x0 and
1539
+ element.top < self.bottom and element.bottom > self.top)
1535
1540
 
1536
1541
  table_rows = [r for r in rows if is_in_table(r)]
1537
1542
  table_columns = [c for c in columns if is_in_table(c)]
1538
1543
 
1544
+ if not table_rows or not table_columns:
1545
+ self._page.logger.warning(f"Region {self.bbox}: Cannot create cells. No overlapping row or column regions found.")
1546
+ return self # Return self even if no cells created
1547
+
1539
1548
  # Sort rows and columns
1540
1549
  table_rows.sort(key=lambda r: r.top)
1541
1550
  table_columns.sort(key=lambda c: c.x0)
1542
1551
 
1543
- # Create cells
1544
- cells = []
1552
+ # Create cells and add them to the page's element manager
1553
+ created_count = 0
1545
1554
  for row in table_rows:
1546
1555
  for column in table_columns:
1547
- # Create cell region at the intersection
1548
- cell = self.page.create_region(
1549
- column.x0, row.top, column.x1, row.bottom
1550
- )
1551
- # Set minimal metadata
1552
- cell.source = 'derived'
1553
- cell.region_type = 'table-cell'
1554
- cell.model = 'tatr'
1555
-
1556
- cells.append(cell)
1556
+ # Calculate intersection bbox for the cell
1557
+ cell_x0 = max(row.x0, column.x0)
1558
+ cell_y0 = max(row.top, column.top)
1559
+ cell_x1 = min(row.x1, column.x1)
1560
+ cell_y1 = min(row.bottom, column.bottom)
1561
+
1562
+ # Only create a cell if the intersection is valid (positive width/height)
1563
+ if cell_x1 > cell_x0 and cell_y1 > cell_y0:
1564
+ # Create cell region at the intersection
1565
+ cell = self.page.create_region(
1566
+ cell_x0, cell_y0, cell_x1, cell_y1
1567
+ )
1568
+ # Set metadata
1569
+ cell.source = 'derived'
1570
+ cell.region_type = 'table-cell' # Explicitly set type
1571
+ cell.normalized_type = 'table-cell' # And normalized type
1572
+ # Inherit model from the parent table region
1573
+ cell.model = self.model
1574
+ cell.parent_region = self # Link cell to parent table region
1575
+
1576
+ # Add the cell region to the page's element manager
1577
+ self.page._element_mgr.add_region(cell)
1578
+ created_count += 1
1557
1579
 
1558
- return cells
1580
+ # Optional: Add created cells to the table region's children
1581
+ # self.child_regions.extend(cells_created_in_this_call) # Needs list management
1582
+
1583
+ self._page.logger.info(f"Region {self.bbox} (Model: {self.model}): Created and added {created_count} cell regions.")
1584
+
1585
+ return self # Return self for chaining
1559
1586
 
1560
1587
  def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
1561
1588
  """
@@ -0,0 +1,124 @@
1
+ Metadata-Version: 2.4
2
+ Name: natural-pdf
3
+ Version: 0.1.2
4
+ Summary: A more intuitive interface for working with PDFs
5
+ Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/jsoma/natural-pdf
8
+ Project-URL: Repository, https://github.com/jsoma/natural-pdf
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: pdfplumber>=0.7.0
15
+ Requires-Dist: Pillow>=8.0.0
16
+ Requires-Dist: colour>=0.1.5
17
+ Requires-Dist: numpy>=1.20.0
18
+ Requires-Dist: urllib3>=1.26.0
19
+ Requires-Dist: torch>=2.0.0
20
+ Requires-Dist: torchvision>=0.15.0
21
+ Requires-Dist: transformers>=4.30.0
22
+ Requires-Dist: huggingface_hub>=0.19.0
23
+ Provides-Extra: interactive
24
+ Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
25
+ Provides-Extra: easyocr
26
+ Requires-Dist: easyocr; extra == "easyocr"
27
+ Provides-Extra: paddle
28
+ Requires-Dist: paddlepaddle; extra == "paddle"
29
+ Requires-Dist: paddleocr; extra == "paddle"
30
+ Provides-Extra: layout-yolo
31
+ Requires-Dist: doclayout_yolo; extra == "layout-yolo"
32
+ Provides-Extra: surya
33
+ Requires-Dist: surya-ocr; extra == "surya"
34
+ Provides-Extra: qa
35
+ Provides-Extra: all
36
+ Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "all"
37
+ Requires-Dist: easyocr; extra == "all"
38
+ Requires-Dist: paddlepaddle; extra == "all"
39
+ Requires-Dist: paddleocr; extra == "all"
40
+ Requires-Dist: doclayout_yolo; extra == "all"
41
+ Requires-Dist: surya-ocr; extra == "all"
42
+ Dynamic: license-file
43
+
44
+ # Natural PDF
45
+
46
+ A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
47
+
48
+ Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
49
+
50
+ - [Complete documentation here](https://jsoma.github.io/natural-pdf)
51
+ - [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
52
+
53
+ <div style="max-width: 400px; margin: auto"><a href="sample-screen.png"><img src="sample-screen.png"></a></div>
54
+
55
+ ## Installation
56
+
57
+ ```bash
58
+ pip install natural-pdf
59
+ ```
60
+
61
+ For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install extras:
62
+
63
+ ```bash
64
+ # Example: Install with support for a specific OCR engine (EasyOCR, Surya, or PaddleOCR)
65
+ pip install natural-pdf[easyocr]
66
+ pip install natural-pdf[surya]
67
+ pip install natural-pdf[paddle]
68
+
69
+ # Example: Install with interactive viewer support
70
+ pip install natural-pdf[interactive]
71
+
72
+ # Install everything
73
+ pip install natural-pdf[all]
74
+ ```
75
+
76
+ See the [installation guide](https://jsoma.github.io/natural-pdf/installation/) for more details on extras.
77
+
78
+ ## Quick Start
79
+
80
+ ```python
81
+ from natural_pdf import PDF
82
+
83
+ # Open a PDF
84
+ pdf = PDF('document.pdf')
85
+ page = pdf.pages[0]
86
+
87
+ # Find elements using CSS-like selectors
88
+ heading = page.find('text:contains("Summary"):bold')
89
+
90
+ # Extract content below the heading
91
+ content = heading.below().extract_text()
92
+ print("Content below Summary:", content[:100] + "...")
93
+
94
+ # Exclude headers/footers automatically (example)
95
+ # You might define these based on common text or position
96
+ page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
97
+ page.add_exclusion(page.find_all('line')[-1].below())
98
+
99
+ # Extract clean text from the page
100
+ clean_text = page.extract_text()
101
+ print("\nClean page text:", clean_text[:200] + "...")
102
+
103
+ # Highlight the heading and view the page
104
+ heading.highlight(color='red')
105
+ page.to_image()
106
+ ```
107
+
108
+ As a fun bonus, `page.viewer()` provides an interactive way to explore the PDF.
109
+
110
+ ## Key Features
111
+
112
+ Natural PDF offers a range of features for working with PDFs:
113
+
114
+ * **CSS-like Selectors:** Find elements using intuitive query strings (`page.find('text:bold')`).
115
+ * **Spatial Navigation:** Select content relative to other elements (`heading.below()`, `element.select_until(...)`).
116
+ * **Text & Table Extraction:** Get clean text or structured table data, automatically handling exclusions.
117
+ * **OCR Integration:** Extract text from scanned documents using engines like EasyOCR, PaddleOCR, or Surya.
118
+ * **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using AI models.
119
+ * **Document QA:** Ask natural language questions about your document's content.
120
+ * **Visual Debugging:** Highlight elements and use an interactive viewer or save images to understand your selections.
121
+
122
+ ## Learn More
123
+
124
+ Dive deeper into the features and explore advanced usage in the [**Complete Documentation**](https://jsoma.github.io/natural-pdf).
@@ -6,11 +6,11 @@ natural_pdf/analyzers/utils.py,sha256=u5_FAUPmEG1ydPVuxpu7bVw507NB3WzisMNSUhsnuk
6
6
  natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
7
7
  natural_pdf/analyzers/layout/base.py,sha256=D6KHDsbVKzZWCfW4vt0khPC3TA9JzQD3cF4VtTSyf28,6752
8
8
  natural_pdf/analyzers/layout/docling.py,sha256=iNeD10ZfolDVJNqayAUd0-Bs2tVr5INE7WK9c_Mll_8,11930
9
- natural_pdf/analyzers/layout/layout_analyzer.py,sha256=oQeqPDHL6vpj_3NHuzS5ja7KVAAL7PhQ7IOwustDBBo,8008
10
- natural_pdf/analyzers/layout/layout_manager.py,sha256=Qr5pxcv_Wk5IJRJr0IoYJJAz71RGJvQgqXONNBhNLOw,9221
11
- natural_pdf/analyzers/layout/layout_options.py,sha256=vZvTSg_M27OirZomcC5uWLSmPYXjvnnCEo5QKy9RjaQ,3503
9
+ natural_pdf/analyzers/layout/layout_analyzer.py,sha256=JJasXl7QEiP4DgAvf-zu1w7Uakdf8ypvITkpQ-OQDgA,13340
10
+ natural_pdf/analyzers/layout/layout_manager.py,sha256=6Zi9SBonpa0urWyeQBJnmxIL1hOn4xAx09ugkMrEhro,9555
11
+ natural_pdf/analyzers/layout/layout_options.py,sha256=EmvPEnDsVGMJkDNfn6ORLnX545gbmlo3kVcz4anVm5Q,3325
12
12
  natural_pdf/analyzers/layout/paddle.py,sha256=QCasH_Z9UITX6wRGlE_HjmwkBuANz9Yyw5Yk7QvRVcw,12519
13
- natural_pdf/analyzers/layout/surya.py,sha256=hmPDfXzTkF2PQPgvg1xjTJSBKFuCmjZB3GTCHa-kpA4,6477
13
+ natural_pdf/analyzers/layout/surya.py,sha256=Ibwo42TioJ-BZP3-2T13KCtH3kLSWQh7C9ZYuk1kUQo,12657
14
14
  natural_pdf/analyzers/layout/tatr.py,sha256=H0Xygk9jA46-vlPleoal94cuDyz-LHTSxVb3e6gpmV8,11956
15
15
  natural_pdf/analyzers/layout/yolo.py,sha256=NSQK3TcS1qN8D2MDxCvcwTpS_kvzGy3I2LepJDUceoQ,7699
16
16
  natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
@@ -23,7 +23,7 @@ natural_pdf/elements/base.py,sha256=9SQ-O2qbQe9Avbf9JI-p6vWlyThZVch-p1yqXWSrBHw,
23
23
  natural_pdf/elements/collections.py,sha256=RJf4cBZeLfCtfS0-SjzYFRCtbzYjWsgk3LrcTwJAYMs,62392
24
24
  natural_pdf/elements/line.py,sha256=QvVdhf_K6rwJkq3q67JmgdZpDhrBgWuSMF-Q25malP4,4783
25
25
  natural_pdf/elements/rect.py,sha256=dls9g-R213O78HvfAJMak3_eV14Zh654Zw7hqTTXxDQ,3949
26
- natural_pdf/elements/region.py,sha256=MXQK00LLMvwuq94NigeeCVFoGov_RWFe9ZylnIMpzB0,72453
26
+ natural_pdf/elements/region.py,sha256=sfYWLn1nii7o7lqY_fTyJN2fd__Cg_9euGsZDQUQffA,74242
27
27
  natural_pdf/elements/text.py,sha256=OAuy0ozaemj6yjMwhXPsJ76VZtRPeJbmrFTzpDJA2_U,11017
28
28
  natural_pdf/ocr/__init__.py,sha256=mbUUsCfeU6yRsEqNn3I4Len-XY6FfjfKhTAoWDLA1f4,1943
29
29
  natural_pdf/ocr/engine.py,sha256=xDnvhnm4Lr7d83ezglDqOtl9xfx74zOOTyYW-fZHQEQ,4183
@@ -45,8 +45,8 @@ natural_pdf/utils/visualization.py,sha256=14BM-K4ovDqHniNbxbP_y9KaEYNlkbpELGAv9_
45
45
  natural_pdf/widgets/__init__.py,sha256=qckw3DjdVTsASPLJ8uUrGKg3MFhvzHndUpeNGlqwg6A,215
46
46
  natural_pdf/widgets/viewer.py,sha256=h_amj_uvf-vRqEsFg4P00fgKxawLAd9jjC1ohUza4BY,37479
47
47
  natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
48
- natural_pdf-0.1.1.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
49
- natural_pdf-0.1.1.dist-info/METADATA,sha256=8o22GEPtEqlSqexFQxy6tVoHTB35LmT63sjbjbjORRE,10009
50
- natural_pdf-0.1.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
51
- natural_pdf-0.1.1.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
52
- natural_pdf-0.1.1.dist-info/RECORD,,
48
+ natural_pdf-0.1.2.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
49
+ natural_pdf-0.1.2.dist-info/METADATA,sha256=NQQGLJQVgbbxkyj4UZW-wkmdQLfDGzu7U-UswwiojGU,4453
50
+ natural_pdf-0.1.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
51
+ natural_pdf-0.1.2.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
52
+ natural_pdf-0.1.2.dist-info/RECORD,,
@@ -1,295 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: natural-pdf
3
- Version: 0.1.1
4
- Summary: A more intuitive interface for working with PDFs
5
- Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/jsoma/natural-pdf
8
- Project-URL: Repository, https://github.com/jsoma/natural-pdf
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.7
12
- Description-Content-Type: text/markdown
13
- License-File: LICENSE
14
- Requires-Dist: pdfplumber>=0.7.0
15
- Requires-Dist: Pillow>=8.0.0
16
- Requires-Dist: colour>=0.1.5
17
- Requires-Dist: numpy>=1.20.0
18
- Requires-Dist: urllib3>=1.26.0
19
- Requires-Dist: torch>=2.0.0
20
- Requires-Dist: torchvision>=0.15.0
21
- Requires-Dist: transformers>=4.30.0
22
- Requires-Dist: huggingface_hub>=0.19.0
23
- Provides-Extra: interactive
24
- Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
25
- Provides-Extra: easyocr
26
- Requires-Dist: easyocr; extra == "easyocr"
27
- Provides-Extra: paddle
28
- Requires-Dist: paddlepaddle; extra == "paddle"
29
- Requires-Dist: paddleocr; extra == "paddle"
30
- Provides-Extra: layout-yolo
31
- Requires-Dist: doclayout_yolo; extra == "layout-yolo"
32
- Provides-Extra: surya
33
- Requires-Dist: surya-ocr; extra == "surya"
34
- Provides-Extra: qa
35
- Provides-Extra: all
36
- Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "all"
37
- Requires-Dist: easyocr; extra == "all"
38
- Requires-Dist: paddlepaddle; extra == "all"
39
- Requires-Dist: paddleocr; extra == "all"
40
- Requires-Dist: doclayout_yolo; extra == "all"
41
- Requires-Dist: surya-ocr; extra == "all"
42
- Dynamic: license-file
43
-
44
- # Natural PDF
45
-
46
- A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
47
-
48
- Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
49
-
50
- - [Complete documentation here](https://jsoma.github.io/natural-pdf)
51
- - [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
52
-
53
- ## Features
54
-
55
- - **Fluent API** for chaining operations
56
- - **CSS-like selectors** for finding elements
57
- - **Spatial navigation** with intuitive methods like `above()`, `below()`, and `select_until()`
58
- - **Element collections** for batch operations
59
- - **Visual highlighting** for debugging (persistent highlights)
60
- - **Interactive element viewer** for Jupyter environments (`.viewer()`)
61
- - **Region visualization** with direct image extraction of specific regions
62
- - **Text style analysis** for document structure
63
- - **Exclusion zones** for headers, footers, and other areas to ignore
64
- - **OCR integration** with multiple engines (EasyOCR, PaddleOCR, Surya)
65
- - **Document layout analysis** for detecting document structure with ML models
66
- - **Table extraction** with multiple detection methods
67
- - **Structured logging** with configurable levels and handlers
68
-
69
- ## Installation
70
-
71
- ```bash
72
- pip install natural-pdf
73
- ```
74
-
75
- # Installs the core library along with required AI dependencies (PyTorch, Transformers)
76
- ```bash
77
- # Install with support for specific OCR and layout engines
78
- pip install natural-pdf[easyocr]
79
- pip install natural-pdf[paddle]
80
- pip install natural-pdf[surya]
81
- pip install natural-pdf[layout_yolo]
82
-
83
- # Install with support for the interactive Jupyter widget
84
- pip install natural-pdf[interactive]
85
-
86
- # Just install everything
87
- pip install natural-pdf[all]
88
- ```
89
-
90
- ## Quick Start
91
-
92
- ```python
93
- from natural_pdf import PDF
94
-
95
- # Open a local PDF
96
- pdf = PDF('document.pdf')
97
-
98
- # Or open a PDF from a URL
99
- pdf = PDF('https://example.com/document.pdf')
100
-
101
- # Get the first page
102
- page = pdf.pages[0]
103
-
104
- # Find elements using CSS-like selectors
105
- heading = page.find('text:contains("Summary"):bold')
106
-
107
- # Extract content below the heading
108
- content = heading.below().extract_text()
109
- print(content)
110
-
111
- # Exclude headers and footers
112
- page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
113
- page.add_exclusion(page.find_all('line')[-1].below())
114
-
115
- # Extract clean text
116
- clean_text = page.extract_text()
117
- print(clean_text)
118
- ```
119
-
120
- - [Complete documentation here](https://jsoma.github.io/natural-pdf)
121
-
122
- ## Selectors
123
-
124
- The library supports CSS-like selectors for finding elements:
125
-
126
- ```python
127
- # Find text containing a specific string
128
- element = page.find('text:contains("Revenue")')
129
-
130
- # Find bold text with a specific font size
131
- headings = page.find_all('text[size>=12]:bold')
132
-
133
- # Find thick red lines
134
- lines = page.find_all('line[width>=2][color~=(1,0,0)]')
135
- ```
136
-
137
- ## Spatial Navigation
138
-
139
- Navigate through the document with intuitive spatial methods:
140
-
141
- ```python
142
- # Get content below a heading
143
- heading = page.find('text:contains("Introduction")')
144
- content = heading.below().extract_text()
145
-
146
- # Get content from one element to another
147
- start = page.find('text:contains("Start")')
148
- end = page.find('text:contains("End")')
149
- region = start.select_until(end)
150
- content = region.extract_text()
151
- ```
152
-
153
- ## Exclusion Zones
154
-
155
- Exclude headers, footers, or other areas from extraction:
156
-
157
- ```python
158
- # Page-level exclusion
159
- page.add_exclusion(page.find('text:contains("Page")').above())
160
- page.add_exclusion(page.find_all('line')[-1].below())
161
-
162
- # PDF-level exclusion with lambdas
163
- pdf.add_exclusion(
164
- lambda page: page.find('text:contains("Header")').above(),
165
- label="headers"
166
- )
167
-
168
- # Extract text with exclusions applied
169
- text = pdf.extract_text()
170
-
171
- # Extract from a specific region with exclusions
172
- summary = page.find('text:contains("Summary")')
173
- conclusion = page.find('text:contains("Conclusion")')
174
- region = page.create_region(summary.x0, summary.top, conclusion.x1, conclusion.bottom)
175
- region_text = region.extract_text(apply_exclusions=True) # Excludes headers/footers
176
-
177
- # Disable exclusions for a specific extraction
178
- full_text = page.extract_text(apply_exclusions=False)
179
- ```
180
-
181
- Exclusions work efficiently with different region types:
182
- - Regions without intersection with exclusion zones → exclusions ignored entirely
183
- - Rectangular regions with header/footer exclusions → optimized cropping
184
- - Complex regions with partial exclusions → advanced filtering with warning
185
-
186
- ## OCR Integration
187
-
188
- Extract text from scanned documents using OCR, with support for multiple engines ([EasyOCR](https://www.jaided.ai/easyocr/), [PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html), [Surya](https://github.com/VikParuchuri/surya)):
189
-
190
- ```python
191
- # Apply OCR using a specific engine (e.g., PaddleOCR)
192
- ocr_elements = page.apply_ocr(engine='paddle', languages=['en', 'zh-cn'])
193
-
194
- # Extract text (will use previously applied OCR results if available)
195
- text = page.extract_text()
196
-
197
- # Configure advanced engine options using Options classes
198
- from natural_pdf.ocr import PaddleOCROptions
199
- paddle_opts = PaddleOCROptions(languages=['en'], use_angle_cls=False, rec_batch_num=8)
200
- ocr_elements = page.apply_ocr(engine='paddle', options=paddle_opts)
201
-
202
- # Force OCR regardless of existing text
203
- ocr_text = page.extract_text(ocr=True)
204
-
205
- # Find OCR-detected text with high confidence
206
- high_confidence = page.find_all('text[source=ocr][confidence>=0.8]')
207
-
208
- # Visualize OCR results with color-coded confidence levels
209
- for elem in page.find_all('text[source=ocr]'):
210
- if elem.confidence >= 0.8:
211
- color = (0, 1, 0, 0.3) # Green for high confidence
212
- elif elem.confidence >= 0.5:
213
- color = (1, 1, 0, 0.3) # Yellow for medium confidence
214
- else:
215
- color = (1, 0, 0, 0.3) # Red for low confidence
216
-
217
- elem.highlight(color=color, label=f"OCR ({elem.confidence:.2f})")
218
- page.save_image('ocr_results.png', labels=True)
219
- ```
220
-
221
- ## Logging
222
-
223
- The library includes a structured logging system to provide visibility into its operations:
224
-
225
- ```python
226
- import logging
227
- from natural_pdf import PDF, configure_logging
228
-
229
- # Configure logging with INFO level to console
230
- configure_logging(level=logging.INFO)
231
-
232
- # Or log to a file with DEBUG level
233
- file_handler = logging.FileHandler("natural_pdf.log")
234
- file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
235
- configure_logging(level=logging.DEBUG, handler=file_handler)
236
-
237
- # Now operations will generate logs
238
- pdf = PDF("document.pdf")
239
- # Log: natural_pdf.core.pdf - INFO - Initializing PDF from document.pdf
240
-
241
- # Run layout detection with verbose logging
242
- regions = pdf.pages[0].analyze_layout(
243
- model="paddle",
244
- model_params={"verbose": True}
245
- )
246
- # Log: natural_pdf.analyzers.layout.paddle - INFO - Starting PaddleLayout detection...
247
- # Log: natural_pdf.analyzers.layout.paddle - DEBUG - Parameters: confidence=0.2...
248
- ```
249
-
250
- Logs follow a hierarchical structure matching the library's module organization:
251
- - `natural_pdf.core` - Core PDF operations
252
- - `natural_pdf.analyzers` - Layout analysis operations
253
- - `natural_pdf.ocr` - OCR engine operations
254
-
255
- ## Document QA
256
-
257
- Ask questions directly to your documents:
258
-
259
- ```python
260
- # Ask questions about the document content
261
- result = pdf.ask("What was the company's revenue in 2022?")
262
- print(f"Answer: {result['answer']}")
263
- print(f"Confidence: {result['confidence']:.2f}")
264
-
265
- # Access more details in the result dictionary
266
- result = pdf.ask("Who is the CEO?")
267
- print(f"Answer: {result['answer']}")
268
- print(f"Found on page: {result['page_num']}")
269
- print(f"Source text: {result.get('source_text', 'N/A')}")
270
- ```
271
-
272
- ## More details
273
-
274
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
275
-
276
- ## Visual Debugging & Interactive Viewer
277
-
278
- Use highlighting to understand element selection and analysis results. Add persistent highlights using `.highlight()` and view them with the interactive `.viewer()` or static `.save_image()`. You can also generate temporary previews of selected elements using `ElementCollection.show()`.
279
-
280
- ```python
281
- # Highlight selected elements persistently
282
- page.find_all('text:bold').highlight(label="Bold Text")
283
-
284
- # Launch the interactive widget in Jupyter (shows persistent highlights)
285
- # Requires: pip install natural-pdf[interactive]
286
- page.viewer()
287
-
288
- # Save a static image file with highlights and legend
289
- page.save_image("highlighted_page.png", labels=True)
290
-
291
- # Show a temporary preview image of specific elements, grouped by attribute
292
- preview_image = page.find_all('region[type*=table]').show(group_by='type')
293
- # In Jupyter, this image will display automatically
294
- preview_image
295
- ```