natural-pdf 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/classification/manager.py +2 -3
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/highlighting_service.py +29 -38
- natural_pdf/core/page.py +283 -186
- natural_pdf/core/pdf.py +4 -4
- natural_pdf/elements/base.py +34 -0
- natural_pdf/elements/collections.py +160 -9
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +353 -12
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/selectors/parser.py +163 -8
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +22 -17
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -74,6 +74,11 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveV
 # --- End Classification Imports --- #


+# --- Shape Detection Mixin --- #
+from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
+# --- End Shape Detection Mixin --- #
+
+
 try:
     from deskew import determine_skew

@@ -86,7 +91,7 @@ except ImportError:
 logger = logging.getLogger(__name__)


-class Page(ClassificationMixin, ExtractionMixin):
+class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
     """
     Enhanced Page wrapper built on top of pdfplumber.Page.

@@ -161,6 +166,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         self._layout_analyzer = None

         self._load_elements()
+        self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}

     @property
     def pdf(self) -> "PDF":
@@ -649,7 +655,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         Exclusions are now handled by the calling methods (find, find_all) if requested.

         Args:
-            selector_obj: Parsed selector dictionary
+            selector_obj: Parsed selector dictionary (single or compound OR selector)
             **kwargs: Additional filter parameters including 'regex' and 'case'

         Returns:
@@ -657,6 +663,30 @@
        """
        from natural_pdf.selectors.parser import selector_to_filter_func

+        # Handle compound OR selectors
+        if selector_obj.get("type") == "or":
+            # For OR selectors, search all elements and let the filter function decide
+            elements_to_search = self._element_mgr.get_all_elements()
+
+            # Create filter function from compound selector
+            filter_func = selector_to_filter_func(selector_obj, **kwargs)
+
+            # Apply the filter to all elements
+            matching_elements = [element for element in elements_to_search if filter_func(element)]
+
+            # Sort elements in reading order if requested
+            if kwargs.get("reading_order", True):
+                if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
+                    matching_elements.sort(key=lambda el: (el.top, el.x0))
+                else:
+                    logger.warning(
+                        "Cannot sort elements in reading order: Missing required attributes (top, x0)."
+                    )
+
+            # Return result collection
+            return ElementCollection(matching_elements)
+
+        # Handle single selectors (existing logic)
        # Get element type to filter
        element_type = selector_obj.get("type", "any").lower()

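The new branch above fires when the parsed selector dictionary has type "or": it builds one filter function for the whole compound selector, applies it to every element on the page, and sorts matches into reading order by (top, x0). The sketch below is not part of the package; the element class and the "selectors" key are illustrative assumptions standing in for the real parser output (natural_pdf/selectors/parser.py, also updated in this release).

from dataclasses import dataclass
from typing import Callable, Dict

@dataclass
class FakeElement:  # stand-in for a natural-pdf element
    type: str
    top: float
    x0: float

def make_or_filter(selector_obj: Dict) -> Callable[[FakeElement], bool]:
    # An element matches if it satisfies any sub-selector (here: a bare type check).
    wanted = {sub["type"] for sub in selector_obj["selectors"]}
    return lambda el: el.type in wanted

selector_obj = {"type": "or", "selectors": [{"type": "text"}, {"type": "line"}]}
elements = [FakeElement("rect", 10, 5), FakeElement("text", 10, 0), FakeElement("line", 2, 0)]

filter_func = make_or_filter(selector_obj)
matching = [el for el in elements if filter_func(el)]
matching.sort(key=lambda el: (el.top, el.x0))  # reading order, as in the new code
print([el.type for el in matching])            # ['line', 'text']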
@@ -1411,114 +1441,171 @@
         Returns:
             PIL Image of the page, or None if rendering fails.
         """
-        [old lines 1414-1420 not shown in the diff view]
+        # 1. Create cache key (excluding path)
+        cache_key_parts = [
+            scale,
+            width,
+            labels,
+            legend_position,
+            render_ocr,
+            resolution,
+            include_highlights,
+            exclusions,
+        ]
+        # Convert kwargs to a stable, hashable representation
+        sorted_kwargs_list = []
+        for k, v in sorted(kwargs.items()):
+            if isinstance(v, list):
+                try:
+                    v = tuple(v)  # Convert lists to tuples
+                except TypeError:  # pragma: no cover
+                    # If list contains unhashable items, fall back to repr or skip
+                    # For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
+                    logger.warning(f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements.")
+            sorted_kwargs_list.append((k, v))
+
+        cache_key_parts.append(tuple(sorted_kwargs_list))
+
         try:
-            [old lines 1422-1425 not shown in the diff view]
-                f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
-            )
-            if include_highlights:
-                # Delegate rendering to the central service
-                image = self._highlighter.render_page(
-                    page_index=self.index,
-                    scale=scale,
-                    labels=labels,
-                    legend_position=legend_position,
-                    render_ocr=render_ocr,
-                    resolution=render_resolution,  # Pass the calculated resolution
-                    **kwargs,
-                )
-            else:
-                image = render_plain_page(self, render_resolution)
-        except Exception as e:
-            logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
-            return None  # Return None on error
-        finally:
-            render_end_time = time.monotonic()
-            logger.debug(
-                f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
-            )
+            cache_key = tuple(cache_key_parts)
+        except TypeError as e:  # pragma: no cover
+            logger.warning(f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call.")
+            cache_key = None  # Fallback to not using cache for this call

-        [old line 1450 not shown in the diff view]
-        return None
+        image_to_return: Optional[Image.Image] = None

-        #
-        if
+        # 2. Check cache
+        if cache_key is not None and cache_key in self._to_image_cache:
+            image_to_return = self._to_image_cache[cache_key]
+            logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
+        else:
+            # --- This is the original logic to generate the image ---
+            rendered_image_component: Optional[Image.Image] = None  # Renamed from 'image' in original
+            render_resolution = resolution if resolution is not None else scale * 72
+            thread_id = threading.current_thread().name
+            logger.debug(
+                f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
+            )
+            lock_wait_start = time.monotonic()
             try:
-                #
-                [old lines 1457-1471 not shown in the diff view]
-                img_top = region.top * img_scale
-                img_x1 = region.x1 * img_scale
-                img_bottom = region.bottom * img_scale
-
-                # Draw a white rectangle over the excluded area
-                # Ensure coordinates are within image bounds (though region should be)
-                img_coords = (
-                    max(0, img_x0),
-                    max(0, img_top),
-                    min(image.width, img_x1),
-                    min(image.height, img_bottom),
+                # Acquire the global PDF rendering lock
+                with pdf_render_lock:
+                    lock_acquired_time = time.monotonic()
+                    logger.debug(
+                        f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
+                    )
+                    if include_highlights:
+                        # Delegate rendering to the central service
+                        rendered_image_component = self._highlighter.render_page(
+                            page_index=self.index,
+                            scale=scale,
+                            labels=labels,
+                            legend_position=legend_position,
+                            render_ocr=render_ocr,
+                            resolution=render_resolution,  # Pass the calculated resolution
+                            **kwargs,
                         )
-                [old lines 1484-1492 not shown in the diff view]
-                logger.error(
-                    f"Error applying exclusion mask to page {self.index}: {mask_error}",
-                    exc_info=True,
+                    else:
+                        rendered_image_component = render_plain_page(self, render_resolution)
+            except Exception as e:
+                logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
+                # rendered_image_component remains None
+            finally:
+                render_end_time = time.monotonic()
+                logger.debug(
+                    f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
                 )
-            # Decide if you want to return None or continue without mask
-            # For now, continue without mask

-            [old lines 1500-1512 not shown in the diff view]
+            if rendered_image_component is None:
+                if cache_key is not None:
+                    self._to_image_cache[cache_key] = None  # Cache the failure
+                # Save the image if path is provided (will try to save None, handled by PIL/OS)
+                if path:
+                    try:
+                        if os.path.dirname(path):
+                            os.makedirs(os.path.dirname(path), exist_ok=True)
+                        if rendered_image_component is not None:  # Should be None here
+                            rendered_image_component.save(path)  # This line won't be hit if None
+                        # else: logger.debug("Not saving None image") # Not strictly needed
+                    except Exception as save_error:  # pragma: no cover
+                        logger.error(f"Failed to save image to {path}: {save_error}")
+                return None
+
+            # --- Apply exclusion masking if requested ---
+            # This modifies 'rendered_image_component'
+            image_after_masking = rendered_image_component  # Start with the rendered image
+            if exclusions == "mask" and self._exclusions:
+                try:
+                    # Ensure image is mutable (RGB or RGBA)
+                    if image_after_masking.mode not in ("RGB", "RGBA"):
+                        image_after_masking = image_after_masking.convert("RGB")
+
+                    exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
+                    if exclusion_regions:
+                        draw = ImageDraw.Draw(image_after_masking)
+                        # Calculate the scaling factor used for the image
+                        img_scale = render_resolution / 72.0
+
+                        for region in exclusion_regions:
+                            # Convert PDF points (x0, top, x1, bottom) to image pixels
+                            img_x0 = region.x0 * img_scale
+                            img_top = region.top * img_scale
+                            img_x1 = region.x1 * img_scale
+                            img_bottom = region.bottom * img_scale
+
+                            # Draw a white rectangle over the excluded area
+                            img_coords = (
+                                max(0, img_x0),
+                                max(0, img_top),
+                                min(image_after_masking.width, img_x1),
+                                min(image_after_masking.height, img_bottom),
+                            )
+                            if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
+                                draw.rectangle(img_coords, fill="white")
+                            else:  # pragma: no cover
+                                logger.warning(
+                                    f"Skipping invalid exclusion rect for masking: {img_coords}"
+                                )
+                        del draw  # Release drawing context
+                except Exception as mask_error:  # pragma: no cover
+                    logger.error(
+                        f"Error applying exclusion mask to page {self.index}: {mask_error}",
+                        exc_info=True,
+                    )
+                    # Continue with potentially unmasked or partially masked image
+
+            # --- Resize the final image if width is provided ---
+            image_final_content = image_after_masking  # Start with image after masking
+            if width is not None and width > 0 and image_final_content.width > 0:
+                aspect_ratio = image_final_content.height / image_final_content.width
+                height = int(width * aspect_ratio)
+                try:
+                    image_final_content = image_final_content.resize(
+                        (width, height), Image.Resampling.LANCZOS
+                    )
+                except Exception as resize_error:  # pragma: no cover
+                    logger.warning(f"Could not resize image: {resize_error}")
+                    # image_final_content remains the un-resized version if resize fails
+
+            # Store in cache
+            if cache_key is not None:
+                self._to_image_cache[cache_key] = image_final_content
+                logger.debug(f"Page {self.index}: Cached image for key: {cache_key}")
+            image_to_return = image_final_content
+            # --- End of cache miss block ---
+
+        # Save the image (either from cache or newly generated) if path is provided
+        if path and image_to_return:
             try:
                 # Ensure directory exists
-                os.
-                [old line 1516 not shown in the diff view]
+                if os.path.dirname(path):  # Only call makedirs if there's a directory part
+                    os.makedirs(os.path.dirname(path), exist_ok=True)
+                image_to_return.save(path)
                 logger.debug(f"Saved page image to: {path}")
-            except Exception as save_error:
+            except Exception as save_error:  # pragma: no cover
                 logger.error(f"Failed to save image to {path}: {save_error}")

-        return
+        return image_to_return

     def _create_text_elements_from_ocr(
         self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
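The net effect of the rewrite above: the render arguments (minus path) become a hashable cache key, and a repeat call with the same arguments returns the cached PIL image instead of re-rendering under the lock. A hedged usage sketch, not part of the package diff; "sample.pdf" and "page0.png" are placeholder paths.

from natural_pdf import PDF

pdf = PDF("sample.pdf")
page = pdf.pages[0]

img_a = page.to_image(resolution=144, include_highlights=False)
img_b = page.to_image(resolution=144, include_highlights=False)
assert img_a is img_b  # second call should be served from page._to_image_cache

# "path" is excluded from the cache key, so saving reuses the cached render
img_c = page.to_image(resolution=144, include_highlights=False, path="page0.png")
assert img_c is img_a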
@@ -1986,7 +2073,6 @@ class Page(ClassificationMixin, ExtractionMixin):
             region.is_end_next_start = False
             regions.append(region)

-        # Return the list wrapped in an ElementCollection
         return ElementCollection(regions)

     def __repr__(self) -> str:
@@ -2213,6 +2299,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
+        selector: Optional[str] = "text[source=ocr]",
         max_workers: Optional[int] = None,
         progress_callback: Optional[Callable[[], None]] = None,  # Added progress callback
     ) -> "Page":  # Return self for chaining
@@ -2240,7 +2327,7 @@
        )

        target_elements_collection = self.find_all(
-            selector=
+            selector=selector, apply_exclusions=False
        )
        target_elements = target_elements_collection.elements  # Get the list

@@ -2248,102 +2335,112 @@
            logger.info(f"Page {self.number}: No OCR elements found to correct.")
            return self

-        [old lines 2251-2253 not shown in the diff view]
+        element_pbar = None
+        try:
+            element_pbar = tqdm(total=len(target_elements), desc=f"Correcting OCR Page {self.number}", unit="element", leave=False)

-        [old lines 2255-2257 not shown in the diff view]
-            current_text = getattr(element, "text", None)
-            # Call the user-provided callback
-            corrected_text = correction_callback(element)
+            processed_count = 0
+            updated_count = 0
+            error_count = 0

-        [old lines 2262-2267 not shown in the diff view]
+            # Define the task to be run by the worker thread or sequentially
+            def _process_element_task(element):
+                try:
+                    current_text = getattr(element, "text", None)
+                    # Call the user-provided callback
+                    corrected_text = correction_callback(element)

-        [old lines 2269-2272 not shown in the diff view]
-                exc_info=False,  # Keep log concise
-            )
-            return element, None, e  # Return element, no result, error
-        finally:
-            # --- Call progress callback here --- #
-            if progress_callback:
-                try:
-                    progress_callback()
-                except Exception as cb_e:
-                    # Log error in callback itself, but don't stop processing
-                    logger.error(
-                        f"Page {self.number}: Error executing progress_callback: {cb_e}",
-                        exc_info=False,
+                    # Validate result type
+                    if corrected_text is not None and not isinstance(corrected_text, str):
+                        logger.warning(
+                            f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
                        )
+                        return element, None, None  # Treat as no correction

-        [old lines 2288-2300 not shown in the diff view]
+                    return element, corrected_text, None  # Return element, result, no error
+                except Exception as e:
+                    logger.error(
+                        f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
+                        exc_info=False,  # Keep log concise
+                    )
+                    return element, None, e  # Return element, no result, error
+                finally:
+                    # --- Update internal tqdm progress bar ---
+                    if element_pbar:
+                        element_pbar.update(1)
+                    # --- Call user's progress callback --- #
+                    if progress_callback:
+                        try:
+                            progress_callback()
+                        except Exception as cb_e:
+                            # Log error in callback itself, but don't stop processing
+                            logger.error(
+                                f"Page {self.number}: Error executing progress_callback: {cb_e}",
+                                exc_info=False,
+                            )

-        [old lines 2302-2307 not shown in the diff view]
+            # Choose execution strategy based on max_workers
+            if max_workers is not None and max_workers > 1:
+                # --- Parallel execution --- #
+                logger.info(
+                    f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
+                )
+                futures = []
+                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                    # Submit all tasks
+                    future_to_element = {
+                        executor.submit(_process_element_task, element): element
+                        for element in target_elements
+                    }
+
+                    # Process results as they complete (progress_callback called by worker)
+                    for future in concurrent.futures.as_completed(future_to_element):
+                        processed_count += 1
+                        try:
+                            element, corrected_text, error = future.result()
+                            if error:
+                                error_count += 1
+                                # Error already logged in worker
+                            elif corrected_text is not None:
+                                # Apply correction if text changed
+                                current_text = getattr(element, "text", None)
+                                if corrected_text != current_text:
+                                    element.text = corrected_text
+                                    updated_count += 1
+                        except Exception as exc:
+                            # Catch errors from future.result() itself
+                            element = future_to_element[future]  # Find original element
+                            logger.error(
+                                f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
+                                exc_info=True,
+                            )
                            error_count += 1
-            #
-            elif corrected_text is not None:
-                # Apply correction if text changed
-                current_text = getattr(element, "text", None)
-                if corrected_text != current_text:
-                    element.text = corrected_text
-                    updated_count += 1
-        except Exception as exc:
-            # Catch errors from future.result() itself
-            element = future_to_element[future]  # Find original element
-            logger.error(
-                f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
-                exc_info=True,
-            )
-            error_count += 1
-            # Note: progress_callback was already called in the worker's finally block
+                            # Note: progress_callback was already called in the worker's finally block

-        [old lines 2326-2340 not shown in the diff view]
+            else:
+                # --- Sequential execution --- #
+                logger.info(f"Page {self.number}: Running OCR correction sequentially.")
+                for element in target_elements:
+                    # Call the task function directly (it handles progress_callback)
+                    processed_count += 1
+                    _element, corrected_text, error = _process_element_task(element)
+                    if error:
+                        error_count += 1
+                    elif corrected_text is not None:
+                        # Apply correction if text changed
+                        current_text = getattr(_element, "text", None)
+                        if corrected_text != current_text:
+                            _element.text = corrected_text
+                            updated_count += 1

-        [old lines 2342-2344 not shown in the diff view]
+            logger.info(
+                f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
+            )

-        [old line 2346 not shown in the diff view]
+            return self  # Return self for chaining
+        finally:
+            if element_pbar:
+                element_pbar.close()

     # --- Classification Mixin Implementation --- #
     def _get_classification_manager(self) -> "ClassificationManager":
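Taken together, correct_ocr() now accepts a selector to scope which elements are corrected (defaulting to "text[source=ocr]") and runs the correction callback in a thread pool when max_workers is greater than 1. A hedged usage sketch, not part of the package diff; the callback is purely illustrative, "scanned.pdf" is a placeholder, and the page is assumed to already hold OCR-derived text elements.

from natural_pdf import PDF

page = PDF("scanned.pdf").pages[0]

def fix_pipes(element):
    # Return the corrected string, or None to leave the element unchanged.
    text = element.text or ""
    fixed = text.replace("|", "I")
    return fixed if fixed != text else None

page.correct_ocr(
    fix_pipes,
    selector="text[source=ocr]",  # the new default; any selector string works here
    max_workers=4,                # >1 switches to the ThreadPoolExecutor branch
)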
natural_pdf/core/pdf.py
CHANGED
@@ -38,7 +38,7 @@ from natural_pdf.extraction.mixin import ExtractionMixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
 from natural_pdf.utils.locks import pdf_render_lock
-from
+from tqdm.auto import tqdm

 try:
     from typing import Any as TypingAny
@@ -71,7 +71,6 @@ except ImportError:
     create_original_pdf = None

 logger = logging.getLogger("natural_pdf.core.pdf")
-tqdm = get_tqdm()

 DEFAULT_MANAGERS = {
     "classification": ClassificationManager,
@@ -1253,6 +1252,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self,
         pages: Optional[Union[Iterable[int], range, slice]] = None,
         resolution: int = 300,
+        angle: Optional[float] = None,
         detection_resolution: int = 72,
         force_overwrite: bool = False,
         **deskew_kwargs,
@@ -1271,6 +1271,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         Args:
             pages: Page indices/slice to include (0-based). If None, processes all pages.
             resolution: DPI resolution for rendering the output deskewed pages.
+            angle: The specific angle (in degrees) to rotate by. If None, detects automatically.
             detection_resolution: DPI resolution used for skew detection if angles are not
                 already cached on the page objects.
             force_overwrite: If False (default), raises a ValueError if any target page
@@ -1315,14 +1316,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         deskewed_images_bytes = []
         logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")

-        # Use tqdm via get_tqdm
         for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
             try:
                 # Use page.deskew to get the corrected PIL image
                 # Pass down resolutions and kwargs
                 deskewed_img = page.deskew(
                     resolution=resolution,
-                    angle=
+                    angle=angle,  # Let page.deskew handle detection/caching
                     detection_resolution=detection_resolution,
                     **deskew_kwargs,
                 )
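The new angle argument is passed straight through to page.deskew, so a caller can force a known rotation instead of re-running detection (automatic detection still relies on the optional deskew dependency). A hedged sketch, not part of the package diff; "scanned.pdf" and the -1.5 degree value are placeholders.

from natural_pdf import PDF

page = PDF("scanned.pdf").pages[0]
img_auto = page.deskew(resolution=300)               # detect the skew angle automatically
img_fixed = page.deskew(resolution=300, angle=-1.5)  # apply a known -1.5 degree correction
img_fixed.save("page0_deskewed.png")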
natural_pdf/elements/base.py
CHANGED
@@ -15,6 +15,40 @@ if TYPE_CHECKING:
     from natural_pdf.elements.region import Region


+def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
+    """
+    Extract bounding box coordinates from any object that has bbox properties.
+
+    Args:
+        obj: Object that might have bbox coordinates (Element, Region, etc.)
+
+    Returns:
+        Tuple of (x0, top, x1, bottom) or None if object doesn't have bbox properties
+    """
+    # Try bbox property first (most common)
+    if hasattr(obj, 'bbox') and obj.bbox is not None:
+        bbox = obj.bbox
+        if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
+            return tuple(float(coord) for coord in bbox)
+
+    # Try individual coordinate properties
+    if all(hasattr(obj, attr) for attr in ['x0', 'top', 'x1', 'bottom']):
+        try:
+            return (float(obj.x0), float(obj.top), float(obj.x1), float(obj.bottom))
+        except (ValueError, TypeError):
+            pass
+
+    # If object is a dict with bbox keys
+    if isinstance(obj, dict):
+        if all(key in obj for key in ['x0', 'top', 'x1', 'bottom']):
+            try:
+                return (float(obj['x0']), float(obj['top']), float(obj['x1']), float(obj['bottom']))
+            except (ValueError, TypeError):
+                pass
+
+    return None
+
+
 class DirectionalMixin:
     """
     Mixin class providing directional methods for both Element and Region classes.
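Because the new helper accepts an object exposing a bbox tuple, an object exposing x0/top/x1/bottom attributes, or a plain dict with those keys, it normalizes mixed inputs to one float tuple. A short sketch (not part of the package diff):

from types import SimpleNamespace
from natural_pdf.elements.base import extract_bbox

with_bbox = SimpleNamespace(bbox=(10, 20, 110, 40))
with_attrs = SimpleNamespace(x0=10, top=20, x1=110, bottom=40)
as_dict = {"x0": 10, "top": 20, "x1": 110, "bottom": 40}

print(extract_bbox(with_bbox))   # (10.0, 20.0, 110.0, 40.0)
print(extract_bbox(with_attrs))  # (10.0, 20.0, 110.0, 40.0)
print(extract_bbox(as_dict))     # (10.0, 20.0, 110.0, 40.0)
print(extract_bbox("no bbox"))   # None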
|