PyPI - natural-pdf - Versions diffs - 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl - Mend

natural-pdf 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
natural_pdf/classification/manager.py +2 -3
natural_pdf/collections/pdf_collection.py +19 -39
natural_pdf/core/highlighting_service.py +29 -38
natural_pdf/core/page.py +283 -186
natural_pdf/core/pdf.py +4 -4
natural_pdf/elements/base.py +34 -0
natural_pdf/elements/collections.py +160 -9
natural_pdf/elements/line.py +5 -0
natural_pdf/elements/region.py +353 -12
natural_pdf/exporters/paddleocr.py +51 -11
natural_pdf/flows/__init__.py +12 -0
natural_pdf/flows/collections.py +533 -0
natural_pdf/flows/element.py +382 -0
natural_pdf/flows/flow.py +216 -0
natural_pdf/flows/region.py +458 -0
natural_pdf/selectors/parser.py +163 -8
{natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +2 -1
{natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +22 -17
{natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
natural_pdf/utils/tqdm_utils.py +0 -51
{natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0

natural_pdf/classification/manager.py CHANGED Viewed

@@ -25,7 +25,7 @@ except ImportError:
     AutoModelForSequenceClassification = object
     torch = None
-from natural_pdf.utils.tqdm_utils import get_tqdm
+from tqdm.auto import tqdm
 # Import result classes
 from .results import CategoryScore, ClassificationResult
@@ -343,8 +343,7 @@ class ClassificationManager:
             total_items = len(item_contents)
             if progress_bar:
                 # Get the appropriate tqdm class
-                tqdm_class = get_tqdm()
-                results_iterator = tqdm_class(
+                results_iterator = tqdm(
                     results_iterator,
                     total=total_items,
                     desc=f"Classifying batch ({model_id})",

natural_pdf/collections/pdf_collection.py CHANGED Viewed

@@ -25,14 +25,12 @@ from typing import (
 )
 from PIL import Image
-from tqdm import tqdm
-from tqdm.auto import tqdm as auto_tqdm
-from tqdm.notebook import tqdm as notebook_tqdm
+from tqdm.auto import tqdm
-from natural_pdf.utils.tqdm_utils import get_tqdm
+from natural_pdf.exporters.base import FinetuneExporter
-# Get the appropriate tqdm class once
-tqdm = get_tqdm()
+# Need to import this utility
+from natural_pdf.utils.identifiers import generate_short_path_hash
 # Set up logger early
 # Configure logging to include thread information
@@ -67,8 +65,10 @@ except ImportError as e:
 from natural_pdf.collections.mixins import ApplyMixin
 from natural_pdf.search.searchable_mixin import SearchableMixin  # Import the new mixin
+from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
-class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixin
+class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin):  # Add ExportMixin and ShapeDetectionMixin
     def __init__(
         self,
         source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -119,16 +119,8 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixi
     @staticmethod
     def _get_pdf_class():
         """Helper method to dynamically import the PDF class."""
-        try:
-            # Import needs to resolve path correctly
-            from natural_pdf.core.pdf import PDF
-            return PDF
-        except ImportError as e:
-            logger.error(
-                "Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime."
-            )
-            raise ImportError("PDF class is required but could not be imported.") from e
+        from natural_pdf.core.pdf import PDF
+        return PDF
     # --- Internal Helpers ---
@@ -141,16 +133,13 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixi
     def _execute_glob(self, pattern: str) -> Set[str]:
         """Glob for paths and return a set of valid PDF paths."""
         found_paths = set()
-        try:
-            # Use iglob for potentially large directories/matches
-            paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
-            for path_str in paths_iter:
-                # Use Path object for easier checking
-                p = Path(path_str)
-                if p.is_file() and p.suffix.lower() == ".pdf":
-                    found_paths.add(str(p.resolve()))  # Store resolved absolute path
-        except Exception as e:
-            logger.error(f"Error processing glob pattern '{pattern}': {e}")
+        # Use iglob for potentially large directories/matches
+        paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
+        for path_str in paths_iter:
+            # Use Path object for easier checking
+            p = Path(path_str)
+            if p.is_file() and p.suffix.lower() == ".pdf":
+                found_paths.add(str(p.resolve()))  # Store resolved absolute path
         return found_paths
     def _resolve_sources_to_paths(self, source: Union[str, Iterable[str]]) -> List[str]:
@@ -534,19 +523,10 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixi
             **kwargs: Additional arguments passed to create_correction_task_package
                       (e.g., image_render_scale, overwrite).
         """
-        try:
-            from natural_pdf.utils.packaging import create_correction_task_package
+        from natural_pdf.utils.packaging import create_correction_task_package
-            # Pass the collection itself (self) as the source
-            create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
-        except ImportError:
-            logger.error(
-                "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
-            )
-            # Or raise
-        except Exception as e:
-            logger.error(f"Failed to export correction task for collection: {e}", exc_info=True)
-            raise  # Re-raise the exception from the utility function
+        # Pass the collection itself (self) as the source
+        create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
     # --- Mixin Required Implementation ---
     def get_indexable_items(self) -> Iterable[Indexable]:

natural_pdf/core/highlighting_service.py CHANGED Viewed

@@ -215,21 +215,14 @@ class HighlightRenderer:
     def _render_ocr_text(self):
         """Renders OCR text onto the image. (Adapted from old HighlightManager)"""
         # Use the page reference to get OCR elements
-        try:
-            # Try finding first, then extracting if necessary
-            ocr_elements = self.page.find_all("text[source=ocr]")
-            if not ocr_elements:
-                # Don't run full OCR here, just extract if already run
-                ocr_elements = [
-                    el for el in self.page.words if getattr(el, "source", None) == "ocr"
-                ]
-                # Alternative: self.page.extract_ocr_elements() - but might be slow
-        except Exception as e:
-            logger.warning(
-                f"Could not get OCR elements for page {self.page.number}: {e}", exc_info=True
-            )
-            return  # Don't modify image if OCR elements aren't available
+        # Try finding first, then extracting if necessary
+        ocr_elements = self.page.find_all("text[source=ocr]")
+        if not ocr_elements:
+            # Don't run full OCR here, just extract if already run
+            ocr_elements = [
+                el for el in self.page.words if getattr(el, "source", None) == "ocr"
+            ]
+            # Alternative: self.page.extract_ocr_elements() - but might be slow
         if not ocr_elements:
             logger.debug(f"No OCR elements found for page {self.page.number} to render.")
@@ -293,20 +286,15 @@ class HighlightRenderer:
             )
             # Calculate text position (centered vertically, slightly offset from left)
-            try:
-                if hasattr(sized_font, "getbbox"):  # Modern PIL
-                    _, text_top_offset, _, text_bottom_offset = sized_font.getbbox(element.text)
-                    text_h = text_bottom_offset - text_top_offset
-                else:  # Older PIL approximation
-                    text_h = font_size
-                text_y = top_s + (box_h - text_h) / 2
-                # Adjust for vertical offset in some fonts
-                text_y -= text_top_offset if hasattr(sized_font, "getbbox") else 0
-                text_x = x0_s + padding  # Start near left edge with padding
-            except Exception:
-                # Fallback positioning
-                text_x, text_y = x0_s + padding, top_s + padding
+            if hasattr(sized_font, "getbbox"):  # Modern PIL
+                _, text_top_offset, _, text_bottom_offset = sized_font.getbbox(element.text)
+                text_h = text_bottom_offset - text_top_offset
+            else:  # Older PIL approximation
+                text_h = font_size
+            text_y = top_s + (box_h - text_h) / 2
+            # Adjust for vertical offset in some fonts
+            text_y -= text_top_offset if hasattr(sized_font, "getbbox") else 0
+            text_x = x0_s + padding  # Start near left edge with padding
             draw.text((text_x, text_y), element.text, fill=(0, 0, 0, 255), font=sized_font)
@@ -392,9 +380,6 @@ class HighlightingService:
             except ValueError:
                 logger.warning(f"Invalid color string: '{color_input}'")
                 return None
-            except Exception as e:
-                logger.error(f"Error processing color string '{color_input}': {e}")
-                return None
         else:
             logger.warning(f"Invalid color input type: {type(color_input)}")
             return None
@@ -677,9 +662,12 @@ class HighlightingService:
                     actual_scale_y = scale # Fallback
                 logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}")
-        except Exception as e:
-            logger.error(f"Error creating base image for page {page_index}: {e}", exc_info=True)
-            return None
+        except IOError as e:
+            logger.error(f"IOError creating base image for page {page_index}: {e}")
+            raise
+        except AttributeError as e:
+            logger.error(f"AttributeError creating base image for page {page_index}: {e}")
+            raise
         renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
@@ -865,8 +853,11 @@ class HighlightingService:
             else:
                 final_image = rendered_image
-        except Exception as e:
-            logger.error(f"Error rendering preview for page {page_index}: {e}", exc_info=True)
-            return None
+        except IOError as e:
+            logger.error(f"IOError rendering preview for page {page_index}: {e}")
+            raise
+        except AttributeError as e:
+            logger.error(f"AttributeError rendering preview for page {page_index}: {e}")
+            raise
         return final_image

natural-pdf 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl

natural-pdf 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl