PyPI - natural-pdf - Versions diffs - 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl - Mend

natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

natural_pdf/__init__.py +7 -2
natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
natural_pdf/analyzers/text_options.py +9 -1
natural_pdf/analyzers/text_structure.py +371 -58
natural_pdf/classification/manager.py +3 -4
natural_pdf/collections/pdf_collection.py +19 -39
natural_pdf/core/element_manager.py +11 -1
natural_pdf/core/highlighting_service.py +146 -75
natural_pdf/core/page.py +287 -188
natural_pdf/core/pdf.py +57 -42
natural_pdf/elements/base.py +51 -0
natural_pdf/elements/collections.py +362 -67
natural_pdf/elements/line.py +5 -0
natural_pdf/elements/region.py +396 -23
natural_pdf/exporters/data/__init__.py +0 -0
natural_pdf/exporters/data/pdf.ttf +0 -0
natural_pdf/exporters/data/sRGB.icc +0 -0
natural_pdf/exporters/hocr.py +40 -61
natural_pdf/exporters/hocr_font.py +7 -13
natural_pdf/exporters/original_pdf.py +10 -13
natural_pdf/exporters/paddleocr.py +51 -11
natural_pdf/exporters/searchable_pdf.py +0 -10
natural_pdf/flows/__init__.py +12 -0
natural_pdf/flows/collections.py +533 -0
natural_pdf/flows/element.py +382 -0
natural_pdf/flows/flow.py +216 -0
natural_pdf/flows/region.py +458 -0
natural_pdf/search/__init__.py +65 -52
natural_pdf/search/lancedb_search_service.py +325 -0
natural_pdf/search/numpy_search_service.py +255 -0
natural_pdf/search/searchable_mixin.py +25 -71
natural_pdf/selectors/parser.py +163 -8
natural_pdf/widgets/viewer.py +22 -31
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
natural_pdf/search/haystack_search_service.py +0 -687
natural_pdf/search/haystack_utils.py +0 -474
natural_pdf/utils/tqdm_utils.py +0 -51
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0

natural_pdf/collections/pdf_collection.py CHANGED Viewed

@@ -25,14 +25,12 @@ from typing import (
 )
 from PIL import Image
-from tqdm import tqdm
-from tqdm.auto import tqdm as auto_tqdm
-from tqdm.notebook import tqdm as notebook_tqdm
+from tqdm.auto import tqdm
-from natural_pdf.utils.tqdm_utils import get_tqdm
+from natural_pdf.exporters.base import FinetuneExporter
-# Get the appropriate tqdm class once
-tqdm = get_tqdm()
+# Need to import this utility
+from natural_pdf.utils.identifiers import generate_short_path_hash
 # Set up logger early
 # Configure logging to include thread information
@@ -67,8 +65,10 @@ except ImportError as e:
 from natural_pdf.collections.mixins import ApplyMixin
 from natural_pdf.search.searchable_mixin import SearchableMixin  # Import the new mixin
+from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
-class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixin
+class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin):  # Add ExportMixin and ShapeDetectionMixin
     def __init__(
         self,
         source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -119,16 +119,8 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixi
     @staticmethod
     def _get_pdf_class():
         """Helper method to dynamically import the PDF class."""
-        try:
-            # Import needs to resolve path correctly
-            from natural_pdf.core.pdf import PDF
-            return PDF
-        except ImportError as e:
-            logger.error(
-                "Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime."
-            )
-            raise ImportError("PDF class is required but could not be imported.") from e
+        from natural_pdf.core.pdf import PDF
+        return PDF
     # --- Internal Helpers ---
@@ -141,16 +133,13 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixi
     def _execute_glob(self, pattern: str) -> Set[str]:
         """Glob for paths and return a set of valid PDF paths."""
         found_paths = set()
-        try:
-            # Use iglob for potentially large directories/matches
-            paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
-            for path_str in paths_iter:
-                # Use Path object for easier checking
-                p = Path(path_str)
-                if p.is_file() and p.suffix.lower() == ".pdf":
-                    found_paths.add(str(p.resolve()))  # Store resolved absolute path
-        except Exception as e:
-            logger.error(f"Error processing glob pattern '{pattern}': {e}")
+        # Use iglob for potentially large directories/matches
+        paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
+        for path_str in paths_iter:
+            # Use Path object for easier checking
+            p = Path(path_str)
+            if p.is_file() and p.suffix.lower() == ".pdf":
+                found_paths.add(str(p.resolve()))  # Store resolved absolute path
         return found_paths
     def _resolve_sources_to_paths(self, source: Union[str, Iterable[str]]) -> List[str]:
@@ -534,19 +523,10 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixi
             **kwargs: Additional arguments passed to create_correction_task_package
                       (e.g., image_render_scale, overwrite).
         """
-        try:
-            from natural_pdf.utils.packaging import create_correction_task_package
+        from natural_pdf.utils.packaging import create_correction_task_package
-            # Pass the collection itself (self) as the source
-            create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
-        except ImportError:
-            logger.error(
-                "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
-            )
-            # Or raise
-        except Exception as e:
-            logger.error(f"Failed to export correction task for collection: {e}", exc_info=True)
-            raise  # Re-raise the exception from the utility function
+        # Pass the collection itself (self) as the source
+        create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
     # --- Mixin Required Implementation ---
     def get_indexable_items(self) -> Iterable[Indexable]:

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -407,7 +407,17 @@ class ElementManager:
                     char_dict_data = ocr_char_dict  # Use the one we already created
                     char_dict_data["object_type"] = "char"  # Mark as char type
                     char_dict_data.setdefault("adv", char_dict_data.get("width", 0))
-                    self._elements["chars"].append(char_dict_data)  # Append the dictionary
+                    # Create a TextElement for the char representation
+                    # Ensure _char_dicts is handled correctly by TextElement constructor
+                    # For an OCR word represented as a char, its _char_dicts can be a list containing its own data
+                    char_element_specific_data = char_dict_data.copy()
+                    char_element_specific_data["_char_dicts"] = [char_dict_data.copy()]
+                    ocr_char_as_element = TextElement(char_element_specific_data, self._page)
+                    self._elements["chars"].append(
+                        ocr_char_as_element
+                    )  # Append TextElement instance
             except (KeyError, ValueError, TypeError) as e:
                 logger.error(f"Failed to process OCR result: {result}. Error: {e}", exc_info=True)

natural_pdf/core/highlighting_service.py CHANGED Viewed

@@ -215,21 +215,14 @@ class HighlightRenderer:
     def _render_ocr_text(self):
         """Renders OCR text onto the image. (Adapted from old HighlightManager)"""
         # Use the page reference to get OCR elements
-        try:
-            # Try finding first, then extracting if necessary
-            ocr_elements = self.page.find_all("text[source=ocr]")
-            if not ocr_elements:
-                # Don't run full OCR here, just extract if already run
-                ocr_elements = [
-                    el for el in self.page.words if getattr(el, "source", None) == "ocr"
-                ]
-                # Alternative: self.page.extract_ocr_elements() - but might be slow
-        except Exception as e:
-            logger.warning(
-                f"Could not get OCR elements for page {self.page.number}: {e}", exc_info=True
-            )
-            return  # Don't modify image if OCR elements aren't available
+        # Try finding first, then extracting if necessary
+        ocr_elements = self.page.find_all("text[source=ocr]")
+        if not ocr_elements:
+            # Don't run full OCR here, just extract if already run
+            ocr_elements = [
+                el for el in self.page.words if getattr(el, "source", None) == "ocr"
+            ]
+            # Alternative: self.page.extract_ocr_elements() - but might be slow
         if not ocr_elements:
             logger.debug(f"No OCR elements found for page {self.page.number} to render.")
@@ -293,20 +286,15 @@ class HighlightRenderer:
             )
             # Calculate text position (centered vertically, slightly offset from left)
-            try:
-                if hasattr(sized_font, "getbbox"):  # Modern PIL
-                    _, text_top_offset, _, text_bottom_offset = sized_font.getbbox(element.text)
-                    text_h = text_bottom_offset - text_top_offset
-                else:  # Older PIL approximation
-                    text_h = font_size
-                text_y = top_s + (box_h - text_h) / 2
-                # Adjust for vertical offset in some fonts
-                text_y -= text_top_offset if hasattr(sized_font, "getbbox") else 0
-                text_x = x0_s + padding  # Start near left edge with padding
-            except Exception:
-                # Fallback positioning
-                text_x, text_y = x0_s + padding, top_s + padding
+            if hasattr(sized_font, "getbbox"):  # Modern PIL
+                _, text_top_offset, _, text_bottom_offset = sized_font.getbbox(element.text)
+                text_h = text_bottom_offset - text_top_offset
+            else:  # Older PIL approximation
+                text_h = font_size
+            text_y = top_s + (box_h - text_h) / 2
+            # Adjust for vertical offset in some fonts
+            text_y -= text_top_offset if hasattr(sized_font, "getbbox") else 0
+            text_x = x0_s + padding  # Start near left edge with padding
             draw.text((text_x, text_y), element.text, fill=(0, 0, 0, 255), font=sized_font)
@@ -392,9 +380,6 @@ class HighlightingService:
             except ValueError:
                 logger.warning(f"Invalid color string: '{color_input}'")
                 return None
-            except Exception as e:
-                logger.error(f"Error processing color string '{color_input}': {e}")
-                return None
         else:
             logger.warning(f"Invalid color input type: {type(color_input)}")
             return None
@@ -611,13 +596,13 @@ class HighlightingService:
         Args:
             page_index: The 0-based index of the page to render.
-            scale: Scale factor for rendering highlights.
+            scale: Scale factor for rendering highlights if width/height/resolution not in kwargs.
             labels: Whether to include a legend for highlights.
             legend_position: Position of the legend.
             render_ocr: Whether to render OCR text on the image.
-            resolution: Optional resolution (DPI) for the base page image.
-                       Defaults to scale * 72.
-            kwargs: Additional keyword arguments for pdfplumber's page.to_image.
+            resolution: Optional resolution (DPI) for the base page image if width/height not in kwargs.
+                       Defaults to scale * 72 if not otherwise specified.
+            kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
         Returns:
             A PIL Image object of the rendered page, or None if rendering fails.
@@ -626,34 +611,84 @@ class HighlightingService:
             logger.error(f"Invalid page index {page_index} for rendering.")
             return None
-        page = self._pdf[page_index]
+        page_obj = self._pdf[page_index] # Renamed to avoid conflict
         highlights_on_page = self.get_highlights_for_page(page_index)
-        render_resolution = resolution if resolution is not None else scale * 72
-        base_image = render_plain_page(page, render_resolution)
-        base_image = base_image.convert("RGBA")
-        logger.debug(
-            f"Base image for page {page_index} rendered with resolution {render_resolution}."
-        )
+        to_image_args = kwargs.copy()
+        actual_scale_x = None
+        actual_scale_y = None
+        if "width" in to_image_args and to_image_args["width"] is not None:
+            logger.debug(f"Rendering page {page_index} with width={to_image_args['width']}.")
+            if "height" in to_image_args: to_image_args.pop("height", None)
+            # Actual scale will be calculated after image creation
+        elif "height" in to_image_args and to_image_args["height"] is not None:
+            logger.debug(f"Rendering page {page_index} with height={to_image_args['height']}.")
+            # Actual scale will be calculated after image creation
+        else:
+            # Use explicit resolution from kwargs if present, then the resolution param, then scale
+            render_resolution = to_image_args.pop("resolution", resolution) # Use and remove from kwargs if present
+            if render_resolution is None:
+                render_resolution = scale * 72
+            to_image_args["resolution"] = render_resolution # Add it back for the call
+            actual_scale_x = render_resolution / 72.0
+            actual_scale_y = render_resolution / 72.0
+            logger.debug(f"Rendering page {page_index} with resolution {render_resolution} (scale: {actual_scale_x:.2f}).")
+        try:
+            # base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
+            img_object = page_obj._page.to_image(**to_image_args)
+            base_image_pil = (
+                img_object.annotated
+                if hasattr(img_object, "annotated")
+                else img_object._repr_png_()
+            )
+            if isinstance(base_image_pil, bytes):
+                from io import BytesIO
+                base_image_pil = Image.open(BytesIO(base_image_pil))
+            base_image_pil = base_image_pil.convert("RGBA") # Ensure RGBA for renderer
+            logger.debug(
+                f"Base image for page {page_index} rendered. Size: {base_image_pil.size}."
+            )
+            if actual_scale_x is None or actual_scale_y is None: # If not set by resolution path
+                if page_obj.width > 0:
+                    actual_scale_x = base_image_pil.width / page_obj.width
+                else:
+                    actual_scale_x = scale # Fallback
+                if page_obj.height > 0:
+                    actual_scale_y = base_image_pil.height / page_obj.height
+                else:
+                    actual_scale_y = scale # Fallback
+                logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}")
+        except IOError as e:
+            logger.error(f"IOError creating base image for page {page_index}: {e}")
+            raise
+        except AttributeError as e:
+            logger.error(f"AttributeError creating base image for page {page_index}: {e}")
+            raise
+        renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
         # --- Render Highlights ---
         rendered_image: Image.Image
         if highlights_on_page:
             renderer = HighlightRenderer(
-                page=page,
-                base_image=base_image,
+                page=page_obj,
+                base_image=base_image_pil,
                 highlights=highlights_on_page,
-                scale=scale,
+                scale=renderer_scale, # Use the determined actual scale
                 render_ocr=render_ocr,
             )
             rendered_image = renderer.render()
         else:
             if render_ocr:
-                # Still render OCR even if no highlights
-                renderer = HighlightRenderer(page, base_image, [], scale, True)
+                # Still render OCR even if no highlights, using the determined actual scale
+                renderer = HighlightRenderer(page_obj, base_image_pil, [], renderer_scale, True)
                 rendered_image = renderer.render()
             else:
-                rendered_image = base_image  # No highlights, no OCR requested
+                rendered_image = base_image_pil  # No highlights, no OCR requested
         # --- Add Legend (Based ONLY on this page's highlights) ---
         if labels:
@@ -697,12 +732,12 @@ class HighlightingService:
         Args:
             page_index: Index of the page to render.
             temporary_highlights: List of highlight data dicts (from ElementCollection._prepare).
-            scale: Scale factor for rendering.
+            scale: Original scale factor for rendering, used if width/height are not provided.
             labels: Whether to include a legend.
             legend_position: Position of the legend.
             render_ocr: Whether to render OCR text.
-            resolution: Resolution for base page image rendering.
-            **kwargs: Additional args for pdfplumber's to_image.
+            resolution: Resolution for base page image rendering if width/height not used.
+            **kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
         Returns:
             PIL Image of the preview, or None if rendering fails.
@@ -711,35 +746,64 @@ class HighlightingService:
             logger.error(f"Invalid page index {page_index} for render_preview.")
             return None
-        page = self._pdf.pages[page_index]
-        render_resolution = resolution if resolution is not None else scale * 72
+        page_obj = self._pdf.pages[page_index]
+        to_image_args = kwargs.copy()
+        actual_scale_x = None
+        actual_scale_y = None
+        # Determine arguments for page._page.to_image()
+        if "width" in to_image_args and to_image_args["width"] is not None:
+            logger.debug(f"Rendering preview for page {page_index} with width={to_image_args['width']}.")
+            # Resolution is implicitly handled by pdfplumber when width is set
+            if "height" in to_image_args:
+                to_image_args.pop("height", None)
+            # after image is created, we will calculate actual_scale_x and actual_scale_y
+        elif "height" in to_image_args and to_image_args["height"] is not None:
+            logger.debug(f"Rendering preview for page {page_index} with height={to_image_args['height']}.")
+            # Resolution is implicitly handled by pdfplumber when height is set
+            # after image is created, we will calculate actual_scale_x and actual_scale_y
+        else:
+            # Neither width nor height is provided, use resolution or scale.
+            render_resolution = resolution if resolution is not None else scale * 72
+            to_image_args["resolution"] = render_resolution
+            actual_scale_x = render_resolution / 72.0
+            actual_scale_y = render_resolution / 72.0
+            logger.debug(f"Rendering preview for page {page_index} with resolution={render_resolution} (scale: {actual_scale_x:.2f}).")
         try:
-            # Get base image from pdfplumber using the Page object's underlying _page
-            img_object = page._page.to_image(resolution=render_resolution, **kwargs)
-            base_image = (
+            img_object = page_obj._page.to_image(**to_image_args)
+            base_image_pil = (
                 img_object.annotated
                 if hasattr(img_object, "annotated")
                 else img_object._repr_png_()
             )
-            if isinstance(base_image, bytes):
+            if isinstance(base_image_pil, bytes):
                 from io import BytesIO
+                base_image_pil = Image.open(BytesIO(base_image_pil))
+            base_image_pil = base_image_pil.convert("RGB")
-                base_image = Image.open(BytesIO(base_image))
-            base_image = base_image.convert("RGB")  # Ensure consistent format
+            # If scale was not determined by resolution, calculate it now from base_image_pil dimensions
+            if actual_scale_x is None or actual_scale_y is None:
+                if page_obj.width > 0:
+                    actual_scale_x = base_image_pil.width / page_obj.width
+                else:
+                    actual_scale_x = scale # Fallback to original scale
+                if page_obj.height > 0:
+                    actual_scale_y = base_image_pil.height / page_obj.height
+                else:
+                    actual_scale_y = scale # Fallback to original scale
+                logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})")
             # Convert temporary highlight dicts to Highlight objects
-            # Note: Colors/labels should be determined *here* for temporary preview
             preview_highlights = []
             for hl_data in temporary_highlights:
-                # Determine the final color using the service logic
                 final_color = self._determine_highlight_color(
                     color_input=hl_data.get("color"),
                     label=hl_data.get("label"),
                     use_color_cycling=hl_data.get("use_color_cycling", False),
                 )
-                # Extract potential attributes to draw
                 attrs_to_draw = {}
                 element = hl_data.get("element")
                 include_attrs = hl_data.get("include_attrs")
@@ -753,25 +817,29 @@ class HighlightingService:
                             logger.warning(
                                 f"Attribute '{attr_name}' not found on element {element}"
                             )
-                # Add highlight if geometry exists
                 if hl_data.get("bbox") or hl_data.get("polygon"):
                     preview_highlights.append(
                         Highlight(
                             page_index=hl_data["page_index"],
                             bbox=hl_data.get("bbox"),
                             polygon=hl_data.get("polygon"),
-                            color=final_color,  # Use the determined color
+                            color=final_color,
                             label=hl_data.get("label"),
                             attributes=attrs_to_draw,
                         )
                     )
-            # Render only these highlights
-            renderer = HighlightRenderer(page, base_image, preview_highlights, scale, render_ocr)
+            # Use the calculated actual_scale_x for the HighlightRenderer
+            # Assuming HighlightRenderer can handle a single scale or we adapt it.
+            # For now, pdfplumber usually maintains aspect ratio, so one scale should be okay.
+            # If not, HighlightRenderer needs to accept scale_x and scale_y.
+            # We will use actual_scale_x assuming aspect ratio is maintained by pdfplumber,
+            # or if not, it's a reasonable approximation for highlight scaling.
+            renderer_scale = actual_scale_x
+            renderer = HighlightRenderer(page_obj, base_image_pil, preview_highlights, renderer_scale, render_ocr)
             rendered_image = renderer.render()
-            # Create legend only from temporary highlights
             legend = None
             if labels:
                 preview_labels = {h.label: h.color for h in preview_highlights if h.label}
@@ -781,12 +849,15 @@ class HighlightingService:
                         rendered_image, legend, position=legend_position
                     )
                 else:
-                    final_image = rendered_image  # No legend needed
+                    final_image = rendered_image
             else:
                 final_image = rendered_image
-        except Exception as e:
-            logger.error(f"Error rendering preview for page {page_index}: {e}", exc_info=True)
-            return None
+        except IOError as e:
+            logger.error(f"IOError rendering preview for page {page_index}: {e}")
+            raise
+        except AttributeError as e:
+            logger.error(f"AttributeError rendering preview for page {page_index}: {e}")
+            raise
         return final_image

natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl