PyPI - natural-pdf - Versions diffs - 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl - Mend

natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

natural_pdf/__init__.py +7 -2
natural_pdf/analyzers/text_options.py +9 -1
natural_pdf/analyzers/text_structure.py +371 -58
natural_pdf/classification/manager.py +1 -1
natural_pdf/core/element_manager.py +11 -1
natural_pdf/core/highlighting_service.py +120 -40
natural_pdf/core/page.py +4 -2
natural_pdf/core/pdf.py +53 -38
natural_pdf/elements/base.py +17 -0
natural_pdf/elements/collections.py +203 -59
natural_pdf/elements/region.py +43 -11
natural_pdf/exporters/data/__init__.py +0 -0
natural_pdf/exporters/data/pdf.ttf +0 -0
natural_pdf/exporters/data/sRGB.icc +0 -0
natural_pdf/exporters/hocr.py +40 -61
natural_pdf/exporters/hocr_font.py +7 -13
natural_pdf/exporters/original_pdf.py +10 -13
natural_pdf/exporters/searchable_pdf.py +0 -10
natural_pdf/search/__init__.py +65 -52
natural_pdf/search/lancedb_search_service.py +325 -0
natural_pdf/search/numpy_search_service.py +255 -0
natural_pdf/search/searchable_mixin.py +25 -71
natural_pdf/widgets/viewer.py +22 -31
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -49
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +28 -25
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
natural_pdf/search/haystack_search_service.py +0 -687
natural_pdf/search/haystack_utils.py +0 -474
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0

natural_pdf/core/highlighting_service.py CHANGED Viewed

@@ -611,13 +611,13 @@ class HighlightingService:
         Args:
             page_index: The 0-based index of the page to render.
-            scale: Scale factor for rendering highlights.
+            scale: Scale factor for rendering highlights if width/height/resolution not in kwargs.
             labels: Whether to include a legend for highlights.
             legend_position: Position of the legend.
             render_ocr: Whether to render OCR text on the image.
-            resolution: Optional resolution (DPI) for the base page image.
-                       Defaults to scale * 72.
-            kwargs: Additional keyword arguments for pdfplumber's page.to_image.
+            resolution: Optional resolution (DPI) for the base page image if width/height not in kwargs.
+                       Defaults to scale * 72 if not otherwise specified.
+            kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
         Returns:
             A PIL Image object of the rendered page, or None if rendering fails.
@@ -626,34 +626,81 @@ class HighlightingService:
             logger.error(f"Invalid page index {page_index} for rendering.")
             return None
-        page = self._pdf[page_index]
+        page_obj = self._pdf[page_index] # Renamed to avoid conflict
         highlights_on_page = self.get_highlights_for_page(page_index)
-        render_resolution = resolution if resolution is not None else scale * 72
-        base_image = render_plain_page(page, render_resolution)
-        base_image = base_image.convert("RGBA")
-        logger.debug(
-            f"Base image for page {page_index} rendered with resolution {render_resolution}."
-        )
+        to_image_args = kwargs.copy()
+        actual_scale_x = None
+        actual_scale_y = None
+        if "width" in to_image_args and to_image_args["width"] is not None:
+            logger.debug(f"Rendering page {page_index} with width={to_image_args['width']}.")
+            if "height" in to_image_args: to_image_args.pop("height", None)
+            # Actual scale will be calculated after image creation
+        elif "height" in to_image_args and to_image_args["height"] is not None:
+            logger.debug(f"Rendering page {page_index} with height={to_image_args['height']}.")
+            # Actual scale will be calculated after image creation
+        else:
+            # Use explicit resolution from kwargs if present, then the resolution param, then scale
+            render_resolution = to_image_args.pop("resolution", resolution) # Use and remove from kwargs if present
+            if render_resolution is None:
+                render_resolution = scale * 72
+            to_image_args["resolution"] = render_resolution # Add it back for the call
+            actual_scale_x = render_resolution / 72.0
+            actual_scale_y = render_resolution / 72.0
+            logger.debug(f"Rendering page {page_index} with resolution {render_resolution} (scale: {actual_scale_x:.2f}).")
+        try:
+            # base_image = render_plain_page(page_obj, actual_scale_x * 72 if actual_scale_x else scale * 72) # Old call
+            img_object = page_obj._page.to_image(**to_image_args)
+            base_image_pil = (
+                img_object.annotated
+                if hasattr(img_object, "annotated")
+                else img_object._repr_png_()
+            )
+            if isinstance(base_image_pil, bytes):
+                from io import BytesIO
+                base_image_pil = Image.open(BytesIO(base_image_pil))
+            base_image_pil = base_image_pil.convert("RGBA") # Ensure RGBA for renderer
+            logger.debug(
+                f"Base image for page {page_index} rendered. Size: {base_image_pil.size}."
+            )
+            if actual_scale_x is None or actual_scale_y is None: # If not set by resolution path
+                if page_obj.width > 0:
+                    actual_scale_x = base_image_pil.width / page_obj.width
+                else:
+                    actual_scale_x = scale # Fallback
+                if page_obj.height > 0:
+                    actual_scale_y = base_image_pil.height / page_obj.height
+                else:
+                    actual_scale_y = scale # Fallback
+                logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}")
+        except Exception as e:
+            logger.error(f"Error creating base image for page {page_index}: {e}", exc_info=True)
+            return None
+        renderer_scale = actual_scale_x # Assuming aspect ratio maintained, use x_scale
         # --- Render Highlights ---
         rendered_image: Image.Image
         if highlights_on_page:
             renderer = HighlightRenderer(
-                page=page,
-                base_image=base_image,
+                page=page_obj,
+                base_image=base_image_pil,
                 highlights=highlights_on_page,
-                scale=scale,
+                scale=renderer_scale, # Use the determined actual scale
                 render_ocr=render_ocr,
             )
             rendered_image = renderer.render()
         else:
             if render_ocr:
-                # Still render OCR even if no highlights
-                renderer = HighlightRenderer(page, base_image, [], scale, True)
+                # Still render OCR even if no highlights, using the determined actual scale
+                renderer = HighlightRenderer(page_obj, base_image_pil, [], renderer_scale, True)
                 rendered_image = renderer.render()
             else:
-                rendered_image = base_image  # No highlights, no OCR requested
+                rendered_image = base_image_pil  # No highlights, no OCR requested
         # --- Add Legend (Based ONLY on this page's highlights) ---
         if labels:
@@ -697,12 +744,12 @@ class HighlightingService:
         Args:
             page_index: Index of the page to render.
             temporary_highlights: List of highlight data dicts (from ElementCollection._prepare).
-            scale: Scale factor for rendering.
+            scale: Original scale factor for rendering, used if width/height are not provided.
             labels: Whether to include a legend.
             legend_position: Position of the legend.
             render_ocr: Whether to render OCR text.
-            resolution: Resolution for base page image rendering.
-            **kwargs: Additional args for pdfplumber's to_image.
+            resolution: Resolution for base page image rendering if width/height not used.
+            **kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
         Returns:
             PIL Image of the preview, or None if rendering fails.
@@ -711,35 +758,64 @@ class HighlightingService:
             logger.error(f"Invalid page index {page_index} for render_preview.")
             return None
-        page = self._pdf.pages[page_index]
-        render_resolution = resolution if resolution is not None else scale * 72
+        page_obj = self._pdf.pages[page_index]
+        to_image_args = kwargs.copy()
+        actual_scale_x = None
+        actual_scale_y = None
+        # Determine arguments for page._page.to_image()
+        if "width" in to_image_args and to_image_args["width"] is not None:
+            logger.debug(f"Rendering preview for page {page_index} with width={to_image_args['width']}.")
+            # Resolution is implicitly handled by pdfplumber when width is set
+            if "height" in to_image_args:
+                to_image_args.pop("height", None)
+            # after image is created, we will calculate actual_scale_x and actual_scale_y
+        elif "height" in to_image_args and to_image_args["height"] is not None:
+            logger.debug(f"Rendering preview for page {page_index} with height={to_image_args['height']}.")
+            # Resolution is implicitly handled by pdfplumber when height is set
+            # after image is created, we will calculate actual_scale_x and actual_scale_y
+        else:
+            # Neither width nor height is provided, use resolution or scale.
+            render_resolution = resolution if resolution is not None else scale * 72
+            to_image_args["resolution"] = render_resolution
+            actual_scale_x = render_resolution / 72.0
+            actual_scale_y = render_resolution / 72.0
+            logger.debug(f"Rendering preview for page {page_index} with resolution={render_resolution} (scale: {actual_scale_x:.2f}).")
         try:
-            # Get base image from pdfplumber using the Page object's underlying _page
-            img_object = page._page.to_image(resolution=render_resolution, **kwargs)
-            base_image = (
+            img_object = page_obj._page.to_image(**to_image_args)
+            base_image_pil = (
                 img_object.annotated
                 if hasattr(img_object, "annotated")
                 else img_object._repr_png_()
             )
-            if isinstance(base_image, bytes):
+            if isinstance(base_image_pil, bytes):
                 from io import BytesIO
+                base_image_pil = Image.open(BytesIO(base_image_pil))
+            base_image_pil = base_image_pil.convert("RGB")
-                base_image = Image.open(BytesIO(base_image))
-            base_image = base_image.convert("RGB")  # Ensure consistent format
+            # If scale was not determined by resolution, calculate it now from base_image_pil dimensions
+            if actual_scale_x is None or actual_scale_y is None:
+                if page_obj.width > 0:
+                    actual_scale_x = base_image_pil.width / page_obj.width
+                else:
+                    actual_scale_x = scale # Fallback to original scale
+                if page_obj.height > 0:
+                    actual_scale_y = base_image_pil.height / page_obj.height
+                else:
+                    actual_scale_y = scale # Fallback to original scale
+                logger.debug(f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})")
             # Convert temporary highlight dicts to Highlight objects
-            # Note: Colors/labels should be determined *here* for temporary preview
             preview_highlights = []
             for hl_data in temporary_highlights:
-                # Determine the final color using the service logic
                 final_color = self._determine_highlight_color(
                     color_input=hl_data.get("color"),
                     label=hl_data.get("label"),
                     use_color_cycling=hl_data.get("use_color_cycling", False),
                 )
-                # Extract potential attributes to draw
                 attrs_to_draw = {}
                 element = hl_data.get("element")
                 include_attrs = hl_data.get("include_attrs")
@@ -753,25 +829,29 @@ class HighlightingService:
                             logger.warning(
                                 f"Attribute '{attr_name}' not found on element {element}"
                             )
-                # Add highlight if geometry exists
                 if hl_data.get("bbox") or hl_data.get("polygon"):
                     preview_highlights.append(
                         Highlight(
                             page_index=hl_data["page_index"],
                             bbox=hl_data.get("bbox"),
                             polygon=hl_data.get("polygon"),
-                            color=final_color,  # Use the determined color
+                            color=final_color,
                             label=hl_data.get("label"),
                             attributes=attrs_to_draw,
                         )
                     )
-            # Render only these highlights
-            renderer = HighlightRenderer(page, base_image, preview_highlights, scale, render_ocr)
+            # Use the calculated actual_scale_x for the HighlightRenderer
+            # Assuming HighlightRenderer can handle a single scale or we adapt it.
+            # For now, pdfplumber usually maintains aspect ratio, so one scale should be okay.
+            # If not, HighlightRenderer needs to accept scale_x and scale_y.
+            # We will use actual_scale_x assuming aspect ratio is maintained by pdfplumber,
+            # or if not, it's a reasonable approximation for highlight scaling.
+            renderer_scale = actual_scale_x
+            renderer = HighlightRenderer(page_obj, base_image_pil, preview_highlights, renderer_scale, render_ocr)
             rendered_image = renderer.render()
-            # Create legend only from temporary highlights
             legend = None
             if labels:
                 preview_labels = {h.label: h.color for h in preview_highlights if h.label}
@@ -781,7 +861,7 @@ class HighlightingService:
                         rendered_image, legend, position=legend_position
                     )
                 else:
-                    final_image = rendered_image  # No legend needed
+                    final_image = rendered_image
             else:
                 final_image = rendered_image

natural_pdf/core/page.py CHANGED Viewed

@@ -1349,7 +1349,9 @@ class Page(ClassificationMixin, ExtractionMixin):
         self._highlighter.clear_page(self.index)
         return self
-    def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> "ElementCollection":
+    def analyze_text_styles(
+        self, options: Optional[TextStyleOptions] = None
+    ) -> "ElementCollection":
         """
         Analyze text elements by style, adding attributes directly to elements.
@@ -2130,7 +2132,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
             logger.error(
                 "Interactive viewer requires optional dependencies ('ipywidgets'). "
-                "Install with `pip install natural-pdf[interactive]`"
+                "Install with `pip install natural-pdf[viewer]`"
             )
             # raise ImportError("ipywidgets not found.") # Option 1: Raise error
             return None  # Option 2: Return None gracefully

natural_pdf/core/pdf.py CHANGED Viewed

@@ -60,6 +60,7 @@ except ImportError:
             "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
         )
 try:
     from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
 except ImportError:
@@ -791,10 +792,10 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             "PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
         )
         if create_searchable_pdf is None:
-             raise ImportError(
-                 "Saving searchable PDF requires 'pikepdf' and 'Pillow'. "
-                 "Install with: pip install \"natural-pdf[ocr-export]\""
-             )
+            raise ImportError(
+                "Saving searchable PDF requires 'pikepdf'. "
+                'Install with: pip install "natural-pdf[ocr-export]"'
+            )
         output_path_str = str(output_path)
         # Call the exporter directly, passing self (the PDF instance)
         create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
@@ -842,55 +843,59 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         output_path_str = str(output_path_obj)
         if ocr:
-            if create_searchable_pdf is None:
-                raise ImportError(
-                    "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
-                    "Install with: pip install \"natural-pdf[ocr-export]\""
-                )
-            # Optional: Add warning about vector data loss similar to PageCollection
             has_vector_elements = False
             for page in self.pages:
-                if (hasattr(page, 'rects') and page.rects or
-                    hasattr(page, 'lines') and page.lines or
-                    hasattr(page, 'curves') and page.curves or
-                    (hasattr(page, 'chars') and any(getattr(el, 'source', None) != 'ocr' for el in page.chars)) or
-                    (hasattr(page, 'words') and any(getattr(el, 'source', None) != 'ocr' for el in page.words))):
+                if (
+                    hasattr(page, "rects")
+                    and page.rects
+                    or hasattr(page, "lines")
+                    and page.lines
+                    or hasattr(page, "curves")
+                    and page.curves
+                    or (
+                        hasattr(page, "chars")
+                        and any(getattr(el, "source", None) != "ocr" for el in page.chars)
+                    )
+                    or (
+                        hasattr(page, "words")
+                        and any(getattr(el, "source", None) != "ocr" for el in page.words)
+                    )
+                ):
                     has_vector_elements = True
                     break
             if has_vector_elements:
-                 logger.warning(
-                     "Warning: Saving with ocr=True creates an image-based PDF. "
-                     "Original vector elements (rects, lines, non-OCR text/chars) "
-                     "will not be preserved in the output file."
-                 )
+                logger.warning(
+                    "Warning: Saving with ocr=True creates an image-based PDF. "
+                    "Original vector elements (rects, lines, non-OCR text/chars) "
+                    "will not be preserved in the output file."
+                )
             logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
             try:
                 # Delegate to the searchable PDF exporter, passing self (PDF instance)
                 create_searchable_pdf(self, output_path_str, dpi=dpi)
             except Exception as e:
-                 raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
+                raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
         elif original:
             if create_original_pdf is None:
                 raise ImportError(
                     "Saving with original=True requires 'pikepdf'. "
-                    "Install with: pip install \"natural-pdf[ocr-export]\""
+                    'Install with: pip install "natural-pdf[ocr-export]"'
                 )
-             # Optional: Add warning about losing OCR data similar to PageCollection
+            # Optional: Add warning about losing OCR data similar to PageCollection
             has_ocr_elements = False
             for page in self.pages:
-                 if hasattr(page, 'find_all'):
-                     ocr_text_elements = page.find_all("text[source=ocr]")
-                     if ocr_text_elements:
-                         has_ocr_elements = True
-                         break
-                 elif hasattr(page, 'words'): # Fallback
-                     if any(getattr(el, 'source', None) == 'ocr' for el in page.words):
-                          has_ocr_elements = True
-                          break
+                if hasattr(page, "find_all"):
+                    ocr_text_elements = page.find_all("text[source=ocr]")
+                    if ocr_text_elements:
+                        has_ocr_elements = True
+                        break
+                elif hasattr(page, "words"):  # Fallback
+                    if any(getattr(el, "source", None) == "ocr" for el in page.words):
+                        has_ocr_elements = True
+                        break
             if has_ocr_elements:
                 logger.warning(
                     "Warning: Saving with original=True preserves original page content. "
@@ -899,11 +904,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             logger.info(f"Saving original PDF content to: {output_path_str}")
             try:
-                 # Delegate to the original PDF exporter, passing self (PDF instance)
-                 create_original_pdf(self, output_path_str)
+                # Delegate to the original PDF exporter, passing self (PDF instance)
+                create_original_pdf(self, output_path_str)
             except Exception as e:
-                 # Re-raise exception from exporter
-                 raise e
+                # Re-raise exception from exporter
+                raise e
     def ask(
         self,
@@ -1227,6 +1232,16 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         """Context manager exit."""
         self.close()
+    def __repr__(self) -> str:
+        """Return a string representation of the PDF object."""
+        if not hasattr(self, "_pages"):
+            page_count_str = "uninitialized"
+        else:
+            page_count_str = str(len(self._pages))
+        source_info = getattr(self, "source_path", "unknown source")
+        return f"<PDF source='{source_info}' pages={page_count_str}>"
     def get_id(self) -> str:
         """Get unique identifier for this PDF."""
         """Get unique identifier for this PDF."""
@@ -1400,7 +1415,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             except ImportError:
                 raise ImportError(
                     "Classification dependencies missing. "
-                    'Install with: pip install "natural-pdf[classification]"'
+                    'Install with: pip install "natural-pdf[core-ml]"'
                 )
             raise ClassificationError("ClassificationManager not available.")

natural_pdf/elements/base.py CHANGED Viewed

@@ -814,6 +814,7 @@ class Element(DirectionalMixin):
         legend_position: str = "right",
         color: Optional[Union[Tuple, str]] = "red",  # Default color for single element
         label: Optional[str] = None,
+        width: Optional[int] = None,  # Add width parameter
     ) -> Optional["Image.Image"]:
         """
         Show the page with only this element highlighted temporarily.
@@ -824,6 +825,7 @@ class Element(DirectionalMixin):
             legend_position: Position of the legend
             color: Color to highlight this element (default: red)
             label: Optional label for this element in the legend
+            width: Optional width for the output image in pixels
         Returns:
             PIL Image of the page with only this element highlighted, or None if error.
@@ -861,6 +863,7 @@ class Element(DirectionalMixin):
                 page_index=self.page.index,
                 temporary_highlights=[temp_highlight_data],
                 scale=scale,
+                width=width,  # Pass the width parameter
                 labels=labels,
                 legend_position=legend_position,
             )
@@ -898,6 +901,7 @@ class Element(DirectionalMixin):
         self,
         *,
         text: str,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -909,6 +913,7 @@ class Element(DirectionalMixin):
         self,
         selector: str,
         *,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -920,6 +925,7 @@ class Element(DirectionalMixin):
         selector: Optional[str] = None,
         *,
         text: Optional[str] = None,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -934,6 +940,9 @@ class Element(DirectionalMixin):
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                     'any' (any overlap), or 'center' (center point inside).
+                     (default: "all")
             apply_exclusions: Whether to apply exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -950,6 +959,7 @@ class Element(DirectionalMixin):
         return temp_region.find(
             selector=selector,
             text=text,
+            contains=contains,
             apply_exclusions=apply_exclusions,
             regex=regex,
             case=case,
@@ -961,6 +971,7 @@ class Element(DirectionalMixin):
         self,
         *,
         text: str,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -972,6 +983,7 @@ class Element(DirectionalMixin):
         self,
         selector: str,
         *,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -983,6 +995,7 @@ class Element(DirectionalMixin):
         selector: Optional[str] = None,
         *,
         text: Optional[str] = None,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -997,6 +1010,9 @@ class Element(DirectionalMixin):
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                     'any' (any overlap), or 'center' (center point inside).
+                     (default: "all")
             apply_exclusions: Whether to apply exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1013,6 +1029,7 @@ class Element(DirectionalMixin):
         return temp_region.find_all(
             selector=selector,
             text=text,
+            contains=contains,
             apply_exclusions=apply_exclusions,
             regex=regex,
             case=case,

natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl