PyPI - natural-pdf - Versions diffs - 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl - Mend

natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

natural_pdf/__init__.py +7 -2
natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
natural_pdf/analyzers/text_options.py +9 -1
natural_pdf/analyzers/text_structure.py +371 -58
natural_pdf/classification/manager.py +3 -4
natural_pdf/collections/pdf_collection.py +19 -39
natural_pdf/core/element_manager.py +11 -1
natural_pdf/core/highlighting_service.py +146 -75
natural_pdf/core/page.py +287 -188
natural_pdf/core/pdf.py +57 -42
natural_pdf/elements/base.py +51 -0
natural_pdf/elements/collections.py +362 -67
natural_pdf/elements/line.py +5 -0
natural_pdf/elements/region.py +396 -23
natural_pdf/exporters/data/__init__.py +0 -0
natural_pdf/exporters/data/pdf.ttf +0 -0
natural_pdf/exporters/data/sRGB.icc +0 -0
natural_pdf/exporters/hocr.py +40 -61
natural_pdf/exporters/hocr_font.py +7 -13
natural_pdf/exporters/original_pdf.py +10 -13
natural_pdf/exporters/paddleocr.py +51 -11
natural_pdf/exporters/searchable_pdf.py +0 -10
natural_pdf/flows/__init__.py +12 -0
natural_pdf/flows/collections.py +533 -0
natural_pdf/flows/element.py +382 -0
natural_pdf/flows/flow.py +216 -0
natural_pdf/flows/region.py +458 -0
natural_pdf/search/__init__.py +65 -52
natural_pdf/search/lancedb_search_service.py +325 -0
natural_pdf/search/numpy_search_service.py +255 -0
natural_pdf/search/searchable_mixin.py +25 -71
natural_pdf/selectors/parser.py +163 -8
natural_pdf/widgets/viewer.py +22 -31
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
natural_pdf/search/haystack_search_service.py +0 -687
natural_pdf/search/haystack_utils.py +0 -474
natural_pdf/utils/tqdm_utils.py +0 -51
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0

natural_pdf/core/pdf.py CHANGED Viewed

@@ -38,7 +38,7 @@ from natural_pdf.extraction.mixin import ExtractionMixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
 from natural_pdf.utils.locks import pdf_render_lock
-from natural_pdf.utils.tqdm_utils import get_tqdm
+from tqdm.auto import tqdm
 try:
     from typing import Any as TypingAny
@@ -60,6 +60,7 @@ except ImportError:
             "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
         )
 try:
     from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
 except ImportError:
@@ -70,7 +71,6 @@ except ImportError:
     create_original_pdf = None
 logger = logging.getLogger("natural_pdf.core.pdf")
-tqdm = get_tqdm()
 DEFAULT_MANAGERS = {
     "classification": ClassificationManager,
@@ -791,10 +791,10 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             "PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
         )
         if create_searchable_pdf is None:
-             raise ImportError(
-                 "Saving searchable PDF requires 'pikepdf' and 'Pillow'. "
-                 "Install with: pip install \"natural-pdf[ocr-export]\""
-             )
+            raise ImportError(
+                "Saving searchable PDF requires 'pikepdf'. "
+                'Install with: pip install "natural-pdf[ocr-export]"'
+            )
         output_path_str = str(output_path)
         # Call the exporter directly, passing self (the PDF instance)
         create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
@@ -842,55 +842,59 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         output_path_str = str(output_path_obj)
         if ocr:
-            if create_searchable_pdf is None:
-                raise ImportError(
-                    "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
-                    "Install with: pip install \"natural-pdf[ocr-export]\""
-                )
-            # Optional: Add warning about vector data loss similar to PageCollection
             has_vector_elements = False
             for page in self.pages:
-                if (hasattr(page, 'rects') and page.rects or
-                    hasattr(page, 'lines') and page.lines or
-                    hasattr(page, 'curves') and page.curves or
-                    (hasattr(page, 'chars') and any(getattr(el, 'source', None) != 'ocr' for el in page.chars)) or
-                    (hasattr(page, 'words') and any(getattr(el, 'source', None) != 'ocr' for el in page.words))):
+                if (
+                    hasattr(page, "rects")
+                    and page.rects
+                    or hasattr(page, "lines")
+                    and page.lines
+                    or hasattr(page, "curves")
+                    and page.curves
+                    or (
+                        hasattr(page, "chars")
+                        and any(getattr(el, "source", None) != "ocr" for el in page.chars)
+                    )
+                    or (
+                        hasattr(page, "words")
+                        and any(getattr(el, "source", None) != "ocr" for el in page.words)
+                    )
+                ):
                     has_vector_elements = True
                     break
             if has_vector_elements:
-                 logger.warning(
-                     "Warning: Saving with ocr=True creates an image-based PDF. "
-                     "Original vector elements (rects, lines, non-OCR text/chars) "
-                     "will not be preserved in the output file."
-                 )
+                logger.warning(
+                    "Warning: Saving with ocr=True creates an image-based PDF. "
+                    "Original vector elements (rects, lines, non-OCR text/chars) "
+                    "will not be preserved in the output file."
+                )
             logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
             try:
                 # Delegate to the searchable PDF exporter, passing self (PDF instance)
                 create_searchable_pdf(self, output_path_str, dpi=dpi)
             except Exception as e:
-                 raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
+                raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
         elif original:
             if create_original_pdf is None:
                 raise ImportError(
                     "Saving with original=True requires 'pikepdf'. "
-                    "Install with: pip install \"natural-pdf[ocr-export]\""
+                    'Install with: pip install "natural-pdf[ocr-export]"'
                 )
-             # Optional: Add warning about losing OCR data similar to PageCollection
+            # Optional: Add warning about losing OCR data similar to PageCollection
             has_ocr_elements = False
             for page in self.pages:
-                 if hasattr(page, 'find_all'):
-                     ocr_text_elements = page.find_all("text[source=ocr]")
-                     if ocr_text_elements:
-                         has_ocr_elements = True
-                         break
-                 elif hasattr(page, 'words'): # Fallback
-                     if any(getattr(el, 'source', None) == 'ocr' for el in page.words):
-                          has_ocr_elements = True
-                          break
+                if hasattr(page, "find_all"):
+                    ocr_text_elements = page.find_all("text[source=ocr]")
+                    if ocr_text_elements:
+                        has_ocr_elements = True
+                        break
+                elif hasattr(page, "words"):  # Fallback
+                    if any(getattr(el, "source", None) == "ocr" for el in page.words):
+                        has_ocr_elements = True
+                        break
             if has_ocr_elements:
                 logger.warning(
                     "Warning: Saving with original=True preserves original page content. "
@@ -899,11 +903,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             logger.info(f"Saving original PDF content to: {output_path_str}")
             try:
-                 # Delegate to the original PDF exporter, passing self (PDF instance)
-                 create_original_pdf(self, output_path_str)
+                # Delegate to the original PDF exporter, passing self (PDF instance)
+                create_original_pdf(self, output_path_str)
             except Exception as e:
-                 # Re-raise exception from exporter
-                 raise e
+                # Re-raise exception from exporter
+                raise e
     def ask(
         self,
@@ -1227,6 +1231,16 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         """Context manager exit."""
         self.close()
+    def __repr__(self) -> str:
+        """Return a string representation of the PDF object."""
+        if not hasattr(self, "_pages"):
+            page_count_str = "uninitialized"
+        else:
+            page_count_str = str(len(self._pages))
+        source_info = getattr(self, "source_path", "unknown source")
+        return f"<PDF source='{source_info}' pages={page_count_str}>"
     def get_id(self) -> str:
         """Get unique identifier for this PDF."""
         """Get unique identifier for this PDF."""
@@ -1238,6 +1252,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self,
         pages: Optional[Union[Iterable[int], range, slice]] = None,
         resolution: int = 300,
+        angle: Optional[float] = None,
         detection_resolution: int = 72,
         force_overwrite: bool = False,
         **deskew_kwargs,
@@ -1256,6 +1271,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         Args:
             pages: Page indices/slice to include (0-based). If None, processes all pages.
             resolution: DPI resolution for rendering the output deskewed pages.
+            angle: The specific angle (in degrees) to rotate by. If None, detects automatically.
             detection_resolution: DPI resolution used for skew detection if angles are not
                                   already cached on the page objects.
             force_overwrite: If False (default), raises a ValueError if any target page
@@ -1300,14 +1316,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         deskewed_images_bytes = []
         logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")
-        # Use tqdm via get_tqdm
         for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
             try:
                 # Use page.deskew to get the corrected PIL image
                 # Pass down resolutions and kwargs
                 deskewed_img = page.deskew(
                     resolution=resolution,
-                    angle=None,  # Let page.deskew handle detection/caching
+                    angle=angle,  # Let page.deskew handle detection/caching
                     detection_resolution=detection_resolution,
                     **deskew_kwargs,
                 )
@@ -1400,7 +1415,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             except ImportError:
                 raise ImportError(
                     "Classification dependencies missing. "
-                    'Install with: pip install "natural-pdf[classification]"'
+                    'Install with: pip install "natural-pdf[core-ml]"'
                 )
             raise ClassificationError("ClassificationManager not available.")

natural_pdf/elements/base.py CHANGED Viewed

@@ -15,6 +15,40 @@ if TYPE_CHECKING:
     from natural_pdf.elements.region import Region
+def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
+    """
+    Extract bounding box coordinates from any object that has bbox properties.
+    Args:
+        obj: Object that might have bbox coordinates (Element, Region, etc.)
+    Returns:
+        Tuple of (x0, top, x1, bottom) or None if object doesn't have bbox properties
+    """
+    # Try bbox property first (most common)
+    if hasattr(obj, 'bbox') and obj.bbox is not None:
+        bbox = obj.bbox
+        if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
+            return tuple(float(coord) for coord in bbox)
+    # Try individual coordinate properties
+    if all(hasattr(obj, attr) for attr in ['x0', 'top', 'x1', 'bottom']):
+        try:
+            return (float(obj.x0), float(obj.top), float(obj.x1), float(obj.bottom))
+        except (ValueError, TypeError):
+            pass
+    # If object is a dict with bbox keys
+    if isinstance(obj, dict):
+        if all(key in obj for key in ['x0', 'top', 'x1', 'bottom']):
+            try:
+                return (float(obj['x0']), float(obj['top']), float(obj['x1']), float(obj['bottom']))
+            except (ValueError, TypeError):
+                pass
+    return None
 class DirectionalMixin:
     """
     Mixin class providing directional methods for both Element and Region classes.
@@ -814,6 +848,7 @@ class Element(DirectionalMixin):
         legend_position: str = "right",
         color: Optional[Union[Tuple, str]] = "red",  # Default color for single element
         label: Optional[str] = None,
+        width: Optional[int] = None,  # Add width parameter
     ) -> Optional["Image.Image"]:
         """
         Show the page with only this element highlighted temporarily.
@@ -824,6 +859,7 @@ class Element(DirectionalMixin):
             legend_position: Position of the legend
             color: Color to highlight this element (default: red)
             label: Optional label for this element in the legend
+            width: Optional width for the output image in pixels
         Returns:
             PIL Image of the page with only this element highlighted, or None if error.
@@ -861,6 +897,7 @@ class Element(DirectionalMixin):
                 page_index=self.page.index,
                 temporary_highlights=[temp_highlight_data],
                 scale=scale,
+                width=width,  # Pass the width parameter
                 labels=labels,
                 legend_position=legend_position,
             )
@@ -898,6 +935,7 @@ class Element(DirectionalMixin):
         self,
         *,
         text: str,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -909,6 +947,7 @@ class Element(DirectionalMixin):
         self,
         selector: str,
         *,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -920,6 +959,7 @@ class Element(DirectionalMixin):
         selector: Optional[str] = None,
         *,
         text: Optional[str] = None,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -934,6 +974,9 @@ class Element(DirectionalMixin):
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                     'any' (any overlap), or 'center' (center point inside).
+                     (default: "all")
             apply_exclusions: Whether to apply exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -950,6 +993,7 @@ class Element(DirectionalMixin):
         return temp_region.find(
             selector=selector,
             text=text,
+            contains=contains,
             apply_exclusions=apply_exclusions,
             regex=regex,
             case=case,
@@ -961,6 +1005,7 @@ class Element(DirectionalMixin):
         self,
         *,
         text: str,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -972,6 +1017,7 @@ class Element(DirectionalMixin):
         self,
         selector: str,
         *,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -983,6 +1029,7 @@ class Element(DirectionalMixin):
         selector: Optional[str] = None,
         *,
         text: Optional[str] = None,
+        contains: str = "all",
         apply_exclusions: bool = True,
         regex: bool = False,
         case: bool = True,
@@ -997,6 +1044,9 @@ class Element(DirectionalMixin):
         Args:
             selector: CSS-like selector string.
             text: Text content to search for (equivalent to 'text:contains(...)').
+            contains: How to determine if elements are inside: 'all' (fully inside),
+                     'any' (any overlap), or 'center' (center point inside).
+                     (default: "all")
             apply_exclusions: Whether to apply exclusion regions (default: True).
             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1013,6 +1063,7 @@ class Element(DirectionalMixin):
         return temp_region.find_all(
             selector=selector,
             text=text,
+            contains=contains,
             apply_exclusions=apply_exclusions,
             regex=regex,
             case=case,

natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl