PyPI - natural-pdf - Versions diffs - 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl - Mend

natural-pdf 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

natural_pdf/__init__.py +45 -0
natural_pdf/analyzers/guides.py +359 -0
natural_pdf/core/element_manager.py +4 -0
natural_pdf/core/page.py +88 -22
natural_pdf/core/page_collection.py +75 -0
natural_pdf/core/pdf.py +33 -0
natural_pdf/describe/base.py +48 -7
natural_pdf/elements/base.py +408 -43
natural_pdf/elements/element_collection.py +83 -10
natural_pdf/elements/region.py +217 -178
natural_pdf/elements/text.py +5 -3
natural_pdf/flows/element.py +1 -0
natural_pdf/flows/flow.py +175 -480
natural_pdf/flows/region.py +76 -0
natural_pdf/selectors/parser.py +180 -9
natural_pdf/utils/pdfminer_patches.py +136 -0
natural_pdf/utils/sections.py +346 -0
natural_pdf/utils/spatial.py +169 -0
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/METADATA +1 -1
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/RECORD +24 -21
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/WHEEL +0 -0
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.17.dist-info}/top_level.txt +0 -0

natural_pdf/core/page_collection.py CHANGED Viewed

@@ -789,6 +789,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
                                 start_element.page,
                                 (0, top, start_element.page.width, bottom),
                             )
+                            section._boundary_exclusions = include_boundaries
                         else:  # horizontal
                             left = start_element.x0
                             right = end_element.x1
@@ -821,6 +822,7 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
                                 start_element.page,
                                 (left, 0, right, start_element.page.height),
                             )
+                            section._boundary_exclusions = include_boundaries
                         section.start_element = start_element
                         section.boundary_element_found = end_element
                     else:
@@ -865,6 +867,10 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
                                 start_element.page, (0, top, start_element.page.width, bottom)
                             )
                             section.start_element = start_element
+                            section.end_element = (
+                                next_start  # The next start is the end of this section
+                            )
+                            section._boundary_exclusions = include_boundaries
                             sections.append(section)
                     else:  # horizontal
                         # Determine horizontal bounds
@@ -882,6 +888,10 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
                                 start_element.page, (left, 0, right, start_element.page.height)
                             )
                             section.start_element = start_element
+                            section.end_element = (
+                                next_start  # The next start is the end of this section
+                            )
+                            section._boundary_exclusions = include_boundaries
                             sections.append(section)
                 else:
                     # Cross-page section - create from current_start to the end of its page
@@ -982,6 +992,71 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
         return ElementCollection(sections)
+    def split(self, divider, **kwargs) -> "ElementCollection[Region]":
+        """
+        Divide this page collection into sections based on the provided divider elements.
+        Args:
+            divider: Elements or selector string that mark section boundaries
+            **kwargs: Additional parameters passed to get_sections()
+                - include_boundaries: How to include boundary elements (default: 'start')
+                - orientation: 'vertical' or 'horizontal' (default: 'vertical')
+                - new_section_on_page_break: Whether to split at page boundaries (default: False)
+        Returns:
+            ElementCollection of Region objects representing the sections
+        Example:
+            # Split a PDF by chapter titles
+            chapters = pdf.pages.split("text[size>20]:contains('CHAPTER')")
+            # Split by page breaks
+            page_sections = pdf.pages.split(None, new_section_on_page_break=True)
+            # Split multi-page document by section headers
+            sections = pdf.pages[10:20].split("text:bold:contains('Section')")
+        """
+        # Default to 'start' boundaries for split (include divider at start of each section)
+        if "include_boundaries" not in kwargs:
+            kwargs["include_boundaries"] = "start"
+        sections = self.get_sections(start_elements=divider, **kwargs)
+        # Add initial section if there's content before the first divider
+        if sections and divider is not None:
+            # Get all elements across all pages
+            all_elements = []
+            for page in self.pages:
+                all_elements.extend(page.get_elements())
+            if all_elements:
+                # Find first divider
+                if isinstance(divider, str):
+                    # Search for first matching element
+                    first_divider = None
+                    for page in self.pages:
+                        match = page.find(divider)
+                        if match:
+                            first_divider = match
+                            break
+                else:
+                    # divider is already elements
+                    first_divider = divider[0] if hasattr(divider, "__getitem__") else divider
+                if first_divider and all_elements[0] != first_divider:
+                    # There's content before the first divider
+                    # Get section from start to first divider
+                    initial_sections = self.get_sections(
+                        start_elements=None,
+                        end_elements=[first_divider],
+                        include_boundaries="none",
+                        orientation=kwargs.get("orientation", "vertical"),
+                    )
+                    if initial_sections:
+                        sections = ElementCollection([initial_sections[0]] + list(sections))
+        return sections
     def _gather_analysis_data(
         self,
         analysis_keys: List[str],

natural_pdf/core/pdf.py CHANGED Viewed

@@ -1333,6 +1333,39 @@ class PDF(
             orientation=orientation,
         )
+    def split(self, divider, **kwargs) -> "ElementCollection":
+        """
+        Divide the PDF into sections based on the provided divider elements.
+
+        This is a thin convenience wrapper: the call is forwarded unchanged to
+        ``self.pages.split(divider, **kwargs)``.
+
+        Args:
+            divider: Elements or selector string that mark section boundaries
+            **kwargs: Additional parameters passed to get_sections()
+                - include_boundaries: How to include boundary elements (default: 'start')
+                - orientation: 'vertical' or 'horizontal' (default: 'vertical')
+                - new_section_on_page_break: Whether to split at page boundaries (default: False)
+
+        Returns:
+            ElementCollection of Region objects representing the sections
+
+        Example:
+            # Split a PDF by chapter titles
+            chapters = pdf.split("text[size>20]:contains('Chapter')")
+
+            # Export each chapter to a separate file
+            for i, chapter in enumerate(chapters):
+                chapter_text = chapter.extract_text()
+                with open(f"chapter_{i+1}.txt", "w") as f:
+                    f.write(chapter_text)
+
+            # Split by horizontal rules/lines
+            sections = pdf.split("line[orientation=horizontal]")
+
+            # Split only by page breaks (no divider elements)
+            pages = pdf.split(None, new_section_on_page_break=True)
+        """
+        # Delegate to pages collection
+        return self.pages.split(divider, **kwargs)
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
         """
         DEPRECATED: Use save_pdf(..., ocr=True) instead.

natural_pdf/describe/base.py CHANGED Viewed

@@ -272,17 +272,12 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
                 "font_family",
                 "font_variant",
                 "size",
-                "bold",
-                "italic",
-                "strike",
-                "underline",
-                "highlight",
+                "styles",
                 "source",
                 "confidence",
+                "color",
             ]
         )
-        # Add foreground text colour too
-        columns.append("color")
     elif element_type == "rect":
         columns = base_columns + ["width", "height", "stroke", "fill", "stroke_width"]
     elif element_type == "line":
@@ -358,6 +353,52 @@ def _extract_element_value(element: "Element", column: str) -> Any:
                 return str(col_val)
             return ""
+        elif column == "styles":
+            # Collect all active text decorations
+            styles = []
+            if getattr(element, "bold", False):
+                styles.append("bold")
+            if getattr(element, "italic", False):
+                styles.append("italic")
+            if getattr(element, "strike", False):
+                styles.append("strike")
+            if getattr(element, "underline", False):
+                styles.append("underline")
+            # Handle highlight specially - include color if not default yellow
+            if getattr(element, "is_highlighted", False):
+                highlight_color = getattr(element, "highlight_color", None)
+                if highlight_color is not None:
+                    # Convert color to hex if needed
+                    if isinstance(highlight_color, (tuple, list)) and len(highlight_color) >= 3:
+                        try:
+                            r, g, b = [
+                                int(v * 255) if v <= 1 else int(v) for v in highlight_color[:3]
+                            ]
+                            hex_color = f"#{r:02x}{g:02x}{b:02x}"
+                            styles.append(f"highlight({hex_color})")
+                        except Exception:
+                            styles.append("highlight")
+                    elif isinstance(highlight_color, (int, float)):
+                        # Grayscale value
+                        try:
+                            gray = (
+                                int(highlight_color * 255)
+                                if highlight_color <= 1
+                                else int(highlight_color)
+                            )
+                            hex_color = f"#{gray:02x}{gray:02x}{gray:02x}"
+                            styles.append(f"highlight({hex_color})")
+                        except Exception:
+                            styles.append("highlight")
+                    else:
+                        styles.append("highlight")
+                else:
+                    styles.append("highlight")
+            return ", ".join(styles) if styles else ""
         elif column in ["stroke", "fill", "color"]:
             value = getattr(element, column, None)
             # If already a string (e.g. '#ff00aa' or 'red') return as is

natural-pdf 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl

natural-pdf 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl