PyPI - natural-pdf - Versions diffs - 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl - Mend

natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49)

bad_pdf_analysis/analyze_10_more.py +300 -0
bad_pdf_analysis/analyze_final_10.py +552 -0
bad_pdf_analysis/analyze_specific_pages.py +394 -0
bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
natural_pdf/analyzers/layout/layout_manager.py +44 -0
natural_pdf/analyzers/layout/surya.py +1 -1
natural_pdf/analyzers/shape_detection_mixin.py +228 -0
natural_pdf/classification/manager.py +67 -0
natural_pdf/core/element_manager.py +556 -25
natural_pdf/core/highlighting_service.py +98 -43
natural_pdf/core/page.py +86 -20
natural_pdf/core/pdf.py +0 -2
natural_pdf/describe/base.py +40 -9
natural_pdf/describe/elements.py +11 -6
natural_pdf/elements/base.py +134 -20
natural_pdf/elements/collections.py +43 -11
natural_pdf/elements/image.py +43 -0
natural_pdf/elements/region.py +64 -19
natural_pdf/elements/text.py +89 -11
natural_pdf/flows/collections.py +4 -4
natural_pdf/flows/region.py +17 -2
natural_pdf/ocr/ocr_manager.py +50 -0
natural_pdf/selectors/parser.py +27 -7
natural_pdf/tables/__init__.py +5 -0
natural_pdf/tables/result.py +101 -0
natural_pdf/utils/bidi_mirror.py +36 -0
natural_pdf/utils/visualization.py +15 -1
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +48 -26
natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
optimization/memory_comparison.py +172 -0
optimization/pdf_analyzer.py +410 -0
optimization/performance_analysis.py +397 -0
optimization/test_cleanup_methods.py +155 -0
optimization/test_memory_fix.py +162 -0
tools/bad_pdf_eval/__init__.py +1 -0
tools/bad_pdf_eval/analyser.py +302 -0
tools/bad_pdf_eval/collate_summaries.py +130 -0
tools/bad_pdf_eval/eval_suite.py +116 -0
tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
tools/bad_pdf_eval/llm_enrich.py +273 -0
tools/bad_pdf_eval/reporter.py +17 -0
tools/bad_pdf_eval/utils.py +127 -0
tools/rtl_smoke_test.py +80 -0
natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0

natural_pdf/core/highlighting_service.py CHANGED Viewed

@@ -70,16 +70,16 @@ class HighlightRenderer:
         page: Page,
         base_image: Image.Image,
         highlights: List[Highlight],
-        scale: float,
+        scale_factor: float,
         render_ocr: bool,
     ):
         self.page = page  # Keep page reference for OCR rendering
         self.base_image = base_image.convert("RGBA")  # Ensure RGBA
         self.highlights = highlights
-        self.scale = scale
+        self.scale_factor = scale_factor  # Renamed from scale to scale_factor for clarity
         self.render_ocr = render_ocr
         self.result_image = self.base_image.copy()
-        self.vertex_size = max(3, int(2 * self.scale))  # Size of corner markers
+        self.vertex_size = max(3, int(2 * self.scale_factor))  # Size of corner markers
     def render(self) -> Image.Image:
         """Executes the rendering process."""
@@ -98,7 +98,7 @@ class HighlightRenderer:
             scaled_bbox = None
             if highlight.is_polygon:
-                scaled_polygon = [(p[0] * self.scale, p[1] * self.scale) for p in highlight.polygon]
+                scaled_polygon = [(p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon]
                 # Draw polygon fill and border
                 draw.polygon(
                     scaled_polygon, fill=highlight.color, outline=highlight.border_color, width=2
@@ -113,10 +113,10 @@ class HighlightRenderer:
             else:  # Rectangle
                 x0, top, x1, bottom = highlight.bbox
                 x0_s, top_s, x1_s, bottom_s = (
-                    x0 * self.scale,
-                    top * self.scale,
-                    x1 * self.scale,
-                    bottom * self.scale,
+                    x0 * self.scale_factor,
+                    top * self.scale_factor,
+                    x1 * self.scale_factor,
+                    bottom * self.scale_factor,
                 )
                 scaled_bbox = [x0_s, top_s, x1_s, bottom_s]
                 # Draw rectangle fill and border
@@ -159,15 +159,15 @@ class HighlightRenderer:
         """Draws attribute key-value pairs on the highlight."""
         try:
             # Slightly larger font, scaled
-            font_size = max(10, int(8 * self.scale))
+            font_size = max(10, int(8 * self.scale_factor))
             # Prioritize monospace fonts for better alignment
             font = ImageFont.truetype("Arial.ttf", font_size)  # Fallback sans-serif
         except IOError:
             font = ImageFont.load_default()
             font_size = 10  # Reset size for default font
-        line_height = font_size + int(4 * self.scale)  # Scaled line spacing
-        bg_padding = int(3 * self.scale)
+        line_height = font_size + int(4 * self.scale_factor)  # Scaled line spacing
+        bg_padding = int(3 * self.scale_factor)
         max_width = 0
         text_lines = []
@@ -191,8 +191,8 @@ class HighlightRenderer:
         total_height = line_height * len(text_lines)
         # Position near top-right corner with padding
-        x = bbox_scaled[2] - int(2 * self.scale) - max_width
-        y = bbox_scaled[1] + int(2 * self.scale)
+        x = bbox_scaled[2] - int(2 * self.scale_factor) - max_width
+        y = bbox_scaled[1] + int(2 * self.scale_factor)
         # Draw background rectangle (semi-transparent white)
         bg_x0 = x - bg_padding
@@ -244,10 +244,10 @@ class HighlightRenderer:
         for element in ocr_elements:
             x0, top, x1, bottom = element.bbox
             x0_s, top_s, x1_s, bottom_s = (
-                x0 * self.scale,
-                top * self.scale,
-                x1 * self.scale,
-                bottom * self.scale,
+                x0 * self.scale_factor,
+                top * self.scale_factor,
+                x1 * self.scale_factor,
+                bottom * self.scale_factor,
             )
             box_w, box_h = x1_s - x0_s, bottom_s - top_s
@@ -556,19 +556,62 @@ class HighlightingService:
         self._highlights_by_page[page_index].append(highlight)
         logger.debug(f"Added highlight to page {page_index}: {highlight}")
+        # --- Invalidate page-level image cache --------------------------------
+        # The Page.to_image method maintains an internal cache keyed by rendering
+        # parameters.  Because the cache key currently does **not** incorporate
+        # any information about the highlights themselves, it can return stale
+        # images after highlights are added or removed.  To ensure the next
+        # render reflects the new highlights, we clear the cache for the
+        # affected page here.
+        try:
+            page_obj = self._pdf[page_index]
+            if hasattr(page_obj, "_to_image_cache"):
+                page_obj._to_image_cache.clear()
+                logger.debug(
+                    f"Cleared cached to_image renders for page {page_index} after adding a highlight."
+                )
+        except Exception as cache_err:  # pragma: no cover – never fail highlight creation
+            logger.warning(
+                f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
+                exc_info=True,
+            )
     def clear_all(self):
         """Clears all highlights from all pages and resets the color manager."""
         self._highlights_by_page = {}
         self._color_manager.reset()
         logger.info("Cleared all highlights and reset ColorManager.")
+        # Clear cached images for *all* pages because their visual state may
+        # depend on highlight visibility.
+        for idx, page in enumerate(self._pdf.pages):
+            try:
+                if hasattr(page, "_to_image_cache"):
+                    page._to_image_cache.clear()
+            except Exception:
+                # Non-critical – keep going for remaining pages
+                continue
     def clear_page(self, page_index: int):
         """Clears all highlights from a specific page."""
         if page_index in self._highlights_by_page:
             del self._highlights_by_page[page_index]
             logger.debug(f"Cleared highlights for page {page_index}.")
-            # Note: We typically don't reset the color manager when clearing a single page
-            # to maintain color consistency if highlights are added back.
+        # Also clear any cached rendered images for this page so the next render
+        # reflects the removal of highlights.
+        try:
+            page_obj = self._pdf[page_index]
+            if hasattr(page_obj, "_to_image_cache"):
+                page_obj._to_image_cache.clear()
+                logger.debug(
+                    f"Cleared cached to_image renders for page {page_index} after removing highlights."
+                )
+        except Exception as cache_err:  # pragma: no cover
+            logger.warning(
+                f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
+                exc_info=True,
+            )
     def get_highlights_for_page(self, page_index: int) -> List[Highlight]:
         """Returns a list of Highlight objects for a specific page."""
@@ -581,11 +624,10 @@ class HighlightingService:
     def render_page(
         self,
         page_index: int,
-        scale: float = 2.0,
+        resolution: float = 144,
         labels: bool = True,
         legend_position: str = "right",
         render_ocr: bool = False,
-        resolution: Optional[float] = None,
         **kwargs,  # Pass other args to pdfplumber.page.to_image if needed
     ) -> Optional[Image.Image]:
         """
@@ -594,12 +636,11 @@ class HighlightingService:
         Args:
             page_index: The 0-based index of the page to render.
-            scale: Scale factor for rendering highlights if width/height/resolution not in kwargs.
+            resolution: Resolution (DPI) for the base page image if width/height not in kwargs.
+                       Defaults to 144 DPI (equivalent to previous scale=2.0).
             labels: Whether to include a legend for highlights.
             legend_position: Position of the legend.
             render_ocr: Whether to render OCR text on the image.
-            resolution: Optional resolution (DPI) for the base page image if width/height not in kwargs.
-                       Defaults to scale * 72 if not otherwise specified.
             kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
         Returns:
@@ -625,13 +666,16 @@ class HighlightingService:
             logger.debug(f"Rendering page {page_index} with height={to_image_args['height']}.")
             # Actual scale will be calculated after image creation
         else:
-            # Use explicit resolution from kwargs if present, then the resolution param, then scale
-            render_resolution = to_image_args.pop(
-                "resolution", resolution
-            )  # Use and remove from kwargs if present
+            # Use explicit resolution if provided via kwargs, otherwise fallback to the
+            # `resolution` parameter (which might be None).  If we still end up with
+            # `None`, default to 144 DPI to avoid downstream errors.
+            render_resolution = to_image_args.pop("resolution", resolution)
             if render_resolution is None:
-                render_resolution = scale * 72
-            to_image_args["resolution"] = render_resolution  # Add it back for the call
+                render_resolution = 144
+            # Reinstate into kwargs for pdfplumber
+            to_image_args["resolution"] = render_resolution
             actual_scale_x = render_resolution / 72.0
             actual_scale_y = render_resolution / 72.0
             logger.debug(
@@ -657,11 +701,11 @@ class HighlightingService:
                 if page_obj.width > 0:
                     actual_scale_x = base_image_pil.width / page_obj.width
                 else:
-                    actual_scale_x = scale  # Fallback
+                    actual_scale_x = resolution / 72.0  # Fallback to resolution-based scale
                 if page_obj.height > 0:
                     actual_scale_y = base_image_pil.height / page_obj.height
                 else:
-                    actual_scale_y = scale  # Fallback
+                    actual_scale_y = resolution / 72.0  # Fallback to resolution-based scale
                 logger.debug(
                     f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}"
                 )
@@ -682,14 +726,20 @@ class HighlightingService:
                 page=page_obj,
                 base_image=base_image_pil,
                 highlights=highlights_on_page,
-                scale=renderer_scale,  # Use the determined actual scale
+                scale_factor=renderer_scale,  # Use the determined actual scale
                 render_ocr=render_ocr,
             )
             rendered_image = renderer.render()
         else:
             if render_ocr:
                 # Still render OCR even if no highlights, using the determined actual scale
-                renderer = HighlightRenderer(page_obj, base_image_pil, [], renderer_scale, True)
+                renderer = HighlightRenderer(
+                    page=page_obj,
+                    base_image=base_image_pil,
+                    highlights=[],
+                    scale_factor=renderer_scale,
+                    render_ocr=True,
+                )
                 rendered_image = renderer.render()
             else:
                 rendered_image = base_image_pil  # No highlights, no OCR requested
@@ -722,11 +772,10 @@ class HighlightingService:
         self,
         page_index: int,
         temporary_highlights: List[Dict],
-        scale: float = 2.0,
+        resolution: float = 144,
         labels: bool = True,
         legend_position: str = "right",
         render_ocr: bool = False,
-        resolution: Optional[float] = None,
         crop_bbox: Optional[Tuple[float, float, float, float]] = None,
         **kwargs,
     ) -> Optional[Image.Image]:
@@ -737,11 +786,11 @@ class HighlightingService:
         Args:
             page_index: Index of the page to render.
             temporary_highlights: List of highlight data dicts (from ElementCollection._prepare).
-            scale: Original scale factor for rendering, used if width/height are not provided.
+            resolution: Resolution (DPI) for base page image rendering if width/height not used.
+                       Defaults to 144 DPI (equivalent to previous scale=2.0).
             labels: Whether to include a legend.
             legend_position: Position of the legend.
             render_ocr: Whether to render OCR text.
-            resolution: Resolution for base page image rendering if width/height not used.
             crop_bbox: Optional bounding box (x0, top, x1, bottom) in PDF coordinate
                 space to crop the output image to, before legends or other overlays are
                 applied. If None, no cropping is performed.
@@ -777,9 +826,11 @@ class HighlightingService:
             # Resolution is implicitly handled by pdfplumber when height is set
             # after image is created, we will calculate actual_scale_x and actual_scale_y
         else:
-            # Neither width nor height is provided, use resolution or scale.
-            render_resolution = resolution if resolution is not None else scale * 72
+            # Neither width nor height is provided, rely on `resolution`.
+            # If `resolution` was explicitly passed as `None`, fall back to 144 DPI.
+            render_resolution = 144 if resolution is None else resolution
             to_image_args["resolution"] = render_resolution
             actual_scale_x = render_resolution / 72.0
             actual_scale_y = render_resolution / 72.0
             logger.debug(
@@ -804,11 +855,11 @@ class HighlightingService:
                 if page_obj.width > 0:
                     actual_scale_x = base_image_pil.width / page_obj.width
                 else:
-                    actual_scale_x = scale  # Fallback to original scale
+                    actual_scale_x = resolution / 72.0  # Fallback to resolution-based scale
                 if page_obj.height > 0:
                     actual_scale_y = base_image_pil.height / page_obj.height
                 else:
-                    actual_scale_y = scale  # Fallback to original scale
+                    actual_scale_y = resolution / 72.0  # Fallback to resolution-based scale
                 logger.debug(
                     f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})"
                 )
@@ -855,7 +906,11 @@ class HighlightingService:
             renderer_scale = actual_scale_x
             renderer = HighlightRenderer(
-                page_obj, base_image_pil, preview_highlights, renderer_scale, render_ocr
+                page=page_obj,
+                base_image=base_image_pil,
+                highlights=preview_highlights,
+                scale_factor=renderer_scale,
+                render_ocr=render_ocr,
             )
             rendered_image = renderer.render()

natural_pdf/core/page.py CHANGED Viewed

@@ -867,6 +867,28 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             >>> page.region(right=200, width=50)  # Region from x=150 to x=200
             >>> page.region(top=100, bottom=200, width="full") # Explicit full width
         """
+        # ------------------------------------------------------------------
+        # Percentage support – convert strings like "30%" to absolute values
+        # based on page dimensions.  X-axis params (left, right, width) use
+        # page.width; Y-axis params (top, bottom, height) use page.height.
+        # ------------------------------------------------------------------
+        def _pct_to_abs(val, axis: str):
+            if isinstance(val, str) and val.strip().endswith("%"):
+                try:
+                    pct = float(val.strip()[:-1]) / 100.0
+                except ValueError:
+                    return val  # leave unchanged if not a number
+                return pct * (self.width if axis == "x" else self.height)
+            return val
+        left = _pct_to_abs(left, "x")
+        right = _pct_to_abs(right, "x")
+        width = _pct_to_abs(width, "x")
+        top = _pct_to_abs(top, "y")
+        bottom = _pct_to_abs(bottom, "y")
+        height = _pct_to_abs(height, "y")
         # --- Type checking and basic validation ---
         is_width_numeric = isinstance(width, (int, float))
         is_width_string = isinstance(width, str)
@@ -1137,6 +1159,40 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             user_kwargs=kwargs,  # Pass original user kwargs
         )
+        # --- Optional: apply Unicode BiDi algorithm for mixed RTL/LTR correctness ---
+        apply_bidi = kwargs.get("bidi", True)
+        if apply_bidi and result:
+            # Quick check for any RTL character
+            import unicodedata
+            def _contains_rtl(s):
+                return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
+            if _contains_rtl(result):
+                try:
+                    from bidi.algorithm import get_display  # type: ignore
+                    from natural_pdf.utils.bidi_mirror import mirror_brackets
+                    result = "\n".join(
+                        mirror_brackets(
+                            get_display(
+                                line,
+                                base_dir=(
+                                    "R"
+                                    if any(
+                                        unicodedata.bidirectional(ch)
+                                        in ("R", "AL", "AN")
+                                        for ch in line
+                                    )
+                                    else "L"
+                                ),
+                            )
+                        )
+                        for line in result.split("\n")
+                    )
+                except ModuleNotFoundError:
+                    pass  # silently skip if python-bidi not available
         logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
         return result
@@ -1440,7 +1496,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
     def show(
         self,
-        scale: float = 2.0,
+        resolution: float = 144,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = "right",
@@ -1450,7 +1506,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         Generates and returns an image of the page with persistent highlights rendered.
         Args:
-            scale: Scale factor for rendering.
+            resolution: Resolution in DPI for rendering (default: 144 DPI, equivalent to previous scale=2.0).
             width: Optional width for the output image.
             labels: Whether to include a legend for labels.
             legend_position: Position of the legend.
@@ -1460,7 +1516,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             PIL Image object of the page with highlights, or None if rendering fails.
         """
         return self.to_image(
-            scale=scale,
+            resolution=resolution,
             width=width,
             labels=labels,
             legend_position=legend_position,
@@ -1471,13 +1527,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
     def save_image(
         self,
         filename: str,
-        scale: float = 2.0,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = "right",
         render_ocr: bool = False,
         include_highlights: bool = True,  # Allow saving without highlights
-        resolution: Optional[float] = None,
+        resolution: float = 144,
         **kwargs,
     ) -> "Page":
         """
@@ -1485,13 +1540,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         Args:
             filename: Path to save the image to.
-            scale: Scale factor for rendering highlights.
             width: Optional width for the output image.
             labels: Whether to include a legend.
             legend_position: Position of the legend.
             render_ocr: Whether to render OCR text.
             include_highlights: Whether to render highlights.
-            resolution: Resolution for base image rendering.
+            resolution: Resolution in DPI for base image rendering (default: 144 DPI, equivalent to previous scale=2.0).
             **kwargs: Additional args for pdfplumber's to_image.
         Returns:
@@ -1500,7 +1554,6 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         # Use to_image to generate and save the image
         self.to_image(
             path=filename,
-            scale=scale,
             width=width,
             labels=labels,
             legend_position=legend_position,
@@ -1554,7 +1607,6 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
     def to_image(
         self,
         path: Optional[str] = None,
-        scale: float = 2.0,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = "right",
@@ -1569,12 +1621,11 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         Args:
             path: Optional path to save the image to.
-            scale: Scale factor for rendering highlights.
             width: Optional width for the output image.
             labels: Whether to include a legend for highlights.
             legend_position: Position of the legend.
             render_ocr: Whether to render OCR text on highlights.
-            resolution: Resolution in DPI for base page image (default: scale * 72).
+            resolution: Resolution in DPI for base page image. If None, uses global setting or defaults to 144 DPI.
             include_highlights: Whether to render highlights.
             exclusions: Accepts one of the following:
                         • None  – no masking (default)
@@ -1593,11 +1644,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         # Use global options if parameters are not explicitly set
         if width is None:
             width = natural_pdf.options.image.width
-        if resolution is None and natural_pdf.options.image.resolution is not None:
-            resolution = natural_pdf.options.image.resolution
+        if resolution is None:
+            if natural_pdf.options.image.resolution is not None:
+                resolution = natural_pdf.options.image.resolution
+            else:
+                resolution = 144  # Default resolution when none specified
         # 1. Create cache key (excluding path)
         cache_key_parts = [
-            scale,
             width,
             labels,
             legend_position,
@@ -1641,7 +1694,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             rendered_image_component: Optional[Image.Image] = (
                 None  # Renamed from 'image' in original
             )
-            render_resolution = resolution if resolution is not None else scale * 72
+            render_resolution = resolution
             thread_id = threading.current_thread().name
             logger.debug(
                 f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
@@ -1658,11 +1711,10 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
                         # Delegate rendering to the central service
                         rendered_image_component = self._highlighter.render_page(
                             page_index=self.index,
-                            scale=scale,
+                            resolution=render_resolution,
                             labels=labels,
                             legend_position=legend_position,
                             render_ocr=render_ocr,
-                            resolution=render_resolution,  # Pass the calculated resolution
                             **kwargs,
                         )
                     else:
@@ -2336,7 +2388,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
     def show_preview(
         self,
         temporary_highlights: List[Dict],
-        scale: float = 2.0,
+        resolution: float = 144,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = "right",
@@ -2349,7 +2401,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         Args:
             temporary_highlights: List of highlight data dictionaries (as prepared by
                                   ElementCollection._prepare_highlight_data).
-            scale: Scale factor for rendering.
+            resolution: Resolution in DPI for rendering (default: 144 DPI, equivalent to previous scale=2.0).
             width: Optional width for the output image.
             labels: Whether to include a legend.
             legend_position: Position of the legend.
@@ -2363,7 +2415,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             img = self._highlighter.render_preview(
                 page_index=self.index,
                 temporary_highlights=temporary_highlights,
-                scale=scale,
+                resolution=resolution,
                 labels=labels,
                 legend_position=legend_position,
                 render_ocr=render_ocr,
@@ -2897,3 +2949,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
             properties, and other details for each element
         """
         return self.find_all('*').inspect(limit=limit)
+    @property
+    def lines(self) -> List[Any]:
+        """Get all line elements on this page."""
+        return self._element_mgr.lines
+    # ------------------------------------------------------------------
+    # Image elements
+    # ------------------------------------------------------------------
+    @property
+    def images(self) -> List[Any]:
+        """Get all embedded raster images on this page."""
+        return self._element_mgr.images

natural_pdf/core/pdf.py CHANGED Viewed

@@ -653,8 +653,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             raise ValueError("Internal error: No selector or text provided.")
         selector_obj = parse_selector(effective_selector)
-        kwargs["regex"] = regex
-        kwargs["case"] = case
         # Search page by page
         for page in self.pages:

natural_pdf/describe/base.py CHANGED Viewed

@@ -269,15 +269,28 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
     base_columns = ['x0', 'top', 'x1', 'bottom']
     if element_type == 'word':
-        columns = ['text'] + base_columns + ['font_family', 'font_variant', 'size', 'bold', 'italic', 'source', 'confidence']
-        # Add color for text elements
+        columns = ['text'] + base_columns + [
+            'font_family',
+            'font_variant',
+            'size',
+            'bold',
+            'italic',
+            'strike',
+            'underline',
+            'highlight',
+            'source',
+            'confidence',
+        ]
+        # Add foreground text colour too
         columns.append('color')
     elif element_type == 'rect':
         columns = base_columns + ['width', 'height', 'stroke', 'fill', 'stroke_width']
     elif element_type == 'line':
         columns = base_columns + ['width', 'is_horizontal', 'is_vertical']  # LineElement properties
     elif element_type == 'region':
-        columns = base_columns + ['width', 'height', 'type']
+        columns = base_columns + ['width', 'height', 'type', 'color']
+    elif element_type == 'blob':
+        columns = base_columns + ['width', 'height', 'color']
     else:
         columns = base_columns + ['type']
@@ -325,19 +338,37 @@ def _extract_element_value(element: "Element", column: str) -> Any:
                 return fontname.split("+", 1)[0]
             return ''
-        elif column in ['bold', 'italic']:
+        elif column in ['bold', 'italic', 'strike', 'underline']:
             value = getattr(element, column, False)
             return value if isinstance(value, bool) else False
+        elif column == 'highlight':
+            # If element is highlighted, return its colour; otherwise blank
+            if getattr(element, 'highlight', False):
+                col_val = getattr(element, 'highlight_color', None)
+                if col_val is None:
+                    return 'True'  # fallback if colour missing
+                # Convert tuple to hex
+                if isinstance(col_val, (tuple, list)) and len(col_val) >= 3:
+                    try:
+                        r, g, b = [int(v * 255) if v <= 1 else int(v) for v in col_val[:3]]
+                        return f"#{r:02x}{g:02x}{b:02x}"
+                    except Exception:
+                        return str(col_val)
+                return str(col_val)
+            return ''
         elif column in ['stroke', 'fill', 'color']:
-            # For rectangles and text, these return color tuples
             value = getattr(element, column, None)
+            # If already a string (e.g. '#ff00aa' or 'red') return as is
+            if isinstance(value, str):
+                return value
+            # If tuple/list convert to hex
             if value and isinstance(value, (tuple, list)) and len(value) >= 3:
-                # Convert to hex color for display
                 try:
                     r, g, b = [int(v * 255) if v <= 1 else int(v) for v in value[:3]]
                     return f"#{r:02x}{g:02x}{b:02x}"
-                except:
+                except Exception:
                     return str(value)
             return ""
@@ -406,7 +437,7 @@ def describe_element(element: "Element") -> "ElementSummary":
     # Add common text properties - use dict structure for proper list formatting
     text_props = {}
-    for prop in ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']:
+    for prop in ['font_family', 'size', 'bold', 'italic', 'strike', 'underline', 'highlight', 'source', 'confidence']:
         if hasattr(element, prop):
             value = getattr(element, prop)
             if value is not None:
@@ -414,7 +445,7 @@ def describe_element(element: "Element") -> "ElementSummary":
                     text_props[prop] = round(value, 3)
                 elif prop == 'size' and isinstance(value, (int, float)):
                     text_props[prop] = round(value, 1)
-                elif prop in ['bold', 'italic']:
+                elif prop in ['bold', 'italic', 'strike', 'underline']:
                     text_props[prop] = value
                 else:
                     text_props[prop] = value

natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl

natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl