natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bad_pdf_analysis/analyze_10_more.py +300 -0
- bad_pdf_analysis/analyze_final_10.py +552 -0
- bad_pdf_analysis/analyze_specific_pages.py +394 -0
- bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +45 -1
- natural_pdf/analyzers/layout/surya.py +1 -1
- natural_pdf/analyzers/layout/yolo.py +2 -2
- natural_pdf/analyzers/shape_detection_mixin.py +228 -0
- natural_pdf/classification/manager.py +67 -0
- natural_pdf/core/element_manager.py +556 -25
- natural_pdf/core/highlighting_service.py +98 -43
- natural_pdf/core/page.py +86 -20
- natural_pdf/core/pdf.py +0 -2
- natural_pdf/describe/base.py +40 -9
- natural_pdf/describe/elements.py +11 -6
- natural_pdf/elements/base.py +134 -20
- natural_pdf/elements/collections.py +43 -11
- natural_pdf/elements/image.py +43 -0
- natural_pdf/elements/region.py +64 -19
- natural_pdf/elements/text.py +89 -11
- natural_pdf/flows/collections.py +4 -4
- natural_pdf/flows/region.py +17 -2
- natural_pdf/ocr/engine_paddle.py +1 -1
- natural_pdf/ocr/ocr_factory.py +8 -8
- natural_pdf/ocr/ocr_manager.py +51 -1
- natural_pdf/selectors/parser.py +27 -7
- natural_pdf/tables/__init__.py +5 -0
- natural_pdf/tables/result.py +101 -0
- natural_pdf/utils/bidi_mirror.py +36 -0
- natural_pdf/utils/visualization.py +15 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +51 -29
- natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
- optimization/memory_comparison.py +172 -0
- optimization/pdf_analyzer.py +410 -0
- optimization/performance_analysis.py +397 -0
- optimization/test_cleanup_methods.py +155 -0
- optimization/test_memory_fix.py +162 -0
- tools/bad_pdf_eval/__init__.py +1 -0
- tools/bad_pdf_eval/analyser.py +302 -0
- tools/bad_pdf_eval/collate_summaries.py +130 -0
- tools/bad_pdf_eval/eval_suite.py +116 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
- tools/bad_pdf_eval/llm_enrich.py +273 -0
- tools/bad_pdf_eval/reporter.py +17 -0
- tools/bad_pdf_eval/utils.py +127 -0
- tools/rtl_smoke_test.py +80 -0
- natural_pdf-0.1.27.dist-info/top_level.txt +0 -2
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/text.py
CHANGED
@@ -32,10 +32,34 @@ class TextElement(Element):
         obj["object_type"] = "text"

         super().__init__(obj, page)
-
-        #
+
+        # Memory optimization: Store character indices instead of full dictionaries
+        # This reduces memory usage by ~50% by avoiding character data duplication
+        self._char_indices = obj.pop("_char_indices", [])
+
+        # Backward compatibility: Keep _char_dicts for existing code
+        # But prefer _char_indices when available to save memory
         self._char_dicts = obj.pop("_char_dicts", [])

+    @property
+    def chars(self):
+        """Get constituent character elements efficiently.
+
+        Uses character indices when available to avoid memory duplication,
+        falls back to _char_dicts for backward compatibility.
+        """
+        if self._char_indices:
+            # Memory-efficient approach: access characters by index
+            if hasattr(self.page, '_element_mgr'):
+                char_elements = self.page._element_mgr.get_elements('chars')
+                return [char_elements[i] for i in self._char_indices if i < len(char_elements)]
+
+        # Backward compatibility: convert _char_dicts to TextElement objects
+        if self._char_dicts:
+            return [TextElement(char_dict, self.page) for char_dict in self._char_dicts]
+
+        return []
+
     @property
     def text(self) -> str:
         """Get the text content."""
@@ -43,17 +67,22 @@ class TextElement(Element):

     @text.setter
     def text(self, value: str):
-        """Set the text content and synchronise underlying char dictionaries (if any)."""
+        """Set the text content and synchronise underlying char dictionaries/indices (if any)."""
         # Update the primary text value stored on the object itself
         self._obj["text"] = value

-        # ---
-        # that rely on the raw character dictionaries see the corrected text.
-        # For OCR-generated words we usually have a single representative char
-        # dict; for native words there may be one per character.
-        # ---------------------------------------------------------------------
+        # --- Sync character data for both memory-efficient and legacy approaches
         try:
-
+            # If using memory-efficient character indices, update the referenced chars
+            if hasattr(self, "_char_indices") and self._char_indices:
+                if hasattr(self.page, '_element_mgr'):
+                    char_elements = self.page._element_mgr.get_elements('chars')
+                    for idx, char_idx in enumerate(self._char_indices):
+                        if char_idx < len(char_elements) and idx < len(value):
+                            char_elements[char_idx].text = value[idx]
+
+            # Legacy _char_dicts synchronization for backward compatibility
+            elif hasattr(self, "_char_dicts") and isinstance(self._char_dicts, list):
                 if not self._char_dicts:
                     return  # Nothing to update

@@ -93,7 +122,7 @@ class TextElement(Element):
             # Keep failures silent but logged; better to have outdated chars than crash.
             import logging
             logger = logging.getLogger(__name__)
-            logger.debug(f"TextElement: Failed to sync
+            logger.debug(f"TextElement: Failed to sync char data after text update: {sync_err}")

     @property
     def source(self) -> str:
@@ -331,6 +360,45 @@ class TextElement(Element):

         return False

+    @property
+    def strike(self) -> bool:  # alias: struck
+        """True if this element (word/char) is marked as strikethrough."""
+        # Two possible storage places: raw object dict (comes from extractor
+        # via extra_attrs) or metadata (if later pipeline stages mutate).
+        return bool(self._obj.get("strike") or self.metadata.get("decoration", {}).get("strike"))
+
+    # Back-compat alias
+    @property
+    def struck(self) -> bool:  # noqa: D401
+        return self.strike
+
+    # -----------------------------
+    # Underline decoration
+    # -----------------------------
+
+    @property
+    def underline(self) -> bool:
+        """True if element is underlined."""
+        return bool(self._obj.get("underline") or self.metadata.get("decoration", {}).get("underline"))
+
+    # -----------------------------
+    # Highlight decoration
+    # -----------------------------
+
+    @property
+    def is_highlighted(self) -> bool:
+        """True if element (char/word) is marked as highlighted in the original PDF."""
+        return bool(
+            self._obj.get("highlight")
+            or self._obj.get("is_highlighted")
+            or self.metadata.get("decoration", {}).get("highlight")
+        )
+
+    @property
+    def highlight_color(self):
+        """Return RGB(A) tuple of highlight colour if stored."""
+        return self._obj.get("highlight_color") or self.metadata.get("decoration", {}).get("highlight_color")
+
     def __repr__(self) -> str:
         """String representation of the text element."""
         if self.text:
@@ -342,6 +410,12 @@ class TextElement(Element):
             font_style.append("bold")
         if self.italic:
             font_style.append("italic")
+        if self.strike:
+            font_style.append("strike")
+        if self.underline:
+            font_style.append("underline")
+        if self.is_highlighted:
+            font_style.append("highlight")
         style_str = f", style={font_style}" if font_style else ""

         # Use font_family for display but include raw fontname and variant
@@ -353,7 +427,11 @@ class TextElement(Element):
             base_font = self.fontname.split("+", 1)[1]
             font_display = f"{font_display} ({base_font})"

-
+        color_info = ""
+        if self.is_highlighted and self.highlight_color is not None:
+            color_info = f", highlight_color={self.highlight_color}"
+
+        return f"<TextElement text='{preview}' font='{font_display}'{variant_str} size={self.size}{style_str}{color_info} bbox={self.bbox}>"

     def font_info(self) -> dict:
         """
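The new decoration flags can be read straight off word/char elements. A minimal sketch, assuming the usual natural_pdf.PDF entry point and a find_all("text")-style lookup (the file name is illustrative, not part of this diff):

    from natural_pdf import PDF

    pdf = PDF("example.pdf")  # hypothetical input file
    for word in pdf.pages[0].find_all("text"):
        # strike / underline / is_highlighted / highlight_color are the new properties above
        if word.strike or word.underline or word.is_highlighted:
            print(word.text, word.highlight_color)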
natural_pdf/flows/collections.py
CHANGED
@@ -164,7 +164,7 @@ class FlowElementCollection(MutableSequence[T_FEC]):

     def show(
         self,
-
+        resolution: Optional[float] = None,
         labels: bool = True,
         legend_position: str = "right",
         default_color: Optional[Union[Tuple, str]] = "orange",  # A distinct color for FEC show
@@ -273,7 +273,7 @@ class FlowElementCollection(MutableSequence[T_FEC]):
                     else getattr(page_obj, "page_number", 1) - 1
                 ),
                 temporary_highlights=temp_highlights_for_page,
-
+                resolution=resolution,
                 width=width,
                 labels=labels,
                 legend_position=legend_position,
@@ -480,7 +480,7 @@ class FlowRegionCollection(MutableSequence[T_FRC]):

     def show(
         self,
-
+        resolution: Optional[float] = None,
         labels: bool = True,
         legend_position: str = "right",
         default_color: Optional[Union[Tuple, str]] = "darkviolet",  # A distinct color for FRC show
@@ -565,7 +565,7 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
                     else getattr(page_obj, "page_number", 1) - 1
                 ),
                 temporary_highlights=temp_highlights_for_page,
-
+                resolution=resolution,
                 width=width,
                 labels=labels,
                 legend_position=legend_position,
natural_pdf/flows/region.py
CHANGED
@@ -244,7 +244,7 @@ class FlowRegion:

     def show(
         self,
-
+        resolution: Optional[float] = None,
         labels: bool = True,
         legend_position: str = "right",
         color: Optional[Union[Tuple, str]] = "fuchsia",
@@ -258,6 +258,21 @@
         """
         Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
         If multiple pages are involved, they are stacked into a single image.
+
+        Args:
+            resolution: Resolution in DPI for page rendering. If None, uses global setting or defaults to 144 DPI.
+            labels: Whether to include a legend for highlights.
+            legend_position: Position of the legend ('right', 'bottom', 'top', 'left').
+            color: Color for highlighting the constituent regions.
+            label_prefix: Prefix for region labels (e.g., 'FlowPart').
+            width: Optional width for the output image (overrides resolution).
+            stack_direction: Direction to stack multiple pages ('vertical' or 'horizontal').
+            stack_gap: Gap in pixels between stacked pages.
+            stack_background_color: RGB background color for the stacked image.
+            **kwargs: Additional arguments passed to the underlying rendering methods.
+
+        Returns:
+            PIL Image of the rendered pages with highlighted regions, or None if rendering fails.
         """
         if not self.constituent_regions:
             logger.info("FlowRegion.show() called with no constituent regions.")
@@ -350,7 +365,7 @@
                     else getattr(page_obj, "page_number", 1) - 1
                 ),
                 temporary_highlights=temp_highlights_for_page,
-
+                resolution=resolution,
                 width=width,
                 labels=labels,  # Pass through labels
                 legend_position=legend_position,
natural_pdf/ocr/engine_paddle.py
CHANGED
@@ -127,7 +127,7 @@ class PaddleOCREngine(OCREngine):
         except ImportError as e:
             self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
             raise RuntimeError(
-                "paddleocr is not available. Install via:
+                "paddleocr is not available. Install via: npdf install paddle"
             ) from e

         paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
natural_pdf/ocr/ocr_factory.py
CHANGED
@@ -32,7 +32,7 @@ class OCRFactory:
                 return SuryaOCREngine(**kwargs)
             except ImportError:
                 raise ImportError(
-                    "Surya engine requires additional dependencies. " "Install with:
+                    "Surya engine requires additional dependencies. " "Install with: npdf install surya"
                 )
         elif engine_type == "easyocr":
             try:
@@ -42,7 +42,7 @@
             except ImportError:
                 raise ImportError(
                     "EasyOCR engine requires the 'easyocr' package. "
-                    "Install with: pip install easyocr (or
+                    "Install with: pip install easyocr (or npdf install easyocr when available)"
                 )
         elif engine_type == "paddle":
             try:
@@ -52,7 +52,7 @@
             except ImportError:
                 raise ImportError(
                     "PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
-                    "Install with:
+                    "Install with: npdf install paddle"
                 )
         elif engine_type == "doctr":
             try:
@@ -137,9 +137,9 @@
 
         # If we get here, no engines are available
         raise ImportError(
-            "No OCR engines are installed. You can add one via the
-            "
-            "
-            "
-            "
+            "No OCR engines are installed. You can add one via the npdf installer, e.g.:\n"
+            "  npdf install easyocr  # fastest to set up\n"
+            "  npdf install paddle   # best Asian-language accuracy\n"
+            "  npdf install surya    # Surya OCR engine\n"
+            "  npdf install yolo     # Layout detection (YOLO)\n"
         )
natural_pdf/ocr/ocr_manager.py
CHANGED
@@ -94,7 +94,7 @@ class OCRManager:
                 engine_instance = engine_class()  # Instantiate first
                 if not engine_instance.is_available():
                     # Check availability before storing
-                    install_hint = f"
+                    install_hint = f"npdf install {engine_name}"

                     raise RuntimeError(
                         f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
@@ -295,3 +295,53 @@
                 )  # Log check failures at debug level
                 pass  # Ignore engines that fail to instantiate or check
         return available
+
+    def cleanup_engine(self, engine_name: Optional[str] = None) -> int:
+        """
+        Cleanup OCR engine instances to free memory.
+
+        Args:
+            engine_name: Specific engine to cleanup, or None to cleanup all engines
+
+        Returns:
+            Number of engines cleaned up
+        """
+        cleaned_count = 0
+
+        if engine_name:
+            # Cleanup specific engine
+            engine_name = engine_name.lower()
+            if engine_name in self._engine_instances:
+                engine = self._engine_instances.pop(engine_name)
+                if hasattr(engine, 'cleanup'):
+                    try:
+                        engine.cleanup()
+                    except Exception as e:
+                        logger.debug(f"Engine {engine_name} cleanup method failed: {e}")
+
+                # Clear associated locks
+                self._engine_locks.pop(engine_name, None)
+                self._engine_inference_locks.pop(engine_name, None)
+
+                logger.info(f"Cleaned up OCR engine: {engine_name}")
+                cleaned_count = 1
+        else:
+            # Cleanup all engines
+            for name, engine in list(self._engine_instances.items()):
+                if hasattr(engine, 'cleanup'):
+                    try:
+                        engine.cleanup()
+                    except Exception as e:
+                        logger.debug(f"Engine {name} cleanup method failed: {e}")
+
+            # Clear all caches
+            engine_count = len(self._engine_instances)
+            self._engine_instances.clear()
+            self._engine_locks.clear()
+            self._engine_inference_locks.clear()
+
+            if engine_count > 0:
+                logger.info(f"Cleaned up {engine_count} OCR engines")
+                cleaned_count = engine_count
+
+        return cleaned_count
natural_pdf/selectors/parser.py
CHANGED
@@ -224,6 +224,18 @@ def parse_selector(selector: str) -> Dict[str, Any]:

     selector = selector.strip()

+    # ------------------------------------------------------------------
+    # Handle wildcard selector (leading "*")
+    # ------------------------------------------------------------------
+    # A selector can start with "*" to denote "any element type", optionally
+    # followed by attribute blocks or pseudo-classes – e.g. *[width>100].
+    # We strip the asterisk but keep the remainder so the normal attribute
+    # / pseudo-class parsing logic can proceed.
+
+    if selector.startswith("*"):
+        # Keep everything *after* the asterisk (attributes, pseudos, etc.).
+        selector = selector[1:].strip()
+
     # --- Handle OR operators first (| or ,) ---
     # Check if selector contains OR operators at the top level only
     # (not inside quotes, parentheses, or brackets)
@@ -253,13 +265,6 @@

     # --- Continue with single selector parsing (existing logic) ---

-    # --- Handle wildcard selector explicitly ---
-    if selector == "*":
-        # Wildcard matches any type, already the default.
-        # Clear selector so the loop doesn't run and error out.
-        selector = ""
-    # --- END NEW ---
-
     # 1. Extract type (optional, at the beginning)
     # Only run if selector wasn't '*'
     if selector:
@@ -741,6 +746,21 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
         elif name == "vertical":
             filter_lambda = lambda el: hasattr(el, "is_vertical") and el.is_vertical

+        # --- New: :strike / :strikethrough / :strikeout pseudo-classes --- #
+        elif name in ("strike", "strikethrough", "strikeout"):
+            filter_lambda = lambda el: hasattr(el, "strike") and bool(getattr(el, "strike"))
+            filter_name = f"pseudo-class :{name}"
+        elif name in ("underline", "underlined"):
+            filter_lambda = lambda el: hasattr(el, "underline") and bool(getattr(el, "underline"))
+            filter_name = f"pseudo-class :{name}"
+        elif name in ("highlight", "highlighted"):
+            # Match only if the element exposes an `is_highlighted` boolean flag.
+            # We deliberately avoid looking at the generic `.highlight()` method on
+            # Element, because it is a callable present on every element and would
+            # incorrectly mark everything as highlighted.
+            filter_lambda = lambda el: bool(getattr(el, "is_highlighted", False))
+            filter_name = f"pseudo-class :{name}"
+
         # Check predefined lambda functions (e.g., :first-child, :empty)
         elif name in PSEUDO_CLASS_FUNCTIONS:
             filter_lambda = PSEUDO_CLASS_FUNCTIONS[name]
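Illustrative selector usage for the wildcard handling and the new decoration pseudo-classes; the page.find_all(...) call is how selectors are consumed elsewhere in natural-pdf and is assumed here:

    struck = page.find_all("text:strike")    # strikethrough words
    marked = page.find_all(":highlighted")   # anything flagged as highlighted in the PDF
    wide   = page.find_all("*[width>100]")   # wildcard type plus an attribute filter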
natural_pdf/tables/result.py
ADDED
@@ -0,0 +1,101 @@
+"""Sequence wrapper for table data with convenient DataFrame helpers."""
+from __future__ import annotations
+
+from collections.abc import Sequence
+from typing import Any, List, Iterator, Optional, Union
+
+
+class TableResult(Sequence):
+    """List-of-rows plus `.df` / `.to_df()` helpers.
+
+    The object behaves like an immutable sequence of rows (each row is a
+    list of cell values) but offers an easy hand-off to *pandas*.
+    """
+
+    _IMMUTABLE_MESSAGE = (
+        "TableResult is read-only; convert to list(result) if you need to mutate"
+    )
+
+    def __init__(self, rows: Optional[List[List[Any]]] = None) -> None:
+        # Normalise to list of list so that Sequence operations work as expected
+        self._rows: List[List[Any]] = list(rows or [])
+
+    # ---------------------------------------------------------------------
+    # Sequence API
+    # ---------------------------------------------------------------------
+    def __getitem__(self, index):  # type: ignore[override]
+        return self._rows[index]
+
+    def __len__(self) -> int:  # type: ignore[override]
+        return len(self._rows)
+
+    def __iter__(self) -> Iterator[List[Any]]:  # type: ignore[override]
+        return iter(self._rows)
+
+    # ------------------------------------------------------------------
+    # Convenience helpers
+    # ------------------------------------------------------------------
+    @property
+    def df(self):
+        """Quick property alias → calls :py:meth:`to_df` with default args."""
+        return self.to_df()
+
+    def to_df(self, header: Union[str, int, List[int], None] = "first", index_col=None, **kwargs):
+        """Convert to *pandas* DataFrame.
+
+        Parameters
+        ----------
+        header : "first" | int | list[int] | None, default "first"
+            • "first" – use row 0 as column names.\n            • int – use that row index.\n            • list[int] – multi-row header.\n            • None/False – no header.
+        index_col : same semantics as pandas, forwarded.
+        **kwargs : forwarded to :pyclass:`pandas.DataFrame`.
+        """
+        try:
+            import pandas as pd  # type: ignore
+        except ModuleNotFoundError as exc:
+            raise ImportError(
+                "pandas is required for TableResult.to_df(); install via `pip install pandas`."
+            ) from exc
+
+        rows = self._rows
+        if not rows:
+            return pd.DataFrame()
+
+        # Determine header rows and body rows
+        body = rows
+        hdr = None
+        if header == "first":
+            hdr = rows[0]
+            body = rows[1:]
+        elif header is None or header is False:
+            hdr = None
+        elif isinstance(header, int):
+            hdr = rows[header]
+            body = rows[:header] + rows[header + 1 :]
+        elif isinstance(header, (list, tuple)):
+            hdr_rows = [rows[i] for i in header]
+            body = [r for idx, r in enumerate(rows) if idx not in header]
+            hdr = hdr_rows
+        else:
+            raise ValueError("Invalid value for header parameter")
+
+        df = pd.DataFrame(body, columns=hdr)
+        if index_col is not None and not df.empty:
+            df.set_index(df.columns[index_col] if isinstance(index_col, int) else index_col, inplace=True)
+
+        if kwargs:
+            df = pd.DataFrame(df, **kwargs)
+        return df
+
+    # ------------------------------------------------------------------
+    # Block mutating operations to keep result read-only
+    # ------------------------------------------------------------------
+    def _readonly(self, *args, **kwargs):
+        raise TypeError(self._IMMUTABLE_MESSAGE)
+
+    append = extend = insert = __setitem__ = __delitem__ = clear = pop = remove = _readonly  # type: ignore
+
+    # Nice repr in notebooks
+    def __repr__(self) -> str:  # noqa: D401 (simple)
+        preview = "…" if len(self._rows) > 5 else ""
+        return f"TableResult(rows={len(self._rows)}{preview})"
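A short sketch of the TableResult hand-off to pandas, using only the API defined above (the row values are made up):

    from natural_pdf.tables.result import TableResult

    result = TableResult([["name", "amount"], ["alice", "3"], ["bob", "5"]])
    df = result.df                   # header="first": row 0 becomes the columns
    raw = result.to_df(header=None)  # keep every row as data, no column names
    print(len(result), list(df.columns))  # 3 ['name', 'amount']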
natural_pdf/utils/bidi_mirror.py
ADDED
@@ -0,0 +1,36 @@
+"""Light-weight bracket mirroring for RTL text.
+
+This module provides `mirror_brackets`, a fast pure-python helper that
+replaces each bracket/parenthesis character with its Unicode-defined pair.
+
+For everyday PDFs the six ASCII pairs are enough, but the mapping can be
+extended easily from Unicode's BidiBrackets.txt.
+"""
+from typing import Dict
+
+# Minimal mapping – ( ) [ ] { }
+_ASCII_MIRROR: Dict[int, str] = {
+    0x0028: ")",  # ( -> )
+    0x0029: "(",  # ) -> (
+    0x005B: "]",  # [ -> ]
+    0x005D: "[",  # ] -> [
+    0x007B: "}",  # { -> }
+    0x007D: "{",  # } -> {
+}
+
+
+def mirror_brackets(text: str) -> str:  # pragma: no cover
+    """Return *text* with each bracket replaced by its mirror partner.
+
+    The function is context-free: it blindly flips every character found in
+    the mapping, which is sufficient once the string is already in visual
+    order (e.g., after `bidi.algorithm.get_display`).
+    """
+    if not text:
+        return text
+    # Fast path: only allocate when needed
+    out_chars = []
+    append = out_chars.append
+    for ch in text:
+        append(_ASCII_MIRROR.get(ord(ch), ch))
+    return "".join(out_chars)
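Usage sketch for the helper, which is meant to run after a bidi reordering pass; the outputs follow directly from the mapping above:

    from natural_pdf.utils.bidi_mirror import mirror_brackets

    print(mirror_brackets("(test)"))  # ")test(" – every mapped bracket is flipped
    print(mirror_brackets("[a{b}]"))  # "]a}b{["
    print(mirror_brackets(""))        # "" – empty input is returned unchanged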
natural_pdf/utils/visualization.py
CHANGED
@@ -235,12 +235,26 @@ def merge_images_with_legend(


 def render_plain_page(page, resolution):
+    """
+    Render a page to PIL Image using the specified resolution.
+
+    Args:
+        page: Page object to render
+        resolution: DPI resolution for rendering
+
+    Returns:
+        PIL Image of the rendered page
+    """
     doc = pypdfium2.PdfDocument(page._page.pdf.stream)

     pdf_page = doc[page.index]

+    # Convert resolution (DPI) to scale factor for pypdfium2
+    # PDF standard is 72 DPI, so scale = resolution / 72
+    scale_factor = resolution / 72.0
+
     bitmap = pdf_page.render(
-        scale=
+        scale=scale_factor,
     )
     image = bitmap.to_pil().convert("RGB")

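The DPI-to-scale conversion above follows the 72-points-per-inch PDF convention; a quick worked check (the page size is just an example):

    scale = 144 / 72.0               # 2.0 – twice the native 72 DPI
    print(612 * scale, 792 * scale)  # a US-Letter page renders to a 1224 x 1584 px bitmap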
{natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.27
+Version: 0.1.30
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -24,6 +24,7 @@ Requires-Dist: pydantic
 Requires-Dist: jenkspy
 Requires-Dist: scipy
 Requires-Dist: ipywidgets>=7.0.0
+Requires-Dist: python-bidi
 Provides-Extra: test
 Requires-Dist: pytest; extra == "test"
 Requires-Dist: pytest-xdist; extra == "test"