PyPI - natural-pdf - Versions diffs - 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl - Mend

natural-pdf 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

natural_pdf/__init__.py +31 -0
natural_pdf/analyzers/layout/gemini.py +137 -162
natural_pdf/analyzers/layout/layout_manager.py +9 -5
natural_pdf/analyzers/layout/layout_options.py +77 -7
natural_pdf/analyzers/layout/paddle.py +318 -165
natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
natural_pdf/analyzers/shape_detection_mixin.py +770 -405
natural_pdf/classification/mixin.py +2 -8
natural_pdf/collections/pdf_collection.py +25 -30
natural_pdf/core/highlighting_service.py +47 -32
natural_pdf/core/page.py +226 -70
natural_pdf/core/pdf.py +19 -22
natural_pdf/elements/base.py +9 -9
natural_pdf/elements/collections.py +105 -50
natural_pdf/elements/region.py +320 -113
natural_pdf/exporters/paddleocr.py +38 -13
natural_pdf/flows/__init__.py +3 -3
natural_pdf/flows/collections.py +303 -132
natural_pdf/flows/element.py +277 -132
natural_pdf/flows/flow.py +33 -16
natural_pdf/flows/region.py +142 -79
natural_pdf/ocr/engine_doctr.py +37 -4
natural_pdf/ocr/engine_easyocr.py +23 -3
natural_pdf/ocr/engine_paddle.py +281 -30
natural_pdf/ocr/engine_surya.py +8 -3
natural_pdf/ocr/ocr_manager.py +75 -76
natural_pdf/ocr/ocr_options.py +52 -87
natural_pdf/search/__init__.py +25 -12
natural_pdf/search/lancedb_search_service.py +91 -54
natural_pdf/search/numpy_search_service.py +86 -65
natural_pdf/search/searchable_mixin.py +2 -2
natural_pdf/selectors/parser.py +125 -81
natural_pdf/widgets/__init__.py +1 -1
natural_pdf/widgets/viewer.py +205 -449
{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0

natural_pdf/__init__.py CHANGED Viewed

@@ -37,6 +37,36 @@ def configure_logging(level=logging.INFO, handler=None):
     logger.propagate = False
+# Global options system
+class ConfigSection:
+    """A configuration section that holds key-value option pairs."""
+    def __init__(self, **defaults):
+        self.__dict__.update(defaults)
+    def __repr__(self):
+        items = [f"{k}={v!r}" for k, v in self.__dict__.items()]
+        return f"{self.__class__.__name__}({', '.join(items)})"
+class Options:
+    """Global options for natural-pdf, similar to pandas options."""
+    def __init__(self):
+        # Image rendering defaults
+        self.image = ConfigSection(width=None, resolution=150)
+        # OCR defaults
+        self.ocr = ConfigSection(engine="easyocr", languages=["en"], min_confidence=0.5)
+        # Text extraction defaults (empty for now)
+        self.text = ConfigSection()
+# Create global options instance
+options = Options()
 # Version
 __version__ = "0.1.1"
@@ -90,6 +120,7 @@ __all__ = [
     "MultiModalSearchOptions",
     "BaseSearchOptions",
     "configure_logging",
+    "options",
 ]
 # Add QA components to __all__ if available

natural_pdf/analyzers/layout/gemini.py CHANGED Viewed

@@ -9,16 +9,6 @@ from typing import Any, Dict, List, Optional
 from PIL import Image
 from pydantic import BaseModel, Field
-# Use OpenAI library for interaction
-try:
-    from openai import OpenAI
-    from openai.types.chat import ChatCompletion
-    # Import OpenAIError for exception handling if needed
-except ImportError:
-    OpenAI = None
-    ChatCompletion = None
 try:
     from .base import LayoutDetector
     from .layout_options import BaseLayoutOptions, GeminiLayoutOptions
@@ -58,22 +48,28 @@ class DetectedRegion(BaseModel):
 class GeminiLayoutDetector(LayoutDetector):
-    """Document layout detector using Google's Gemini models via OpenAI compatibility layer."""
+    """
+    GeminiLayoutDetector: Layout analysis using Gemini via OpenAI-compatible API.
-    # Base URL for the Gemini OpenAI-compatible endpoint
-    GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
+    To use this detector, you must provide a compatible OpenAI client (e.g., from the openai package) via GeminiLayoutOptions.client.
+    See the documentation for an example of how to use Gemini layout analysis with natural-pdf.
+    """
     def __init__(self):
         super().__init__()
         self.supported_classes = set()  # Indicate dynamic nature
     def is_available(self) -> bool:
-        """Check if openai library is installed."""
-        if OpenAI is None:
-            logger.warning(
-                "openai package not found. Gemini detector (via OpenAI lib) will not be available. Run: pip install openai"
-            )
-            return False
+        """
+        Check if the Gemini detector is available.
+        Since this detector expects users to provide their own compatible OpenAI client,
+        the detector itself is always available. Users must ensure they have a compatible
+        client (e.g., from the openai package) and provide it via GeminiLayoutOptions.client.
+        Returns:
+            True - the detector is always available, but requires a compatible client.
+        """
         return True
     def _get_cache_key(self, options: GeminiLayoutOptions) -> str:
@@ -87,21 +83,13 @@ class GeminiLayoutDetector(LayoutDetector):
     def _load_model_from_options(self, options: GeminiLayoutOptions) -> Any:
         """Validate options and return the model name."""
-        if not self.is_available():
-            raise RuntimeError("OpenAI library not installed. Please run: pip install openai")
         if not isinstance(options, GeminiLayoutOptions):
             raise TypeError("Incorrect options type provided for Gemini model loading.")
         # Model loading is deferred to detect() based on whether a client is provided
         return options.model_name
     def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
         """Detect layout elements in an image using Gemini via OpenAI library."""
-        if not self.is_available():
-            # The is_available check now only confirms library presence
-            raise RuntimeError("OpenAI library not installed. Please run: pip install openai")
         # Ensure options are the correct type
         final_options: GeminiLayoutOptions
         if isinstance(options, GeminiLayoutOptions):
@@ -124,160 +112,147 @@ class GeminiLayoutDetector(LayoutDetector):
         model_name = self._get_model(final_options)
         detections = []
-        try:
-            # --- 1. Initialize OpenAI Client ---
-            client: Optional[OpenAI] = None
-            # Use the provided client instance
-            if hasattr(final_options.client, "beta") and hasattr(
-                final_options.client.beta.chat.completions, "parse"
-            ):
-                client = final_options.client
-                logger.debug("Using provided client instance.")
-            else:
-                logger.error(
-                    "Provided client does not seem compatible (missing beta.chat.completions.parse)."
-                )
-                raise TypeError(
-                    "Provided client is not compatible with the expected OpenAI interface."
-                )
+        # --- 1. Initialize OpenAI Client ---
+        client = getattr(final_options, "client", None)
+        if client is None:
+            raise RuntimeError(
+                "No client provided. Please provide a compatible OpenAI client via GeminiLayoutOptions.client."
+            )
-            if not client:
-                # This should not happen if logic above is correct, but as a safeguard
-                raise RuntimeError("Failed to obtain a valid client for Gemini detection.")
+        if not (
+            hasattr(client, "beta")
+            and hasattr(getattr(client.beta, "chat", None), "completions")
+            and hasattr(getattr(client.beta.chat.completions, "parse", None), "__call__")
+        ):
+            raise RuntimeError(
+                "Provided client is not compatible with the expected OpenAI interface."
+            )
+        logger.debug("Using provided client instance.")
-            # --- 2. Prepare Input for OpenAI API ---
-            if not final_options.classes:
-                logger.error("Gemini layout detection requires a list of classes to find.")
-                return []
+        # --- 2. Prepare Input for OpenAI API ---
+        if not final_options.classes:
+            logger.error("Gemini layout detection requires a list of classes to find.")
+            return []
-            width, height = image.size
+        width, height = image.size
+        # Convert image to base64
+        buffered = io.BytesIO()
+        image.save(buffered, format="PNG")
+        img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+        image_url = f"data:image/png;base64,{img_base64}"
+        class_list_str = ", ".join(f"`{c}`" for c in final_options.classes)
+        prompt_text = (
+            f"Analyze the provided image of a document page ({width}x{height}). "
+            f"Identify all regions corresponding to the following types: {class_list_str}. "
+            f"Return ONLY the structured data requested as formatted JSON."
+        )
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt_text},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url},
+                    },
+                ],
+            }
+        ]
-            # Convert image to base64
-            buffered = io.BytesIO()
-            image.save(buffered, format="PNG")
-            img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
-            image_url = f"data:image/png;base64,{img_base64}"
+        logger.debug(
+            f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {final_options.classes}"
+        )
-            class_list_str = ", ".join(f"`{c}`" for c in final_options.classes)
-            prompt_text = (
-                f"Analyze the provided image of a document page ({width}x{height}). "
-                f"Identify all regions corresponding to the following types: {class_list_str}. "
-                f"Return ONLY the structured data requested as formatted JSON."
-            )
+        completion_kwargs = {
+            "temperature": final_options.extra_args.get("temperature", 0.0),  # Default to low temp
+            "max_tokens": final_options.extra_args.get("max_tokens", 4096),
+        }
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": prompt_text},
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": image_url},
-                        },
-                    ],
-                }
-            ]
+        completion_kwargs = {k: v for k, v in completion_kwargs.items() if v is not None}
-            logger.debug(
-                f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {final_options.classes}"
-            )
+        class ImageContents(BaseModel):
+            regions: List[DetectedRegion]
-            completion_kwargs = {
-                "temperature": final_options.extra_args.get(
-                    "temperature", 0.0
-                ),  # Default to low temp
-                "max_tokens": final_options.extra_args.get("max_tokens", 4096),
-            }
+        completion: "ChatCompletion" = client.beta.chat.completions.parse(
+            model=model_name,
+            messages=messages,
+            response_format=ImageContents,
+            **completion_kwargs,
+        )
-            completion_kwargs = {k: v for k, v in completion_kwargs.items() if v is not None}
+        logger.debug(f"Gemini response received via OpenAI lib.")
-            class ImageContents(BaseModel):
-                regions: List[DetectedRegion]
+        # --- 4. Process Parsed Response ---
+        if not completion.choices:
+            logger.error("Gemini response (via OpenAI lib) contained no choices.")
+            return []
-            completion: ChatCompletion = client.beta.chat.completions.parse(
-                model=model_name,
-                messages=messages,
-                response_format=ImageContents,
-                **completion_kwargs,
+        # Get the parsed Pydantic objects
+        parsed_results = completion.choices[0].message.parsed.regions
+        if not parsed_results or not isinstance(parsed_results, list):
+            logger.error(
+                f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}"
             )
+            return []
-            logger.debug(f"Gemini response received via OpenAI lib.")
-            # --- 4. Process Parsed Response ---
-            if not completion.choices:
-                logger.error("Gemini response (via OpenAI lib) contained no choices.")
-                return []
-            # Get the parsed Pydantic objects
-            parsed_results = completion.choices[0].message.parsed.regions
-            if not parsed_results or not isinstance(parsed_results, list):
-                logger.error(
-                    f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}"
+        # --- 5. Convert to Detections & Filter ---
+        normalized_classes_req = {self._normalize_class_name(c) for c in final_options.classes}
+        normalized_classes_excl = (
+            {self._normalize_class_name(c) for c in final_options.exclude_classes}
+            if final_options.exclude_classes
+            else set()
+        )
+        for item in parsed_results:
+            # The item is already a validated DetectedRegion Pydantic object
+            # Access fields directly
+            label = item.label
+            bbox_raw = item.bbox
+            confidence_score = item.confidence
+            # Coordinates should already be floats, but ensure tuple format
+            xmin, ymin, xmax, ymax = tuple(bbox_raw)
+            # --- Apply Filtering ---
+            normalized_class = self._normalize_class_name(label)
+            # Check against requested classes (Should be guaranteed by schema, but doesn't hurt)
+            if normalized_class not in normalized_classes_req:
+                logger.warning(
+                    f"Gemini (via OpenAI) returned unexpected class '{label}' despite schema. Skipping."
                 )
-                return []
-            # --- 5. Convert to Detections & Filter ---
-            normalized_classes_req = {self._normalize_class_name(c) for c in final_options.classes}
-            normalized_classes_excl = (
-                {self._normalize_class_name(c) for c in final_options.exclude_classes}
-                if final_options.exclude_classes
-                else set()
-            )
+                continue
-            for item in parsed_results:
-                # The item is already a validated DetectedRegion Pydantic object
-                # Access fields directly
-                label = item.label
-                bbox_raw = item.bbox
-                confidence_score = item.confidence
-                # Coordinates should already be floats, but ensure tuple format
-                xmin, ymin, xmax, ymax = tuple(bbox_raw)
-                # --- Apply Filtering ---
-                normalized_class = self._normalize_class_name(label)
-                # Check against requested classes (Should be guaranteed by schema, but doesn't hurt)
-                if normalized_class not in normalized_classes_req:
-                    logger.warning(
-                        f"Gemini (via OpenAI) returned unexpected class '{label}' despite schema. Skipping."
-                    )
-                    continue
-                # Check against excluded classes
-                if normalized_class in normalized_classes_excl:
-                    logger.debug(
-                        f"Skipping excluded class '{label}' (normalized: {normalized_class})."
-                    )
-                    continue
-                # Check against base confidence threshold from options
-                if confidence_score < final_options.confidence:
-                    logger.debug(
-                        f"Skipping item with confidence {confidence_score:.3f} below threshold {final_options.confidence}."
-                    )
-                    continue
-                # Add detection
-                detections.append(
-                    {
-                        "bbox": (xmin, ymin, xmax, ymax),
-                        "class": label,  # Use original label from LLM
-                        "confidence": confidence_score,
-                        "normalized_class": normalized_class,
-                        "source": "layout",
-                        "model": "gemini",  # Keep model name generic as gemini
-                    }
+            # Check against excluded classes
+            if normalized_class in normalized_classes_excl:
+                logger.debug(f"Skipping excluded class '{label}' (normalized: {normalized_class}).")
+                continue
+            # Check against base confidence threshold from options
+            if confidence_score < final_options.confidence:
+                logger.debug(
+                    f"Skipping item with confidence {confidence_score:.3f} below threshold {final_options.confidence}."
                 )
+                continue
-            self.logger.info(
-                f"Gemini (via OpenAI lib) processed response. Detected {len(detections)} layout elements matching criteria."
+            # Add detection
+            detections.append(
+                {
+                    "bbox": (xmin, ymin, xmax, ymax),
+                    "class": label,  # Use original label from LLM
+                    "confidence": confidence_score,
+                    "normalized_class": normalized_class,
+                    "source": "layout",
+                    "model": "gemini",  # Keep model name generic as gemini
+                }
             )
-        except Exception as e:
-            # Catch potential OpenAI API errors or other issues
-            self.logger.error(f"Error during Gemini detection (via OpenAI lib): {e}", exc_info=True)
-            return []
+        self.logger.info(
+            f"Gemini (via OpenAI lib) processed response. Detected {len(detections)} layout elements matching criteria."
+        )
         return detections

natural_pdf/analyzers/layout/layout_manager.py CHANGED Viewed

@@ -121,14 +121,18 @@ class LayoutManager:
                 # Construct helpful error message with install hint
                 install_hint = ""
                 if engine_name == "yolo":
-                    install_hint = "pip install 'natural-pdf[layout_yolo]'"
+                    install_hint = "pip install doclayout_yolo"
                 elif engine_name == "tatr":
-                    install_hint = "pip install 'natural-pdf[core-ml]'"
+                    # This should now be installed with core dependencies
+                    install_hint = "(should be installed with natural-pdf, check for import errors)"
                 elif engine_name == "paddle":
-                    install_hint = "pip install 'natural-pdf[paddle]'"
+                    install_hint = "pip install paddleocr paddlepaddle"
                 elif engine_name == "surya":
-                    install_hint = "pip install 'natural-pdf[surya]'"
-                # Add other engines like docling if they become optional extras
+                    install_hint = "pip install surya-ocr"
+                elif engine_name == "docling":
+                    install_hint = "pip install docling"
+                elif engine_name == "gemini":
+                    install_hint = "pip install openai"
                 else:
                     install_hint = f"(Check installation requirements for {engine_name})"

natural_pdf/analyzers/layout/layout_options.py CHANGED Viewed

@@ -54,14 +54,84 @@ class TATRLayoutOptions(BaseLayoutOptions):
 # --- Paddle Specific Options ---
 @dataclass
 class PaddleLayoutOptions(BaseLayoutOptions):
-    """Options specific to PaddlePaddle PP-Structure layout detection."""
-    lang: str = "en"  # Language ('en', 'ch', etc.)
-    use_angle_cls: bool = False  # Use text angle classification?
-    enable_table: bool = True  # Enable table structure detection?
-    show_log: bool = False  # Show Paddle internal logs?
-    detect_text: bool = True  # Also detect raw text boxes using PaddleOCR?
+    """
+    Options specific to PaddlePaddle PP-StructureV3 layout detection.
+    See: https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-StructureV3.html
+    """
+    # Model paths and names
+    layout_detection_model_name: Optional[str] = None
+    layout_detection_model_dir: Optional[str] = None
+    layout_threshold: Optional[float] = None
+    layout_nms: Optional[bool] = None
+    layout_unclip_ratio: Optional[float] = None
+    layout_merge_bboxes_mode: Optional[str] = None
+    chart_recognition_model_name: Optional[str] = None
+    chart_recognition_model_dir: Optional[str] = None
+    chart_recognition_batch_size: Optional[int] = None
+    region_detection_model_name: Optional[str] = None
+    region_detection_model_dir: Optional[str] = None
+    doc_orientation_classify_model_name: Optional[str] = None
+    doc_orientation_classify_model_dir: Optional[str] = None
+    doc_unwarping_model_name: Optional[str] = None
+    doc_unwarping_model_dir: Optional[str] = None
+    text_detection_model_name: Optional[str] = None
+    text_detection_model_dir: Optional[str] = None
+    text_det_limit_side_len: Optional[int] = None
+    text_det_limit_type: Optional[str] = None
+    text_det_thresh: Optional[float] = None
+    text_det_box_thresh: Optional[float] = None
+    text_det_unclip_ratio: Optional[float] = None
+    textline_orientation_model_name: Optional[str] = None
+    textline_orientation_model_dir: Optional[str] = None
+    textline_orientation_batch_size: Optional[int] = None
+    text_recognition_model_name: Optional[str] = None
+    text_recognition_model_dir: Optional[str] = None
+    text_recognition_batch_size: Optional[int] = None
+    text_rec_score_thresh: Optional[float] = None
+    table_classification_model_name: Optional[str] = None
+    table_classification_model_dir: Optional[str] = None
+    wired_table_structure_recognition_model_name: Optional[str] = None
+    wired_table_structure_recognition_model_dir: Optional[str] = None
+    wireless_table_structure_recognition_model_name: Optional[str] = None
+    wireless_table_structure_recognition_model_dir: Optional[str] = None
+    wired_table_cells_detection_model_name: Optional[str] = None
+    wired_table_cells_detection_model_dir: Optional[str] = None
+    wireless_table_cells_detection_model_name: Optional[str] = None
+    wireless_table_cells_detection_model_dir: Optional[str] = None
+    seal_text_detection_model_name: Optional[str] = None
+    seal_text_detection_model_dir: Optional[str] = None
+    seal_det_limit_side_len: Optional[int] = None
+    seal_det_limit_type: Optional[str] = None
+    seal_det_thresh: Optional[float] = None
+    seal_det_box_thresh: Optional[float] = None
+    seal_det_unclip_ratio: Optional[float] = None
+    seal_text_recognition_model_name: Optional[str] = None
+    seal_text_recognition_model_dir: Optional[str] = None
+    seal_text_recognition_batch_size: Optional[int] = None
+    seal_rec_score_thresh: Optional[float] = None
+    formula_recognition_model_name: Optional[str] = None
+    formula_recognition_model_dir: Optional[str] = None
+    formula_recognition_batch_size: Optional[int] = None
+    # Module usage flags
+    use_doc_orientation_classify: Optional[bool] = True
+    use_doc_unwarping: Optional[bool] = True
+    use_textline_orientation: Optional[bool] = True
+    use_seal_recognition: Optional[bool] = False
+    use_table_recognition: Optional[bool] = True
+    use_formula_recognition: Optional[bool] = False
+    use_chart_recognition: Optional[bool] = True
+    use_region_detection: Optional[bool] = True
+    # General parameters
+    device: Optional[str] = None
+    enable_hpi: Optional[bool] = None
+    use_tensorrt: Optional[bool] = None
+    precision: Optional[str] = None
+    enable_mkldnn: Optional[bool] = False
+    cpu_threads: Optional[int] = None
+    paddlex_config: Optional[str] = None
+    lang: Optional[str] = None  # For English model selection
     verbose: bool = False  # Verbose logging for the detector class
+    create_cells: Optional[bool] = True
 # --- Surya Specific Options ---

natural-pdf 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl

natural-pdf 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl