PyPI - natural-pdf - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl - Mend

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

docs/categorizing-documents/index.md +168 -0
docs/data-extraction/index.md +87 -0
docs/element-selection/index.ipynb +218 -164
docs/element-selection/index.md +20 -0
docs/index.md +19 -0
docs/ocr/index.md +63 -16
docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
docs/tutorials/02-finding-elements.ipynb +123 -46
docs/tutorials/03-extracting-blocks.ipynb +24 -19
docs/tutorials/04-table-extraction.ipynb +17 -12
docs/tutorials/05-excluding-content.ipynb +37 -32
docs/tutorials/06-document-qa.ipynb +36 -31
docs/tutorials/07-layout-analysis.ipynb +45 -40
docs/tutorials/07-working-with-regions.ipynb +61 -60
docs/tutorials/08-spatial-navigation.ipynb +76 -71
docs/tutorials/09-section-extraction.ipynb +160 -155
docs/tutorials/10-form-field-extraction.ipynb +71 -66
docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
docs/tutorials/12-ocr-integration.ipynb +3420 -312
docs/tutorials/12-ocr-integration.md +68 -106
docs/tutorials/13-semantic-search.ipynb +641 -251
natural_pdf/__init__.py +2 -0
natural_pdf/classification/manager.py +343 -0
natural_pdf/classification/mixin.py +149 -0
natural_pdf/classification/results.py +62 -0
natural_pdf/collections/mixins.py +63 -0
natural_pdf/collections/pdf_collection.py +321 -15
natural_pdf/core/element_manager.py +67 -0
natural_pdf/core/page.py +227 -64
natural_pdf/core/pdf.py +387 -378
natural_pdf/elements/collections.py +272 -41
natural_pdf/elements/region.py +99 -15
natural_pdf/elements/text.py +5 -2
natural_pdf/exporters/paddleocr.py +1 -1
natural_pdf/extraction/manager.py +134 -0
natural_pdf/extraction/mixin.py +246 -0
natural_pdf/extraction/result.py +37 -0
natural_pdf/ocr/engine_easyocr.py +6 -3
natural_pdf/ocr/ocr_manager.py +85 -25
natural_pdf/ocr/ocr_options.py +33 -10
natural_pdf/ocr/utils.py +14 -3
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/selectors/parser.py +363 -238
natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/text_extraction.py +52 -1
natural_pdf/utils/tqdm_utils.py +43 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0

natural_pdf/elements/region.py CHANGED Viewed

@@ -13,6 +13,14 @@ from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_t
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import utility
+# --- Classification Imports --- #
+from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
+# --- End Classification Imports --- #
+from natural_pdf.utils.locks import pdf_render_lock # Import the lock
+from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
     from natural_pdf.elements.text import TextElement
@@ -27,7 +35,7 @@ except ImportError:
 logger = logging.getLogger(__name__)
-class Region(DirectionalMixin):
+class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
     """
     Represents a rectangular region on a page.
     """
@@ -57,6 +65,12 @@ class Region(DirectionalMixin):
         self.start_element = None
         self.end_element = None
+        # --- ADDED --- Metadata store for mixins
+        self.metadata: Dict[str, Any] = {}
+        # --- NEW --- Central registry for analysis results
+        self.analyses: Dict[str, Any] = {}
+        # --- END ADDED ---
         # Standard attributes for all elements
         self.object_type = "region"  # For selector compatibility
@@ -600,6 +614,18 @@ class Region(DirectionalMixin):
         x1 = int(self.x1 * scale_factor)
         bottom = int(self.bottom * scale_factor)
+        # Ensure coords are valid for cropping (left < right, top < bottom)
+        if x0 >= x1:
+             logger.warning(
+                 f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
+             )
+             return None
+        if top >= bottom:
+             logger.warning(
+                 f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
+             )
+             return None
         # Crop the image to just this region
         region_image = page_image.crop((x0, top, x1, bottom))
@@ -776,11 +802,6 @@ class Region(DirectionalMixin):
         debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
         logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
-        # --- Handle Docling source (priority) --- DEPRECATED or Adapt?
-        # For now, let's bypass this and always use the standard extraction flow
-        # based on contained elements to ensure consistency.
-        # if self.model == 'docling' or hasattr(self, 'text_content'): ...
         # 1. Get Word Elements potentially within this region (initial broad phase)
         # Optimization: Could use spatial query if page elements were indexed
         page_words = self.page.words  # Get all words from the page
@@ -829,7 +850,7 @@ class Region(DirectionalMixin):
         result = generate_text_layout(
             char_dicts=filtered_chars,
             layout_context_bbox=self.bbox,  # Use region's bbox for context
-            user_kwargs=kwargs,
+            user_kwargs=kwargs, # Pass original kwargs to layout generator
         )
         logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
@@ -1084,11 +1105,14 @@ class Region(DirectionalMixin):
         filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
         return ElementCollection(filtered_elements)
-    def apply_ocr(self, **ocr_params) -> "Region":
+    def apply_ocr(self, replace=True, **ocr_params) -> "Region":
         """
         Apply OCR to this region and return the created text elements.
         Args:
+            replace: If True (default), removes existing OCR elements in the region
+                    before adding new ones. If False, adds new OCR elements without
+                    removing existing ones.
             **ocr_params: Keyword arguments passed to the OCR Manager.
                           Common parameters like `engine`, `languages`, `min_confidence`,
                           `device`, and `resolution` (for image rendering) should be
@@ -1098,12 +1122,28 @@ class Region(DirectionalMixin):
                           an `options` object (e.g., `options=EasyOCROptions(...)`).
         Returns:
-            List of created TextElement objects representing OCR words/lines.
+            Self for method chaining.
         """
         # Ensure OCRManager is available
         if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
             logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
-            return []
+            return self
+        # If replace is True, find and remove existing OCR elements in this region
+        if replace:
+            logger.info(f"Region {self.bbox}: Removing existing OCR elements before applying new OCR.")
+            # Find all OCR elements in this region
+            ocr_selector = "text[source=ocr]"
+            ocr_elements = self.find_all(ocr_selector)
+            if ocr_elements:
+                logger.info(f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove.")
+                # Remove these elements from their page
+                removed_count = ocr_elements.remove()
+                logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
+            else:
+                logger.info(f"Region {self.bbox}: No existing OCR elements found to remove.")
         ocr_mgr = self.page._parent._ocr_manager
         # Determine rendering resolution from parameters
@@ -1123,11 +1163,11 @@ class Region(DirectionalMixin):
             )
             if not region_image:
                 logger.error("Failed to render region to image for OCR.")
-                return []
+                return self
             logger.debug(f"Region rendered to image size: {region_image.size}")
         except Exception as e:
             logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
-            return []
+            return self
         # Prepare args for the OCR Manager
         manager_args = {
@@ -1148,11 +1188,11 @@ class Region(DirectionalMixin):
                 logger.error(
                     f"OCRManager returned unexpected type for single region image: {type(results)}"
                 )
-                return []
+                return self
             logger.debug(f"Region OCR processing returned {len(results)} results.")
         except Exception as e:
             logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
-            return []
+            return self
         # Convert results to TextElements
         scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
@@ -1719,7 +1759,7 @@ class Region(DirectionalMixin):
         """
         # Find OCR elements specifically within this region
         # Note: We typically want to correct even if the element falls in an excluded area
-        target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
+        target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
         # Delegate to the utility function
         _apply_ocr_correction_to_elements(
@@ -1729,3 +1769,47 @@ class Region(DirectionalMixin):
         )
         return self  # Return self for chaining
+    # --- Classification Mixin Implementation --- #
+    def _get_classification_manager(self) -> "ClassificationManager":
+        if not hasattr(self, 'page') or not hasattr(self.page, 'pdf') or not hasattr(self.page.pdf, 'get_manager'):
+             raise AttributeError("ClassificationManager cannot be accessed: Parent Page, PDF, or get_manager method missing.")
+        try:
+             # Use the PDF's manager registry accessor via page
+             return self.page.pdf.get_manager('classification')
+        except (ValueError, RuntimeError, AttributeError) as e:
+             # Wrap potential errors from get_manager for clarity
+             raise AttributeError(f"Failed to get ClassificationManager from PDF via Page: {e}") from e
+    def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
+        if model_type == 'text':
+            text_content = self.extract_text(layout=False) # Simple join for classification
+            if not text_content or text_content.isspace():
+                raise ValueError("Cannot classify region with 'text' model: No text content found.")
+            return text_content
+        elif model_type == 'vision':
+            # Get resolution from manager/kwargs if possible, else default
+            # We access manager via the method to ensure it's available
+            manager = self._get_classification_manager()
+            default_resolution = 150 # Manager doesn't store default res, set here
+            # Note: classify() passes resolution via **kwargs if user specifies
+            resolution = kwargs.get('resolution', default_resolution) if 'kwargs' in locals() else default_resolution
+            img = self.to_image(
+                resolution=resolution,
+                include_highlights=False, # No highlights for classification input
+                crop_only=True # Just the region content
+            )
+            if img is None:
+                raise ValueError("Cannot classify region with 'vision' model: Failed to render image.")
+            return img
+        else:
+            raise ValueError(f"Unsupported model_type for classification: {model_type}")
+    def _get_metadata_storage(self) -> Dict[str, Any]:
+        # Ensure metadata exists
+        if not hasattr(self, 'metadata') or self.metadata is None:
+            self.metadata = {}
+        return self.metadata
+    # --- End Classification Mixin Implementation --- #

natural_pdf/elements/text.py CHANGED Viewed

@@ -274,9 +274,12 @@ class TextElement(Element):
         return False
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:
         """String representation of the text element."""
-        preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
+        if self.text:
+            preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
+        else:
+            preview = "..."
         font_style = []
         if self.bold:
             font_style.append("bold")

natural_pdf/exporters/paddleocr.py CHANGED Viewed

@@ -48,7 +48,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
             selector: CSS-like selector to filter which TextElements to export.
                       If None and corrected_only is False, all 'text' elements are considered.
             corrected_only: If True, overrides selector and exports only elements likely
-                            originating from a correction manifest (selector="text[source^=manifest]").
+                            originating from a correction manifest (selector="text[source=manifest]").
                             (default: False).
             split_ratio: Ratio for splitting data into training/validation sets (e.g., 0.9 for 90% train).
                          If None, creates a single `label.txt` file (default: 0.9).

natural_pdf/extraction/manager.py ADDED Viewed

@@ -0,0 +1,134 @@
+import logging
+from typing import Any, Type, Optional
+from pydantic import BaseModel
+import io
+import base64
+from PIL import Image
+from natural_pdf.extraction.result import StructuredDataResult
+logger = logging.getLogger(__name__)
class StructuredDataManager:
    """
    Manages the process of extracting structured data from elements using LLMs.

    This manager is typically accessed via `pdf.get_manager('structured_data')`.
    It is stateless and relies on parameters passed during method calls.
    """

    # Model identifiers used when the caller does not pass `model`.
    DEFAULT_TEXT_MODEL = "gpt-4o-mini"
    DEFAULT_VISION_MODEL = "gpt-4o"

    def __init__(self):
        """Initializes the manager."""
        logger.info("Initialized StructuredDataManager.")

    def is_available(self) -> bool:
        """Checks if necessary dependencies are available.

        Returns:
            True when `pydantic` can be imported, otherwise False (a warning
            is logged in that case).
        """
        try:
            import pydantic  # noqa: F401 -- imported only to probe availability
            return True
        except ImportError:
            logger.warning("Pydantic is required for structured data extraction.")
            return False

    def _prepare_llm_messages(
        self,
        content: Any,
        prompt: Optional[str],
        using: str,
        schema: "Type[BaseModel]"
    ) -> list:
        """Prepares the message list for the LLM API call.

        Args:
            content: Text (anything `str()` accepts) for using='text', or a
                PIL image for using='vision'.
            prompt: System prompt; when falsy a default prompt naming the
                schema class is generated.
            using: 'text' or 'vision'.
            schema: Schema class; only its `__name__` is used here.

        Returns:
            A list with one system message followed by one user message.

        Raises:
            TypeError: If using='vision' and `content` is not a PIL image.
            ValueError: If `using` is neither 'text' nor 'vision'.
        """
        system_prompt = prompt or f"Extract the information corresponding to the fields in the {schema.__name__} schema. Respond only with the structured data."
        messages = [
            {"role": "system", "content": system_prompt}
        ]

        if using == 'text':
            messages.append({"role": "user", "content": str(content)})
            return messages

        if using != 'vision':
            raise ValueError(f"Unsupported value for 'using': {using}")

        if not isinstance(content, Image.Image):
            raise TypeError(f"Content must be a PIL Image for using='vision', got {type(content)}")

        # Inline the image as a base64 PNG data URL inside the user message
        buffered = io.BytesIO()
        content.save(buffered, format="PNG")
        base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
        messages.append({
            "role": "user",
            "content": [
                {"type": "text", "text": "Extract information from this image based on the schema."},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                },
            ],
        })
        return messages

    def extract(
        self,
        content: Any,
        schema: "Type[BaseModel]",
        client: Any,
        prompt: Optional[str] = None,
        using: str = 'text',
        model: Optional[str] = None,
        **kwargs
    ) -> "StructuredDataResult":
        """
        Extract structured data from content using an LLM.

        Args:
            content: Text string or Image object. For using='vision' a list
                holding exactly one image is unwrapped.
            schema: Pydantic model class for the desired structure
            client: Initialized LLM client (e.g., OpenAI client); must expose
                `beta.chat.completions.parse`.
            prompt: Optional user-provided instructions
            using: Modality ('text' or 'vision')
            model: Specific LLM model identifier; defaults to
                DEFAULT_VISION_MODEL or DEFAULT_TEXT_MODEL depending on `using`.
            **kwargs: Additional parameters for the LLM API call

        Returns:
            StructuredDataResult object. Errors raised by the LLM call are not
            propagated; they are reported via `success=False` and
            `error_message`.

        Raises:
            NotImplementedError: If using='vision' and `content` is a list of
                more than one item.
            TypeError, ValueError: From message preparation (invalid `content`
                or `using`), which runs before the guarded LLM call.
        """
        logger.debug(f"Extract request: using='{using}', schema='{schema.__name__}'")

        if isinstance(content, list) and using == 'vision':
            if len(content) == 1:
                content = content[0]
            elif len(content) > 1:
                logger.error("Vision extraction not supported for multi-page PDFs")
                raise NotImplementedError("Batch image extraction on multi-page PDF objects is not supported. Apply to individual pages or regions instead.")

        selected_model = model or (self.DEFAULT_VISION_MODEL if using == 'vision' else self.DEFAULT_TEXT_MODEL)
        messages = self._prepare_llm_messages(content, prompt, using, schema)

        try:
            logger.debug(f"Extracting with model '{selected_model}'")
            completion = client.beta.chat.completions.parse(
                model=selected_model,
                messages=messages,
                response_format=schema,
                **kwargs
            )
            parsed_data = completion.choices[0].message.parsed
            # The result field is named `model_used`; an unknown `model=` keyword
            # would not populate it, leaving the model identifier unrecorded.
            return StructuredDataResult(
                data=parsed_data,
                success=True,
                error_message=None,
                model_used=selected_model
            )
        except Exception as e:
            # Deliberate best-effort boundary: report the failure in the result
            logger.error(f"Extraction failed: {str(e)}")
            return StructuredDataResult(
                data=None,
                success=False,
                error_message=str(e),
                model_used=selected_model
            )

natural_pdf/extraction/mixin.py ADDED Viewed

@@ -0,0 +1,246 @@
+import logging
+from typing import TYPE_CHECKING, Any, Type, Optional
+from abc import ABC, abstractmethod
+from pydantic import BaseModel
+# Avoid circular import
+if TYPE_CHECKING:
+    from natural_pdf.extraction.result import StructuredDataResult
+    from natural_pdf.core.page import Page
+    from natural_pdf.elements.base import Element
logger = logging.getLogger(__name__)

DEFAULT_STRUCTURED_KEY = "default-structured"  # Key used in `analyses` when the caller gives none


class ExtractionMixin(ABC):
    """
    Mixin class providing structured data extraction capabilities to elements.

    Assumes the inheriting class has `extract_text(**kwargs)` and `to_image(**kwargs)` methods.
    Results are kept in an `analyses` dict on the instance, keyed by analysis key.
    """

    def _get_extraction_content(self, using: str = 'text', **kwargs) -> Any:
        """
        Retrieves the content (text or image) for extraction.

        Args:
            using: 'text' or 'vision'
            **kwargs: Additional arguments passed to extract_text or to_image.
                Defaults applied here: `layout=True` for text;
                `resolution=72`, `include_highlights=False`, `labels=False`
                for vision.

        Returns:
            str: Extracted text if using='text'
            PIL.Image.Image: Rendered image if using='vision'
            None: If content cannot be retrieved (required method missing,
                unsupported `using`, or any exception from the underlying call;
                the problem is logged)
        """
        for required in ('extract_text', 'to_image'):
            if not callable(getattr(self, required, None)):
                logger.error(f"ExtractionMixin requires '{required}' method on {self!r}")
                return None

        try:
            if using == 'text':
                layout = kwargs.pop('layout', True)
                return self.extract_text(layout=layout, **kwargs)
            elif using == 'vision':
                resolution = kwargs.pop('resolution', 72)
                include_highlights = kwargs.pop('include_highlights', False)
                labels = kwargs.pop('labels', False)
                return self.to_image(
                    resolution=resolution,
                    include_highlights=include_highlights,
                    labels=labels,
                    **kwargs
                )
            else:
                logger.error(f"Unsupported value for 'using': {using}")
                return None
        except Exception as e:
            # Best effort by design: callers treat None as "no content"
            logger.error(f"Error getting {using} content from {self!r}: {e}")
            return None

    def extract(
        self: Any,
        schema: "Type[BaseModel]",
        client: Any,
        analysis_key: str = DEFAULT_STRUCTURED_KEY,
        prompt: Optional[str] = None,
        using: str = 'text',
        model: Optional[str] = None,
        overwrite: bool = False,
        **kwargs
    ) -> Any:
        """
        Extracts structured data according to the provided schema.

        Results are stored in the element's `analyses` dictionary.

        Args:
            schema: Pydantic model class defining the desired structure
            client: Initialized LLM client
            analysis_key: Key to store the result under in `analyses`. Defaults to "default-structured".
            prompt: Optional user-provided prompt for the LLM
            using: Modality ('text' or 'vision')
            model: Optional specific LLM model identifier
            overwrite: If True, allow overwriting an existing result at `analysis_key`.
            **kwargs: Additional parameters for extraction. `layout`,
                `resolution`, `include_highlights` and `labels` only control
                how the content is produced and are not forwarded to the
                manager; every other keyword is forwarded to both the content
                getter and the manager.

        Returns:
            Self for method chaining

        Raises:
            ValueError: If `analysis_key` is empty, or already present in
                `analyses` while `overwrite` is False.
            RuntimeError: If no `get_manager` can be found on self, `self.pdf`
                or `self.page.pdf`, if the manager lookup fails, or if the
                manager reports it is not available.
        """
        if not analysis_key:
            raise ValueError("analysis_key cannot be empty for extract operation")

        # --- Overwrite Check --- #
        if not hasattr(self, 'analyses') or self.analyses is None:
            self.analyses = {}

        if analysis_key in self.analyses and not overwrite:
            raise ValueError(
                f"Analysis key '{analysis_key}' already exists in analyses. "
                f"Use overwrite=True to replace it. Available keys: {list(self.analyses.keys())}"
            )
        # --- End Overwrite Check --- #

        # Determine PDF instance to get manager
        pdf_instance = None
        if hasattr(self, 'get_manager') and callable(self.get_manager):
            # Handle case where self is the PDF instance itself
            pdf_instance = self
            logger.debug(f"Manager access via self ({type(self).__name__})")
        elif hasattr(self, 'pdf') and hasattr(self.pdf, 'get_manager') and callable(self.pdf.get_manager):
            # Handle Page or other elements with direct .pdf reference
            pdf_instance = self.pdf
            logger.debug(f"Manager access via self.pdf ({type(self).__name__})")
        elif hasattr(self, 'page') and hasattr(self.page, 'pdf') and hasattr(self.page.pdf, 'get_manager') and callable(self.page.pdf.get_manager):
            # Handle Region or other elements with .page.pdf reference
            pdf_instance = self.page.pdf
            logger.debug(f"Manager access via self.page.pdf ({type(self).__name__})")
        else:
            logger.error(f"Could not find get_manager on {type(self).__name__}, self.pdf, or self.page.pdf")
            raise RuntimeError(f"Cannot access PDF manager: {type(self).__name__} lacks necessary references")

        try:
            manager = pdf_instance.get_manager('structured_data')
        except Exception as e:
            # Chain the cause so the original failure stays visible in tracebacks
            raise RuntimeError(f"Failed to get StructuredDataManager: {e}") from e

        if not manager or not manager.is_available():
            raise RuntimeError("StructuredDataManager is not available")

        # Get content
        layout_for_text = kwargs.pop('layout', True)
        content = self._get_extraction_content(using=using, layout=layout_for_text, **kwargs)

        # Rendering options were consumed by the content step above. They are not
        # LLM API parameters, so they must not reach the manager's API call.
        # NOTE(review): the remaining kwargs still go to both extract_text/to_image
        # and the LLM call, as before -- confirm whether they should be split further.
        render_only = ('resolution', 'include_highlights', 'labels')
        llm_kwargs = {key: value for key, value in kwargs.items() if key not in render_only}

        if content is None or (using == 'text' and isinstance(content, str) and not content.strip()):
            logger.warning(f"No content available for extraction (using='{using}') on {self!r}")
            # Import here to avoid circularity at module level
            from natural_pdf.extraction.result import StructuredDataResult
            result = StructuredDataResult(
                data=None,
                success=False,
                error_message=f"No content available for extraction (using='{using}')",
                model_used=model  # Record the model requested, even though nothing ran
            )
        else:
            result = manager.extract(
                content=content,
                schema=schema,
                client=client,
                prompt=prompt,
                using=using,
                model=model,
                **llm_kwargs
            )

        # Store the result
        self.analyses[analysis_key] = result
        logger.info(f"Stored extraction result under key '{analysis_key}' (Success: {result.success})")

        return self

    def extracted(self, field_name: Optional[str] = None, analysis_key: Optional[str] = None) -> Any:
        """
        Convenience method to access results from structured data extraction.

        Args:
            field_name: The specific field to retrieve from the extracted data.
                        If None, returns the entire data object.
            analysis_key: The key under which the extraction result was stored in `analyses`.
                          If None, defaults to "default-structured".

        Returns:
            The requested field value, or the entire extracted data object.

        Raises:
            AttributeError: If the element has no `analyses` attribute (or it is
                None), or if `field_name` is not an attribute of the extracted object.
            KeyError: If `analysis_key` is not found in `analyses`, or if the
                extracted data is a dict that lacks `field_name`.
            TypeError: If the stored item is not a StructuredDataResult, or if
                reading `field_name` fails with an unexpected error.
            ValueError: If the stored result indicates a failed extraction or
                carries no data.
        """
        target_key = analysis_key if analysis_key is not None else DEFAULT_STRUCTURED_KEY

        if not hasattr(self, 'analyses') or self.analyses is None:
            raise AttributeError(f"{type(self).__name__} object has no 'analyses' attribute yet.")

        if target_key not in self.analyses:
            available_keys = list(self.analyses.keys())
            raise KeyError(
                f"Extraction '{target_key}' not found in analyses. "
                f"Available extractions: {available_keys}"
            )

        # Import here to avoid circularity and allow type checking
        from natural_pdf.extraction.result import StructuredDataResult
        result: StructuredDataResult = self.analyses[target_key]

        if not isinstance(result, StructuredDataResult):
            logger.warning(f"Item found at key '{target_key}' is not a StructuredDataResult (type: {type(result)}). Cannot process.")
            raise TypeError(f"Expected a StructuredDataResult at key '{target_key}', found {type(result).__name__}")

        if not result.success:
            raise ValueError(
                f"Stored result for '{target_key}' indicates a failed extraction attempt. "
                f"Error: {result.error_message}"
            )

        if result.data is None:
            # This case might occur if success=True but data is somehow None
            raise ValueError(f"Extraction result for '{target_key}' has no data available, despite success flag.")

        if field_name is None:
            # Return the whole data object (Pydantic model instance or dict)
            return result.data

        # Dictionary data: key access
        if isinstance(result.data, dict):
            try:
                return result.data[field_name]
            except KeyError:
                available_keys = list(result.data.keys())
                raise KeyError(
                    f"Field/Key '{field_name}' not found in extracted dictionary "
                    f"for key '{target_key}'. Available keys: {available_keys}"
                )

        # Otherwise assume an object and use attribute access
        try:
            return getattr(result.data, field_name)
        except AttributeError:
            # Collect the available field names for a helpful error message;
            # the pydantic field maps are read from the class, not the instance.
            data_cls = type(result.data)
            available_fields = []
            if hasattr(data_cls, 'model_fields'):  # Pydantic v2
                available_fields = list(data_cls.model_fields.keys())
            elif hasattr(data_cls, '__fields__'):  # Pydantic v1
                available_fields = list(data_cls.__fields__.keys())
            elif hasattr(result.data, '__dict__'):  # Fallback
                available_fields = list(result.data.__dict__.keys())
            raise AttributeError(
                f"Field/Attribute '{field_name}' not found on extracted object of type {type(result.data).__name__} "
                f"for key '{target_key}'. Available fields/attributes: {available_fields}"
            )
        except Exception as e:  # Catch other potential errors during getattr
            raise TypeError(f"Could not access field/attribute '{field_name}' on extracted data for key '{target_key}' (type: {type(result.data).__name__}). Error: {e}") from e

natural_pdf/extraction/result.py ADDED Viewed

@@ -0,0 +1,37 @@
+from typing import Optional, TypeVar, Generic, Any
+from pydantic import BaseModel, Field
+# Generic type for the Pydantic model used in the schema
+T_Schema = TypeVar("T_Schema", bound=BaseModel)
class StructuredDataResult(BaseModel, Generic[T_Schema]):
    """
    Represents the result of a structured data extraction operation.

    Contains the extracted data, success status, and error information.
    Only `success` is required; every other field defaults to None.
    """

    # Parsed payload; expected to be an instance of the schema model on success.
    data: Optional[T_Schema] = Field(
        None,
        description="Validated data model or None on failure"
    )
    # Required flag: must be supplied explicitly by whoever builds the result.
    success: bool = Field(
        ...,
        description="Whether extraction succeeded"
    )
    error_message: Optional[str] = Field(
        None,
        description="Error details if extraction failed"
    )
    raw_output: Optional[Any] = Field(
        None,
        description="Raw output from the language model"
    )
    # NOTE(review): construct with `model_used=...`; a `model=...` keyword is not a
    # declared field and presumably gets ignored under the default extra handling.
    model_used: Optional[str] = Field(
        None,
        description="Identifier of the language model used"
    )

    class Config:
        arbitrary_types_allowed = True
        # `model_used` begins with pydantic v2's protected "model_" prefix; clearing
        # the protected namespaces avoids the field-name conflict warning that is
        # otherwise emitted when this class is created.
        protected_namespaces = ()

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl