natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +230 -151
- natural_pdf/classification/mixin.py +49 -35
- natural_pdf/classification/results.py +64 -46
- natural_pdf/collections/mixins.py +68 -20
- natural_pdf/collections/pdf_collection.py +177 -64
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +633 -190
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +503 -131
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/text.py
CHANGED
@@ -274,7 +274,7 @@ class TextElement(Element):
 
         return False
 
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:
         """String representation of the text element."""
         if self.text:
             preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
natural_pdf/export/mixin.py
ADDED
@@ -0,0 +1,137 @@
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
+
+logger = logging.getLogger(__name__)
+
+
+class ExportMixin:
+    """
+    Mixin for exporting analyses from collections of elements.
+
+    This mixin is designed to be used with PDF, PDFCollection,
+    PageCollection, and ElementCollection classes.
+    """
+
+    def export_analyses(
+        self,
+        output_path: str,
+        analysis_keys: Union[str, List[str]],
+        format: str = "json",
+        include_content: bool = True,
+        include_images: bool = False,
+        image_dir: Optional[str] = None,
+        image_format: str = "jpg",
+        image_resolution: int = 72,
+        overwrite: bool = True,
+        **kwargs,
+    ) -> str:
+        """
+        Export analysis results to a file.
+
+        Args:
+            output_path: Path to save the export file
+            analysis_keys: Key(s) in the analyses dictionary to export
+            format: Export format ('json', 'csv', 'excel')
+            include_content: Whether to include extracted text
+            include_images: Whether to export images of elements
+            image_dir: Directory to save images (created if doesn't exist)
+            image_format: Format to save images ('jpg', 'png')
+            image_resolution: Resolution for exported images
+            overwrite: Whether to overwrite existing files
+            **kwargs: Additional format-specific options
+
+        Returns:
+            Path to the exported file
+        """
+        # Convert single key to list for consistency
+        if isinstance(analysis_keys, str):
+            analysis_keys = [analysis_keys]
+
+        # Create output directory
+        output_path = Path(output_path)
+        os.makedirs(output_path.parent, exist_ok=True)
+
+        # Check if file exists and handle overwrite
+        if output_path.exists() and not overwrite:
+            raise FileExistsError(f"Output file {output_path} already exists and overwrite=False")
+
+        # Prepare image directory if needed
+        if include_images:
+            if image_dir is None:
+                image_dir = output_path.parent / f"{output_path.stem}_images"
+            os.makedirs(image_dir, exist_ok=True)
+            image_dir = Path(image_dir)  # Convert to Path object
+
+        # Gather data from collection
+        data = self._gather_analysis_data(
+            analysis_keys=analysis_keys,
+            include_content=include_content,
+            include_images=include_images,
+            image_dir=image_dir,
+            image_format=image_format,
+            image_resolution=image_resolution,
+        )
+
+        # Export based on format
+        if format.lower() == "json":
+            return self._export_to_json(data, output_path, **kwargs)
+        elif format.lower() == "csv":
+            return self._export_to_csv(data, output_path, **kwargs)
+        elif format.lower() == "excel":
+            return self._export_to_excel(data, output_path, **kwargs)
+        else:
+            raise ValueError(f"Unsupported export format: {format}")
+
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from elements in the collection.
+
+        This method should be implemented by each collection class.
+        """
+        raise NotImplementedError("Subclasses must implement _gather_analysis_data")
+
+    def _export_to_json(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
+        """Export data to JSON format."""
+        with open(output_path, "w") as f:
+            json.dump(data, f, indent=2, **kwargs)
+        logger.info(f"Exported analysis data to {output_path}")
+        return str(output_path)
+
+    def _export_to_csv(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
+        """Export data to CSV format."""
+        try:
+            import pandas as pd
+
+            # Normalize nested data
+            df = pd.json_normalize(data)
+            df.to_csv(output_path, index=False, **kwargs)
+            logger.info(f"Exported analysis data to {output_path}")
+            return str(output_path)
+        except ImportError:
+            raise ImportError("Pandas is required for CSV export. Install with: pip install pandas")
+
+    def _export_to_excel(self, data: List[Dict[str, Any]], output_path: Path, **kwargs) -> str:
+        """Export data to Excel format."""
+        try:
+            import pandas as pd
+
+            # Normalize nested data
+            df = pd.json_normalize(data)
+            df.to_excel(output_path, index=False, **kwargs)
+            logger.info(f"Exported analysis data to {output_path}")
+            return str(output_path)
+        except ImportError:
+            raise ImportError(
+                "Pandas and openpyxl are required for Excel export. Install with: pip install pandas openpyxl"
+            )
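A minimal usage sketch for the new ExportMixin, assuming a collection class that implements _gather_analysis_data (the release wires this mixin into PDF and the collection classes); the file path and analysis key here are hypothetical:

    from natural_pdf import PDF

    pdf = PDF("report.pdf")  # hypothetical input document
    # ... run an analysis that stores its result under a key in .analyses ...

    # Export the stored analysis; "json" needs only the stdlib,
    # "csv" requires pandas, "excel" requires pandas + openpyxl.
    out = pdf.export_analyses(
        output_path="out/analysis.json",
        analysis_keys="my-analysis-key",  # hypothetical key
        format="json",
    )
    print(out)  # path of the written file, as a string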
natural_pdf/exporters/base.py
CHANGED
@@ -1,10 +1,10 @@
 import abc
 import logging
-from typing import
+from typing import TYPE_CHECKING, List, Union
 
 if TYPE_CHECKING:
-    from natural_pdf.core.pdf import PDF
     from natural_pdf.collections.pdf_collection import PDFCollection
+    from natural_pdf.core.pdf import PDF
 
 logger = logging.getLogger(__name__)
 
@@ -40,8 +40,8 @@ class FinetuneExporter(abc.ABC):
         """
         Helper to consistently resolve the input source to a list of PDF objects.
         """
-        from natural_pdf.core.pdf import PDF  # Avoid circular import at module level
         from natural_pdf.collections.pdf_collection import PDFCollection  # Avoid circular import
+        from natural_pdf.core.pdf import PDF  # Avoid circular import at module level
 
         pdfs_to_process: List["PDF"] = []
         if isinstance(source, PDF):
natural_pdf/exporters/paddleocr.py
CHANGED
@@ -1,8 +1,9 @@
-import os
 import logging
+import os
 import random
 import shutil
-from typing import
+from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
+
 from tqdm import tqdm
 
 from natural_pdf.exporters.base import FinetuneExporter
@@ -11,8 +12,8 @@ from natural_pdf.exporters.base import FinetuneExporter
 from natural_pdf.utils.identifiers import generate_short_path_hash
 
 if TYPE_CHECKING:
-    from natural_pdf.core.pdf import PDF
     from natural_pdf.collections.pdf_collection import PDFCollection
+    from natural_pdf.core.pdf import PDF
     from natural_pdf.elements.text import TextElement
 
 logger = logging.getLogger(__name__)
natural_pdf/extraction/manager.py
CHANGED
@@ -1,9 +1,10 @@
-import logging
-from typing import Any, Type, Optional
-from pydantic import BaseModel
-import io
 import base64
+import io
+import logging
+from typing import Any, Optional, Type
+
 from PIL import Image
+from pydantic import BaseModel
 
 from natural_pdf.extraction.result import StructuredDataResult
 
@@ -29,47 +30,52 @@ class StructuredDataManager:
         """Checks if necessary dependencies are available."""
         try:
             import pydantic
+
            return True
        except ImportError:
            logger.warning("Pydantic is required for structured data extraction.")
            return False

    def _prepare_llm_messages(
-        self,
-        content: Any,
-        prompt: Optional[str],
-        using: str,
-        schema: Type[BaseModel]
+        self, content: Any, prompt: Optional[str], using: str, schema: Type[BaseModel]
    ) -> list:
        """Prepares the message list for the LLM API call."""
-        system_prompt =
-
-
-
-
-
-
+        system_prompt = (
+            prompt
+            or f"Extract the information corresponding to the fields in the {schema.__name__} schema. Respond only with the structured data."
+        )
+
+        messages = [{"role": "system", "content": system_prompt}]
+
+        if using == "text":
            messages.append({"role": "user", "content": str(content)})
-        elif using ==
+        elif using == "vision":
            if isinstance(content, Image.Image):
                buffered = io.BytesIO()
                content.save(buffered, format="PNG")
                base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
-                messages.append(
-
-
-
-
-
-
-
-
+                messages.append(
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "Extract information from this image based on the schema.",
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/png;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                )
            else:
-                raise TypeError(
+                raise TypeError(
+                    f"Content must be a PIL Image for using='vision', got {type(content)}"
+                )
        else:
-
-
+            raise ValueError(f"Unsupported value for 'using': {using}")
+
        return messages

    def extract(
@@ -78,9 +84,9 @@ class StructuredDataManager:
        schema: Type[BaseModel],
        client: Any,
        prompt: Optional[str] = None,
-        using: str =
+        using: str = "text",
        model: Optional[str] = None,
-        **kwargs
+        **kwargs,
    ) -> StructuredDataResult:
        """
        Extract structured data from content using an LLM.
@@ -99,36 +105,31 @@ class StructuredDataManager:
        """
        logger.debug(f"Extract request: using='{using}', schema='{schema.__name__}'")

-        if isinstance(content, list) and using ==
+        if isinstance(content, list) and using == "vision":
            if len(content) == 1:
                content = content[0]
            elif len(content) > 1:
                logger.error("Vision extraction not supported for multi-page PDFs")
-                raise NotImplementedError(
-
-
+                raise NotImplementedError(
+                    "Batch image extraction on multi-page PDF objects is not supported. Apply to individual pages or regions instead."
+                )
+
+        selected_model = model or (
+            self.DEFAULT_VISION_MODEL if using == "vision" else self.DEFAULT_TEXT_MODEL
+        )
        messages = self._prepare_llm_messages(content, prompt, using, schema)

        try:
            logger.debug(f"Extracting with model '{selected_model}'")
            completion = client.beta.chat.completions.parse(
-                model=selected_model,
-                messages=messages,
-                response_format=schema,
-                **kwargs
+                model=selected_model, messages=messages, response_format=schema, **kwargs
            )
            parsed_data = completion.choices[0].message.parsed
            return StructuredDataResult(
-                data=parsed_data,
-                success=True,
-                error_message=None,
-                model=selected_model
+                data=parsed_data, success=True, error_message=None, model=selected_model
            )
        except Exception as e:
            logger.error(f"Extraction failed: {str(e)}")
            return StructuredDataResult(
-                data=None,
-                success=False,
-                error_message=str(e),
-                model=selected_model
-            )
+                data=None, success=False, error_message=str(e), model=selected_model
+            )
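A sketch of how StructuredDataManager is driven, assuming an OpenAI-style client (the code calls client.beta.chat.completions.parse); the Invoice schema and file name are hypothetical:

    from natural_pdf import PDF
    from openai import OpenAI
    from pydantic import BaseModel

    class Invoice(BaseModel):  # hypothetical schema
        vendor: str
        total: float

    pdf = PDF("invoice.pdf")  # hypothetical input
    manager = pdf.get_manager("structured_data")

    result = manager.extract(
        content=pdf.pages[0].extract_text(),
        schema=Invoice,
        client=OpenAI(),
        using="text",  # "vision" expects a single PIL image instead
    )
    if result.success:
        print(result.data.vendor, result.data.total)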
natural_pdf/extraction/mixin.py
CHANGED
@@ -1,17 +1,19 @@
 import logging
-from typing import TYPE_CHECKING, Any, Type, Optional
 from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, Optional, Type
+
 from pydantic import BaseModel
 
 # Avoid circular import
 if TYPE_CHECKING:
-    from natural_pdf.extraction.result import StructuredDataResult
     from natural_pdf.core.page import Page
     from natural_pdf.elements.base import Element
+    from natural_pdf.extraction.result import StructuredDataResult
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_STRUCTURED_KEY = "
+DEFAULT_STRUCTURED_KEY = "structured"  # Define default key
+
 
 class ExtractionMixin(ABC):
     """
@@ -19,7 +21,7 @@ class ExtractionMixin(ABC):
     Assumes the inheriting class has `extract_text(**kwargs)` and `to_image(**kwargs)` methods.
     """
 
-    def _get_extraction_content(self, using: str =
+    def _get_extraction_content(self, using: str = "text", **kwargs) -> Any:
        """
        Retrieves the content (text or image) for extraction.

@@ -32,26 +34,26 @@ class ExtractionMixin(ABC):
            PIL.Image.Image: Rendered image if using='vision'
            None: If content cannot be retrieved
        """
-        if not hasattr(self,
-
-
-        if not hasattr(self,
-
-
-
+        if not hasattr(self, "extract_text") or not callable(self.extract_text):
+            logger.error(f"ExtractionMixin requires 'extract_text' method on {self!r}")
+            return None
+        if not hasattr(self, "to_image") or not callable(self.to_image):
+            logger.error(f"ExtractionMixin requires 'to_image' method on {self!r}")
+            return None
+
        try:
-            if using ==
-                layout = kwargs.pop(
+            if using == "text":
+                layout = kwargs.pop("layout", True)
                return self.extract_text(layout=layout, **kwargs)
-            elif using ==
-                resolution = kwargs.pop(
-                include_highlights = kwargs.pop(
-                labels = kwargs.pop(
+            elif using == "vision":
+                resolution = kwargs.pop("resolution", 72)
+                include_highlights = kwargs.pop("include_highlights", False)
+                labels = kwargs.pop("labels", False)
                return self.to_image(
-                    resolution=resolution,
-                    include_highlights=include_highlights,
-                    labels=labels,
-                    **kwargs
+                    resolution=resolution,
+                    include_highlights=include_highlights,
+                    labels=labels,
+                    **kwargs,
                )
            else:
                logger.error(f"Unsupported value for 'using': {using}")
@@ -64,12 +66,12 @@ class ExtractionMixin(ABC):
        self: Any,
        schema: Type[BaseModel],
        client: Any,
-        analysis_key: str = DEFAULT_STRUCTURED_KEY,
+        analysis_key: str = DEFAULT_STRUCTURED_KEY,  # Default key
        prompt: Optional[str] = None,
-        using: str =
+        using: str = "text",
        model: Optional[str] = None,
-        overwrite: bool = False,
-        **kwargs
+        overwrite: bool = False,  # Add overwrite parameter
+        **kwargs,
    ) -> Any:
        """
        Extracts structured data according to the provided schema.
@@ -91,39 +93,52 @@ class ExtractionMixin(ABC):
        """
        if not analysis_key:
            raise ValueError("analysis_key cannot be empty for extract operation")
-
+
        # --- Overwrite Check --- #
-        if not hasattr(self,
+        if not hasattr(self, "analyses") or self.analyses is None:
            self.analyses = {}
-
+
        if analysis_key in self.analyses and not overwrite:
            raise ValueError(
                f"Analysis key '{analysis_key}' already exists in analyses. "
                f"Use overwrite=True to replace it. Available keys: {list(self.analyses.keys())}"
            )
        # --- End Overwrite Check --- #
-
+
        # Determine PDF instance to get manager
        pdf_instance = None
-
-        if hasattr(self,
+
+        if hasattr(self, "get_manager") and callable(self.get_manager):
            # Handle case where self is the PDF instance itself
            pdf_instance = self
            logger.debug(f"Manager access via self ({type(self).__name__})")
-        elif
+        elif (
+            hasattr(self, "pdf")
+            and hasattr(self.pdf, "get_manager")
+            and callable(self.pdf.get_manager)
+        ):
            # Handle Page or other elements with direct .pdf reference
            pdf_instance = self.pdf
            logger.debug(f"Manager access via self.pdf ({type(self).__name__})")
-        elif
+        elif (
+            hasattr(self, "page")
+            and hasattr(self.page, "pdf")
+            and hasattr(self.page.pdf, "get_manager")
+            and callable(self.page.pdf.get_manager)
+        ):
            # Handle Region or other elements with .page.pdf reference
            pdf_instance = self.page.pdf
            logger.debug(f"Manager access via self.page.pdf ({type(self).__name__})")
        else:
-            logger.error(
-
-
+            logger.error(
+                f"Could not find get_manager on {type(self).__name__}, self.pdf, or self.page.pdf"
+            )
+            raise RuntimeError(
+                f"Cannot access PDF manager: {type(self).__name__} lacks necessary references"
+            )
+
        try:
-            manager = pdf_instance.get_manager(
+            manager = pdf_instance.get_manager("structured_data")
        except Exception as e:
            raise RuntimeError(f"Failed to get StructuredDataManager: {e}")
 
@@ -131,18 +146,23 @@ class ExtractionMixin(ABC):
            raise RuntimeError("StructuredDataManager is not available")

        # Get content
-        layout_for_text = kwargs.pop(
-        content = self._get_extraction_content(
+        layout_for_text = kwargs.pop("layout", True)
+        content = self._get_extraction_content(
+            using=using, layout=layout_for_text, **kwargs
+        )  # Pass kwargs

-        if content is None or (
+        if content is None or (
+            using == "text" and isinstance(content, str) and not content.strip()
+        ):
            logger.warning(f"No content available for extraction (using='{using}') on {self!r}")
            # Import here to avoid circularity at module level
-            from natural_pdf.extraction.result import StructuredDataResult
+            from natural_pdf.extraction.result import StructuredDataResult
+
            result = StructuredDataResult(
                data=None,
                success=False,
                error_message=f"No content available for extraction (using='{using}')",
-                model=model
+                model=model,  # Use model requested, even if failed
            )
        else:
            result = manager.extract(
@@ -152,16 +172,20 @@ class ExtractionMixin(ABC):
                prompt=prompt,
                using=using,
                model=model,
-                **kwargs
+                **kwargs,
            )

        # Store the result
        self.analyses[analysis_key] = result
-        logger.info(
+        logger.info(
+            f"Stored extraction result under key '{analysis_key}' (Success: {result.success})"
+        )

        return self

-    def extracted(
+    def extracted(
+        self, field_name: Optional[str] = None, analysis_key: Optional[str] = None
+    ) -> Any:
        """
        Convenience method to access results from structured data extraction.

@@ -182,7 +206,7 @@ class ExtractionMixin(ABC):
        """
        target_key = analysis_key if analysis_key is not None else DEFAULT_STRUCTURED_KEY

-        if not hasattr(self,
+        if not hasattr(self, "analyses") or self.analyses is None:
            raise AttributeError(f"{type(self).__name__} object has no 'analyses' attribute yet.")

        if target_key not in self.analyses:
@@ -194,21 +218,28 @@ class ExtractionMixin(ABC):

        # Import here to avoid circularity and allow type checking
        from natural_pdf.extraction.result import StructuredDataResult
+
        result: StructuredDataResult = self.analyses[target_key]

        if not isinstance(result, StructuredDataResult):
-            logger.warning(
-
+            logger.warning(
+                f"Item found at key '{target_key}' is not a StructuredDataResult (type: {type(result)}). Cannot process."
+            )
+            raise TypeError(
+                f"Expected a StructuredDataResult at key '{target_key}', found {type(result).__name__}"
+            )

        if not result.success:
            raise ValueError(
                f"Stored result for '{target_key}' indicates a failed extraction attempt. "
                f"Error: {result.error_message}"
            )
-
+
        if result.data is None:
-
-
+            # This case might occur if success=True but data is somehow None
+            raise ValueError(
+                f"Extraction result for '{target_key}' has no data available, despite success flag."
+            )

        if field_name is None:
            # Return the whole data object (Pydantic model instance or dict)
@@ -231,16 +262,18 @@ class ExtractionMixin(ABC):
        except AttributeError:
            # Try to get available fields from the object
            available_fields = []
-            if hasattr(result.data,
+            if hasattr(result.data, "model_fields"):  # Pydantic v2
                available_fields = list(result.data.model_fields.keys())
-            elif hasattr(result.data,
+            elif hasattr(result.data, "__fields__"):  # Pydantic v1
                available_fields = list(result.data.__fields__.keys())
-            elif hasattr(result.data,
+            elif hasattr(result.data, "__dict__"):  # Fallback
                available_fields = list(result.data.__dict__.keys())
-
+
            raise AttributeError(
                f"Field/Attribute '{field_name}' not found on extracted object of type {type(result.data).__name__} "
                f"for key '{target_key}'. Available fields/attributes: {available_fields}"
            )
-        except Exception as e:
-
+        except Exception as e:  # Catch other potential errors during getattr
+            raise TypeError(
+                f"Could not access field/attribute '{field_name}' on extracted data for key '{target_key}' (type: {type(result.data).__name__}). Error: {e}"
+            ) from e
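Taken together with the manager above, this mixin gives pages, regions, and whole PDFs an extract-then-read workflow. A minimal sketch, again with a hypothetical schema and input file:

    from natural_pdf import PDF
    from openai import OpenAI
    from pydantic import BaseModel

    class Summary(BaseModel):  # hypothetical schema
        title: str
        date: str

    page = PDF("memo.pdf").pages[0]  # hypothetical input

    # Runs the LLM extraction and stores a StructuredDataResult under
    # analyses["structured"] (the new DEFAULT_STRUCTURED_KEY); pass
    # overwrite=True to replace an existing result at the same key.
    page.extract(schema=Summary, client=OpenAI(), using="text")

    whole = page.extracted()         # the full Summary instance
    title = page.extracted("title")  # one field; raises if missing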
natural_pdf/extraction/result.py
CHANGED
@@ -1,4 +1,5 @@
-from typing import
+from typing import Any, Generic, Optional, TypeVar
+
 from pydantic import BaseModel, Field
 
 # Generic type for the Pydantic model used in the schema
@@ -8,30 +9,15 @@ T_Schema = TypeVar("T_Schema", bound=BaseModel)
 class StructuredDataResult(BaseModel, Generic[T_Schema]):
     """
     Represents the result of a structured data extraction operation.
-
+
     Contains the extracted data, success status, and error information.
     """
 
-    data: Optional[T_Schema] = Field(
-        None,
-        description="Validated data model or None on failure"
-    )
-    success: bool = Field(
-        ...,
-        description="Whether extraction succeeded"
-    )
-    error_message: Optional[str] = Field(
-        None,
-        description="Error details if extraction failed"
-    )
-    raw_output: Optional[Any] = Field(
-        None,
-        description="Raw output from the language model"
-    )
-    model_used: Optional[str] = Field(
-        None,
-        description="Identifier of the language model used"
-    )
+    data: Optional[T_Schema] = Field(None, description="Validated data model or None on failure")
+    success: bool = Field(..., description="Whether extraction succeeded")
+    error_message: Optional[str] = Field(None, description="Error details if extraction failed")
+    raw_output: Optional[Any] = Field(None, description="Raw output from the language model")
+    model_used: Optional[str] = Field(None, description="Identifier of the language model used")
 
     class Config:
-        arbitrary_types_allowed = True
+        arbitrary_types_allowed = True
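For reference, a sketch of the flattened result model in isolation; the Person schema and values are hypothetical, and raw_output/model_used simply default to None:

    from pydantic import BaseModel

    from natural_pdf.extraction.result import StructuredDataResult

    class Person(BaseModel):  # hypothetical schema
        name: str

    # Success and failure cases; only data/success/error_message are set here.
    ok = StructuredDataResult(data=Person(name="Ada"), success=True)
    failed = StructuredDataResult(data=None, success=False, error_message="LLM call failed")

    print(ok.data.name)          # Ada
    print(failed.error_message)  # LLM call failed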