PyPI - natural-pdf - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (141) hide show

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +209 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +288 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +413 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +512 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +604 -0
docs/tutorials/12-ocr-integration.md +175 -0
docs/tutorials/13-semantic-search.ipynb +1328 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +50 -33
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/gemini.py +264 -0
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +125 -58
natural_pdf/analyzers/layout/layout_options.py +43 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +89 -45
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +146 -97
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +419 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +1044 -521
natural_pdf/core/pdf.py +516 -313
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +307 -225
natural_pdf/elements/collections.py +805 -543
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +889 -879
natural_pdf/elements/text.py +127 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +57 -35
natural_pdf/ocr/engine.py +150 -46
natural_pdf/ocr/engine_easyocr.py +146 -150
natural_pdf/ocr/engine_paddle.py +118 -175
natural_pdf/ocr/engine_surya.py +78 -141
natural_pdf/ocr/ocr_factory.py +114 -0
natural_pdf/ocr/ocr_manager.py +122 -124
natural_pdf/ocr/ocr_options.py +16 -20
natural_pdf/ocr/utils.py +98 -0
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/templates/spa/css/style.css +334 -0
natural_pdf/templates/spa/index.html +31 -0
natural_pdf/templates/spa/js/app.js +472 -0
natural_pdf/templates/spa/words.txt +235976 -0
natural_pdf/utils/debug.py +32 -0
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/identifiers.py +29 -0
natural_pdf/utils/packaging.py +418 -0
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
natural_pdf-0.1.6.dist-info/RECORD +141 -0
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
natural_pdf/templates/ocr_debug.html +0 -517
natural_pdf-0.1.4.dist-info/RECORD +0 -61
natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0

natural_pdf/analyzers/layout/docling.py CHANGED Viewed

@@ -1,24 +1,38 @@
 # layout_detector_docling.py
-import logging
 import importlib.util
+import logging
 import os
 import tempfile
-from typing import List, Dict, Any, Optional
+from typing import Any, Dict, List, Optional
 from PIL import Image
 # Assuming base class and options are importable
 try:
     from .base import LayoutDetector
-    from .layout_options import DoclingLayoutOptions, BaseLayoutOptions
+    from .layout_options import BaseLayoutOptions, DoclingLayoutOptions
 except ImportError:
     # Placeholders if run standalone or imports fail
-    class BaseLayoutOptions: pass
-    class DoclingLayoutOptions(BaseLayoutOptions): pass
+    class BaseLayoutOptions:
+        pass
+    class DoclingLayoutOptions(BaseLayoutOptions):
+        pass
     class LayoutDetector:
-         def __init__(self): self.logger=logging.getLogger(); self.supported_classes=set()
-         def _get_model(self, options): raise NotImplementedError
-         def _normalize_class_name(self, n): return n
-         def validate_classes(self, c): pass
+        def __init__(self):
+            self.logger = logging.getLogger()
+            self.supported_classes = set()
+        def _get_model(self, options):
+            raise NotImplementedError
+        def _normalize_class_name(self, n):
+            return n
+        def validate_classes(self, c):
+            pass
     logging.basicConfig()
 logger = logging.getLogger(__name__)
@@ -42,11 +56,27 @@ class DoclingLayoutDetector(LayoutDetector):
         super().__init__()
         # Docling classes are dynamic/hierarchical, define common ones
         self.supported_classes = {
-            'Header', 'Footer', 'Paragraph', 'Heading', 'List', 'ListItem',
-            'Table', 'Figure', 'Caption', 'Footnote', 'PageNumber', 'Equation',
-            'Code', 'Title', 'Author', 'Abstract', 'Section', 'Unknown', 'Metadata' # Add more as needed
+            "Header",
+            "Footer",
+            "Paragraph",
+            "Heading",
+            "List",
+            "ListItem",
+            "Table",
+            "Figure",
+            "Caption",
+            "Footnote",
+            "PageNumber",
+            "Equation",
+            "Code",
+            "Title",
+            "Author",
+            "Abstract",
+            "Section",
+            "Unknown",
+            "Metadata",  # Add more as needed
         }
-        self._docling_document_cache = {} # Cache the output doc per image/options if needed
+        self._docling_document_cache = {}  # Cache the output doc per image/options if needed
     def is_available(self) -> bool:
         """Check if docling is installed."""
@@ -55,9 +85,9 @@ class DoclingLayoutDetector(LayoutDetector):
     def _get_cache_key(self, options: BaseLayoutOptions) -> str:
         """Generate cache key based on device and potentially converter args."""
         if not isinstance(options, DoclingLayoutOptions):
-             options = DoclingLayoutOptions(device=options.device, extra_args=options.extra_args)
+            options = DoclingLayoutOptions(device=options.device, extra_args=options.extra_args)
-        device_key = str(options.device).lower() if options.device else 'default_device'
+        device_key = str(options.device).lower() if options.device else "default_device"
         # Include hash of extra_args if they affect model loading/converter init
         extra_args_key = hash(frozenset(options.extra_args.items()))
         return f"{self.__class__.__name__}_{device_key}_{extra_args_key}"
@@ -88,12 +118,17 @@ class DoclingLayoutDetector(LayoutDetector):
             raise RuntimeError("Docling dependency not installed.")
         if not isinstance(options, DoclingLayoutOptions):
-             self.logger.warning("Received BaseLayoutOptions, expected DoclingLayoutOptions. Using defaults.")
-             options = DoclingLayoutOptions(
-                 confidence=options.confidence, classes=options.classes,
-                 exclude_classes=options.exclude_classes, device=options.device,
-                 extra_args=options.extra_args, verbose=options.extra_args.get('verbose', False)
-             )
+            self.logger.warning(
+                "Received BaseLayoutOptions, expected DoclingLayoutOptions. Using defaults."
+            )
+            options = DoclingLayoutOptions(
+                confidence=options.confidence,
+                classes=options.classes,
+                exclude_classes=options.exclude_classes,
+                device=options.device,
+                extra_args=options.extra_args,
+                verbose=options.extra_args.get("verbose", False),
+            )
         # Validate classes before proceeding (note: Docling classes are case-sensitive)
         # self.validate_classes(options.classes or []) # Validation might be tricky due to case sensitivity
@@ -105,18 +140,20 @@ class DoclingLayoutDetector(LayoutDetector):
         # Docling convert method requires an image path. Save temp file.
         detections = []
-        docling_doc = None # To store the result
+        docling_doc = None  # To store the result
         with tempfile.TemporaryDirectory() as temp_dir:
             temp_image_path = os.path.join(temp_dir, f"docling_input_{os.getpid()}.png")
             try:
-                self.logger.debug(f"Saving temporary image for Docling detector to: {temp_image_path}")
-                image.convert("RGB").save(temp_image_path) # Ensure RGB
+                self.logger.debug(
+                    f"Saving temporary image for Docling detector to: {temp_image_path}"
+                )
+                image.convert("RGB").save(temp_image_path)  # Ensure RGB
                 # Convert the document using Docling's DocumentConverter
                 self.logger.debug("Running Docling conversion...")
                 # Docling convert returns a Result object with a 'document' attribute
                 result = converter.convert(temp_image_path)
-                docling_doc = result.document # Store the DoclingDocument
+                docling_doc = result.document  # Store the DoclingDocument
                 self.logger.info(f"Docling conversion complete.")
                 # Convert Docling document to our detection format
@@ -124,12 +161,14 @@ class DoclingLayoutDetector(LayoutDetector):
             except Exception as e:
                 self.logger.error(f"Error during Docling detection: {e}", exc_info=True)
-                raise # Re-raise the exception
+                raise  # Re-raise the exception
             finally:
-                 # Ensure temp file is removed
-                 if os.path.exists(temp_image_path):
-                      try: os.remove(temp_image_path)
-                      except OSError as e_rm: self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
+                # Ensure temp file is removed
+                if os.path.exists(temp_image_path):
+                    try:
+                        os.remove(temp_image_path)
+                    except OSError as e_rm:
+                        self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
         # Cache the docling document if needed elsewhere (maybe associate with page?)
         # self._docling_document_cache[image_hash] = docling_doc # Needs a way to key this
@@ -137,26 +176,37 @@ class DoclingLayoutDetector(LayoutDetector):
         self.logger.info(f"Docling detected {len(detections)} layout elements matching criteria.")
         return detections
-    def _convert_docling_to_detections(self, doc, options: DoclingLayoutOptions) -> List[Dict[str, Any]]:
+    def _convert_docling_to_detections(
+        self, doc, options: DoclingLayoutOptions
+    ) -> List[Dict[str, Any]]:
         """Convert a Docling document to our standard detection format."""
-        if not doc or not hasattr(doc, 'pages') or not doc.pages:
+        if not doc or not hasattr(doc, "pages") or not doc.pages:
             self.logger.warning("Invalid or empty Docling document for conversion.")
             return []
         detections = []
-        id_to_detection_index = {} # Map Docling ID to index in detections list
+        id_to_detection_index = {}  # Map Docling ID to index in detections list
         # Prepare normalized class filters once
-        normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
-        normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
+        normalized_classes_req = (
+            {self._normalize_class_name(c) for c in options.classes} if options.classes else None
+        )
+        normalized_classes_excl = (
+            {self._normalize_class_name(c) for c in options.exclude_classes}
+            if options.exclude_classes
+            else set()
+        )
         # --- Iterate through elements using Docling's structure ---
         # This requires traversing the hierarchy (e.g., doc.body.children)
         # or iterating through specific lists like doc.texts, doc.tables etc.
         elements_to_process = []
-        if hasattr(doc, 'texts'): elements_to_process.extend(doc.texts)
-        if hasattr(doc, 'tables'): elements_to_process.extend(doc.tables)
-        if hasattr(doc, 'pictures'): elements_to_process.extend(doc.pictures)
+        if hasattr(doc, "texts"):
+            elements_to_process.extend(doc.texts)
+        if hasattr(doc, "tables"):
+            elements_to_process.extend(doc.tables)
+        if hasattr(doc, "pictures"):
+            elements_to_process.extend(doc.pictures)
         # Add other element types from DoclingDocument as needed
         self.logger.debug(f"Converting {len(elements_to_process)} Docling elements...")
@@ -164,16 +214,19 @@ class DoclingLayoutDetector(LayoutDetector):
         for elem in elements_to_process:
             try:
                 # Get Provenance (bbox and page number)
-                if not hasattr(elem, 'prov') or not elem.prov: continue
-                prov = elem.prov[0] # Use first provenance
-                if not hasattr(prov, 'bbox') or not prov.bbox: continue
+                if not hasattr(elem, "prov") or not elem.prov:
+                    continue
+                prov = elem.prov[0]  # Use first provenance
+                if not hasattr(prov, "bbox") or not prov.bbox:
+                    continue
                 bbox = prov.bbox
                 page_no = prov.page_no
                 # Get Page Dimensions (crucial for coordinate conversion)
-                if not hasattr(doc.pages.get(page_no), 'size'): continue
+                if not hasattr(doc.pages.get(page_no), "size"):
+                    continue
                 page_height = doc.pages[page_no].size.height
-                page_width = doc.pages[page_no].size.width # Needed? Bbox seems absolute
+                page_width = doc.pages[page_no].size.width  # Needed? Bbox seems absolute
                 # Convert coordinates from Docling's system (often bottom-left origin)
                 # to standard top-left origin (0,0 at top-left)
@@ -182,46 +235,51 @@ class DoclingLayoutDetector(LayoutDetector):
                 x1 = float(bbox.r)
                 # Convert y: top_y = page_height - bottom_left_t
                 #            bottom_y = page_height - bottom_left_b
-                y0 = float(page_height - bbox.t) # Top y
-                y1 = float(page_height - bbox.b) # Bottom y
+                y0 = float(page_height - bbox.t)  # Top y
+                y1 = float(page_height - bbox.b)  # Bottom y
                 # Ensure y0 < y1
-                if y0 > y1: y0, y1 = y1, y0
+                if y0 > y1:
+                    y0, y1 = y1, y0
                 # Ensure x0 < x1
-                if x0 > x1: x0, x1 = x1, x0
+                if x0 > x1:
+                    x0, x1 = x1, x0
                 # Get Class Label
-                label_orig = str(getattr(elem, 'label', 'Unknown')) # Default if no label
+                label_orig = str(getattr(elem, "label", "Unknown"))  # Default if no label
                 normalized_label = self._normalize_class_name(label_orig)
                 # Apply Class Filtering
-                if normalized_classes_req and normalized_label not in normalized_classes_req: continue
-                if normalized_label in normalized_classes_excl: continue
+                if normalized_classes_req and normalized_label not in normalized_classes_req:
+                    continue
+                if normalized_label in normalized_classes_excl:
+                    continue
                 # Get Confidence (Docling often doesn't provide per-element confidence)
-                confidence = getattr(elem, 'confidence', 0.95) # Assign default confidence
-                if confidence < options.confidence: continue # Apply confidence threshold
+                confidence = getattr(elem, "confidence", 0.95)  # Assign default confidence
+                if confidence < options.confidence:
+                    continue  # Apply confidence threshold
                 # Get Text Content
-                text_content = getattr(elem, 'text', None)
+                text_content = getattr(elem, "text", None)
                 # Get IDs for hierarchy
-                docling_id = getattr(elem, 'self_ref', None)
-                parent_id_obj = getattr(elem, 'parent', None)
-                parent_id = getattr(parent_id_obj, 'self_ref', None) if parent_id_obj else None
+                docling_id = getattr(elem, "self_ref", None)
+                parent_id_obj = getattr(elem, "parent", None)
+                parent_id = getattr(parent_id_obj, "self_ref", None) if parent_id_obj else None
                 # Create Detection Dictionary
                 detection = {
-                    'bbox': (x0, y0, x1, y1),
-                    'class': label_orig,
-                    'normalized_class': normalized_label,
-                    'confidence': confidence,
-                    'text': text_content,
-                    'docling_id': docling_id,
-                    'parent_id': parent_id,
-                    'page_number': page_no, # Add page number if useful
-                    'source': 'layout',
-                    'model': 'docling'
+                    "bbox": (x0, y0, x1, y1),
+                    "class": label_orig,
+                    "normalized_class": normalized_label,
+                    "confidence": confidence,
+                    "text": text_content,
+                    "docling_id": docling_id,
+                    "parent_id": parent_id,
+                    "page_number": page_no,  # Add page number if useful
+                    "source": "layout",
+                    "model": "docling",
                 }
                 detections.append(detection)
@@ -229,8 +287,8 @@ class DoclingLayoutDetector(LayoutDetector):
                 # if docling_id: id_to_detection_index[docling_id] = len(detections) - 1
             except Exception as conv_e:
-                 self.logger.warning(f"Could not convert Docling element: {elem}. Error: {conv_e}")
-                 continue
+                self.logger.warning(f"Could not convert Docling element: {elem}. Error: {conv_e}")
+                continue
         return detections
@@ -241,7 +299,8 @@ class DoclingLayoutDetector(LayoutDetector):
         """
         # This requires caching the doc based on image/options or re-running.
         # For simplicity, let's just re-run detect if needed.
-        self.logger.warning("get_docling_document: Re-running detection to ensure document is generated.")
-        self.detect(image, options) # Run detect to populate internal doc
-        return getattr(self, '_docling_document', None) # Return the stored doc
+        self.logger.warning(
+            "get_docling_document: Re-running detection to ensure document is generated."
+        )
+        self.detect(image, options)  # Run detect to populate internal doc
+        return getattr(self, "_docling_document", None)  # Return the stored doc

natural_pdf/analyzers/layout/gemini.py ADDED Viewed

@@ -0,0 +1,264 @@
+# layout_detector_gemini.py
+import importlib.util
+import logging
+import os
+from typing import Any, Dict, List, Optional
+import base64
+import io
+from pydantic import BaseModel, Field
+from PIL import Image
+# Use OpenAI library for interaction
+try:
+    from openai import OpenAI
+    from openai.types.chat import ChatCompletion
+    # Import OpenAIError for exception handling if needed
+except ImportError:
+    OpenAI = None
+    ChatCompletion = None
+try:
+    from .base import LayoutDetector
+    from .layout_options import BaseLayoutOptions, GeminiLayoutOptions
+except ImportError:
+    # Placeholders if run standalone or imports fail
+    class BaseLayoutOptions:
+        pass
+    class GeminiLayoutOptions(BaseLayoutOptions):
+        pass
+    class LayoutDetector:
+        def __init__(self):
+            self.logger = logging.getLogger()
+            self.supported_classes = set() # Will be dynamic based on user request
+        def _get_model(self, options):
+            raise NotImplementedError
+        def _normalize_class_name(self, n):
+            return n.lower().replace("_", "-").replace(" ", "-")
+        def validate_classes(self, c):
+            pass # Less strict validation needed for LLM
+    logging.basicConfig()
+logger = logging.getLogger(__name__)
+# Define Pydantic model for the expected output structure
+# This is used by the openai library's `response_format`
+class DetectedRegion(BaseModel):
+    label: str = Field(description="The identified class name.")
+    bbox: List[float] = Field(description="Bounding box coordinates [xmin, ymin, xmax, ymax].", min_items=4, max_items=4)
+    confidence: float = Field(description="Confidence score [0.0, 1.0].", ge=0.0, le=1.0)
+class GeminiLayoutDetector(LayoutDetector):
+    """Document layout detector using Google's Gemini models via OpenAI compatibility layer."""
+    # Base URL for the Gemini OpenAI-compatible endpoint
+    GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
+    def __init__(self):
+        super().__init__()
+        self.supported_classes = set() # Indicate dynamic nature
+    def is_available(self) -> bool:
+        """Check if openai library is installed and GOOGLE_API_KEY is available."""
+        api_key = os.environ.get("GOOGLE_API_KEY")
+        if not api_key:
+            logger.warning("GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.")
+            return False
+        if OpenAI is None:
+             logger.warning("openai package not found. Gemini detector (via OpenAI lib) will not be available.")
+             return False
+        return True
+    def _get_cache_key(self, options: GeminiLayoutOptions) -> str:
+        """Generate cache key based on model name."""
+        if not isinstance(options, GeminiLayoutOptions):
+            options = GeminiLayoutOptions() # Use defaults
+        model_key = options.model_name
+        # Prompt is built dynamically, so not part of cache key based on options
+        return f"{self.__class__.__name__}_{model_key}"
+    def _load_model_from_options(self, options: GeminiLayoutOptions) -> Any:
+        """Validate options and return the model name."""
+        if not self.is_available():
+            raise RuntimeError(
+                "OpenAI library not installed or GOOGLE_API_KEY not set. Please run: pip install openai"
+            )
+        if not isinstance(options, GeminiLayoutOptions):
+            raise TypeError("Incorrect options type provided for Gemini model loading.")
+        # Simply return the model name, client is created in detect()
+        return options.model_name
+    def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
+        """Detect layout elements in an image using Gemini via OpenAI library."""
+        if not self.is_available():
+            raise RuntimeError(
+                "OpenAI library not installed or GOOGLE_API_KEY not set."
+            )
+        # Ensure options are the correct type
+        if not isinstance(options, GeminiLayoutOptions):
+            self.logger.warning(
+                "Received BaseLayoutOptions, expected GeminiLayoutOptions. Using defaults."
+            )
+            options = GeminiLayoutOptions(
+                confidence=options.confidence,
+                classes=options.classes,
+                exclude_classes=options.exclude_classes,
+                device=options.device,
+                extra_args=options.extra_args,
+            )
+        model_name = self._get_model(options)
+        api_key = os.environ.get("GOOGLE_API_KEY")
+        detections = []
+        try:
+            # --- 1. Initialize OpenAI Client for Gemini ---
+            client = OpenAI(
+                api_key=api_key,
+                base_url=self.GEMINI_BASE_URL
+            )
+            # --- 2. Prepare Input for OpenAI API ---
+            if not options.classes:
+                logger.error("Gemini layout detection requires a list of classes to find.")
+                return []
+            width, height = image.size
+            # Convert image to base64
+            buffered = io.BytesIO()
+            image.save(buffered, format="PNG")
+            img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+            image_url = f"data:image/png;base64,{img_base64}"
+            # Construct the prompt text
+            class_list_str = ", ".join(f'`{c}`' for c in options.classes)
+            prompt_text = (
+                f"Analyze the provided image of a document page ({width}x{height}). "
+                f"Identify all regions corresponding to the following types: {class_list_str}. "
+                f"Return ONLY the structured data requested."
+            )
+            # Prepare messages for chat completions endpoint
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt_text},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": image_url},
+                        },
+                    ],
+                }
+            ]
+            # --- 3. Call OpenAI API using .parse for structured output ---
+            logger.debug(f"Running Gemini detection via OpenAI lib (Model: {model_name}). Asking for classes: {options.classes}")
+            # Extract relevant generation parameters from extra_args if provided
+            # Mapping common names: temperature, top_p, max_tokens
+            completion_kwargs = {
+                "temperature": options.extra_args.get("temperature", 0.2), # Default to low temp
+                "top_p": options.extra_args.get("top_p"),
+                "max_tokens": options.extra_args.get("max_tokens", 4096), # Map from max_output_tokens
+            }
+            # Filter out None values
+            completion_kwargs = {k: v for k, v in completion_kwargs.items() if v is not None}
+            completion: ChatCompletion = client.beta.chat.completions.parse(
+                model=model_name,
+                messages=messages,
+                response_format=List[DetectedRegion], # Pass the Pydantic model list
+                **completion_kwargs
+            )
+            logger.debug(f"Gemini response received via OpenAI lib.")
+            # --- 4. Process Parsed Response ---
+            if not completion.choices:
+                logger.error("Gemini response (via OpenAI lib) contained no choices.")
+                return []
+            # Get the parsed Pydantic objects
+            parsed_results = completion.choices[0].message.parsed
+            if not parsed_results or not isinstance(parsed_results, list):
+                 logger.error(f"Gemini response (via OpenAI lib) did not contain a valid list of parsed regions. Found: {type(parsed_results)}")
+                 return []
+            # --- 5. Convert to Detections & Filter ---
+            normalized_classes_req = {
+                self._normalize_class_name(c) for c in options.classes
+            }
+            normalized_classes_excl = {
+                self._normalize_class_name(c) for c in options.exclude_classes
+            } if options.exclude_classes else set()
+            for item in parsed_results:
+                # The item is already a validated DetectedRegion Pydantic object
+                # Access fields directly
+                label = item.label
+                bbox_raw = item.bbox
+                confidence_score = item.confidence
+                # Coordinates should already be floats, but ensure tuple format
+                xmin, ymin, xmax, ymax = tuple(bbox_raw)
+                # --- Apply Filtering ---
+                normalized_class = self._normalize_class_name(label)
+                # Check against requested classes (Should be guaranteed by schema, but doesn't hurt)
+                if normalized_class not in normalized_classes_req:
+                    logger.warning(f"Gemini (via OpenAI) returned unexpected class '{label}' despite schema. Skipping.")
+                    continue
+                # Check against excluded classes
+                if normalized_class in normalized_classes_excl:
+                    logger.debug(f"Skipping excluded class '{label}' (normalized: {normalized_class}).")
+                    continue
+                # Check against base confidence threshold from options
+                if confidence_score < options.confidence:
+                    logger.debug(f"Skipping item with confidence {confidence_score:.3f} below threshold {options.confidence}.")
+                    continue
+                # Add detection
+                detections.append({
+                    "bbox": (xmin, ymin, xmax, ymax),
+                    "class": label, # Use original label from LLM
+                    "confidence": confidence_score,
+                    "normalized_class": normalized_class,
+                    "source": "layout",
+                    "model": "gemini", # Keep model name generic as gemini
+                })
+            self.logger.info(
+                f"Gemini (via OpenAI lib) processed response. Detected {len(detections)} layout elements matching criteria."
+            )
+        except Exception as e:
+            # Catch potential OpenAI API errors or other issues
+            self.logger.error(f"Error during Gemini detection (via OpenAI lib): {e}", exc_info=True)
+            return []
+        return detections
+    def _normalize_class_name(self, name: str) -> str:
+        """Normalizes class names for filtering (lowercase, hyphenated)."""
+        return super()._normalize_class_name(name)
+    def validate_classes(self, classes: List[str]):
+        """Validation is less critical as we pass requested classes to the LLM."""
+        pass # Override base validation if needed, but likely not necessary

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl