natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/qa/document_qa.py
CHANGED
@@ -1,10 +1,12 @@
+import json
 import logging
-from typing import List, Dict, Any, Optional, Union, Tuple
-import numpy as np
-from PIL import Image, ImageDraw
 import os
 import tempfile
-import json
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+from PIL import Image, ImageDraw
+
 from natural_pdf.elements.collections import ElementCollection
 
 logger = logging.getLogger("natural_pdf.qa.document_qa")
@@ -12,41 +14,42 @@ logger = logging.getLogger("natural_pdf.qa.document_qa")
 # Global QA engine instance
 _QA_ENGINE_INSTANCE = None
 
+
 def get_qa_engine(model_name: str = "impira/layoutlm-document-qa", **kwargs):
     """
     Get or create a global QA engine instance.
-    
+
     Args:
         model_name: Name of the model to use (default: "impira/layoutlm-document-qa")
         **kwargs: Additional parameters to pass to the DocumentQA constructor
-    
+
     Returns:
         DocumentQA instance
     """
     global _QA_ENGINE_INSTANCE
-    
+
     if _QA_ENGINE_INSTANCE is None:
         try:
             _QA_ENGINE_INSTANCE = DocumentQA(model_name=model_name, **kwargs)
         except Exception as e:
             logger.error(f"Failed to initialize QA engine: {e}")
             raise
-    
+
     return _QA_ENGINE_INSTANCE
 
 
 class DocumentQA:
     """
     Document Question Answering using LayoutLM.
-    
+
     This class provides the ability to ask natural language questions about document content,
     leveraging the spatial layout information from PDF pages.
     """
-    
+
     def __init__(self, model_name: str = "impira/layoutlm-document-qa", device: str = None):
         """
         Initialize the Document QA engine.
-        
+
         Args:
             model_name: HuggingFace model name to use (default: "impira/layoutlm-document-qa")
             device: Device to run the model on ('cuda' or 'cpu'). If None, will use cuda if available.
@@ -54,20 +57,20 @@ class DocumentQA:
         try:
             import torch
             from transformers import pipeline
-            
+
             # Determine device
             if device is None:
-                device = 'cuda' if torch.cuda.is_available() else 'cpu'
-            
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+
             logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
-            
+
             # Initialize the pipeline
             self.pipe = pipeline("document-question-answering", model=model_name, device=device)
-            
+
             self.model_name = model_name
             self.device = device
             self._is_initialized = True
-            
+
         except ImportError as e:
             logger.error(f"Failed to import required packages: {e}")
             self._is_initialized = False
@@ -79,56 +82,55 @@ class DocumentQA:
             logger.error(f"Failed to initialize DocumentQA: {e}")
             self._is_initialized = False
             raise
-    
+
     def is_available(self) -> bool:
         """Check if the QA engine is properly initialized."""
         return self._is_initialized
-    
+
     def _get_word_boxes_from_elements(self, elements, offset_x=0, offset_y=0) -> List[List]:
         """
         Extract word boxes from text elements.
-        
+
         Args:
             elements: List of TextElement objects
            offset_x: X-coordinate offset to subtract (for region cropping)
            offset_y: Y-coordinate offset to subtract (for region cropping)
-        
+
         Returns:
             List of [text, [x0, top, x1, bottom]] entries
         """
         word_boxes = []
-        
+
         for element in elements:
-            if hasattr(element, 'text') and element.text.strip():
+            if hasattr(element, "text") and element.text.strip():
                 # Apply offset for cropped regions
                 x0 = int(element.x0) - offset_x
                 top = int(element.top) - offset_y
                 x1 = int(element.x1) - offset_x
                 bottom = int(element.bottom) - offset_y
-                
+
                 # Ensure coordinates are valid (non-negative)
                 x0 = max(0, x0)
                 top = max(0, top)
                 x1 = max(0, x1)
                 bottom = max(0, bottom)
-                
-                word_boxes.append([
-                    element.text,
-                    [x0, top, x1, bottom]
-                ])
-        
+
+                word_boxes.append([element.text, [x0, top, x1, bottom]])
+
         return word_boxes
-    
-    def ask(self,
-            image: Union[str, Image.Image, np.ndarray],
-            question: str,
-            word_boxes: List = None,
-            min_confidence: float = 0.1,
-            debug: bool = False,
-            debug_output_dir: str = "output") -> Dict[str, Any]:
+
+    def ask(
+        self,
+        image: Union[str, Image.Image, np.ndarray],
+        question: str,
+        word_boxes: List = None,
+        min_confidence: float = 0.1,
+        debug: bool = False,
+        debug_output_dir: str = "output",
+    ) -> Dict[str, Any]:
         """
         Ask a question about document content.
-        
+
         Args:
             image: PIL Image, numpy array, or path to image file
             question: Question to ask about the document
@@ -136,7 +138,7 @@ class DocumentQA:
             min_confidence: Minimum confidence threshold for answers
             debug: Whether to save debug information
             debug_output_dir: Directory to save debug files
-        
+
         Returns:
             Dictionary with answer details: {
                 "answer": extracted text,
@@ -147,7 +149,7 @@ class DocumentQA:
         """
         if not self._is_initialized:
             raise RuntimeError("DocumentQA is not properly initialized")
-        
+
         # Process the image
         if isinstance(image, str):
             # It's a file path
@@ -162,65 +164,68 @@ class DocumentQA:
             image_obj = image
         else:
             raise TypeError("Image must be a PIL Image, numpy array, or file path")
-        
+
         # Prepare the query
-        query = {
-            "image": image_obj,
-            "question": question
-        }
-        
+        query = {"image": image_obj, "question": question}
+
         # Add word boxes if provided
         if word_boxes:
             query["word_boxes"] = word_boxes
-        
+
         # Save debug information if requested
-        if debug: 
+        if debug:
             # Create debug directory
             os.makedirs(debug_output_dir, exist_ok=True)
-            
+
             # Save the image
             image_debug_path = os.path.join(debug_output_dir, "debug_qa_image.png")
             image_obj.save(image_debug_path)
-            
+
             # Save word boxes
             if word_boxes:
                 word_boxes_path = os.path.join(debug_output_dir, "debug_qa_word_boxes.json")
-                with open(word_boxes_path, 'w') as f:
+                with open(word_boxes_path, "w") as f:
                     json.dump(word_boxes, f, indent=2)
-            
+
             # Generate a visualization of the boxes on the image
             vis_image = image_obj.copy()
             draw = ImageDraw.Draw(vis_image)
-            
+
             for i, (text, box) in enumerate(word_boxes):
                 x0, y0, x1, y1 = box
                 draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=2)
                 # Add text index for reference
                 draw.text((x0, y0), str(i), fill=(255, 0, 0))
-            
+
             vis_path = os.path.join(debug_output_dir, "debug_qa_boxes_vis.png")
             vis_image.save(vis_path)
-            
+
             logger.info(f"Saved debug files to {debug_output_dir}")
             logger.info(f"Question: {question}")
             logger.info(f"Image: {image_debug_path}")
             logger.info(f"Word boxes: {word_boxes_path}")
             logger.info(f"Visualization: {vis_path}")
-        
+
         # Run the query through the pipeline
         logger.info(f"Running document QA pipeline with question: {question}")
         result = self.pipe(query)[0]
         logger.info(f"Raw result: {result}")
-        
+
         # Save the result if debugging
         if debug:
             result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
-            with open(result_path, 'w') as f:
+            with open(result_path, "w") as f:
                 # Convert any non-serializable data
-                serializable_result = {
-                    k: str(v) if not isinstance(v, (str, int, float, bool, list, dict, type(None))) else v for k, v in result.items()}
+                serializable_result = {
+                    k: (
+                        str(v)
+                        if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
+                        else v
+                    )
+                    for k, v in result.items()
+                }
                 json.dump(serializable_result, f, indent=2)
-        
+
         # Check confidence against threshold
         if result["score"] < min_confidence:
             logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
@@ -229,48 +234,49 @@ class DocumentQA:
                 "confidence": result["score"],
                 "start": result.get("start", -1),
                 "end": result.get("end", -1),
-                "found": False
+                "found": False,
             }
-        
+
         return {
             "answer": result["answer"],
             "confidence": result["score"],
             "start": result.get("start", 0),
             "end": result.get("end", 0),
-            "found": True
+            "found": True,
         }
-    
-    def ask_pdf_page(self, page, question: str,
-                     min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
+
+    def ask_pdf_page(
+        self, page, question: str, min_confidence: float = 0.1, debug: bool = False
+    ) -> Dict[str, Any]:
         """
         Ask a question about a specific PDF page.
-        
+
         Args:
             page: natural_pdf.core.page.Page object
             question: Question to ask about the page
             min_confidence: Minimum confidence threshold for answers
-        
+
         Returns:
             Dictionary with answer details
         """
         # Ensure we have text elements on the page
-        if not page.find_all('text'):
+        if not page.find_all("text"):
             # Apply OCR if no text is available
             logger.info(f"No text elements found on page {page.index}, applying OCR")
             page.apply_ocr()
-        
+
         # Extract word boxes
-        elements = page.find_all('text')
+        elements = page.find_all("text")
         word_boxes = self._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)
-        
+
         # Generate a high-resolution image of the page
-        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
             temp_path = temp_file.name
-        
+
         # Save a high resolution image (300 DPI)
         page_image = page.to_image(resolution=300, include_highlights=False)
         page_image.save(temp_path)
-        
+
         try:
             # Ask the question
             result = self.ask(
@@ -278,79 +284,81 @@ class DocumentQA:
                 question=question,
                 word_boxes=word_boxes,
                 min_confidence=min_confidence,
-                debug=debug
+                debug=debug,
             )
-            
+
             # Add page reference to the result
             result["page_num"] = page.index
-            
+
             # Add element references if possible
             if result.get("found", False) and "start" in result and "end" in result:
                 start_idx = result["start"]
                 end_idx = result["end"]
-                
+
                 # Make sure we have valid indices and elements to work with
                 if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
                     # Find the actual source elements in the original list
                     # Since word_boxes may have filtered out some elements, we need to map indices
-                    
+
                     # Get the text from result word boxes
-                    matched_texts = [wb[0] for wb in word_boxes[start_idx:end_idx+1]]
-                    
+                    matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
+
                     # Find corresponding elements in the full element list
                     source_elements = []
                     for element in elements:
-                        if hasattr(element, 'text') and element.text in matched_texts:
+                        if hasattr(element, "text") and element.text in matched_texts:
                             source_elements.append(element)
                             # Remove from matched texts to avoid duplicates
                             if element.text in matched_texts:
                                 matched_texts.remove(element.text)
 
                     result["source_elements"] = ElementCollection(source_elements)
-            
+
             return result
-        
+
         finally:
             # Clean up temporary file
             if os.path.exists(temp_path):
                 os.remove(temp_path)
-    
-    def ask_pdf_region(self, region, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
+
+    def ask_pdf_region(
+        self, region, question: str, min_confidence: float = 0.1, debug: bool = False
+    ) -> Dict[str, Any]:
         """
         Ask a question about a specific region of a PDF page.
-        
+
         Args:
             region: natural_pdf.elements.region.Region object
             question: Question to ask about the region
             min_confidence: Minimum confidence threshold for answers
-        
+
         Returns:
             Dictionary with answer details
         """
         # Get all text elements within the region
-        elements = region.find_all('text')
-        
+        elements = region.find_all("text")
+
         # Apply OCR if needed
         if not elements:
             logger.info(f"No text elements found in region, applying OCR")
             elements = region.apply_ocr()
-        
+
         # Extract word boxes adjusted for the cropped region
         x0, top = int(region.x0), int(region.top)
         word_boxes = self._get_word_boxes_from_elements(elements, offset_x=x0, offset_y=top)
-        
+
         # Generate a cropped image of the region
-        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
             temp_path = temp_file.name
-        
+
         # Get page image at high resolution - this returns a PIL Image directly
         page_image = region.page.to_image(resolution=300, include_highlights=False)
-        
+
         # Crop to region
         x0, top, x1, bottom = int(region.x0), int(region.top), int(region.x1), int(region.bottom)
         region_image = page_image.crop((x0, top, x1, bottom))
         region_image.save(temp_path)
-        
+
         try:
             # Ask the question
             result = self.ask(
@@ -358,40 +366,40 @@ class DocumentQA:
                 question=question,
                 word_boxes=word_boxes,
                 min_confidence=min_confidence,
-                debug=debug
+                debug=debug,
            )
-            
+
             # Add region reference to the result
             result["region"] = region
             result["page_num"] = region.page.index
-            
+
             # Add element references if possible
             if result.get("found", False) and "start" in result and "end" in result:
                 start_idx = result["start"]
                 end_idx = result["end"]
-                
+
                 # Make sure we have valid indices and elements to work with
                 if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
                     # Find the actual source elements in the original list
                     # Since word_boxes may have filtered out some elements, we need to map indices
-                    
+
                     # Get the text from result word boxes
-                    matched_texts = [wb[0] for wb in word_boxes[start_idx:end_idx+1]]
-                    
+                    matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
+
                     # Find corresponding elements in the full element list
                     source_elements = []
                     for element in elements:
-                        if hasattr(element, 'text') and element.text in matched_texts:
+                        if hasattr(element, "text") and element.text in matched_texts:
                             source_elements.append(element)
                             # Remove from matched texts to avoid duplicates
                             if element.text in matched_texts:
                                 matched_texts.remove(element.text)
-            
+
                     result["source_elements"] = ElementCollection(source_elements)
-            
+
             return result
-        
+
         finally:
             # Clean up temporary file
             if os.path.exists(temp_path):
-                os.remove(temp_path)
+                os.remove(temp_path)
natural_pdf/search/__init__.py
CHANGED
@@ -7,32 +7,29 @@ from typing import Optional
 # Import the concrete implementation
 from .haystack_search_service import HaystackSearchService
 
-# --- Protocol Import ---
-# Import the protocol for type hinting
-from .search_service_protocol import (
-    SearchServiceProtocol,
-    IndexConfigurationError,
-    Indexable
+# --- Utils Import ---
+from .haystack_utils import (  # Re-export flag and helper
+    HAS_HAYSTACK_EXTRAS,
+    check_haystack_availability,
 )
 
 # --- Option Imports (for convenience) ---
 # Make options easily available via `from natural_pdf.search import ...`
-from .search_options import (
-    SearchOptions,
-    TextSearchOptions,
-    MultiModalSearchOptions,
-    BaseSearchOptions,
-)
-
-# --- Utils Import ---
-from .haystack_utils import HAS_HAYSTACK_EXTRAS, check_haystack_availability  # Re-export flag and helper
+from .search_options import SearchOptions  # Alias for TextSearchOptions for simplicity?
+from .search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
+
+# --- Protocol Import ---
+# Import the protocol for type hinting
+from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
 
 logger = logging.getLogger(__name__)
 
 # --- Factory Function ---
 
+
 def get_search_service(
-    collection_name: str,
-    persist: bool = False,
+    collection_name: str,  # Add collection_name as a required argument
+    persist: bool = False,  # Default to In-Memory
     # Configuration for the service itself
     default_persist_path: Optional[str] = None,
     default_embedding_model: Optional[str] = None,
@@ -56,39 +53,48 @@ def get_search_service(
     Returns:
         An instance conforming to the SearchServiceProtocol for the specified collection.
     """
-    logger.debug(f"Calling get_search_service factory for collection '{collection_name}' (persist={persist})...")
-    
+    logger.debug(
+        f"Calling get_search_service factory for collection '{collection_name}' (persist={persist})..."
+    )
+
     # For now, we only have one implementation
     # Collect arguments relevant to HaystackSearchService.__init__
     service_args = {}
-    service_args['collection_name'] = collection_name
-    service_args['persist'] = persist
+    service_args["collection_name"] = collection_name  # Pass collection_name
+    service_args["persist"] = persist  # Pass persist flag to service constructor
     if default_persist_path is not None:
-        service_args['default_persist_path'] = default_persist_path
+        service_args["default_persist_path"] = default_persist_path
     if default_embedding_model is not None:
-        service_args['default_embedding_model'] = default_embedding_model
-    
+        service_args["default_embedding_model"] = default_embedding_model
+
     # TODO: Implement caching/registry if needed to return the same instance
     # for the same configuration instead of always creating a new one.
     # cache_key = tuple(sorted(service_args.items()))
     # if cache_key in _service_instance_cache:
     #     return _service_instance_cache[cache_key]
-    
+
     try:
         service_instance = HaystackSearchService(**service_args)
         # _service_instance_cache[cache_key] = service_instance
-        logger.info(f"Created new HaystackSearchService instance for collection '{collection_name}'.")
+        logger.info(
+            f"Created new HaystackSearchService instance for collection '{collection_name}'."
+        )
         return service_instance
     except ImportError as e:
-        logger.error(f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True)
-        raise ImportError("Search Service could not be created. Ensure Haystack extras are installed: pip install natural-pdf[haystack]") from e
+        logger.error(
+            f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True
+        )
+        raise ImportError(
+            "Search Service could not be created. Ensure Haystack extras are installed: pip install natural-pdf[haystack]"
+        ) from e
     except Exception as e:
-        logger.error(f"Failed to instantiate Search Service: {e}", exc_info=True)
-        raise RuntimeError("Could not create Search Service instance.") from e
+        logger.error(f"Failed to instantiate Search Service: {e}", exc_info=True)
+        raise RuntimeError("Could not create Search Service instance.") from e
+
 
 # --- Optional: Define a default instance for extreme ease of use? ---
 # try:
 #     default_search_service = get_search_service()
 # except Exception:
-#    default_search_service = None
-#    logger.warning("Could not create default search service instance on import.")
+#     default_search_service = None
+#     logger.warning("Could not create default search service instance on import.")