PyPI - natural-pdf - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (141) hide show

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +209 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +288 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +413 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +512 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +604 -0
docs/tutorials/12-ocr-integration.md +175 -0
docs/tutorials/13-semantic-search.ipynb +1328 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +50 -33
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/gemini.py +264 -0
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +125 -58
natural_pdf/analyzers/layout/layout_options.py +43 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +89 -45
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +146 -97
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +419 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +1044 -521
natural_pdf/core/pdf.py +516 -313
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +307 -225
natural_pdf/elements/collections.py +805 -543
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +889 -879
natural_pdf/elements/text.py +127 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +57 -35
natural_pdf/ocr/engine.py +150 -46
natural_pdf/ocr/engine_easyocr.py +146 -150
natural_pdf/ocr/engine_paddle.py +118 -175
natural_pdf/ocr/engine_surya.py +78 -141
natural_pdf/ocr/ocr_factory.py +114 -0
natural_pdf/ocr/ocr_manager.py +122 -124
natural_pdf/ocr/ocr_options.py +16 -20
natural_pdf/ocr/utils.py +98 -0
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/templates/spa/css/style.css +334 -0
natural_pdf/templates/spa/index.html +31 -0
natural_pdf/templates/spa/js/app.js +472 -0
natural_pdf/templates/spa/words.txt +235976 -0
natural_pdf/utils/debug.py +32 -0
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/identifiers.py +29 -0
natural_pdf/utils/packaging.py +418 -0
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
natural_pdf-0.1.6.dist-info/RECORD +141 -0
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
natural_pdf/templates/ocr_debug.html +0 -517
natural_pdf-0.1.4.dist-info/RECORD +0 -61
natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0

docs/visual-debugging/index.md ADDED Viewed

@@ -0,0 +1,157 @@
+# Visual Debugging
+Sometimes it's hard to understand what's happening when working with PDFs. Natural PDF provides powerful visual debugging tools to help you see what you're extracting.
+## Adding Persistent Highlights
+Use the `.highlight()` method on `Element` or `ElementCollection` objects to add persistent highlights to a page. These highlights are stored and will appear when viewing the page later.
+```python
+from natural_pdf import PDF
+pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
+page = pdf.pages[0]
+# Find a specific element and add a persistent highlight
+page.find_all('text:contains("Summary")').highlight()
+page.find_all('text:contains("Date")').highlight()
+page.find_all('line').highlight()
+page.to_image(width=700)
+```
+## Customizing Persistent Highlights
+Customize the appearance of persistent highlights added with `.highlight()`:
+```python
+page.clear_highlights()
+title = page.find('text:bold[size>=12]')
+# Highlight with a specific color (string name, hex, or RGB/RGBA tuple)
+# title.highlight(color=(1, 0, 0, 0.3))  # Red with 30% opacity
+# title.highlight(color="#FF0000")        # Hex color
+title.highlight(color="red")           # Color name
+text = page.find('text:contains("Critical")')
+# Add a label to the highlight (appears in legend)
+text.highlight(label="Critical")
+# Combine color and label
+rect = page.find('rect')
+rect.highlight(color=(0, 0, 1, 0.2), label="Box")
+page.to_image(width=700)
+```
+## Highlighting Multiple Elements
+Highlighting an `ElementCollection` applies the highlight to all elements within it. By default, all elements in the collection get the same color and a label based on their type.
+```python
+# Find and highlight all headings with a single color/label
+headings = page.find_all('text[size>=14]:bold')
+headings.highlight(color=(0, 0.5, 0, 0.3), label="Headings")
+# Find and highlight all tables
+tables = page.find_all('region[type=table]')
+tables.highlight(color=(0, 0, 1, 0.2), label="Tables")
+# View the result
+page.viewer()
+```
+## Highlighting Regions
+You can highlight regions to see what area you're working with:
+```python
+# Find a title and create a region below it
+title = page.find('text:contains("Violations")')
+content = title.below(height=200)
+# Highlight the region
+content.show()
+```
+Or look at just the region by itself:
+```python
+# Find a title and create a region below it
+title = page.find('text:contains("Violations")')
+content = title.below(height=200)
+# Crop to the region
+content.to_image(crop_only=True, include_highlights=False)
+```
+## Working with Text Styles
+Visualize text styles to understand the document structure:
+```python
+# Analyze and highlight text styles
+page.clear_highlights()
+page.analyze_text_styles()
+page.find_all('text').highlight(group_by='style_label')
+page.to_image(width=700)
+```
+## Displaying Attributes
+You can display element attributes directly on the highlights:
+```python
+pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf")
+page = pdf.pages[0]
+text = page.find_all('line')
+text.highlight(include_attrs=['width', 'color'])
+page.to_image(width=700)
+```
+Does it get busy? YES.
+## Clearing Highlights
+You can clear persistent highlights from a page:
+```python
+# Clear all highlights on the page
+page.clear_highlights()
+# Apply new highlights
+page.find_all('text:bold').highlight(label="Bold Text")
+page.viewer()
+```
+## Document QA Visualization
+Visualize document QA results:
+```python
+pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/0500000US42007.pdf")
+page = pdf.pages[0]
+page.to_image(width=700)
+```
+```python
+response = page.ask("How many votes did Kamala Harris get on Election Day?")
+response
+```
+```python
+response['source_elements'].show()
+```
+## Next Steps
+Now that you know how to visualize PDF content, you might want to explore:
+- [OCR capabilities](../ocr/index.md) for working with scanned documents
+- [Layout analysis](../layout-analysis/index.ipynb) for automatic structure detection
+- [Document QA](../document-qa/index.ipynb) for asking questions directly to your documents

docs/visual-debugging/region.png ADDED Viewed

Binary file

natural_pdf/__init__.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """
 Natural PDF - A more intuitive interface for working with PDFs.
 """
 import logging
 # Create library logger
@@ -10,67 +11,83 @@ logger = logging.getLogger("natural_pdf")
 # (Best practice for libraries)
 logger.addHandler(logging.NullHandler())
-# Utility function for users to easily configure logging
 def configure_logging(level=logging.INFO, handler=None):
-    """Configure Natural PDF's logging.
+    """Configure logging for the natural_pdf package.
     Args:
-        level: The logging level (e.g., logging.INFO, logging.DEBUG)
-        handler: A custom handler, or None to use StreamHandler
+        level: Logging level (e.g., logging.INFO, logging.DEBUG)
+        handler: Optional custom handler. Defaults to a StreamHandler.
     """
-    # Remove NullHandler if present
-    if logger.handlers and isinstance(logger.handlers[0], logging.NullHandler):
-        logger.removeHandler(logger.handlers[0])
+    # Avoid adding duplicate handlers
+    if any(isinstance(h, logging.StreamHandler) for h in logger.handlers):
+        return
     if handler is None:
         handler = logging.StreamHandler()
-        formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
+        formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
         handler.setFormatter(formatter)
     logger.addHandler(handler)
     logger.setLevel(level)
-    # Propagate level to all child loggers
-    for name in logging.root.manager.loggerDict:
-        if name.startswith("natural_pdf."):
-            logging.getLogger(name).setLevel(level)
-from natural_pdf.core.pdf import PDF
+    logger.propagate = False
 from natural_pdf.core.page import Page
-from natural_pdf.elements.region import Region
+from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.collections import ElementCollection
+from natural_pdf.elements.region import Region
 # Import QA module if available
 try:
     from natural_pdf.qa import DocumentQA, get_qa_engine
     HAS_QA = True
 except ImportError:
     HAS_QA = False
 __version__ = "0.1.1"
+__all__ = [
+    "PDF",
+    "PDFCollection",
+    "Page",
+    "Region",
+    "ElementCollection",
+    "TextSearchOptions",
+    "MultiModalSearchOptions",
+    "BaseSearchOptions",
+    "configure_logging",
+]
 if HAS_QA:
-    __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging", "DocumentQA", "get_qa_engine"]
-else:
-    __all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
+    __all__.extend(["DocumentQA", "get_qa_engine"])
+from .collections.pdf_collection import PDFCollection
 # Core classes
 from .core.pdf import PDF
-from .collections.pdf_collection import PDFCollection
 from .elements.region import Region
 # Search options (if extras installed)
 try:
-    from .search.search_options import TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
+    from .search.search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
 except ImportError:
     # Define dummy classes if extras not installed, so imports don't break
     # but using them will raise the ImportError from check_haystack_availability
     class TextSearchOptions:
-        def __init__(self, *args, **kwargs): pass
+        def __init__(self, *args, **kwargs):
+            pass
     class MultiModalSearchOptions:
-        def __init__(self, *args, **kwargs): pass
+        def __init__(self, *args, **kwargs):
+            pass
     class BaseSearchOptions:
-        def __init__(self, *args, **kwargs): pass
+        def __init__(self, *args, **kwargs):
+            pass
 # Expose logging setup? (Optional)
 # from . import logging_config
@@ -78,10 +95,10 @@ except ImportError:
 # Explicitly define what gets imported with 'from natural_pdf import *'
 __all__ = [
-    'PDF',
-    'PDFCollection',
-    'Region',
-    'TextSearchOptions',       # Include search options
-    'MultiModalSearchOptions',
-    'BaseSearchOptions'
-]
+    "PDF",
+    "PDFCollection",
+    "Region",
+    "TextSearchOptions",  # Include search options
+    "MultiModalSearchOptions",
+    "BaseSearchOptions",
+]

natural_pdf/analyzers/__init__.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """
 Analyzers for natural-pdf.
 """
 from .layout import *
 from .text_structure import TextStyleAnalyzer
-from .utils import convert_to_regions
+from .utils import convert_to_regions

natural_pdf/analyzers/layout/base.py CHANGED Viewed

@@ -1,7 +1,8 @@
 # layout_detector_base.py
 import logging
 from abc import ABC, abstractmethod
-from typing import Dict, List, Any, Optional, Set, Union
+from typing import Any, Dict, List, Optional, Set, Union
 from PIL import Image
 # Assuming layout_options defines BaseLayoutOptions
@@ -9,10 +10,13 @@ try:
     from .layout_options import BaseLayoutOptions
 except ImportError:
     # Placeholder if run standalone or options not found
-    class BaseLayoutOptions: pass
+    class BaseLayoutOptions:
+        pass
 logger = logging.getLogger(__name__)
 class LayoutDetector(ABC):
     """
     Abstract Base Class for layout detection engines.
@@ -26,8 +30,8 @@ class LayoutDetector(ABC):
         """Initializes the base layout detector."""
         self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
         self.logger.info(f"Initializing {self.__class__.__name__}")
-        self.supported_classes: Set[str] = set() # Subclasses should populate this
-        self._model_cache: Dict[str, Any] = {} # Cache for initialized models
+        self.supported_classes: Set[str] = set()  # Subclasses should populate this
+        self._model_cache: Dict[str, Any] = {}  # Cache for initialized models
     @abstractmethod
     def detect(self, image: Image.Image, options: BaseLayoutOptions) -> List[Dict[str, Any]]:
@@ -83,20 +87,20 @@ class LayoutDetector(ABC):
         """
         cache_key = self._get_cache_key(options)
         if cache_key not in self._model_cache:
-             self.logger.info(f"Loading model for cache key: {cache_key}")
-             try:
-                 # Ensure dependencies are met before loading
-                 if not self.is_available():
-                      raise RuntimeError(f"{self.__class__.__name__} dependencies are not met.")
-                 self._model_cache[cache_key] = self._load_model_from_options(options)
-                 self.logger.info(f"Model loaded successfully for key: {cache_key}")
-             except Exception as e:
-                  self.logger.error(f"Failed to load model for key {cache_key}: {e}", exc_info=True)
-                  # Remove potentially corrupted cache entry
-                  self._model_cache.pop(cache_key, None)
-                  raise # Re-raise exception after logging
+            self.logger.info(f"Loading model for cache key: {cache_key}")
+            try:
+                # Ensure dependencies are met before loading
+                if not self.is_available():
+                    raise RuntimeError(f"{self.__class__.__name__} dependencies are not met.")
+                self._model_cache[cache_key] = self._load_model_from_options(options)
+                self.logger.info(f"Model loaded successfully for key: {cache_key}")
+            except Exception as e:
+                self.logger.error(f"Failed to load model for key {cache_key}: {e}", exc_info=True)
+                # Remove potentially corrupted cache entry
+                self._model_cache.pop(cache_key, None)
+                raise  # Re-raise exception after logging
         else:
-             self.logger.debug(f"Using cached model for key: {cache_key}")
+            self.logger.debug(f"Using cached model for key: {cache_key}")
         return self._model_cache[cache_key]
     @abstractmethod
@@ -110,8 +114,9 @@ class LayoutDetector(ABC):
     def _normalize_class_name(self, name: str) -> str:
         """Convert class names with spaces/underscores to hyphenated lowercase format."""
-        if not isinstance(name, str): name = str(name) # Ensure string
-        return name.lower().replace(' ', '-').replace('_', '-')
+        if not isinstance(name, str):
+            name = str(name)  # Ensure string
+        return name.lower().replace(" ", "-").replace("_", "-")
     def validate_classes(self, classes: List[str]) -> None:
         """
@@ -124,8 +129,10 @@ class LayoutDetector(ABC):
             ValueError: If any class is not supported.
         """
         if not self.supported_classes:
-             self.logger.warning("Supported classes not defined for this detector. Skipping class validation.")
-             return
+            self.logger.warning(
+                "Supported classes not defined for this detector. Skipping class validation."
+            )
+            return
         if classes:
             # Normalize both requested and supported classes for comparison
@@ -138,8 +145,10 @@ class LayoutDetector(ABC):
                 unsupported_original = [
                     c for c in classes if self._normalize_class_name(c) in unsupported_normalized
                 ]
-                raise ValueError(f"Classes not supported by {self.__class__.__name__}: {unsupported_original}. "
-                               f"Supported (normalized): {sorted(list(normalized_supported))}")
+                raise ValueError(
+                    f"Classes not supported by {self.__class__.__name__}: {unsupported_original}. "
+                    f"Supported (normalized): {sorted(list(normalized_supported))}"
+                )
     def __del__(self):
         """Cleanup resources."""
@@ -148,4 +157,3 @@ class LayoutDetector(ABC):
         # Consider implications if models are shared or expensive to reload
         # del self._model_cache # Optional: uncomment if models should be released aggressively
         self._model_cache.clear()

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl