PyPI - natural-pdf - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +222 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +260 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +409 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +484 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +586 -0
docs/tutorials/12-ocr-integration.md +188 -0
docs/tutorials/13-semantic-search.ipynb +1888 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +39 -20
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +98 -58
natural_pdf/analyzers/layout/layout_options.py +32 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +84 -44
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +126 -98
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +416 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +910 -516
natural_pdf/core/pdf.py +387 -289
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +302 -214
natural_pdf/elements/collections.py +714 -514
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +854 -883
natural_pdf/elements/text.py +122 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +23 -14
natural_pdf/ocr/engine.py +17 -8
natural_pdf/ocr/engine_easyocr.py +63 -47
natural_pdf/ocr/engine_paddle.py +97 -68
natural_pdf/ocr/engine_surya.py +54 -44
natural_pdf/ocr/ocr_manager.py +88 -62
natural_pdf/ocr/ocr_options.py +16 -10
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
natural_pdf-0.1.5.dist-info/RECORD +134 -0
natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
tests/test_loading.py +50 -0
tests/test_optional_deps.py +298 -0
natural_pdf-0.1.3.dist-info/RECORD +0 -61
natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0

natural_pdf/analyzers/text_options.py CHANGED Viewed

@@ -4,12 +4,15 @@ from typing import List, Optional
 logger = logging.getLogger(__name__)
 @dataclass
 class TextStyleOptions:
     """Options for configuring text style analysis."""
     # Properties to consider when grouping elements by style
-    group_by: List[str] = field(default_factory=lambda: ['size', 'fontname', 'is_bold', 'is_italic', 'color'])
+    group_by: List[str] = field(
+        default_factory=lambda: ["size", "fontname", "is_bold", "is_italic", "color"]
+    )
     # Tolerance for comparing font sizes (e.g., 0.5 rounds to nearest 0.5 point)
     size_tolerance: float = 0.5
@@ -30,31 +33,35 @@ class TextStyleOptions:
     # Format string for descriptive labels. Placeholders match keys in style_properties dict.
     # Example: "{size}pt {weight}{style} {family} ({color})"
     # Available keys: size, fontname, is_bold, is_italic, color, weight, style, family
-    label_format: str = "{size}pt {weight}{style} {family}" # Default format without color
+    label_format: str = "{size}pt {weight}{style} {family}"  # Default format without color
     def __post_init__(self):
         # Validate size_tolerance
         if self.size_tolerance <= 0:
-            logger.warning(f"size_tolerance must be positive, setting to 0.1. Original value: {self.size_tolerance}")
+            logger.warning(
+                f"size_tolerance must be positive, setting to 0.1. Original value: {self.size_tolerance}"
+            )
             self.size_tolerance = 0.1
         # Ensure 'size' is always considered if tolerance is relevant
-        if 'size' not in self.group_by and self.size_tolerance > 0:
+        if "size" not in self.group_by and self.size_tolerance > 0:
             logger.debug("Adding 'size' to group_by keys because size_tolerance is set.")
-            if 'size' not in self.group_by: self.group_by.append('size')
+            if "size" not in self.group_by:
+                self.group_by.append("size")
-        if self.ignore_color and 'color' in self.group_by:
+        if self.ignore_color and "color" in self.group_by:
             logger.debug("Removing 'color' from group_by keys because ignore_color is True.")
-            self.group_by = [key for key in self.group_by if key != 'color']
-        elif not self.ignore_color and 'color' not in self.group_by:
-             # If color isn't ignored, ensure it's included if requested in label format?
-             # For now, just rely on explicit group_by setting.
-             pass
+            self.group_by = [key for key in self.group_by if key != "color"]
+        elif not self.ignore_color and "color" not in self.group_by:
+            # If color isn't ignored, ensure it's included if requested in label format?
+            # For now, just rely on explicit group_by setting.
+            pass
         # Basic validation for group_by keys
-        allowed_keys = {'size', 'fontname', 'is_bold', 'is_italic', 'color'}
+        allowed_keys = {"size", "fontname", "is_bold", "is_italic", "color"}
         invalid_keys = set(self.group_by) - allowed_keys
         if invalid_keys:
-            logger.warning(f"Invalid keys found in group_by: {invalid_keys}. Allowed keys: {allowed_keys}. Ignoring invalid keys.")
-            self.group_by = [key for key in self.group_by if key in allowed_keys]
+            logger.warning(
+                f"Invalid keys found in group_by: {invalid_keys}. Allowed keys: {allowed_keys}. Ignoring invalid keys."
+            )
+            self.group_by = [key for key in self.group_by if key in allowed_keys]

natural_pdf/analyzers/text_structure.py CHANGED Viewed

@@ -1,18 +1,21 @@
 """
 Text structure analyzer for natural-pdf.
 """
 import logging
 import re
-from typing import List, Dict, Any, Optional, Tuple, Union, TYPE_CHECKING
 from collections import defaultdict
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from natural_pdf.analyzers.text_options import TextStyleOptions
 # Import ElementCollection and TextStyleOptions
 from natural_pdf.elements.collections import ElementCollection
-from natural_pdf.analyzers.text_options import TextStyleOptions
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
     from natural_pdf.elements.base import Element
     # Remove ElementCollection from here if imported above
 logger = logging.getLogger(__name__)
@@ -21,63 +24,77 @@ logger = logging.getLogger(__name__)
 FONT_PREFIX_RE = re.compile(r"^[A-Z]{6}\+")
 # Common font weight/style keywords
-FONT_WEIGHTS = {"bold": "Bold", "black": "Bold", "heavy": "Bold", "medium": "", "light": "Light", "thin": "Thin"}
+FONT_WEIGHTS = {
+    "bold": "Bold",
+    "black": "Bold",
+    "heavy": "Bold",
+    "medium": "",
+    "light": "Light",
+    "thin": "Thin",
+}
 FONT_STYLES = {"italic": "Italic", "oblique": "Italic"}
 class TextStyleAnalyzer:
     """
     Analyzes and groups text elements by their style properties based on configuration.
     This analyzer groups text elements based on specified font properties
     (controlled by TextStyleOptions) and adds 'style_label', 'style_key',
     and 'style_properties' attributes to each processed text element.
     """
     def __init__(self, options: Optional[TextStyleOptions] = None):
         """
         Initialize the text style analyzer.
         Args:
             options: Configuration options for the analysis. Uses default if None.
         """
         self.options = options or TextStyleOptions()
         logger.debug(f"Initialized TextStyleAnalyzer with options: {self.options}")
-    def analyze(self, page: 'Page', options: Optional[TextStyleOptions] = None) -> 'ElementCollection':
+    def analyze(
+        self, page: "Page", options: Optional[TextStyleOptions] = None
+    ) -> "ElementCollection":
         """
         Analyze text styles on a page, group elements, and add style attributes.
         Args:
             page: The Page object to analyze.
             options: Override the analyzer's default TextStyleOptions for this run.
         Returns:
             ElementCollection containing all processed text elements (typically words)
             with added 'style_label', 'style_key', and 'style_properties' attributes.
         """
         current_options = options or self.options
-        logger.info(f"Starting text style analysis for page {page.number} with options: {current_options}")
+        logger.info(
+            f"Starting text style analysis for page {page.number} with options: {current_options}"
+        )
         # Use page.words for better granularity
         text_elements = page.words
         # Fallback if words are somehow empty/not generated
         if not text_elements:
-             text_elements = page.find_all('text').elements # Get list from collection
+            text_elements = page.find_all("text").elements  # Get list from collection
         # Skip empty pages or pages with no text elements
         if not text_elements:
             logger.warning(f"Page {page.number} has no text elements to analyze.")
             return ElementCollection([])
-        style_cache: Dict[Tuple, Dict[str, Any]] = {} # Maps style_key_tuple -> {'label': str, 'properties': dict}
-        processed_elements: List['Element'] = []
+        style_cache: Dict[Tuple, Dict[str, Any]] = (
+            {}
+        )  # Maps style_key_tuple -> {'label': str, 'properties': dict}
+        processed_elements: List["Element"] = []
         # Ensure consistent ordering for style key creation
         group_by_keys = sorted(current_options.group_by)
         for element in text_elements:
             # Skip elements without necessary attributes (e.g., non-text elements if find_all was used)
-            if not hasattr(element, 'text') or not hasattr(element, 'size'):
+            if not hasattr(element, "text") or not hasattr(element, "size"):
                 logger.debug(f"Skipping element without text/size: {element}")
                 continue
@@ -86,37 +103,47 @@ class TextStyleAnalyzer:
                 style_key = self._create_style_key(style_properties, group_by_keys)
                 if style_key not in style_cache:
-                    label = self._generate_style_label(style_properties, current_options, len(style_cache) + 1)
-                    style_cache[style_key] = {'label': label, 'properties': style_properties}
-                    logger.debug(f"New style detected (Key: {style_key}): Label='{label}', Props={style_properties}")
+                    label = self._generate_style_label(
+                        style_properties, current_options, len(style_cache) + 1
+                    )
+                    style_cache[style_key] = {"label": label, "properties": style_properties}
+                    logger.debug(
+                        f"New style detected (Key: {style_key}): Label='{label}', Props={style_properties}"
+                    )
                 # Add attributes to the element
-                element.style_label = style_cache[style_key]['label']
+                element.style_label = style_cache[style_key]["label"]
                 element.style_key = style_key
                 # Add the full properties dict for potential detailed inspection
-                element.style_properties = style_cache[style_key]['properties']
+                element.style_properties = style_cache[style_key]["properties"]
                 processed_elements.append(element)
             except Exception as e:
-                 logger.warning(f"Error processing element {element} for text style: {e}", exc_info=True)
-                 # Optionally add element without style info or skip it
-                 # processed_elements.append(element) # Add anyway?
+                logger.warning(
+                    f"Error processing element {element} for text style: {e}", exc_info=True
+                )
+                # Optionally add element without style info or skip it
+                # processed_elements.append(element) # Add anyway?
         # Optionally store a summary on the page
         page._text_styles_summary = style_cache
-        logger.info(f"Finished text style analysis for page {page.number}. Found {len(style_cache)} unique styles.")
+        logger.info(
+            f"Finished text style analysis for page {page.number}. Found {len(style_cache)} unique styles."
+        )
         return ElementCollection(processed_elements)
-    def _extract_style_properties(self, element: 'Element', options: TextStyleOptions) -> Dict[str, Any]:
+    def _extract_style_properties(
+        self, element: "Element", options: TextStyleOptions
+    ) -> Dict[str, Any]:
         """
         Extract style properties from a text element based on options.
         Args:
             element: Text element.
             options: TextStyleOptions driving the extraction.
         Returns:
             Dictionary of extracted style properties.
         """
@@ -124,68 +151,81 @@ class TextStyleAnalyzer:
         # Font size
         font_size = None
-        if hasattr(element, 'size') and element.size is not None:
+        if hasattr(element, "size") and element.size is not None:
             # Round based on tolerance
             rounding_factor = 1.0 / options.size_tolerance
             font_size = round(element.size * rounding_factor) / rounding_factor
-        properties['size'] = font_size
+        properties["size"] = font_size
         # Font name
         font_name = None
         normalized_font_name = None
-        if hasattr(element, 'fontname') and element.fontname is not None:
+        if hasattr(element, "fontname") and element.fontname is not None:
             font_name = element.fontname
             normalized_font_name = self._normalize_font_name(font_name, options)
-        properties['fontname'] = normalized_font_name if options.normalize_fontname else font_name
+        properties["fontname"] = normalized_font_name if options.normalize_fontname else font_name
         # Font characteristics (derived from normalized name if available)
         name_to_check = normalized_font_name or font_name or ""
         name_lower = name_to_check.lower()
-        is_bold = ('bold' in name_lower or 'black' in name_lower or 'heavy' in name_lower or name_to_check.endswith('-B'))
-        is_italic = ('italic' in name_lower or 'oblique' in name_lower or name_to_check.endswith('-I'))
-        properties['is_bold'] = is_bold
-        properties['is_italic'] = is_italic
+        is_bold = (
+            "bold" in name_lower
+            or "black" in name_lower
+            or "heavy" in name_lower
+            or name_to_check.endswith("-B")
+        )
+        is_italic = (
+            "italic" in name_lower or "oblique" in name_lower or name_to_check.endswith("-I")
+        )
+        properties["is_bold"] = is_bold
+        properties["is_italic"] = is_italic
         # Text color
         color = None
-        if not options.ignore_color and hasattr(element, 'non_stroking_color') and element.non_stroking_color is not None:
+        if (
+            not options.ignore_color
+            and hasattr(element, "non_stroking_color")
+            and element.non_stroking_color is not None
+        ):
             raw_color = element.non_stroking_color
             # Convert color to a hashable form (tuple)
             if isinstance(raw_color, (list, tuple)):
-                color = tuple(round(c, 3) for c in raw_color) # Round color components
+                color = tuple(round(c, 3) for c in raw_color)  # Round color components
             else:
                 # Handle simple grayscale or other non-list representations if needed
-                 try:
-                     color = round(float(raw_color), 3)
-                 except (ValueError, TypeError):
-                     color = str(raw_color) # Fallback to string if cannot convert
+                try:
+                    color = round(float(raw_color), 3)
+                except (ValueError, TypeError):
+                    color = str(raw_color)  # Fallback to string if cannot convert
             # Normalize common colors (optional, could be complex)
             # Example: (0.0, 0.0, 0.0) -> 'black', (1.0, 1.0, 1.0) -> 'white'
-            if color == (0.0, 0.0, 0.0) or color == 0.0: color = 'black'
-            if color == (1.0, 1.0, 1.0) or color == 1.0: color = 'white'
-        properties['color'] = color
+            if color == (0.0, 0.0, 0.0) or color == 0.0:
+                color = "black"
+            if color == (1.0, 1.0, 1.0) or color == 1.0:
+                color = "white"
+        properties["color"] = color
         return properties
     def _normalize_font_name(self, fontname: str, options: TextStyleOptions) -> str:
-        """ Basic normalization of font names. """
+        """Basic normalization of font names."""
         if not options.normalize_fontname:
             return fontname
         # Remove common subset prefixes like "ABCDEF+"
         name = FONT_PREFIX_RE.sub("", fontname)
         # Could add more rules here, e.g., removing version numbers, standardizing separators
         return name
     def _parse_font_name(self, normalized_fontname: str) -> Dict[str, str]:
-        """ Attempt to parse family, weight, and style from a font name. Very heuristic. """
+        """Attempt to parse family, weight, and style from a font name. Very heuristic."""
         if not normalized_fontname:
-            return {'family': 'Unknown', 'weight': '', 'style': ''}
+            return {"family": "Unknown", "weight": "", "style": ""}
-        parts = re.split(r'[-,_ ]', normalized_fontname)
+        parts = re.split(r"[-,_ ]", normalized_fontname)
         family_parts = []
-        weight = ''
-        style = ''
+        weight = ""
+        style = ""
         for part in parts:
             part_lower = part.lower()
@@ -196,7 +236,8 @@ class TextStyleAnalyzer:
                     weight = val
                     found = True
                     break
-            if found: continue # Skip part if it was a weight
+            if found:
+                continue  # Skip part if it was a weight
             # Check styles
             for key, val in FONT_STYLES.items():
@@ -204,67 +245,72 @@ class TextStyleAnalyzer:
                     style = val
                     found = True
                     break
-            if found: continue # Skip part if it was a style
+            if found:
+                continue  # Skip part if it was a style
             # If not weight or style, assume it's part of the family name
-            if part: # Avoid empty strings from multiple delimiters
-                 family_parts.append(part)
+            if part:  # Avoid empty strings from multiple delimiters
+                family_parts.append(part)
-        family = "".join(family_parts) or "Unknown" # Join remaining parts
+        family = "".join(family_parts) or "Unknown"  # Join remaining parts
         # Simple cleanup: Remove "MT" often appended? Maybe too aggressive.
         # if family.endswith("MT"): family = family[:-2]
-        return {'family': family, 'weight': weight, 'style': style}
+        return {"family": family, "weight": weight, "style": style}
     def _create_style_key(self, properties: Dict[str, Any], group_by_keys: List[str]) -> Tuple:
-        """ Create a hashable tuple key based on selected properties. """
+        """Create a hashable tuple key based on selected properties."""
         key_parts = []
-        for key in group_by_keys: # Use the pre-sorted list
+        for key in group_by_keys:  # Use the pre-sorted list
             value = properties.get(key)
             # Ensure hashable - colors should already be tuples or basic types
-            if isinstance(value, list): # Should not happen if _extract handled color correctly
+            if isinstance(value, list):  # Should not happen if _extract handled color correctly
                 value = tuple(value)
             key_parts.append(value)
         return tuple(key_parts)
-    def _generate_style_label(self, properties: Dict[str, Any], options: TextStyleOptions, style_index: int) -> str:
-        """ Generate a style label based on properties and options. """
+    def _generate_style_label(
+        self, properties: Dict[str, Any], options: TextStyleOptions, style_index: int
+    ) -> str:
+        """Generate a style label based on properties and options."""
         if not options.descriptive_labels:
             return f"{options.label_prefix} {style_index}"
         try:
-            font_details = self._parse_font_name(properties.get('fontname', ''))
+            font_details = self._parse_font_name(properties.get("fontname", ""))
             label_data = {
-                'size': properties.get('size', '?'),
-                'fontname': properties.get('fontname', 'Unknown'),
-                'is_bold': properties.get('is_bold', False),
-                'is_italic': properties.get('is_italic', False),
-                'color': properties.get('color', ''),
-                'family': font_details['family'],
+                "size": properties.get("size", "?"),
+                "fontname": properties.get("fontname", "Unknown"),
+                "is_bold": properties.get("is_bold", False),
+                "is_italic": properties.get("is_italic", False),
+                "color": properties.get("color", ""),
+                "family": font_details["family"],
                 # Use parsed weight/style if available, otherwise fallback to is_bold/is_italic flags
-                'weight': font_details['weight'] or ('Bold' if properties.get('is_bold') else ''),
-                'style': font_details['style'] or ('Italic' if properties.get('is_italic') else ''),
+                "weight": font_details["weight"] or ("Bold" if properties.get("is_bold") else ""),
+                "style": font_details["style"] or ("Italic" if properties.get("is_italic") else ""),
             }
             # Ensure style has a space separator if both weight and style exist
-            if label_data['weight'] and label_data['style']:
-                label_data['style'] = " " + label_data['style']
+            if label_data["weight"] and label_data["style"]:
+                label_data["style"] = " " + label_data["style"]
             # Handle color formatting for label
-            color_val = label_data['color']
+            color_val = label_data["color"]
             if isinstance(color_val, tuple):
-                 color_str = f"rgb{color_val}" # Basic tuple representation
+                color_str = f"rgb{color_val}"  # Basic tuple representation
             elif isinstance(color_val, str):
-                 color_str = color_val # Already string ('black', 'white', or fallback)
+                color_str = color_val  # Already string ('black', 'white', or fallback)
             else:
-                 color_str = str(color_val) # Other types
-            label_data['color_str'] = color_str
+                color_str = str(color_val)  # Other types
+            label_data["color_str"] = color_str
             # Format the label, handle potential missing keys in format string gracefully
             label = options.label_format.format_map(defaultdict(str, label_data))
-            return label.strip().replace("  ", " ") # Cleanup extra spaces
+            return label.strip().replace("  ", " ")  # Cleanup extra spaces
         except Exception as e:
-            logger.warning(f"Error generating descriptive label for style {properties}: {e}. Falling back to numeric label.")
+            logger.warning(
+                f"Error generating descriptive label for style {properties}: {e}. Falling back to numeric label."
+            )
             # Fallback to numeric label on error
-            return f"{options.label_prefix} {style_index}"
+            return f"{options.label_prefix} {style_index}"

natural_pdf/analyzers/utils.py CHANGED Viewed

@@ -1,57 +1,64 @@
 import logging
-from typing import List, Dict, Any
+from typing import Any, Dict, List
 from ..elements.region import Region
-def convert_to_regions(page: Any, detections: List[Dict[str, Any]],
-                      scale_factor: float = 1.0) -> List[Region]:
+def convert_to_regions(
+    page: Any, detections: List[Dict[str, Any]], scale_factor: float = 1.0
+) -> List[Region]:
     """
     Convert layout detections to Region objects.
     Args:
         page: Page object to create regions for
         detections: List of detection dictionaries
         scale_factor: Factor to scale coordinates from image to PDF space
     Returns:
         List of Region objects with layout metadata
     """
     conversion_logger = logging.getLogger("natural_pdf.analyzers.layout.convert")
-    conversion_logger.debug(f"Converting {len(detections)} detections to regions with scale {scale_factor}")
+    conversion_logger.debug(
+        f"Converting {len(detections)} detections to regions with scale {scale_factor}"
+    )
     regions = []
     for det in detections:
         # Extract detection info
-        x_min, y_min, x_max, y_max = det['bbox']
+        x_min, y_min, x_max, y_max = det["bbox"]
         # Ensure coordinates are in proper order (min values are smaller)
         if x_min > x_max:
             x_min, x_max = x_max, x_min
         if y_min > y_max:
             y_min, y_max = y_max, y_min
         # Scale coordinates from image to PDF space
         if scale_factor != 1.0:
             x_min *= scale_factor
             y_min *= scale_factor
             x_max *= scale_factor
             y_max *= scale_factor
         # Create region with metadata
         region = Region(page, (x_min, y_min, x_max, y_max))
-        region.region_type = det['class']
-        region.confidence = det['confidence']
-        region.normalized_type = det['normalized_class']
+        region.region_type = det["class"]
+        region.confidence = det["confidence"]
+        region.normalized_type = det["normalized_class"]
         # Add source info - important for filtering
-        region.source = det.get('source', 'detected')
-        region.model = det.get('model', 'unknown')
+        region.source = det.get("source", "detected")
+        region.model = det.get("model", "unknown")
         # Add additional metadata if available
         for key, value in det.items():
-            if key not in ('bbox', 'class', 'confidence', 'normalized_class', 'source', 'model'):
+            if key not in ("bbox", "class", "confidence", "normalized_class", "source", "model"):
                 setattr(region, key, value)
         regions.append(region)
-    conversion_logger.debug(f"Created {len(regions)} region objects from {len(detections)} detections")
-    return regions
+    conversion_logger.debug(
+        f"Created {len(regions)} region objects from {len(detections)} detections"
+    )
+    return regions

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl