PyPI - natural-pdf - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132)

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +222 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +260 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +409 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +484 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +586 -0
docs/tutorials/12-ocr-integration.md +188 -0
docs/tutorials/13-semantic-search.ipynb +1888 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +39 -20
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +98 -58
natural_pdf/analyzers/layout/layout_options.py +32 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +84 -44
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +126 -98
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +416 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +910 -516
natural_pdf/core/pdf.py +387 -289
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +302 -214
natural_pdf/elements/collections.py +714 -514
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +854 -883
natural_pdf/elements/text.py +122 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +23 -14
natural_pdf/ocr/engine.py +17 -8
natural_pdf/ocr/engine_easyocr.py +63 -47
natural_pdf/ocr/engine_paddle.py +97 -68
natural_pdf/ocr/engine_surya.py +54 -44
natural_pdf/ocr/ocr_manager.py +88 -62
natural_pdf/ocr/ocr_options.py +16 -10
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
natural_pdf-0.1.5.dist-info/RECORD +134 -0
natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
tests/test_loading.py +50 -0
tests/test_optional_deps.py +298 -0
natural_pdf-0.1.3.dist-info/RECORD +0 -61
natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0

natural_pdf/selectors/parser.py CHANGED Viewed

@@ -1,28 +1,31 @@
 """
 CSS-like selector parser for natural-pdf.
 """
-import re
 import ast
-from typing import Dict, Any, List, Optional, Union, Tuple
+import re
+from typing import Any, Dict, List, Optional, Tuple, Union
 from colour import Color
 def safe_parse_value(value_str: str) -> Any:
     """
     Safely parse a value string without using eval().
     Args:
         value_str: String representation of a value (number, tuple, string, etc.)
     Returns:
         Parsed value
     """
     # Strip quotes first if it's a quoted string
     value_str = value_str.strip()
-    if (value_str.startswith('"') and value_str.endswith('"')) or \
-       (value_str.startswith("'") and value_str.endswith("'")):
+    if (value_str.startswith('"') and value_str.endswith('"')) or (
+        value_str.startswith("'") and value_str.endswith("'")
+    ):
         return value_str[1:-1]
     # Try parsing as a Python literal (numbers, tuples, lists)
     try:
         return ast.literal_eval(value_str)
@@ -34,15 +37,15 @@ def safe_parse_value(value_str: str) -> Any:
 def safe_parse_color(value_str: str) -> tuple:
     """
     Parse a color value which could be an RGB tuple, color name, or hex code.
     Args:
         value_str: String representation of a color (e.g., "red", "#ff0000", "(1,0,0)")
     Returns:
         RGB tuple (r, g, b) with values from 0 to 1
     """
     value_str = value_str.strip()
     # Try parsing as a Python literal (for RGB tuples)
     try:
         # If it's already a valid tuple or list, parse it
@@ -60,7 +63,7 @@ def safe_parse_color(value_str: str) -> tuple:
         except (ValueError, AttributeError):
             # If color parsing fails, return a default (black)
             return (0, 0, 0)
     # If we got here with a non-tuple, return default
     return (0, 0, 0)
@@ -68,260 +71,78 @@ def safe_parse_color(value_str: str) -> tuple:
 def parse_selector(selector: str) -> Dict[str, Any]:
     """
     Parse a CSS-like selector string into a structured selector object.
     Examples:
     - 'text:contains("Revenue")'
     - 'table:below("Financial Data")'
     - 'rect[fill=(1,0,0)]'
     Args:
         selector: CSS-like selector string
     Returns:
         Dict representing the parsed selector
     """
     # Basic structure for result
     result = {
-        'type': 'any',  # Default to any element type
-        'filters': [],
-        'attributes': {},
-        'pseudo_classes': [],
+        "type": "any",  # Default to any element type
+        "filters": [],
+        "attributes": {},
+        "pseudo_classes": [],
     }
     # Check if empty or None
     if not selector or not isinstance(selector, str):
         return result
     # Parse element type
-    type_match = re.match(r'^([a-zA-Z_]+)', selector)
+    type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
     if type_match:
-        result['type'] = type_match.group(1).lower()
-        selector = selector[len(type_match.group(0)):]
+        result["type"] = type_match.group(1).lower()
+        selector = selector[len(type_match.group(0)) :]
     # Parse attributes (e.g., [color=(1,0,0)])
-    attr_pattern = r'\[([a-zA-Z_]+)(>=|<=|>|<|[*~]?=)([^\]]+)\]'
+    attr_pattern = r"\[([a-zA-Z_]+)(>=|<=|>|<|[*~]?=)([^\]]+)\]"
     attr_matches = re.findall(attr_pattern, selector)
     for name, op, value in attr_matches:
         # Handle special parsing for color attributes
-        if name in ['color', 'non_stroking_color', 'fill', 'stroke', 'strokeColor', 'fillColor']:
+        if name in ["color", "non_stroking_color", "fill", "stroke", "strokeColor", "fillColor"]:
             value = safe_parse_color(value)
         else:
             # Safe parsing for other attributes
             value = safe_parse_value(value)
         # Store attribute with operator
-        result['attributes'][name] = {
-            'op': op,
-            'value': value
-        }
+        result["attributes"][name] = {"op": op, "value": value}
     # Parse pseudo-classes (e.g., :contains("text"))
-    pseudo_pattern = r':([a-zA-Z_]+)(?:\(([^)]+)\))?'
+    pseudo_pattern = r":([a-zA-Z_]+)(?:\(([^)]+)\))?"
     pseudo_matches = re.findall(pseudo_pattern, selector)
     for name, args in pseudo_matches:
         # Process arguments
         processed_args = args
         if args:
-            if name in ['color', 'background']:
+            if name in ["color", "background"]:
                 processed_args = safe_parse_color(args)
             else:
                 processed_args = safe_parse_value(args)
-        result['pseudo_classes'].append({
-            'name': name,
-            'args': processed_args
-        })
-    return result
+        result["pseudo_classes"].append({"name": name, "args": processed_args})
-def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
-    """
-    Convert a parsed selector to a filter function.
-    Args:
-        selector: Parsed selector dictionary
-        **kwargs: Additional filter parameters including:
-                 - regex: Whether to use regex for text search
-                 - case: Whether to do case-sensitive text search
-    Returns:
-        Function that takes an element and returns True if it matches
-    """
-    def filter_func(element):
-        # Check element type
-        if selector['type'] != 'any':
-            # Special handling for 'text' type to match both 'text', 'char', and 'word'
-            if selector['type'] == 'text':
-                if element.type not in ['text', 'char', 'word']:
-                    return False
-            # Special handling for 'region' type to check for detected layout regions
-            elif selector['type'] == 'region':
-                # Check if this is a Region with region_type property
-                if not hasattr(element, 'region_type'):
-                    return False
-                # If 'type' attribute specified, it will be checked in the attributes section
-            # Check for Docling-specific types (section-header, etc.)
-            elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
-                # This is a direct match with a Docling region type
-                pass
-            # Otherwise, require exact match with the element's type attribute
-            elif not hasattr(element, 'type') or element.type != selector['type']:
-                return False
-        # Check attributes
-        for name, attr_info in selector['attributes'].items():
-            op = attr_info['op']
-            value = attr_info['value']
-            # Special case for fontname attribute - allow matching part of the name
-            if name == 'fontname' and op == '*=':
-                element_value = getattr(element, name, None)
-                if element_value is None or value.lower() not in element_value.lower():
-                    return False
-                continue
-            # Convert hyphenated attribute names to underscore for Python properties
-            python_name = name.replace('-', '_')
-            # Special case for region attributes
-            if selector['type'] == 'region':
-                if name == 'type':
-                    # Use normalized_type for comparison if available
-                    if hasattr(element, 'normalized_type') and element.normalized_type:
-                        element_value = element.normalized_type
-                    else:
-                        # Convert spaces to hyphens for consistency with the normalized format
-                        element_value = getattr(element, 'region_type', '').lower().replace(' ', '-')
-                elif name == 'model':
-                    # Special handling for model attribute in regions
-                    element_value = getattr(element, 'model', None)
-                else:
-                    # Get the attribute value from the element normally
-                    element_value = getattr(element, python_name, None)
-            else:
-                # Get the attribute value from the element normally for non-region elements
-                element_value = getattr(element, python_name, None)
-            if element_value is None:
-                return False
-            # Apply operator
-            if op == '=':
-                if element_value != value:
-                    return False
-            elif op == '~=':
-                # Approximate match (e.g., for colors)
-                if not _is_approximate_match(element_value, value):
-                    return False
-            elif op == '>=':
-                # Greater than or equal (element value must be >= specified value)
-                if not (isinstance(element_value, (int, float)) and
-                        isinstance(value, (int, float)) and
-                        element_value >= value):
-                    return False
-            elif op == '<=':
-                # Less than or equal (element value must be <= specified value)
-                if not (isinstance(element_value, (int, float)) and
-                        isinstance(value, (int, float)) and
-                        element_value <= value):
-                    return False
-            elif op == '>':
-                # Greater than (element value must be > specified value)
-                if not (isinstance(element_value, (int, float)) and
-                        isinstance(value, (int, float)) and
-                        element_value > value):
-                    return False
-            elif op == '<':
-                # Less than (element value must be < specified value)
-                if not (isinstance(element_value, (int, float)) and
-                        isinstance(value, (int, float)) and
-                        element_value < value):
-                    return False
-        # Check pseudo-classes
-        for pseudo in selector['pseudo_classes']:
-            name = pseudo['name']
-            args = pseudo['args']
-            # Handle various pseudo-classes
-            if name == 'contains' and hasattr(element, 'text'):
-                use_regex = kwargs.get('regex', False)
-                ignore_case = not kwargs.get('case', True)
-                if use_regex:
-                    import re
-                    if not element.text:
-                        return False
-                    try:
-                        pattern = re.compile(args, re.IGNORECASE if ignore_case else 0)
-                        if not pattern.search(element.text):
-                            return False
-                    except re.error:
-                        # If regex is invalid, fall back to literal text search
-                        element_text = element.text
-                        search_text = args
-                        if ignore_case:
-                            element_text = element_text.lower()
-                            search_text = search_text.lower()
-                        if search_text not in element_text:
-                            return False
-                else:
-                    # String comparison with case sensitivity option
-                    if not element.text:
-                        return False
-                    element_text = element.text
-                    search_text = args
-                    if ignore_case:
-                        element_text = element_text.lower()
-                        search_text = search_text.lower()
-                    if search_text not in element_text:
-                        return False
-            elif name == 'starts-with' and hasattr(element, 'text'):
-                if not element.text or not element.text.startswith(args):
-                    return False
-            elif name == 'ends-with' and hasattr(element, 'text'):
-                if not element.text or not element.text.endswith(args):
-                    return False
-            elif name == 'bold':
-                if not (hasattr(element, 'bold') and element.bold):
-                    return False
-            elif name == 'italic':
-                if not (hasattr(element, 'italic') and element.italic):
-                    return False
-            elif name == 'horizontal':
-                if not (hasattr(element, 'is_horizontal') and element.is_horizontal):
-                    return False
-            elif name == 'vertical':
-                if not (hasattr(element, 'is_vertical') and element.is_vertical):
-                    return False
-            else:
-                # Potentially unsupported pseudo-class, or one handled elsewhere (like :not)
-                pass
-        # If we get here, all checks passed
-        return True
-    return filter_func
+    return result
 def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
     """
     Check if two values approximately match.
     This is mainly used for color comparisons with some tolerance.
     Args:
         value1: First value
         value2: Second value
         tolerance: Maximum difference allowed
     Returns:
         True if the values approximately match
     """
@@ -331,157 +152,177 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
             value1 = tuple(Color(value1).rgb)
         except:
             pass
     if isinstance(value2, str):
         try:
             value2 = tuple(Color(value2).rgb)
         except:
             pass
     # If both are tuples/lists with the same length (e.g., colors)
-    if (isinstance(value1, (list, tuple)) and
-        isinstance(value2, (list, tuple)) and
-        len(value1) == len(value2)):
+    if (
+        isinstance(value1, (list, tuple))
+        and isinstance(value2, (list, tuple))
+        and len(value1) == len(value2)
+    ):
         # Check if all components are within tolerance
         return all(abs(a - b) <= tolerance for a, b in zip(value1, value2))
     # If both are numbers
     if isinstance(value1, (int, float)) and isinstance(value2, (int, float)):
         return abs(value1 - value2) <= tolerance
     # Default to exact match for other types
     return value1 == value2
 PSEUDO_CLASS_FUNCTIONS = {
-    'bold': lambda el: hasattr(el, 'bold') and el.bold,
-    'italic': lambda el: hasattr(el, 'italic') and el.italic,
-    'first-child': lambda el: hasattr(el, 'parent') and el.parent and el.parent.children[0] == el, # Example placeholder
-    'last-child': lambda el: hasattr(el, 'parent') and el.parent and el.parent.children[-1] == el, # Example placeholder
+    "bold": lambda el: hasattr(el, "bold") and el.bold,
+    "italic": lambda el: hasattr(el, "italic") and el.italic,
+    "first-child": lambda el: hasattr(el, "parent")
+    and el.parent
+    and el.parent.children[0] == el,  # Example placeholder
+    "last-child": lambda el: hasattr(el, "parent")
+    and el.parent
+    and el.parent.children[-1] == el,  # Example placeholder
     # Add the new pseudo-classes for negation
-    'not-bold': lambda el: hasattr(el, 'bold') and not el.bold,
-    'not-italic': lambda el: hasattr(el, 'italic') and not el.italic,
+    "not-bold": lambda el: hasattr(el, "bold") and not el.bold,
+    "not-italic": lambda el: hasattr(el, "italic") and not el.italic,
 }
 def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
     """
     Convert a parsed selector to a filter function.
     Args:
         selector: Parsed selector dictionary
         **kwargs: Additional filter parameters including:
                  - regex: Whether to use regex for text search
                  - case: Whether to do case-sensitive text search
     Returns:
         Function that takes an element and returns True if it matches
     """
     def filter_func(element):
         # Check element type
-        if selector['type'] != 'any':
+        if selector["type"] != "any":
             # Special handling for 'text' type to match both 'text', 'char', and 'word'
-            if selector['type'] == 'text':
-                if element.type not in ['text', 'char', 'word']:
+            if selector["type"] == "text":
+                if element.type not in ["text", "char", "word"]:
                     return False
             # Special handling for 'region' type to check for detected layout regions
-            elif selector['type'] == 'region':
+            elif selector["type"] == "region":
                 # Check if this is a Region with region_type property
-                if not hasattr(element, 'region_type'):
+                if not hasattr(element, "region_type"):
                     return False
                 # If 'type' attribute specified, it will be checked in the attributes section
             # Check for Docling-specific types (section-header, etc.)
-            elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
+            elif (
+                hasattr(element, "normalized_type") and element.normalized_type == selector["type"]
+            ):
                 # This is a direct match with a Docling region type
                 pass
             # Otherwise, require exact match with the element's type attribute
-            elif not hasattr(element, 'type') or element.type != selector['type']:
+            elif not hasattr(element, "type") or element.type != selector["type"]:
                 return False
         # Check attributes
-        for name, attr_info in selector['attributes'].items():
-            op = attr_info['op']
-            value = attr_info['value']
+        for name, attr_info in selector["attributes"].items():
+            op = attr_info["op"]
+            value = attr_info["value"]
             # Special case for fontname attribute - allow matching part of the name
-            if name == 'fontname' and op == '*=':
+            if name == "fontname" and op == "*=":
                 element_value = getattr(element, name, None)
                 if element_value is None or value.lower() not in element_value.lower():
                     return False
                 continue
             # Convert hyphenated attribute names to underscore for Python properties
-            python_name = name.replace('-', '_')
+            python_name = name.replace("-", "_")
             # Special case for region attributes
-            if selector['type'] == 'region':
-                if name == 'type':
+            if selector["type"] == "region":
+                if name == "type":
                     # Use normalized_type for comparison if available
-                    if hasattr(element, 'normalized_type') and element.normalized_type:
+                    if hasattr(element, "normalized_type") and element.normalized_type:
                         element_value = element.normalized_type
                     else:
                         # Convert spaces to hyphens for consistency with the normalized format
-                        element_value = getattr(element, 'region_type', '').lower().replace(' ', '-')
-                elif name == 'model':
+                        element_value = (
+                            getattr(element, "region_type", "").lower().replace(" ", "_")
+                        )
+                elif name == "model":
                     # Special handling for model attribute in regions
-                    element_value = getattr(element, 'model', None)
+                    element_value = getattr(element, "model", None)
                 else:
                     # Get the attribute value from the element normally
                     element_value = getattr(element, python_name, None)
             else:
                 # Get the attribute value from the element normally for non-region elements
                 element_value = getattr(element, python_name, None)
             if element_value is None:
                 return False
             # Apply operator
-            if op == '=':
+            if op == "=":
                 if element_value != value:
                     return False
-            elif op == '~=':
+            elif op == "~=":
                 # Approximate match (e.g., for colors)
                 if not _is_approximate_match(element_value, value):
                     return False
-            elif op == '>=':
+            elif op == ">=":
                 # Greater than or equal (element value must be >= specified value)
-                if not (isinstance(element_value, (int, float)) and
-                        isinstance(value, (int, float)) and
-                        element_value >= value):
+                if not (
+                    isinstance(element_value, (int, float))
+                    and isinstance(value, (int, float))
+                    and element_value >= value
+                ):
                     return False
-            elif op == '<=':
+            elif op == "<=":
                 # Less than or equal (element value must be <= specified value)
-                if not (isinstance(element_value, (int, float)) and
-                        isinstance(value, (int, float)) and
-                        element_value <= value):
+                if not (
+                    isinstance(element_value, (int, float))
+                    and isinstance(value, (int, float))
+                    and element_value <= value
+                ):
                     return False
-            elif op == '>':
+            elif op == ">":
                 # Greater than (element value must be > specified value)
-                if not (isinstance(element_value, (int, float)) and
-                        isinstance(value, (int, float)) and
-                        element_value > value):
+                if not (
+                    isinstance(element_value, (int, float))
+                    and isinstance(value, (int, float))
+                    and element_value > value
+                ):
                     return False
-            elif op == '<':
+            elif op == "<":
                 # Less than (element value must be < specified value)
-                if not (isinstance(element_value, (int, float)) and
-                        isinstance(value, (int, float)) and
-                        element_value < value):
+                if not (
+                    isinstance(element_value, (int, float))
+                    and isinstance(value, (int, float))
+                    and element_value < value
+                ):
                     return False
         # Check pseudo-classes
-        for pseudo in selector['pseudo_classes']:
-            name = pseudo['name']
-            args = pseudo['args']
+        for pseudo in selector["pseudo_classes"]:
+            name = pseudo["name"]
+            args = pseudo["args"]
             # Handle various pseudo-classes
-            if name == 'contains' and hasattr(element, 'text'):
-                use_regex = kwargs.get('regex', False)
-                ignore_case = not kwargs.get('case', True)
+            if name == "contains" and hasattr(element, "text"):
+                use_regex = kwargs.get("regex", False)
+                ignore_case = not kwargs.get("case", True)
                 if use_regex:
                     import re
                     if not element.text:
                         return False
                     try:
@@ -492,77 +333,79 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
                         # If regex is invalid, fall back to literal text search
                         element_text = element.text
                         search_text = args
                         if ignore_case:
                             element_text = element_text.lower()
                             search_text = search_text.lower()
                         if search_text not in element_text:
                             return False
                 else:
                     # String comparison with case sensitivity option
                     if not element.text:
                         return False
                     element_text = element.text
                     search_text = args
                     if ignore_case:
                         element_text = element_text.lower()
                         search_text = search_text.lower()
                     if search_text not in element_text:
                         return False
-            elif name == 'starts-with' and hasattr(element, 'text'):
+            elif name == "starts-with" and hasattr(element, "text"):
                 if not element.text or not element.text.startswith(args):
                     return False
-            elif name == 'ends-with' and hasattr(element, 'text'):
+            elif name == "ends-with" and hasattr(element, "text"):
                 if not element.text or not element.text.endswith(args):
                     return False
-            elif name == 'bold':
-                if not (hasattr(element, 'bold') and element.bold):
+            elif name == "bold":
+                if not (hasattr(element, "bold") and element.bold):
                     return False
-            elif name == 'italic':
-                if not (hasattr(element, 'italic') and element.italic):
+            elif name == "italic":
+                if not (hasattr(element, "italic") and element.italic):
                     return False
-            elif name == 'horizontal':
-                if not (hasattr(element, 'is_horizontal') and element.is_horizontal):
+            elif name == "horizontal":
+                if not (hasattr(element, "is_horizontal") and element.is_horizontal):
                     return False
-            elif name == 'vertical':
-                if not (hasattr(element, 'is_vertical') and element.is_vertical):
+            elif name == "vertical":
+                if not (hasattr(element, "is_vertical") and element.is_vertical):
                     return False
             else:
                 # Check pseudo-classes (basic ones like :bold, :italic)
                 if name in PSEUDO_CLASS_FUNCTIONS:
                     if not PSEUDO_CLASS_FUNCTIONS[name](element):
                         return False
-                elif name == 'contains':
-                    if not hasattr(element, 'text') or not element.text:
+                elif name == "contains":
+                    if not hasattr(element, "text") or not element.text:
                         return False
                     text_to_check = element.text
                     search_term = args
-                    if not kwargs.get('case', True): # Check case flag from kwargs
+                    if not kwargs.get("case", True):  # Check case flag from kwargs
                         text_to_check = text_to_check.lower()
                         search_term = search_term.lower()
-                    if kwargs.get('regex', False): # Check regex flag from kwargs
+                    if kwargs.get("regex", False):  # Check regex flag from kwargs
                         try:
                             if not re.search(search_term, text_to_check):
                                 return False
                         except re.error as e:
-                             logger.warning(f"Invalid regex in :contains selector '{search_term}': {e}")
-                             return False # Invalid regex cannot match
+                            logger.warning(
+                                f"Invalid regex in :contains selector '{search_term}': {e}"
+                            )
+                            return False  # Invalid regex cannot match
                     else:
                         if search_term not in text_to_check:
                             return False
                 # Skip complex pseudo-classes like :near, :above here, handled later
-                elif name in ('above', 'below', 'near', 'left-of', 'right-of'):
-                    pass # Handled separately after initial filtering
+                elif name in ("above", "below", "near", "left-of", "right-of"):
+                    pass  # Handled separately after initial filtering
                 else:
-                     # Optionally log unknown pseudo-classes
-                     # logger.warning(f"Unknown pseudo-class: {name}")
-                     pass
-        return True # Element passes all attribute and simple pseudo-class filters
+                    # Optionally log unknown pseudo-classes
+                    # logger.warning(f"Unknown pseudo-class: {name}")
+                    pass
-    return filter_func
+        return True  # Element passes all attribute and simple pseudo-class filters
+    return filter_func

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl