PyPI - natural-pdf - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl - Mend

natural-pdf 0.1.7-py3-none-any.whl → 0.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134)

natural_pdf/__init__.py +3 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +422 -0
natural_pdf/classification/mixin.py +163 -0
natural_pdf/classification/results.py +80 -0
natural_pdf/collections/mixins.py +111 -0
natural_pdf/collections/pdf_collection.py +434 -15
natural_pdf/core/element_manager.py +83 -0
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +578 -93
natural_pdf/core/pdf.py +912 -460
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +712 -109
natural_pdf/elements/region.py +722 -69
natural_pdf/elements/text.py +4 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +5 -4
natural_pdf/extraction/manager.py +135 -0
natural_pdf/extraction/mixin.py +279 -0
natural_pdf/extraction/result.py +23 -0
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/engine_easyocr.py +6 -3
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +122 -26
natural_pdf/ocr/ocr_options.py +94 -11
natural_pdf/ocr/utils.py +19 -6
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +431 -230
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +60 -1
natural_pdf/utils/tqdm_utils.py +51 -0
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
natural_pdf-0.1.9.dist-info/RECORD +80 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -915
docs/element-selection/index.md +0 -229
docs/finetuning/index.md +0 -176
docs/index.md +0 -170
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -209
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -194
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -340
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -147
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -114
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -270
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -332
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -288
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -413
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -508
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2434
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -512
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -604
docs/tutorials/12-ocr-integration.md +0 -175
docs/tutorials/13-semantic-search.ipynb +0 -1328
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.7.dist-info/RECORD +0 -145
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0

natural_pdf/selectors/parser.py CHANGED Viewed

@@ -3,11 +3,14 @@ CSS-like selector parser for natural-pdf.
 """
 import ast
+import logging
 import re
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from colour import Color
+logger = logging.getLogger(__name__)
 def safe_parse_value(value_str: str) -> Any:
     """
@@ -72,10 +75,11 @@ def parse_selector(selector: str) -> Dict[str, Any]:
     """
     Parse a CSS-like selector string into a structured selector object.
-    Examples:
-    - 'text:contains("Revenue")'
-    - 'table:below("Financial Data")'
-    - 'rect[fill=(1,0,0)]'
+    Handles:
+    - Element types (e.g., 'text', 'rect')
+    - Attribute presence (e.g., '[data-id]')
+    - Attribute value checks with various operators (e.g., '[count=5]', '[name*="bold"]'')
+    - Pseudo-classes (e.g., ':contains("Total")', ':empty', ':not(...)')
     Args:
         selector: CSS-like selector string
@@ -83,51 +87,154 @@ def parse_selector(selector: str) -> Dict[str, Any]:
     Returns:
         Dict representing the parsed selector
     """
-    # Basic structure for result
     result = {
-        "type": "any",  # Default to any element type
-        "filters": [],
-        "attributes": {},
+        "type": "any",
+        "attributes": [],
         "pseudo_classes": [],
+        "filters": [],  # Keep this for potential future use
     }
-    # Check if empty or None
+    original_selector_for_error = selector  # Keep for error messages
     if not selector or not isinstance(selector, str):
         return result
-    # Parse element type
-    type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
-    if type_match:
-        result["type"] = type_match.group(1).lower()
-        selector = selector[len(type_match.group(0)) :]
-    # Parse attributes (e.g., [color=(1,0,0)])
-    attr_pattern = r"\[([a-zA-Z_]+)(>=|<=|>|<|[*~]?=)([^\]]+)\]"
-    attr_matches = re.findall(attr_pattern, selector)
-    for name, op, value in attr_matches:
-        # Handle special parsing for color attributes
-        if name in ["color", "non_stroking_color", "fill", "stroke", "strokeColor", "fillColor"]:
-            value = safe_parse_color(value)
-        else:
-            # Safe parsing for other attributes
-            value = safe_parse_value(value)
-        # Store attribute with operator
-        result["attributes"][name] = {"op": op, "value": value}
-    # Parse pseudo-classes (e.g., :contains("text"))
-    pseudo_pattern = r":([a-zA-Z_]+)(?:\(([^)]+)\))?"
-    pseudo_matches = re.findall(pseudo_pattern, selector)
-    for name, args in pseudo_matches:
-        # Process arguments
-        processed_args = args
-        if args:
-            if name in ["color", "background"]:
-                processed_args = safe_parse_color(args)
+    selector = selector.strip()
+    # --- Handle wildcard selector explicitly ---
+    if selector == "*":
+        # Wildcard matches any type, already the default.
+        # Clear selector so the loop doesn't run and error out.
+        selector = ""
+    # --- END NEW ---
+    # 1. Extract type (optional, at the beginning)
+    # Only run if selector wasn't '*'
+    if selector:
+        type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
+        if type_match:
+            result["type"] = type_match.group(1).lower()
+            selector = selector[len(type_match.group(0)) :].strip()
+    # Only run if selector wasn't '*'
+    if selector:
+        type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
+        if type_match:
+            result["type"] = type_match.group(1).lower()
+            selector = selector[len(type_match.group(0)) :].strip()
+    # Regexes for parts at the START of the remaining string
+    # Attribute: Starts with [, ends with ], content is non-greedy non-] chars
+    attr_pattern = re.compile(r"^\[\s*([^\s\]]+.*?)\s*\]")
+    # Pseudo: Starts with :, name is letters/hyphen/underscore, optionally followed by (...)
+    pseudo_pattern = re.compile(r"^:([a-zA-Z_\-]+)(?:\((.*?)\))?")
+    # :not() specifically requires careful parenthesis matching later
+    not_pseudo_prefix = ":not("
+    # 2. Iteratively parse attributes and pseudo-classes
+    while selector:
+        processed_chunk = False
+        # Check for attribute block `[...]`
+        attr_match = attr_pattern.match(selector)
+        if attr_match:
+            block_content = attr_match.group(1).strip()
+            # Parse the content inside the block
+            # Pattern: name, optional op, optional value
+            detail_match = re.match(
+                r"^([a-zA-Z0-9_\-]+)\s*(?:(>=|<=|>|<|!=|[\*\~\^\$]?=)\s*(.*?))?$", block_content
+            )
+            if not detail_match:
+                raise ValueError(
+                    f"Invalid attribute syntax inside block: '[{block_content}]'. Full selector: '{original_selector_for_error}'"
+                )
+            name, op, value_str = detail_match.groups()
+            if op is None:
+                # Presence selector [attr]
+                result["attributes"].append({"name": name, "op": "exists", "value": None})
             else:
-                processed_args = safe_parse_value(args)
+                # Operator exists, value must also exist (even if empty via quotes)
+                if value_str is None:  # Catches invalid [attr=]
+                    raise ValueError(
+                        f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
+                    )
+                # Parse value
+                parsed_value: Any
+                if name in [
+                    "color",
+                    "non_stroking_color",
+                    "fill",
+                    "stroke",
+                    "strokeColor",
+                    "fillColor",
+                ]:
+                    parsed_value = safe_parse_color(value_str)
+                else:
+                    parsed_value = safe_parse_value(value_str)  # Handles quotes
+                result["attributes"].append({"name": name, "op": op, "value": parsed_value})
+            selector = selector[attr_match.end() :].strip()
+            processed_chunk = True
+            continue
+        # Check for :not(...) block
+        if selector.lower().startswith(not_pseudo_prefix):
+            start_index = len(not_pseudo_prefix) - 1  # Index of '('
+            nesting = 1
+            end_index = -1
+            for i in range(start_index + 1, len(selector)):
+                if selector[i] == "(":
+                    nesting += 1
+                elif selector[i] == ")":
+                    nesting -= 1
+                    if nesting == 0:
+                        end_index = i
+                        break
+            if end_index == -1:
+                raise ValueError(
+                    f"Mismatched parenthesis in :not() selector near '{selector}'. Full selector: '{original_selector_for_error}'"
+                )
+            inner_selector_str = selector[start_index + 1 : end_index].strip()
+            if not inner_selector_str:
+                raise ValueError(
+                    f"Empty selector inside :not(). Full selector: '{original_selector_for_error}'"
+                )
+            # Recursively parse the inner selector
+            parsed_inner_selector = parse_selector(inner_selector_str)
+            result["pseudo_classes"].append({"name": "not", "args": parsed_inner_selector})
+            selector = selector[end_index + 1 :].strip()
+            processed_chunk = True
+            continue
+        # Check for other pseudo-class blocks `:name` or `:name(...)`
+        pseudo_match = pseudo_pattern.match(selector)
+        if pseudo_match:
+            name, args_str = pseudo_match.groups()
+            name = name.lower()  # Normalize pseudo-class name
+            processed_args = args_str  # Keep as string initially, or None
+            if args_str is not None:
+                # Only parse args if they exist and based on the pseudo-class type
+                if name in ["color", "background"]:
+                    processed_args = safe_parse_color(args_str)
+                else:
+                    processed_args = safe_parse_value(args_str)
+            # else: args remain None
-        result["pseudo_classes"].append({"name": name, "args": processed_args})
+            result["pseudo_classes"].append({"name": name, "args": processed_args})
+            selector = selector[pseudo_match.end() :].strip()
+            processed_chunk = True
+            continue
+        # If we reach here and the selector string is not empty, something is wrong
+        if not processed_chunk and selector:
+            raise ValueError(
+                f"Invalid or unexpected syntax near '{selector[:30]}...'. Full selector: '{original_selector_for_error}'"
+            )
     return result
@@ -180,21 +287,18 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
 PSEUDO_CLASS_FUNCTIONS = {
     "bold": lambda el: hasattr(el, "bold") and el.bold,
     "italic": lambda el: hasattr(el, "italic") and el.italic,
-    "first-child": lambda el: hasattr(el, "parent")
-    and el.parent
-    and el.parent.children[0] == el,  # Example placeholder
-    "last-child": lambda el: hasattr(el, "parent")
-    and el.parent
-    and el.parent.children[-1] == el,  # Example placeholder
-    # Add the new pseudo-classes for negation
+    "first-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[0] == el,
+    "last-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[-1] == el,
+    "empty": lambda el: not el.text,
+    "not-empty": lambda el: el.text,
     "not-bold": lambda el: hasattr(el, "bold") and not el.bold,
     "not-italic": lambda el: hasattr(el, "italic") and not el.italic,
 }
-def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
+def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any]]:
     """
-    Convert a parsed selector to a filter function.
+    Convert a parsed selector to a list of named filter functions.
     Args:
         selector: Parsed selector dictionary
@@ -203,209 +307,306 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
                  - case: Whether to do case-sensitive text search
     Returns:
-        Function that takes an element and returns True if it matches
+        List of dictionaries, each with 'name' (str) and 'func' (callable).
+        The callable takes an element and returns True if it matches the specific filter.
     """
-    def filter_func(element):
-        # Check element type
-        if selector["type"] != "any":
-            # Special handling for 'text' type to match both 'text', 'char', and 'word'
-            if selector["type"] == "text":
-                if element.type not in ["text", "char", "word"]:
-                    return False
-            # Special handling for 'region' type to check for detected layout regions
-            elif selector["type"] == "region":
-                # Check if this is a Region with region_type property
-                if not hasattr(element, "region_type"):
-                    return False
-                # If 'type' attribute specified, it will be checked in the attributes section
-            # Check for Docling-specific types (section-header, etc.)
-            elif (
-                hasattr(element, "normalized_type") and element.normalized_type == selector["type"]
-            ):
-                # This is a direct match with a Docling region type
-                pass
-            # Otherwise, require exact match with the element's type attribute
-            elif not hasattr(element, "type") or element.type != selector["type"]:
-                return False
-        # Check attributes
-        for name, attr_info in selector["attributes"].items():
-            op = attr_info["op"]
-            value = attr_info["value"]
-            # Special case for fontname attribute - allow matching part of the name
-            if name == "fontname" and op == "*=":
-                element_value = getattr(element, name, None)
-                if element_value is None or value.lower() not in element_value.lower():
-                    return False
-                continue
-            # Convert hyphenated attribute names to underscore for Python properties
-            python_name = name.replace("-", "_")
+    filters: List[Dict[str, Any]] = []
+    selector_type = selector["type"]
+    # Filter by element type
+    if selector_type != "any":
+        filter_name = f"type is '{selector_type}'"
+        if selector_type == "text":
+            filter_name = "type is 'text', 'char', or 'word'"
+            func = lambda el: hasattr(el, "type") and el.type in ["text", "char", "word"]
+        elif selector_type == "region":
+            filter_name = "type is 'region' (has region_type)"
+            # Note: Specific region type attribute (e.g., [type=table]) is checked below
+            func = lambda el: hasattr(el, "region_type")
+        else:
+            # Check against normalized_type first, then element.type
+            func = lambda el: (
+                hasattr(el, "normalized_type") and el.normalized_type == selector_type
+            ) or (
+                not hasattr(
+                    el, "normalized_type"
+                )  # Only check element.type if normalized_type doesn't exist/match
+                and hasattr(el, "type")
+                and el.type == selector_type
+            )
+        filters.append({"name": filter_name, "func": func})
+    # Filter by attributes
+    for attr_filter in selector["attributes"]:
+        name = attr_filter["name"]
+        op = attr_filter["op"]
+        value = attr_filter["value"]
+        python_name = name.replace("-", "_")  # Convert CSS-style names
+        # --- Define the core value retrieval logic ---
+        def get_element_value(
+            element, name=name, python_name=python_name, selector_type=selector_type
+        ):
+            bbox_mapping = {"x0": 0, "y0": 1, "x1": 2, "y1": 3}
+            if name in bbox_mapping:
+                bbox = getattr(element, "_bbox", None) or getattr(element, "bbox", None)
+                return bbox[bbox_mapping[name]]
             # Special case for region attributes
-            if selector["type"] == "region":
+            if selector_type == "region":
                 if name == "type":
-                    # Use normalized_type for comparison if available
                     if hasattr(element, "normalized_type") and element.normalized_type:
-                        element_value = element.normalized_type
+                        return element.normalized_type
                     else:
-                        # Convert spaces to hyphens for consistency with the normalized format
-                        element_value = (
-                            getattr(element, "region_type", "").lower().replace(" ", "_")
-                        )
+                        return getattr(element, "region_type", "").lower().replace(" ", "_")
                 elif name == "model":
-                    # Special handling for model attribute in regions
-                    element_value = getattr(element, "model", None)
+                    return getattr(element, "model", None)
                 else:
-                    # Get the attribute value from the element normally
-                    element_value = getattr(element, python_name, None)
+                    return getattr(element, python_name, None)
             else:
-                # Get the attribute value from the element normally for non-region elements
-                element_value = getattr(element, python_name, None)
-            if element_value is None:
-                return False
+                # General case for non-region elements
+                return getattr(element, python_name, None)
+        # --- Define the comparison function or direct check ---
+        filter_lambda: Callable[[Any], bool]
+        filter_name: str
+        if op == "exists":
+            # Special handling for attribute presence check [attr]
+            filter_name = f"attribute [{name} exists]"
+            # Lambda checks that the retrieved value is not None
+            filter_lambda = lambda el, get_val=get_element_value: get_val(el) is not None
+        else:
+            # Handle operators with values (e.g., =, !=, *=, etc.)
+            compare_func: Callable[[Any, Any], bool]
+            op_desc = f"{op} {value!r}"  # Default description
-            # Apply operator
+            # Determine compare_func based on op (reuse existing logic)
             if op == "=":
-                if element_value != value:
-                    return False
-            elif op == "~=":
-                # Approximate match (e.g., for colors)
-                if not _is_approximate_match(element_value, value):
-                    return False
+                compare_func = lambda el_val, sel_val: el_val == sel_val
+            elif op == "!=":
+                compare_func = lambda el_val, sel_val: el_val != sel_val
+            elif op == "~":
+                op_desc = f"~= {value!r} (approx)"
+                compare_func = lambda el_val, sel_val: _is_approximate_match(el_val, sel_val)
+            elif op == "^=":
+                compare_func = (
+                    lambda el_val, sel_val: isinstance(el_val, str)
+                    and isinstance(sel_val, str)
+                    and el_val.startswith(sel_val)
+                )
+            elif op == "$=":
+                compare_func = (
+                    lambda el_val, sel_val: isinstance(el_val, str)
+                    and isinstance(sel_val, str)
+                    and el_val.endswith(sel_val)
+                )
+            elif op == "*=":
+                if name == "fontname":
+                    op_desc = f"*= {value!r} (contains, case-insensitive)"
+                    compare_func = (
+                        lambda el_val, sel_val: isinstance(el_val, str)
+                        and isinstance(sel_val, str)
+                        and sel_val.lower() in el_val.lower()
+                    )
+                else:
+                    op_desc = f"*= {value!r} (contains)"
+                    compare_func = (
+                        lambda el_val, sel_val: isinstance(el_val, str)
+                        and isinstance(sel_val, str)
+                        and sel_val in el_val
+                    )
             elif op == ">=":
-                # Greater than or equal (element value must be >= specified value)
-                if not (
-                    isinstance(element_value, (int, float))
-                    and isinstance(value, (int, float))
-                    and element_value >= value
-                ):
-                    return False
+                compare_func = (
+                    lambda el_val, sel_val: isinstance(el_val, (int, float))
+                    and isinstance(sel_val, (int, float))
+                    and el_val >= sel_val
+                )
             elif op == "<=":
-                # Less than or equal (element value must be <= specified value)
-                if not (
-                    isinstance(element_value, (int, float))
-                    and isinstance(value, (int, float))
-                    and element_value <= value
-                ):
-                    return False
+                compare_func = (
+                    lambda el_val, sel_val: isinstance(el_val, (int, float))
+                    and isinstance(sel_val, (int, float))
+                    and el_val <= sel_val
+                )
             elif op == ">":
-                # Greater than (element value must be > specified value)
-                if not (
-                    isinstance(element_value, (int, float))
-                    and isinstance(value, (int, float))
-                    and element_value > value
-                ):
-                    return False
+                compare_func = (
+                    lambda el_val, sel_val: isinstance(el_val, (int, float))
+                    and isinstance(sel_val, (int, float))
+                    and el_val > sel_val
+                )
             elif op == "<":
-                # Less than (element value must be < specified value)
-                if not (
-                    isinstance(element_value, (int, float))
-                    and isinstance(value, (int, float))
-                    and element_value < value
-                ):
-                    return False
-        # Check pseudo-classes
-        for pseudo in selector["pseudo_classes"]:
-            name = pseudo["name"]
-            args = pseudo["args"]
-            # Handle various pseudo-classes
-            if name == "contains" and hasattr(element, "text"):
-                use_regex = kwargs.get("regex", False)
-                ignore_case = not kwargs.get("case", True)
+                compare_func = (
+                    lambda el_val, sel_val: isinstance(el_val, (int, float))
+                    and isinstance(sel_val, (int, float))
+                    and el_val < sel_val
+                )
+            else:
+                # Should not happen with current parsing logic
+                logger.warning(
+                    f"Unsupported operator '{op}' encountered during filter building for attribute '{name}'"
+                )
+                continue  # Skip this attribute filter
+            # --- Create the final filter function for operators with values ---
+            filter_name = f"attribute [{name}{op_desc}]"
+            # Capture loop variables correctly in the lambda
+            filter_lambda = (
+                lambda el, get_val=get_element_value, compare=compare_func, expected_val=value: (
+                    element_value := get_val(el)
+                )
+                is not None
+                and compare(element_value, expected_val)
+            )
+        filters.append({"name": filter_name, "func": filter_lambda})
+    # Filter by pseudo-classes
+    for pseudo in selector["pseudo_classes"]:
+        name = pseudo["name"]
+        args = pseudo["args"]
+        filter_lambda = None
+        # Start with a base name, modify for specifics like :not
+        filter_name = f"pseudo-class :{name}"
+        # Relational pseudo-classes are handled separately by the caller
+        if name in ("above", "below", "near", "left-of", "right-of"):
+            continue
+        # --- Handle :not() ---
+        elif name == "not":
+            if not isinstance(args, dict):  # args should be the parsed inner selector
+                logger.error(f"Invalid arguments for :not pseudo-class: {args}")
+                raise TypeError(
+                    "Internal error: :not pseudo-class requires a parsed selector dictionary as args."
+                )
+            # Recursively get the filter function for the inner selector
+            # Pass kwargs down in case regex/case flags affect the inner selector
+            inner_filter_func = selector_to_filter_func(args, **kwargs)
+            # The filter lambda applies the inner function and inverts the result
+            filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
+            # Try to create a descriptive name (can be long)
+            # Maybe simplify this later if needed
+            inner_filter_list = _build_filter_list(args, **kwargs)
+            inner_filter_names = ", ".join([f["name"] for f in inner_filter_list])
+            filter_name = f"pseudo-class :not({inner_filter_names})"
+        # --- Handle text-based pseudo-classes ---
+        elif name == "contains" and args is not None:
+            use_regex = kwargs.get("regex", False)
+            ignore_case = not kwargs.get("case", True)  # Default case sensitive
+            filter_name = (
+                f"pseudo-class :contains({args!r}, regex={use_regex}, ignore_case={ignore_case})"
+            )
+            def contains_check(element, args=args, use_regex=use_regex, ignore_case=ignore_case):
+                if not hasattr(element, "text") or not element.text:
+                    return False  # Element must have non-empty text
+                element_text = element.text
+                search_term = str(args)  # Ensure args is string
                 if use_regex:
-                    import re
-                    if not element.text:
-                        return False
                     try:
-                        pattern = re.compile(args, re.IGNORECASE if ignore_case else 0)
-                        if not pattern.search(element.text):
-                            return False
-                    except re.error:
-                        # If regex is invalid, fall back to literal text search
-                        element_text = element.text
-                        search_text = args
+                        pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
+                        return bool(pattern.search(element_text))
+                    except re.error as e:
+                        logger.warning(
+                            f"Invalid regex '{search_term}' in :contains selector: {e}. Falling back to literal search."
+                        )
+                        # Fallback to literal search on regex error
                         if ignore_case:
-                            element_text = element_text.lower()
-                            search_text = search_text.lower()
+                            return search_term.lower() in element_text.lower()
+                        else:
+                            return search_term in element_text
+                else:  # Literal search
+                    if ignore_case:
+                        return search_term.lower() in element_text.lower()
+                    else:
+                        return search_term in element_text
+            filter_lambda = contains_check
+        elif name == "starts-with" and args is not None:
+            filter_lambda = (
+                lambda el, arg=args: hasattr(el, "text")
+                and el.text
+                and el.text.startswith(str(arg))
+            )
+        elif name == "ends-with" and args is not None:
+            filter_lambda = (
+                lambda el, arg=args: hasattr(el, "text") and el.text and el.text.endswith(str(arg))
+            )
+        # Boolean attribute pseudo-classes
+        elif name == "bold":
+            filter_lambda = lambda el: hasattr(el, "bold") and el.bold
+        elif name == "italic":
+            filter_lambda = lambda el: hasattr(el, "italic") and el.italic
+        elif name == "horizontal":
+            filter_lambda = lambda el: hasattr(el, "is_horizontal") and el.is_horizontal
+        elif name == "vertical":
+            filter_lambda = lambda el: hasattr(el, "is_vertical") and el.is_vertical
+        # Check predefined lambda functions (e.g., :first-child, :empty)
+        elif name in PSEUDO_CLASS_FUNCTIONS:
+            filter_lambda = PSEUDO_CLASS_FUNCTIONS[name]
+            filter_name = f"pseudo-class :{name}"  # Set name for predefined ones
+        else:
+            raise ValueError(f"Unknown or unsupported pseudo-class: ':{name}'")
-                        if search_text not in element_text:
-                            return False
-                else:
-                    # String comparison with case sensitivity option
-                    if not element.text:
-                        return False
+        if filter_lambda:
+            # Use the potentially updated filter_name
+            filters.append({"name": filter_name, "func": filter_lambda})
-                    element_text = element.text
-                    search_text = args
+    return filters
-                    if ignore_case:
-                        element_text = element_text.lower()
-                        search_text = search_text.lower()
-                    if search_text not in element_text:
-                        return False
-            elif name == "starts-with" and hasattr(element, "text"):
-                if not element.text or not element.text.startswith(args):
-                    return False
-            elif name == "ends-with" and hasattr(element, "text"):
-                if not element.text or not element.text.endswith(args):
-                    return False
-            elif name == "bold":
-                if not (hasattr(element, "bold") and element.bold):
-                    return False
-            elif name == "italic":
-                if not (hasattr(element, "italic") and element.italic):
-                    return False
-            elif name == "horizontal":
-                if not (hasattr(element, "is_horizontal") and element.is_horizontal):
-                    return False
-            elif name == "vertical":
-                if not (hasattr(element, "is_vertical") and element.is_vertical):
+def _assemble_filter_func(filters: List[Dict[str, Any]]) -> Callable[[Any], bool]:
+    """
+    Combine a list of named filter functions into a single callable.
+    Args:
+        filters: List of dictionaries, each with 'name' and 'func'.
+    Returns:
+        A single function that takes an element and returns True only if
+        it passes ALL filters in the list.
+    """
+    def combined_filter(element):
+        for f in filters:
+            try:
+                if not f["func"](element):
                     return False
-            else:
-                # Check pseudo-classes (basic ones like :bold, :italic)
-                if name in PSEUDO_CLASS_FUNCTIONS:
-                    if not PSEUDO_CLASS_FUNCTIONS[name](element):
-                        return False
-                elif name == "contains":
-                    if not hasattr(element, "text") or not element.text:
-                        return False
-                    text_to_check = element.text
-                    search_term = args
-                    if not kwargs.get("case", True):  # Check case flag from kwargs
-                        text_to_check = text_to_check.lower()
-                        search_term = search_term.lower()
-                    if kwargs.get("regex", False):  # Check regex flag from kwargs
-                        try:
-                            if not re.search(search_term, text_to_check):
-                                return False
-                        except re.error as e:
-                            logger.warning(
-                                f"Invalid regex in :contains selector '{search_term}': {e}"
-                            )
-                            return False  # Invalid regex cannot match
-                    else:
-                        if search_term not in text_to_check:
-                            return False
-                # Skip complex pseudo-classes like :near, :above here, handled later
-                elif name in ("above", "below", "near", "left-of", "right-of"):
-                    pass  # Handled separately after initial filtering
-                else:
-                    # Optionally log unknown pseudo-classes
-                    # logger.warning(f"Unknown pseudo-class: {name}")
-                    pass
+            except Exception as e:
+                logger.error(f"Error applying filter '{f['name']}' to element: {e}", exc_info=True)
+                return False  # Treat errors as filter failures
+        return True
+    return combined_filter
+def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any], bool]:
+    """
+    Convert a parsed selector to a single filter function.
+    Internally, this builds a list of individual filters and then combines them.
+    To inspect the individual filters, call `_build_filter_list` directly.
+    Args:
+        selector: Parsed selector dictionary
+        **kwargs: Additional filter parameters (e.g., regex, case).
+    Returns:
+        Function that takes an element and returns True if it matches the selector.
+    """
+    filter_list = _build_filter_list(selector, **kwargs)
-        return True  # Element passes all attribute and simple pseudo-class filters
+    if logger.isEnabledFor(logging.DEBUG):
+        filter_names = [f["name"] for f in filter_list]
+        logger.debug(f"Assembling filters for selector {selector}: {filter_names}")
-    return filter_func
+    return _assemble_filter_func(filter_list)

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

natural-pdf 0.1.7py3-none-any.whl → 0.1.9py3-none-any.whl