PyPI - natural-pdf - Versions diffs - 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl - Mend

natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

natural_pdf/__init__.py +1 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +241 -158
natural_pdf/classification/mixin.py +52 -38
natural_pdf/classification/results.py +71 -45
natural_pdf/collections/mixins.py +85 -20
natural_pdf/collections/pdf_collection.py +245 -100
natural_pdf/core/element_manager.py +30 -14
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +423 -101
natural_pdf/core/pdf.py +694 -195
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +610 -134
natural_pdf/elements/region.py +659 -90
natural_pdf/elements/text.py +1 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +4 -3
natural_pdf/extraction/manager.py +50 -49
natural_pdf/extraction/mixin.py +90 -57
natural_pdf/extraction/result.py +9 -23
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +61 -25
natural_pdf/ocr/ocr_options.py +70 -10
natural_pdf/ocr/utils.py +6 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +219 -143
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +1 -1
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +24 -16
natural_pdf/utils/tqdm_utils.py +18 -10
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
natural_pdf-0.1.10.dist-info/RECORD +80 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/categorizing-documents/index.md +0 -168
docs/data-extraction/index.md +0 -87
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -969
docs/element-selection/index.md +0 -249
docs/finetuning/index.md +0 -176
docs/index.md +0 -189
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -256
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -417
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -152
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -119
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -275
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -337
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -293
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -414
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -513
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2439
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -517
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -3712
docs/tutorials/12-ocr-integration.md +0 -137
docs/tutorials/13-semantic-search.ipynb +0 -1718
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.8.dist-info/RECORD +0 -156
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0

natural_pdf/selectors/parser.py CHANGED Viewed

@@ -3,11 +3,11 @@ CSS-like selector parser for natural-pdf.
 """
 import ast
+import logging
 import re
-from typing import Any, Dict, List, Optional, Tuple, Union, Callable
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from colour import Color
-import logging
 logger = logging.getLogger(__name__)
@@ -89,31 +89,37 @@ def parse_selector(selector: str) -> Dict[str, Any]:
     """
     result = {
         "type": "any",
-        "attributes": {},
+        "attributes": [],
         "pseudo_classes": [],
-        "filters": [], # Keep this for potential future use
+        "filters": [],  # Keep this for potential future use
     }
-    original_selector_for_error = selector # Keep for error messages
+    original_selector_for_error = selector  # Keep for error messages
     if not selector or not isinstance(selector, str):
         return result
     selector = selector.strip()
-    # --- NEW: Handle wildcard selector explicitly ---
+    # --- Handle wildcard selector explicitly ---
     if selector == "*":
         # Wildcard matches any type, already the default.
         # Clear selector so the loop doesn't run and error out.
-        selector = ""
+        selector = ""
     # --- END NEW ---
     # 1. Extract type (optional, at the beginning)
     # Only run if selector wasn't '*'
-    if selector:
+    if selector:
+        type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
+        if type_match:
+            result["type"] = type_match.group(1).lower()
+            selector = selector[len(type_match.group(0)) :].strip()
+    # Only run if selector wasn't '*'
+    if selector:
         type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
         if type_match:
             result["type"] = type_match.group(1).lower()
-            selector = selector[len(type_match.group(0)):].strip()
+            selector = selector[len(type_match.group(0)) :].strip()
     # Regexes for parts at the START of the remaining string
     # Attribute: Starts with [, ends with ], content is non-greedy non-] chars
@@ -133,58 +139,74 @@ def parse_selector(selector: str) -> Dict[str, Any]:
             block_content = attr_match.group(1).strip()
             # Parse the content inside the block
             # Pattern: name, optional op, optional value
-            detail_match = re.match(r"^([a-zA-Z_\-]+)\s*(?:(>=|<=|>|<|!=|[\*\~\^\$]?=)\s*(.*?))?$", block_content)
+            detail_match = re.match(
+                r"^([a-zA-Z0-9_\-]+)\s*(?:(>=|<=|>|<|!=|[\*\~\^\$]?=)\s*(.*?))?$", block_content
+            )
             if not detail_match:
-                raise ValueError(f"Invalid attribute syntax inside block: '[{block_content}]'. Full selector: '{original_selector_for_error}'")
+                raise ValueError(
+                    f"Invalid attribute syntax inside block: '[{block_content}]'. Full selector: '{original_selector_for_error}'"
+                )
             name, op, value_str = detail_match.groups()
             if op is None:
-                 # Presence selector [attr]
-                 result["attributes"][name] = {"op": "exists", "value": None}
+                # Presence selector [attr]
+                result["attributes"].append({"name": name, "op": "exists", "value": None})
             else:
-                 # Operator exists, value must also exist (even if empty via quotes)
-                 if value_str is None: # Catches invalid [attr=]
-                     raise ValueError(
-                         f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
-                     )
-                 # Parse value
-                 parsed_value: Any
-                 if name in ["color", "non_stroking_color", "fill", "stroke", "strokeColor", "fillColor"]:
-                     parsed_value = safe_parse_color(value_str)
-                 else:
-                     parsed_value = safe_parse_value(value_str) # Handles quotes
-                 result["attributes"][name] = {"op": op, "value": parsed_value}
-            selector = selector[attr_match.end():].strip()
+                # Operator exists, value must also exist (even if empty via quotes)
+                if value_str is None:  # Catches invalid [attr=]
+                    raise ValueError(
+                        f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
+                    )
+                # Parse value
+                parsed_value: Any
+                if name in [
+                    "color",
+                    "non_stroking_color",
+                    "fill",
+                    "stroke",
+                    "strokeColor",
+                    "fillColor",
+                ]:
+                    parsed_value = safe_parse_color(value_str)
+                else:
+                    parsed_value = safe_parse_value(value_str)  # Handles quotes
+                result["attributes"].append({"name": name, "op": op, "value": parsed_value})
+            selector = selector[attr_match.end() :].strip()
             processed_chunk = True
             continue
         # Check for :not(...) block
         if selector.lower().startswith(not_pseudo_prefix):
-            start_index = len(not_pseudo_prefix) - 1 # Index of '('
+            start_index = len(not_pseudo_prefix) - 1  # Index of '('
             nesting = 1
             end_index = -1
             for i in range(start_index + 1, len(selector)):
-                if selector[i] == '(': nesting += 1
-                elif selector[i] == ')':
+                if selector[i] == "(":
+                    nesting += 1
+                elif selector[i] == ")":
                     nesting -= 1
                     if nesting == 0:
                         end_index = i
                         break
             if end_index == -1:
-                raise ValueError(f"Mismatched parenthesis in :not() selector near '{selector}'. Full selector: '{original_selector_for_error}'")
+                raise ValueError(
+                    f"Mismatched parenthesis in :not() selector near '{selector}'. Full selector: '{original_selector_for_error}'"
+                )
             inner_selector_str = selector[start_index + 1 : end_index].strip()
             if not inner_selector_str:
-                 raise ValueError(f"Empty selector inside :not(). Full selector: '{original_selector_for_error}'")
+                raise ValueError(
+                    f"Empty selector inside :not(). Full selector: '{original_selector_for_error}'"
+                )
             # Recursively parse the inner selector
             parsed_inner_selector = parse_selector(inner_selector_str)
-            result["pseudo_classes"].append({'name': 'not', 'args': parsed_inner_selector})
+            result["pseudo_classes"].append({"name": "not", "args": parsed_inner_selector})
-            selector = selector[end_index + 1:].strip()
+            selector = selector[end_index + 1 :].strip()
             processed_chunk = True
             continue
@@ -192,25 +214,27 @@ def parse_selector(selector: str) -> Dict[str, Any]:
         pseudo_match = pseudo_pattern.match(selector)
         if pseudo_match:
             name, args_str = pseudo_match.groups()
-            name = name.lower() # Normalize pseudo-class name
-            processed_args = args_str # Keep as string initially, or None
+            name = name.lower()  # Normalize pseudo-class name
+            processed_args = args_str  # Keep as string initially, or None
             if args_str is not None:
                 # Only parse args if they exist and based on the pseudo-class type
-                 if name in ["color", "background"]:
+                if name in ["color", "background"]:
                     processed_args = safe_parse_color(args_str)
-                 else:
+                else:
                     processed_args = safe_parse_value(args_str)
             # else: args remain None
             result["pseudo_classes"].append({"name": name, "args": processed_args})
-            selector = selector[pseudo_match.end():].strip()
+            selector = selector[pseudo_match.end() :].strip()
             processed_chunk = True
             continue
         # If we reach here and the selector string is not empty, something is wrong
         if not processed_chunk and selector:
-            raise ValueError(f"Invalid or unexpected syntax near '{selector[:30]}...'. Full selector: '{original_selector_for_error}'")
+            raise ValueError(
+                f"Invalid or unexpected syntax near '{selector[:30]}...'. Full selector: '{original_selector_for_error}'"
+            )
     return result
@@ -263,12 +287,8 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
 PSEUDO_CLASS_FUNCTIONS = {
     "bold": lambda el: hasattr(el, "bold") and el.bold,
     "italic": lambda el: hasattr(el, "italic") and el.italic,
-    "first-child": lambda el: hasattr(el, "parent")
-    and el.parent
-    and el.parent.children[0] == el,
-    "last-child": lambda el: hasattr(el, "parent")
-    and el.parent
-    and el.parent.children[-1] == el,
+    "first-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[0] == el,
+    "last-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[-1] == el,
     "empty": lambda el: not el.text,
     "not-empty": lambda el: el.text,
     "not-bold": lambda el: hasattr(el, "bold") and not el.bold,
@@ -308,34 +328,44 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
             func = lambda el: (
                 hasattr(el, "normalized_type") and el.normalized_type == selector_type
             ) or (
-                not hasattr(el, "normalized_type") # Only check element.type if normalized_type doesn't exist/match
-                and hasattr(el, "type") and el.type == selector_type
+                not hasattr(
+                    el, "normalized_type"
+                )  # Only check element.type if normalized_type doesn't exist/match
+                and hasattr(el, "type")
+                and el.type == selector_type
             )
         filters.append({"name": filter_name, "func": func})
     # Filter by attributes
-    for name, attr_info in selector["attributes"].items():
-        op = attr_info["op"]
-        value = attr_info["value"]
-        python_name = name.replace("-", "_") # Convert CSS-style names
+    for attr_filter in selector["attributes"]:
+        name = attr_filter["name"]
+        op = attr_filter["op"]
+        value = attr_filter["value"]
+        python_name = name.replace("-", "_")  # Convert CSS-style names
         # --- Define the core value retrieval logic ---
-        def get_element_value(element, name=name, python_name=python_name, selector_type=selector_type):
-             # Special case for region attributes
-             if selector_type == "region":
-                 if name == "type":
-                     if hasattr(element, "normalized_type") and element.normalized_type:
-                         return element.normalized_type
-                     else:
-                         return getattr(element, "region_type", "").lower().replace(" ", "_")
-                 elif name == "model":
-                     return getattr(element, "model", None)
-                 else:
-                      return getattr(element, python_name, None)
-             else:
-                 # General case for non-region elements
-                 return getattr(element, python_name, None)
+        def get_element_value(
+            element, name=name, python_name=python_name, selector_type=selector_type
+        ):
+            bbox_mapping = {"x0": 0, "y0": 1, "x1": 2, "y1": 3}
+            if name in bbox_mapping:
+                bbox = getattr(element, "_bbox", None) or getattr(element, "bbox", None)
+                return bbox[bbox_mapping[name]]
+            # Special case for region attributes
+            if selector_type == "region":
+                if name == "type":
+                    if hasattr(element, "normalized_type") and element.normalized_type:
+                        return element.normalized_type
+                    else:
+                        return getattr(element, "region_type", "").lower().replace(" ", "_")
+                elif name == "model":
+                    return getattr(element, "model", None)
+                else:
+                    return getattr(element, python_name, None)
+            else:
+                # General case for non-region elements
+                return getattr(element, python_name, None)
         # --- Define the comparison function or direct check ---
         filter_lambda: Callable[[Any], bool]
@@ -345,14 +375,11 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
             # Special handling for attribute presence check [attr]
             filter_name = f"attribute [{name} exists]"
             # Lambda checks that the retrieved value is not None
-            filter_lambda = (
-                lambda el, get_val=get_element_value:
-                get_val(el) is not None
-            )
+            filter_lambda = lambda el, get_val=get_element_value: get_val(el) is not None
         else:
             # Handle operators with values (e.g., =, !=, *=, etc.)
             compare_func: Callable[[Any, Any], bool]
-            op_desc = f"{op} {value!r}" # Default description
+            op_desc = f"{op} {value!r}"  # Default description
             # Determine compare_func based on op (reuse existing logic)
             if op == "=":
@@ -363,40 +390,76 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
                 op_desc = f"~= {value!r} (approx)"
                 compare_func = lambda el_val, sel_val: _is_approximate_match(el_val, sel_val)
             elif op == "^=":
-                compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and el_val.startswith(sel_val)
+                compare_func = (
+                    lambda el_val, sel_val: isinstance(el_val, str)
+                    and isinstance(sel_val, str)
+                    and el_val.startswith(sel_val)
+                )
             elif op == "$=":
-                compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and el_val.endswith(sel_val)
+                compare_func = (
+                    lambda el_val, sel_val: isinstance(el_val, str)
+                    and isinstance(sel_val, str)
+                    and el_val.endswith(sel_val)
+                )
             elif op == "*=":
                 if name == "fontname":
-                     op_desc = f"*= {value!r} (contains, case-insensitive)"
-                     compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and sel_val.lower() in el_val.lower()
+                    op_desc = f"*= {value!r} (contains, case-insensitive)"
+                    compare_func = (
+                        lambda el_val, sel_val: isinstance(el_val, str)
+                        and isinstance(sel_val, str)
+                        and sel_val.lower() in el_val.lower()
+                    )
                 else:
-                     op_desc = f"*= {value!r} (contains)"
-                     compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and sel_val in el_val
+                    op_desc = f"*= {value!r} (contains)"
+                    compare_func = (
+                        lambda el_val, sel_val: isinstance(el_val, str)
+                        and isinstance(sel_val, str)
+                        and sel_val in el_val
+                    )
             elif op == ">=":
-                compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val >= sel_val
+                compare_func = (
+                    lambda el_val, sel_val: isinstance(el_val, (int, float))
+                    and isinstance(sel_val, (int, float))
+                    and el_val >= sel_val
+                )
             elif op == "<=":
-                compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val <= sel_val
+                compare_func = (
+                    lambda el_val, sel_val: isinstance(el_val, (int, float))
+                    and isinstance(sel_val, (int, float))
+                    and el_val <= sel_val
+                )
             elif op == ">":
-                compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val > sel_val
+                compare_func = (
+                    lambda el_val, sel_val: isinstance(el_val, (int, float))
+                    and isinstance(sel_val, (int, float))
+                    and el_val > sel_val
+                )
             elif op == "<":
-                compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val < sel_val
+                compare_func = (
+                    lambda el_val, sel_val: isinstance(el_val, (int, float))
+                    and isinstance(sel_val, (int, float))
+                    and el_val < sel_val
+                )
             else:
                 # Should not happen with current parsing logic
-                logger.warning(f"Unsupported operator '{op}' encountered during filter building for attribute '{name}'")
-                continue # Skip this attribute filter
+                logger.warning(
+                    f"Unsupported operator '{op}' encountered during filter building for attribute '{name}'"
+                )
+                continue  # Skip this attribute filter
             # --- Create the final filter function for operators with values ---
             filter_name = f"attribute [{name}{op_desc}]"
             # Capture loop variables correctly in the lambda
             filter_lambda = (
-                lambda el, get_val=get_element_value, compare=compare_func, expected_val=value:
-                (element_value := get_val(el)) is not None and compare(element_value, expected_val)
+                lambda el, get_val=get_element_value, compare=compare_func, expected_val=value: (
+                    element_value := get_val(el)
+                )
+                is not None
+                and compare(element_value, expected_val)
             )
         filters.append({"name": filter_name, "func": filter_lambda})
     # Filter by pseudo-classes
     for pseudo in selector["pseudo_classes"]:
         name = pseudo["name"]
@@ -407,62 +470,75 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
         # Relational pseudo-classes are handled separately by the caller
         if name in ("above", "below", "near", "left-of", "right-of"):
-             continue
+            continue
-        # --- Handle :not() ---
+        # --- Handle :not() ---
         elif name == "not":
-             if not isinstance(args, dict): # args should be the parsed inner selector
-                 logger.error(f"Invalid arguments for :not pseudo-class: {args}")
-                 raise TypeError("Internal error: :not pseudo-class requires a parsed selector dictionary as args.")
-             # Recursively get the filter function for the inner selector
-             # Pass kwargs down in case regex/case flags affect the inner selector
-             inner_filter_func = selector_to_filter_func(args, **kwargs)
-             # The filter lambda applies the inner function and inverts the result
-             filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
-             # Try to create a descriptive name (can be long)
-             # Maybe simplify this later if needed
-             inner_filter_list = _build_filter_list(args, **kwargs)
-             inner_filter_names = ", ".join([f['name'] for f in inner_filter_list])
-             filter_name = f"pseudo-class :not({inner_filter_names})"
-        # --- Handle text-based pseudo-classes ---
+            if not isinstance(args, dict):  # args should be the parsed inner selector
+                logger.error(f"Invalid arguments for :not pseudo-class: {args}")
+                raise TypeError(
+                    "Internal error: :not pseudo-class requires a parsed selector dictionary as args."
+                )
+            # Recursively get the filter function for the inner selector
+            # Pass kwargs down in case regex/case flags affect the inner selector
+            inner_filter_func = selector_to_filter_func(args, **kwargs)
+            # The filter lambda applies the inner function and inverts the result
+            filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
+            # Try to create a descriptive name (can be long)
+            # Maybe simplify this later if needed
+            inner_filter_list = _build_filter_list(args, **kwargs)
+            inner_filter_names = ", ".join([f["name"] for f in inner_filter_list])
+            filter_name = f"pseudo-class :not({inner_filter_names})"
+        # --- Handle text-based pseudo-classes ---
         elif name == "contains" and args is not None:
             use_regex = kwargs.get("regex", False)
-            ignore_case = not kwargs.get("case", True) # Default case sensitive
-            filter_name = f"pseudo-class :contains({args!r}, regex={use_regex}, ignore_case={ignore_case})"
+            ignore_case = not kwargs.get("case", True)  # Default case sensitive
+            filter_name = (
+                f"pseudo-class :contains({args!r}, regex={use_regex}, ignore_case={ignore_case})"
+            )
             def contains_check(element, args=args, use_regex=use_regex, ignore_case=ignore_case):
-                 if not hasattr(element, "text") or not element.text:
-                     return False # Element must have non-empty text
-                 element_text = element.text
-                 search_term = str(args) # Ensure args is string
-                 if use_regex:
-                     try:
-                         pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
-                         return bool(pattern.search(element_text))
-                     except re.error as e:
-                         logger.warning(f"Invalid regex '{search_term}' in :contains selector: {e}. Falling back to literal search.")
-                         # Fallback to literal search on regex error
-                         if ignore_case:
-                              return search_term.lower() in element_text.lower()
-                         else:
-                              return search_term in element_text
-                 else: # Literal search
-                     if ignore_case:
-                         return search_term.lower() in element_text.lower()
-                     else:
-                         return search_term in element_text
+                if not hasattr(element, "text") or not element.text:
+                    return False  # Element must have non-empty text
+                element_text = element.text
+                search_term = str(args)  # Ensure args is string
+                if use_regex:
+                    try:
+                        pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
+                        return bool(pattern.search(element_text))
+                    except re.error as e:
+                        logger.warning(
+                            f"Invalid regex '{search_term}' in :contains selector: {e}. Falling back to literal search."
+                        )
+                        # Fallback to literal search on regex error
+                        if ignore_case:
+                            return search_term.lower() in element_text.lower()
+                        else:
+                            return search_term in element_text
+                else:  # Literal search
+                    if ignore_case:
+                        return search_term.lower() in element_text.lower()
+                    else:
+                        return search_term in element_text
             filter_lambda = contains_check
         elif name == "starts-with" and args is not None:
-            filter_lambda = lambda el, arg=args: hasattr(el, "text") and el.text and el.text.startswith(str(arg))
+            filter_lambda = (
+                lambda el, arg=args: hasattr(el, "text")
+                and el.text
+                and el.text.startswith(str(arg))
+            )
         elif name == "ends-with" and args is not None:
-             filter_lambda = lambda el, arg=args: hasattr(el, "text") and el.text and el.text.endswith(str(arg))
+            filter_lambda = (
+                lambda el, arg=args: hasattr(el, "text") and el.text and el.text.endswith(str(arg))
+            )
         # Boolean attribute pseudo-classes
         elif name == "bold":
@@ -477,11 +553,10 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
         # Check predefined lambda functions (e.g., :first-child, :empty)
         elif name in PSEUDO_CLASS_FUNCTIONS:
             filter_lambda = PSEUDO_CLASS_FUNCTIONS[name]
-            filter_name = f"pseudo-class :{name}" # Set name for predefined ones
+            filter_name = f"pseudo-class :{name}"  # Set name for predefined ones
         else:
             raise ValueError(f"Unknown or unsupported pseudo-class: ':{name}'")
         if filter_lambda:
             # Use the potentially updated filter_name
             filters.append({"name": filter_name, "func": filter_lambda})
@@ -500,15 +575,17 @@ def _assemble_filter_func(filters: List[Dict[str, Any]]) -> Callable[[Any], bool
         A single function that takes an element and returns True only if
         it passes ALL filters in the list.
     """
     def combined_filter(element):
         for f in filters:
             try:
-                if not f['func'](element):
+                if not f["func"](element):
                     return False
             except Exception as e:
-                 logger.error(f"Error applying filter '{f['name']}' to element: {e}", exc_info=True)
-                 return False # Treat errors as filter failures
+                logger.error(f"Error applying filter '{f['name']}' to element: {e}", exc_info=True)
+                return False  # Treat errors as filter failures
         return True
     return combined_filter
@@ -529,8 +606,7 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
     filter_list = _build_filter_list(selector, **kwargs)
     if logger.isEnabledFor(logging.DEBUG):
-       filter_names = [f['name'] for f in filter_list]
-       logger.debug(f"Assembling filters for selector {selector}: {filter_names}")
-    return _assemble_filter_func(filter_list)
+        filter_names = [f["name"] for f in filter_list]
+        logger.debug(f"Assembling filters for selector {selector}: {filter_names}")
+    return _assemble_filter_func(filter_list)

natural_pdf/utils/debug.py CHANGED Viewed

@@ -3,13 +3,13 @@ OCR debug utilities for natural-pdf.
 """
 import base64
+import importlib.resources
+import importlib.util
 import io
 import json
 import os
-import importlib.util
-import importlib.resources
 import webbrowser
-from typing import Dict, List, Any, Optional, Union, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union
 from PIL import Image

natural_pdf/utils/identifiers.py CHANGED Viewed

@@ -2,8 +2,8 @@
 Utilities for generating consistent identifiers.
 """
-import hashlib
 import base64
+import hashlib
 import os

natural_pdf/utils/locks.py CHANGED Viewed

@@ -5,4 +5,4 @@ Shared locks for thread synchronization across the natural-pdf library.
 import threading
 # Global lock for PDF rendering operations to prevent PDFium concurrency issues
-pdf_render_lock = threading.RLock()
+pdf_render_lock = threading.RLock()

natural_pdf/utils/packaging.py CHANGED Viewed

@@ -2,23 +2,25 @@
 Utilities for packaging data for external processes, like correction tasks.
 """
-import os
 import base64
 import io
 import json
-import zipfile
-import tempfile
 import logging
+import os
 import shutil
-from typing import Any, List, Union, Iterable, TYPE_CHECKING, Dict
+import tempfile
+import zipfile
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Union
 from tqdm import tqdm
 from natural_pdf.elements.text import TextElement
 # Import the specific PDF/Page types if possible, otherwise use Any
 if TYPE_CHECKING:
-    from natural_pdf.core.pdf import PDF
-    from natural_pdf.core.page import Page
     from natural_pdf.collections.pdf_collection import PDFCollection
+    from natural_pdf.core.page import Page
+    from natural_pdf.core.pdf import PDF
 else:
     PDF = Any
     Page = Any

natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl