natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +3 -4
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +146 -75
- natural_pdf/core/page.py +287 -188
- natural_pdf/core/pdf.py +57 -42
- natural_pdf/elements/base.py +51 -0
- natural_pdf/elements/collections.py +362 -67
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +396 -23
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/selectors/parser.py +163 -8
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
natural_pdf/search/searchable_mixin.py
CHANGED
@@ -4,7 +4,6 @@ from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, Dict, Generator, Iterable, List, Optional, Type, Union
 
 # Now import the flag from the canonical source - this import should always work
-from .haystack_utils import HAS_HAYSTACK_EXTRAS
 
 DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
 
@@ -108,7 +107,6 @@ class SearchableMixin(ABC):
             logger.info(
                 f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}')."
             )
-            # TODO: Add stricter type check? isinstance(service, SearchServiceProtocol) requires runtime_checkable
             self._search_service = service
         else:
             # Create new service
@@ -125,28 +123,17 @@ class SearchableMixin(ABC):
             logger.info(
                 f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}"
             )
-
-
-
-
-
-
-
-
-
-
-                logger.error(f"Failed to create SearchService due to missing dependency: {ie}")
-                raise ie # Re-raise the original ImportError
-            except Exception as e:
-                logger.error(
-                    f"Failed to create SearchService due to unexpected error: {e}", exc_info=True
-                )
-                # Keep the RuntimeError for other unexpected creation errors
-                raise RuntimeError(
-                    "Could not create SearchService instance due to an unexpected error."
-                ) from e
+
+            # Direct creation without try/except
+            service_args = {
+                "collection_name": effective_collection_name,
+                "persist": effective_persist,
+                **kwargs,
+            }
+            if embedding_model:
+                service_args["embedding_model"] = embedding_model
+            self._search_service = get_search_service(**service_args)
 
-        # --- Optional Immediate Indexing (with safety check for persistent) ---
         if index:
             if not self._search_service: # Should not happen if logic above is correct
                 raise RuntimeError(
@@ -176,8 +163,6 @@ class SearchableMixin(ABC):
                 logger.warning(
                     f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted."
                 )
-            # else: # Not persistent, safe to proceed without existence check
-            #     logger.debug("Proceeding with index=True for non-persistent index.")
 
         # Proceed with indexing if checks passed or not applicable
         logger.info(
@@ -197,12 +182,8 @@ class SearchableMixin(ABC):
             f"Starting internal indexing process into SearchService collection '{collection_name}'..."
         )
 
-        #
-
-            indexable_items = list(self.get_indexable_items()) # Consume iterator
-        except Exception as e:
-            logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
-            raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
+        # Get indexable items without try/except
+        indexable_items = list(self.get_indexable_items()) # Consume iterator
 
         if not indexable_items:
             logger.warning(
@@ -211,27 +192,19 @@ class SearchableMixin(ABC):
             return
 
         logger.info(f"Prepared {len(indexable_items)} indexable items for indexing.")
-
-
-
-
-
-
-
-
-
-
-
-
-
-            logger.error(
-                f"Indexing failed due to configuration error in collection '{collection_name}': {ice}",
-                exc_info=True,
-            )
-            raise # Re-raise specific error
-        except Exception as e: # Catch other indexing errors from the service
-            logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
-            raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
+        logger.debug(
+            f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
+        )
+
+        # Call index without try/except
+        self._search_service.index(
+            documents=indexable_items,
+            embedder_device=embedder_device,
+            force_reindex=force_reindex,
+        )
+        logger.info(
+            f"Successfully completed indexing into SearchService collection '{collection_name}'."
+        )
 
     def index_for_search(
         self,
@@ -254,14 +227,12 @@ class SearchableMixin(ABC):
         Returns:
             Self for method chaining.
         """
-        # --- Ensure Service is Initialized (Use Default if Needed) ---
         if not self._search_service:
             logger.info(
                 "Search service not initialized prior to index_for_search. Initializing default in-memory service."
            )
             self.init_search() # Call init with defaults
 
-        # --- Perform Indexing ---
         self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
         return self
 
@@ -289,7 +260,6 @@ class SearchableMixin(ABC):
            RuntimeError: If no search service is configured or provided, or if search fails.
            FileNotFoundError: If the collection managed by the service does not exist.
        """
-        # --- Determine which Search Service to use ---
        effective_service = search_service or self._search_service
        if not effective_service:
            raise RuntimeError(
@@ -302,21 +272,9 @@ class SearchableMixin(ABC):
            f"Searching collection '{collection_name}' via {type(effective_service).__name__}..."
        )
 
-        # --- Prepare Query and Options ---
        query_input = query
-        # Example: Handle Region query - maybe move this logic into HaystackSearchService.search?
-        # If we keep it here, it makes the mixin less generic.
-        # Let's assume the SearchService handles the query type appropriately for now.
-        # if isinstance(query, Region):
-        #     logger.debug("Query is a Region object. Extracting text.")
-        #     query_input = query.extract_text()
-        #     if not query_input or query_input.isspace():
-        #         logger.warning("Region provided for query has no extractable text.")
-        #         return []
-
        effective_options = options if options is not None else TextSearchOptions()
 
-        # --- Call SearchService Search Method ---
        try:
            results = effective_service.search(
                query=query_input,
@@ -336,7 +294,6 @@ class SearchableMixin(ABC):
            # Consider wrapping in a SearchError?
            raise RuntimeError(f"Search failed in collection '{collection_name}'.") from e
 
-    # --- NEW Sync Method ---
    def sync_index(
        self,
        strategy: str = "full", # 'full' (add/update/delete) or 'upsert_only'
@@ -378,7 +335,6 @@ class SearchableMixin(ABC):
        )
        summary = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
 
-        # --- Check Service Capabilities for 'full' sync ---
        if strategy == "full":
            required_methods = ["list_documents", "delete_documents"]
            missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
@@ -388,7 +344,6 @@ class SearchableMixin(ABC):
                f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
            )
 
-        # --- 1. Get Desired State (from current collection) ---
        desired_state: Dict[str, Indexable] = {} # {id: item}
        desired_hashes: Dict[str, Optional[str]] = {} # {id: hash or None}
        try:
@@ -426,7 +381,6 @@ class SearchableMixin(ABC):
 
        logger.info(f"Desired state contains {len(desired_state)} indexable items.")
 
-        # --- 2. Handle Different Strategies ---
        if strategy == "upsert_only":
            # Simple case: just index everything, let the service handle upserts
            items_to_index = list(desired_state.values())
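Net effect of the hunks above: the Haystack import is gone (the file list shows the Haystack service removed and LanceDB/NumPy services added), and service creation and indexing are no longer wrapped in try/except, so a missing dependency or an indexing failure now propagates to the caller as the original exception. A minimal sketch of what that looks like from calling code, assuming a PDF object that mixes in SearchableMixin; the file path, collection name, and keyword-argument names are inferred from this diff and are illustrative only:

    from natural_pdf import PDF

    pdf = PDF("document.pdf")  # illustrative path; PDF assumed to mix in SearchableMixin
    try:
        # No internal try/except anymore: a missing backend dependency
        # surfaces here as the original ImportError.
        pdf.init_search(collection_name="my_docs", persist=False)
        pdf.index_for_search(force_reindex=True)
    except ImportError as err:
        print(f"Search backend dependency missing: {err}")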
natural_pdf/selectors/parser.py
CHANGED
@@ -71,6 +71,91 @@ def safe_parse_color(value_str: str) -> tuple:
     return (0, 0, 0)
 
 
+def _split_top_level_or(selector: str) -> List[str]:
+    """
+    Split a selector string on top-level OR operators (| or ,) only.
+
+    Respects parsing contexts and does not split when | or , appear inside:
+    - Quoted strings (both single and double quotes)
+    - Parentheses (for pseudo-class arguments like :not(...))
+    - Square brackets (for attribute selectors like [attr="value"])
+
+    Args:
+        selector: The selector string to split
+
+    Returns:
+        List of selector parts. If no top-level OR operators found, returns [selector].
+
+    Examples:
+        >>> _split_top_level_or('text:contains("a|b")|text:bold')
+        ['text:contains("a|b")', 'text:bold']
+
+        >>> _split_top_level_or('text:contains("hello,world")')
+        ['text:contains("hello,world")']
+    """
+    if not selector or not isinstance(selector, str):
+        return [selector] if selector else []
+
+    parts = []
+    current_part = ""
+    i = 0
+
+    # Parsing state
+    in_double_quotes = False
+    in_single_quotes = False
+    paren_depth = 0
+    bracket_depth = 0
+
+    while i < len(selector):
+        char = selector[i]
+
+        # Handle escape sequences in quotes
+        if i > 0 and selector[i-1] == '\\':
+            current_part += char
+            i += 1
+            continue
+
+        # Handle quote state changes
+        if char == '"' and not in_single_quotes:
+            in_double_quotes = not in_double_quotes
+        elif char == "'" and not in_double_quotes:
+            in_single_quotes = not in_single_quotes
+
+        # Handle parentheses and brackets only when not in quotes
+        elif not in_double_quotes and not in_single_quotes:
+            if char == '(':
+                paren_depth += 1
+            elif char == ')':
+                paren_depth -= 1
+            elif char == '[':
+                bracket_depth += 1
+            elif char == ']':
+                bracket_depth -= 1
+
+            # Check for top-level OR operators
+            elif (char == '|' or char == ',') and paren_depth == 0 and bracket_depth == 0:
+                # Found a top-level OR operator
+                part = current_part.strip()
+                if part: # Only add non-empty parts
+                    parts.append(part)
+                current_part = ""
+                i += 1
+                continue
+
+        # Add character to current part
+        current_part += char
+        i += 1
+
+    # Add the final part
+    final_part = current_part.strip()
+    if final_part:
+        parts.append(final_part)
+
+    # If we only found one part, return it as a single-element list
+    # If we found multiple parts, those are the OR-separated parts
+    return parts if parts else [selector]
+
+
 def parse_selector(selector: str) -> Dict[str, Any]:
     """
     Parse a CSS-like selector string into a structured selector object.
@@ -80,12 +165,28 @@ def parse_selector(selector: str) -> Dict[str, Any]:
     - Attribute presence (e.g., '[data-id]')
     - Attribute value checks with various operators (e.g., '[count=5]', '[name*="bold"]'')
     - Pseudo-classes (e.g., ':contains("Total")', ':empty', ':not(...)')
+    - OR operators (e.g., 'text:contains("A")|text:bold', 'sel1,sel2')
 
     Args:
         selector: CSS-like selector string
 
     Returns:
-        Dict representing the parsed selector
+        Dict representing the parsed selector, or compound selector with OR logic
+
+    Examples:
+        >>> parse_selector('text:contains("hello")') # Single selector
+        {'type': 'text', 'pseudo_classes': [{'name': 'contains', 'args': 'hello'}], ...}
+
+        >>> parse_selector('text:contains("A")|text:bold') # OR with pipe
+        {'type': 'or', 'selectors': [...]}
+
+        >>> parse_selector('text:contains("A"),line[width>5]') # OR with comma
+        {'type': 'or', 'selectors': [...]}
+
+    Note:
+        OR operators work with all selector types except spatial pseudo-classes
+        (:above, :below, :near, :left-of, :right-of) which require page context.
+        Spatial relationships within OR selectors are not currently supported.
     """
     result = {
         "type": "any",
@@ -100,6 +201,36 @@ def parse_selector(selector: str) -> Dict[str, Any]:
 
     selector = selector.strip()
 
+    # --- Handle OR operators first (| or ,) ---
+    # Check if selector contains OR operators at the top level only
+    # (not inside quotes, parentheses, or brackets)
+    or_parts = _split_top_level_or(selector)
+
+    # If we found OR parts, parse each one recursively and return compound selector
+    if len(or_parts) > 1:
+        parsed_selectors = []
+        for part in or_parts:
+            try:
+                parsed_selectors.append(parse_selector(part))
+            except (ValueError, TypeError) as e:
+                logger.warning(f"Skipping invalid OR selector part '{part}': {e}")
+                continue
+
+        if len(parsed_selectors) > 1:
+            return {
+                "type": "or",
+                "selectors": parsed_selectors
+            }
+        elif len(parsed_selectors) == 1:
+            # Only one valid part, return it directly
+            return parsed_selectors[0]
+        else:
+            # No valid parts, return default
+            logger.warning(f"No valid parts found in OR selector '{original_selector_for_error}', returning default selector")
+            return result
+
+    # --- Continue with single selector parsing (existing logic) ---
+
     # --- Handle wildcard selector explicitly ---
     if selector == "*":
         # Wildcard matches any type, already the default.
@@ -109,12 +240,6 @@ def parse_selector(selector: str) -> Dict[str, Any]:
 
     # 1. Extract type (optional, at the beginning)
     # Only run if selector wasn't '*'
-    if selector:
-        type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
-        if type_match:
-            result["type"] = type_match.group(1).lower()
-            selector = selector[len(type_match.group(0)) :].strip()
-    # Only run if selector wasn't '*'
     if selector:
         type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
         if type_match:
@@ -597,12 +722,42 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
     To inspect the individual filters, call `_build_filter_list` directly.
 
     Args:
-        selector: Parsed selector dictionary
+        selector: Parsed selector dictionary (single or compound OR selector)
         **kwargs: Additional filter parameters (e.g., regex, case).
 
     Returns:
         Function that takes an element and returns True if it matches the selector.
     """
+    # Handle compound OR selectors
+    if selector.get("type") == "or":
+        sub_selectors = selector.get("selectors", [])
+        if not sub_selectors:
+            # Empty OR selector, return a function that never matches
+            return lambda element: False
+
+        # Create filter functions for each sub-selector
+        sub_filter_funcs = []
+        for sub_selector in sub_selectors:
+            sub_filter_funcs.append(selector_to_filter_func(sub_selector, **kwargs))
+
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug(f"Creating OR filter with {len(sub_filter_funcs)} sub-selectors")
+
+        # Return OR combination - element matches if ANY sub-selector matches
+        def or_filter(element):
+            for func in sub_filter_funcs:
+                try:
+                    if func(element):
+                        return True
+                except Exception as e:
+                    logger.error(f"Error applying OR sub-filter to element: {e}", exc_info=True)
+                    # Continue to next sub-filter on error
+                    continue
            return False
+
+        return or_filter
+
+    # Handle single selectors (existing logic)
     filter_list = _build_filter_list(selector, **kwargs)
 
     if logger.isEnabledFor(logging.DEBUG):
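Taken together, these hunks give the selector language a top-level OR: _split_top_level_or breaks the string apart, parse_selector wraps the parts in a {'type': 'or', 'selectors': [...]} dict, and selector_to_filter_func turns that dict into a predicate that matches when any sub-selector matches. A short sketch using only the functions changed above; the elements list is a stand-in for whatever element objects are being filtered:

    from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

    parsed = parse_selector('text:contains("Total")|text:bold')
    # parsed -> {'type': 'or', 'selectors': [<text:contains(...)>, <text:bold>]}

    matches = selector_to_filter_func(parsed)
    hits = [el for el in elements if matches(el)]  # 'elements': any iterable of element objects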
natural_pdf/widgets/viewer.py
CHANGED
@@ -31,20 +31,6 @@ try:
     from PIL import Image
     from traitlets import Dict, List, Unicode, observe
 
-    # --- Read JS code from file (only needed if widgets are defined) --- #
-    _MODULE_DIR = os.path.dirname(__file__)
-    _FRONTEND_JS_PATH = os.path.join(_MODULE_DIR, "frontend", "viewer.js")
-    try:
-        with open(_FRONTEND_JS_PATH, "r", encoding="utf-8") as f:
-            _FRONTEND_JS_CODE = f.read()
-        logger.debug(f"Successfully read frontend JS from: {_FRONTEND_JS_PATH}")
-    except FileNotFoundError:
-        logger.error(f"Frontend JS file not found at {_FRONTEND_JS_PATH}. Widget will likely fail.")
-        _FRONTEND_JS_CODE = "console.error('Frontend JS file not found! Widget cannot load.');"
-    except Exception as e:
-        logger.error(f"Error reading frontend JS file {_FRONTEND_JS_PATH}: {e}")
-        _FRONTEND_JS_CODE = f"console.error('Error reading frontend JS file: {e}');"
-
     # --- Define Widget Classes ONLY if ipywidgets is available ---
     class SimpleInteractiveViewerWidget(widgets.DOMWidget):
         def __init__(self, pdf_data=None, **kwargs):
@@ -631,7 +617,7 @@ try:
 
            # Filter out 'char' elements
            filtered_page_elements = [
-                el for el in page_elements if getattr(el, "type", "").lower() != "char"
+                el for el in page_elements if str(getattr(el, "type", "")).lower() != "char"
            ]
            logger.debug(
                f"Filtered out char elements, keeping {len(filtered_page_elements)} elements."
@@ -659,19 +645,21 @@ try:
 
            for i, element in enumerate(filtered_page_elements):
                # Get original coordinates and calculated width/height (always present via base class)
+                # Assuming 'element' is always an object with these attributes now
                original_x0 = element.x0
                original_y0 = element.top
                original_x1 = element.x1
                original_y1 = element.bottom
                width = element.width
                height = element.height
+                current_element_type = element.type # Direct attribute access
                scale = 1.0
 
                # Base element dict with required info
                elem_dict = {
                    "id": i,
                    # Use the standardized .type property
-                    "type":
+                    "type": current_element_type,
                    # Scaled coordinates for positioning in HTML/SVG
                    "x0": original_x0 * scale,
                    "y0": original_y0 * scale,
@@ -684,21 +672,24 @@ try:
                # --- Get Default Attributes --- #
                attributes_found = set()
                for attr_name in default_attributes_to_get:
+                    # Assuming 'element' is always an object
                    if hasattr(element, attr_name):
                        try:
-
+                            value_to_process = getattr(element, attr_name)
                            # Convert non-JSON serializable types to string
-                            processed_value =
+                            processed_value = value_to_process
                            if (
-                                not isinstance(
-
+                                not isinstance(
+                                    value_to_process, (str, int, float, bool, list, dict, tuple)
+                                )
+                                and value_to_process is not None
                            ):
-                                processed_value = str(
+                                processed_value = str(value_to_process)
                            elem_dict[attr_name] = processed_value
                            attributes_found.add(attr_name)
                        except Exception as e:
                            logger.warning(
-                                f"Could not get or process default attribute '{attr_name}' for element {i} ({
+                                f"Could not get or process default attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
                            )
 
                # --- Get User-Requested Attributes (if any) --- #
@@ -707,23 +698,23 @@ try:
                    # Only process if not already added and exists
                    if attr_name not in attributes_found and hasattr(element, attr_name):
                        try:
-
-                            processed_value =
+                            value_to_process = getattr(element, attr_name)
+                            processed_value = value_to_process
                            if (
                                not isinstance(
-
+                                    value_to_process, (str, int, float, bool, list, dict, tuple)
                                )
-                                and
+                                and value_to_process is not None
                            ):
-                                processed_value = str(
+                                processed_value = str(value_to_process)
                            elem_dict[attr_name] = processed_value
                        except Exception as e:
                            logger.warning(
-                                f"Could not get or process requested attribute '{attr_name}' for element {i} ({
+                                f"Could not get or process requested attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
                            )
-                    for
-                    if isinstance(elem_dict[
-                        elem_dict[
+                for attr_name_val in elem_dict: # Renamed to avoid conflict
+                    if isinstance(elem_dict[attr_name_val], float):
+                        elem_dict[attr_name_val] = round(elem_dict[attr_name_val], 2)
                elements.append(elem_dict)
 
            logger.debug(
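The widget hunks rework attribute processing: values read from an element that are not JSON-serializable (and not None) are stringified, and every float in elem_dict is rounded to two decimals before the element list is handed to the frontend. A standalone sketch of that rule; the helper name is illustrative and not part of the widget:

    def to_widget_value(value):
        # Non-serializable, non-None values fall back to their string form.
        if not isinstance(value, (str, int, float, bool, list, dict, tuple)) and value is not None:
            value = str(value)
        # Floats are rounded to two decimals, matching the elem_dict post-processing above.
        if isinstance(value, float):
            value = round(value, 2)
        return value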