unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
from typing import Any, Dict, Optional
|
|
2
|
+
|
|
3
|
+
from unstructured_ingest.logger import logger
|
|
4
|
+
from unstructured_ingest.processes.utils.logging.sanitizer import DataSanitizer
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class LoggingConfig:
    """Configuration for connector logging behavior.

    The newer document-oriented flags (``log_document_locations`` /
    ``log_document_ids``) fall back to the legacy flags
    (``log_file_paths`` / ``log_ids``) when left unspecified, preserving
    backward compatibility for callers that only set the old names.
    """

    def __init__(
        self,
        log_file_paths: bool = False,
        log_document_locations: Optional[bool] = None,
        log_ids: bool = False,
        log_document_ids: Optional[bool] = None,
        log_progress_interval: int = 10,
        sanitize_logs: bool = True,
        show_connection_details: bool = False,
    ):
        # Legacy flag, kept as-is for callers that still read it directly.
        self.log_file_paths = log_file_paths
        # Backward compatibility: inherit from the legacy flag when unset.
        if log_document_locations is None:
            self.log_document_locations = log_file_paths
        else:
            self.log_document_locations = log_document_locations

        self.log_ids = log_ids
        if log_document_ids is None:
            self.log_document_ids = log_ids
        else:
            self.log_document_ids = log_document_ids

        # Emit a progress line every N processed items.
        self.log_progress_interval = log_progress_interval
        # When True, sensitive values are masked before being logged.
        self.sanitize_logs = sanitize_logs
        # When True, connection endpoints may appear in debug logs.
        self.show_connection_details = show_connection_details
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ConnectorLoggingMixin:
    """Mixin class providing standardized logging patterns for connectors.

    Provides a default :class:`LoggingConfig` and a ``DataSanitizer``; all
    helpers use lazy %-style logger arguments and route potentially
    sensitive values (URLs, paths, IDs, arbitrary detail dicts) through the
    sanitizer whenever ``sanitize_logs`` is enabled on the active
    configuration.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the mixin by setting up logging configuration and data sanitization.

        It initializes:
        - `_logging_config`: Manages logging behavior and settings.
        - `_sanitizer`: Handles sanitization of sensitive data in logs.

        Args:
            *args: Positional arguments passed to the parent class.
            **kwargs: Keyword arguments passed to the parent class.
        """
        super().__init__(*args, **kwargs)
        self._logging_config = LoggingConfig()
        self._sanitizer = DataSanitizer()

    def set_logging_config(self, config: LoggingConfig):
        """Set the logging configuration for this connector."""
        self._logging_config = config

    def _should_sanitize(self) -> bool:
        """Check if log sanitization is enabled."""
        return self._logging_config.sanitize_logs

    def _maybe_sanitize_dict(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Return ``data`` sanitized when sanitization is enabled, unchanged otherwise."""
        return self._sanitizer.sanitize_dict(data) if self._should_sanitize() else data

    def _maybe_sanitize_url(self, url: str) -> str:
        """Return ``url`` sanitized when sanitization is enabled, unchanged otherwise."""
        return self._sanitizer.sanitize_url(url) if self._should_sanitize() else url

    def log_operation_start(self, operation: str, **kwargs):
        """Log the start of a major operation; parameters go to DEBUG."""
        logger.info("Starting %s", operation)
        if kwargs:
            logger.debug("%s parameters: %s", operation, self._maybe_sanitize_dict(kwargs))

    def log_operation_complete(self, operation: str, count: Optional[int] = None, **kwargs):
        """Log the completion of a major operation; results go to DEBUG."""
        # Explicit None check so a count of 0 is still reported.
        if count is not None:
            logger.info("Completed %s (%s items)", operation, count)
        else:
            logger.info("Completed %s", operation)
        if kwargs:
            logger.debug("%s results: %s", operation, self._maybe_sanitize_dict(kwargs))

    def log_connection_validated(self, connector_type: str, endpoint: Optional[str] = None):
        """Log successful connection validation.

        The endpoint is only logged when ``show_connection_details`` is on.
        """
        if self._logging_config.show_connection_details and endpoint:
            logger.debug(
                "Connection to %s validated successfully: %s",
                connector_type,
                self._maybe_sanitize_url(endpoint),
            )
        else:
            logger.debug("Connection to %s validated successfully", connector_type)

    def log_connection_failed(
        self, connector_type: str, error: Exception, endpoint: Optional[str] = None
    ):
        """Log connection validation failure with the full traceback."""
        if endpoint:
            logger.error(
                "Failed to validate %s connection to %s: %s",
                connector_type,
                self._maybe_sanitize_url(endpoint),
                error,
                exc_info=True,
            )
        else:
            logger.error(
                "Failed to validate %s connection: %s", connector_type, error, exc_info=True
            )

    def log_progress(
        self, current: int, total: int, item_type: str = "items", operation: str = "Processing"
    ):
        """Log progress for long-running operations.

        Emits only every ``log_progress_interval``-th item to keep logs quiet.
        """
        if total > 0 and current % self._logging_config.log_progress_interval == 0:
            progress = (current / total) * 100
            logger.info("%s: %s/%s %s (%.1f%%)", operation, current, total, item_type, progress)

    def log_batch_progress(
        self, batch_num: int, total_batches: int, batch_size: int, operation: str = "Processing"
    ):
        """Log progress for batch operations."""
        logger.info("%s batch %s/%s (%s items)", operation, batch_num, total_batches, batch_size)

    def log_document_operation(
        self,
        operation: str,
        document_location: Optional[str] = None,
        document_id: Optional[str] = None,
        content_size: Optional[int] = None,
        **kwargs,
    ):
        """Log document-related operations (universal for all connector types).

        Location takes precedence over id; both are gated by the logging
        config and masked when sanitization is enabled. Extra kwargs (plus
        ``content_size``) are logged as a details dict at DEBUG level.
        """
        if self._logging_config.log_document_locations and document_location:
            shown_location = (
                self._sanitizer.sanitize_location(document_location)
                if self._should_sanitize()
                else document_location
            )
            logger.debug("%s: %s", operation, shown_location)
        elif self._logging_config.log_document_ids and document_id:
            shown_id = (
                self._sanitizer.sanitize_document_id(document_id)
                if self._should_sanitize()
                else document_id
            )
            logger.debug("%s: %s", operation, shown_id)
        else:
            logger.debug("%s: <document>", operation)

        if content_size is not None:
            kwargs["content_size"] = content_size

        if kwargs:
            logger.debug("%s details: %s", operation, self._maybe_sanitize_dict(kwargs))

    def log_file_operation(
        self,
        operation: str,
        file_path: Optional[str] = None,
        file_id: Optional[str] = None,
        **kwargs,
    ):
        """Log file-related operations (backward compatibility wrapper)."""
        self.log_document_operation(
            operation=operation, document_location=file_path, document_id=file_id, **kwargs
        )

    def log_document_download_start(
        self,
        document_location: Optional[str] = None,
        document_id: Optional[str] = None,
        content_size: Optional[int] = None,
    ):
        """Log the start of a document download/retrieval."""
        logger.info("Starting document download")

        self.log_document_operation(
            "Download",
            document_location=document_location,
            document_id=document_id,
            content_size=content_size,
        )

    def log_document_download_complete(
        self,
        document_location: Optional[str] = None,
        document_id: Optional[str] = None,
        download_path: Optional[str] = None,
        content_size: Optional[int] = None,
        items_retrieved: Optional[int] = None,
    ):
        """Log the completion of a document download/retrieval."""
        logger.info("Document download completed")

        details = {}
        if download_path:
            details["download_path"] = download_path
        if items_retrieved is not None:
            details["items_retrieved"] = items_retrieved

        self.log_document_operation(
            "Download completed",
            document_location=document_location,
            document_id=document_id,
            content_size=content_size,
            **details,
        )

    def log_download_start(
        self,
        file_path: Optional[str] = None,
        file_id: Optional[str] = None,
        file_size: Optional[int] = None,
    ):
        """Log the start of a file download (backward compatibility wrapper)."""
        self.log_document_download_start(
            document_location=file_path, document_id=file_id, content_size=file_size
        )

    def log_download_complete(
        self,
        file_path: Optional[str] = None,
        file_id: Optional[str] = None,
        download_path: Optional[str] = None,
        file_size: Optional[int] = None,
    ):
        """Log the completion of a file download (backward compatibility wrapper)."""
        self.log_document_download_complete(
            document_location=file_path,
            document_id=file_id,
            download_path=download_path,
            content_size=file_size,
        )

    def log_upload_start(
        self,
        file_path: Optional[str] = None,
        destination: Optional[str] = None,
        file_size: Optional[int] = None,
    ):
        """Log the start of a file upload."""
        logger.info("Starting file upload")

        details = {}
        if destination:
            details["destination"] = destination
        # Fix: file_size was previously accepted but silently dropped.
        if file_size is not None:
            details["file_size"] = file_size

        self.log_file_operation("Upload", file_path=file_path, **details)

    def log_upload_complete(
        self,
        file_path: Optional[str] = None,
        destination: Optional[str] = None,
        file_id: Optional[str] = None,
        file_size: Optional[int] = None,
    ):
        """Log the completion of a file upload."""
        logger.info("File upload completed")

        details = {}
        if destination:
            details["destination"] = destination
        if file_id:
            details["file_id"] = file_id
        # Fix: file_size was previously accepted but silently dropped.
        if file_size is not None:
            details["file_size"] = file_size

        self.log_file_operation("Upload completed", file_path=file_path, **details)

    def log_indexing_start(self, source_type: str, count: Optional[int] = None):
        """Log the start of indexing operation."""
        # Fix: explicit None check (was truthiness) so count=0 is still reported,
        # consistent with log_operation_complete.
        if count is not None:
            logger.info("Starting indexing of %s (%s items)", source_type, count)
        else:
            logger.info("Starting indexing of %s", source_type)

    def log_indexing_complete(self, source_type: str, count: int):
        """Log the completion of indexing operation."""
        logger.info("Indexing completed: %s %s items indexed", count, source_type)

    def log_info(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
        """Log an info message with optional context and sanitization."""
        logger.info(message)
        self._log_context("Info", context, **kwargs)

    def log_debug(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
        """Log a debug message with optional context and sanitization."""
        logger.debug(message)
        self._log_context("Debug", context, **kwargs)

    def log_warning(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
        """Log a warning message with optional context and sanitization."""
        logger.warning(message)
        self._log_context("Warning", context, **kwargs)

    def log_error(
        self,
        message: str,
        error: Optional[Exception] = None,
        context: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """Log an error message with optional exception, context and sanitization."""
        if error:
            logger.error("%s: %s", message, error, exc_info=True)
        else:
            logger.error(message)
        self._log_context("Error", context, **kwargs)

    def _log_context(self, log_type: str, context: Optional[Dict[str, Any]], **kwargs):
        """Helper method to log context with sanitization."""
        all_context: Dict[str, Any] = {}
        if context:
            all_context.update(context)
        if kwargs:
            all_context.update(kwargs)

        if all_context:
            logger.debug("%s context: %s", log_type, self._maybe_sanitize_dict(all_context))

    def log_api_call(self, method: str, endpoint: str, status_code: Optional[int] = None, **kwargs):
        """Log API call details at DEBUG level."""
        shown_endpoint = self._maybe_sanitize_url(endpoint)
        if status_code:
            logger.debug("API call: %s %s -> %s", method, shown_endpoint, status_code)
        else:
            logger.debug("API call: %s %s", method, shown_endpoint)

        if kwargs:
            logger.debug("API call details: %s", self._maybe_sanitize_dict(kwargs))
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Any, Dict, Optional, Union
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class DataSanitizer:
    """Utility class for sanitizing sensitive data in logs."""

    @staticmethod
    def _truncate(value: str) -> str:
        """Return a redacted form of a non-empty string.

        Strings shorter than 10 characters show only their first half;
        longer strings show the first and last four characters.
        """
        if len(value) < 10:
            return f"{value[:len(value) // 2]}..."
        return f"{value[:4]}...{value[-4:]}"

    @staticmethod
    def sanitize_path(path: Union[str, Path]) -> str:
        """Sanitize file paths for logging, showing only filename and partial path."""
        if not path:
            return "<empty>"
        path_obj = Path(str(path))
        # Deep paths keep only the last directory plus the filename.
        if len(path_obj.parts) > 2:
            return f".../{path_obj.parent.name}/{path_obj.name}"
        return path_obj.name

    @staticmethod
    def sanitize_id(identifier: str) -> str:
        """Sanitize IDs for logging, showing only first/last few characters."""
        if not identifier:
            return "<id>"
        return DataSanitizer._truncate(identifier)

    @staticmethod
    def sanitize_url(url: str) -> str:
        """Sanitize URLs for logging, removing query parameters (which may hold secrets)."""
        if not url:
            return "<url>"
        try:
            parsed = urlparse(url)
            return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
        except (ValueError, TypeError):
            return "<url>"

    @staticmethod
    def sanitize_token(token: str) -> str:
        """Sanitize tokens and secrets for logging."""
        if not token:
            return "<token>"
        return DataSanitizer._truncate(token)

    @staticmethod
    def sanitize_location(location: Union[str, Path]) -> str:
        """Sanitize document locations (file paths, URLs, database references) for logging."""
        if not location:
            return "<empty>"

        location_str = str(location)

        # Handle URLs
        if location_str.startswith(("http://", "https://", "ftp://", "ftps://")):
            return DataSanitizer.sanitize_url(location_str)

        # Handle database-style references (table:id, collection/document, etc.)
        if ":" in location_str and not location_str.startswith("/"):
            table_name, record_id = location_str.split(":", 1)
            return f"{table_name}:{DataSanitizer.sanitize_id(record_id)}"

        return DataSanitizer.sanitize_path(location_str)

    @staticmethod
    def sanitize_document_id(document_id: str) -> str:
        """Sanitize document IDs for logging (alias for sanitize_id for clarity)."""
        return DataSanitizer.sanitize_id(document_id)

    @staticmethod
    def sanitize_dict(data: Dict[str, Any], sensitive_keys: Optional[set] = None) -> Dict[str, Any]:
        """Sanitize dictionary data for logging.

        Values under secret-like keys are redacted, nested dicts are sanitized
        recursively, path/location-like string values are shortened, and long
        id-like string values are truncated. All other values pass through.
        """
        if sensitive_keys is None:
            sensitive_keys = {
                "password",
                "token",
                "secret",
                "key",
                "api_key",
                "access_token",
                "refresh_token",
                "client_secret",
                "private_key",
                "credentials",
            }

        sanitized = {}
        for k, v in data.items():
            key_lower = k.lower()
            if any(sensitive_key in key_lower for sensitive_key in sensitive_keys):
                sanitized[k] = DataSanitizer.sanitize_token(str(v))
            elif isinstance(v, dict):
                sanitized[k] = DataSanitizer.sanitize_dict(v, sensitive_keys)
            elif isinstance(v, (str, Path)) and (
                # "document_location" already contains "location", so the
                # previous extra check for it was unreachable dead code.
                "path" in key_lower
                or "file" in key_lower
                or "location" in key_lower
            ):
                sanitized[k] = DataSanitizer.sanitize_location(v)
            elif isinstance(v, str) and "id" in key_lower and len(v) > 8:
                # "document_id" contains "id", so one check covers both; the
                # previous duplicate clause could never be reached.
                sanitized[k] = DataSanitizer.sanitize_document_id(v)
            else:
                sanitized[k] = v
        return sanitized
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
from dataclasses import fields
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import TYPE_CHECKING, Optional
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.error import ProviderError, QuotaError, UserAuthError, UserError
|
|
6
|
+
from unstructured_ingest.logger import logger
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from unstructured_client.models.operations import PartitionRequest
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def create_partition_request(filename: Path, parameters_dict: dict) -> "PartitionRequest":
    """Build a PartitionRequest for unstructured-client from a file and API params.

    Parameters not recognized by the installed SDK version are dropped (and
    reported at debug level) before the request is constructed.

    Args:
        filename: Path to the file being partitioned
        parameters_dict: A mapping of all API params we want to send

    Returns: A PartitionRequest containing the file and all valid params
    """
    from unstructured_client.models.operations import PartitionRequest
    from unstructured_client.models.shared import Files, PartitionParameters

    # PartitionParameters became a Pydantic model in unstructured-client
    # v0.26.0; earlier releases define it as a dataclass without model_fields.
    try:
        supported_fields = PartitionParameters.model_fields
    except AttributeError:
        supported_fields = [f.name for f in fields(PartitionParameters)]

    request_kwargs = {
        name: value for name, value in parameters_dict.items() if name in supported_fields
    }
    if len(request_kwargs) != len(parameters_dict):
        omitted = ", ".join([v for v in parameters_dict if v not in request_kwargs])
        logger.debug(
            "Following fields were omitted due to not being "
            "supported by the currently used unstructured client: {}".format(omitted)
        )

    logger.debug(f"using hosted partitioner with kwargs: {parameters_dict}")

    with open(filename, "rb") as file_handle:
        request_kwargs["files"] = Files(
            content=file_handle.read(),
            file_name=str(filename.resolve()),
        )

    return PartitionRequest(partition_parameters=PartitionParameters(**request_kwargs))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def wrap_error(e: Exception) -> Exception:
    """Map unstructured-client exceptions onto this package's error hierarchy.

    Returns the wrapped error for the caller to raise. Exceptions that are not
    recognized client/API errors are logged and re-raised as-is.
    """
    from unstructured_client.models.errors.httpvalidationerror import HTTPValidationError
    from unstructured_client.models.errors.sdkerror import SDKError
    from unstructured_client.models.errors.servererror import ServerError

    if isinstance(e, HTTPValidationError):
        return UserError(e.data.detail)
    if isinstance(e, ServerError):
        return ProviderError(e.data.detail)

    if not isinstance(e, SDKError):
        logger.error(f"Uncaught Error calling API: {e}")
        raise e

    status, body = e.status_code, e.body
    if status == 402:
        return QuotaError(body)
    if status in (401, 403):
        return UserAuthError(body)
    if 400 <= status < 500:
        return UserError(body)
    if status >= 500:
        return ProviderError(body)

    # Status below 400 on an SDKError is unexpected; surface it unchanged.
    logger.error(f"Uncaught Error calling API: {e}")
    raise e
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
async def call_api_async(
    server_url: Optional[str],
    api_key: Optional[str],
    filename: Path,
    api_parameters: dict,
    timeout_ms: Optional[int] = None,
) -> list[dict]:
    """Call the Unstructured API asynchronously using unstructured-client.

    Args:
        server_url: The base URL where the API is hosted
        api_key: The user's API key (can be empty if this is a self hosted API)
        filename: Path to the file being partitioned
        api_parameters: A dict containing the requested API parameters
        timeout_ms: Optional request timeout in milliseconds

    Returns: A list of the file's elements, or an empty list if there was an error
    """
    from unstructured_client import UnstructuredClient

    client = UnstructuredClient(server_url=server_url, api_key_auth=api_key)
    request = create_partition_request(filename=filename, parameters_dict=api_parameters)
    try:
        response = await client.general.partition_async(request=request, timeout_ms=timeout_ms)
    except Exception as e:
        # Translate client/API errors into this package's error hierarchy.
        raise wrap_error(e)
    return response.elements or []
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def call_api(
    server_url: Optional[str],
    api_key: Optional[str],
    filename: Path,
    api_parameters: dict,
    timeout_ms: Optional[int] = None,
) -> list[dict]:
    """Call the Unstructured API synchronously using unstructured-client.

    Args:
        server_url: The base URL where the API is hosted
        api_key: The user's API key (can be empty if this is a self hosted API)
        filename: Path to the file being partitioned
        api_parameters: A dict containing the requested API parameters
        timeout_ms: Optional request timeout in milliseconds

    Returns: A list of the file's elements, or an empty list if there was an error
    """
    from unstructured_client import UnstructuredClient

    client = UnstructuredClient(server_url=server_url, api_key_auth=api_key)
    request = create_partition_request(filename=filename, parameters_dict=api_parameters)
    try:
        response = client.general.partition(request=request, timeout_ms=timeout_ms)
    except Exception as e:
        # Translate client/API errors into this package's error hierarchy.
        raise wrap_error(e)
    return response.elements or []
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import hashlib
|
|
3
|
+
import json
|
|
4
|
+
import zlib
|
|
5
|
+
from itertools import groupby
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def id_to_hash(element: dict, sequence_number: int) -> str:
    """Calculates and assigns a deterministic hash as an ID.

    The hash ID is based on element's text, sequence number on page,
    page number and its filename.

    Args:
        element: the element dict to update; its "element_id" is overwritten
        sequence_number: index on page

    Returns: new ID value
    """
    filename = element["metadata"].get("filename")
    text = element["text"]
    page_number = element["metadata"].get("page_number")
    # BUG FIX: `filename` was read but never hashed — a literal "(unknown)"
    # was interpolated instead, so identical text/page/sequence in different
    # files produced colliding IDs. Include the filename as documented.
    data = f"{filename}{text}{page_number}{sequence_number}"
    element["element_id"] = hashlib.sha256(data.encode()).hexdigest()[:32]
    return element["element_id"]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def assign_and_map_hash_ids(elements: list[dict]) -> list[dict]:
    """Replace element IDs with deterministic hashes and remap parent IDs."""
    elements = elements.copy()

    # -- number elements consecutively within each run of the same page --
    pages = [element["metadata"].get("page_number") for element in elements]
    sequence_numbers = [
        seq_on_page for _, run in groupby(pages) for seq_on_page, _ in enumerate(run)
    ]

    # -- assign new hash IDs, remembering which old ID each one replaces --
    old_to_new_mapping = {}
    for element, seq_on_page in zip(elements, sequence_numbers):
        old_to_new_mapping[element["element_id"]] = id_to_hash(
            element=element, sequence_number=seq_on_page
        )

    # -- rewrite parent references to point at the new IDs --
    for element in elements:
        old_parent_id = element["metadata"].get("parent_id")
        if old_parent_id:
            element["metadata"]["parent_id"] = old_to_new_mapping[old_parent_id]

    return elements
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def elements_from_base64_gzipped_json(raw_s: str) -> list[dict]:
    """Decode a base64 string, zlib-decompress it, and parse the JSON payload.

    Returns the list of element dicts encoded in *raw_s*.
    """
    compressed_bytes = base64.b64decode(raw_s)
    json_text = zlib.decompress(compressed_bytes).decode("utf-8")
    return json.loads(json_text)
|