unstructured-ingest 1.0.55__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/azure_openai.py +7 -2
- unstructured_ingest/embed/openai.py +11 -4
- unstructured_ingest/interfaces/connector.py +7 -1
- unstructured_ingest/otel.py +16 -1
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +106 -14
- unstructured_ingest/processes/connectors/fsspec/s3.py +14 -4
- unstructured_ingest/processes/connectors/onedrive.py +9 -8
- unstructured_ingest/processes/connectors/sharepoint.py +3 -2
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/utils/__init__.py +1 -1
- unstructured_ingest/utils/html.py +1 -0
- unstructured_ingest/utils/tls.py +15 -0
- {unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/METADATA +2 -1
- {unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/RECORD +20 -17
- {unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/licenses/LICENSE.md +0 -0
unstructured_ingest/__version__.py
CHANGED

@@ -1 +1 @@
-__version__ = "1.0.55"
+__version__ = "1.1.0" # pragma: no cover
unstructured_ingest/embed/azure_openai.py
CHANGED

@@ -9,6 +9,7 @@ from unstructured_ingest.embed.openai import (
     OpenAIEmbeddingEncoder,
 )
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override

 if TYPE_CHECKING:
     from openai import AsyncAzureOpenAI, AzureOpenAI
@@ -23,9 +24,11 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):

     @requires_dependencies(["openai"], extras="openai")
     def get_client(self) -> "AzureOpenAI":
-        from openai import AzureOpenAI
+        from openai import AzureOpenAI, DefaultHttpxClient

+        client = DefaultHttpxClient(verify=ssl_context_with_optional_ca_override())
         return AzureOpenAI(
+            http_client=client,
             api_key=self.api_key.get_secret_value(),
             api_version=self.api_version,
             azure_endpoint=self.azure_endpoint,
@@ -33,9 +36,11 @@ class AzureOpenAIEmbeddingConfig(OpenAIEmbeddingConfig):

     @requires_dependencies(["openai"], extras="openai")
     def get_async_client(self) -> "AsyncAzureOpenAI":
-        from openai import AsyncAzureOpenAI
+        from openai import AsyncAzureOpenAI, DefaultAsyncHttpxClient

+        client = DefaultAsyncHttpxClient(verify=ssl_context_with_optional_ca_override())
         return AsyncAzureOpenAI(
+            http_client=client,
             api_key=self.api_key.get_secret_value(),
             api_version=self.api_version,
             azure_endpoint=self.azure_endpoint,
unstructured_ingest/embed/openai.py
CHANGED

@@ -18,6 +18,7 @@ from unstructured_ingest.errors_v2 import (
 )
 from unstructured_ingest.logger import logger
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override

 if TYPE_CHECKING:
     from openai import AsyncOpenAI, OpenAI
@@ -86,15 +87,21 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):

     @requires_dependencies(["openai"], extras="openai")
     def get_client(self) -> "OpenAI":
-        from openai import OpenAI
+        from openai import DefaultHttpxClient, OpenAI

-
+        client = DefaultHttpxClient(verify=ssl_context_with_optional_ca_override())
+        return OpenAI(
+            api_key=self.api_key.get_secret_value(), http_client=client, base_url=self.base_url
+        )

     @requires_dependencies(["openai"], extras="openai")
     def get_async_client(self) -> "AsyncOpenAI":
-        from openai import AsyncOpenAI
+        from openai import AsyncOpenAI, DefaultAsyncHttpxClient

-
+        client = DefaultAsyncHttpxClient(verify=ssl_context_with_optional_ca_override())
+        return AsyncOpenAI(
+            api_key=self.api_key.get_secret_value(), http_client=client, base_url=self.base_url
+        )


 @dataclass
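Both embed modules follow the same pattern: build an httpx client whose TLS verification uses the new CA-override context and hand it to the OpenAI SDK. Below is a minimal stand-alone sketch of that wiring, assuming the openai, httpx, and certifi packages are installed; the API key is a placeholder, not a real value.

import os
import ssl

import certifi
from openai import DefaultHttpxClient, OpenAI

# Same idea as ssl_context_with_optional_ca_override: prefer REQUESTS_CA_BUNDLE, fall back to certifi.
ca_bundle = os.environ.get("REQUESTS_CA_BUNDLE", certifi.where())
ssl_context = ssl.create_default_context(cafile=ca_bundle)

client = OpenAI(
    api_key="sk-placeholder",  # placeholder value, not a real key
    http_client=DefaultHttpxClient(verify=ssl_context),  # route all SDK traffic through the custom context
)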
unstructured_ingest/interfaces/connector.py
CHANGED

@@ -5,6 +5,8 @@ from typing import Any, TypeVar, Union
 from pydantic import BaseModel, Secret, model_validator
 from pydantic.types import _SecretBase

+from unstructured_ingest.processes.utils.logging.connector import ConnectorLoggingMixin
+

 class AccessConfig(BaseModel):
     """Meant to designate holding any sensitive information associated with other configs
@@ -46,5 +48,9 @@ ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)


 @dataclass
-class BaseConnector(ABC):
+class BaseConnector(ABC, ConnectorLoggingMixin):
     connection_config: ConnectionConfigT
+
+    def __post_init__(self):
+        """Initialize the logging mixin after dataclass initialization."""
+        ConnectorLoggingMixin.__init__(self)
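Because BaseConnector is a dataclass, the generated __init__ never runs the mixin's cooperative __init__, which is why the hunk adds the explicit call in __post_init__. A simplified, self-contained illustration of that behaviour; the classes below are stand-ins, not the library's.

from dataclasses import dataclass


class LoggingMixinSketch:
    def __init__(self):
        # Stands in for the real mixin's _logging_config / _sanitizer setup.
        self.configured = True


@dataclass
class ConnectorSketch(LoggingMixinSketch):
    connection_config: str

    def __post_init__(self):
        # Without this explicit call, the attributes set in LoggingMixinSketch.__init__
        # would never exist on dataclass instances.
        LoggingMixinSketch.__init__(self)


assert ConnectorSketch(connection_config="cfg").configured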
unstructured_ingest/otel.py
CHANGED

@@ -1,3 +1,4 @@
+import logging
 import os
 from dataclasses import dataclass, field
 from typing import Callable, ClassVar, Optional, Protocol, Sequence
@@ -31,13 +32,27 @@ class LogSpanExporter(ConsoleSpanExporter):
         self.log_out(self.formatter(span))
         return SpanExportResult.SUCCESS

+def get_log_out() -> Callable:
+    level_names_mapping = {
+        'CRITICAL': logging.CRITICAL,
+        'FATAL': logging.FATAL,
+        'ERROR': logging.ERROR,
+        'WARN': logging.WARNING,
+        'WARNING': logging.WARNING,
+        'INFO': logging.INFO,
+        'DEBUG': logging.DEBUG,
+        'NOTSET': logging.NOTSET,
+    }
+    log_level = os.getenv("OTEL_LOG_LEVEL", "DEBUG").upper()
+    log_level_int = level_names_mapping.get(log_level, logging.DEBUG)
+    return lambda message: logger.log(log_level_int, message)

 @dataclass
 class OtelHandler:
     otel_endpoint: Optional[str] = None
     service_name: str = "unstructured-ingest"
     trace_provider: TracerProvider = field(init=False)
-    log_out: Callable = field(default=
+    log_out: Callable = field(default=get_log_out())
     trace_context_key: ClassVar[str] = "_trace_context"

     def init_trace(self):
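The new get_log_out helper reads OTEL_LOG_LEVEL when the field default is evaluated and falls back to DEBUG for unknown or unset values. A quick check of that lookup, re-implemented locally rather than imported from the package:

import logging
import os

# Same name-to-level table as get_log_out above (WARN and FATAL are stdlib aliases).
levels = {name: getattr(logging, name)
          for name in ("CRITICAL", "FATAL", "ERROR", "WARN", "WARNING", "INFO", "DEBUG", "NOTSET")}

os.environ["OTEL_LOG_LEVEL"] = "warning"
selected = levels.get(os.environ.get("OTEL_LOG_LEVEL", "DEBUG").upper(), logging.DEBUG)
assert selected == logging.WARNING

del os.environ["OTEL_LOG_LEVEL"]
assert levels.get(os.environ.get("OTEL_LOG_LEVEL", "DEBUG").upper(), logging.DEBUG) == logging.DEBUG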
unstructured_ingest/processes/connectors/fsspec/fsspec.py
CHANGED

@@ -28,7 +28,6 @@ from unstructured_ingest.interfaces import (
     Uploader,
     UploaderConfig,
 )
-from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connectors.fsspec.utils import sterilize_dict
 from unstructured_ingest.utils.filesystem import mkdir_concurrent_safe

@@ -106,6 +105,12 @@ class FsspecIndexer(Indexer):
     def precheck(self) -> None:
         from fsspec import get_filesystem_class

+        self.log_operation_start(
+            "Connection validation",
+            protocol=self.index_config.protocol,
+            path=self.index_config.path_without_protocol,
+        )
+
         try:
             fs = get_filesystem_class(self.index_config.protocol)(
                 **self.connection_config.get_access_config(),
@@ -113,13 +118,24 @@ class FsspecIndexer(Indexer):
             files = fs.ls(path=self.index_config.path_without_protocol, detail=True)
             valid_files = [x.get("name") for x in files if x.get("type") == "file"]
             if not valid_files:
+                self.log_operation_complete("Connection validation", count=0)
                 return
             file_to_sample = valid_files[0]
-
+            self.log_debug(f"attempting to make HEAD request for file: {file_to_sample}")
             with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
                 client.head(path=file_to_sample)
+
+            self.log_connection_validated(
+                connector_type=self.connector_type,
+                endpoint=f"{self.index_config.protocol}://{self.index_config.path_without_protocol}",
+            )
+
         except Exception as e:
-
+            self.log_connection_failed(
+                connector_type=self.connector_type,
+                error=e,
+                endpoint=f"{self.index_config.protocol}://{self.index_config.path_without_protocol}",
+            )
             raise self.wrap_error(e=e)

     def get_file_info(self) -> list[dict[str, Any]]:
@@ -150,7 +166,7 @@ class FsspecIndexer(Indexer):

     def sample_n_files(self, files: list[dict[str, Any]], n) -> list[dict[str, Any]]:
         if len(files) <= n:
-
+            self.log_warning(
                 f"number of files to be sampled={n} is not smaller than the number"
                 f" of files found ({len(files)}). Returning all of the files as the"
                 " sample."
@@ -201,9 +217,22 @@ class FsspecIndexer(Indexer):
         init_file_data.additional_metadata = self.get_metadata(file_info=file_info)

     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        self.log_indexing_start(f"{self.connector_type} files")
+
         files = self.get_file_info()
-
+        total_files = len(files)
+
+        self.log_operation_start("File indexing", total_files=total_files)
+
+        for i, file_info in enumerate(files):
             file_path = self.get_path(file_info=file_info)
+
+            # Only log progress for larger operations
+            if total_files > 5:
+                self.log_progress(
+                    current=i + 1, total=total_files, item_type="files", operation="Indexing"
+                )
+
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
             rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")
@@ -223,6 +252,8 @@ class FsspecIndexer(Indexer):
                 display_name=file_path,
             )

+        self.log_indexing_complete(f"{self.connector_type} files", total_files)
+

 class FsspecDownloaderConfig(DownloaderConfig):
     pass
@@ -272,25 +303,57 @@ class FsspecDownloader(Downloader):
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         mkdir_concurrent_safe(download_path.parent)
+
+        rpath = file_data.additional_metadata["original_file_path"]
+        file_size = file_data.metadata.filesize_bytes
+        self.log_download_start(file_path=rpath, file_id=file_data.identifier, file_size=file_size)
+
         try:
-            rpath = file_data.additional_metadata["original_file_path"]
             with self.connection_config.get_client(protocol=self.protocol) as client:
                 client.get_file(rpath=rpath, lpath=download_path.as_posix())
             self.handle_directory_download(lpath=download_path)
+
         except Exception as e:
+            self.log_error(
+                "File download failed",
+                error=e,
+                context={"file_path": rpath, "file_id": file_data.identifier},
+            )
             raise self.wrap_error(e=e)
+
+        self.log_download_complete(
+            file_path=rpath,
+            file_id=file_data.identifier,
+            download_path=str(download_path),
+        )
+
         return self.generate_download_response(file_data=file_data, download_path=download_path)

     async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         mkdir_concurrent_safe(download_path.parent)
+        rpath = file_data.additional_metadata["original_file_path"]
+        file_size = file_data.metadata.filesize_bytes
+        self.log_download_start(file_path=rpath, file_id=file_data.identifier, file_size=file_size)
+
         try:
-            rpath = file_data.additional_metadata["original_file_path"]
             with self.connection_config.get_client(protocol=self.protocol) as client:
                 await client.get_file(rpath=rpath, lpath=download_path.as_posix())
             self.handle_directory_download(lpath=download_path)
         except Exception as e:
+            self.log_error(
+                "File download failed",
+                error=e,
+                context={"file_path": rpath, "file_id": file_data.identifier},
+            )
             raise self.wrap_error(e=e)
+
+        self.log_download_complete(
+            file_path=rpath,
+            file_id=file_data.identifier,
+            download_path=str(download_path),
+        )
+
         return self.generate_download_response(file_data=file_data, download_path=download_path)


@@ -321,6 +384,7 @@ class FsspecUploader(Uploader):
         )

     def __post_init__(self):
+        super().__post_init__()
         # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove:
         if not self.upload_config:
             raise TypeError(
@@ -334,6 +398,8 @@ class FsspecUploader(Uploader):
     def precheck(self) -> None:
         from fsspec import get_filesystem_class

+        self.log_operation_start("Connection validation", protocol=self.upload_config.protocol)
+
         try:
             fs = get_filesystem_class(self.upload_config.protocol)(
                 **self.connection_config.get_access_config(),
@@ -341,7 +407,16 @@ class FsspecUploader(Uploader):
             upload_path = Path(self.upload_config.path_without_protocol) / "_empty"
             fs.write_bytes(path=upload_path.as_posix(), value=b"")
         except Exception as e:
+            self.log_connection_failed(
+                connector_type=self.connector_type,
+                error=e,
+                endpoint=f"{self.upload_config.protocol}://{self.upload_config.path_without_protocol}",
+            )
             raise self.wrap_error(e=e)
+        self.log_connection_validated(
+            connector_type=self.connector_type,
+            endpoint=f"{self.upload_config.protocol}://{self.upload_config.path_without_protocol}",
+        )

     def get_upload_path(self, file_data: FileData) -> Path:
         upload_path = Path(
@@ -353,14 +428,31 @@
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
-
-
-
+        self.log_upload_start(file_path=path_str, destination=upload_path.as_posix())
+        try:
+            with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+                client.upload(lpath=path_str, rpath=upload_path.as_posix())
+        except Exception as e:
+            self.log_error(
+                "File upload failed",
+                error=e,
+                context={"file_path": path_str, "destination": upload_path.as_posix()},
+            )
+            raise self.wrap_error(e=e)
+        self.log_upload_complete(file_path=path_str, destination=upload_path.as_posix())

     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
-
-
-
-
+        self.log_upload_start(file_path=path_str, destination=upload_path.as_posix())
+        try:
+            with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+                client.upload(lpath=path_str, rpath=upload_path.as_posix())
+        except Exception as e:
+            self.log_error(
+                "File upload failed",
+                error=e,
+                context={"file_path": path_str, "destination": upload_path.as_posix()},
+            )
+            raise self.wrap_error(e=e)
+        self.log_upload_complete(file_path=path_str, destination=upload_path.as_posix())
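Two thresholds govern how chatty the new indexer progress logging is: the indexer only calls log_progress when more than five files were listed, and log_progress itself only emits on multiples of LoggingConfig.log_progress_interval (default 10). A rough illustration with made-up numbers:

interval = 10      # LoggingConfig.log_progress_interval default
total_files = 42   # hypothetical listing size

emitted = [
    i for i in range(1, total_files + 1)
    if total_files > 5 and i % interval == 0
]
print(emitted)  # [10, 20, 30, 40] -> four "Indexing: N/42 files (..%)" log lines for 42 files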
unstructured_ingest/processes/connectors/fsspec/s3.py
CHANGED

@@ -104,7 +104,13 @@ class S3ConnectionConfig(FsspecConnectionConfig):
             return UserError(message)
         if http_code >= 500:
             return ProviderError(message)
-        logger.error(
+        logger.error(
+            "Unhandled exception from S3 (type: %s, endpoint: %s): %s",
+            type(e).__name__,
+            self.endpoint_url or "default",
+            e,
+            exc_info=True,
+        )
         return e

@@ -122,6 +128,10 @@ class S3Indexer(FsspecIndexer):

     def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
         path = file_info["Key"]
+
+        self.log_debug("Getting metadata for S3 object", context={"file_path": path})
+        self.log_file_operation("Getting metadata", file_path=path)
+
         date_created = None
         date_modified = None
         modified = file_info.get("LastModified")
@@ -147,9 +157,9 @@ class S3Indexer(FsspecIndexer):
             record_locator["metadata"] = metadata
         issue_characters = [char for char in CHARACTERS_TO_AVOID if char in path]
         if issue_characters:
-
-                f"File path
-
+            self.log_warning(
+                f"File path contains characters that can cause issues with S3: {issue_characters}",
+                context={"path": path, "problematic_characters": issue_characters},
             )
         return FileDataSourceMetadata(
             date_created=date_created,
unstructured_ingest/processes/connectors/onedrive.py
CHANGED

@@ -115,23 +115,24 @@ class OnedriveConnectionConfig(ConnectionConfig):
         except ValueError as exc:
             logger.error("Couldn't set up credentials.")
             raise exc
-
+
         if "error" in token:
             error_codes = token.get("error_codes", [])
             error_type = token.get("error", "")
             error_description = token.get("error_description", "")
-
+
             # 7000215: Invalid client secret provided
             # 7000218: Invalid client id provided
             # 700016: Application not found in directory
             # 90002: Tenant not found
             auth_error_codes = [7000215, 7000218, 700016, 90002]
-
-            if
-
-
-
-
+
+            if any(code in error_codes for code in auth_error_codes) or error_type in [
+                "invalid_client",
+                "unauthorized_client",
+                "invalid_grant",
+            ]:
+                raise UserAuthError(f"Authentication failed: {error_type}: {error_description}")
             else:
                 raise SourceConnectionNetworkError(
                     f"Failed to fetch token: {error_type}: {error_description}"
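The reworked token handling classifies failures before raising: known AAD error codes or OAuth error strings become a user-auth error, anything else stays a network error. A small sketch of that check against a fabricated token response:

auth_error_codes = [7000215, 7000218, 700016, 90002]
auth_error_types = ["invalid_client", "unauthorized_client", "invalid_grant"]

token = {  # fabricated example response, not real data
    "error": "invalid_client",
    "error_codes": [7000215],
    "error_description": "Invalid client secret provided.",
}

is_auth_error = (
    any(code in token.get("error_codes", []) for code in auth_error_codes)
    or token.get("error", "") in auth_error_types
)
print(is_auth_error)  # True -> surfaced as UserAuthError instead of SourceConnectionNetworkError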
unstructured_ingest/processes/connectors/sharepoint.py
CHANGED

@@ -87,6 +87,7 @@ class SharepointIndexerConfig(OnedriveIndexerConfig):
     # TODO: We can probably make path non-optional on OnedriveIndexerConfig once tested
     path: str = Field(default="")

+
 @dataclass
 class SharepointIndexer(OnedriveIndexer):
     connection_config: SharepointConnectionConfig
@@ -114,14 +115,14 @@ class SharepointIndexer(OnedriveIndexer):
     def _is_root_path(self, path: str) -> bool:
         """Check if the path represents root access (empty string or legacy default)."""
         return not path or not path.strip() or path == LEGACY_DEFAULT_PATH
-
+
     def _get_target_drive_item(self, site_drive_item: DriveItem, path: str) -> DriveItem:
         """Get the drive item to search in based on the path."""
         if self._is_root_path(path):
             return site_drive_item
         else:
             return site_drive_item.get_by_path(path).get().execute_query()
-
+
     def _validate_folder_path(self, site_drive_item: DriveItem, path: str) -> None:
         """Validate that a specific folder path exists and is accessible."""
         from office365.runtime.client_request_exception import ClientRequestException
unstructured_ingest/processes/utils/logging/connector.py
ADDED (365 lines)

from typing import Any, Dict, Optional

from unstructured_ingest.logger import logger
from unstructured_ingest.processes.utils.logging.sanitizer import DataSanitizer


class LoggingConfig:
    """Configuration for connector logging behavior."""

    def __init__(
        self,
        log_file_paths: bool = False,
        log_document_locations: Optional[bool] = None,
        log_ids: bool = False,
        log_document_ids: Optional[bool] = None,
        log_progress_interval: int = 10,
        sanitize_logs: bool = True,
        show_connection_details: bool = False,
    ):
        # Backward compatibility: if new parameters aren't specified, use old ones
        self.log_file_paths = log_file_paths
        self.log_document_locations = (
            log_document_locations if log_document_locations is not None else log_file_paths
        )

        self.log_ids = log_ids
        self.log_document_ids = log_document_ids if log_document_ids is not None else log_ids

        self.log_progress_interval = log_progress_interval
        self.sanitize_logs = sanitize_logs
        self.show_connection_details = show_connection_details


class ConnectorLoggingMixin:
    """Mixin class providing standardized logging patterns for connectors."""

    def __init__(self, *args, **kwargs):
        """
        Initialize the mixin by setting up logging configuration and data sanitization.

        This method ensures that the mixin provides standardized logging patterns for connectors.
        It initializes:
        - `_logging_config`: Manages logging behavior and settings.
        - `_sanitizer`: Handles sanitization of sensitive data in logs.

        Args:
            *args: Positional arguments passed to the parent class.
            **kwargs: Keyword arguments passed to the parent class.
        """
        super().__init__(*args, **kwargs)
        self._logging_config = LoggingConfig()
        self._sanitizer = DataSanitizer()

    def set_logging_config(self, config: LoggingConfig):
        """Set the logging configuration for this connector."""
        self._logging_config = config

    def _should_sanitize(self) -> bool:
        """Check if log sanitization is enabled."""
        return self._logging_config.sanitize_logs

    def log_operation_start(self, operation: str, **kwargs):
        """Log the start of a major operation."""
        logger.info("Starting %s", operation)

        if kwargs:
            if self._should_sanitize():
                sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
                logger.debug("%s parameters: %s", operation, sanitized_kwargs)
            else:
                logger.debug("%s parameters: %s", operation, kwargs)

    def log_operation_complete(self, operation: str, count: Optional[int] = None, **kwargs):
        """Log the completion of a major operation."""
        if count is not None:
            logger.info("Completed %s (%s items)", operation, count)
        else:
            logger.info("Completed %s", operation)

        if kwargs:
            if self._should_sanitize():
                sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
                logger.debug("%s results: %s", operation, sanitized_kwargs)
            else:
                logger.debug("%s results: %s", operation, kwargs)

    def log_connection_validated(self, connector_type: str, endpoint: Optional[str] = None):
        """Log successful connection validation."""
        if self._logging_config.show_connection_details and endpoint:
            if self._should_sanitize():
                sanitized_endpoint = self._sanitizer.sanitize_url(endpoint)
                logger.debug(
                    "Connection to %s validated successfully: %s",
                    connector_type,
                    sanitized_endpoint,
                )
            else:
                logger.debug(
                    "Connection to %s validated successfully: %s", connector_type, endpoint
                )
        else:
            logger.debug("Connection to %s validated successfully", connector_type)

    def log_connection_failed(
        self, connector_type: str, error: Exception, endpoint: Optional[str] = None
    ):
        """Log connection validation failure."""
        if endpoint:
            if self._should_sanitize():
                sanitized_endpoint = self._sanitizer.sanitize_url(endpoint)
                logger.error(
                    "Failed to validate %s connection to %s: %s",
                    connector_type,
                    sanitized_endpoint,
                    error,
                    exc_info=True,
                )
            else:
                logger.error(
                    "Failed to validate %s connection to %s: %s",
                    connector_type,
                    endpoint,
                    error,
                    exc_info=True,
                )
        else:
            logger.error(
                "Failed to validate %s connection: %s", connector_type, error, exc_info=True
            )

    def log_progress(
        self, current: int, total: int, item_type: str = "items", operation: str = "Processing"
    ):
        """Log progress for long-running operations."""
        if total > 0 and current % self._logging_config.log_progress_interval == 0:
            progress = (current / total) * 100
            logger.info("%s: %s/%s %s (%.1f%%)", operation, current, total, item_type, progress)

    def log_batch_progress(
        self, batch_num: int, total_batches: int, batch_size: int, operation: str = "Processing"
    ):
        """Log progress for batch operations."""
        logger.info("%s batch %s/%s (%s items)", operation, batch_num, total_batches, batch_size)

    def log_document_operation(
        self,
        operation: str,
        document_location: Optional[str] = None,
        document_id: Optional[str] = None,
        content_size: Optional[int] = None,
        **kwargs,
    ):
        """Log document-related operations (universal for all connector types)."""
        if self._logging_config.log_document_locations and document_location:
            if self._should_sanitize():
                sanitized_location = self._sanitizer.sanitize_location(document_location)
                logger.debug("%s: %s", operation, sanitized_location)
            else:
                logger.debug("%s: %s", operation, document_location)
        elif self._logging_config.log_document_ids and document_id:
            if self._should_sanitize():
                sanitized_id = self._sanitizer.sanitize_document_id(document_id)
                logger.debug("%s: %s", operation, sanitized_id)
            else:
                logger.debug("%s: %s", operation, document_id)
        else:
            logger.debug("%s: <document>", operation)

        if content_size is not None:
            kwargs["content_size"] = content_size

        if kwargs:
            if self._should_sanitize():
                sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
                logger.debug("%s details: %s", operation, sanitized_kwargs)
            else:
                logger.debug("%s details: %s", operation, kwargs)

    def log_file_operation(
        self,
        operation: str,
        file_path: Optional[str] = None,
        file_id: Optional[str] = None,
        **kwargs,
    ):
        """Log file-related operations (backward compatibility wrapper)."""
        self.log_document_operation(
            operation=operation, document_location=file_path, document_id=file_id, **kwargs
        )

    def log_document_download_start(
        self,
        document_location: Optional[str] = None,
        document_id: Optional[str] = None,
        content_size: Optional[int] = None,
    ):
        """Log the start of a document download/retrieval."""
        logger.info("Starting document download")

        self.log_document_operation(
            "Download",
            document_location=document_location,
            document_id=document_id,
            content_size=content_size,
        )

    def log_document_download_complete(
        self,
        document_location: Optional[str] = None,
        document_id: Optional[str] = None,
        download_path: Optional[str] = None,
        content_size: Optional[int] = None,
        items_retrieved: Optional[int] = None,
    ):
        """Log the completion of a document download/retrieval."""
        logger.info("Document download completed")

        details = {}
        if download_path:
            details["download_path"] = download_path
        if items_retrieved is not None:
            details["items_retrieved"] = items_retrieved

        self.log_document_operation(
            "Download completed",
            document_location=document_location,
            document_id=document_id,
            content_size=content_size,
            **details,
        )

    def log_download_start(
        self,
        file_path: Optional[str] = None,
        file_id: Optional[str] = None,
        file_size: Optional[int] = None,
    ):
        """Log the start of a file download (backward compatibility wrapper)."""
        self.log_document_download_start(
            document_location=file_path, document_id=file_id, content_size=file_size
        )

    def log_download_complete(
        self,
        file_path: Optional[str] = None,
        file_id: Optional[str] = None,
        download_path: Optional[str] = None,
        file_size: Optional[int] = None,
    ):
        """Log the completion of a file download (backward compatibility wrapper)."""
        self.log_document_download_complete(
            document_location=file_path,
            document_id=file_id,
            download_path=download_path,
            content_size=file_size,
        )

    def log_upload_start(
        self,
        file_path: Optional[str] = None,
        destination: Optional[str] = None,
        file_size: Optional[int] = None,
    ):
        """Log the start of a file upload."""
        logger.info("Starting file upload")

        details = {}
        if destination:
            details["destination"] = destination

        self.log_file_operation("Upload", file_path=file_path, **details)

    def log_upload_complete(
        self,
        file_path: Optional[str] = None,
        destination: Optional[str] = None,
        file_id: Optional[str] = None,
        file_size: Optional[int] = None,
    ):
        """Log the completion of a file upload."""
        logger.info("File upload completed")

        details = {}
        if destination:
            details["destination"] = destination
        if file_id:
            details["file_id"] = file_id

        self.log_file_operation("Upload completed", file_path=file_path, **details)

    def log_indexing_start(self, source_type: str, count: Optional[int] = None):
        """Log the start of indexing operation."""
        if count:
            logger.info("Starting indexing of %s (%s items)", source_type, count)
        else:
            logger.info("Starting indexing of %s", source_type)

    def log_indexing_complete(self, source_type: str, count: int):
        """Log the completion of indexing operation."""
        logger.info("Indexing completed: %s %s items indexed", count, source_type)

    def log_info(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
        """Log an info message with optional context and sanitization."""
        logger.info(message)
        self._log_context("Info", context, **kwargs)

    def log_debug(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
        """Log a debug message with optional context and sanitization."""
        logger.debug(message)
        self._log_context("Debug", context, **kwargs)

    def log_warning(self, message: str, context: Optional[Dict[str, Any]] = None, **kwargs):
        """Log a warning message with optional context and sanitization."""
        logger.warning(message)
        self._log_context("Warning", context, **kwargs)

    def log_error(
        self,
        message: str,
        error: Optional[Exception] = None,
        context: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """Log an error message with optional exception, context and sanitization."""
        if error:
            logger.error("%s: %s", message, error, exc_info=True)
        else:
            logger.error(message)
        self._log_context("Error", context, **kwargs)

    def _log_context(self, log_type: str, context: Optional[Dict[str, Any]], **kwargs):
        """Helper method to log context with sanitization."""
        all_context = {}
        if context:
            all_context.update(context)
        if kwargs:
            all_context.update(kwargs)

        if all_context:
            if self._should_sanitize():
                sanitized_context = self._sanitizer.sanitize_dict(all_context)
                logger.debug("%s context: %s", log_type, sanitized_context)
            else:
                logger.debug("%s context: %s", log_type, all_context)

    def log_api_call(self, method: str, endpoint: str, status_code: Optional[int] = None, **kwargs):
        """Log API call details."""
        if self._should_sanitize():
            sanitized_endpoint = self._sanitizer.sanitize_url(endpoint)
            if status_code:
                logger.debug("API call: %s %s -> %s", method, sanitized_endpoint, status_code)
            else:
                logger.debug("API call: %s %s", method, sanitized_endpoint)
        else:
            if status_code:
                logger.debug("API call: %s %s -> %s", method, endpoint, status_code)
            else:
                logger.debug("API call: %s %s", method, endpoint)

        if kwargs:
            if self._should_sanitize():
                sanitized_kwargs = self._sanitizer.sanitize_dict(kwargs)
                logger.debug("API call details: %s", sanitized_kwargs)
            else:
                logger.debug("API call details: %s", kwargs)
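In practice a connector only has to inherit the mixin (directly or via BaseConnector) and call the high-level helpers, with LoggingConfig tuning how much detail is emitted. A minimal usage sketch with a made-up connector class and invented file paths:

from unstructured_ingest.processes.utils.logging.connector import (
    ConnectorLoggingMixin,
    LoggingConfig,
)


class DemoConnector(ConnectorLoggingMixin):
    def fetch(self, paths: list) -> None:
        self.log_operation_start("Demo fetch", total=len(paths))
        for i, path in enumerate(paths, start=1):
            self.log_file_operation("Fetching", file_path=path)
            self.log_progress(current=i, total=len(paths), item_type="files")
        self.log_operation_complete("Demo fetch", count=len(paths))


connector = DemoConnector()
# Opt in to (sanitized) path logging and a tighter progress interval.
connector.set_logging_config(LoggingConfig(log_file_paths=True, log_progress_interval=5))
connector.fetch([f"/data/reports/file_{i}.pdf" for i in range(10)])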
unstructured_ingest/processes/utils/logging/sanitizer.py
ADDED (117 lines)

from pathlib import Path
from typing import Any, Dict, Optional, Union
from urllib.parse import urlparse


class DataSanitizer:
    """Utility class for sanitizing sensitive data in logs."""

    @staticmethod
    def sanitize_path(path: Union[str, Path]) -> str:
        """Sanitize file paths for logging, showing only filename and partial path."""
        if not path:
            return "<empty>"

        path_str = str(path)
        path_obj = Path(path_str)

        if len(path_obj.parts) > 2:
            return f".../{path_obj.parent.name}/{path_obj.name}"
        return path_obj.name

    @staticmethod
    def sanitize_id(identifier: str) -> str:
        """Sanitize IDs for logging, showing only first/last few characters."""
        if not identifier:
            return "<id>"
        if len(identifier) < 10:
            half_len = len(identifier) // 2
            return f"{identifier[:half_len]}..."
        return f"{identifier[:4]}...{identifier[-4:]}"

    @staticmethod
    def sanitize_url(url: str) -> str:
        """Sanitize URLs for logging, removing sensitive query parameters."""
        if not url:
            return "<url>"
        try:
            parsed = urlparse(url)
            return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
        except (ValueError, TypeError):
            return "<url>"

    @staticmethod
    def sanitize_token(token: str) -> str:
        """Sanitize tokens and secrets for logging."""
        if not token:
            return "<token>"
        if len(token) < 10:
            half_len = len(token) // 2
            return f"{token[:half_len]}..."
        return f"{token[:4]}...{token[-4:]}"

    @staticmethod
    def sanitize_location(location: Union[str, Path]) -> str:
        """Sanitize document locations (file paths, URLs, database references) for logging."""
        if not location:
            return "<empty>"

        location_str = str(location)

        # Handle URLs
        if location_str.startswith(("http://", "https://", "ftp://", "ftps://")):
            return DataSanitizer.sanitize_url(location_str)

        # Handle database-style references (table:id, collection/document, etc.)
        if ":" in location_str and not location_str.startswith("/"):
            parts = location_str.split(":", 1)
            if len(parts) == 2:
                table_name, record_id = parts
                return f"{table_name}:{DataSanitizer.sanitize_id(record_id)}"

        return DataSanitizer.sanitize_path(location_str)

    @staticmethod
    def sanitize_document_id(document_id: str) -> str:
        """Sanitize document IDs for logging (alias for sanitize_id for clarity)."""
        return DataSanitizer.sanitize_id(document_id)

    @staticmethod
    def sanitize_dict(data: Dict[str, Any], sensitive_keys: Optional[set] = None) -> Dict[str, Any]:
        """Sanitize dictionary data for logging."""
        if sensitive_keys is None:
            sensitive_keys = {
                "password",
                "token",
                "secret",
                "key",
                "api_key",
                "access_token",
                "refresh_token",
                "client_secret",
                "private_key",
                "credentials",
            }

        sanitized = {}
        for k, v in data.items():
            key_lower = k.lower()
            if any(sensitive_key in key_lower for sensitive_key in sensitive_keys):
                sanitized[k] = DataSanitizer.sanitize_token(str(v))
            elif isinstance(v, dict):
                sanitized[k] = DataSanitizer.sanitize_dict(v, sensitive_keys)
            elif isinstance(v, (str, Path)) and (
                "path" in key_lower
                or "file" in key_lower
                or "location" in key_lower
                or "document_location" in key_lower
            ):
                sanitized[k] = DataSanitizer.sanitize_location(v)
            elif isinstance(v, str) and (
                ("id" in key_lower and len(str(v)) > 8)
                or ("document_id" in key_lower and len(str(v)) > 8)
            ):
                sanitized[k] = DataSanitizer.sanitize_document_id(v)
            else:
                sanitized[k] = v
        return sanitized
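What the sanitizer actually emits for typical values, given the implementation above; the paths, URL, and key below are invented:

from unstructured_ingest.processes.utils.logging.sanitizer import DataSanitizer

print(DataSanitizer.sanitize_path("/home/user/projects/reports/q3.pdf"))
# .../reports/q3.pdf
print(DataSanitizer.sanitize_url("https://bucket.example.com/key?X-Amz-Signature=abc"))
# https://bucket.example.com/key
print(DataSanitizer.sanitize_dict({"api_key": "sk-1234567890abcd", "file_path": "/tmp/data/in.csv"}))
# {'api_key': 'sk-1...abcd', 'file_path': '.../data/in.csv'}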
unstructured_ingest/utils/html.py
CHANGED

@@ -129,6 +129,7 @@ class HtmlMixin(BaseModel):
         )
         result_file_data = file_data.model_copy(deep=True)
         result_file_data.metadata.url = url
+        result_file_data.display_name = filename
         if result_file_data.metadata.record_locator is None:
             result_file_data.metadata.record_locator = {}
         result_file_data.metadata.record_locator["parent_url"] = url
unstructured_ingest/utils/tls.py
ADDED (15 lines)

import os
import ssl

import certifi


def ssl_context_with_optional_ca_override():
    """
    # https://www.python-httpx.org/advanced/ssl/#working-with-ssl_cert_file-and-ssl_cert_dir
    # We choose REQUESTS_CA_BUNDLE because that works with many other Python packages.
    """
    return ssl.create_default_context(
        cafile=os.environ.get("REQUESTS_CA_BUNDLE", certifi.where()),
        capath=os.environ.get("REQUESTS_CA_BUNDLE"),
    )
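The helper rebuilds the context on every call, so REQUESTS_CA_BUNDLE can be changed between calls; with the variable unset it simply loads certifi's bundle. A small check of the default behaviour:

from unstructured_ingest.utils.tls import ssl_context_with_optional_ca_override

# With REQUESTS_CA_BUNDLE unset, the context is populated from certifi's CA bundle.
ctx = ssl_context_with_optional_ca_override()
assert ctx.cert_store_stats()["x509_ca"] > 0

# Pointing REQUESTS_CA_BUNDLE at a readable PEM file (for example a corporate root CA)
# before the call makes the returned context trust that bundle instead.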
{unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/METADATA
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: unstructured_ingest
-Version: 1.0.55
+Version: 1.1.0
 Summary: Local ETL data pipeline to get data RAG ready
 Author-email: Unstructured Technologies <devops@unstructuredai.io>
 License-Expression: Apache-2.0
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: <3.13,>=3.9
+Requires-Dist: certifi>=2025.7.14
 Requires-Dist: click
 Requires-Dist: opentelemetry-sdk
 Requires-Dist: pydantic>=2.7
{unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/RECORD
RENAMED

@@ -1,10 +1,10 @@
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=OTJtt59bB59UuRwC7CjPgJNmkdDC7RUC5Ukrfd-P-CE,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
 unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
 unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
-unstructured_ingest/otel.py,sha256=
+unstructured_ingest/otel.py,sha256=wxnkdZqFtlypmOn4QX6uLxjGa7jSoFabP3PEG5FjH1g,4669
 unstructured_ingest/unstructured_api.py,sha256=4e2ZNWIihk0eje4R3ZQ0NOYNbmMZDv_O-rnJo94kaGE,5127
 unstructured_ingest/cli/README.md,sha256=lfsXY2jOO__OuDYcIs8N0yLhZWzrSQ_dyXbSFtEMlQ8,1504
 unstructured_ingest/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -22,18 +22,18 @@ unstructured_ingest/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
 unstructured_ingest/data_types/entities.py,sha256=ECc6EkZ5_ZUvK7uaALYOynfFmofIrHYIJZfb67hUIxA,371
 unstructured_ingest/data_types/file_data.py,sha256=J0RQa7YXhhxiLVzhPbF5Hl2nzSpxLFK9vrP6RTBWlSg,3833
 unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-unstructured_ingest/embed/azure_openai.py,sha256=
+unstructured_ingest/embed/azure_openai.py,sha256=Q_buBkAcx9FBuTsAqKbRU8vd9vDh8JoDOEth4fFxHbg,2160
 unstructured_ingest/embed/bedrock.py,sha256=dzfCsatB0i8hUp1YnXmoImoxgvUdZ4srKI6eSvn-lYM,9132
 unstructured_ingest/embed/huggingface.py,sha256=6Gx9L3xa3cv9fX4AMuLsePJQF4T_jwkKjovfqF5X1NM,2435
 unstructured_ingest/embed/interfaces.py,sha256=Y3PLhgWnMDmtpugE37hlAiBIbC8izrFFXXkrPVby-HY,5137
 unstructured_ingest/embed/mixedbreadai.py,sha256=uKTqzoi4M_WeYZu-qc_TSxwJONOESzxVbBLUbD1Wbns,3922
 unstructured_ingest/embed/octoai.py,sha256=yZuD7R4mEKS4Jjyae_IrNWogMPOFFS8gW5oUllj3ROU,4540
-unstructured_ingest/embed/openai.py,sha256=
+unstructured_ingest/embed/openai.py,sha256=09I5BIrb-iGsv92LOV46-F7oZ7j1JnJIOQFARNKVq3k,5029
 unstructured_ingest/embed/togetherai.py,sha256=ykaveEUBxBGBzRlmWc9utCFQuUWHdbW4F9KAb-uBAJM,3630
 unstructured_ingest/embed/vertexai.py,sha256=DphvPhiYdXTMrQxJCd-64vMs4iVdLY_BphHqz3n5HfM,3758
 unstructured_ingest/embed/voyageai.py,sha256=EOrYzaoXOZ6C4fNkMlCgb8KA8rdfgVXN3USMFpnn0Bs,4698
 unstructured_ingest/interfaces/__init__.py,sha256=QIkWqjsq9INTa89gPuXlMlQL4s3y5TqLmPkuVuTyXcs,795
-unstructured_ingest/interfaces/connector.py,sha256=
+unstructured_ingest/interfaces/connector.py,sha256=wYWIEAL99KdQDDzzDYSf_yE8p1wjThSPMgEV5qyfiPc,1885
 unstructured_ingest/interfaces/downloader.py,sha256=xX0ZzsFRSzZb7SAeoeQph8sIbVq13DRw-3MYkdADrY0,2918
 unstructured_ingest/interfaces/indexer.py,sha256=c2FwWJEQHfFD6vO-tGfYLpLiIs-TYViLAt8YmHfDbaM,824
 unstructured_ingest/interfaces/process.py,sha256=S3A_9gkwwGC-iQxvnpj3Er6IJAjAT5npzpSgxuFAzUM,449
@@ -79,12 +79,12 @@ unstructured_ingest/processes/connectors/local.py,sha256=CesMduUiSPqdJpqIyW28icG
 unstructured_ingest/processes/connectors/milvus.py,sha256=L-PM5osheNyNsLGYZmiF3rRmeulp7Ejk92JCoaQ_F9Y,12075
 unstructured_ingest/processes/connectors/mongodb.py,sha256=idjolwS5TXShcIz2jR_socSgh8HOzJwyOnzE1qLUPBw,15362
 unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
-unstructured_ingest/processes/connectors/onedrive.py,sha256=
+unstructured_ingest/processes/connectors/onedrive.py,sha256=JPa30X2abVx9SHye_cLOOj4csj_ut8nMjwRnMcgHFhI,20163
 unstructured_ingest/processes/connectors/outlook.py,sha256=6HHubZI_zttEfYp0XNd4Y1vhjsS8uSg7aZ2LBrTjfHk,9376
 unstructured_ingest/processes/connectors/pinecone.py,sha256=jCabAqKQyBFzaGjphxLMr57y7P0Z15Jd9Jj-JM40YnU,15090
 unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
 unstructured_ingest/processes/connectors/salesforce.py,sha256=N_UoebrhzXZNWw-X7lg8_qAziXx5L_d8XHnHWKNNYR8,11767
-unstructured_ingest/processes/connectors/sharepoint.py,sha256=
+unstructured_ingest/processes/connectors/sharepoint.py,sha256=ooPJoAEHj-epEM39iiYbNWdDUdEwt466fLjIcYSNTM8,10670
 unstructured_ingest/processes/connectors/slack.py,sha256=oboIfX7ayBMK0te5Nv50iyL3FQJFXJbRxZSQaCMp3kM,9318
 unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
 unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
@@ -109,9 +109,9 @@ unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W
 unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
 unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
 unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
-unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=
+unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=p0u6JL6ouEPe4R_i_rAhzlvSDyMO3-NDHiw_CtPaCTc,17875
 unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
-unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=
+unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=P5nd3hamhLFO3l5nV3lMuIxHtb_rZYFP4F6q_py3xpc,7492
 unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
 unstructured_ingest/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
 unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py,sha256=kf0UpgdAY2KK1R1FbAB6GEBBAIOeYQ8cZIr3bp660qM,374
@@ -218,22 +218,25 @@ unstructured_ingest/processes/connectors/weaviate/weaviate.py,sha256=yB67gxvo3X0
 unstructured_ingest/processes/connectors/zendesk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/connectors/zendesk/client.py,sha256=GvPIpx4aYdD58-edHgvCFjFao94uR0O5Yf4dT9NCmSk,11952
 unstructured_ingest/processes/connectors/zendesk/zendesk.py,sha256=doS6d7ZhXBgJN8aPVf7vnQr8BciQbzX8-4yl4_hDZ7w,9253
-unstructured_ingest/processes/utils/__init__.py,sha256=
+unstructured_ingest/processes/utils/__init__.py,sha256=v3IQ-Ft0f7PoHhGcYiiD6Yrr6oi-RiGeD6nTKowbEDk,199
 unstructured_ingest/processes/utils/blob_storage.py,sha256=apMUmm9loxdbTRkkLH4VhG9kUVyiw9PFUJheSDxSxPk,1023
-unstructured_ingest/utils/
+unstructured_ingest/processes/utils/logging/connector.py,sha256=xKsXSavbu2U8ZP0KP7jk5192ZDr5HzaBCBCf0GKe1HI,14109
+unstructured_ingest/processes/utils/logging/sanitizer.py,sha256=ZG4Cdcc2yrVmmgdUOJCaUKgp5mZhBpEOMjAbj5Cth_s,4251
+unstructured_ingest/utils/__init__.py,sha256=mU8mlrdah00MPuZM6JqXTkrpXK-sDYiv5y5Mwl8eesM,158
 unstructured_ingest/utils/chunking.py,sha256=9b3sXMA6L8RW5xAkKQbwdtVudGLAcj_sgT6Grh5tyYM,1870
 unstructured_ingest/utils/compression.py,sha256=PPC-ys3qEAtELf6-irhp8v8M634pFFCJEvA6o7PXaLI,2712
 unstructured_ingest/utils/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
 unstructured_ingest/utils/data_prep.py,sha256=yqrv7x_nlj0y3uaN0m0Bnsekb7VIQnwABWPa24KU5QI,7426
 unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
 unstructured_ingest/utils/filesystem.py,sha256=nWxpQd8ogTgmXb7ZouupX6sE5v_qFXNzPl4DtZSStwE,1036
-unstructured_ingest/utils/html.py,sha256=
+unstructured_ingest/utils/html.py,sha256=lm5lVYhVl7ztntquxzMLVQ8EmK7wkvYgNvlIuHnenoM,6865
 unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
 unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
 unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
 unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
-unstructured_ingest
-unstructured_ingest-1.0.
-unstructured_ingest-1.0.
-unstructured_ingest-1.0.
-unstructured_ingest-1.0.
+unstructured_ingest/utils/tls.py,sha256=Ra8Mii1F4VqErRreg76PBI0eAqPBC009l0sSHa8FdnA,448
+unstructured_ingest-1.1.0.dist-info/METADATA,sha256=tJonV6SbQB5XL3BeyL8coDFhzzChMKGuSPQWQ3aoOdE,8875
+unstructured_ingest-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+unstructured_ingest-1.1.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-1.1.0.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-1.1.0.dist-info/RECORD,,

{unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/WHEEL
RENAMED
File without changes

{unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/entry_points.txt
RENAMED
File without changes

{unstructured_ingest-1.0.55.dist-info → unstructured_ingest-1.1.0.dist-info}/licenses/LICENSE.md
RENAMED
File without changes