unstructured-ingest 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/dep_check.py +12 -0
- unstructured_ingest/v2/cli/utils/model_conversion.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +12 -1
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +125 -32
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +41 -22
- {unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.25.dist-info}/METADATA +18 -18
- {unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.25.dist-info}/RECORD +13 -12
- {unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.25.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.25.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.25.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.25.dist-info}/top_level.txt +0 -0
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.0.23" # pragma: no cover
+__version__ = "0.0.25" # pragma: no cover
unstructured_ingest/utils/dep_check.py

@@ -20,6 +20,18 @@ def requires_dependencies(
     dependencies: str | list[str],
     extras: Optional[str] = None,
 ) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
+    """Decorator ensuring required modules are installed.
+
+    Use on functions with local imports to ensure required modules are available and log
+    an installation instruction if they're not.
+
+    Args:
+        dependencies: Name(s) of module(s) required by the decorated function.
+        extras: unstructured-ingest extra which installs required `dependencies`. Defaults to None.
+
+    Raises:
+        ImportError: When at least one of the `dependencies` is not available.
+    """
     if isinstance(dependencies, str):
         dependencies = [dependencies]
 
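The docstring added above documents behavior the decorator already had. A minimal usage sketch (the decorated function and its `host` argument are hypothetical; the decorator arguments mirror the databricks_volumes connector later in this diff):

```python
from unstructured_ingest.utils.dep_check import requires_dependencies


# Hypothetical example: the import lives inside the function body, so the
# decorator can raise an ImportError naming the missing module and the extra
# that installs it (`unstructured-ingest[databricks-volumes]`) before the
# local import would fail on its own.
@requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
def make_client(host: str):
    from databricks.sdk import WorkspaceClient

    return WorkspaceClient(host=host)
```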
unstructured_ingest/v2/cli/utils/model_conversion.py

@@ -155,14 +155,14 @@ def _get_type_from_field(field: FieldInfo) -> click.ParamType:
 
 def get_option_from_field(option_name: str, field_info: FieldInfo) -> Option:
     param_decls = [option_name]
-    …
+    help_text = field_info.description or ""
     if examples := field_info.examples:
-        …
+        help_text += f" [Examples: {', '.join(examples)}]"
     option_kwargs = {
         "type": _get_type_from_field(field_info),
         "default": get_default_value_from_field(field_info),
         "required": field_info.is_required(),
-        "help": …
+        "help": str(help_text),
         "is_flag": is_boolean_flag(field_info),
         "show_default": field_info.default is not PydanticUndefined,
     }
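The patched function builds a click option's help string from a pydantic field's description and examples. A sketch of the resulting help text, assuming a hypothetical config model:

```python
from pydantic import BaseModel, Field


class ExampleConfig(BaseModel):  # hypothetical model for illustration
    host: str = Field(
        description="Databricks workspace URL.",
        examples=["https://adb-123.azuredatabricks.net"],
    )


# With the change above, the generated --host option gets the help text:
#   "Databricks workspace URL. [Examples: https://adb-123.azuredatabricks.net]"
```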
unstructured_ingest/v2/processes/connectors/__init__.py

@@ -17,7 +17,10 @@ from .chroma import chroma_destination_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
-from .databricks_volumes import databricks_volumes_destination_entry
+from .databricks_volumes import (
+    databricks_volumes_destination_entry,
+    databricks_volumes_source_entry,
+)
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -34,6 +37,8 @@ from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
 from .onedrive import onedrive_source_entry
 from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
 from .opensearch import opensearch_destination_entry, opensearch_source_entry
+from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
+from .outlook import outlook_source_entry
 from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
 from .pinecone import pinecone_destination_entry
 from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
@@ -78,6 +83,10 @@ add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
 add_destination_entry(
     destination_type=DATABRICKS_VOLUMES_CONNECTOR_TYPE, entry=databricks_volumes_destination_entry
 )
+add_source_entry(
+    source_type=DATABRICKS_VOLUMES_CONNECTOR_TYPE, entry=databricks_volumes_source_entry
+)
+
 
 add_destination_entry(destination_type=SQL_CONNECTOR_TYPE, entry=sql_destination_entry)
 
@@ -95,3 +104,5 @@ add_destination_entry(
 
 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
 add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
+
+add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -1,21 +1,35 @@
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
     Uploader,
     UploaderConfig,
 )
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
 
 if TYPE_CHECKING:
     from databricks.sdk import WorkspaceClient
@@ -32,16 +46,6 @@ class DatabricksVolumesAccessConfig(AccessConfig):
         "https://accounts.azuredatabricks.net/ (Azure), "
         "or https://accounts.gcp.databricks.com/ (GCP).",
     )
-    username: Optional[str] = Field(
-        default=None,
-        description="The Databricks username part of basic authentication. "
-        "Only possible when Host is *.cloud.databricks.com (AWS).",
-    )
-    password: Optional[str] = Field(
-        default=None,
-        description="The Databricks password part of basic authentication. "
-        "Only possible when Host is *.cloud.databricks.com (AWS).",
-    )
     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
     client_secret: Optional[str] = Field(
         default=None, description="Client Secret of the OAuth app."
@@ -78,7 +82,6 @@ class DatabricksVolumesAccessConfig(AccessConfig):
         "argument. This argument also holds the currently "
         "selected auth.",
     )
-    cluster_id: Optional[str] = None
     google_credentials: Optional[str] = None
     google_service_account: Optional[str] = None
 
@@ -93,17 +96,11 @@ class DatabricksVolumesConnectionConfig(ConnectionConfig):
         "Databricks workspace endpoint or the "
         "Databricks accounts endpoint.",
     )
-
-
-class DatabricksVolumesUploaderConfig(UploaderConfig):
     volume: str = Field(description="Name of volume in the Unity Catalog")
     catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
     volume_path: Optional[str] = Field(
         default=None, description="Optional path within the volume to write to"
     )
-    overwrite: bool = Field(
-        default=False, description="If true, an existing file will be overwritten."
-    )
     databricks_schema: str = Field(
         default="default",
         alias="schema",
@@ -117,33 +114,121 @@ class DatabricksVolumesUploaderConfig(UploaderConfig):
             path = f"{path}/{self.volume_path}"
         return path
 
-
-@dataclass
-class DatabricksVolumesUploader(Uploader):
-    connector_type: str = CONNECTOR_TYPE
-    upload_config: DatabricksVolumesUploaderConfig
-    connection_config: DatabricksVolumesConnectionConfig
-
     @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
     def get_client(self) -> "WorkspaceClient":
         from databricks.sdk import WorkspaceClient
 
         return WorkspaceClient(
-            host=self.connection_config.host,
-            **self.connection_config.access_config.get_secret_value().model_dump(),
+            host=self.host,
+            **self.access_config.get_secret_value().model_dump(),
         )
 
+
+@dataclass
+class DatabricksVolumesIndexerConfig(IndexerConfig):
+    recursive: bool = False
+
+
+@dataclass
+class DatabricksVolumesIndexer(Indexer):
+    index_config: DatabricksVolumesIndexerConfig
+    connection_config: DatabricksVolumesConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
     def precheck(self) -> None:
         try:
-            assert self.get_client().current_user.me().active
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        for file_info in self.connection_config.get_client().dbfs.list(
+            path=self.connection_config.path, recursive=self.index_config.recursive
+        ):
+            if file_info.is_dir:
+                continue
+            rel_path = file_info.path.replace(self.connection_config.path, "")
+            if rel_path.startswith("/"):
+                rel_path = rel_path[1:]
+            filename = Path(file_info.path).name
+            yield FileData(
+                identifier=file_info.path,
+                connector_type=CONNECTOR_TYPE,
+                source_identifiers=SourceIdentifiers(
+                    filename=filename,
+                    rel_path=rel_path,
+                    fullpath=file_info.path,
+                ),
+                additional_metadata={
+                    "catalog": self.connection_config.catalog,
+                },
+                metadata=FileDataSourceMetadata(
+                    url=file_info.path, date_modified=str(file_info.modification_time)
+                ),
+            )
+
+
+@dataclass
+class DatabricksVolumesDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksVolumesDownloader(Downloader):
+    download_config: DatabricksVolumesDownloaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def get_download_path(self, file_data: FileData) -> Path:
+        return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        logger.info(f"Writing {file_data.identifier} to {download_path}")
+        try:
+            with self.connection_config.get_client().dbfs.download(path=file_data.identifier) as c:
+                read_content = c._read_handle.read()
+                with open(download_path, "wb") as f:
+                    f.write(read_content)
+        except Exception as e:
+            logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class DatabricksVolumesUploaderConfig(UploaderConfig):
+    overwrite: bool = Field(
+        default=False, description="If true, an existing file will be overwritten."
+    )
+
+
+@dataclass
+class DatabricksVolumesUploader(Uploader):
+    upload_config: DatabricksVolumesUploaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            assert self.connection_config.get_client().current_user.me().active
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        output_path = os.path.join(self.upload_config.path, path.name)
+        output_path = os.path.join(self.connection_config.path, path.name)
         with open(path, "rb") as elements_file:
-            self.get_client().files.upload(
+            self.connection_config.get_client().files.upload(
                 file_path=output_path,
                 contents=elements_file,
                 overwrite=self.upload_config.overwrite,
@@ -155,3 +240,11 @@ databricks_volumes_destination_entry = DestinationRegistryEntry(
     uploader=DatabricksVolumesUploader,
     uploader_config=DatabricksVolumesUploaderConfig,
 )
+
+databricks_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksVolumesConnectionConfig,
+    indexer=DatabricksVolumesIndexer,
+    indexer_config=DatabricksVolumesIndexerConfig,
+    downloader=DatabricksVolumesDownloader,
+    downloader_config=DatabricksVolumesDownloaderConfig,
+)
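The net effect: volume, catalog, and path handling now live on the connection config, which the new indexer, downloader, and the slimmed-down uploader all share. A sketch of standalone use of the new source (host and auth values are placeholders; which auth fields apply depends on your workspace setup):

```python
from unstructured_ingest.v2.processes.connectors.databricks_volumes import (
    DatabricksVolumesConnectionConfig,
    DatabricksVolumesIndexer,
    DatabricksVolumesIndexerConfig,
)

# Placeholder values; client_id/client_secret are the OAuth fields kept by the
# access config after the username/password removal above.
connection_config = DatabricksVolumesConnectionConfig(
    host="https://adb-0000000000000000.0.azuredatabricks.net",
    catalog="main",
    volume="ingest",
    volume_path="raw",
    access_config={"client_id": "...", "client_secret": "..."},
)

indexer = DatabricksVolumesIndexer(
    connection_config=connection_config,
    index_config=DatabricksVolumesIndexerConfig(recursive=True),
)

# Yields one FileData per file found under the volume path (directories are skipped).
for file_data in indexer.run():
    print(file_data.identifier)
```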
unstructured_ingest/v2/processes/connectors/outlook.py (new file)

@@ -0,0 +1,239 @@
+import hashlib
+import time
+from dataclasses import dataclass, field
+from datetime import timezone
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Coroutine, Generator
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    FileData,
+    Indexer,
+    IndexerConfig,
+    download_responses,
+)
+from unstructured_ingest.v2.interfaces.file_data import FileDataSourceMetadata, SourceIdentifiers
+from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+
+MAX_EMAILS_PER_FOLDER = 1_000_000  # Maximum number of emails per folder
+
+if TYPE_CHECKING:
+    from office365.graph_client import GraphClient
+    from office365.outlook.mail.folders.folder import MailFolder
+    from office365.outlook.mail.messages.message import Message
+
+
+CONNECTOR_TYPE = "outlook"
+
+
+class OutlookAccessConfig(AccessConfig):
+    client_credential: str = Field(description="Azure AD App client secret", alias="client_cred")
+
+
+class OutlookConnectionConfig(ConnectionConfig):
+    access_config: Secret[OutlookAccessConfig]
+    client_id: str = Field(description="Azure AD App client ID")
+    tenant: str = Field(
+        default="common", description="ID or domain name associated with your Azure AD instance"
+    )
+    authority_url: str = Field(
+        default="https://login.microsoftonline.com",
+        description="Authentication token provider for Microsoft apps",
+    )
+
+    @requires_dependencies(["msal"], extras="outlook")
+    def _acquire_token(self):
+        """Acquire token via MSAL"""
+        from msal import ConfidentialClientApplication
+
+        # NOTE: It'd be nice to use `msal.authority.AuthorityBuilder` here paired with AZURE_PUBLIC
+        # constant as default in the future but they do not fit well with `authority_url` right now
+        authority_url = f"{self.authority_url.rstrip('/')}/{self.tenant}"
+        app = ConfidentialClientApplication(
+            authority=authority_url,
+            client_id=self.client_id,
+            client_credential=self.access_config.get_secret_value().client_credential,
+        )
+        token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+        return token
+
+    @requires_dependencies(["office365"], extras="outlook")
+    @SourceConnectionError.wrap
+    def get_client(self) -> "GraphClient":
+        from office365.graph_client import GraphClient
+
+        return GraphClient(self._acquire_token)
+
+
+class OutlookIndexerConfig(IndexerConfig):
+    outlook_folders: list[str] = Field(
+        description="Folders to download email messages from. Do not specify subfolders. "
+        "Use quotes if there are spaces in folder names."
+    )
+    recursive: bool = Field(
+        default=False,
+        description="Recursively download files in their respective folders otherwise stop at the"
+        " files in provided folder level.",
+    )
+    user_email: str = Field(description="Outlook email to download messages from.")
+
+
+@dataclass
+class OutlookIndexer(Indexer):
+    index_config: OutlookIndexerConfig
+    connection_config: OutlookConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        messages = self._list_messages(recursive=self.index_config.recursive)
+
+        for message in messages:
+            yield self._message_to_file_data(message)
+
+    def run_async(self, **kwargs: Any) -> Coroutine[Any, Any, Any]:
+        raise NotImplementedError
+
+    @SourceConnectionError.wrap
+    def precheck(self) -> None:
+        client = self.connection_config.get_client()
+        client.users[self.index_config.user_email].get().execute_query()
+
+    def is_async(self) -> bool:
+        return False
+
+    def _list_messages(self, recursive: bool) -> list["Message"]:
+        mail_folders = self._get_selected_root_folders()
+        messages = []
+
+        while mail_folders:
+            mail_folder = mail_folders.pop()
+            messages += list(mail_folder.messages.get().top(MAX_EMAILS_PER_FOLDER).execute_query())
+
+            if recursive:
+                mail_folders += list(mail_folder.child_folders.get().execute_query())
+
+        return messages
+
+    def _get_selected_root_folders(self) -> list["MailFolder"]:
+        client_user = self.connection_config.get_client().users[self.index_config.user_email]
+        root_mail_folders = client_user.mail_folders.get().execute_query()
+
+        selected_names_normalized = [
+            folder_name.lower() for folder_name in self.index_config.outlook_folders
+        ]
+        selected_root_mail_folders = [
+            folder
+            for folder in root_mail_folders
+            if folder.display_name.lower() in selected_names_normalized
+        ]
+
+        if not selected_root_mail_folders:
+            logger.error(
+                f"Root folders selected in configuration: {self.index_config.outlook_folders}"
+                f"not found for user email {self.index_config.user_email}. Aborting."
+            )
+            raise ValueError("Root folders selected in configuration not found.")
+
+        return selected_root_mail_folders
+
+    def _message_to_file_data(self, message: "Message") -> FileData:
+        fullpath = self._generate_fullpath(message)
+
+        return FileData(
+            identifier=message.id,
+            connector_type=CONNECTOR_TYPE,
+            source_identifiers=SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath)),
+            metadata=FileDataSourceMetadata(
+                url=message.resource_url,
+                version=message.change_key,
+                date_modified=str(
+                    message.last_modified_datetime.replace(tzinfo=timezone.utc).timestamp()
+                ),
+                date_created=str(message.created_datetime.replace(tzinfo=timezone.utc).timestamp()),
+                date_processed=str(time.time()),
+                record_locator={
+                    "message_id": message.id,
+                    "user_email": self.index_config.user_email,
+                },
+            ),
+            additional_metadata={
+                "sent_from": str(message.sent_from),
+                "to_recipients": [str(recipient) for recipient in message.to_recipients],
+                "bcc_recipients": [str(recipient) for recipient in message.to_recipients],
+                "subject": message.subject,
+                "conversation_id": message.conversation_id,
+                "is_draft": message.is_draft,
+                "is_read": message.is_read,
+                "has_attachments": message.has_attachments,
+                "importance": message.importance,
+            },
+        )
+
+    def _generate_fullpath(self, message: "Message") -> Path:
+        return Path(hashlib.sha256(message.id.encode("utf-8")).hexdigest()[:16] + ".eml")
+
+
+class OutlookDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class OutlookDownloader(Downloader):
+    connector_type: str = CONNECTOR_TYPE
+    connection_config: OutlookConnectionConfig
+    download_config: OutlookDownloaderConfig = field(default_factory=OutlookDownloaderConfig)
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        # NOTE: Indexer should provide source identifiers required to generate the download path
+        download_path = self.get_download_path(file_data)
+        if download_path is None:
+            logger.error(
+                "Generated download path is None, source_identifiers might be missing"
+                "from FileData."
+            )
+            raise ValueError("Generated invalid download path.")
+
+        self._download_message(file_data, download_path)
+        return self.generate_download_response(file_data, download_path)
+
+    def is_async(self) -> bool:
+        return False
+
+    def _download_message(self, file_data: FileData, download_path: Path) -> None:
+        # NOTE: Indexer should supply the record locator in metadata
+        if (
+            file_data.metadata.record_locator is None
+            or "user_email" not in file_data.metadata.record_locator
+            or "message_id" not in file_data.metadata.record_locator
+        ):
+            logger.error(
+                f"Invalid record locator in metadata: {file_data.metadata.record_locator}."
+                "Keys 'user_email' and 'message_id' must be present."
+            )
+            raise ValueError("Invalid record locator.")
+
+        user_email = file_data.metadata.record_locator["user_email"]
+        message_id = file_data.metadata.record_locator["message_id"]
+
+        message = self.connection_config.get_client().users[user_email].messages[message_id]
+        download_path.parent.mkdir(exist_ok=True, parents=True)
+
+        with open(download_path, "wb") as file:
+            message.download(file).execute_query()
+
+
+outlook_source_entry = SourceRegistryEntry(
+    indexer=OutlookIndexer,
+    indexer_config=OutlookIndexerConfig,
+    downloader=OutlookDownloader,
+    downloader_config=OutlookDownloaderConfig,
+    connection_config=OutlookConnectionConfig,
+)
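A sketch of how the new source's pieces fit together (IDs, tenant, and mailbox are placeholders; `client_cred` is the alias for `client_credential` defined above):

```python
from unstructured_ingest.v2.processes.connectors.outlook import (
    OutlookConnectionConfig,
    OutlookIndexer,
    OutlookIndexerConfig,
)

connection_config = OutlookConnectionConfig(
    client_id="00000000-0000-0000-0000-000000000000",  # placeholder app registration
    tenant="contoso.onmicrosoft.com",  # placeholder tenant
    access_config={"client_cred": "..."},
)

indexer = OutlookIndexer(
    connection_config=connection_config,
    index_config=OutlookIndexerConfig(
        user_email="user@contoso.com",  # placeholder mailbox
        outlook_folders=["Inbox", "Sent Items"],
        recursive=True,
    ),
)

# Each message becomes a FileData whose filename is a sha256-derived .eml name
# and whose record_locator carries user_email/message_id for the downloader.
for file_data in indexer.run():
    print(file_data.source_identifiers.filename)
```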
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -58,20 +58,6 @@ class PineconeConnectionConfig(ConnectionConfig):
         return index
 
 
-class PineconeUploadStagerConfig(UploadStagerConfig):
-    pass
-
-
-class PineconeUploaderConfig(UploaderConfig):
-    batch_size: Optional[int] = Field(
-        default=None,
-        description="Optional number of records per batch. Will otherwise limit by size.",
-    )
-    pool_threads: Optional[int] = Field(
-        default=1, description="Optional limit on number of threads to use for upload"
-    )
-
-
 ALLOWED_FIELDS = (
     "element_id",
     "text",
@@ -86,31 +72,60 @@ ALLOWED_FIELDS = (
     "is_continuation",
     "link_urls",
     "link_texts",
+    "text_as_html",
 )
 
 
+class PineconeUploadStagerConfig(UploadStagerConfig):
+    metadata_fields: list[str] = Field(
+        default=str(ALLOWED_FIELDS),
+        description=(
+            "which metadata from the source element to map to the payload metadata being sent to "
+            "Pinecone."
+        ),
+    )
+
+
+class PineconeUploaderConfig(UploaderConfig):
+    batch_size: Optional[int] = Field(
+        default=None,
+        description="Optional number of records per batch. Will otherwise limit by size.",
+    )
+    pool_threads: Optional[int] = Field(
+        default=1, description="Optional limit on number of threads to use for upload"
+    )
+    namespace: Optional[str] = Field(
+        default=None,
+        description="The namespace to write to. If not specified, the default namespace is used",
+    )
+
+
 @dataclass
 class PineconeUploadStager(UploadStager):
     upload_stager_config: PineconeUploadStagerConfig = field(
         default_factory=lambda: PineconeUploadStagerConfig()
     )
 
-    @staticmethod
-    def conform_dict(element_dict: dict) -> dict:
+    def conform_dict(self, element_dict: dict) -> dict:
         embeddings = element_dict.pop("embeddings", None)
         metadata: dict[str, Any] = element_dict.pop("metadata", {})
         data_source = metadata.pop("data_source", {})
         coordinates = metadata.pop("coordinates", {})
-
-        element_dict…
-        …
-        …
+        pinecone_metadata = {}
+        for possible_meta in [element_dict, metadata, data_source, coordinates]:
+            pinecone_metadata.update(
+                {
+                    k: v
+                    for k, v in possible_meta.items()
+                    if k in self.upload_stager_config.metadata_fields
+                }
+            )
 
         return {
             "id": str(uuid.uuid4()),
             "values": embeddings,
             "metadata": flatten_dict(
-                …
+                pinecone_metadata,
                 separator="-",
                 flatten_lists=True,
                 remove_none=True,
@@ -172,7 +187,11 @@ class PineconeUploader(Uploader):
             pool_threads = max_pool_threads
         index = self.connection_config.get_index(pool_threads=pool_threads)
         with index:
-            …
+            upsert_kwargs = [{"vectors": chunk, "async_req": True} for chunk in chunks]
+            if namespace := self.upload_config.namespace:
+                for kwargs in upsert_kwargs:
+                    kwargs["namespace"] = namespace
+            async_results = [index.upsert(**kwarg) for kwarg in upsert_kwargs]
             # Wait for and retrieve responses (this raises in case of error)
             try:
                 results = [async_result.get() for async_result in async_results]
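Two behavioral changes ride along here: the stager now keeps only the configured `metadata_fields` (drawn from the element itself, its metadata, its data source, and its coordinates), and the uploader threads the optional `namespace` into every `index.upsert(...)` call when set. A sketch of the filtering, assuming an illustrative element dict:

```python
from unstructured_ingest.v2.processes.connectors.pinecone import (
    PineconeUploadStager,
    PineconeUploadStagerConfig,
)

stager = PineconeUploadStager(
    upload_stager_config=PineconeUploadStagerConfig(metadata_fields=["text", "filename"])
)

# Illustrative element: embeddings become the vector values, and only the
# configured metadata fields survive into the Pinecone payload.
record = stager.conform_dict(
    {
        "embeddings": [0.1, 0.2],
        "text": "hello",
        "metadata": {"filename": "a.pdf", "data_source": {}, "coordinates": {}},
    }
)
# record -> {"id": "<uuid4>", "values": [0.1, 0.2],
#            "metadata": {"text": "hello", "filename": "a.pdf"}}
```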
{unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.25.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unstructured-ingest
-Version: 0.0.23
+Version: 0.0.25
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -23,26 +23,26 @@ Requires-Python: >=3.9.0,<3.13
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
 Requires-Dist: pydantic>=2.7
-Requires-Dist: python-dateutil
-Requires-Dist: click
+Requires-Dist: dataclasses-json
 Requires-Dist: opentelemetry-sdk
+Requires-Dist: python-dateutil
 Requires-Dist: pandas
-Requires-Dist: dataclasses-json
 Requires-Dist: tqdm
+Requires-Dist: click
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: adlfs; extra == "azure"
+Requires-Dist: fsspec; extra == "azure"
 Provides-Extra: azure-cognitive-search
 Requires-Dist: azure-search-documents; extra == "azure-cognitive-search"
 Provides-Extra: bedrock
 Requires-Dist: boto3; extra == "bedrock"
 Provides-Extra: biomed
-Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
+Requires-Dist: bs4; extra == "biomed"
 Provides-Extra: box
 Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: fsspec; extra == "delta-table"
 Requires-Dist: deltalake; extra == "delta-table"
+Requires-Dist: fsspec; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord-py; extra == "discord"
 Provides-Extra: doc
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: dropboxdrivefs; extra == "dropbox"
+Requires-Dist: fsspec; extra == "dropbox"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Provides-Extra: embed-huggingface
@@ -87,9 +87,9 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
-Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: bs4; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: requests; extra == "github"
 Requires-Dist: pygithub>1.58.0; extra == "github"
@@ -98,8 +98,8 @@ Requires-Dist: python-gitlab; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: urllib3; extra == "hubspot"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
 Provides-Extra: kafka
@@ -115,16 +115,16 @@ Requires-Dist: pymongo; extra == "mongodb"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: notion
-Requires-Dist: …
+Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
-Requires-Dist: …
+Requires-Dist: backoff; extra == "notion"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
 Provides-Extra: onedrive
-Requires-Dist: bs4; extra == "onedrive"
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
+Requires-Dist: bs4; extra == "onedrive"
 Provides-Extra: openai
 Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -133,8 +133,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
-Requires-Dist: msal; extra == "outlook"
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: msal; extra == "outlook"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone
@@ -156,16 +156,16 @@ Requires-Dist: unstructured[rst]; extra == "rst"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: fsspec; extra == "s3"
 Requires-Dist: s3fs; extra == "s3"
+Requires-Dist: fsspec; extra == "s3"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
 Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
 Provides-Extra: sharepoint
-Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: slack
{unstructured_ingest-0.0.23.dist-info → unstructured_ingest-0.0.25.dist-info}/RECORD

@@ -1,5 +1,5 @@
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=…
+unstructured_ingest/__version__.py,sha256=WG3ykkrrofptunFgyMVyh_5Uyla9d5aYDfBtMqyZ_lE,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=0r0gQoHJQ4DVSQEVbUPBA3N6WyvGMkR1u6U2SwUvoAQ,31361
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -257,7 +257,7 @@ unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
 unstructured_ingest/utils/chunking.py,sha256=efWEfMcCukG5zASZrXhkNgAX8AzHa6t3rClMzm2TwFE,1521
 unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSzRn5mdFf6mHo,4434
 unstructured_ingest/utils/data_prep.py,sha256=9UKewDHB8-cMlQ8POvokhjVsy-ksiSqAAW2ibqPYAfk,4400
-unstructured_ingest/utils/dep_check.py,sha256=…
+unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
 unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
 unstructured_ingest/utils/string_and_date_utils.py,sha256=LwcbLmWpwt1zEabLlyUd5kIf9oOWcZxsRzxDglLCMeU,1375
 unstructured_ingest/utils/table.py,sha256=aWjcowDVSClNpEAdR6PY3H7khKu4T6T3QqQE6GjmQ_M,3469
@@ -277,7 +277,7 @@ unstructured_ingest/v2/cli/base/importer.py,sha256=nRt0QQ3qpi264-n_mR0l55C2ddM8n
 unstructured_ingest/v2/cli/base/src.py,sha256=cpQ43qQju4e5s_YSaPxUtA70BaisRkTBdjtlPhqn5Mg,2872
 unstructured_ingest/v2/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/cli/utils/click.py,sha256=Wn2s3PuvBCKB0lsK-W7X_Y0eYyWnS6Y9wWo1OhVBOzY,6344
-unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=…
+unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=uJQKpbTC5ysOdVaRq2SWEjG8btBimVZYzX9NVL7xnzs,7500
 unstructured_ingest/v2/interfaces/__init__.py,sha256=Rfa8crx6De7WNOK-EjsWWwFVpsUfCc6gY8B8tQ3ae9I,899
 unstructured_ingest/v2/interfaces/connector.py,sha256=KG0pHdAcpuO5h72xrAkJzADmjxbav31TZ2Wo3PBvwT0,765
 unstructured_ingest/v2/interfaces/downloader.py,sha256=PKT1kr79Mz1urW_8xCyq9sBuK93gDvyTXg5e4ma4htU,2871
@@ -308,13 +308,13 @@ unstructured_ingest/v2/processes/embedder.py,sha256=nFYiOmIJwWLodBt_cC-E5h7zmYB9
 unstructured_ingest/v2/processes/filter.py,sha256=kjUmMw2SDq2bme0JCAOxs6cJriIG6Ty09KOznS-xz08,2145
 unstructured_ingest/v2/processes/partitioner.py,sha256=bpqmZDsKKi6qtxNWdIWBfQmr1ccQUhU0axecpGAUf_4,7739
 unstructured_ingest/v2/processes/uncompress.py,sha256=Z_XfsITGdyaRwhtNUc7bMj5Y2jLuBge8KoK4nxhqKag,2425
-unstructured_ingest/v2/processes/connectors/__init__.py,sha256=…
+unstructured_ingest/v2/processes/connectors/__init__.py,sha256=XZWdbUKXioO4vfCYjgNNV4ZDNPQ_VrAUcHMjHGIys3E,5334
 unstructured_ingest/v2/processes/connectors/airtable.py,sha256=Yi7PEv_FejZ9_y3BPY3gu5YGVfeLh-9YX-qLyQHjJsY,8921
 unstructured_ingest/v2/processes/connectors/astradb.py,sha256=ZctZRfXcOAMBGPkKgHvhTmV_-2F0YN5vqwfY9UCHIlU,5791
 unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=S55v7TXu30rEdgythMBB_2VcuomyMPmcPtLYykbhw_E,8466
 unstructured_ingest/v2/processes/connectors/chroma.py,sha256=skrxRPHZ8y3JxNa0dt5SVitHiDQ5WVxLvY_kh2-QUrQ,8029
 unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=SONLywyEfoAlLc-HPabXeGzoiwKnekMHIbRMXd4CGXs,12146
-unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=…
+unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=BQHHpCDwE51inD3pZF4tL4zLr7lv6iBcwnA1NazrHqY,9423
 unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=ojxMUHkLa6ZG50aTGn2YWhDHZ1n38uFRn5p8_ghAIvM,16762
 unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=7xOQthcqBd9auJxB0nxZlhh1vdjXpMX_CtQZa6YfZz0,13088
 unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=D71gt8fsPOXi2-Rir8mATw6dRM3BdzYGnn62qG1iaBw,5586
@@ -323,7 +323,8 @@ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=ZUlyAQyTt0U1JoapFYH
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=2_R_hrEAaTU4vJTCK9oKblWTgv6BKjyUhFtC7uq3q2w,4859
 unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=ZiUo-dFo1LMOvFwphSLRZiR1PcrN8GWLTHhsh4TU6n0,9207
 unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=dfDSNrWIEk19wuHdlMJpp_SLMOteNPlkDBPlAwu1LVY,6767
-unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=…
+unstructured_ingest/v2/processes/connectors/outlook.py,sha256=NK67Pd8Nk5oUIXTK-sK18K7rZ_Cl0UuCbeF2ExBEZho,9294
+unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=k_GH55S_OQ6-wCLC6gkhRrNpXIFECYZ_2Gjz_XRtY6Y,7561
 unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
 unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=hOaV5gBcHFc6N5Rbu3MgM-5Aol1ht-QkNIN4PqjvfxE,19665
 unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=4rVvWKK2iQr03Ff6cB5zjfE1MpN0JyIGpCxxFCDI6hc,5563
@@ -339,9 +340,9 @@ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=-_pYHbsBG9FyRyN
 unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=je1BDqFWlyMfPa4oAMMNFQLLQtCY9quuqx3xjTwF8OQ,6251
 unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBWX3zM1hiUlgXB4hzX6ObOr-sh-5CJs,6926
 unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
-unstructured_ingest-0.0.23.dist-info/LICENSE.md,sha256=…
-unstructured_ingest-0.0.23.dist-info/METADATA,sha256=…
-unstructured_ingest-0.0.23.dist-info/WHEEL,sha256=…
-unstructured_ingest-0.0.23.dist-info/entry_points.txt,sha256=…
-unstructured_ingest-0.0.23.dist-info/top_level.txt,sha256=…
-unstructured_ingest-0.0.23.dist-info/RECORD,,
+unstructured_ingest-0.0.25.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.0.25.dist-info/METADATA,sha256=NdNIJw4d0nu0NKP_FD5c8RZ2Tt3hWMMm0pJNdKGZdQU,7108
+unstructured_ingest-0.0.25.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+unstructured_ingest-0.0.25.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.0.25.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
+unstructured_ingest-0.0.25.dist-info/RECORD,,
The remaining dist-info files (LICENSE.md, WHEEL, entry_points.txt, top_level.txt) were renamed into the new version's dist-info directory without content changes.