PyPI - unstructured-ingest - Versions diffs - 1.2.32__py3-none-any.whl - Mend

unstructured-ingest 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (243) hide show

unstructured_ingest/processes/connectors/outlook.py ADDED Viewed

@@ -0,0 +1,242 @@
+import hashlib
+import time
+from dataclasses import dataclass, field
+from datetime import timezone
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Coroutine, Generator
+from pydantic import Field, Secret
+from unstructured_ingest.data_types.file_data import (
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+from unstructured_ingest.error import SourceConnectionError, ValueError
+from unstructured_ingest.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    Indexer,
+    IndexerConfig,
+)
+from unstructured_ingest.logger import logger
+from unstructured_ingest.processes.connector_registry import SourceRegistryEntry
+from unstructured_ingest.utils.dep_check import requires_dependencies
+# Upper bound handed to `.top(...)` when the messages of a single folder are listed.
+MAX_EMAILS_PER_FOLDER = 1_000_000  # Maximum number of emails per folder
+# Imported for static type checkers only; these names appear in string annotations below.
+if TYPE_CHECKING:
+    from office365.graph_client import GraphClient
+    from office365.outlook.mail.folders.folder import MailFolder
+    from office365.outlook.mail.messages.message import Message
+# Connector identifier used by the indexer, the downloader and the FileData they produce.
+CONNECTOR_TYPE = "outlook"
+class OutlookAccessConfig(AccessConfig):
+    client_credential: str = Field(description="Azure AD App client secret", alias="client_cred")
+class OutlookConnectionConfig(ConnectionConfig):
+    access_config: Secret[OutlookAccessConfig]
+    client_id: str = Field(description="Azure AD App client ID")
+    tenant: str = Field(
+        default="common", description="ID or domain name associated with your Azure AD instance"
+    )
+    authority_url: str = Field(
+        default="https://login.microsoftonline.com",
+        description="Authentication token provider for Microsoft apps",
+    )
+    @requires_dependencies(["msal"], extras="outlook")
+    def _acquire_token(self):
+        """Acquire token via MSAL"""
+        from msal import ConfidentialClientApplication
+        # NOTE: It'd be nice to use `msal.authority.AuthorityBuilder` here paired with AZURE_PUBLIC
+        # constant as default in the future but they do not fit well with `authority_url` right now
+        authority_url = f"{self.authority_url.rstrip('/')}/{self.tenant}"
+        app = ConfidentialClientApplication(
+            authority=authority_url,
+            client_id=self.client_id,
+            client_credential=self.access_config.get_secret_value().client_credential,
+        )
+        token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
+        return token
+    @requires_dependencies(["office365"], extras="outlook")
+    @SourceConnectionError.wrap
+    def get_client(self) -> "GraphClient":
+        from office365.graph_client import GraphClient
+        return GraphClient(self._acquire_token)
+class OutlookIndexerConfig(IndexerConfig):
+    outlook_folders: list[str] = Field(
+        description="Folders to download email messages from. Do not specify subfolders. "
+        "Use quotes if there are spaces in folder names."
+    )
+    recursive: bool = Field(
+        default=False,
+        description="Recursively download files in their respective folders otherwise stop at the"
+        " files in provided folder level.",
+    )
+    user_email: str = Field(description="Outlook email to download messages from.")
+@dataclass
+class OutlookIndexer(Indexer):
+    index_config: OutlookIndexerConfig
+    connection_config: OutlookConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        messages = self._list_messages(recursive=self.index_config.recursive)
+        for message in messages:
+            yield self._message_to_file_data(message)
+    def run_async(self, **kwargs: Any) -> Coroutine[Any, Any, Any]:
+        raise NotImplementedError
+    @SourceConnectionError.wrap
+    def precheck(self) -> None:
+        client = self.connection_config.get_client()
+        client.users[self.index_config.user_email].get().execute_query()
+    def is_async(self) -> bool:
+        return False
+    def _list_messages(self, recursive: bool) -> list["Message"]:
+        mail_folders = self._get_selected_root_folders()
+        messages = []
+        while mail_folders:
+            mail_folder = mail_folders.pop()
+            messages += list(mail_folder.messages.get().top(MAX_EMAILS_PER_FOLDER).execute_query())
+            if recursive:
+                mail_folders += list(mail_folder.child_folders.get().execute_query())
+        return messages
+    def _get_selected_root_folders(self) -> list["MailFolder"]:
+        client_user = self.connection_config.get_client().users[self.index_config.user_email]
+        root_mail_folders = client_user.mail_folders.get().execute_query()
+        selected_names_normalized = [
+            folder_name.lower() for folder_name in self.index_config.outlook_folders
+        ]
+        selected_root_mail_folders = [
+            folder
+            for folder in root_mail_folders
+            if folder.display_name.lower() in selected_names_normalized
+        ]
+        if not selected_root_mail_folders:
+            logger.error(
+                f"Root folders selected in configuration: {self.index_config.outlook_folders}"
+                f"not found for user email {self.index_config.user_email}. Aborting."
+            )
+            raise ValueError("Root folders selected in configuration not found.")
+        return selected_root_mail_folders
+    def _message_to_file_data(self, message: "Message") -> FileData:
+        fullpath = self._generate_fullpath(message)
+        source_identifiers = SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath))
+        return FileData(
+            identifier=message.id,
+            connector_type=CONNECTOR_TYPE,
+            source_identifiers=source_identifiers,
+            metadata=FileDataSourceMetadata(
+                url=message.resource_url,
+                version=message.change_key,
+                date_modified=str(
+                    message.last_modified_datetime.replace(tzinfo=timezone.utc).timestamp()
+                ),
+                date_created=str(message.created_datetime.replace(tzinfo=timezone.utc).timestamp()),
+                date_processed=str(time.time()),
+                record_locator={
+                    "message_id": message.id,
+                    "user_email": self.index_config.user_email,
+                },
+            ),
+            additional_metadata={
+                "sent_from": str(message.sent_from),
+                "to_recipients": [str(recipient) for recipient in message.to_recipients],
+                "bcc_recipients": [str(recipient) for recipient in message.to_recipients],
+                "subject": message.subject,
+                "conversation_id": message.conversation_id,
+                "is_draft": message.is_draft,
+                "is_read": message.is_read,
+                "has_attachments": message.has_attachments,
+                "importance": message.importance,
+            },
+            display_name=source_identifiers.fullpath,
+        )
+    def _generate_fullpath(self, message: "Message") -> Path:
+        return Path(hashlib.sha256(message.id.encode("utf-8")).hexdigest()[:16] + ".eml")
+class OutlookDownloaderConfig(DownloaderConfig):
+    pass
+@dataclass
+class OutlookDownloader(Downloader):
+    connector_type: str = CONNECTOR_TYPE
+    connection_config: OutlookConnectionConfig
+    download_config: OutlookDownloaderConfig = field(default_factory=OutlookDownloaderConfig)
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        # NOTE: Indexer should provide source identifiers required to generate the download path
+        download_path = self.get_download_path(file_data)
+        if download_path is None:
+            logger.error(
+                "Generated download path is None, source_identifiers might be missingfrom FileData."
+            )
+            raise ValueError("Generated invalid download path.")
+        self._download_message(file_data, download_path)
+        return self.generate_download_response(file_data, download_path)
+    def is_async(self) -> bool:
+        return False
+    def _download_message(self, file_data: FileData, download_path: Path) -> None:
+        # NOTE: Indexer should supply the record locator in metadata
+        if (
+            file_data.metadata.record_locator is None
+            or "user_email" not in file_data.metadata.record_locator
+            or "message_id" not in file_data.metadata.record_locator
+        ):
+            logger.error(
+                f"Invalid record locator in metadata: {file_data.metadata.record_locator}."
+                "Keys 'user_email' and 'message_id' must be present."
+            )
+            raise ValueError("Invalid record locator.")
+        user_email = file_data.metadata.record_locator["user_email"]
+        message_id = file_data.metadata.record_locator["message_id"]
+        message = self.connection_config.get_client().users[user_email].messages[message_id]
+        download_path.parent.mkdir(exist_ok=True, parents=True)
+        with open(download_path, "wb") as file:
+            message.download(file).execute_query()
+outlook_source_entry = SourceRegistryEntry(
+    indexer=OutlookIndexer,
+    indexer_config=OutlookIndexerConfig,
+    downloader=OutlookDownloader,
+    downloader_config=OutlookDownloaderConfig,
+    connection_config=OutlookConnectionConfig,
+)

unstructured_ingest/processes/connectors/pinecone.py ADDED Viewed

@@ -0,0 +1,400 @@
+import json
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Literal, Optional
+from pydantic import Field, Secret
+from unstructured_ingest.data_types.file_data import FileData
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    NotFoundError,
+    UnstructuredIngestError,
+    UserError,
+)
+from unstructured_ingest.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+    VectorDBUploader,
+)
+from unstructured_ingest.logger import logger
+from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.utils import ndjson
+from unstructured_ingest.utils.constants import RECORD_ID_LABEL
+from unstructured_ingest.utils.data_prep import (
+    flatten_dict,
+    generator_batching_wbytes,
+    get_enhanced_element_id,
+    get_json_data,
+    write_data,
+)
+from unstructured_ingest.utils.dep_check import requires_dependencies
+if TYPE_CHECKING:
+    from pinecone import Index as PineconeIndex
+    from pinecone import Pinecone
+# Connector identifier used by the Pinecone uploader.
+CONNECTOR_TYPE = "pinecone"
+# Byte budget for one upsert batch (the uploader subtracts a small margin before batching).
+MAX_PAYLOAD_SIZE = 2 * 1024 * 1024  # 2MB
+# Ceiling on the thread pool size requested when opening an index.
+MAX_POOL_THREADS = 100
+# Metadata larger than this (serialized as JSON) is dropped by the upload stager.
+MAX_METADATA_BYTES = 40960  # 40KB https://docs.pinecone.io/reference/quotas-and-limits#hard-limits
+# NOTE(review): not referenced anywhere in the visible part of this file — confirm it is used.
+MAX_QUERY_RESULTS = 10000
+class PineconeAccessConfig(AccessConfig):
+    pinecone_api_key: Optional[str] = Field(
+        default=None, description="API key for Pinecone.", alias="api_key"
+    )
+class PineconeConnectionConfig(ConnectionConfig):
+    index_name: Optional[str] = Field(description="Name of the index to connect to.", default=None)
+    access_config: Secret[PineconeAccessConfig] = Field(
+        default=PineconeAccessConfig(), validate_default=True
+    )
+    @requires_dependencies(["pinecone"], extras="pinecone")
+    def get_client(self, **index_kwargs) -> "Pinecone":
+        from pinecone import Pinecone
+        from unstructured_ingest import __version__ as unstructured_version
+        return Pinecone(
+            api_key=self.access_config.get_secret_value().pinecone_api_key,
+            source_tag=f"unstructured_ingest=={unstructured_version}",
+        )
+    def get_index(self, **index_kwargs) -> "PineconeIndex":
+        pc = self.get_client()
+        index = pc.Index(name=self.index_name, **index_kwargs)
+        logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
+        return index
+# Default value of `PineconeUploadStagerConfig.metadata_fields`: the element keys that are
+# copied into the metadata of each Pinecone record unless the user configures another list.
+ALLOWED_FIELDS = (
+    "element_id",
+    "text",
+    "parent_id",
+    "category_depth",
+    "emphasized_text_tags",
+    "emphasized_text_contents",
+    "coordinates",
+    "last_modified",
+    "page_number",
+    "filename",
+    "is_continuation",
+    "link_urls",
+    "link_texts",
+    "text_as_html",
+    "entities",
+)
+class PineconeUploadStagerConfig(UploadStagerConfig):
+    metadata_fields: list[str] = Field(
+        default=list(ALLOWED_FIELDS),
+        description=(
+            "which metadata from the source element to map to the payload metadata being sent to "
+            "Pinecone."
+        ),
+    )
+class PineconeUploaderConfig(UploaderConfig):
+    batch_size: Optional[int] = Field(
+        default=None,
+        description="Optional number of records per batch. Will otherwise limit by size.",
+    )
+    pool_threads: Optional[int] = Field(
+        default=1, description="Optional limit on number of threads to use for upload"
+    )
+    namespace: Optional[str] = Field(
+        default=None,
+        description="The namespace to write to. If not specified, the default namespace is used",
+    )
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
+@dataclass
+class PineconeUploadStager(UploadStager):
+    upload_stager_config: PineconeUploadStagerConfig = field(
+        default_factory=lambda: PineconeUploadStagerConfig()
+    )
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        embeddings = element_dict.pop("embeddings", None)
+        metadata: dict[str, Any] = element_dict.pop("metadata", {})
+        data_source = metadata.pop("data_source", {})
+        coordinates = metadata.pop("coordinates", {})
+        pinecone_metadata = {}
+        for possible_meta in [element_dict, metadata, data_source, coordinates]:
+            pinecone_metadata.update(
+                {
+                    k: v
+                    for k, v in possible_meta.items()
+                    if k in self.upload_stager_config.metadata_fields
+                }
+            )
+        metadata = flatten_dict(
+            pinecone_metadata,
+            separator="-",
+            flatten_lists=True,
+            remove_none=True,
+        )
+        metadata_size_bytes = len(json.dumps(metadata).encode())
+        if metadata_size_bytes > MAX_METADATA_BYTES:
+            logger.info(
+                f"Metadata size is {metadata_size_bytes} bytes, which exceeds the limit of"
+                f" {MAX_METADATA_BYTES} bytes per vector. Dropping the metadata."
+            )
+            metadata = {}
+        metadata[RECORD_ID_LABEL] = file_data.identifier
+        # To support more optimal deletes, a prefix is suggested for each record:
+        # https://docs.pinecone.io/guides/data/manage-rag-documents#delete-all-records-for-a-parent-document
+        return {
+            "id": f"{file_data.identifier}#{get_enhanced_element_id(element_dict=element_dict, file_data=file_data)}",  # noqa:E501
+            "values": embeddings,
+            "metadata": metadata,
+        }
+    def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
+        with input_file.open() as in_f:
+            reader = ndjson.reader(in_f)
+            with output_file.open("w") as out_f:
+                writer = ndjson.writer(out_f)
+                for element in reader:
+                    if "embeddings" not in element:
+                        continue
+                    conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
+                    writer.write(row=conformed_element)
+                    writer.f.flush()
+    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
+        elements_contents = get_json_data(path=input_file)
+        conformed_elements = [
+            self.conform_dict(element_dict=element, file_data=file_data)
+            for element in elements_contents
+            if "embeddings" in element
+        ]
+        write_data(path=output_file, data=conformed_elements)
+@dataclass
+class PineconeUploader(VectorDBUploader):
+    upload_config: PineconeUploaderConfig
+    connection_config: PineconeConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+    def init(self, **kwargs: Any) -> None:
+        self.create_destination(**kwargs)
+    def index_exists(self, index_name: Optional[str]) -> bool:
+        from pinecone.exceptions import NotFoundException
+        index_name = index_name or self.connection_config.index_name
+        pc = self.connection_config.get_client()
+        try:
+            pc.describe_index(index_name)
+            return True
+        except NotFoundException:
+            return False
+        except Exception as e:
+            logger.error(f"failed to check if pinecone index exists : {e}")
+            raise DestinationConnectionError(f"failed to check if pinecone index exists : {e}")
+    def precheck(self):
+        try:
+            # just a connection check here. not an actual index_exists check
+            self.index_exists("just-checking-our-connection")
+            if self.connection_config.index_name and not self.index_exists(
+                self.connection_config.index_name
+            ):
+                raise NotFoundError(f"index {self.connection_config.index_name} does not exist")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+    def format_destination_name(self, destination_name: str) -> str:
+        # Pinecone naming requirements:
+        # can only contain lowercase letters, numbers, and hyphens
+        # must be 45 characters or less
+        formatted = re.sub(r"[^a-z0-9]", "-", destination_name.lower())
+        return formatted
+    def create_destination(
+        self,
+        vector_length: int,
+        destination_name: str = "unstructuredautocreated",
+        destination_type: Literal["pod", "serverless"] = "serverless",
+        serverless_cloud: str = "aws",
+        serverless_region: str = "us-east-1",
+        pod_environment: str = "us-east1-gcp",
+        pod_type: str = "p1.x1",
+        pod_count: int = 1,
+        **kwargs: Any,
+    ) -> bool:
+        from pinecone import PodSpec, ServerlessSpec
+        index_name = self.connection_config.index_name or destination_name
+        index_name = self.format_destination_name(index_name)
+        self.connection_config.index_name = index_name
+        if not self.index_exists(index_name):
+            logger.info(f"creating pinecone index {index_name}")
+            pc = self.connection_config.get_client()
+            if destination_type == "serverless":
+                pc.create_index(
+                    name=index_name,
+                    dimension=vector_length,
+                    spec=ServerlessSpec(cloud=serverless_cloud, region=serverless_region),
+                )
+                return True
+            elif destination_type == "pod":
+                pc.create_index(
+                    name=destination_name,
+                    dimension=vector_length,
+                    spec=PodSpec(environment=pod_environment, pod_type=pod_type, pods=pod_count),
+                )
+                return True
+            else:
+                raise ValueError(f"unexpected destination type: {destination_type}")
+        else:
+            logger.debug(f"index {index_name} already exists, skipping creation")
+            return False
+    def pod_delete_by_record_id(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from pinecone pod index"
+        )
+        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
+        delete_kwargs = {
+            "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
+        }
+        if namespace := self.upload_config.namespace:
+            delete_kwargs["namespace"] = namespace
+            try:
+                index.delete(**delete_kwargs)
+            except UserError as e:
+                logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
+        logger.debug(
+            f"deleted any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from pinecone index: {delete_kwargs}"
+        )
+    def serverless_delete_by_record_id(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from pinecone serverless index"
+        )
+        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
+        list_kwargs = {"prefix": f"{file_data.identifier}#"}
+        deleted_ids = 0
+        if namespace := self.upload_config.namespace:
+            list_kwargs["namespace"] = namespace
+        for ids in index.list(**list_kwargs):
+            deleted_ids += len(ids)
+            delete_kwargs = {"ids": ids}
+            if namespace := self.upload_config.namespace:
+                delete_kwargs["namespace"] = namespace
+            try:
+                index.delete(**delete_kwargs)
+            except UserError as e:
+                logger.error(f"failed to delete batch of ids: {delete_kwargs} {e}")
+        logger.info(
+            f"deleted {deleted_ids} records with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from pinecone index"
+        )
+    @requires_dependencies(["pinecone"], extras="pinecone")
+    def upsert_batches_async(self, elements_dict: list[dict]):
+        from pinecone.exceptions import PineconeApiException
+        chunks = list(
+            generator_batching_wbytes(
+                iterable=elements_dict,
+                batch_size_limit_bytes=MAX_PAYLOAD_SIZE - 100,
+                max_batch_size=self.upload_config.batch_size,
+            )
+        )
+        logger.info(f"split doc with {len(elements_dict)} elements into {len(chunks)} batches")
+        max_pool_threads = min(len(chunks), MAX_POOL_THREADS)
+        if self.upload_config.pool_threads:
+            pool_threads = min(self.upload_config.pool_threads, max_pool_threads)
+        else:
+            pool_threads = max_pool_threads
+        index = self.connection_config.get_index(pool_threads=pool_threads)
+        with index:
+            upsert_kwargs = [{"vectors": chunk, "async_req": True} for chunk in chunks]
+            if namespace := self.upload_config.namespace:
+                for kwargs in upsert_kwargs:
+                    kwargs["namespace"] = namespace
+            async_results = [index.upsert(**kwarg) for kwarg in upsert_kwargs]
+            # Wait for and retrieve responses (this raises in case of error)
+            try:
+                results = [async_result.get() for async_result in async_results]
+            except PineconeApiException as api_error:
+                raise UnstructuredIngestError(f"http error: {api_error}") from api_error
+            logger.debug(f"results: {results}")
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        logger.info(
+            f"writing a total of {len(data)} elements via"
+            f" document batches to destination"
+            f" index named {self.connection_config.index_name}"
+        )
+        # Determine if serverless or pod based index
+        pinecone_client = self.connection_config.get_client()
+        if not self.connection_config.index_name:
+            raise ValueError("No index name specified")
+        index_description = pinecone_client.describe_index(name=self.connection_config.index_name)
+        if "serverless" in index_description.get("spec"):
+            self.serverless_delete_by_record_id(file_data=file_data)
+        elif "pod" in index_description.get("spec"):
+            self.pod_delete_by_record_id(file_data=file_data)
+        else:
+            raise ValueError(f"unexpected spec type in index description: {index_description}")
+        self.upsert_batches_async(elements_dict=data)
+pinecone_destination_entry = DestinationRegistryEntry(
+    connection_config=PineconeConnectionConfig,
+    uploader=PineconeUploader,
+    uploader_config=PineconeUploaderConfig,
+    upload_stager=PineconeUploadStager,
+    upload_stager_config=PineconeUploadStagerConfig,
+)

unstructured_ingest/processes/connectors/qdrant/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+from __future__ import annotations
+from unstructured_ingest.processes.connector_registry import (
+    add_destination_entry,
+)
+from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR_TYPE
+from .cloud import qdrant_cloud_destination_entry
+from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
+from .local import qdrant_local_destination_entry
+from .server import CONNECTOR_TYPE as SERVER_CONNECTOR_TYPE
+from .server import qdrant_server_destination_entry
+add_destination_entry(destination_type=CLOUD_CONNECTOR_TYPE, entry=qdrant_cloud_destination_entry)
+add_destination_entry(destination_type=SERVER_CONNECTOR_TYPE, entry=qdrant_server_destination_entry)
+add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=qdrant_local_destination_entry)