unstructured-ingest 1.2.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__init__.py +1 -0
- unstructured_ingest/__version__.py +1 -0
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/cli/__init__.py +0 -0
- unstructured_ingest/cli/base/__init__.py +4 -0
- unstructured_ingest/cli/base/cmd.py +269 -0
- unstructured_ingest/cli/base/dest.py +84 -0
- unstructured_ingest/cli/base/importer.py +34 -0
- unstructured_ingest/cli/base/src.py +75 -0
- unstructured_ingest/cli/cli.py +24 -0
- unstructured_ingest/cli/cmds.py +14 -0
- unstructured_ingest/cli/utils/__init__.py +0 -0
- unstructured_ingest/cli/utils/click.py +237 -0
- unstructured_ingest/cli/utils/model_conversion.py +222 -0
- unstructured_ingest/data_types/__init__.py +0 -0
- unstructured_ingest/data_types/entities.py +17 -0
- unstructured_ingest/data_types/file_data.py +116 -0
- unstructured_ingest/embed/__init__.py +0 -0
- unstructured_ingest/embed/azure_openai.py +63 -0
- unstructured_ingest/embed/bedrock.py +323 -0
- unstructured_ingest/embed/huggingface.py +69 -0
- unstructured_ingest/embed/interfaces.py +146 -0
- unstructured_ingest/embed/mixedbreadai.py +134 -0
- unstructured_ingest/embed/octoai.py +133 -0
- unstructured_ingest/embed/openai.py +142 -0
- unstructured_ingest/embed/togetherai.py +116 -0
- unstructured_ingest/embed/vertexai.py +109 -0
- unstructured_ingest/embed/voyageai.py +130 -0
- unstructured_ingest/error.py +156 -0
- unstructured_ingest/errors_v2.py +156 -0
- unstructured_ingest/interfaces/__init__.py +27 -0
- unstructured_ingest/interfaces/connector.py +56 -0
- unstructured_ingest/interfaces/downloader.py +90 -0
- unstructured_ingest/interfaces/indexer.py +29 -0
- unstructured_ingest/interfaces/process.py +22 -0
- unstructured_ingest/interfaces/processor.py +88 -0
- unstructured_ingest/interfaces/upload_stager.py +89 -0
- unstructured_ingest/interfaces/uploader.py +67 -0
- unstructured_ingest/logger.py +39 -0
- unstructured_ingest/main.py +11 -0
- unstructured_ingest/otel.py +128 -0
- unstructured_ingest/pipeline/__init__.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +211 -0
- unstructured_ingest/pipeline/otel.py +32 -0
- unstructured_ingest/pipeline/pipeline.py +408 -0
- unstructured_ingest/pipeline/steps/__init__.py +0 -0
- unstructured_ingest/pipeline/steps/chunk.py +78 -0
- unstructured_ingest/pipeline/steps/download.py +206 -0
- unstructured_ingest/pipeline/steps/embed.py +77 -0
- unstructured_ingest/pipeline/steps/filter.py +35 -0
- unstructured_ingest/pipeline/steps/index.py +86 -0
- unstructured_ingest/pipeline/steps/partition.py +77 -0
- unstructured_ingest/pipeline/steps/stage.py +65 -0
- unstructured_ingest/pipeline/steps/uncompress.py +50 -0
- unstructured_ingest/pipeline/steps/upload.py +58 -0
- unstructured_ingest/processes/__init__.py +18 -0
- unstructured_ingest/processes/chunker.py +131 -0
- unstructured_ingest/processes/connector_registry.py +69 -0
- unstructured_ingest/processes/connectors/__init__.py +129 -0
- unstructured_ingest/processes/connectors/airtable.py +238 -0
- unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/astradb.py +592 -0
- unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
- unstructured_ingest/processes/connectors/chroma.py +193 -0
- unstructured_ingest/processes/connectors/confluence.py +527 -0
- unstructured_ingest/processes/connectors/couchbase.py +336 -0
- unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
- unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
- unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
- unstructured_ingest/processes/connectors/delta_table.py +310 -0
- unstructured_ingest/processes/connectors/discord.py +161 -0
- unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
- unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
- unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
- unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
- unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
- unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
- unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
- unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
- unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
- unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
- unstructured_ingest/processes/connectors/github.py +226 -0
- unstructured_ingest/processes/connectors/gitlab.py +270 -0
- unstructured_ingest/processes/connectors/google_drive.py +848 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
- unstructured_ingest/processes/connectors/jira.py +522 -0
- unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
- unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
- unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
- unstructured_ingest/processes/connectors/kafka/local.py +103 -0
- unstructured_ingest/processes/connectors/kdbai.py +156 -0
- unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
- unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
- unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
- unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
- unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
- unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
- unstructured_ingest/processes/connectors/local.py +227 -0
- unstructured_ingest/processes/connectors/milvus.py +311 -0
- unstructured_ingest/processes/connectors/mongodb.py +389 -0
- unstructured_ingest/processes/connectors/neo4j.py +534 -0
- unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/client.py +349 -0
- unstructured_ingest/processes/connectors/notion/connector.py +350 -0
- unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
- unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
- unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
- unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
- unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
- unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
- unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
- unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
- unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
- unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
- unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
- unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
- unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
- unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
- unstructured_ingest/processes/connectors/onedrive.py +485 -0
- unstructured_ingest/processes/connectors/outlook.py +242 -0
- unstructured_ingest/processes/connectors/pinecone.py +400 -0
- unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
- unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
- unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
- unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
- unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
- unstructured_ingest/processes/connectors/redisdb.py +214 -0
- unstructured_ingest/processes/connectors/salesforce.py +307 -0
- unstructured_ingest/processes/connectors/sharepoint.py +282 -0
- unstructured_ingest/processes/connectors/slack.py +249 -0
- unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
- unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
- unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
- unstructured_ingest/processes/connectors/sql/sql.py +456 -0
- unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
- unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
- unstructured_ingest/processes/connectors/utils.py +60 -0
- unstructured_ingest/processes/connectors/vectara.py +348 -0
- unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
- unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
- unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
- unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
- unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
- unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
- unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
- unstructured_ingest/processes/embedder.py +203 -0
- unstructured_ingest/processes/filter.py +60 -0
- unstructured_ingest/processes/partitioner.py +233 -0
- unstructured_ingest/processes/uncompress.py +61 -0
- unstructured_ingest/processes/utils/__init__.py +8 -0
- unstructured_ingest/processes/utils/blob_storage.py +32 -0
- unstructured_ingest/processes/utils/logging/connector.py +365 -0
- unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
- unstructured_ingest/unstructured_api.py +140 -0
- unstructured_ingest/utils/__init__.py +5 -0
- unstructured_ingest/utils/chunking.py +56 -0
- unstructured_ingest/utils/compression.py +72 -0
- unstructured_ingest/utils/constants.py +2 -0
- unstructured_ingest/utils/data_prep.py +216 -0
- unstructured_ingest/utils/dep_check.py +78 -0
- unstructured_ingest/utils/filesystem.py +27 -0
- unstructured_ingest/utils/html.py +174 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/utils/pydantic_models.py +52 -0
- unstructured_ingest/utils/string_and_date_utils.py +74 -0
- unstructured_ingest/utils/table.py +80 -0
- unstructured_ingest/utils/tls.py +15 -0
- unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
- unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
- unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
- unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
- unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Generator
|
|
5
|
+
|
|
6
|
+
from pydantic import Field, Secret
|
|
7
|
+
|
|
8
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
9
|
+
from unstructured_ingest.error import DestinationConnectionError, ValueError, WriteError
|
|
10
|
+
from unstructured_ingest.interfaces import (
|
|
11
|
+
AccessConfig,
|
|
12
|
+
ConnectionConfig,
|
|
13
|
+
Uploader,
|
|
14
|
+
UploaderConfig,
|
|
15
|
+
UploadStager,
|
|
16
|
+
UploadStagerConfig,
|
|
17
|
+
)
|
|
18
|
+
from unstructured_ingest.logger import logger
|
|
19
|
+
from unstructured_ingest.processes.connector_registry import (
|
|
20
|
+
DestinationRegistryEntry,
|
|
21
|
+
)
|
|
22
|
+
from unstructured_ingest.processes.connectors.utils import parse_datetime
|
|
23
|
+
from unstructured_ingest.utils.constants import RECORD_ID_LABEL
|
|
24
|
+
from unstructured_ingest.utils.data_prep import batch_generator, get_enhanced_element_id
|
|
25
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from azure.search.documents import SearchClient
|
|
29
|
+
from azure.search.documents.indexes import SearchIndexClient
|
|
30
|
+
|
|
31
|
+
CONNECTOR_TYPE = "azure_ai_search"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class AzureAISearchAccessConfig(AccessConfig):
    """Secret credentials used to authenticate against an Azure AI Search service."""

    # Exposed to users under the alias "key"; stored internally under a
    # connector-specific name to avoid collisions.
    azure_ai_search_key: str = Field(
        alias="key", description="Credential that is used for authenticating to an Azure service"
    )
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class AzureAISearchConnectionConfig(ConnectionConfig):
    """Connection settings for an Azure AI Search service and one target index.

    Provides context-managed factory methods for both the document-level
    ``SearchClient`` and the schema-level ``SearchIndexClient``.
    """

    endpoint: str = Field(
        description="The URL endpoint of an Azure AI (Cognitive) search service. "
        "In the form of https://{{service_name}}.search.windows.net"
    )
    index: str = Field(
        description="The name of the Azure AI (Cognitive) Search index to connect to."
    )
    access_config: Secret[AzureAISearchAccessConfig]

    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
    @contextmanager
    def get_search_client(self) -> Generator["SearchClient", None, None]:
        """Yield a SearchClient bound to the configured endpoint and index."""
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents import SearchClient

        api_key = self.access_config.get_secret_value().azure_ai_search_key
        search_client = SearchClient(
            endpoint=self.endpoint,
            index_name=self.index,
            credential=AzureKeyCredential(api_key),
        )
        # SearchClient is itself a context manager; entering it here ensures
        # the underlying session is closed when the caller's block exits.
        with search_client as client:
            yield client

    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
    @contextmanager
    def get_search_index_client(self) -> Generator["SearchIndexClient", None, None]:
        """Yield a SearchIndexClient for schema-level operations on the service."""
        from azure.core.credentials import AzureKeyCredential
        from azure.search.documents.indexes import SearchIndexClient

        api_key = self.access_config.get_secret_value().azure_ai_search_key
        index_client = SearchIndexClient(
            endpoint=self.endpoint,
            credential=AzureKeyCredential(api_key),
        )
        with index_client as search_index_client:
            yield search_index_client
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class AzureAISearchUploadStagerConfig(UploadStagerConfig):
    """Configuration for the Azure AI Search upload stager; declares no extra options."""

    pass
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class AzureAISearchUploaderConfig(UploaderConfig):
    """Uploader options: batch sizing plus the field used to find prior uploads."""

    # Number of documents sent per upload_documents call.
    batch_size: int = Field(default=100, description="Number of records per batch")
    # Field name queried to locate documents written for the same source
    # record on earlier runs (so they can be deleted before re-upload).
    record_id_key: str = Field(
        default=RECORD_ID_LABEL,
        description="searchable key to find entries for the same record on previous runs",
    )
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass
class AzureAISearchUploadStager(UploadStager):
    """Stager that reshapes element dictionaries into Azure AI Search documents."""

    upload_stager_config: AzureAISearchUploadStagerConfig = field(
        default_factory=lambda: AzureAISearchUploadStagerConfig()
    )

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Return a copy of element_dict shaped for the Azure AI Search index.

        Nested structures (coordinates, record locators, permissions, links,
        regex metadata) are serialized to JSON strings, and timestamps are
        normalized to an ISO-8601 UTC-style format, since the index schema
        stores them as plain strings.
        """
        data = element_dict.copy()
        data["id"] = get_enhanced_element_id(element_dict=data, file_data=file_data)
        data[RECORD_ID_LABEL] = file_data.identifier

        # These locals alias the nested dicts inside `data` when present; if a
        # level is missing the .get() fallbacks are fresh dicts, every lookup
        # below is falsy, and nothing is written — same as the original guards.
        metadata = data.get("metadata", {})
        data_source = metadata.get("data_source", {})

        if points := metadata.get("coordinates", {}).get("points"):
            metadata["coordinates"]["points"] = json.dumps(points)
        if version := data_source.get("version"):
            data_source["version"] = str(version)
        if record_locator := data_source.get("record_locator"):
            data_source["record_locator"] = json.dumps(record_locator)
        if permissions_data := data_source.get("permissions_data"):
            data_source["permissions_data"] = json.dumps(permissions_data)
        if links := metadata.get("links"):
            metadata["links"] = [json.dumps(link) for link in links]

        timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
        if last_modified := metadata.get("last_modified"):
            metadata["last_modified"] = parse_datetime(last_modified).strftime(timestamp_format)
        for date_field in ("date_created", "date_modified", "date_processed"):
            if raw_date := data_source.get(date_field):
                data_source[date_field] = parse_datetime(raw_date).strftime(timestamp_format)

        if regex_metadata := metadata.get("regex_metadata"):
            metadata["regex_metadata"] = json.dumps(regex_metadata)
        if page_number := metadata.get("page_number"):
            metadata["page_number"] = str(page_number)
        return data
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@dataclass
class AzureAISearchUploader(Uploader):
    """Writes element dictionaries to an Azure AI Search index in batches.

    Before uploading, any documents previously written for the same source
    record are deleted — provided the index schema has a filterable field
    matching the configured ``record_id_key``.
    """

    upload_config: AzureAISearchUploaderConfig
    connection_config: AzureAISearchConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    @staticmethod
    def _partition_results(results) -> tuple[list, list]:
        """Split per-document indexing results into (successes, failures)."""
        successes: list = []
        failures: list = []
        for result in results:
            (successes if result.succeeded else failures).append(result)
        return successes, failures

    def query_docs(self, record_id: str, index_key: str) -> list[str]:
        """Return the index-key values of all documents tagged with record_id."""
        # Fix: filter on the configured record-id field rather than the
        # hard-coded literal "record_id", so a custom record_id_key is honored
        # and delete_by_record_id actually finds prior uploads. Single quotes
        # are doubled per OData string-literal escaping rules.
        sanitized_id = record_id.replace("'", "''")
        filter_expr = f"{self.upload_config.record_id_key} eq '{sanitized_id}'"
        with self.connection_config.get_search_client() as search_client:
            results = list(search_client.search(filter=filter_expr, select=[index_key]))
        return [result[index_key] for result in results]

    def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
        """Delete every indexed document previously created from file_data.

        Raises:
            WriteError: if any individual delete fails.
        """
        logger.debug(
            f"deleting any content with metadata "
            f"{self.upload_config.record_id_key}={file_data.identifier} "
            f"from azure cognitive search index: {self.connection_config.index}"
        )
        doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
        if not doc_ids_to_delete:
            return
        with self.connection_config.get_search_client() as search_client:
            results = search_client.delete_documents(
                documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
            )
        success, errors = self._partition_results(results)
        logger.debug(f"results: {len(success)} successes, {len(errors)} failures")
        if errors:
            raise WriteError(
                ", ".join(
                    [f"[{error.status_code}] {error.error_message}" for error in errors],
                ),
            )

    @DestinationConnectionError.wrap
    @requires_dependencies(["azure"], extras="azure-ai-search")
    def write_dict(
        self, elements_dict: list[dict[str, Any]], search_client: "SearchClient"
    ) -> None:
        """Upload one batch of documents via the provided client.

        Raises:
            WriteError: on an HTTP failure or any per-document failure.
        """
        import azure.core.exceptions

        logger.info(
            f"writing {len(elements_dict)} documents to destination "
            f"index at {self.connection_config.index}",
        )
        try:
            results = search_client.upload_documents(documents=elements_dict)
        except azure.core.exceptions.HttpResponseError as http_error:
            raise WriteError(f"http error: {http_error}") from http_error

        success, errors = self._partition_results(results)
        logger.debug(f"results: {len(success)} successes, {len(errors)} failures")
        if errors:
            raise WriteError(
                ", ".join(
                    [
                        f"{error.key}: [{error.status_code}] {error.error_message}"
                        for error in errors
                    ],
                ),
            )

    def can_delete(self) -> bool:
        """True if the index has a filterable field named record_id_key."""
        with self.connection_config.get_search_index_client() as search_index_client:
            index = search_index_client.get_index(name=self.connection_config.index)
        # Renamed the loop variable: the original `field` shadowed the
        # dataclasses.field import.
        record_id_fields = [
            f for f in index.fields if f.name == self.upload_config.record_id_key
        ]
        if not record_id_fields:
            return False
        return record_id_fields[0].filterable

    def get_index_key(self) -> str:
        """Return the name of the index's key field.

        Raises:
            ValueError: if the index schema declares no key field.
        """
        with self.connection_config.get_search_index_client() as search_index_client:
            index = search_index_client.get_index(name=self.connection_config.index)
        key_fields = [f for f in index.fields if f.key]
        if not key_fields:
            raise ValueError("no key field found in index fields")
        return key_fields[0].name

    def precheck(self) -> None:
        """Validate connectivity with a cheap document-count request.

        Raises:
            DestinationConnectionError: if the service cannot be reached.
        """
        try:
            with self.connection_config.get_search_client() as search_client:
                search_client.get_document_count()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            # Chain the original exception for easier debugging.
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Upload all elements for file_data, replacing prior content when possible."""
        logger.info(
            f"writing document batches to destination"
            f" endpoint at {str(self.connection_config.endpoint)}"
            f" index at {str(self.connection_config.index)}"
            f" with batch size {str(self.upload_config.batch_size)}"
        )
        if self.can_delete():
            index_key = self.get_index_key()
            self.delete_by_record_id(file_data=file_data, index_key=index_key)
        else:
            logger.warning("criteria for deleting previous content not met, skipping")

        batch_size = self.upload_config.batch_size
        # Reuse one client connection for every batch.
        with self.connection_config.get_search_client() as search_client:
            for chunk in batch_generator(data, batch_size):
                self.write_dict(elements_dict=chunk, search_client=search_client)
|
267
|
+
|
|
268
|
+
|
|
269
|
+
# Registry entry exposing this connector as an ingest destination; consumed by
# the connector registry to wire up configs, stager, and uploader.
azure_ai_search_destination_entry = DestinationRegistryEntry(
    connection_config=AzureAISearchConnectionConfig,
    uploader=AzureAISearchUploader,
    uploader_config=AzureAISearchUploaderConfig,
    upload_stager=AzureAISearchUploadStager,
    upload_stager_config=AzureAISearchUploadStagerConfig,
)
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from datetime import date, datetime
|
|
3
|
+
from typing import TYPE_CHECKING, Annotated, Any, Optional
|
|
4
|
+
|
|
5
|
+
from dateutil import parser
|
|
6
|
+
from pydantic import Field, Secret
|
|
7
|
+
from pydantic.functional_validators import BeforeValidator
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.data_types.file_data import FileData
|
|
10
|
+
from unstructured_ingest.error import DestinationConnectionError, ValueError
|
|
11
|
+
from unstructured_ingest.interfaces import (
|
|
12
|
+
AccessConfig,
|
|
13
|
+
ConnectionConfig,
|
|
14
|
+
Uploader,
|
|
15
|
+
UploaderConfig,
|
|
16
|
+
UploadStager,
|
|
17
|
+
UploadStagerConfig,
|
|
18
|
+
)
|
|
19
|
+
from unstructured_ingest.logger import logger
|
|
20
|
+
from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
|
|
21
|
+
from unstructured_ingest.utils.data_prep import (
|
|
22
|
+
batch_generator,
|
|
23
|
+
flatten_dict,
|
|
24
|
+
get_enhanced_element_id,
|
|
25
|
+
)
|
|
26
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
27
|
+
|
|
28
|
+
from .utils import conform_string_to_dict
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from chromadb import Client
|
|
32
|
+
|
|
33
|
+
CONNECTOR_TYPE = "chroma"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ChromaAccessConfig(AccessConfig):
    """Optional settings and headers forwarded to the Chroma client.

    Both fields accept either a dict or a string, which is coerced to a dict
    by the conform_string_to_dict validator.
    """

    settings: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
        default=None, description="A dictionary of settings to communicate with the chroma server."
    )
    headers: Optional[Annotated[dict, BeforeValidator(conform_string_to_dict)]] = Field(
        default=None, description="A dictionary of headers to send to the Chroma server."
    )
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class ChromaConnectionConfig(ConnectionConfig):
    """Connection settings for Chroma: either a local persistence path or an
    HTTP server (host + port)."""

    access_config: Secret[ChromaAccessConfig] = Field(
        default=ChromaAccessConfig(), validate_default=True
    )
    path: Optional[str] = Field(
        default=None, description="Location where Chroma is persisted, if not connecting via http."
    )
    tenant: Optional[str] = Field(
        default="default_tenant", description="The tenant to use for this client."
    )
    database: Optional[str] = Field(
        default="default_database", description="The database to use for this client."
    )
    host: Optional[str] = Field(default=None, description="The hostname of the Chroma server.")
    port: Optional[int] = Field(default=None, description="The port of the Chroma server.")
    ssl: bool = Field(
        default=False, description="Whether to use SSL to connect to the Chroma server."
    )
    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

    @requires_dependencies(["chromadb"], extras="chroma")
    def get_client(self) -> "Client":
        """Build a Chroma client: persistent when a path is set, HTTP when a
        host and port are set; raise otherwise."""
        import chromadb

        secrets = self.access_config.get_secret_value()
        if self.path:
            return chromadb.PersistentClient(
                path=self.path,
                settings=secrets.settings,
                tenant=self.tenant,
                database=self.database,
            )
        if self.host and self.port:
            return chromadb.HttpClient(
                host=self.host,
                port=str(self.port),
                ssl=self.ssl,
                headers=secrets.headers,
                settings=secrets.settings,
                tenant=self.tenant,
                database=self.database,
            )
        raise ValueError("Chroma connector requires either path or host and port to be set.")
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# No stager-specific options; exists so the registry entry has a config type.
class ChromaUploadStagerConfig(UploadStagerConfig):
    pass
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass
class ChromaUploadStager(UploadStager):
    upload_stager_config: ChromaUploadStagerConfig = field(
        default_factory=ChromaUploadStagerConfig
    )

    @staticmethod
    def parse_date_string(date_string: str) -> date:
        """Interpret ``date_string`` as a unix timestamp when possible;
        otherwise fall back to dateutil's free-form parser."""
        try:
            return datetime.fromtimestamp(float(date_string))
        except Exception as exc:
            logger.debug(f"date {date_string} string not a timestamp: {exc}")
            return parser.parse(date_string)

    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        """Reshape one element dict into the id/embedding/document/metadata
        record that Chroma expects. Everything not popped out as a top-level
        field is flattened into the metadata."""
        record = element_dict.copy()
        element_id = get_enhanced_element_id(element_dict=record, file_data=file_data)
        embedding = record.pop("embeddings", None)
        document = record.pop("text", None)
        return {
            "id": element_id,
            "embedding": embedding,
            "document": document,
            "metadata": flatten_dict(record, separator="-", flatten_lists=True, remove_none=True),
        }
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class ChromaUploaderConfig(UploaderConfig):
    # Target collection; created on first write if it does not already exist
    # (the uploader calls get_or_create_collection).
    collection_name: str = Field(description="The name of the Chroma collection to write into.")
    # Number of records sent per upsert call.
    batch_size: int = Field(default=100, description="Number of records per batch")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@dataclass
class ChromaUploader(Uploader):
    """Writes staged records into a Chroma collection in batches."""

    connector_type: str = CONNECTOR_TYPE
    upload_config: ChromaUploaderConfig
    connection_config: ChromaConnectionConfig

    def precheck(self) -> None:
        """Validate that a client can be built from the connection config.

        Raises:
            DestinationConnectionError: if client construction fails.
        """
        try:
            self.connection_config.get_client()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

    @DestinationConnectionError.wrap
    def upsert_batch(self, collection, batch):
        """Upsert one prepared batch (parallel lists) into ``collection``.

        Args:
            collection: a chromadb collection object.
            batch: dict of parallel lists as produced by prepare_chroma_list.

        Raises:
            DestinationConnectionError: on any chromadb failure.
        """
        try:
            # Chroma wants lists even if there is only one element
            # Upserting to prevent duplicates
            collection.upsert(
                ids=batch["ids"],
                documents=batch["documents"],
                embeddings=batch["embeddings"],
                metadatas=batch["metadatas"],
            )
        except Exception as e:
            raise DestinationConnectionError(f"chroma error: {e}") from e

    @staticmethod
    def prepare_chroma_list(chunk: tuple[dict[str, Any], ...]) -> dict[str, list[Any]]:
        """Helper function to break a tuple of dicts into parallel lists for ChromaDb.

        ({'id':1}, {'id':2}, {'id':3}) -> {'ids':[1,2,3]}
        """
        chroma_dict = {
            "ids": [x.get("id") for x in chunk],
            "documents": [x.get("document") for x in chunk],
            "embeddings": [x.get("embedding") for x in chunk],
            "metadatas": [x.get("metadata") for x in chunk],
        }
        # All four lists are built from the same chunk so they must stay
        # aligned; validate explicitly rather than with `assert`, which is
        # stripped when Python runs under -O.
        if not (
            len(chroma_dict["ids"])
            == len(chroma_dict["documents"])
            == len(chroma_dict["embeddings"])
            == len(chroma_dict["metadatas"])
        ):
            raise ValueError("parallel lists for Chroma upsert differ in length")
        return chroma_dict

    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
        """Write ``data`` to the configured Chroma collection in batches.

        Args:
            data: staged element dicts (output of the upload stager).
            file_data: metadata for the source file (unused here beyond the
                interface contract).
        """
        logger.info(
            f"writing {len(data)} objects to destination "
            f"collection {self.upload_config.collection_name} "
            f"at {self.connection_config.host}",
        )
        client = self.connection_config.get_client()

        # Create the collection on first use; subsequent runs reuse it.
        collection = client.get_or_create_collection(name=self.upload_config.collection_name)
        for chunk in batch_generator(data, self.upload_config.batch_size):
            self.upsert_batch(collection, self.prepare_chroma_list(chunk))
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
# Registry entry wiring the Chroma destination together: connection, uploader,
# and stager classes plus their config models.
chroma_destination_entry = DestinationRegistryEntry(
    connection_config=ChromaConnectionConfig,
    uploader=ChromaUploader,
    uploader_config=ChromaUploaderConfig,
    upload_stager=ChromaUploadStager,
    upload_stager_config=ChromaUploadStagerConfig,
)
|