unstructured_ingest-1.2.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
unstructured_ingest/processes/connectors/astradb.py (new file)
@@ -0,0 +1,592 @@
+ import asyncio
+ import csv
+ import hashlib
+ import os
+ import re
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from time import time
+ from typing import TYPE_CHECKING, Any, Generator, Optional
+
+ from pydantic import BaseModel, Field, Secret
+
+ from unstructured_ingest.__version__ import __version__ as integration_version
+ from unstructured_ingest.data_types.file_data import (
+     BatchFileData,
+     BatchItem,
+     FileData,
+     FileDataSourceMetadata,
+     SourceIdentifiers,
+ )
+ from unstructured_ingest.error import (
+     DestinationConnectionError,
+     SourceConnectionError,
+     SourceConnectionNetworkError,
+ )
+ from unstructured_ingest.interfaces import (
+     AccessConfig,
+     ConnectionConfig,
+     Downloader,
+     DownloaderConfig,
+     DownloadResponse,
+     Indexer,
+     IndexerConfig,
+     Uploader,
+     UploaderConfig,
+     UploadStager,
+     UploadStagerConfig,
+     download_responses,
+ )
+ from unstructured_ingest.logger import logger
+ from unstructured_ingest.processes.connector_registry import (
+     DestinationRegistryEntry,
+     SourceRegistryEntry,
+ )
+ from unstructured_ingest.processes.connectors.utils import format_and_truncate_orig_elements
+ from unstructured_ingest.utils.constants import RECORD_ID_LABEL
+ from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+ from unstructured_ingest.utils.string_and_date_utils import truncate_string_bytes
+
+ if TYPE_CHECKING:
+     from astrapy import AsyncCollection as AstraDBAsyncCollection
+     from astrapy import Collection as AstraDBCollection
+     from astrapy import DataAPIClient as AstraDBClient
+     from astrapy import Database as AstraDB
+
+
+ CONNECTOR_TYPE = "astradb"
+
+ MAX_CONTENT_PARAM_BYTE_SIZE = 8000
+
+
+ class AstraDBAdditionalMetadata(BaseModel):
+     collection_name: str
+     keyspace: Optional[str] = None
+
+
+ class AstraDBBatchFileData(BatchFileData):
+     additional_metadata: AstraDBAdditionalMetadata
+
+
+ class AstraDBAccessConfig(AccessConfig):
+     token: str = Field(description="Astra DB Token with access to the database.")
+     api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
+
+
+ class AstraDBConnectionConfig(ConnectionConfig):
+     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+     access_config: Secret[AstraDBAccessConfig]
+
+     @requires_dependencies(["astrapy"], extras="astradb")
+     def get_client(self) -> "AstraDBClient":
+         from astrapy import DataAPIClient as AstraDBClient
+
+         # Create a client object to interact with the Astra DB
+         # caller_name/version for Astra DB tracking
+         user_agent = os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
+         return AstraDBClient(callers=[(user_agent, integration_version)])
+
+
+ def get_astra_db(
+     connection_config: AstraDBConnectionConfig,
+     keyspace: str,
+ ) -> "AstraDB":
+     # Build the Astra DB object.
+     access_configs = connection_config.access_config.get_secret_value()
+
+     # Create a client object to interact with the Astra DB
+     # caller_name/version for Astra DB tracking
+     client = connection_config.get_client()
+
+     # Get the database object
+     astra_db = client.get_database(
+         api_endpoint=access_configs.api_endpoint,
+         token=access_configs.token,
+         keyspace=keyspace,
+     )
+     return astra_db
+
+
+ def get_astra_collection(
+     connection_config: AstraDBConnectionConfig,
+     collection_name: str,
+     keyspace: str,
+ ) -> "AstraDBCollection":
+     astra_db = get_astra_db(connection_config=connection_config, keyspace=keyspace)
+
+     # astradb will return a collection object in all cases (even if it doesn't exist)
+     astra_db_collection = astra_db.get_collection(name=collection_name)
+
+     return astra_db_collection
+
+
+ async def get_async_astra_collection(
+     connection_config: AstraDBConnectionConfig,
+     collection_name: str,
+     keyspace: str,
+ ) -> "AstraDBAsyncCollection":
+     # Build the Astra DB object.
+     access_configs = connection_config.access_config.get_secret_value()
+
+     # Create a client object to interact with the Astra DB
+     client = connection_config.get_client()
+
+     # Get the async database object
+     async_astra_db = client.get_async_database(
+         api_endpoint=access_configs.api_endpoint,
+         token=access_configs.token,
+         keyspace=keyspace,
+     )
+
+     # Get async collection from AsyncDatabase
+     async_astra_db_collection = async_astra_db.get_collection(name=collection_name)
+     return async_astra_db_collection
+
+
+ class AstraDBIndexerConfig(IndexerConfig):
+     collection_name: str = Field(
+         description="The name of the Astra DB collection. "
+         "Note that the collection name must only include letters, "
+         "numbers, and underscores."
+     )
+     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+     batch_size: int = Field(default=20, description="Number of records per batch")
+
+
+ @dataclass
+ class AstraDBIndexer(Indexer):
+     connection_config: AstraDBConnectionConfig
+     index_config: AstraDBIndexerConfig
+
+     def get_collection(self) -> "AstraDBCollection":
+         return get_astra_collection(
+             connection_config=self.connection_config,
+             collection_name=self.index_config.collection_name,
+             keyspace=self.index_config.keyspace,
+         )
+
+     def precheck(self) -> None:
+         try:
+             self.get_collection().options()
+         except Exception as e:
+             logger.error(f"Failed to validate connection {e}", exc_info=True)
+             raise SourceConnectionError(f"failed to validate connection: {e}")
+
+     def _get_doc_ids(self) -> set[str]:
+         """Fetches all document ids in an index"""
+         # Get the collection
+         collection = self.get_collection()
+
+         # Perform the find operation to get all items
+         astra_db_docs_cursor = collection.find({}, projection={"_id": True})
+
+         # Iterate over the cursor
+         astra_db_docs = []
+         for result in astra_db_docs_cursor:
+             astra_db_docs.append(result)
+
+         # Create file data for each astra record
+         ids = sorted([astra_record["_id"] for astra_record in astra_db_docs])
+
+         return set(ids)
+
+     def run(self, **kwargs: Any) -> Generator[AstraDBBatchFileData, None, None]:
+         all_ids = self._get_doc_ids()
+         ids = list(all_ids)
+         id_batches = batch_generator(ids, self.index_config.batch_size)
+         for batch in id_batches:
+             batch_items = [BatchItem(identifier=b) for b in batch]
+             display_name = (
+                 f"{self.index_config.collection_name}-{self.index_config.keyspace}"
+                 f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]"
+             )
+             fd = AstraDBBatchFileData(
+                 connector_type=CONNECTOR_TYPE,
+                 metadata=FileDataSourceMetadata(
+                     date_processed=str(time()),
+                 ),
+                 additional_metadata=AstraDBAdditionalMetadata(
+                     collection_name=self.index_config.collection_name,
+                     keyspace=self.index_config.keyspace,
+                 ),
+                 batch_items=batch_items,
+                 display_name=display_name,
+             )
+             yield fd
+
+
+ class AstraDBDownloaderConfig(DownloaderConfig):
+     fields: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class AstraDBDownloader(Downloader):
+     connection_config: AstraDBConnectionConfig
+     download_config: AstraDBDownloaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def is_async(self) -> bool:
+         return True
+
+     def get_identifier(self, record_id: str) -> str:
+         f = f"{record_id}"
+         if self.download_config.fields:
+             f = "{}-{}".format(
+                 f,
+                 hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+             )
+         return f
+
+     def write_astra_result_to_csv(self, astra_result: dict, download_path: str) -> None:
+         with open(download_path, "w", encoding="utf8") as f:
+             writer = csv.writer(f)
+             writer.writerow(astra_result.keys())
+             writer.writerow(astra_result.values())
+
+     def generate_download_response(
+         self, result: dict, file_data: AstraDBBatchFileData
+     ) -> DownloadResponse:
+         record_id = result["_id"]
+         filename_id = self.get_identifier(record_id=record_id)
+         filename = f"{filename_id}.csv"  # csv to preserve column info
+         download_path = self.download_dir / Path(filename)
+         logger.debug(f"Downloading results from record {record_id} as csv to {download_path}")
+         download_path.parent.mkdir(parents=True, exist_ok=True)
+         try:
+             self.write_astra_result_to_csv(astra_result=result, download_path=str(download_path))
+         except Exception as e:
+             logger.error(
+                 f"failed to download from record {record_id} to {download_path}: {e}",
+                 exc_info=True,
+             )
+             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+         # modify input file_data for download_response
+         file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+         cast_file_data = FileData.cast(file_data=file_data)
+         cast_file_data.identifier = filename
+         cast_file_data.metadata.date_processed = str(time())
+         cast_file_data.metadata.record_locator = {"document_id": record_id}
+         return super().generate_download_response(
+             file_data=cast_file_data, download_path=download_path
+         )
+
+     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+         raise NotImplementedError("Use astradb run_async instead")
+
+     async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+         # Get metadata from file_data
+         astra_file_data = AstraDBBatchFileData.cast(file_data=file_data)
+         ids: list[str] = [item.identifier for item in astra_file_data.batch_items]
+         collection_name: str = astra_file_data.additional_metadata.collection_name
+         keyspace: str = astra_file_data.additional_metadata.keyspace
+
+         # Retrieve results from async collection
+         download_responses = []
+         async_astra_collection = await get_async_astra_collection(
+             connection_config=self.connection_config,
+             collection_name=collection_name,
+             keyspace=keyspace,
+         )
+         async for result in async_astra_collection.find({"_id": {"$in": ids}}):
+             download_responses.append(
+                 self.generate_download_response(result=result, file_data=astra_file_data)
+             )
+         return download_responses
+
+
+ class AstraDBUploadStagerConfig(UploadStagerConfig):
+     flatten_metadata: Optional[bool] = Field(
+         default=False, description="Move metadata to top level of the record."
+     )
+     astra_generated_embeddings: bool = Field(
+         default=False,
+         description="Select this if you've configured an embedding provider integration "
+         "for your collection. Content will be inserted into the $vectorize field and "
+         "embeddings will be generated externally.",
+     )
+     enable_lexical_search: bool = Field(
+         default=False,
+         description="Select this to insert content into the $lexical field "
+         "for lexicographical or hybrid search.",
+     )
+
+
+ @dataclass
+ class AstraDBUploadStager(UploadStager):
+     upload_stager_config: AstraDBUploadStagerConfig = field(
+         default_factory=lambda: AstraDBUploadStagerConfig()
+     )
+
+     def truncate_dict_elements(self, element_dict: dict) -> None:
+         text = element_dict.pop("text", None)
+         if text is not None:
+             element_dict["text"] = truncate_string_bytes(text, MAX_CONTENT_PARAM_BYTE_SIZE)
+         metadata = element_dict.get("metadata")
+         if metadata is not None and isinstance(metadata, dict):
+             text_as_html = element_dict["metadata"].pop("text_as_html", None)
+             if text_as_html is not None:
+                 element_dict["metadata"]["text_as_html"] = truncate_string_bytes(
+                     text_as_html, MAX_CONTENT_PARAM_BYTE_SIZE
+                 )
+             metadata["original_elements"] = format_and_truncate_orig_elements(element_dict)
+             metadata.pop("orig_elements", None)
+
+     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+         self.truncate_dict_elements(element_dict)
+         if self.upload_stager_config.flatten_metadata:
+             # move metadata to top level so it isn't nested in metadata column
+             metadata = element_dict.pop("metadata", None)
+             if metadata:
+                 element_dict.update(metadata)
+
+         content = element_dict.pop("text", None)
+         embeddings = element_dict.pop("embeddings", None)
+
+         result = {
+             "content": content,
+             RECORD_ID_LABEL: file_data.identifier,
+             "metadata": element_dict,
+         }
+
+         # (Austin): We support bring-your-own embeddings XOR Astra-generated embeddings.
+         # Using neither /is/ a valid state, but for now we're enforcing Astra as a vector store.
+         has_unstructured_embeddings = embeddings is not None and len(embeddings) > 0
+         generate_embeddings = self.upload_stager_config.astra_generated_embeddings
+
+         if not has_unstructured_embeddings and not generate_embeddings:
+             raise ValueError(
+                 "No vectors provided. "
+                 "Please enable an Unstructured embedding provider or "
+                 "configure Astra to generate embeddings."
+             )
+         elif has_unstructured_embeddings and generate_embeddings:
+             raise ValueError(
+                 "Cannot use Unstructured embeddings and Astra-generated embeddings simultaneously. "
+                 "Please disable Astra generated embeddings or remove the Unstructured embedder."
+             )
+         elif generate_embeddings:
+             result["$vectorize"] = content
+         elif has_unstructured_embeddings:
+             result["$vector"] = embeddings
+
+         if self.upload_stager_config.enable_lexical_search:
+             result["$lexical"] = content
+
+         return result
+
+
+ class AstraDBUploaderConfig(UploaderConfig):
+     collection_name: Optional[str] = Field(
+         description="The name of the Astra DB collection. "
+         "Note that the collection name must only include letters, "
+         "numbers, and underscores.",
+         default=None,
+     )
+     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+     requested_indexing_policy: Optional[dict[str, Any]] = Field(
+         default=None,
+         description="The indexing policy to use for the collection.",
+         examples=['{"deny": ["metadata"]}'],
+     )
+     batch_size: int = Field(default=20, description="Number of records per batch")
+     max_concurrent_batches: int = Field(
+         default=10,
+         description="Maximum number of batches to upload concurrently. "
+         "Lower values reduce API load but may be slower. "
+         "Higher values may cause timeouts with very large uploads.",
+     )
+     record_id_key: str = Field(
+         default=RECORD_ID_LABEL,
+         description="searchable key to find entries for the same record on previous runs",
+     )
+     binary_encode_vectors: bool = Field(
+         default=True,
+         description="Upload vectors in a binary format. If set to False, "
+         "vectors will be a human-readable list of floats. "
+         "WARNING: Disabling this option may make the upload slower!",
+     )
+
+
+ @dataclass
+ class AstraDBUploader(Uploader):
+     connection_config: AstraDBConnectionConfig
+     upload_config: AstraDBUploaderConfig
+     connector_type: str = CONNECTOR_TYPE
+
+     def is_async(self) -> bool:
+         return True
+
+     def init(self, **kwargs: Any) -> None:
+         self.create_destination(**kwargs)
+
+     @requires_dependencies(["astrapy"], extras="astradb")
+     def precheck(self) -> None:
+         try:
+             if self.upload_config.collection_name:
+                 collection = get_astra_collection(
+                     connection_config=self.connection_config,
+                     collection_name=self.upload_config.collection_name,
+                     keyspace=self.upload_config.keyspace,
+                 )
+                 collection.options()
+             else:
+                 # check for db connection only if collection name is not provided
+                 get_astra_db(
+                     connection_config=self.connection_config,
+                     keyspace=self.upload_config.keyspace,
+                 )
+         except Exception as e:
+             logger.error(f"Failed to validate connection {e}", exc_info=True)
+             raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+     def _collection_exists(self, collection_name: str):
+         collection = get_astra_collection(
+             connection_config=self.connection_config,
+             collection_name=collection_name,
+             keyspace=self.upload_config.keyspace,
+         )
+
+         try:
+             collection.options()
+             return True
+         except RuntimeError as e:
+             if "not found" in str(e):
+                 return False
+             raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")
+         except Exception as e:
+             logger.error(f"failed to check if astra collection exists : {e}")
+             raise DestinationConnectionError(f"failed to check if astra collection exists : {e}")
+
+     def format_destination_name(self, destination_name: str) -> str:
+         # AstraDB collection naming requirements:
+         # must be below 50 characters
+         # must be lowercase alphanumeric and underscores only
+         formatted = re.sub(r"[^a-z0-9]", "_", destination_name.lower())
+         return formatted
+
+     def create_destination(
+         self,
+         destination_name: str = "unstructuredautocreated",
+         vector_length: Optional[int] = None,
+         similarity_metric: Optional[str] = "cosine",
+         **kwargs: Any,
+     ) -> bool:
+         destination_name = self.format_destination_name(destination_name)
+         collection_name = self.upload_config.collection_name or destination_name
+         self.upload_config.collection_name = collection_name
+
+         if not self._collection_exists(collection_name):
+             from astrapy.info import CollectionDefinition
+
+             astra_db = get_astra_db(
+                 connection_config=self.connection_config, keyspace=self.upload_config.keyspace
+             )
+             logger.info(
+                 f"creating default astra collection '{collection_name}' with dimension "
+                 f"{vector_length} and metric {similarity_metric}"
+             )
+             definition = (
+                 CollectionDefinition.builder()
+                 .set_vector_dimension(dimension=vector_length)
+                 .set_vector_metric(similarity_metric)
+                 .build()
+             )
+             (astra_db.create_collection(collection_name, definition=definition),)
+             return True
+         logger.debug(f"collection with name '{collection_name}' already exists, skipping creation")
+         return False
+
+     async def delete_by_record_id(self, collection: "AstraDBAsyncCollection", file_data: FileData):
+         logger.debug(
+             f"deleting records from collection {collection.name} "
+             f"with {self.upload_config.record_id_key} "
+             f"set to {file_data.identifier}"
+         )
+         delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
+         delete_resp = await collection.delete_many(filter=delete_filter)
+         logger.debug(
+             f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
+         )
+
+     @requires_dependencies(["astrapy"], extras="astradb")
+     async def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+         logger.info(
+             f"writing {len(data)} objects to destination "
+             f"collection {self.upload_config.collection_name}"
+         )
+
+         astra_db_batch_size = self.upload_config.batch_size
+         max_concurrent = self.upload_config.max_concurrent_batches
+         async_astra_collection = await get_async_astra_collection(
+             connection_config=self.connection_config,
+             collection_name=self.upload_config.collection_name,
+             keyspace=self.upload_config.keyspace,
+         )
+
+         # If we're disabling binary encoded vectors, update the collection settings
+         if not self.upload_config.binary_encode_vectors:
+             from astrapy.api_options import APIOptions, SerdesOptions
+
+             async_astra_collection = async_astra_collection.with_options(
+                 api_options=APIOptions(serdes_options=SerdesOptions(binary_encode_vectors=False))
+             )
+
+         await self.delete_by_record_id(collection=async_astra_collection, file_data=file_data)
+
+         batches = list(batch_generator(data, astra_db_batch_size))
+         total_batches = len(batches)
+         logger.info(
+             f"Uploading {len(data)} elements in {total_batches} batches "
+             f"(batch_size={astra_db_batch_size}, max_concurrent={max_concurrent})"
+         )
+
+         semaphore = asyncio.Semaphore(max_concurrent)
+
+         log_interval = 100
+         async def upload_batch_with_semaphore(batch: tuple[dict, ...], batch_num: int) -> None:
+             async with semaphore:
+                 try:
+                     await async_astra_collection.insert_many(batch)
+                     if (batch_num + 1) % log_interval == 0 or batch_num == total_batches - 1:
+                         logger.debug(
+                             f"Upload progress: {batch_num + 1}/{total_batches} batches completed "
+                             f"({(batch_num + 1) / total_batches * 100:.1f}%)"
+                         )
+                 except Exception as e:
+                     logger.error(
+                         f"Failed to upload batch {batch_num + 1}/{total_batches}: {e}"
+                     )
+                     raise
+
+         await asyncio.gather(
+             *[
+                 upload_batch_with_semaphore(batch, batch_num)
+                 for batch_num, batch in enumerate(batches)
+             ]
+         )
+
+     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+         data = get_json_data(path=path)
+         await self.run_data(data=data, file_data=file_data)
+
+     def run(self, **kwargs: Any) -> Any:
+         raise NotImplementedError("Use astradb run_async instead")
+
+
+ astra_db_source_entry = SourceRegistryEntry(
+     indexer=AstraDBIndexer,
+     indexer_config=AstraDBIndexerConfig,
+     downloader=AstraDBDownloader,
+     downloader_config=AstraDBDownloaderConfig,
+     connection_config=AstraDBConnectionConfig,
+ )
+
+ astra_db_destination_entry = DestinationRegistryEntry(
+     connection_config=AstraDBConnectionConfig,
+     upload_stager_config=AstraDBUploadStagerConfig,
+     upload_stager=AstraDBUploadStager,
+     uploader_config=AstraDBUploaderConfig,
+     uploader=AstraDBUploader,
+ )
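For orientation, below is a minimal, hypothetical sketch of how the classes in this new module might be wired up by hand. The class names come from the diff above; the environment variable names, collection, and keyspace values are placeholders, and it assumes the astradb extra (astrapy) is installed so that the @requires_dependencies-decorated methods can run.

    import os

    from unstructured_ingest.processes.connectors.astradb import (
        AstraDBAccessConfig,
        AstraDBConnectionConfig,
        AstraDBUploader,
        AstraDBUploaderConfig,
    )

    # Placeholder credentials; in practice these come from the Astra DB console.
    access_config = AstraDBAccessConfig(
        token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
        api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
    )
    connection_config = AstraDBConnectionConfig(access_config=access_config)

    # Uploader pointed at an existing collection; precheck() validates the
    # connection the same way the pipeline does before any records are written.
    uploader = AstraDBUploader(
        connection_config=connection_config,
        upload_config=AstraDBUploaderConfig(
            collection_name="my_collection",
            keyspace="default_keyspace",
        ),
    )
    uploader.precheck()

In normal use these objects are not constructed by hand: astra_db_source_entry and astra_db_destination_entry register the indexer, downloader, stager, and uploader with the connector registry, and the CLI and pipeline layers instantiate them from user-supplied options.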