unstructured_ingest-1.2.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (243)
  1. unstructured_ingest/__init__.py +1 -0
  2. unstructured_ingest/__version__.py +1 -0
  3. unstructured_ingest/cli/README.md +28 -0
  4. unstructured_ingest/cli/__init__.py +0 -0
  5. unstructured_ingest/cli/base/__init__.py +4 -0
  6. unstructured_ingest/cli/base/cmd.py +269 -0
  7. unstructured_ingest/cli/base/dest.py +84 -0
  8. unstructured_ingest/cli/base/importer.py +34 -0
  9. unstructured_ingest/cli/base/src.py +75 -0
  10. unstructured_ingest/cli/cli.py +24 -0
  11. unstructured_ingest/cli/cmds.py +14 -0
  12. unstructured_ingest/cli/utils/__init__.py +0 -0
  13. unstructured_ingest/cli/utils/click.py +237 -0
  14. unstructured_ingest/cli/utils/model_conversion.py +222 -0
  15. unstructured_ingest/data_types/__init__.py +0 -0
  16. unstructured_ingest/data_types/entities.py +17 -0
  17. unstructured_ingest/data_types/file_data.py +116 -0
  18. unstructured_ingest/embed/__init__.py +0 -0
  19. unstructured_ingest/embed/azure_openai.py +63 -0
  20. unstructured_ingest/embed/bedrock.py +323 -0
  21. unstructured_ingest/embed/huggingface.py +69 -0
  22. unstructured_ingest/embed/interfaces.py +146 -0
  23. unstructured_ingest/embed/mixedbreadai.py +134 -0
  24. unstructured_ingest/embed/octoai.py +133 -0
  25. unstructured_ingest/embed/openai.py +142 -0
  26. unstructured_ingest/embed/togetherai.py +116 -0
  27. unstructured_ingest/embed/vertexai.py +109 -0
  28. unstructured_ingest/embed/voyageai.py +130 -0
  29. unstructured_ingest/error.py +156 -0
  30. unstructured_ingest/errors_v2.py +156 -0
  31. unstructured_ingest/interfaces/__init__.py +27 -0
  32. unstructured_ingest/interfaces/connector.py +56 -0
  33. unstructured_ingest/interfaces/downloader.py +90 -0
  34. unstructured_ingest/interfaces/indexer.py +29 -0
  35. unstructured_ingest/interfaces/process.py +22 -0
  36. unstructured_ingest/interfaces/processor.py +88 -0
  37. unstructured_ingest/interfaces/upload_stager.py +89 -0
  38. unstructured_ingest/interfaces/uploader.py +67 -0
  39. unstructured_ingest/logger.py +39 -0
  40. unstructured_ingest/main.py +11 -0
  41. unstructured_ingest/otel.py +128 -0
  42. unstructured_ingest/pipeline/__init__.py +0 -0
  43. unstructured_ingest/pipeline/interfaces.py +211 -0
  44. unstructured_ingest/pipeline/otel.py +32 -0
  45. unstructured_ingest/pipeline/pipeline.py +408 -0
  46. unstructured_ingest/pipeline/steps/__init__.py +0 -0
  47. unstructured_ingest/pipeline/steps/chunk.py +78 -0
  48. unstructured_ingest/pipeline/steps/download.py +206 -0
  49. unstructured_ingest/pipeline/steps/embed.py +77 -0
  50. unstructured_ingest/pipeline/steps/filter.py +35 -0
  51. unstructured_ingest/pipeline/steps/index.py +86 -0
  52. unstructured_ingest/pipeline/steps/partition.py +77 -0
  53. unstructured_ingest/pipeline/steps/stage.py +65 -0
  54. unstructured_ingest/pipeline/steps/uncompress.py +50 -0
  55. unstructured_ingest/pipeline/steps/upload.py +58 -0
  56. unstructured_ingest/processes/__init__.py +18 -0
  57. unstructured_ingest/processes/chunker.py +131 -0
  58. unstructured_ingest/processes/connector_registry.py +69 -0
  59. unstructured_ingest/processes/connectors/__init__.py +129 -0
  60. unstructured_ingest/processes/connectors/airtable.py +238 -0
  61. unstructured_ingest/processes/connectors/assets/__init__.py +0 -0
  62. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +9 -0
  63. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  64. unstructured_ingest/processes/connectors/astradb.py +592 -0
  65. unstructured_ingest/processes/connectors/azure_ai_search.py +275 -0
  66. unstructured_ingest/processes/connectors/chroma.py +193 -0
  67. unstructured_ingest/processes/connectors/confluence.py +527 -0
  68. unstructured_ingest/processes/connectors/couchbase.py +336 -0
  69. unstructured_ingest/processes/connectors/databricks/__init__.py +58 -0
  70. unstructured_ingest/processes/connectors/databricks/volumes.py +233 -0
  71. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +93 -0
  72. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +108 -0
  73. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +91 -0
  74. unstructured_ingest/processes/connectors/databricks/volumes_native.py +92 -0
  75. unstructured_ingest/processes/connectors/databricks/volumes_table.py +187 -0
  76. unstructured_ingest/processes/connectors/delta_table.py +310 -0
  77. unstructured_ingest/processes/connectors/discord.py +161 -0
  78. unstructured_ingest/processes/connectors/duckdb/__init__.py +15 -0
  79. unstructured_ingest/processes/connectors/duckdb/base.py +103 -0
  80. unstructured_ingest/processes/connectors/duckdb/duckdb.py +130 -0
  81. unstructured_ingest/processes/connectors/duckdb/motherduck.py +130 -0
  82. unstructured_ingest/processes/connectors/elasticsearch/__init__.py +19 -0
  83. unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +478 -0
  84. unstructured_ingest/processes/connectors/elasticsearch/opensearch.py +523 -0
  85. unstructured_ingest/processes/connectors/fsspec/__init__.py +37 -0
  86. unstructured_ingest/processes/connectors/fsspec/azure.py +203 -0
  87. unstructured_ingest/processes/connectors/fsspec/box.py +176 -0
  88. unstructured_ingest/processes/connectors/fsspec/dropbox.py +238 -0
  89. unstructured_ingest/processes/connectors/fsspec/fsspec.py +475 -0
  90. unstructured_ingest/processes/connectors/fsspec/gcs.py +203 -0
  91. unstructured_ingest/processes/connectors/fsspec/s3.py +253 -0
  92. unstructured_ingest/processes/connectors/fsspec/sftp.py +177 -0
  93. unstructured_ingest/processes/connectors/fsspec/utils.py +17 -0
  94. unstructured_ingest/processes/connectors/github.py +226 -0
  95. unstructured_ingest/processes/connectors/gitlab.py +270 -0
  96. unstructured_ingest/processes/connectors/google_drive.py +848 -0
  97. unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py +10 -0
  98. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +367 -0
  99. unstructured_ingest/processes/connectors/jira.py +522 -0
  100. unstructured_ingest/processes/connectors/kafka/__init__.py +17 -0
  101. unstructured_ingest/processes/connectors/kafka/cloud.py +121 -0
  102. unstructured_ingest/processes/connectors/kafka/kafka.py +275 -0
  103. unstructured_ingest/processes/connectors/kafka/local.py +103 -0
  104. unstructured_ingest/processes/connectors/kdbai.py +156 -0
  105. unstructured_ingest/processes/connectors/lancedb/__init__.py +30 -0
  106. unstructured_ingest/processes/connectors/lancedb/aws.py +43 -0
  107. unstructured_ingest/processes/connectors/lancedb/azure.py +43 -0
  108. unstructured_ingest/processes/connectors/lancedb/cloud.py +42 -0
  109. unstructured_ingest/processes/connectors/lancedb/gcp.py +44 -0
  110. unstructured_ingest/processes/connectors/lancedb/lancedb.py +181 -0
  111. unstructured_ingest/processes/connectors/lancedb/local.py +44 -0
  112. unstructured_ingest/processes/connectors/local.py +227 -0
  113. unstructured_ingest/processes/connectors/milvus.py +311 -0
  114. unstructured_ingest/processes/connectors/mongodb.py +389 -0
  115. unstructured_ingest/processes/connectors/neo4j.py +534 -0
  116. unstructured_ingest/processes/connectors/notion/__init__.py +0 -0
  117. unstructured_ingest/processes/connectors/notion/client.py +349 -0
  118. unstructured_ingest/processes/connectors/notion/connector.py +350 -0
  119. unstructured_ingest/processes/connectors/notion/helpers.py +448 -0
  120. unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py +3 -0
  121. unstructured_ingest/processes/connectors/notion/ingest_backoff/_common.py +102 -0
  122. unstructured_ingest/processes/connectors/notion/ingest_backoff/_wrapper.py +126 -0
  123. unstructured_ingest/processes/connectors/notion/ingest_backoff/types.py +24 -0
  124. unstructured_ingest/processes/connectors/notion/interfaces.py +32 -0
  125. unstructured_ingest/processes/connectors/notion/types/__init__.py +0 -0
  126. unstructured_ingest/processes/connectors/notion/types/block.py +96 -0
  127. unstructured_ingest/processes/connectors/notion/types/blocks/__init__.py +63 -0
  128. unstructured_ingest/processes/connectors/notion/types/blocks/bookmark.py +40 -0
  129. unstructured_ingest/processes/connectors/notion/types/blocks/breadcrumb.py +21 -0
  130. unstructured_ingest/processes/connectors/notion/types/blocks/bulleted_list_item.py +31 -0
  131. unstructured_ingest/processes/connectors/notion/types/blocks/callout.py +131 -0
  132. unstructured_ingest/processes/connectors/notion/types/blocks/child_database.py +23 -0
  133. unstructured_ingest/processes/connectors/notion/types/blocks/child_page.py +23 -0
  134. unstructured_ingest/processes/connectors/notion/types/blocks/code.py +43 -0
  135. unstructured_ingest/processes/connectors/notion/types/blocks/column_list.py +35 -0
  136. unstructured_ingest/processes/connectors/notion/types/blocks/divider.py +22 -0
  137. unstructured_ingest/processes/connectors/notion/types/blocks/embed.py +36 -0
  138. unstructured_ingest/processes/connectors/notion/types/blocks/equation.py +23 -0
  139. unstructured_ingest/processes/connectors/notion/types/blocks/file.py +49 -0
  140. unstructured_ingest/processes/connectors/notion/types/blocks/heading.py +37 -0
  141. unstructured_ingest/processes/connectors/notion/types/blocks/image.py +21 -0
  142. unstructured_ingest/processes/connectors/notion/types/blocks/link_preview.py +24 -0
  143. unstructured_ingest/processes/connectors/notion/types/blocks/link_to_page.py +29 -0
  144. unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py +29 -0
  145. unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py +31 -0
  146. unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py +49 -0
  147. unstructured_ingest/processes/connectors/notion/types/blocks/quote.py +37 -0
  148. unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +109 -0
  149. unstructured_ingest/processes/connectors/notion/types/blocks/table.py +60 -0
  150. unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py +23 -0
  151. unstructured_ingest/processes/connectors/notion/types/blocks/template.py +30 -0
  152. unstructured_ingest/processes/connectors/notion/types/blocks/todo.py +42 -0
  153. unstructured_ingest/processes/connectors/notion/types/blocks/toggle.py +37 -0
  154. unstructured_ingest/processes/connectors/notion/types/blocks/unsupported.py +20 -0
  155. unstructured_ingest/processes/connectors/notion/types/blocks/video.py +22 -0
  156. unstructured_ingest/processes/connectors/notion/types/database.py +73 -0
  157. unstructured_ingest/processes/connectors/notion/types/database_properties/__init__.py +125 -0
  158. unstructured_ingest/processes/connectors/notion/types/database_properties/checkbox.py +39 -0
  159. unstructured_ingest/processes/connectors/notion/types/database_properties/created_by.py +36 -0
  160. unstructured_ingest/processes/connectors/notion/types/database_properties/created_time.py +35 -0
  161. unstructured_ingest/processes/connectors/notion/types/database_properties/date.py +42 -0
  162. unstructured_ingest/processes/connectors/notion/types/database_properties/email.py +37 -0
  163. unstructured_ingest/processes/connectors/notion/types/database_properties/files.py +38 -0
  164. unstructured_ingest/processes/connectors/notion/types/database_properties/formula.py +50 -0
  165. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_by.py +34 -0
  166. unstructured_ingest/processes/connectors/notion/types/database_properties/last_edited_time.py +35 -0
  167. unstructured_ingest/processes/connectors/notion/types/database_properties/multiselect.py +74 -0
  168. unstructured_ingest/processes/connectors/notion/types/database_properties/number.py +50 -0
  169. unstructured_ingest/processes/connectors/notion/types/database_properties/people.py +42 -0
  170. unstructured_ingest/processes/connectors/notion/types/database_properties/phone_number.py +37 -0
  171. unstructured_ingest/processes/connectors/notion/types/database_properties/relation.py +68 -0
  172. unstructured_ingest/processes/connectors/notion/types/database_properties/rich_text.py +44 -0
  173. unstructured_ingest/processes/connectors/notion/types/database_properties/rollup.py +57 -0
  174. unstructured_ingest/processes/connectors/notion/types/database_properties/select.py +70 -0
  175. unstructured_ingest/processes/connectors/notion/types/database_properties/status.py +82 -0
  176. unstructured_ingest/processes/connectors/notion/types/database_properties/title.py +38 -0
  177. unstructured_ingest/processes/connectors/notion/types/database_properties/unique_id.py +51 -0
  178. unstructured_ingest/processes/connectors/notion/types/database_properties/url.py +38 -0
  179. unstructured_ingest/processes/connectors/notion/types/database_properties/verification.py +79 -0
  180. unstructured_ingest/processes/connectors/notion/types/date.py +29 -0
  181. unstructured_ingest/processes/connectors/notion/types/file.py +54 -0
  182. unstructured_ingest/processes/connectors/notion/types/page.py +52 -0
  183. unstructured_ingest/processes/connectors/notion/types/parent.py +66 -0
  184. unstructured_ingest/processes/connectors/notion/types/rich_text.py +189 -0
  185. unstructured_ingest/processes/connectors/notion/types/user.py +83 -0
  186. unstructured_ingest/processes/connectors/onedrive.py +485 -0
  187. unstructured_ingest/processes/connectors/outlook.py +242 -0
  188. unstructured_ingest/processes/connectors/pinecone.py +400 -0
  189. unstructured_ingest/processes/connectors/qdrant/__init__.py +16 -0
  190. unstructured_ingest/processes/connectors/qdrant/cloud.py +59 -0
  191. unstructured_ingest/processes/connectors/qdrant/local.py +58 -0
  192. unstructured_ingest/processes/connectors/qdrant/qdrant.py +163 -0
  193. unstructured_ingest/processes/connectors/qdrant/server.py +60 -0
  194. unstructured_ingest/processes/connectors/redisdb.py +214 -0
  195. unstructured_ingest/processes/connectors/salesforce.py +307 -0
  196. unstructured_ingest/processes/connectors/sharepoint.py +282 -0
  197. unstructured_ingest/processes/connectors/slack.py +249 -0
  198. unstructured_ingest/processes/connectors/sql/__init__.py +41 -0
  199. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +228 -0
  200. unstructured_ingest/processes/connectors/sql/postgres.py +168 -0
  201. unstructured_ingest/processes/connectors/sql/singlestore.py +176 -0
  202. unstructured_ingest/processes/connectors/sql/snowflake.py +298 -0
  203. unstructured_ingest/processes/connectors/sql/sql.py +456 -0
  204. unstructured_ingest/processes/connectors/sql/sqlite.py +179 -0
  205. unstructured_ingest/processes/connectors/sql/teradata.py +254 -0
  206. unstructured_ingest/processes/connectors/sql/vastdb.py +263 -0
  207. unstructured_ingest/processes/connectors/utils.py +60 -0
  208. unstructured_ingest/processes/connectors/vectara.py +348 -0
  209. unstructured_ingest/processes/connectors/weaviate/__init__.py +22 -0
  210. unstructured_ingest/processes/connectors/weaviate/cloud.py +166 -0
  211. unstructured_ingest/processes/connectors/weaviate/embedded.py +90 -0
  212. unstructured_ingest/processes/connectors/weaviate/local.py +73 -0
  213. unstructured_ingest/processes/connectors/weaviate/weaviate.py +337 -0
  214. unstructured_ingest/processes/connectors/zendesk/__init__.py +0 -0
  215. unstructured_ingest/processes/connectors/zendesk/client.py +314 -0
  216. unstructured_ingest/processes/connectors/zendesk/zendesk.py +241 -0
  217. unstructured_ingest/processes/embedder.py +203 -0
  218. unstructured_ingest/processes/filter.py +60 -0
  219. unstructured_ingest/processes/partitioner.py +233 -0
  220. unstructured_ingest/processes/uncompress.py +61 -0
  221. unstructured_ingest/processes/utils/__init__.py +8 -0
  222. unstructured_ingest/processes/utils/blob_storage.py +32 -0
  223. unstructured_ingest/processes/utils/logging/connector.py +365 -0
  224. unstructured_ingest/processes/utils/logging/sanitizer.py +117 -0
  225. unstructured_ingest/unstructured_api.py +140 -0
  226. unstructured_ingest/utils/__init__.py +5 -0
  227. unstructured_ingest/utils/chunking.py +56 -0
  228. unstructured_ingest/utils/compression.py +72 -0
  229. unstructured_ingest/utils/constants.py +2 -0
  230. unstructured_ingest/utils/data_prep.py +216 -0
  231. unstructured_ingest/utils/dep_check.py +78 -0
  232. unstructured_ingest/utils/filesystem.py +27 -0
  233. unstructured_ingest/utils/html.py +174 -0
  234. unstructured_ingest/utils/ndjson.py +52 -0
  235. unstructured_ingest/utils/pydantic_models.py +52 -0
  236. unstructured_ingest/utils/string_and_date_utils.py +74 -0
  237. unstructured_ingest/utils/table.py +80 -0
  238. unstructured_ingest/utils/tls.py +15 -0
  239. unstructured_ingest-1.2.32.dist-info/METADATA +235 -0
  240. unstructured_ingest-1.2.32.dist-info/RECORD +243 -0
  241. unstructured_ingest-1.2.32.dist-info/WHEEL +4 -0
  242. unstructured_ingest-1.2.32.dist-info/entry_points.txt +2 -0
  243. unstructured_ingest-1.2.32.dist-info/licenses/LICENSE.md +201 -0
unstructured_ingest/processes/connectors/couchbase.py
@@ -0,0 +1,336 @@
+import hashlib
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from datetime import timedelta
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, List
+
+from pydantic import BaseModel, Field, Secret
+
+from unstructured_ingest.data_types.file_data import (
+    BatchFileData,
+    BatchItem,
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
+from unstructured_ingest.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    Indexer,
+    IndexerConfig,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+    download_responses,
+)
+from unstructured_ingest.logger import logger
+from unstructured_ingest.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from couchbase.cluster import Cluster
+    from couchbase.collection import Collection
+
+CONNECTOR_TYPE = "couchbase"
+SERVER_API_VERSION = "1"
+
+
+class CouchbaseAdditionalMetadata(BaseModel):
+    bucket: str
+
+
+class CouchbaseBatchFileData(BatchFileData):
+    additional_metadata: CouchbaseAdditionalMetadata
+
+
+class CouchbaseAccessConfig(AccessConfig):
+    password: str = Field(description="The password for the Couchbase server")
+
+
+class CouchbaseConnectionConfig(ConnectionConfig):
+    username: str = Field(description="The username for the Couchbase server")
+    bucket: str = Field(description="The bucket to connect to on the Couchbase server")
+    connection_string: str = Field(
+        default="couchbase://localhost", description="The connection string of the Couchbase server"
+    )
+    scope: str = Field(
+        default="_default", description="The scope to connect to on the Couchbase server"
+    )
+    collection: str = Field(
+        default="_default", description="The collection to connect to on the Couchbase server"
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    access_config: Secret[CouchbaseAccessConfig]
+
+    @requires_dependencies(["couchbase"], extras="couchbase")
+    @contextmanager
+    def get_client(self) -> Generator["Cluster", None, None]:
+        from couchbase.auth import PasswordAuthenticator
+        from couchbase.cluster import Cluster
+        from couchbase.options import ClusterOptions
+
+        auth = PasswordAuthenticator(self.username, self.access_config.get_secret_value().password)
+        options = ClusterOptions(auth)
+        options.apply_profile("wan_development")
+        cluster = None
+        try:
+            cluster = Cluster(self.connection_string, options)
+            cluster.wait_until_ready(timedelta(seconds=5))
+            yield cluster
+        finally:
+            if cluster:
+                cluster.close()
+
+
+class CouchbaseUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class CouchbaseUploadStager(UploadStager):
+    upload_stager_config: CouchbaseUploadStagerConfig = field(
+        default_factory=lambda: CouchbaseUploadStagerConfig()
+    )
+
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
+        data = element_dict.copy()
+        return {
+            data["element_id"]: {
+                "embedding": data.get("embeddings", None),
+                "text": data.get("text", None),
+                "metadata": data.get("metadata", None),
+                "type": data.get("type", None),
+            }
+        }
+
+
+class CouchbaseUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=50, description="Number of documents to upload per batch")
+
+
+@dataclass
+class CouchbaseUploader(Uploader):
+    connection_config: CouchbaseConnectionConfig
+    upload_config: CouchbaseUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        logger.info(
+            f"writing {len(data)} objects to destination "
+            f"bucket, {self.connection_config.bucket} "
+            f"at {self.connection_config.connection_string}",
+        )
+        with self.connection_config.get_client() as client:
+            bucket = client.bucket(self.connection_config.bucket)
+            scope = bucket.scope(self.connection_config.scope)
+            collection = scope.collection(self.connection_config.collection)
+
+            for chunk in batch_generator(data, self.upload_config.batch_size):
+                collection.upsert_multi(
+                    {doc_id: doc for doc in chunk for doc_id, doc in doc.items()}
+                )
+
+
+class CouchbaseIndexerConfig(IndexerConfig):
+    batch_size: int = Field(default=50, description="Number of documents to index per batch")
+
+
+@dataclass
+class CouchbaseIndexer(Indexer):
+    connection_config: CouchbaseConnectionConfig
+    index_config: CouchbaseIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    @requires_dependencies(["couchbase"], extras="couchbase")
+    def _get_doc_ids(self) -> List[str]:
+        query = (
+            f"SELECT META(d).id "
+            f"FROM `{self.connection_config.bucket}`."
+            f"`{self.connection_config.scope}`."
+            f"`{self.connection_config.collection}` as d"
+        )
+
+        max_attempts = 5
+        attempts = 0
+        while attempts < max_attempts:
+            try:
+                with self.connection_config.get_client() as client:
+                    result = client.query(query)
+                    document_ids = [row["id"] for row in result]
+                    return document_ids
+            except Exception as e:
+                attempts += 1
+                time.sleep(3)
+                if attempts == max_attempts:
+                    raise SourceConnectionError(f"failed to get document ids: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[CouchbaseBatchFileData, None, None]:
+        ids = self._get_doc_ids()
+        for batch in batch_generator(ids, self.index_config.batch_size):
+            # Make sure the hash is always a positive number to create the identifier
+            yield CouchbaseBatchFileData(
+                connector_type=CONNECTOR_TYPE,
+                metadata=FileDataSourceMetadata(
+                    url=f"{self.connection_config.connection_string}/"
+                    f"{self.connection_config.bucket}",
+                    date_processed=str(time.time()),
+                ),
+                additional_metadata=CouchbaseAdditionalMetadata(
+                    bucket=self.connection_config.bucket
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
+            )
+
+
+class CouchbaseDownloaderConfig(DownloaderConfig):
+    collection_id: str = Field(
+        default="id", description="The unique key of the id field in the collection"
+    )
+    fields: list[str] = field(default_factory=list)
+
+
+@dataclass
+class CouchbaseDownloader(Downloader):
+    connection_config: CouchbaseConnectionConfig
+    download_config: CouchbaseDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def is_async(self) -> bool:
+        return False
+
+    def get_identifier(self, bucket: str, record_id: str) -> str:
+        f = f"{bucket}-{record_id}"
+        if self.download_config.fields:
+            f = "{}-{}".format(
+                f,
+                hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8],
+            )
+        return f
+
+    def map_cb_results(self, cb_results: dict) -> str:
+        doc_body = cb_results
+        flattened_dict = flatten_dict(dictionary=doc_body)
+        str_values = [str(value) for value in flattened_dict.values()]
+        concatenated_values = "\n".join(str_values)
+        return concatenated_values
+
+    def generate_download_response(
+        self, result: dict, bucket: str, file_data: CouchbaseBatchFileData
+    ) -> DownloadResponse:
+        record_id = result[self.download_config.collection_id]
+        filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
+        filename = f"{filename_id}.txt"
+        download_path = self.download_dir / Path(filename)
+        logger.debug(
+            f"Downloading results from bucket {bucket} and id {record_id} to {download_path}"
+        )
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            with open(download_path, "w", encoding="utf8") as f:
+                f.write(self.map_cb_results(cb_results=result))
+        except Exception as e:
+            logger.error(
+                f"failed to download from bucket {bucket} "
+                f"and id {record_id} to {download_path}: {e}",
+                exc_info=True,
+            )
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
+        cast_file_data.metadata.date_processed = str(time.time())
+        cast_file_data.metadata.record_locator = {
+            "connection_string": self.connection_config.connection_string,
+            "bucket": bucket,
+            "scope": self.connection_config.scope,
+            "collection": self.connection_config.collection,
+            "document_id": record_id,
+        }
+        return super().generate_download_response(
+            file_data=cast_file_data,
+            download_path=download_path,
+        )
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        couchbase_file_data = CouchbaseBatchFileData.cast(file_data=file_data)
+        bucket_name: str = couchbase_file_data.additional_metadata.bucket
+        ids: list[str] = [item.identifier for item in couchbase_file_data.batch_items]
+
+        with self.connection_config.get_client() as client:
+            bucket = client.bucket(bucket_name)
+            scope = bucket.scope(self.connection_config.scope)
+            collection = scope.collection(self.connection_config.collection)
+
+            download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
+            return list(download_resp)
+
+    def process_doc_id(
+        self,
+        doc_id: str,
+        collection: "Collection",
+        bucket_name: str,
+        file_data: CouchbaseBatchFileData,
+    ):
+        result = collection.get(doc_id)
+        return self.generate_download_response(
+            result=result.content_as[dict], bucket=bucket_name, file_data=file_data
+        )
+
+    def process_all_doc_ids(
+        self,
+        ids: list[str],
+        collection: "Collection",
+        bucket_name: str,
+        file_data: CouchbaseBatchFileData,
+    ):
+        for doc_id in ids:
+            yield self.process_doc_id(doc_id, collection, bucket_name, file_data)
+
+    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        raise NotImplementedError()
+
+
+couchbase_destination_entry = DestinationRegistryEntry(
+    connection_config=CouchbaseConnectionConfig,
+    uploader=CouchbaseUploader,
+    uploader_config=CouchbaseUploaderConfig,
+    upload_stager=CouchbaseUploadStager,
+    upload_stager_config=CouchbaseUploadStagerConfig,
+)
+
+couchbase_source_entry = SourceRegistryEntry(
+    connection_config=CouchbaseConnectionConfig,
+    indexer=CouchbaseIndexer,
+    indexer_config=CouchbaseIndexerConfig,
+    downloader=CouchbaseDownloader,
+    downloader_config=CouchbaseDownloaderConfig,
+)
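
For orientation, a minimal usage sketch of the connector defined above; the credentials and bucket name are placeholders, not values from this release:

from unstructured_ingest.processes.connectors.couchbase import (
    CouchbaseAccessConfig,
    CouchbaseConnectionConfig,
    CouchbaseUploader,
    CouchbaseUploaderConfig,
)

# Placeholder connection values; connection_string, scope, and collection
# fall back to the defaults declared on CouchbaseConnectionConfig.
connection_config = CouchbaseConnectionConfig(
    username="admin",
    bucket="documents",
    access_config=CouchbaseAccessConfig(password="secret"),
)

uploader = CouchbaseUploader(
    connection_config=connection_config,
    upload_config=CouchbaseUploaderConfig(batch_size=100),
)
uploader.precheck()  # failures are re-raised as DestinationConnectionError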
unstructured_ingest/processes/connectors/databricks/__init__.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+from unstructured_ingest.processes.connector_registry import (
+    add_destination_entry,
+    add_source_entry,
+)
+
+from .volumes_aws import CONNECTOR_TYPE as VOLUMES_AWS_CONNECTOR_TYPE
+from .volumes_aws import (
+    databricks_aws_volumes_destination_entry,
+    databricks_aws_volumes_source_entry,
+)
+from .volumes_azure import CONNECTOR_TYPE as VOLUMES_AZURE_CONNECTOR_TYPE
+from .volumes_azure import (
+    databricks_azure_volumes_destination_entry,
+    databricks_azure_volumes_source_entry,
+)
+from .volumes_gcp import CONNECTOR_TYPE as VOLUMES_GCP_CONNECTOR_TYPE
+from .volumes_gcp import (
+    databricks_gcp_volumes_destination_entry,
+    databricks_gcp_volumes_source_entry,
+)
+from .volumes_native import CONNECTOR_TYPE as VOLUMES_NATIVE_CONNECTOR_TYPE
+from .volumes_native import (
+    databricks_native_volumes_destination_entry,
+    databricks_native_volumes_source_entry,
+)
+from .volumes_table import CONNECTOR_TYPE as VOLUMES_TABLE_CONNECTOR_TYPE
+from .volumes_table import databricks_volumes_delta_tables_destination_entry
+
+add_source_entry(source_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_source_entry)
+add_destination_entry(
+    destination_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_destination_entry
+)
+
+add_source_entry(source_type=VOLUMES_GCP_CONNECTOR_TYPE, entry=databricks_gcp_volumes_source_entry)
+add_destination_entry(
+    destination_type=VOLUMES_GCP_CONNECTOR_TYPE, entry=databricks_gcp_volumes_destination_entry
+)
+
+add_source_entry(
+    source_type=VOLUMES_NATIVE_CONNECTOR_TYPE, entry=databricks_native_volumes_source_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_NATIVE_CONNECTOR_TYPE,
+    entry=databricks_native_volumes_destination_entry,
+)
+
+add_source_entry(
+    source_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_source_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_destination_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_TABLE_CONNECTOR_TYPE,
+    entry=databricks_volumes_delta_tables_destination_entry,
+)
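
The module above wires each Databricks volumes flavor into the shared connector registry at import time. The same pattern applies to any registry entry; a sketch using the Couchbase entries defined earlier in this diff:

from unstructured_ingest.processes.connector_registry import (
    add_destination_entry,
    add_source_entry,
)
from unstructured_ingest.processes.connectors.couchbase import (
    CONNECTOR_TYPE,
    couchbase_destination_entry,
    couchbase_source_entry,
)

# Registering an entry makes it discoverable by its connector type name,
# mirroring the add_*_entry calls in the __init__ module above.
add_source_entry(source_type=CONNECTOR_TYPE, entry=couchbase_source_entry)
add_destination_entry(destination_type=CONNECTOR_TYPE, entry=couchbase_destination_entry)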
unstructured_ingest/processes/connectors/databricks/volumes.py
@@ -0,0 +1,233 @@
+import io
+import os
+from abc import ABC
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+from pydantic import BaseModel, Field, Secret
+
+from unstructured_ingest.data_types.file_data import (
+    FileData,
+    FileDataSourceMetadata,
+    SourceIdentifiers,
+)
+from unstructured_ingest.error import (
+    ProviderError,
+    RateLimitError,
+    UserAuthError,
+    UserError,
+)
+from unstructured_ingest.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    Indexer,
+    IndexerConfig,
+    Uploader,
+    UploaderConfig,
+)
+from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from databricks.sdk import WorkspaceClient
+
+
+class DatabricksPathMixin(BaseModel):
+    volume: str = Field(description="Name of volume in the Unity Catalog")
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    volume_path: Optional[str] = Field(
+        default=None, description="Optional path within the volume to write to"
+    )
+    databricks_schema: str = Field(
+        default="default",
+        alias="schema",
+        description="Schema associated with the volume to write to in the Unity Catalog service",
+    )
+
+    @property
+    def path(self) -> str:
+        path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
+        if self.volume_path:
+            path = f"{path}/{self.volume_path}"
+        return path
+
+
+class DatabricksVolumesAccessConfig(AccessConfig):
+    token: Optional[str] = Field(default=None, description="Databricks Personal Access Token")
+
+
+class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
+    access_config: Secret[DatabricksVolumesAccessConfig]
+    host: Optional[str] = Field(
+        default=None,
+        description="The Databricks host URL for either the "
+        "Databricks workspace endpoint or the "
+        "Databricks accounts endpoint.",
+    )
+
+    def wrap_error(self, e: Exception) -> Exception:
+        from databricks.sdk.errors.base import DatabricksError
+        from databricks.sdk.errors.platform import STATUS_CODE_MAPPING
+
+        if isinstance(e, ValueError):
+            error_message = e.args[0]
+            message_split = error_message.split(":")
+            if (message_split[0].endswith("auth")) or (
+                "Client authentication failed" in error_message
+            ):
+                return UserAuthError(e)
+        if isinstance(e, DatabricksError):
+            reverse_mapping = {v: k for k, v in STATUS_CODE_MAPPING.items()}
+            if status_code := reverse_mapping.get(type(e)):
+                if status_code in [401, 403]:
+                    return UserAuthError(e)
+                if status_code == 429:
+                    return RateLimitError(e)
+                if 400 <= status_code < 500:
+                    return UserError(e)
+                if 500 <= status_code < 600:
+                    return ProviderError(e)
+        logger.error(f"unhandled exception from databricks: {e}", exc_info=True)
+        return e
+
+    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
+    def get_client(self) -> "WorkspaceClient":
+        from databricks.sdk import WorkspaceClient
+        from databricks.sdk.core import Config
+
+        config = Config(
+            host=self.host,
+            **self.access_config.get_secret_value().model_dump(),
+        ).with_user_agent_extra(
+            "PyDatabricksSdk", os.getenv("UNSTRUCTURED_USER_AGENT", "unstructuredio_oss")
+        )
+
+        return WorkspaceClient(config=config)
+
+
+class DatabricksVolumesIndexerConfig(IndexerConfig, DatabricksPathMixin):
+    recursive: bool = False
+
+
+@dataclass
+class DatabricksVolumesIndexer(Indexer, ABC):
+    index_config: DatabricksVolumesIndexerConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            raise self.connection_config.wrap_error(e=e) from e
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        try:
+            for file_info in self.connection_config.get_client().dbfs.list(
+                path=self.index_config.path, recursive=self.index_config.recursive
+            ):
+                if file_info.is_dir:
+                    continue
+                rel_path = file_info.path.replace(self.index_config.path, "")
+                if rel_path.startswith("/"):
+                    rel_path = rel_path[1:]
+                filename = Path(file_info.path).name
+                source_identifiers = SourceIdentifiers(
+                    filename=filename,
+                    rel_path=rel_path,
+                    fullpath=file_info.path,
+                )
+                yield FileData(
+                    identifier=str(uuid5(NAMESPACE_DNS, file_info.path)),
+                    connector_type=self.connector_type,
+                    source_identifiers=source_identifiers,
+                    additional_metadata={
+                        "catalog": self.index_config.catalog,
+                        "path": file_info.path,
+                    },
+                    metadata=FileDataSourceMetadata(
+                        url=file_info.path, date_modified=str(file_info.modification_time)
+                    ),
+                    display_name=source_identifiers.fullpath,
+                )
+        except Exception as e:
+            raise self.connection_config.wrap_error(e=e)
+
+
+class DatabricksVolumesDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksVolumesDownloader(Downloader, ABC):
+    download_config: DatabricksVolumesDownloaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            raise self.connection_config.wrap_error(e=e)
+
+    def get_download_path(self, file_data: FileData) -> Path:
+        return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        volumes_path = file_data.additional_metadata["path"]
+        logger.info(f"Writing {file_data.identifier} to {download_path}")
+        try:
+            with self.connection_config.get_client().dbfs.download(path=volumes_path) as c:
+                read_content = c._read_handle.read()
+        except Exception as e:
+            raise self.connection_config.wrap_error(e=e)
+        with open(download_path, "wb") as f:
+            f.write(read_content)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
+    pass
+
+
+@dataclass
+class DatabricksVolumesUploader(Uploader, ABC):
+    upload_config: DatabricksVolumesUploaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def get_output_path(self, file_data: FileData) -> str:
+        if file_data.source_identifiers.relative_path:
+            return os.path.join(
+                self.upload_config.path,
+                f"{file_data.source_identifiers.relative_path.lstrip('/')}.json",
+            )
+        else:
+            return os.path.join(
+                self.upload_config.path, f"{file_data.source_identifiers.filename}.json"
+            )
+
+    def precheck(self) -> None:
+        try:
+            assert self.connection_config.get_client().current_user.me().active
+        except Exception as e:
+            raise self.connection_config.wrap_error(e=e)
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        output_path = self.get_output_path(file_data=file_data)
+        with open(path, "rb") as elements_file:
+            try:
+                # Read file bytes and wrap in BytesIO to create BinaryIO object
+                file_bytes = elements_file.read()
+                binary_data = io.BytesIO(file_bytes)
+                self.connection_config.get_client().files.upload(
+                    file_path=output_path,
+                    content=binary_data,  # Changed from 'contents' to 'content' in SDK 0.70.0+
+                    overwrite=True,
+                )
+            except Exception as e:
+                raise self.connection_config.wrap_error(e=e)
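
A quick sketch of the path layout produced by the DatabricksPathMixin.path property above, assuming UploaderConfig adds no required fields of its own; the catalog, schema, and volume values are made up:

from unstructured_ingest.processes.connectors.databricks.volumes import (
    DatabricksVolumesUploaderConfig,
)

# "schema" populates databricks_schema through the field alias declared above.
cfg = DatabricksVolumesUploaderConfig(
    catalog="main",
    schema="default",
    volume="ingest",
    volume_path="output/run-1",
)
assert cfg.path == "/Volumes/main/default/ingest/output/run-1"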