unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/test_postgres.py +100 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/test_sqlite.py +91 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +198 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/utils/dep_check.py +12 -0
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +9 -11
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +125 -32
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +9 -1
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +12 -10
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +86 -32
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
unstructured_ingest/embed/voyageai.py
@@ -1,7 +1,6 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional

-import numpy as np
 from pydantic import Field, SecretStr

 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -13,7 +12,7 @@ if TYPE_CHECKING:

 class VoyageAIEmbeddingConfig(EmbeddingConfig):
     api_key: SecretStr
-    embedder_model_name: str = Field(alias="model_name")
+    embedder_model_name: str = Field(default="voyage-3", alias="model_name")
     batch_size: Optional[int] = Field(default=None)
     truncation: Optional[bool] = Field(default=None)
     max_retries: int = 0
@@ -39,19 +38,6 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
 class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VoyageAIEmbeddingConfig

-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query(query="A sample query.")
-
-    @property
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-
-    @property
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-
     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client: VoyageAIClient = self.config.get_client()
         response = client.embed(texts=elements, model=self.config.embedder_model_name)
@@ -63,12 +49,3 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):

     def embed_query(self, query: str) -> list[float]:
         return self._embed_documents(elements=[query])[0]
-
-    @staticmethod
-    def _add_embeddings_to_elements(elements, embeddings) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
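The VoyageAI change above drops the numpy-based introspection helpers and gives the model name a default. A minimal illustrative sketch of what the new default means for callers (assumes the base EmbeddingConfig has no other required fields; the key is fake):

from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig

# With 0.1.0, omitting model_name falls back to the new default.
config = VoyageAIEmbeddingConfig(api_key="fake-key")
assert config.embedder_model_name == "voyage-3"

# The "model_name" alias still works when an explicit model is wanted.
config = VoyageAIEmbeddingConfig(api_key="fake-key", model_name="voyage-large-2")
assert config.embedder_model_name == "voyage-large-2"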
unstructured_ingest/interfaces.py
@@ -100,7 +100,7 @@ class PartitionConfig(BaseConfig):
     flatten_metadata: bool = False
     metadata_exclude: list[str] = field(default_factory=list)
     metadata_include: list[str] = field(default_factory=list)
-    partition_endpoint: Optional[str] = "https://api.
+    partition_endpoint: Optional[str] = "https://api.unstructuredapp.io/general/v0/general"
     partition_by_api: bool = False
     api_key: Optional[str] = str(enhanced_field(default=None, sensitive=True)) or None
     hi_res_model_name: Optional[str] = None
unstructured_ingest/utils/dep_check.py
@@ -20,6 +20,18 @@ def requires_dependencies(
     dependencies: str | list[str],
     extras: Optional[str] = None,
 ) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
+    """Decorator ensuring required modules are installed.
+
+    Use on functions with local imports to ensure required modules are available and log
+    an installation instruction if they're not.
+
+    Args:
+        dependencies: Name(s) of module(s) required by the decorated function.
+        extras: unstructured-ingest extra which installs required `dependencies`. Defaults to None.
+
+    Raises:
+        ImportError: When at least one of the `dependencies` is not available.
+    """
     if isinstance(dependencies, str):
         dependencies = [dependencies]

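The newly documented decorator is used throughout the connectors in this release; a short usage sketch mirroring the Databricks client helper further down in this diff:

from unstructured_ingest.utils.dep_check import requires_dependencies


# Illustrative helper: raises ImportError with an install hint for the
# "databricks-volumes" extra if databricks.sdk is missing.
@requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
def make_client(host: str, token: str):
    from databricks.sdk import WorkspaceClient  # local import guarded by the decorator

    return WorkspaceClient(host=host, token=token)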
unstructured_ingest/v2/cli/utils/click.py
@@ -3,7 +3,7 @@ import os.path
 from gettext import gettext, ngettext
 from gettext import gettext as _
 from pathlib import Path
-from typing import Any, Optional, Type, TypeVar
+from typing import Any, Optional, Type, TypeVar, Union

 import click
 from pydantic import BaseModel, ConfigDict, Secret
@@ -112,6 +112,20 @@ class DelimitedString(click.ParamType):
 BaseModelT = TypeVar("BaseModelT", bound=BaseModel)


+def unwrap_optional(val: Any) -> tuple[Any, bool]:
+    if (
+        hasattr(val, "__origin__")
+        and hasattr(val, "__args__")
+        and val.__origin__ is Union
+        and len(val.__args__) == 2
+        and type(None) in val.__args__
+    ):
+        args = val.__args__
+        args = [a for a in args if a is not None]
+        return args[0], True
+    return val, False
+
+
 def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
     fields = config.model_fields
     config.model_config = ConfigDict(extra="ignore")
@@ -119,6 +133,7 @@ def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
     data = {k: v for k, v in flat_data.items() if k in field_names and v is not None}
     if access_config := fields.get("access_config"):
         access_config_type = access_config.annotation
+        access_config_type, is_optional = unwrap_optional(access_config_type)
         # Check if raw type is wrapped by a secret
         if (
             hasattr(access_config_type, "__origin__")
@@ -132,9 +147,13 @@ def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
         else:
             raise TypeError(f"Unrecognized access_config type: {access_config_type}")
         ac_field_names = [v.alias or k for k, v in ac_fields.items()]
-        data["access_config"] = {
+        access_config_data = {
            k: v for k, v in flat_data.items() if k in ac_field_names and v is not None
        }
+        if not access_config_data and is_optional:
+            data["access_config"] = None
+        else:
+            data["access_config"] = access_config_data
     return config.model_validate(obj=data)


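A quick sketch of what the new unwrap_optional helper reports for annotations extract_config may encounter (Dummy is a stand-in class, not from the package):

from typing import Optional

from unstructured_ingest.v2.cli.utils.click import unwrap_optional


class Dummy:  # stand-in for an AccessConfig subclass
    pass


# Optional[...] annotations are unwrapped and flagged as optional ...
assert unwrap_optional(Optional[Dummy]) == (Dummy, True)

# ... while plain annotations are returned unchanged.
assert unwrap_optional(Dummy) == (Dummy, False)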
unstructured_ingest/v2/interfaces/connector.py
@@ -1,8 +1,9 @@
 from abc import ABC
 from dataclasses import dataclass
-from typing import Any, TypeVar
+from typing import Any, TypeVar, Union

-from pydantic import BaseModel, Secret
+from pydantic import BaseModel, Secret, model_validator
+from pydantic.types import _SecretBase


 class AccessConfig(BaseModel):
@@ -21,6 +22,25 @@ class ConnectionConfig(BaseModel):
             return {}
         return self.access_config.get_secret_value().model_dump()

+    @model_validator(mode="after")
+    def check_access_config(self):
+        access_config = self.access_config
+        if self._is_access_config_optional() and access_config is None:
+            return self
+        if not isinstance(access_config, _SecretBase):
+            raise ValueError("access_config must be an instance of SecretBase")
+        return self
+
+    def _is_access_config_optional(self) -> bool:
+        access_config_type = self.model_fields["access_config"].annotation
+        return (
+            hasattr(access_config_type, "__origin__")
+            and hasattr(access_config_type, "__args__")
+            and access_config_type.__origin__ is Union
+            and len(access_config_type.__args__) == 2
+            and type(None) in access_config_type.__args__
+        )
+

 ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)

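A minimal sketch of what the new ConnectionConfig validator enforces, using hypothetical subclasses (the field layout and the assumption that ConnectionConfig has no other required fields are illustrative):

from typing import Optional

from pydantic import Secret, ValidationError

from unstructured_ingest.v2.interfaces.connector import AccessConfig, ConnectionConfig


class MyAccessConfig(AccessConfig):  # hypothetical
    token: Optional[str] = None


class MyConnectionConfig(ConnectionConfig):  # hypothetical, Secret-wrapped access config
    access_config: Secret[MyAccessConfig]


class MyOptionalConnectionConfig(ConnectionConfig):  # hypothetical, optional access config
    access_config: Optional[Secret[MyAccessConfig]] = None


class BadConnectionConfig(ConnectionConfig):  # hypothetical, not Secret-wrapped
    access_config: MyAccessConfig


# Secret-wrapped (or coercible) access configs validate as before.
MyConnectionConfig(access_config=MyAccessConfig(token="abc"))

# With an Optional annotation, omitting access_config entirely is now allowed.
MyOptionalConnectionConfig()

# Anything that does not end up Secret-wrapped is rejected by check_access_config.
try:
    BadConnectionConfig(access_config=MyAccessConfig(token="abc"))
except ValidationError:
    pass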
unstructured_ingest/v2/interfaces/downloader.py
@@ -62,6 +62,7 @@ class Downloader(BaseProcess, BaseConnector, ABC):
             date_modified = float(file_data.metadata.date_modified)
             date_created = float(file_data.metadata.date_created)
             os.utime(download_path, times=(date_created, date_modified))
+        file_data.local_download_path = str(download_path.resolve())
         return DownloadResponse(file_data=file_data, path=download_path)

     @property
unstructured_ingest/v2/processes/chunker.py
@@ -20,7 +20,7 @@ class ChunkerConfig(BaseModel):
         default=None, description="The rule-set to use to form chunks. Omit to disable chunking."
     )
     chunking_endpoint: Optional[str] = Field(
-        default="https://api.
+        default="https://api.unstructuredapp.io/general/v0/general",
         description="If chunking via api, use the following host.",
     )
     chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -1,6 +1,8 @@
 from __future__ import annotations

+import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
     add_source_entry,
@@ -16,8 +18,6 @@ from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
-from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
-from .databricks_volumes import databricks_volumes_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -29,11 +29,13 @@ from .local import local_destination_entry, local_source_entry
 from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
 from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
-from .mongodb import mongodb_destination_entry
+from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
 from .onedrive import onedrive_source_entry
 from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
 from .opensearch import opensearch_destination_entry, opensearch_source_entry
+from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
+from .outlook import outlook_source_entry
 from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
 from .pinecone import pinecone_destination_entry
 from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
@@ -42,8 +44,6 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
 from .sharepoint import sharepoint_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
 from .singlestore import singlestore_destination_entry
-from .sql import CONNECTOR_TYPE as SQL_CONNECTOR_TYPE
-from .sql import sql_destination_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry

@@ -75,13 +75,9 @@ add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_

 add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)

-add_destination_entry(
-    destination_type=DATABRICKS_VOLUMES_CONNECTOR_TYPE, entry=databricks_volumes_destination_entry
-)
-
-add_destination_entry(destination_type=SQL_CONNECTOR_TYPE, entry=sql_destination_entry)
-
 add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry)
+add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)
+
 add_destination_entry(destination_type=PINECONE_CONNECTOR_TYPE, entry=pinecone_destination_entry)
 add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)
 add_destination_entry(
@@ -95,3 +91,5 @@ add_destination_entry(

 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
 add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
+
+add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
unstructured_ingest/v2/processes/connectors/databricks/__init__.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+    add_source_entry,
+)
+
+from .volumes_aws import CONNECTOR_TYPE as VOLUMES_AWS_CONNECTOR_TYPE
+from .volumes_aws import (
+    databricks_aws_volumes_destination_entry,
+    databricks_aws_volumes_source_entry,
+)
+from .volumes_azure import CONNECTOR_TYPE as VOLUMES_AZURE_CONNECTOR_TYPE
+from .volumes_azure import (
+    databricks_azure_volumes_destination_entry,
+    databricks_azure_volumes_source_entry,
+)
+from .volumes_gcp import CONNECTOR_TYPE as VOLUMES_GCP_CONNECTOR_TYPE
+from .volumes_gcp import (
+    databricks_gcp_volumes_destination_entry,
+    databricks_gcp_volumes_source_entry,
+)
+from .volumes_native import CONNECTOR_TYPE as VOLUMES_NATIVE_CONNECTOR_TYPE
+from .volumes_native import (
+    databricks_native_volumes_destination_entry,
+    databricks_native_volumes_source_entry,
+)
+
+add_source_entry(source_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_source_entry)
+add_destination_entry(
+    destination_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_destination_entry
+)
+
+add_source_entry(source_type=VOLUMES_GCP_CONNECTOR_TYPE, entry=databricks_gcp_volumes_source_entry)
+add_destination_entry(
+    destination_type=VOLUMES_GCP_CONNECTOR_TYPE, entry=databricks_gcp_volumes_destination_entry
+)
+
+add_source_entry(
+    source_type=VOLUMES_NATIVE_CONNECTOR_TYPE, entry=databricks_native_volumes_source_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_NATIVE_CONNECTOR_TYPE,
+    entry=databricks_native_volumes_destination_entry,
+)
+
+add_source_entry(
+    source_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_source_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_destination_entry
+)
unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -0,0 +1,175 @@
+import os
+from abc import ABC
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+from pydantic import BaseModel, Field
+
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    Uploader,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+
+if TYPE_CHECKING:
+    from databricks.sdk import WorkspaceClient
+
+
+class DatabricksPathMixin(BaseModel):
+    volume: str = Field(description="Name of volume in the Unity Catalog")
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    volume_path: Optional[str] = Field(
+        default=None, description="Optional path within the volume to write to"
+    )
+    databricks_schema: str = Field(
+        default="default",
+        alias="schema",
+        description="Schema associated with the volume to write to in the Unity Catalog service",
+    )
+
+    @property
+    def path(self) -> str:
+        path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
+        if self.volume_path:
+            path = f"{path}/{self.volume_path}"
+        return path
+
+
+class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
+    host: Optional[str] = Field(
+        default=None,
+        description="The Databricks host URL for either the "
+        "Databricks workspace endpoint or the "
+        "Databricks accounts endpoint.",
+    )
+
+    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
+    def get_client(self) -> "WorkspaceClient":
+        from databricks.sdk import WorkspaceClient
+
+        return WorkspaceClient(
+            host=self.host,
+            **self.access_config.get_secret_value().model_dump(),
+        )
+
+
+class DatabricksVolumesIndexerConfig(IndexerConfig, DatabricksPathMixin):
+    recursive: bool = False
+
+
+@dataclass
+class DatabricksVolumesIndexer(Indexer, ABC):
+    index_config: DatabricksVolumesIndexerConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        for file_info in self.connection_config.get_client().dbfs.list(
+            path=self.index_config.path, recursive=self.index_config.recursive
+        ):
+            if file_info.is_dir:
+                continue
+            rel_path = file_info.path.replace(self.index_config.path, "")
+            if rel_path.startswith("/"):
+                rel_path = rel_path[1:]
+            filename = Path(file_info.path).name
+            yield FileData(
+                identifier=str(uuid5(NAMESPACE_DNS, file_info.path)),
+                connector_type=self.connector_type,
+                source_identifiers=SourceIdentifiers(
+                    filename=filename,
+                    rel_path=rel_path,
+                    fullpath=file_info.path,
+                ),
+                additional_metadata={"catalog": self.index_config.catalog, "path": file_info.path},
+                metadata=FileDataSourceMetadata(
+                    url=file_info.path, date_modified=str(file_info.modification_time)
+                ),
+            )
+
+
+class DatabricksVolumesDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksVolumesDownloader(Downloader, ABC):
+    download_config: DatabricksVolumesDownloaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def get_download_path(self, file_data: FileData) -> Path:
+        return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        volumes_path = file_data.additional_metadata["path"]
+        logger.info(f"Writing {file_data.identifier} to {download_path}")
+        try:
+            with self.connection_config.get_client().dbfs.download(path=volumes_path) as c:
+                read_content = c._read_handle.read()
+            with open(download_path, "wb") as f:
+                f.write(read_content)
+        except Exception as e:
+            logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
+    overwrite: bool = Field(
+        default=False, description="If true, an existing file will be overwritten."
+    )
+
+
+@dataclass
+class DatabricksVolumesUploader(Uploader, ABC):
+    upload_config: DatabricksVolumesUploaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            assert self.connection_config.get_client().current_user.me().active
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        output_path = os.path.join(self.upload_config.path, path.name)
+        with open(path, "rb") as elements_file:
+            self.connection_config.get_client().files.upload(
+                file_path=output_path,
+                contents=elements_file,
+                overwrite=self.upload_config.overwrite,
+            )
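The Unity Catalog path handling is centralized in DatabricksPathMixin above; an illustrative check of the path it builds (catalog, schema, and volume names are made up):

from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin

# The "schema" alias populates databricks_schema; volume_path is optional.
location = DatabricksPathMixin(catalog="main", schema="default", volume="raw_docs", volume_path="ingest")
assert location.path == "/Volumes/main/default/raw_docs/ingest"

# Without volume_path the path stops at the volume root, with schema falling back to "default".
assert DatabricksPathMixin(catalog="main", volume="raw_docs").path == "/Volumes/main/default/raw_docs"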
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py
@@ -0,0 +1,87 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesConnectionConfig,
+    DatabricksVolumesDownloader,
+    DatabricksVolumesDownloaderConfig,
+    DatabricksVolumesIndexer,
+    DatabricksVolumesIndexerConfig,
+    DatabricksVolumesUploader,
+    DatabricksVolumesUploaderConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volumes_aws"
+
+
+class DatabricksAWSVolumesAccessConfig(AccessConfig):
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks " "accounts endpoint",
+    )
+    profile: Optional[str] = None
+    token: Optional[str] = Field(
+        default=None,
+        description="The Databricks personal access token (PAT)",
+    )
+
+
+class DatabricksAWSVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+    access_config: Secret[DatabricksAWSVolumesAccessConfig]
+
+
+class DatabricksAWSVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+    pass
+
+
+@dataclass
+class DatabricksAWSVolumesIndexer(DatabricksVolumesIndexer):
+    connection_config: DatabricksAWSVolumesConnectionConfig
+    index_config: DatabricksAWSVolumesIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAWSVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAWSVolumesDownloader(DatabricksVolumesDownloader):
+    connection_config: DatabricksAWSVolumesConnectionConfig
+    download_config: DatabricksVolumesDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAWSVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAWSVolumesUploader(DatabricksVolumesUploader):
+    connection_config: DatabricksAWSVolumesConnectionConfig
+    upload_config: DatabricksAWSVolumesUploaderConfig = field(
+        default_factory=DatabricksAWSVolumesUploaderConfig
+    )
+    connector_type: str = CONNECTOR_TYPE
+
+
+databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksAWSVolumesConnectionConfig,
+    uploader=DatabricksAWSVolumesUploader,
+    uploader_config=DatabricksAWSVolumesUploaderConfig,
+)
+
+databricks_aws_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksAWSVolumesConnectionConfig,
+    indexer=DatabricksAWSVolumesIndexer,
+    indexer_config=DatabricksAWSVolumesIndexerConfig,
+    downloader=DatabricksAWSVolumesDownloader,
+    downloader_config=DatabricksAWSVolumesDownloaderConfig,
+)
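A minimal sketch of wiring up the AWS-flavored connection config (host and token are fake; this assumes pydantic coerces the plain access config into the Secret wrapper, which the new ConnectionConfig validator then accepts):

from unstructured_ingest.v2.processes.connectors.databricks.volumes_aws import (
    DatabricksAWSVolumesAccessConfig,
    DatabricksAWSVolumesConnectionConfig,
)

connection_config = DatabricksAWSVolumesConnectionConfig(
    host="https://dbc-example.cloud.databricks.com",  # fake workspace URL
    access_config=DatabricksAWSVolumesAccessConfig(token="dapi-fake-token"),
)

# The secret is only unwrapped where it is needed, e.g. when building the SDK client.
assert connection_config.access_config.get_secret_value().token == "dapi-fake-token"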
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py
@@ -0,0 +1,102 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesConnectionConfig,
+    DatabricksVolumesDownloader,
+    DatabricksVolumesDownloaderConfig,
+    DatabricksVolumesIndexer,
+    DatabricksVolumesIndexerConfig,
+    DatabricksVolumesUploader,
+    DatabricksVolumesUploaderConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volumes_azure"
+
+
+class DatabricksAzureVolumesAccessConfig(AccessConfig):
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks " "accounts endpoint.",
+    )
+    profile: Optional[str] = None
+    azure_workspace_resource_id: Optional[str] = Field(
+        default=None,
+        description="The Azure Resource Manager ID for the Azure Databricks workspace, "
+        "which is exchanged for a Databricks host URL.",
+    )
+    azure_client_secret: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s client secret."
+    )
+    azure_client_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s application ID."
+    )
+    azure_tenant_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s tenant ID."
+    )
+    azure_environment: Optional[str] = Field(
+        default=None,
+        description="The Azure environment type for a " "specific set of API endpoints",
+        examples=["Public", "UsGov", "China", "Germany"],
+    )
+
+
+class DatabricksAzureVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+    access_config: Secret[DatabricksAzureVolumesAccessConfig]
+
+
+class DatabricksAzureVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+    pass
+
+
+@dataclass
+class DatabricksAzureVolumesIndexer(DatabricksVolumesIndexer):
+    connection_config: DatabricksAzureVolumesConnectionConfig
+    index_config: DatabricksAzureVolumesIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAzureVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAzureVolumesDownloader(DatabricksVolumesDownloader):
+    connection_config: DatabricksAzureVolumesConnectionConfig
+    download_config: DatabricksVolumesDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAzureVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAzureVolumesUploader(DatabricksVolumesUploader):
+    connection_config: DatabricksAzureVolumesConnectionConfig
+    upload_config: DatabricksAzureVolumesUploaderConfig = field(
+        default_factory=DatabricksAzureVolumesUploaderConfig
+    )
+    connector_type: str = CONNECTOR_TYPE
+
+
+databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksAzureVolumesConnectionConfig,
+    uploader=DatabricksAzureVolumesUploader,
+    uploader_config=DatabricksAzureVolumesUploaderConfig,
+)
+
+databricks_azure_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksAzureVolumesConnectionConfig,
+    indexer=DatabricksAzureVolumesIndexer,
+    indexer_config=DatabricksAzureVolumesIndexerConfig,
+    downloader=DatabricksAzureVolumesDownloader,
+    downloader_config=DatabricksAzureVolumesDownloaderConfig,
+)