unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/test_postgres.py +100 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/test_sqlite.py +91 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +198 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/utils/dep_check.py +12 -0
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +9 -11
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +125 -32
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +9 -1
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +12 -10
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +86 -32
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
unstructured_ingest/embed/voyageai.py
@@ -1,7 +1,6 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional

-import numpy as np
 from pydantic import Field, SecretStr

 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -13,7 +12,7 @@ if TYPE_CHECKING:

 class VoyageAIEmbeddingConfig(EmbeddingConfig):
     api_key: SecretStr
-    embedder_model_name: str = Field(alias="model_name")
+    embedder_model_name: str = Field(default="voyage-3", alias="model_name")
     batch_size: Optional[int] = Field(default=None)
     truncation: Optional[bool] = Field(default=None)
     max_retries: int = 0
@@ -39,19 +38,6 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
 class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VoyageAIEmbeddingConfig

-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query(query="A sample query.")
-
-    @property
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-
-    @property
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
-
     def _embed_documents(self, elements: list[str]) -> list[list[float]]:
         client: VoyageAIClient = self.config.get_client()
         response = client.embed(texts=elements, model=self.config.embedder_model_name)
@@ -63,12 +49,3 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):

     def embed_query(self, query: str) -> list[float]:
         return self._embed_documents(elements=[query])[0]
-
-    @staticmethod
-    def _add_embeddings_to_elements(elements, embeddings) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
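The VoyageAI change above drops the numpy-based introspection helpers and gives the model name a default. A minimal illustrative sketch of what the new default means for callers (assumes the base EmbeddingConfig has no other required fields; the key is fake):

from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig

# With 0.1.0, omitting model_name falls back to the new default.
config = VoyageAIEmbeddingConfig(api_key="fake-key")
assert config.embedder_model_name == "voyage-3"

# The "model_name" alias still works when an explicit model is wanted.
config = VoyageAIEmbeddingConfig(api_key="fake-key", model_name="voyage-large-2")
assert config.embedder_model_name == "voyage-large-2"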
unstructured_ingest/interfaces.py
@@ -100,7 +100,7 @@ class PartitionConfig(BaseConfig):
     flatten_metadata: bool = False
     metadata_exclude: list[str] = field(default_factory=list)
     metadata_include: list[str] = field(default_factory=list)
-    partition_endpoint: Optional[str] = "https://api.
+    partition_endpoint: Optional[str] = "https://api.unstructuredapp.io/general/v0/general"
     partition_by_api: bool = False
     api_key: Optional[str] = str(enhanced_field(default=None, sensitive=True)) or None
     hi_res_model_name: Optional[str] = None
unstructured_ingest/utils/dep_check.py
@@ -20,6 +20,18 @@ def requires_dependencies(
     dependencies: str | list[str],
     extras: Optional[str] = None,
 ) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
+    """Decorator ensuring required modules are installed.
+
+    Use on functions with local imports to ensure required modules are available and log
+    an installation instruction if they're not.
+
+    Args:
+        dependencies: Name(s) of module(s) required by the decorated function.
+        extras: unstructured-ingest extra which installs required `dependencies`. Defaults to None.
+
+    Raises:
+        ImportError: When at least one of the `dependencies` is not available.
+    """
     if isinstance(dependencies, str):
         dependencies = [dependencies]

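The newly documented decorator is used throughout the connectors in this release; a short usage sketch mirroring the Databricks client helper further down in this diff:

from unstructured_ingest.utils.dep_check import requires_dependencies


# Illustrative helper: raises ImportError with an install hint for the
# "databricks-volumes" extra if databricks.sdk is missing.
@requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
def make_client(host: str, token: str):
    from databricks.sdk import WorkspaceClient  # local import guarded by the decorator

    return WorkspaceClient(host=host, token=token)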
unstructured_ingest/v2/cli/utils/click.py
@@ -3,7 +3,7 @@ import os.path
 from gettext import gettext, ngettext
 from gettext import gettext as _
 from pathlib import Path
-from typing import Any, Optional, Type, TypeVar
+from typing import Any, Optional, Type, TypeVar, Union

 import click
 from pydantic import BaseModel, ConfigDict, Secret
@@ -112,6 +112,20 @@ class DelimitedString(click.ParamType):
 BaseModelT = TypeVar("BaseModelT", bound=BaseModel)


+def unwrap_optional(val: Any) -> tuple[Any, bool]:
+    if (
+        hasattr(val, "__origin__")
+        and hasattr(val, "__args__")
+        and val.__origin__ is Union
+        and len(val.__args__) == 2
+        and type(None) in val.__args__
+    ):
+        args = val.__args__
+        args = [a for a in args if a is not None]
+        return args[0], True
+    return val, False
+
+
 def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
     fields = config.model_fields
     config.model_config = ConfigDict(extra="ignore")
@@ -119,6 +133,7 @@ def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
     data = {k: v for k, v in flat_data.items() if k in field_names and v is not None}
     if access_config := fields.get("access_config"):
         access_config_type = access_config.annotation
+        access_config_type, is_optional = unwrap_optional(access_config_type)
         # Check if raw type is wrapped by a secret
         if (
             hasattr(access_config_type, "__origin__")
@@ -132,9 +147,13 @@ def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
         else:
             raise TypeError(f"Unrecognized access_config type: {access_config_type}")
         ac_field_names = [v.alias or k for k, v in ac_fields.items()]
-        data["access_config"] = {
+        access_config_data = {
            k: v for k, v in flat_data.items() if k in ac_field_names and v is not None
        }
+        if not access_config_data and is_optional:
+            data["access_config"] = None
+        else:
+            data["access_config"] = access_config_data
     return config.model_validate(obj=data)


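A quick sketch of what the new unwrap_optional helper reports for annotations extract_config may encounter (Dummy is a stand-in class, not from the package):

from typing import Optional

from unstructured_ingest.v2.cli.utils.click import unwrap_optional


class Dummy:  # stand-in for an AccessConfig subclass
    pass


# Optional[...] annotations are unwrapped and flagged as optional ...
assert unwrap_optional(Optional[Dummy]) == (Dummy, True)

# ... while plain annotations are returned unchanged.
assert unwrap_optional(Dummy) == (Dummy, False)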
unstructured_ingest/v2/interfaces/connector.py
@@ -1,8 +1,9 @@
 from abc import ABC
 from dataclasses import dataclass
-from typing import Any, TypeVar
+from typing import Any, TypeVar, Union

-from pydantic import BaseModel, Secret
+from pydantic import BaseModel, Secret, model_validator
+from pydantic.types import _SecretBase


 class AccessConfig(BaseModel):
@@ -21,6 +22,25 @@ class ConnectionConfig(BaseModel):
             return {}
         return self.access_config.get_secret_value().model_dump()

+    @model_validator(mode="after")
+    def check_access_config(self):
+        access_config = self.access_config
+        if self._is_access_config_optional() and access_config is None:
+            return self
+        if not isinstance(access_config, _SecretBase):
+            raise ValueError("access_config must be an instance of SecretBase")
+        return self
+
+    def _is_access_config_optional(self) -> bool:
+        access_config_type = self.model_fields["access_config"].annotation
+        return (
+            hasattr(access_config_type, "__origin__")
+            and hasattr(access_config_type, "__args__")
+            and access_config_type.__origin__ is Union
+            and len(access_config_type.__args__) == 2
+            and type(None) in access_config_type.__args__
+        )
+

 ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)

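A minimal sketch of what the new ConnectionConfig validator enforces, using hypothetical subclasses (the field layout and the assumption that ConnectionConfig has no other required fields are illustrative):

from typing import Optional

from pydantic import Secret, ValidationError

from unstructured_ingest.v2.interfaces.connector import AccessConfig, ConnectionConfig


class MyAccessConfig(AccessConfig):  # hypothetical
    token: Optional[str] = None


class MyConnectionConfig(ConnectionConfig):  # hypothetical, Secret-wrapped access config
    access_config: Secret[MyAccessConfig]


class MyOptionalConnectionConfig(ConnectionConfig):  # hypothetical, optional access config
    access_config: Optional[Secret[MyAccessConfig]] = None


class BadConnectionConfig(ConnectionConfig):  # hypothetical, not Secret-wrapped
    access_config: MyAccessConfig


# Secret-wrapped (or coercible) access configs validate as before.
MyConnectionConfig(access_config=MyAccessConfig(token="abc"))

# With an Optional annotation, omitting access_config entirely is now allowed.
MyOptionalConnectionConfig()

# Anything that does not end up Secret-wrapped is rejected by check_access_config.
try:
    BadConnectionConfig(access_config=MyAccessConfig(token="abc"))
except ValidationError:
    pass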
unstructured_ingest/v2/interfaces/downloader.py
@@ -62,6 +62,7 @@ class Downloader(BaseProcess, BaseConnector, ABC):
             date_modified = float(file_data.metadata.date_modified)
             date_created = float(file_data.metadata.date_created)
             os.utime(download_path, times=(date_created, date_modified))
+        file_data.local_download_path = str(download_path.resolve())
         return DownloadResponse(file_data=file_data, path=download_path)

     @property
unstructured_ingest/v2/processes/chunker.py
@@ -20,7 +20,7 @@ class ChunkerConfig(BaseModel):
         default=None, description="The rule-set to use to form chunks. Omit to disable chunking."
     )
     chunking_endpoint: Optional[str] = Field(
-        default="https://api.
+        default="https://api.unstructuredapp.io/general/v0/general",
         description="If chunking via api, use the following host.",
     )
     chunk_by_api: bool = Field(default=False, description="Flag to use api for chunking")
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -1,6 +1,8 @@
 from __future__ import annotations

+import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
     add_source_entry,
@@ -16,8 +18,6 @@ from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
-from .databricks_volumes import CONNECTOR_TYPE as DATABRICKS_VOLUMES_CONNECTOR_TYPE
-from .databricks_volumes import databricks_volumes_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -29,11 +29,13 @@ from .local import local_destination_entry, local_source_entry
 from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
 from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
-from .mongodb import mongodb_destination_entry
+from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
 from .onedrive import onedrive_source_entry
 from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
 from .opensearch import opensearch_destination_entry, opensearch_source_entry
+from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
+from .outlook import outlook_source_entry
 from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
 from .pinecone import pinecone_destination_entry
 from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE
@@ -42,8 +44,6 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
 from .sharepoint import sharepoint_source_entry
 from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE
 from .singlestore import singlestore_destination_entry
-from .sql import CONNECTOR_TYPE as SQL_CONNECTOR_TYPE
-from .sql import sql_destination_entry
 from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
 from .weaviate import weaviate_destination_entry

@@ -75,13 +75,9 @@ add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_

 add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)

-add_destination_entry(
-    destination_type=DATABRICKS_VOLUMES_CONNECTOR_TYPE, entry=databricks_volumes_destination_entry
-)
-
-add_destination_entry(destination_type=SQL_CONNECTOR_TYPE, entry=sql_destination_entry)
-
 add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry)
+add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)
+
 add_destination_entry(destination_type=PINECONE_CONNECTOR_TYPE, entry=pinecone_destination_entry)
 add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry)
 add_destination_entry(
@@ -95,3 +91,5 @@ add_destination_entry(

 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
 add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
+
+add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
unstructured_ingest/v2/processes/connectors/databricks/__init__.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
+    add_source_entry,
+)
+
+from .volumes_aws import CONNECTOR_TYPE as VOLUMES_AWS_CONNECTOR_TYPE
+from .volumes_aws import (
+    databricks_aws_volumes_destination_entry,
+    databricks_aws_volumes_source_entry,
+)
+from .volumes_azure import CONNECTOR_TYPE as VOLUMES_AZURE_CONNECTOR_TYPE
+from .volumes_azure import (
+    databricks_azure_volumes_destination_entry,
+    databricks_azure_volumes_source_entry,
+)
+from .volumes_gcp import CONNECTOR_TYPE as VOLUMES_GCP_CONNECTOR_TYPE
+from .volumes_gcp import (
+    databricks_gcp_volumes_destination_entry,
+    databricks_gcp_volumes_source_entry,
+)
+from .volumes_native import CONNECTOR_TYPE as VOLUMES_NATIVE_CONNECTOR_TYPE
+from .volumes_native import (
+    databricks_native_volumes_destination_entry,
+    databricks_native_volumes_source_entry,
+)
+
+add_source_entry(source_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_source_entry)
+add_destination_entry(
+    destination_type=VOLUMES_AWS_CONNECTOR_TYPE, entry=databricks_aws_volumes_destination_entry
+)
+
+add_source_entry(source_type=VOLUMES_GCP_CONNECTOR_TYPE, entry=databricks_gcp_volumes_source_entry)
+add_destination_entry(
+    destination_type=VOLUMES_GCP_CONNECTOR_TYPE, entry=databricks_gcp_volumes_destination_entry
+)
+
+add_source_entry(
+    source_type=VOLUMES_NATIVE_CONNECTOR_TYPE, entry=databricks_native_volumes_source_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_NATIVE_CONNECTOR_TYPE,
+    entry=databricks_native_volumes_destination_entry,
+)
+
+add_source_entry(
+    source_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_source_entry
+)
+add_destination_entry(
+    destination_type=VOLUMES_AZURE_CONNECTOR_TYPE, entry=databricks_azure_volumes_destination_entry
+)
unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -0,0 +1,175 @@
+import os
+from abc import ABC
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+from pydantic import BaseModel, Field
+
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    Uploader,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+
+if TYPE_CHECKING:
+    from databricks.sdk import WorkspaceClient
+
+
+class DatabricksPathMixin(BaseModel):
+    volume: str = Field(description="Name of volume in the Unity Catalog")
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    volume_path: Optional[str] = Field(
+        default=None, description="Optional path within the volume to write to"
+    )
+    databricks_schema: str = Field(
+        default="default",
+        alias="schema",
+        description="Schema associated with the volume to write to in the Unity Catalog service",
+    )
+
+    @property
+    def path(self) -> str:
+        path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
+        if self.volume_path:
+            path = f"{path}/{self.volume_path}"
+        return path
+
+
+class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
+    host: Optional[str] = Field(
+        default=None,
+        description="The Databricks host URL for either the "
+        "Databricks workspace endpoint or the "
+        "Databricks accounts endpoint.",
+    )
+
+    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
+    def get_client(self) -> "WorkspaceClient":
+        from databricks.sdk import WorkspaceClient
+
+        return WorkspaceClient(
+            host=self.host,
+            **self.access_config.get_secret_value().model_dump(),
+        )
+
+
+class DatabricksVolumesIndexerConfig(IndexerConfig, DatabricksPathMixin):
+    recursive: bool = False
+
+
+@dataclass
+class DatabricksVolumesIndexer(Indexer, ABC):
+    index_config: DatabricksVolumesIndexerConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        for file_info in self.connection_config.get_client().dbfs.list(
+            path=self.index_config.path, recursive=self.index_config.recursive
+        ):
+            if file_info.is_dir:
+                continue
+            rel_path = file_info.path.replace(self.index_config.path, "")
+            if rel_path.startswith("/"):
+                rel_path = rel_path[1:]
+            filename = Path(file_info.path).name
+            yield FileData(
+                identifier=str(uuid5(NAMESPACE_DNS, file_info.path)),
+                connector_type=self.connector_type,
+                source_identifiers=SourceIdentifiers(
+                    filename=filename,
+                    rel_path=rel_path,
+                    fullpath=file_info.path,
+                ),
+                additional_metadata={"catalog": self.index_config.catalog, "path": file_info.path},
+                metadata=FileDataSourceMetadata(
+                    url=file_info.path, date_modified=str(file_info.modification_time)
+                ),
+            )
+
+
+class DatabricksVolumesDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksVolumesDownloader(Downloader, ABC):
+    download_config: DatabricksVolumesDownloaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def get_download_path(self, file_data: FileData) -> Path:
+        return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        volumes_path = file_data.additional_metadata["path"]
+        logger.info(f"Writing {file_data.identifier} to {download_path}")
+        try:
+            with self.connection_config.get_client().dbfs.download(path=volumes_path) as c:
+                read_content = c._read_handle.read()
+            with open(download_path, "wb") as f:
+                f.write(read_content)
+        except Exception as e:
+            logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
+    overwrite: bool = Field(
+        default=False, description="If true, an existing file will be overwritten."
+    )
+
+
+@dataclass
+class DatabricksVolumesUploader(Uploader, ABC):
+    upload_config: DatabricksVolumesUploaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            assert self.connection_config.get_client().current_user.me().active
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        output_path = os.path.join(self.upload_config.path, path.name)
+        with open(path, "rb") as elements_file:
+            self.connection_config.get_client().files.upload(
+                file_path=output_path,
+                contents=elements_file,
+                overwrite=self.upload_config.overwrite,
+            )
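The Unity Catalog path handling is centralized in DatabricksPathMixin above; an illustrative check of the path it builds (catalog, schema, and volume names are made up):

from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin

# The "schema" alias populates databricks_schema; volume_path is optional.
location = DatabricksPathMixin(catalog="main", schema="default", volume="raw_docs", volume_path="ingest")
assert location.path == "/Volumes/main/default/raw_docs/ingest"

# Without volume_path the path stops at the volume root, with schema falling back to "default".
assert DatabricksPathMixin(catalog="main", volume="raw_docs").path == "/Volumes/main/default/raw_docs"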
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py
@@ -0,0 +1,87 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesConnectionConfig,
+    DatabricksVolumesDownloader,
+    DatabricksVolumesDownloaderConfig,
+    DatabricksVolumesIndexer,
+    DatabricksVolumesIndexerConfig,
+    DatabricksVolumesUploader,
+    DatabricksVolumesUploaderConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volumes_aws"
+
+
+class DatabricksAWSVolumesAccessConfig(AccessConfig):
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks " "accounts endpoint",
+    )
+    profile: Optional[str] = None
+    token: Optional[str] = Field(
+        default=None,
+        description="The Databricks personal access token (PAT)",
+    )
+
+
+class DatabricksAWSVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+    access_config: Secret[DatabricksAWSVolumesAccessConfig]
+
+
+class DatabricksAWSVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+    pass
+
+
+@dataclass
+class DatabricksAWSVolumesIndexer(DatabricksVolumesIndexer):
+    connection_config: DatabricksAWSVolumesConnectionConfig
+    index_config: DatabricksAWSVolumesIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAWSVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAWSVolumesDownloader(DatabricksVolumesDownloader):
+    connection_config: DatabricksAWSVolumesConnectionConfig
+    download_config: DatabricksVolumesDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAWSVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAWSVolumesUploader(DatabricksVolumesUploader):
+    connection_config: DatabricksAWSVolumesConnectionConfig
+    upload_config: DatabricksAWSVolumesUploaderConfig = field(
+        default_factory=DatabricksAWSVolumesUploaderConfig
+    )
+    connector_type: str = CONNECTOR_TYPE
+
+
+databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksAWSVolumesConnectionConfig,
+    uploader=DatabricksAWSVolumesUploader,
+    uploader_config=DatabricksAWSVolumesUploaderConfig,
+)
+
+databricks_aws_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksAWSVolumesConnectionConfig,
+    indexer=DatabricksAWSVolumesIndexer,
+    indexer_config=DatabricksAWSVolumesIndexerConfig,
+    downloader=DatabricksAWSVolumesDownloader,
+    downloader_config=DatabricksAWSVolumesDownloaderConfig,
+)
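A minimal sketch of wiring up the AWS-flavored connection config (host and token are fake; this assumes pydantic coerces the plain access config into the Secret wrapper, which the new ConnectionConfig validator then accepts):

from unstructured_ingest.v2.processes.connectors.databricks.volumes_aws import (
    DatabricksAWSVolumesAccessConfig,
    DatabricksAWSVolumesConnectionConfig,
)

connection_config = DatabricksAWSVolumesConnectionConfig(
    host="https://dbc-example.cloud.databricks.com",  # fake workspace URL
    access_config=DatabricksAWSVolumesAccessConfig(token="dapi-fake-token"),
)

# The secret is only unwrapped where it is needed, e.g. when building the SDK client.
assert connection_config.access_config.get_secret_value().token == "dapi-fake-token"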
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py
@@ -0,0 +1,102 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesConnectionConfig,
+    DatabricksVolumesDownloader,
+    DatabricksVolumesDownloaderConfig,
+    DatabricksVolumesIndexer,
+    DatabricksVolumesIndexerConfig,
+    DatabricksVolumesUploader,
+    DatabricksVolumesUploaderConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volumes_azure"
+
+
+class DatabricksAzureVolumesAccessConfig(AccessConfig):
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks " "accounts endpoint.",
+    )
+    profile: Optional[str] = None
+    azure_workspace_resource_id: Optional[str] = Field(
+        default=None,
+        description="The Azure Resource Manager ID for the Azure Databricks workspace, "
+        "which is exchanged for a Databricks host URL.",
+    )
+    azure_client_secret: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s client secret."
+    )
+    azure_client_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s application ID."
+    )
+    azure_tenant_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s tenant ID."
+    )
+    azure_environment: Optional[str] = Field(
+        default=None,
+        description="The Azure environment type for a " "specific set of API endpoints",
+        examples=["Public", "UsGov", "China", "Germany"],
+    )
+
+
+class DatabricksAzureVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+    access_config: Secret[DatabricksAzureVolumesAccessConfig]
+
+
+class DatabricksAzureVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+    pass
+
+
+@dataclass
+class DatabricksAzureVolumesIndexer(DatabricksVolumesIndexer):
+    connection_config: DatabricksAzureVolumesConnectionConfig
+    index_config: DatabricksAzureVolumesIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAzureVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAzureVolumesDownloader(DatabricksVolumesDownloader):
+    connection_config: DatabricksAzureVolumesConnectionConfig
+    download_config: DatabricksVolumesDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAzureVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAzureVolumesUploader(DatabricksVolumesUploader):
+    connection_config: DatabricksAzureVolumesConnectionConfig
+    upload_config: DatabricksAzureVolumesUploaderConfig = field(
+        default_factory=DatabricksAzureVolumesUploaderConfig
+    )
+    connector_type: str = CONNECTOR_TYPE
+
+
+databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksAzureVolumesConnectionConfig,
+    uploader=DatabricksAzureVolumesUploader,
+    uploader_config=DatabricksAzureVolumesUploaderConfig,
+)
+
+databricks_azure_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksAzureVolumesConnectionConfig,
+    indexer=DatabricksAzureVolumesIndexer,
+    indexer_config=DatabricksAzureVolumesIndexerConfig,
+    downloader=DatabricksAzureVolumesDownloader,
+    downloader_config=DatabricksAzureVolumesDownloaderConfig,
+)