unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff compares publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_postgres.py +178 -0
- test/integration/connectors/sql/test_sqlite.py +151 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +203 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
- unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +177 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +310 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +172 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/METADATA +16 -14
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/RECORD +85 -31
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.1.dist-info}/entry_points.txt +0 -0
unstructured_ingest/v2/processes/connectors/databricks/volumes.py
@@ -0,0 +1,175 @@
+import os
+from abc import ABC
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+from pydantic import BaseModel, Field
+
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    Uploader,
+    UploaderConfig,
+)
+from unstructured_ingest.v2.logger import logger
+
+if TYPE_CHECKING:
+    from databricks.sdk import WorkspaceClient
+
+
+class DatabricksPathMixin(BaseModel):
+    volume: str = Field(description="Name of volume in the Unity Catalog")
+    catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
+    volume_path: Optional[str] = Field(
+        default=None, description="Optional path within the volume to write to"
+    )
+    databricks_schema: str = Field(
+        default="default",
+        alias="schema",
+        description="Schema associated with the volume to write to in the Unity Catalog service",
+    )
+
+    @property
+    def path(self) -> str:
+        path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
+        if self.volume_path:
+            path = f"{path}/{self.volume_path}"
+        return path
+
+
+class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
+    host: Optional[str] = Field(
+        default=None,
+        description="The Databricks host URL for either the "
+        "Databricks workspace endpoint or the "
+        "Databricks accounts endpoint.",
+    )
+
+    @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
+    def get_client(self) -> "WorkspaceClient":
+        from databricks.sdk import WorkspaceClient
+
+        return WorkspaceClient(
+            host=self.host,
+            **self.access_config.get_secret_value().model_dump(),
+        )
+
+
+class DatabricksVolumesIndexerConfig(IndexerConfig, DatabricksPathMixin):
+    recursive: bool = False
+
+
+@dataclass
+class DatabricksVolumesIndexer(Indexer, ABC):
+    index_config: DatabricksVolumesIndexerConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        for file_info in self.connection_config.get_client().dbfs.list(
+            path=self.index_config.path, recursive=self.index_config.recursive
+        ):
+            if file_info.is_dir:
+                continue
+            rel_path = file_info.path.replace(self.index_config.path, "")
+            if rel_path.startswith("/"):
+                rel_path = rel_path[1:]
+            filename = Path(file_info.path).name
+            yield FileData(
+                identifier=str(uuid5(NAMESPACE_DNS, file_info.path)),
+                connector_type=self.connector_type,
+                source_identifiers=SourceIdentifiers(
+                    filename=filename,
+                    rel_path=rel_path,
+                    fullpath=file_info.path,
+                ),
+                additional_metadata={"catalog": self.index_config.catalog, "path": file_info.path},
+                metadata=FileDataSourceMetadata(
+                    url=file_info.path, date_modified=str(file_info.modification_time)
+                ),
+            )
+
+
+class DatabricksVolumesDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksVolumesDownloader(Downloader, ABC):
+    download_config: DatabricksVolumesDownloaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
+    def get_download_path(self, file_data: FileData) -> Path:
+        return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
+
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        volumes_path = file_data.additional_metadata["path"]
+        logger.info(f"Writing {file_data.identifier} to {download_path}")
+        try:
+            with self.connection_config.get_client().dbfs.download(path=volumes_path) as c:
+                read_content = c._read_handle.read()
+            with open(download_path, "wb") as f:
+                f.write(read_content)
+        except Exception as e:
+            logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
+            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+class DatabricksVolumesUploaderConfig(UploaderConfig, DatabricksPathMixin):
+    overwrite: bool = Field(
+        default=False, description="If true, an existing file will be overwritten."
+    )
+
+
+@dataclass
+class DatabricksVolumesUploader(Uploader, ABC):
+    upload_config: DatabricksVolumesUploaderConfig
+    connection_config: DatabricksVolumesConnectionConfig
+
+    def precheck(self) -> None:
+        try:
+            assert self.connection_config.get_client().current_user.me().active
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        output_path = os.path.join(self.upload_config.path, path.name)
+        with open(path, "rb") as elements_file:
+            self.connection_config.get_client().files.upload(
+                file_path=output_path,
+                contents=elements_file,
+                overwrite=self.upload_config.overwrite,
+            )
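
To make the path composition above concrete, here is a minimal standalone sketch (not part of the diff) that reproduces the DatabricksPathMixin behavior; it assumes pydantic v2 and uses made-up values:

from typing import Optional

from pydantic import BaseModel, Field


class DatabricksPathMixin(BaseModel):
    # Body mirrors volumes.py above, trimmed to the path-relevant fields.
    volume: str = Field(description="Name of volume in the Unity Catalog")
    catalog: str = Field(description="Name of the catalog")
    volume_path: Optional[str] = Field(default=None)
    databricks_schema: str = Field(default="default", alias="schema")

    @property
    def path(self) -> str:
        path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
        if self.volume_path:
            path = f"{path}/{self.volume_path}"
        return path


# databricks_schema is populated through its "schema" alias:
cfg = DatabricksPathMixin(volume="raw", catalog="main", schema="finance", volume_path="docs")
print(cfg.path)  # /Volumes/main/finance/raw/docs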
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py
@@ -0,0 +1,87 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesConnectionConfig,
+    DatabricksVolumesDownloader,
+    DatabricksVolumesDownloaderConfig,
+    DatabricksVolumesIndexer,
+    DatabricksVolumesIndexerConfig,
+    DatabricksVolumesUploader,
+    DatabricksVolumesUploaderConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volumes_aws"
+
+
+class DatabricksAWSVolumesAccessConfig(AccessConfig):
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks " "accounts endpoint",
+    )
+    profile: Optional[str] = None
+    token: Optional[str] = Field(
+        default=None,
+        description="The Databricks personal access token (PAT)",
+    )
+
+
+class DatabricksAWSVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+    access_config: Secret[DatabricksAWSVolumesAccessConfig]
+
+
+class DatabricksAWSVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+    pass
+
+
+@dataclass
+class DatabricksAWSVolumesIndexer(DatabricksVolumesIndexer):
+    connection_config: DatabricksAWSVolumesConnectionConfig
+    index_config: DatabricksAWSVolumesIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAWSVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAWSVolumesDownloader(DatabricksVolumesDownloader):
+    connection_config: DatabricksAWSVolumesConnectionConfig
+    download_config: DatabricksVolumesDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAWSVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAWSVolumesUploader(DatabricksVolumesUploader):
+    connection_config: DatabricksAWSVolumesConnectionConfig
+    upload_config: DatabricksAWSVolumesUploaderConfig = field(
+        default_factory=DatabricksAWSVolumesUploaderConfig
+    )
+    connector_type: str = CONNECTOR_TYPE
+
+
+databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksAWSVolumesConnectionConfig,
+    uploader=DatabricksAWSVolumesUploader,
+    uploader_config=DatabricksAWSVolumesUploaderConfig,
+)
+
+databricks_aws_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksAWSVolumesConnectionConfig,
+    indexer=DatabricksAWSVolumesIndexer,
+    indexer_config=DatabricksAWSVolumesIndexerConfig,
+    downloader=DatabricksAWSVolumesDownloader,
+    downloader_config=DatabricksAWSVolumesDownloaderConfig,
+)
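
The per-cloud modules only define registry entries; the new databricks/__init__.py (+52 lines, not shown in this view) presumably wires them into the connector registry. A sketch of that registration pattern, assuming the add_source_entry/add_destination_entry helpers exposed by connector_registry:

from unstructured_ingest.v2.processes.connector_registry import (
    add_destination_entry,
    add_source_entry,
)
from unstructured_ingest.v2.processes.connectors.databricks.volumes_aws import (
    CONNECTOR_TYPE,
    databricks_aws_volumes_destination_entry,
    databricks_aws_volumes_source_entry,
)

# Registration makes the connector resolvable by its type string
# ("databricks_volumes_aws") in pipelines and the CLI.
add_source_entry(source_type=CONNECTOR_TYPE, entry=databricks_aws_volumes_source_entry)
add_destination_entry(destination_type=CONNECTOR_TYPE, entry=databricks_aws_volumes_destination_entry)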
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py
@@ -0,0 +1,102 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesConnectionConfig,
+    DatabricksVolumesDownloader,
+    DatabricksVolumesDownloaderConfig,
+    DatabricksVolumesIndexer,
+    DatabricksVolumesIndexerConfig,
+    DatabricksVolumesUploader,
+    DatabricksVolumesUploaderConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volumes_azure"
+
+
+class DatabricksAzureVolumesAccessConfig(AccessConfig):
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks " "accounts endpoint.",
+    )
+    profile: Optional[str] = None
+    azure_workspace_resource_id: Optional[str] = Field(
+        default=None,
+        description="The Azure Resource Manager ID for the Azure Databricks workspace, "
+        "which is exchanged for a Databricks host URL.",
+    )
+    azure_client_secret: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s client secret."
+    )
+    azure_client_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s application ID."
+    )
+    azure_tenant_id: Optional[str] = Field(
+        default=None, description="The Azure AD service principal’s tenant ID."
+    )
+    azure_environment: Optional[str] = Field(
+        default=None,
+        description="The Azure environment type for a " "specific set of API endpoints",
+        examples=["Public", "UsGov", "China", "Germany"],
+    )
+
+
+class DatabricksAzureVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+    access_config: Secret[DatabricksAzureVolumesAccessConfig]
+
+
+class DatabricksAzureVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+    pass
+
+
+@dataclass
+class DatabricksAzureVolumesIndexer(DatabricksVolumesIndexer):
+    connection_config: DatabricksAzureVolumesConnectionConfig
+    index_config: DatabricksAzureVolumesIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAzureVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAzureVolumesDownloader(DatabricksVolumesDownloader):
+    connection_config: DatabricksAzureVolumesConnectionConfig
+    download_config: DatabricksVolumesDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksAzureVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksAzureVolumesUploader(DatabricksVolumesUploader):
+    connection_config: DatabricksAzureVolumesConnectionConfig
+    upload_config: DatabricksAzureVolumesUploaderConfig = field(
+        default_factory=DatabricksAzureVolumesUploaderConfig
+    )
+    connector_type: str = CONNECTOR_TYPE
+
+
+databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksAzureVolumesConnectionConfig,
+    uploader=DatabricksAzureVolumesUploader,
+    uploader_config=DatabricksAzureVolumesUploaderConfig,
+)
+
+databricks_azure_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksAzureVolumesConnectionConfig,
+    indexer=DatabricksAzureVolumesIndexer,
+    indexer_config=DatabricksAzureVolumesIndexerConfig,
+    downloader=DatabricksAzureVolumesDownloader,
+    downloader_config=DatabricksAzureVolumesDownloaderConfig,
+)
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py
@@ -0,0 +1,85 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesConnectionConfig,
+    DatabricksVolumesDownloader,
+    DatabricksVolumesDownloaderConfig,
+    DatabricksVolumesIndexer,
+    DatabricksVolumesIndexerConfig,
+    DatabricksVolumesUploader,
+    DatabricksVolumesUploaderConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volumes_gcp"
+
+
+class DatabricksGoogleVolumesAccessConfig(AccessConfig):
+    account_id: Optional[str] = Field(
+        default=None,
+        description="The Databricks account ID for the Databricks " "accounts endpoint.",
+    )
+    profile: Optional[str] = None
+    google_credentials: Optional[str] = None
+    google_service_account: Optional[str] = None
+
+
+class DatabricksGoogleVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+    access_config: Secret[DatabricksGoogleVolumesAccessConfig]
+
+
+class DatabricksGoogleVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+    pass
+
+
+@dataclass
+class DatabricksGoogleVolumesIndexer(DatabricksVolumesIndexer):
+    connection_config: DatabricksGoogleVolumesConnectionConfig
+    index_config: DatabricksGoogleVolumesIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksGoogleVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksGoogleVolumesDownloader(DatabricksVolumesDownloader):
+    connection_config: DatabricksGoogleVolumesConnectionConfig
+    download_config: DatabricksVolumesDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksGoogleVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksGoogleVolumesUploader(DatabricksVolumesUploader):
+    connection_config: DatabricksGoogleVolumesConnectionConfig
+    upload_config: DatabricksGoogleVolumesUploaderConfig = field(
+        default_factory=DatabricksGoogleVolumesUploaderConfig
+    )
+    connector_type: str = CONNECTOR_TYPE
+
+
+databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksGoogleVolumesConnectionConfig,
+    uploader=DatabricksGoogleVolumesUploader,
+    uploader_config=DatabricksGoogleVolumesUploaderConfig,
+)
+
+databricks_gcp_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksGoogleVolumesConnectionConfig,
+    indexer=DatabricksGoogleVolumesIndexer,
+    indexer_config=DatabricksGoogleVolumesIndexerConfig,
+    downloader=DatabricksGoogleVolumesDownloader,
+    downloader_config=DatabricksGoogleVolumesDownloaderConfig,
+)
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py
@@ -0,0 +1,86 @@
+from dataclasses import dataclass
+from typing import Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.v2.interfaces import AccessConfig
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
+from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesConnectionConfig,
+    DatabricksVolumesDownloader,
+    DatabricksVolumesDownloaderConfig,
+    DatabricksVolumesIndexer,
+    DatabricksVolumesIndexerConfig,
+    DatabricksVolumesUploader,
+    DatabricksVolumesUploaderConfig,
+)
+
+CONNECTOR_TYPE = "databricks_volumes"
+
+
+class DatabricksNativeVolumesAccessConfig(AccessConfig):
+    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+    client_secret: Optional[str] = Field(
+        default=None, description="Client Secret of the OAuth app."
+    )
+    profile: Optional[str] = None
+    azure_workspace_resource_id: Optional[str] = Field(
+        default=None,
+        description="The Azure Resource Manager ID for the Azure Databricks workspace, "
+        "which is exchanged for a Databricks host URL.",
+    )
+
+
+class DatabricksNativeVolumesConnectionConfig(DatabricksVolumesConnectionConfig):
+    access_config: Secret[DatabricksNativeVolumesAccessConfig]
+
+
+class DatabricksNativeVolumesIndexerConfig(DatabricksVolumesIndexerConfig):
+    pass
+
+
+@dataclass
+class DatabricksNativeVolumesIndexer(DatabricksVolumesIndexer):
+    connection_config: DatabricksNativeVolumesConnectionConfig
+    index_config: DatabricksNativeVolumesIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksNativeVolumesDownloaderConfig(DatabricksVolumesDownloaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksNativeVolumesDownloader(DatabricksVolumesDownloader):
+    connection_config: DatabricksNativeVolumesConnectionConfig
+    download_config: DatabricksVolumesDownloaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+class DatabricksNativeVolumesUploaderConfig(DatabricksVolumesUploaderConfig):
+    pass
+
+
+@dataclass
+class DatabricksNativeVolumesUploader(DatabricksVolumesUploader):
+    connection_config: DatabricksNativeVolumesConnectionConfig
+    upload_config: DatabricksNativeVolumesUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
+databricks_native_volumes_destination_entry = DestinationRegistryEntry(
+    connection_config=DatabricksNativeVolumesConnectionConfig,
+    uploader=DatabricksNativeVolumesUploader,
+    uploader_config=DatabricksNativeVolumesUploaderConfig,
+)
+
+databricks_native_volumes_source_entry = SourceRegistryEntry(
+    connection_config=DatabricksNativeVolumesConnectionConfig,
+    indexer=DatabricksNativeVolumesIndexer,
+    indexer_config=DatabricksNativeVolumesIndexerConfig,
+    downloader=DatabricksNativeVolumesDownloader,
+    downloader_config=DatabricksNativeVolumesDownloaderConfig,
+)
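
A hypothetical wiring sketch for the native connector (all values are placeholders; assumes unstructured-ingest 0.1.1 with the databricks-volumes extra installed):

from unstructured_ingest.v2.processes.connectors.databricks.volumes_native import (
    DatabricksNativeVolumesAccessConfig,
    DatabricksNativeVolumesConnectionConfig,
)

connection_config = DatabricksNativeVolumesConnectionConfig(
    host="https://example.cloud.databricks.com",  # placeholder workspace URL
    access_config=DatabricksNativeVolumesAccessConfig(
        client_id="client-id",          # placeholder OAuth app client ID
        client_secret="client-secret",  # placeholder OAuth app client secret
    ),
)
# get_client() lazily imports databricks.sdk and builds a WorkspaceClient
# from host plus the dumped access config (see volumes.py above).
client = connection_config.get_client()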
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import random
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
@@ -63,6 +64,7 @@ class FileConfig(BaseModel):

 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
+    sample_n_files: Optional[int] = None


 class FsspecAccessConfig(AccessConfig):
@@ -128,8 +130,23 @@ class FsspecIndexer(Indexer):
         filtered_files = [
             file for file in files if file.get("size") > 0 and file.get("type") == "file"
         ]
+
+        if self.index_config.sample_n_files:
+            filtered_files = self.sample_n_files(filtered_files, self.index_config.sample_n_files)
+
         return filtered_files

+    def sample_n_files(self, files: list[dict[str, Any]], n) -> list[dict[str, Any]]:
+        if len(files) <= n:
+            logger.warning(
+                f"number of files to be sampled={n} is not smaller than the number"
+                f" of files found ({len(files)}). Returning all of the files as the"
+                " sample."
+            )
+            return files
+
+        return random.sample(files, n)
+
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         raise NotImplementedError()

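
The sampling logic above leans on random.sample, which raises ValueError when asked for more items than exist; a standalone sketch with made-up file entries shows why the indexer short-circuits first:

import random

files = [{"name": f"doc_{i}.pdf", "size": 1, "type": "file"} for i in range(5)]

# random.sample picks n distinct entries without replacement, in random order.
print(random.sample(files, 2))

# random.sample(files, 10) would raise ValueError, which is why sample_n_files
# returns the full list (with a warning) whenever n >= len(files).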
unstructured_ingest/v2/processes/connectors/kdbai.py
@@ -26,7 +26,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )

 if TYPE_CHECKING:
-    from kdbai_client import Session, Table
+    from kdbai_client import Database, Session, Table

 CONNECTOR_TYPE = "kdbai"

@@ -99,6 +99,9 @@ class KdbaiUploadStager(UploadStager):


 class KdbaiUploaderConfig(UploaderConfig):
+    database_name: str = Field(
+        default="default", description="The name of the KDBAI database to write into."
+    )
     table_name: str = Field(description="The name of the KDBAI table to write into.")
     batch_size: int = Field(default=100, description="Number of records per batch")

@@ -111,24 +114,29 @@ class KdbaiUploader(Uploader):

     def precheck(self) -> None:
         try:
-            self.get_table()
+            self.get_database()
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def get_table(self) -> "Table":
+    def get_database(self) -> "Database":
         session: Session = self.connection_config.get_session()
-        table = session.table(self.upload_config.table_name)
+        db = session.database(self.upload_config.database_name)
+        return db
+
+    def get_table(self) -> "Table":
+        db = self.get_database()
+        table = db.table(self.upload_config.table_name)
         return table

     def upsert_batch(self, batch: pd.DataFrame):
         table = self.get_table()
-        table.insert(
+        table.insert(batch)

     def process_dataframe(self, df: pd.DataFrame):
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.endpoint} "
-            f"db in table {self.upload_config.table_name}"
+            f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
         )
         for _, batch_df in df.groupby(np.arange(len(df)) // self.upload_config.batch_size):
             self.upsert_batch(batch=batch_df)