unstructured-ingest 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff reflects the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (34)
  1. test/integration/connectors/conftest.py +13 -0
  2. test/integration/connectors/databricks_tests/test_volumes_native.py +8 -4
  3. test/integration/connectors/sql/__init__.py +0 -0
  4. test/integration/connectors/{test_postgres.py → sql/test_postgres.py} +76 -2
  5. test/integration/connectors/sql/test_snowflake.py +205 -0
  6. test/integration/connectors/{test_sqlite.py → sql/test_sqlite.py} +68 -12
  7. test/integration/connectors/test_delta_table.py +138 -0
  8. test/integration/connectors/utils/constants.py +1 -1
  9. test/integration/connectors/utils/docker.py +78 -0
  10. test/integration/connectors/utils/validation.py +100 -4
  11. unstructured_ingest/__version__.py +1 -1
  12. unstructured_ingest/v2/cli/utils/click.py +32 -1
  13. unstructured_ingest/v2/cli/utils/model_conversion.py +10 -3
  14. unstructured_ingest/v2/interfaces/indexer.py +4 -1
  15. unstructured_ingest/v2/pipeline/pipeline.py +10 -2
  16. unstructured_ingest/v2/pipeline/steps/index.py +18 -1
  17. unstructured_ingest/v2/processes/connectors/__init__.py +10 -0
  18. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +1 -1
  19. unstructured_ingest/v2/processes/connectors/delta_table.py +185 -0
  20. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +17 -0
  21. unstructured_ingest/v2/processes/connectors/kdbai.py +14 -6
  22. unstructured_ingest/v2/processes/connectors/slack.py +248 -0
  23. unstructured_ingest/v2/processes/connectors/sql/__init__.py +10 -2
  24. unstructured_ingest/v2/processes/connectors/sql/postgres.py +77 -25
  25. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +164 -0
  26. unstructured_ingest/v2/processes/connectors/sql/sql.py +163 -6
  27. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +86 -24
  28. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/METADATA +16 -14
  29. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/RECORD +33 -27
  30. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -250
  31. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/LICENSE.md +0 -0
  32. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/WHEEL +0 -0
  33. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/entry_points.txt +0 -0
  34. {unstructured_ingest-0.1.0.dist-info → unstructured_ingest-0.2.0.dist-info}/top_level.txt +0 -0
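
The new connector modules added in this release (delta_table.py, slack.py, and the Snowflake SQL connector) are wired up through unstructured_ingest/v2/processes/connectors/__init__.py (+10 lines) and sql/__init__.py. The registration pattern they presumably follow is the same one visible in the removed databricks_volumes module shown in full at the end of this diff: a connector module defines pydantic config classes plus indexer/downloader/uploader classes and exposes registry entry objects. Below is a minimal, illustrative sketch of that pattern; every "Example" name is a placeholder, not a class that actually appears in the new modules.

# Illustrative sketch only, modeled on the removed databricks_volumes module
# shown at the end of this diff. All Example* names are placeholders.
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from pydantic import Field, Secret

from unstructured_ingest.v2.interfaces import (
    AccessConfig,
    ConnectionConfig,
    FileData,
    Uploader,
    UploaderConfig,
)
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry

CONNECTOR_TYPE = "example"


class ExampleAccessConfig(AccessConfig):
    api_key: str = Field(description="Placeholder credential field")


class ExampleConnectionConfig(ConnectionConfig):
    access_config: Secret[ExampleAccessConfig]
    endpoint: str = Field(description="Placeholder endpoint field")


class ExampleUploaderConfig(UploaderConfig):
    overwrite: bool = Field(default=False)


@dataclass
class ExampleUploader(Uploader):
    upload_config: ExampleUploaderConfig
    connection_config: ExampleConnectionConfig
    connector_type: str = CONNECTOR_TYPE

    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
        # A real connector would write the processed elements at `path`
        # to the destination here.
        ...


# Each connector module exposes registry entry objects like this one; the
# package's connectors/__init__.py then registers them.
example_destination_entry = DestinationRegistryEntry(
    connection_config=ExampleConnectionConfig,
    uploader=ExampleUploader,
    uploader_config=ExampleUploaderConfig,
)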
unstructured_ingest/v2/processes/connectors/databricks_volumes.py
@@ -1,250 +0,0 @@
- import os
- from dataclasses import dataclass
- from pathlib import Path
- from typing import TYPE_CHECKING, Any, Generator, Optional
-
- from pydantic import Field, Secret
-
- from unstructured_ingest.error import (
-     DestinationConnectionError,
-     SourceConnectionError,
-     SourceConnectionNetworkError,
- )
- from unstructured_ingest.utils.dep_check import requires_dependencies
- from unstructured_ingest.v2.interfaces import (
-     AccessConfig,
-     ConnectionConfig,
-     Downloader,
-     DownloaderConfig,
-     DownloadResponse,
-     FileData,
-     FileDataSourceMetadata,
-     Indexer,
-     IndexerConfig,
-     SourceIdentifiers,
-     Uploader,
-     UploaderConfig,
- )
- from unstructured_ingest.v2.logger import logger
- from unstructured_ingest.v2.processes.connector_registry import (
-     DestinationRegistryEntry,
-     SourceRegistryEntry,
- )
-
- if TYPE_CHECKING:
-     from databricks.sdk import WorkspaceClient
-
- CONNECTOR_TYPE = "databricks_volumes"
-
-
- class DatabricksVolumesAccessConfig(AccessConfig):
-     account_id: Optional[str] = Field(
-         default=None,
-         description="The Databricks account ID for the Databricks "
-         "accounts endpoint. Only has effect when Host is "
-         "either https://accounts.cloud.databricks.com/ (AWS), "
-         "https://accounts.azuredatabricks.net/ (Azure), "
-         "or https://accounts.gcp.databricks.com/ (GCP).",
-     )
-     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
-     client_secret: Optional[str] = Field(
-         default=None, description="Client Secret of the OAuth app."
-     )
-     token: Optional[str] = Field(
-         default=None,
-         description="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
-         "Azure Active Directory (Azure AD) token (Azure).",
-     )
-     profile: Optional[str] = None
-     azure_workspace_resource_id: Optional[str] = Field(
-         default=None,
-         description="The Azure Resource Manager ID for the Azure Databricks workspace, "
-         "which is exchanged for a Databricks host URL.",
-     )
-     azure_client_secret: Optional[str] = Field(
-         default=None, description="The Azure AD service principal’s client secret."
-     )
-     azure_client_id: Optional[str] = Field(
-         default=None, description="The Azure AD service principal’s application ID."
-     )
-     azure_tenant_id: Optional[str] = Field(
-         default=None, description="The Azure AD service principal’s tenant ID."
-     )
-     azure_environment: Optional[str] = Field(
-         default=None,
-         description="The Azure environment type for a " "specific set of API endpoints",
-         examples=["Public", "UsGov", "China", "Germany"],
-     )
-     auth_type: Optional[str] = Field(
-         default=None,
-         description="When multiple auth attributes are available in the "
-         "environment, use the auth type specified by this "
-         "argument. This argument also holds the currently "
-         "selected auth.",
-     )
-     google_credentials: Optional[str] = None
-     google_service_account: Optional[str] = None
-
-
- class DatabricksVolumesConnectionConfig(ConnectionConfig):
-     access_config: Secret[DatabricksVolumesAccessConfig] = Field(
-         default=DatabricksVolumesAccessConfig(), validate_default=True
-     )
-     host: Optional[str] = Field(
-         default=None,
-         description="The Databricks host URL for either the "
-         "Databricks workspace endpoint or the "
-         "Databricks accounts endpoint.",
-     )
-     volume: str = Field(description="Name of volume in the Unity Catalog")
-     catalog: str = Field(description="Name of the catalog in the Databricks Unity Catalog service")
-     volume_path: Optional[str] = Field(
-         default=None, description="Optional path within the volume to write to"
-     )
-     databricks_schema: str = Field(
-         default="default",
-         alias="schema",
-         description="Schema associated with the volume to write to in the Unity Catalog service",
-     )
-
-     @property
-     def path(self) -> str:
-         path = f"/Volumes/{self.catalog}/{self.databricks_schema}/{self.volume}"
-         if self.volume_path:
-             path = f"{path}/{self.volume_path}"
-         return path
-
-     @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
-     def get_client(self) -> "WorkspaceClient":
-         from databricks.sdk import WorkspaceClient
-
-         return WorkspaceClient(
-             host=self.host,
-             **self.access_config.get_secret_value().model_dump(),
-         )
-
-
- @dataclass
- class DatabricksVolumesIndexerConfig(IndexerConfig):
-     recursive: bool = False
-
-
- @dataclass
- class DatabricksVolumesIndexer(Indexer):
-     index_config: DatabricksVolumesIndexerConfig
-     connection_config: DatabricksVolumesConnectionConfig
-     connector_type: str = CONNECTOR_TYPE
-
-     def precheck(self) -> None:
-         try:
-             self.connection_config.get_client()
-         except Exception as e:
-             logger.error(f"failed to validate connection: {e}", exc_info=True)
-             raise SourceConnectionError(f"failed to validate connection: {e}")
-
-     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-         for file_info in self.connection_config.get_client().dbfs.list(
-             path=self.connection_config.path, recursive=self.index_config.recursive
-         ):
-             if file_info.is_dir:
-                 continue
-             rel_path = file_info.path.replace(self.connection_config.path, "")
-             if rel_path.startswith("/"):
-                 rel_path = rel_path[1:]
-             filename = Path(file_info.path).name
-             yield FileData(
-                 identifier=file_info.path,
-                 connector_type=CONNECTOR_TYPE,
-                 source_identifiers=SourceIdentifiers(
-                     filename=filename,
-                     rel_path=rel_path,
-                     fullpath=file_info.path,
-                 ),
-                 additional_metadata={
-                     "catalog": self.connection_config.catalog,
-                 },
-                 metadata=FileDataSourceMetadata(
-                     url=file_info.path, date_modified=str(file_info.modification_time)
-                 ),
-             )
-
-
- @dataclass
- class DatabricksVolumesDownloaderConfig(DownloaderConfig):
-     pass
-
-
- @dataclass
- class DatabricksVolumesDownloader(Downloader):
-     download_config: DatabricksVolumesDownloaderConfig
-     connection_config: DatabricksVolumesConnectionConfig
-     connector_type: str = CONNECTOR_TYPE
-
-     def precheck(self) -> None:
-         try:
-             self.connection_config.get_client()
-         except Exception as e:
-             logger.error(f"failed to validate connection: {e}", exc_info=True)
-             raise SourceConnectionError(f"failed to validate connection: {e}")
-
-     def get_download_path(self, file_data: FileData) -> Path:
-         return self.download_config.download_dir / Path(file_data.source_identifiers.relative_path)
-
-     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-         download_path = self.get_download_path(file_data=file_data)
-         download_path.parent.mkdir(parents=True, exist_ok=True)
-         logger.info(f"Writing {file_data.identifier} to {download_path}")
-         try:
-             with self.connection_config.get_client().dbfs.download(path=file_data.identifier) as c:
-                 read_content = c._read_handle.read()
-             with open(download_path, "wb") as f:
-                 f.write(read_content)
-         except Exception as e:
-             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
-             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-
-         return self.generate_download_response(file_data=file_data, download_path=download_path)
-
-
- class DatabricksVolumesUploaderConfig(UploaderConfig):
-     overwrite: bool = Field(
-         default=False, description="If true, an existing file will be overwritten."
-     )
-
-
- @dataclass
- class DatabricksVolumesUploader(Uploader):
-     upload_config: DatabricksVolumesUploaderConfig
-     connection_config: DatabricksVolumesConnectionConfig
-     connector_type: str = CONNECTOR_TYPE
-
-     def precheck(self) -> None:
-         try:
-             assert self.connection_config.get_client().current_user.me().active
-         except Exception as e:
-             logger.error(f"failed to validate connection: {e}", exc_info=True)
-             raise DestinationConnectionError(f"failed to validate connection: {e}")
-
-     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-         output_path = os.path.join(self.connection_config.path, path.name)
-         with open(path, "rb") as elements_file:
-             self.connection_config.get_client().files.upload(
-                 file_path=output_path,
-                 contents=elements_file,
-                 overwrite=self.upload_config.overwrite,
-             )
-
-
- databricks_volumes_destination_entry = DestinationRegistryEntry(
-     connection_config=DatabricksVolumesConnectionConfig,
-     uploader=DatabricksVolumesUploader,
-     uploader_config=DatabricksVolumesUploaderConfig,
- )
-
- databricks_volumes_source_entry = SourceRegistryEntry(
-     connection_config=DatabricksVolumesConnectionConfig,
-     indexer=DatabricksVolumesIndexer,
-     indexer_config=DatabricksVolumesIndexerConfig,
-     downloader=DatabricksVolumesDownloader,
-     downloader_config=DatabricksVolumesDownloaderConfig,
- )
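
For reference, here is a minimal sketch of how the connection config defined in the removed module above resolves its Unity Catalog volume path; the host, catalog, volume, and token values are placeholders. In 0.2.0 the Databricks volumes connector lives under unstructured_ingest/v2/processes/connectors/databricks/ (file 18 in the list), but the names it exports are not shown in this diff, so the import below targets the 0.1.0 module only.

# Minimal sketch against the 0.1.0 module removed above; all values are placeholders.
from unstructured_ingest.v2.processes.connectors.databricks_volumes import (
    DatabricksVolumesAccessConfig,
    DatabricksVolumesConnectionConfig,
)

connection_config = DatabricksVolumesConnectionConfig(
    host="https://example.cloud.databricks.com",  # placeholder workspace URL
    catalog="main",
    volume="ingest-output",
    access_config=DatabricksVolumesAccessConfig(token="<personal-access-token>"),
)

# `path` combines catalog, schema (default "default"), volume, and the optional
# volume_path into the Unity Catalog volume path.
assert connection_config.path == "/Volumes/main/default/ingest-output"

This is the same path value the indexer lists and the uploader writes under in the run() methods shown above.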