ygg 0.1.30__py3-none-any.whl → 0.1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.30.dist-info → ygg-0.1.31.dist-info}/METADATA +1 -1
- {ygg-0.1.30.dist-info → ygg-0.1.31.dist-info}/RECORD +16 -13
- yggdrasil/databricks/jobs/config.py +2 -30
- yggdrasil/databricks/sql/statement_result.py +1 -1
- yggdrasil/databricks/sql/types.py +16 -0
- yggdrasil/databricks/workspaces/__init__.py +3 -1
- yggdrasil/databricks/workspaces/filesytem.py +161 -0
- yggdrasil/databricks/workspaces/io.py +745 -0
- yggdrasil/databricks/workspaces/path.py +1120 -0
- yggdrasil/databricks/workspaces/path_kind.py +10 -0
- yggdrasil/databricks/workspaces/workspace.py +97 -150
- yggdrasil/types/cast/arrow_cast.py +9 -0
- yggdrasil/databricks/workspaces/databricks_path.py +0 -784
- {ygg-0.1.30.dist-info → ygg-0.1.31.dist-info}/WHEEL +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.31.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.31.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.31.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/workspaces/workspace.py

@@ -16,7 +16,7 @@ from typing import (
 if TYPE_CHECKING:
     from ..compute.cluster import Cluster
 
-from .
+from .path import DatabricksPath, DatabricksPathKind
 from ...libs.databrickslib import require_databricks_sdk, databricks_sdk
 
 if databricks_sdk is not None:
@@ -133,17 +133,19 @@ class Workspace:
 
     def __enter__(self) -> "Workspace":
         self._was_connected = self._sdk is not None
-        self.connect()
-        return self
+        return self.connect()
 
     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
         if not self._was_connected:
             self.close()
 
+    def __del__(self):
+        self.close()
+
     # -------------------------
     # Clone
     # -------------------------
-    def
+    def clone_instance(
         self,
         **kwargs
     ) -> "Workspace":
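With this release `__enter__` returns whatever `connect()` returns, and a `__del__` hook closes the client when the object is garbage-collected. A minimal usage sketch; the package-level import and the `profile` constructor argument are assumptions based on the fields `connect()` reads in the next hunk, not something this diff confirms:

from yggdrasil.databricks.workspaces import Workspace  # assumed export location

with Workspace(profile="my-profile") as ws:  # "my-profile" is a placeholder profile name
    assert ws.connected  # property added in 0.1.31 (see the following hunk)
# __exit__ only closes the client if this block opened it; __del__ also calls close().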
@@ -154,86 +156,94 @@ class Workspace:
     # -------------------------
     # SDK connection
     # -------------------------
-    … (old line 157 not shown)
+    @property
+    def connected(self):
+        return self._sdk is not None
+
+    def connect(self, reset: bool = False, clone: bool = False) -> "Workspace":
         if reset:
             self._sdk = None
 
-        if self._sdk is None:
-        … (old lines 162-193 not shown)
+        if self._sdk is not None:
+            return self
+
+        instance = self.clone_instance() if clone else self
+
+        require_databricks_sdk()
+        logger.debug("Connecting %s", self)
+
+        # Build Config from config_dict if available, else from fields.
+        kwargs = {
+            "host": instance.host,
+            "account_id": instance.account_id,
+            "token": instance.token,
+            "client_id": instance.client_id,
+            "client_secret": instance.client_secret,
+            "token_audience": instance.token_audience,
+            "azure_workspace_resource_id": instance.azure_workspace_resource_id,
+            "azure_use_msi": instance.azure_use_msi,
+            "azure_client_secret": instance.azure_client_secret,
+            "azure_client_id": instance.azure_client_id,
+            "azure_tenant_id": instance.azure_tenant_id,
+            "azure_environment": instance.azure_environment,
+            "google_credentials": instance.google_credentials,
+            "google_service_account": instance.google_service_account,
+            "profile": instance.profile,
+            "config_file": instance.config_file,
+            "auth_type": instance.auth_type,
+            "http_timeout_seconds": instance.http_timeout_seconds,
+            "retry_timeout_seconds": instance.retry_timeout_seconds,
+            "debug_truncate_bytes": instance.debug_truncate_bytes,
+            "debug_headers": instance.debug_headers,
+            "rate_limit": instance.rate_limit,
+            "product": instance.product,
+            "product_version": instance.product_version,
+        }
 
-        … (old lines 195-199 not shown)
+        build_kwargs = {k: v for k, v in kwargs.items() if v is not None}
+
+        try:
+            instance._sdk = WorkspaceClient(**build_kwargs)
+        except ValueError as e:
+            if "cannot configure default credentials" in str(e) and instance.auth_type is None:
+                last_error = e
+
+                auth_types = ["runtime"] if instance.is_in_databricks_environment() else ["external-browser"]
 
-        … (old line 201 not shown)
+                for auth_type in auth_types:
+                    build_kwargs["auth_type"] = auth_type
 
-        … (old lines 203-204 not shown)
+                    try:
+                        instance._sdk = WorkspaceClient(**build_kwargs)
+                        break
+                    except Exception as se:
+                        last_error = se
+                        build_kwargs.pop("auth_type")
+
+                if instance._sdk is None:
+                    if instance.is_in_databricks_environment() and instance._cached_token:
+                        build_kwargs["token"] = instance._cached_token
 
                         try:
-                            … (old line 207 not shown)
-                            break
+                            instance._sdk = WorkspaceClient(**build_kwargs)
                         except Exception as se:
                             last_error = se
-                            build_kwargs.pop("auth_type")
-        … (old line 212 not shown)
-                if self._sdk is None:
-                    if self.is_in_databricks_environment() and self._cached_token:
-                        build_kwargs["token"] = self._cached_token
 
-        … (old lines 217-220 not shown)
+                if instance._sdk is None:
+                    raise last_error
+            else:
+                raise e
 
-        … (old lines 222-225 not shown)
+        # backfill resolved config values
+        for key in list(kwargs.keys()):
+            if getattr(instance, key, None) is None:
+                v = getattr(instance._sdk.config, key, None)
+                if v is not None:
+                    setattr(instance, key, v)
 
-        … (old line 227 not shown)
-        for key in list(kwargs.keys()):
-            if getattr(self, key, None) is None:
-                v = getattr(self._sdk.config, key, None)
-                if v is not None:
-                    setattr(self, key, v)
+        logger.info("Connected %s", instance)
 
-        … (old lines 234-235 not shown)
-        return self
+        return instance
 
     # ------------------------------------------------------------------ #
     # Context manager + lifecycle
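The rewritten `connect()` keeps the SDK's default credential chain as the first attempt and only falls back to explicit auth types when the SDK raises its "cannot configure default credentials" `ValueError` and no `auth_type` was set. A sketch of the resulting order, assuming the constructor accepts the same fields the method reads (the host is a placeholder):

ws = Workspace(host="https://example-workspace.cloud.databricks.com")  # placeholder host

ws.connect()
# 1. WorkspaceClient(**build_kwargs) built from the non-None fields
# 2. on "cannot configure default credentials": retry with auth_type="runtime"
#    inside a Databricks runtime, otherwise auth_type="external-browser"
# 3. if still unconnected and a cached token exists, retry once with that token
# 4. otherwise the last captured error is re-raised
assert ws.connected

ws.connect(reset=True)  # reset=True drops the cached SDK client and reconnects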
@@ -287,6 +297,20 @@ class Workspace:
     # ------------------------------------------------------------------ #
     # Path helpers
     # ------------------------------------------------------------------ #
+    def filesytem(
+        self,
+        workspace: Optional["Workspace"] = None,
+    ):
+        from .filesytem import DatabricksFileSystem, DatabricksFileSystemHandler
+
+        handler = DatabricksFileSystemHandler(
+            workspace=self if workspace is None else workspace
+        )
+
+        return DatabricksFileSystem(
+            handler=handler
+        )
+
     def dbfs_path(
         self,
         parts: Union[List[str], str],
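The new `filesytem()` helper (spelled as in the new `filesytem.py` module) wraps the workspace in a `DatabricksFileSystemHandler` and hands it to `DatabricksFileSystem`. The handler/filesystem split mirrors pyarrow's `FileSystemHandler`/`PyFileSystem` pattern, so one plausible use is backing a dataset scan; treat this sketch as an assumption rather than documented behaviour, with a purely illustrative volume path:

import pyarrow.dataset as pds

fs = ws.filesytem()  # DatabricksFileSystem bound to this workspace

dataset = pds.dataset(
    "Volumes/my_catalog/my_schema/my_volume/events",  # hypothetical UC volume path
    format="parquet",
    filesystem=fs,  # assumes the returned object is pyarrow-compatible
)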
@@ -297,14 +321,14 @@ class Workspace:
 
         if kind is None or isinstance(parts, str):
             return DatabricksPath.parse(
-                … (old line 300 not shown)
+                obj=parts,
                 workspace=workspace
             )
 
         return DatabricksPath(
             kind=kind,
             parts=parts,
-            … (old line 307 not shown)
+            _workspace=workspace
         )
 
     def shared_cache_path(
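`dbfs_path()` now forwards strings (or calls with no `kind`) to `DatabricksPath.parse(obj=...)` and otherwise constructs a `DatabricksPath` directly, threading the workspace through the private `_workspace` field. A rough sketch; the `kind` keyword and the `DBFS` member name are guesses, since neither the full signature nor `path_kind.py` appears in this diff:

from yggdrasil.databricks.workspaces.path import DatabricksPathKind  # module added in 0.1.31

p = ws.dbfs_path("/Volumes/my_catalog/my_schema/my_volume/data.csv")  # str -> DatabricksPath.parse(obj=..., workspace=...)
q = ws.dbfs_path(["my_folder", "data.csv"], kind=DatabricksPathKind.DBFS)  # enum member name assumed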
@@ -329,60 +353,6 @@ class Workspace:
     def sdk(self) -> "WorkspaceClient":
         return self.connect()._sdk
 
-    # ------------------------------------------------------------------ #
-    # UC volume + directory management
-    # ------------------------------------------------------------------ #
-
-    def ensure_uc_volume_and_dir(
-        self,
-        target_path: str,
-    ) -> None:
-        """
-        Ensure catalog, schema, volume exist for a UC volume path
-        like /Volumes/<catalog>/<schema>/<volume>/...,
-        then create the directory.
-        """
-        sdk = self.sdk()
-        parts = target_path.split("/")
-
-        # basic sanity check
-        if len(parts) < 5 or parts[1] != "Volumes":
-            raise ValueError(
-                f"Unexpected UC volume path: {target_path!r}. "
-                "Expected /Volumes/<catalog>/<schema>/<volume>/..."
-            )
-
-        # /Volumes/<catalog>/<schema>/<volume>/...
-        _, _, catalog_name, schema_name, volume_name, *subpath = parts
-
-        # 1) ensure catalog
-        try:
-            sdk.catalogs.get(name=catalog_name)
-        except NotFound:
-            sdk.catalogs.create(name=catalog_name)
-
-        # 2) ensure schema
-        schema_full_name = f"{catalog_name}.{schema_name}"
-        try:
-            sdk.schemas.get(full_name=schema_full_name)
-        except NotFound:
-            sdk.schemas.create(name=schema_name, catalog_name=catalog_name)
-
-        # 3) ensure volume (managed volume is simplest)
-        volume_full_name = f"{catalog_name}.{schema_name}.{volume_name}"
-        try:
-            sdk.volumes.read(name=volume_full_name)
-        except NotFound:
-            sdk.volumes.create(
-                catalog_name=catalog_name,
-                schema_name=schema_name,
-                name=volume_name,
-                volume_type=catalog_svc.VolumeType.MANAGED,
-            )
-
-        # 4) finally create the directory path itself
-        sdk.files.create_directory(target_path)
-
     # ------------------------------------------------------------------ #
     # List / open / delete / SQL
     # ------------------------------------------------------------------ #
@@ -462,31 +432,8 @@ class Workspace:
 
         # Workspace path
         fmt = workspace_format or ExportFormat.AUTO
-        return sdk.workspace.download(path=path, format=fmt)
-
-    def delete_path(
-        self,
-        target_path: str,
-        recursive: bool = True,
-        ignore_missing: bool = True,
-    ) -> None:
-        """
-        Delete a path in Databricks Workspace (file or directory).
-
-        - If recursive=True and target_path is a directory, deletes entire tree.
-        - If ignore_missing=True, missing paths won't raise.
-        """
-        sdk = self.sdk()
 
-        try:
-            sdk.workspace.delete(
-                path=target_path,
-                recursive=recursive,
-            )
-        except ResourceDoesNotExist:
-            if ignore_missing:
-                return
-            raise
+        return sdk.workspace.download(path=path, format=fmt)
 
     @staticmethod
     def is_in_databricks_environment():
yggdrasil/types/cast/arrow_cast.py

@@ -6,6 +6,7 @@ from typing import Optional, Union, List, Tuple, Any
 
 import pyarrow as pa
 import pyarrow.compute as pc
+import pyarrow.dataset as pds
 
 from .cast_options import CastOptions
 from .registry import register_converter
@@ -1095,6 +1096,14 @@ def record_batch_reader_to_record_batch(
     return table_to_record_batch(table, options)
 
 
+@register_converter(pds.Dataset, pa.Table)
+def arrow_dataset_to_table(
+    data: pds.Dataset,
+    options: Optional[CastOptions] = None,
+) -> pa.Field:
+    table = data.to_table()
+    return cast_arrow_tabular(table, options)
+
 # ---------------------------------------------------------------------------
 # Field / Schema converters
 # ---------------------------------------------------------------------------