ygg 0.1.29__py3-none-any.whl → 0.1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/METADATA +1 -1
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/RECORD +23 -20
- yggdrasil/databricks/compute/cluster.py +41 -21
- yggdrasil/databricks/compute/execution_context.py +9 -10
- yggdrasil/databricks/compute/remote.py +10 -6
- yggdrasil/databricks/jobs/config.py +2 -30
- yggdrasil/databricks/sql/engine.py +4 -2
- yggdrasil/databricks/sql/statement_result.py +18 -3
- yggdrasil/databricks/sql/types.py +16 -0
- yggdrasil/databricks/workspaces/__init__.py +3 -1
- yggdrasil/databricks/workspaces/filesytem.py +161 -0
- yggdrasil/databricks/workspaces/io.py +745 -0
- yggdrasil/databricks/workspaces/path.py +1120 -0
- yggdrasil/databricks/workspaces/path_kind.py +10 -0
- yggdrasil/databricks/workspaces/workspace.py +146 -562
- yggdrasil/pyutils/callable_serde.py +1 -0
- yggdrasil/pyutils/modules.py +1 -1
- yggdrasil/pyutils/python_env.py +81 -264
- yggdrasil/types/cast/arrow_cast.py +9 -0
- yggdrasil/databricks/workspaces/databricks_path.py +0 -875
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/WHEEL +0 -0
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.29.dist-info → ygg-0.1.31.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/workspaces/workspace.py

@@ -1,29 +1,28 @@
-import base64
 import dataclasses
-import io
 import logging
 import os
 import posixpath
 from abc import ABC
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path
 from typing import (
     Any,
     BinaryIO,
     Iterator,
-    List,
     Optional,
-    Union
+    Union, TYPE_CHECKING, List
 )
 
-
+if TYPE_CHECKING:
+    from ..compute.cluster import Cluster
+
+from .path import DatabricksPath, DatabricksPathKind
 from ...libs.databrickslib import require_databricks_sdk, databricks_sdk
 
 if databricks_sdk is not None:
     from databricks.sdk import WorkspaceClient
     from databricks.sdk.errors import ResourceDoesNotExist, NotFound
-    from databricks.sdk.service.workspace import
+    from databricks.sdk.service.workspace import ExportFormat, ObjectInfo
     from databricks.sdk.service import catalog as catalog_svc
     from databricks.sdk.dbutils import FileInfo
     from databricks.sdk.service.files import DirectoryEntry
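The new `if TYPE_CHECKING:` block makes the `Cluster` name available to type checkers without importing the compute module at runtime, which is the usual way to keep annotations while avoiding a circular import. A minimal sketch of the pattern (the `describe` function is illustrative only, not part of the package):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers, never at runtime.
    from ..compute.cluster import Cluster


def describe(cluster: "Cluster") -> str:
    # At runtime the annotation is just a forward-reference string.
    return getattr(cluster, "cluster_id", "<unknown>")
```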
@@ -62,31 +61,8 @@ def _get_env_product_tag():
     v = os.getenv("DATABRICKS_PRODUCT_TAG")
 
     if not v:
-        return "default"
-
-    return v.strip().lower()
-
-
-def _get_remote_size(sdk, target_path: str) -> Optional[int]:
-    """
-    Best-effort fetch remote file size for target_path across
-    DBFS, Volumes, and Workspace. Returns None if not found.
-    """
-    try:
-        if target_path.startswith("dbfs:/"):
-            st = sdk.dbfs.get_status(target_path)
-            return getattr(st, "file_size", None)
-
-        if target_path.startswith("/Volumes"):
-            st = sdk.files.get_status(file_path=target_path)
-            return getattr(st, "file_size", None)
-
-        # Workspace path
-        st = sdk.workspace.get_status(target_path)
-        return getattr(st, "size", None)
-
-    except ResourceDoesNotExist:
         return None
+    return v.strip().lower()
 
 
 @dataclass
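Two things change here: the best-effort `_get_remote_size` helper is removed along with the upload code that used it (see the large deletion further down), and `_get_env_product_tag` now returns `None` instead of `"default"` when `DATABRICKS_PRODUCT_TAG` is unset or empty. A standalone restatement of the new behavior:

```python
import os
from typing import Optional


def product_tag() -> Optional[str]:
    # Mirrors the 0.1.31 logic: unset/empty -> None, otherwise trimmed and
    # lower-cased, e.g. "  Prod " -> "prod".
    v = os.getenv("DATABRICKS_PRODUCT_TAG")
    if not v:
        return None
    return v.strip().lower()
```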
@@ -140,9 +116,7 @@ class Workspace:
         state = self.__dict__.copy()
         state.pop("_sdk", None)
 
-
-
-        state["_was_connected"] = was_connected
+        state["_was_connected"] = self._sdk is not None
         state["_cached_token"] = self.current_token()
 
         return state
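`__getstate__` now derives `_was_connected` directly from whether a live SDK client is attached, while still dropping the client itself and caching the current token. Assuming `__setstate__` restores from that state (as `clone_instance` below relies on), a `Workspace` should survive serialization without carrying a `WorkspaceClient`; a hedged sketch, with placeholder arguments:

```python
import pickle

from yggdrasil.databricks.workspaces.workspace import Workspace  # module path per the file list

# "my-profile" is a placeholder; any of the dataclass fields used in connect()
# (host, token, profile, ...) could be set here instead.
ws = Workspace(profile="my-profile")

payload = pickle.dumps(ws)        # no live WorkspaceClient is serialized
restored = pickle.loads(payload)  # reconnects lazily via restored.connect() / .sdk()
```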
@@ -159,102 +133,117 @@ class Workspace:
 
     def __enter__(self) -> "Workspace":
         self._was_connected = self._sdk is not None
-        self.connect()
-        return self
+        return self.connect()
 
     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
         if not self._was_connected:
             self.close()
 
+    def __del__(self):
+        self.close()
+
     # -------------------------
     # Clone
     # -------------------------
-    def
-
+    def clone_instance(
+        self,
+        **kwargs
+    ) -> "Workspace":
+        state = self.__getstate__()
+        state.update(kwargs)
+        return Workspace().__setstate__(state)
 
     # -------------------------
     # SDK connection
     # -------------------------
-
+    @property
+    def connected(self):
+        return self._sdk is not None
+
+    def connect(self, reset: bool = False, clone: bool = False) -> "Workspace":
         if reset:
             self._sdk = None
 
-        if self._sdk is None:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if self._sdk is not None:
+            return self
+
+        instance = self.clone_instance() if clone else self
+
+        require_databricks_sdk()
+        logger.debug("Connecting %s", self)
+
+        # Build Config from config_dict if available, else from fields.
+        kwargs = {
+            "host": instance.host,
+            "account_id": instance.account_id,
+            "token": instance.token,
+            "client_id": instance.client_id,
+            "client_secret": instance.client_secret,
+            "token_audience": instance.token_audience,
+            "azure_workspace_resource_id": instance.azure_workspace_resource_id,
+            "azure_use_msi": instance.azure_use_msi,
+            "azure_client_secret": instance.azure_client_secret,
+            "azure_client_id": instance.azure_client_id,
+            "azure_tenant_id": instance.azure_tenant_id,
+            "azure_environment": instance.azure_environment,
+            "google_credentials": instance.google_credentials,
+            "google_service_account": instance.google_service_account,
+            "profile": instance.profile,
+            "config_file": instance.config_file,
+            "auth_type": instance.auth_type,
+            "http_timeout_seconds": instance.http_timeout_seconds,
+            "retry_timeout_seconds": instance.retry_timeout_seconds,
+            "debug_truncate_bytes": instance.debug_truncate_bytes,
+            "debug_headers": instance.debug_headers,
+            "rate_limit": instance.rate_limit,
+            "product": instance.product,
+            "product_version": instance.product_version,
+        }
 
-
-            self._sdk = WorkspaceClient(**build_kwargs)
-        except ValueError as e:
-            if "cannot configure default credentials" in str(e) and self.auth_type is None:
-                last_error = e
+        build_kwargs = {k: v for k, v in kwargs.items() if v is not None}
 
-
+        try:
+            instance._sdk = WorkspaceClient(**build_kwargs)
+        except ValueError as e:
+            if "cannot configure default credentials" in str(e) and instance.auth_type is None:
+                last_error = e
 
-
-
+                auth_types = ["runtime"] if instance.is_in_databricks_environment() else ["external-browser"]
+
+                for auth_type in auth_types:
+                    build_kwargs["auth_type"] = auth_type
+
+                    try:
+                        instance._sdk = WorkspaceClient(**build_kwargs)
+                        break
+                    except Exception as se:
+                        last_error = se
+                        build_kwargs.pop("auth_type")
+
+                if instance._sdk is None:
+                    if instance.is_in_databricks_environment() and instance._cached_token:
+                        build_kwargs["token"] = instance._cached_token
 
                         try:
-
-                            break
+                            instance._sdk = WorkspaceClient(**build_kwargs)
                         except Exception as se:
                             last_error = se
-                            build_kwargs.pop("auth_type")
-
-                if self._sdk is None:
-                    if self.is_in_databricks_environment() and self._cached_token:
-                        build_kwargs["token"] = self._cached_token
 
-
-
-
-
-
-                if self._sdk is None:
-                    raise last_error
-            else:
-                raise e
+                if instance._sdk is None:
+                    raise last_error
+            else:
+                raise e
 
-
-
-
-
-
-
+        # backfill resolved config values
+        for key in list(kwargs.keys()):
+            if getattr(instance, key, None) is None:
+                v = getattr(instance._sdk.config, key, None)
+                if v is not None:
+                    setattr(instance, key, v)
 
-
+        logger.info("Connected %s", instance)
 
-        return
+        return instance
 
     # ------------------------------------------------------------------ #
     # Context manager + lifecycle
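Taken together, `connect()` now returns the connected instance (optionally a clone), `__enter__` delegates to it, the `connected` property exposes the connection state, and resolved SDK config values are backfilled onto the dataclass fields. A hedged usage sketch; the constructor arguments are assumptions based on the field names in the kwargs mapping above:

```python
from yggdrasil.databricks.workspaces.workspace import Workspace

# "DEFAULT" is a placeholder profile; host/token/etc. could be passed instead.
ws = Workspace(profile="DEFAULT")

with ws as connected:              # __enter__ now returns self.connect()
    client = connected.sdk()       # databricks.sdk.WorkspaceClient
    assert connected.connected     # True while a client is attached
# __exit__ only calls close() if the workspace was not connected before entering

# A thread- or process-local copy that connects on its own
# (connect(clone=True) does the same internally):
worker = ws.clone_instance().connect()
```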
@@ -308,17 +297,44 @@ class Workspace:
     # ------------------------------------------------------------------ #
     # Path helpers
     # ------------------------------------------------------------------ #
-    def
+    def filesytem(
+        self,
+        workspace: Optional["Workspace"] = None,
+    ):
+        from .filesytem import DatabricksFileSystem, DatabricksFileSystemHandler
+
+        handler = DatabricksFileSystemHandler(
+            workspace=self if workspace is None else workspace
+        )
+
+        return DatabricksFileSystem(
+            handler=handler
+        )
+
+    def dbfs_path(
+        self,
+        parts: Union[List[str], str],
+        kind: Optional[DatabricksPathKind] = None,
+        workspace: Optional["Workspace"] = None
+    ):
+        workspace = self if workspace is None else workspace
+
+        if kind is None or isinstance(parts, str):
+            return DatabricksPath.parse(
+                obj=parts,
+                workspace=workspace
+            )
+
         return DatabricksPath(
-
-
-
+            kind=kind,
+            parts=parts,
+            _workspace=workspace
         )
 
-    @staticmethod
     def shared_cache_path(
+        self,
         suffix: Optional[str] = None
-    ) ->
+    ) -> DatabricksPath:
         """
         Shared cache base under Volumes for the current user.
         """
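`dbfs_path` accepts either a raw string (routed through `DatabricksPath.parse`) or an explicit `kind` plus `parts` (routed through the `DatabricksPath` constructor), and `filesytem()` wraps the workspace in the new `DatabricksFileSystem` from the (intentionally spelled) `filesytem` module. A hedged sketch, continuing with the `ws` instance from the earlier example; the path strings are placeholders:

```python
# String form: parsed into a DatabricksPath bound to this workspace.
tmp_file = ws.dbfs_path("dbfs:/tmp/example.txt")
volume_dir = ws.dbfs_path("/Volumes/main/default/my_volume/data")

# Filesystem facade backed by DatabricksFileSystemHandler; its concrete API
# lives in the new filesytem.py / io.py modules and is not shown in this hunk.
fs = ws.filesytem()
```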
@@ -328,31 +344,7 @@ class Workspace:
             return base
 
         suffix = suffix.lstrip("/")
-        return f"{base}/{suffix}"
-
-    def temp_volume_folder(
-        self,
-        suffix: Optional[str] = None,
-        catalog_name: Optional[str] = None,
-        schema_name: Optional[str] = None,
-        volume_name: Optional[str] = None,
-    ) -> str:
-        """
-        Temporary folder either under a UC Volume or dbfs:/FileStore/.ygg/tmp/<user>.
-        """
-        if volume_name:
-            catalog_name = catalog_name or os.getenv("DATABRICKS_CATALOG_NAME")
-            schema_name = schema_name or os.getenv("DATABRICKS_SCHEMA_NAME")
-
-            base = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}"
-        else:
-            base = f"dbfs:/FileStore/.ygg/tmp/{self.current_user.user_name}"
-
-        if not suffix:
-            return base
-
-        suffix = suffix.lstrip("/")
-        return f"{base}/{suffix}"
+        return self.dbfs_path(f"{base}/{suffix}")
 
     # ------------------------------------------------------------------ #
     # SDK access / connection
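`shared_cache_path` is now an instance method and returns a `DatabricksPath` built through `dbfs_path`, while the `temp_volume_folder` helper is removed outright. A short sketch; the suffix is a placeholder:

```python
cache_root = ws.shared_cache_path()              # shared cache base for the current user
model_cache = ws.shared_cache_path("models/v1")  # base + "/models/v1", as a DatabricksPath
```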
@@ -361,400 +353,6 @@ class Workspace:
     def sdk(self) -> "WorkspaceClient":
         return self.connect()._sdk
 
-    # ------------------------------------------------------------------ #
-    # UC volume + directory management
-    # ------------------------------------------------------------------ #
-
-    def ensure_uc_volume_and_dir(
-        self,
-        target_path: str,
-    ) -> None:
-        """
-        Ensure catalog, schema, volume exist for a UC volume path
-        like /Volumes/<catalog>/<schema>/<volume>/...,
-        then create the directory.
-        """
-        sdk = self.sdk()
-        parts = target_path.split("/")
-
-        # basic sanity check
-        if len(parts) < 5 or parts[1] != "Volumes":
-            raise ValueError(
-                f"Unexpected UC volume path: {target_path!r}. "
-                "Expected /Volumes/<catalog>/<schema>/<volume>/..."
-            )
-
-        # /Volumes/<catalog>/<schema>/<volume>/...
-        _, _, catalog_name, schema_name, volume_name, *subpath = parts
-
-        # 1) ensure catalog
-        try:
-            sdk.catalogs.get(name=catalog_name)
-        except NotFound:
-            sdk.catalogs.create(name=catalog_name)
-
-        # 2) ensure schema
-        schema_full_name = f"{catalog_name}.{schema_name}"
-        try:
-            sdk.schemas.get(full_name=schema_full_name)
-        except NotFound:
-            sdk.schemas.create(name=schema_name, catalog_name=catalog_name)
-
-        # 3) ensure volume (managed volume is simplest)
-        volume_full_name = f"{catalog_name}.{schema_name}.{volume_name}"
-        try:
-            sdk.volumes.read(name=volume_full_name)
-        except NotFound:
-            sdk.volumes.create(
-                catalog_name=catalog_name,
-                schema_name=schema_name,
-                name=volume_name,
-                volume_type=catalog_svc.VolumeType.MANAGED,
-            )
-
-        # 4) finally create the directory path itself
-        sdk.files.create_directory(target_path)
-
-    # ------------------------------------------------------------------ #
-    # Upload helpers
-    # ------------------------------------------------------------------ #
-    def upload_file_content(
-        self,
-        content: Union[bytes, BinaryIO],
-        target_path: str,
-        makedirs: bool = True,
-        overwrite: bool = True,
-        only_if_size_diff: bool = False,
-        parallel_pool: Optional[ThreadPoolExecutor] = None,
-    ):
-        """
-        Upload a single content blob into Databricks (Workspace / Volumes / DBFS).
-
-        content:
-            bytes or a binary file-like object.
-
-        target_path:
-            - "dbfs:/..." → DBFS via dbfs.put
-            - "/Volumes/..." → Unity Catalog Volumes via files.upload
-            - anything else → Workspace via workspace.upload
-
-        If parallel_pool is provided, this schedules the upload on the pool
-        and returns a Future. The underlying call is non-parallel (no nested pool).
-
-        If only_if_size_diff=True, it will:
-        - compute local content size (len(bytes))
-        - fetch remote size (best-effort)
-        - skip upload if sizes match.
-        """
-        # If we're doing this in a pool, normalize content to bytes *before*
-        # submitting so we don't share a live file handle across threads.
-        if parallel_pool is not None:
-            if hasattr(content, "read"):
-                data = content.read()
-            else:
-                data = content
-
-            # use a cloned workspace so clients don't collide across threads
-            return parallel_pool.submit(
-                self.clone().upload_file_content,
-                content=data,
-                target_path=target_path,
-                makedirs=makedirs,
-                overwrite=overwrite,
-                only_if_size_diff=only_if_size_diff,
-                parallel_pool=None,
-            )
-
-        with self.connect() as connected:
-            sdk = connected.sdk()
-
-            # Normalize content to bytes once
-            if hasattr(content, "read"):  # BinaryIO
-                data = content.read()
-            else:
-                data = content
-
-            if not isinstance(data, (bytes, bytearray)):
-                if isinstance(data, str):
-                    data = data.encode()
-                else:
-                    raise TypeError(
-                        f"content must be bytes or BinaryIO, got {type(content)!r}"
-                    )
-
-            data_bytes = bytes(data)
-            local_size = len(data_bytes)
-
-            # Only-if-size-diff: check remote size and bail early if equal
-            if only_if_size_diff:
-                remote_size = _get_remote_size(sdk, target_path)
-                if remote_size is not None and remote_size == local_size:
-                    # Same size remotely -> skip upload
-                    return None
-
-            # Ensure parent directory if requested
-            parent = os.path.dirname(target_path)
-
-            if target_path.startswith("dbfs:/"):
-                # --- DBFS path ---
-                if makedirs and parent and parent != "dbfs:/":
-                    sdk.dbfs.mkdirs(parent)
-
-                data_str = base64.b64encode(data_bytes).decode("utf-8")
-                sdk.dbfs.put(
-                    path=target_path,
-                    contents=data_str,
-                    overwrite=overwrite,
-                )
-
-            elif target_path.startswith("/Volumes"):
-                # --- Unity Catalog Volumes path ---
-                if makedirs and parent and parent != "/":
-                    try:
-                        sdk.files.create_directory(parent)
-                    except NotFound:
-                        connected.ensure_uc_volume_and_dir(parent)
-
-                sdk.files.upload(
-                    file_path=target_path,
-                    contents=io.BytesIO(data_bytes),
-                    overwrite=overwrite,
-                )
-
-            else:
-                # --- Workspace Files / Notebooks ---
-                if makedirs and parent:
-                    sdk.workspace.mkdirs(parent)
-
-                sdk.workspace.upload(
-                    path=target_path,
-                    format=ImportFormat.RAW,
-                    content=data_bytes,
-                    overwrite=overwrite,
-                )
-
-    def upload_local_path(
-        self,
-        local_path: str,
-        target_path: str,
-        makedirs: bool = True,
-        overwrite: bool = True,
-        only_if_size_diff: bool = False,
-        parallel_pool: Optional[ThreadPoolExecutor] = None,
-    ):
-        if os.path.isfile(local_path):
-            return self.upload_local_file(
-                local_path=local_path,
-                target_path=target_path,
-                makedirs=makedirs,
-                overwrite=overwrite,
-                only_if_size_diff=only_if_size_diff,
-                parallel_pool=parallel_pool
-            )
-        else:
-            return self.upload_local_folder(
-                local_path=local_path,
-                target_path=target_path,
-                makedirs=makedirs,
-                only_if_size_diff=only_if_size_diff,
-                parallel_pool=parallel_pool
-            )
-
-    def upload_local_file(
-        self,
-        local_path: str,
-        target_path: str,
-        makedirs: bool = True,
-        overwrite: bool = True,
-        only_if_size_diff: bool = False,
-        parallel_pool: Optional[ThreadPoolExecutor] = None,
-    ):
-        """
-        Upload a single local file into Databricks.
-
-        If parallel_pool is provided, this schedules the upload on the pool
-        and returns a Future.
-
-        If only_if_size_diff=True, it will:
-        - For large files (>4 MiB), check remote file status
-        - Skip upload if remote size == local size
-        """
-        if parallel_pool is not None:
-            # Submit a *non-parallel* variant into the pool
-            return parallel_pool.submit(
-                self.upload_local_file,
-                local_path=local_path,
-                target_path=target_path,
-                makedirs=makedirs,
-                overwrite=overwrite,
-                only_if_size_diff=only_if_size_diff,
-                parallel_pool=None,
-            )
-
-        sdk = self.sdk()
-
-        local_size = os.path.getsize(local_path)
-        large_threshold = 32 * 1024
-
-        if only_if_size_diff and local_size > large_threshold:
-            try:
-                info = sdk.workspace.get_status(path=target_path)
-                remote_size = getattr(info, "size", None)
-
-                if remote_size is not None and remote_size == local_size:
-                    return
-            except ResourceDoesNotExist:
-                # Doesn't exist → upload below
-                pass
-
-        with open(local_path, "rb") as f:
-            content = f.read()
-
-        return self.upload_file_content(
-            content=content,
-            target_path=target_path,
-            makedirs=makedirs,
-            overwrite=overwrite,
-            only_if_size_diff=False,
-            parallel_pool=parallel_pool,
-        )
-
-    def upload_local_folder(
-        self,
-        local_path: str,
-        target_path: str,
-        makedirs: bool = True,
-        only_if_size_diff: bool = True,
-        exclude_dir_names: Optional[List[str]] = None,
-        exclude_hidden: bool = True,
-        parallel_pool: Optional[Union[ThreadPoolExecutor, int]] = None,
-    ):
-        """
-        Recursively upload a local folder into Databricks Workspace Files.
-
-        - Traverses subdirectories recursively.
-        - Optionally skips files that match size/mtime of remote entries.
-        - Can upload files in parallel using a ThreadPoolExecutor.
-
-        Args:
-            local_path: Local directory to upload from.
-            target_path: Workspace path to upload into.
-            makedirs: Create remote directories as needed.
-            only_if_size_diff: Skip upload if remote file exists with same size and newer mtime.
-            exclude_dir_names: Directory names to skip entirely.
-            exclude_hidden: Skip dot-prefixed files/directories.
-            parallel_pool: None | ThreadPoolExecutor | int (max_workers).
-        """
-        sdk = self.sdk()
-        local_path = os.path.abspath(local_path)
-        exclude_dirs_set = set(exclude_dir_names or [])
-
-        try:
-            existing_objs = list(sdk.workspace.list(target_path))
-        except ResourceDoesNotExist:
-            existing_objs = []
-
-        # --- setup pool semantics ---
-        created_pool: Optional[ThreadPoolExecutor] = None
-        if isinstance(parallel_pool, int):
-            created_pool = ThreadPoolExecutor(max_workers=parallel_pool)
-            pool: Optional[ThreadPoolExecutor] = created_pool
-        elif isinstance(parallel_pool, ThreadPoolExecutor):
-            pool = parallel_pool
-        else:
-            pool = None
-
-        futures = []
-
-        def _upload_dir(local_root: str, remote_root: str, ensure_dir: bool):
-            # Ensure remote directory exists if requested
-            existing_remote_root_obj = [
-                _ for _ in existing_objs
-                if _.path.startswith(remote_root)
-            ]
-
-            if ensure_dir and not existing_remote_root_obj:
-                sdk.workspace.mkdirs(remote_root)
-
-            try:
-                local_entries = list(os.scandir(local_root))
-            except FileNotFoundError:
-                return
-
-            local_files = []
-            local_dirs = []
-
-            for local_entry in local_entries:
-                # Skip hidden if requested
-                if exclude_hidden and local_entry.name.startswith("."):
-                    continue
-
-                if local_entry.is_dir():
-                    if local_entry.name in exclude_dirs_set:
-                        continue
-                    local_dirs.append(local_entry)
-                elif existing_objs:
-                    found_same_remote = None
-                    for exiting_obj in existing_objs:
-                        existing_obj_name = os.path.basename(exiting_obj.path)
-                        if existing_obj_name == local_entry.name:
-                            found_same_remote = exiting_obj
-                            break
-
-                    if found_same_remote:
-                        found_same_remote_epoch = found_same_remote.modified_at / 1000
-                        local_stats = local_entry.stat()
-
-                        if (
-                            only_if_size_diff
-                            and found_same_remote.size
-                            and found_same_remote.size != local_stats.st_size
-                        ):
-                            pass  # size diff -> upload
-                        elif local_stats.st_mtime < found_same_remote_epoch:
-                            # remote is newer -> skip
-                            continue
-                        else:
-                            local_files.append(local_entry)
-                    else:
-                        local_files.append(local_entry)
-                else:
-                    local_files.append(local_entry)
-
-            # ---- upload files in this directory ----
-            for local_entry in local_files:
-                remote_path = posixpath.join(remote_root, local_entry.name)
-
-                entry_fut = self.upload_local_file(
-                    local_path=local_entry.path,
-                    target_path=remote_path,
-                    makedirs=False,
-                    overwrite=True,
-                    only_if_size_diff=False,
-                    parallel_pool=pool,
-                )
-
-                if pool is not None:
-                    futures.append(entry_fut)
-
-            # ---- recurse into subdirectories ----
-            for local_entry in local_dirs:
-                _upload_dir(
-                    local_entry.path,
-                    posixpath.join(remote_root, local_entry.name),
-                    ensure_dir=makedirs,
-                )
-
-        try:
-            _upload_dir(local_path, target_path, ensure_dir=makedirs)
-
-            if pool is not None:
-                for fut in as_completed(futures):
-                    fut.result()
-        finally:
-            if created_pool is not None:
-                created_pool.shutdown(wait=True)
-
     # ------------------------------------------------------------------ #
     # List / open / delete / SQL
     # ------------------------------------------------------------------ #
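This removes `ensure_uc_volume_and_dir` and the whole `upload_file_content` / `upload_local_path` / `upload_local_file` / `upload_local_folder` family; per the file list, the new `path.py`, `io.py`, and `filesytem.py` modules presumably take over this responsibility, though their API is not part of this hunk. For reference, a condensed restatement of the routing the deleted `upload_file_content` performed, using only the SDK calls that appear in the removed code:

```python
import base64
import io


def upload_bytes_legacy(sdk, target_path: str, data: bytes, overwrite: bool = True) -> None:
    """Documents the 0.1.29 behavior removed above; not part of 0.1.31."""
    if target_path.startswith("dbfs:/"):
        # DBFS expects base64-encoded contents via dbfs.put
        sdk.dbfs.put(
            path=target_path,
            contents=base64.b64encode(data).decode("utf-8"),
            overwrite=overwrite,
        )
    elif target_path.startswith("/Volumes"):
        # Unity Catalog Volumes take a binary stream via files.upload
        sdk.files.upload(file_path=target_path, contents=io.BytesIO(data), overwrite=overwrite)
    else:
        # Workspace Files / notebooks via workspace.upload
        from databricks.sdk.service.workspace import ImportFormat

        sdk.workspace.upload(path=target_path, format=ImportFormat.RAW, content=data, overwrite=overwrite)
```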
@@ -834,31 +432,8 @@ class Workspace:
 
         # Workspace path
         fmt = workspace_format or ExportFormat.AUTO
-            return sdk.workspace.download(path=path, format=fmt)
-
-    def delete_path(
-        self,
-        target_path: str,
-        recursive: bool = True,
-        ignore_missing: bool = True,
-    ) -> None:
-        """
-        Delete a path in Databricks Workspace (file or directory).
-
-        - If recursive=True and target_path is a directory, deletes entire tree.
-        - If ignore_missing=True, missing paths won't raise.
-        """
-        sdk = self.sdk()
 
-
-            sdk.workspace.delete(
-                path=target_path,
-                recursive=recursive,
-            )
-        except ResourceDoesNotExist:
-            if ignore_missing:
-                return
-            raise
+        return sdk.workspace.download(path=path, format=fmt)
 
     @staticmethod
     def is_in_databricks_environment():
@@ -895,15 +470,15 @@ class Workspace:
             **kwargs
         )
 
-    def
-
-
-
-
-
+    def clusters(
+        self,
+        cluster_id: Optional[str] = None,
+        cluster_name: Optional[str] = None,
+        **kwargs
+    ) -> "Cluster":
         from ..compute.cluster import Cluster
 
-        return Cluster(workspace=self, **kwargs)
+        return Cluster(workspace=self, cluster_id=cluster_id, cluster_name=cluster_name, **kwargs)
 
 
 # ---------------------------------------------------------------------------
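`clusters()` now takes explicit `cluster_id` / `cluster_name` parameters that are forwarded to `Cluster` along with any remaining kwargs. A short hedged sketch; the name and ID below are placeholders:

```python
cl_by_name = ws.clusters(cluster_name="shared-autoscaling")
cl_by_id = ws.clusters(cluster_id="0123-456789-abcdefgh")
```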
@@ -935,8 +510,17 @@ class WorkspaceService(ABC):
         self.workspace = self.workspace.connect()
         return self
 
-    def
-
+    def dbfs_path(
+        self,
+        parts: Union[List[str], str],
+        kind: Optional[DatabricksPathKind] = None,
+        workspace: Optional["Workspace"] = None
+    ):
+        return self.workspace.dbfs_path(
+            kind=kind,
+            parts=parts,
+            workspace=workspace
+        )
 
     def sdk(self):
         return self.workspace.sdk()
|