ygg 0.1.29__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,29 +1,28 @@
- import base64
  import dataclasses
- import io
  import logging
  import os
  import posixpath
  from abc import ABC
- from concurrent.futures import ThreadPoolExecutor, as_completed
  from dataclasses import dataclass
  from pathlib import Path
  from typing import (
  Any,
  BinaryIO,
  Iterator,
- List,
  Optional,
- Union
+ Union, TYPE_CHECKING, List
  )

- from .databricks_path import DatabricksPath
+ if TYPE_CHECKING:
+ from ..compute.cluster import Cluster
+
+ from .path import DatabricksPath, DatabricksPathKind
  from ...libs.databrickslib import require_databricks_sdk, databricks_sdk

  if databricks_sdk is not None:
  from databricks.sdk import WorkspaceClient
  from databricks.sdk.errors import ResourceDoesNotExist, NotFound
- from databricks.sdk.service.workspace import ImportFormat, ExportFormat, ObjectInfo
+ from databricks.sdk.service.workspace import ExportFormat, ObjectInfo
  from databricks.sdk.service import catalog as catalog_svc
  from databricks.sdk.dbutils import FileInfo
  from databricks.sdk.service.files import DirectoryEntry
@@ -62,31 +61,8 @@ def _get_env_product_tag():
  v = os.getenv("DATABRICKS_PRODUCT_TAG")

  if not v:
- return "default"
-
- return v.strip().lower()
-
-
- def _get_remote_size(sdk, target_path: str) -> Optional[int]:
- """
- Best-effort fetch remote file size for target_path across
- DBFS, Volumes, and Workspace. Returns None if not found.
- """
- try:
- if target_path.startswith("dbfs:/"):
- st = sdk.dbfs.get_status(target_path)
- return getattr(st, "file_size", None)
-
- if target_path.startswith("/Volumes"):
- st = sdk.files.get_status(file_path=target_path)
- return getattr(st, "file_size", None)
-
- # Workspace path
- st = sdk.workspace.get_status(target_path)
- return getattr(st, "size", None)
-
- except ResourceDoesNotExist:
  return None
+ return v.strip().lower()


  @dataclass
@@ -140,9 +116,7 @@ class Workspace:
  state = self.__dict__.copy()
  state.pop("_sdk", None)

- was_connected = self._sdk is not None
-
- state["_was_connected"] = was_connected
+ state["_was_connected"] = self._sdk is not None
  state["_cached_token"] = self.current_token()

  return state
@@ -159,102 +133,117 @@ class Workspace:

  def __enter__(self) -> "Workspace":
  self._was_connected = self._sdk is not None
- self.connect()
- return self
+ return self.connect()

  def __exit__(self, exc_type, exc_val, exc_tb) -> None:
  if not self._was_connected:
  self.close()

+ def __del__(self):
+ self.close()
+
  # -------------------------
  # Clone
  # -------------------------
- def clone(self) -> "Workspace":
- return Workspace().__setstate__(self.__getstate__())
+ def clone_instance(
+ self,
+ **kwargs
+ ) -> "Workspace":
+ state = self.__getstate__()
+ state.update(kwargs)
+ return Workspace().__setstate__(state)

  # -------------------------
  # SDK connection
  # -------------------------
- def connect(self, reset: bool = False) -> "Workspace":
+ @property
+ def connected(self):
+ return self._sdk is not None
+
+ def connect(self, reset: bool = False, clone: bool = False) -> "Workspace":
  if reset:
  self._sdk = None

- if self._sdk is None:
- require_databricks_sdk()
- logger.debug("Connecting %s", self)
-
- # Build Config from config_dict if available, else from fields.
- kwargs = {
- "host": self.host,
- "account_id": self.account_id,
- "token": self.token,
- "client_id": self.client_id,
- "client_secret": self.client_secret,
- "token_audience": self.token_audience,
- "azure_workspace_resource_id": self.azure_workspace_resource_id,
- "azure_use_msi": self.azure_use_msi,
- "azure_client_secret": self.azure_client_secret,
- "azure_client_id": self.azure_client_id,
- "azure_tenant_id": self.azure_tenant_id,
- "azure_environment": self.azure_environment,
- "google_credentials": self.google_credentials,
- "google_service_account": self.google_service_account,
- "profile": self.profile,
- "config_file": self.config_file,
- "auth_type": self.auth_type,
- "http_timeout_seconds": self.http_timeout_seconds,
- "retry_timeout_seconds": self.retry_timeout_seconds,
- "debug_truncate_bytes": self.debug_truncate_bytes,
- "debug_headers": self.debug_headers,
- "rate_limit": self.rate_limit,
- "product": self.product,
- "product_version": self.product_version,
- }
-
- build_kwargs = {k: v for k, v in kwargs.items() if v is not None}
+ if self._sdk is not None:
+ return self
+
+ instance = self.clone_instance() if clone else self
+
+ require_databricks_sdk()
+ logger.debug("Connecting %s", self)
+
+ # Build Config from config_dict if available, else from fields.
+ kwargs = {
+ "host": instance.host,
+ "account_id": instance.account_id,
+ "token": instance.token,
+ "client_id": instance.client_id,
+ "client_secret": instance.client_secret,
+ "token_audience": instance.token_audience,
+ "azure_workspace_resource_id": instance.azure_workspace_resource_id,
+ "azure_use_msi": instance.azure_use_msi,
+ "azure_client_secret": instance.azure_client_secret,
+ "azure_client_id": instance.azure_client_id,
+ "azure_tenant_id": instance.azure_tenant_id,
+ "azure_environment": instance.azure_environment,
+ "google_credentials": instance.google_credentials,
+ "google_service_account": instance.google_service_account,
+ "profile": instance.profile,
+ "config_file": instance.config_file,
+ "auth_type": instance.auth_type,
+ "http_timeout_seconds": instance.http_timeout_seconds,
+ "retry_timeout_seconds": instance.retry_timeout_seconds,
+ "debug_truncate_bytes": instance.debug_truncate_bytes,
+ "debug_headers": instance.debug_headers,
+ "rate_limit": instance.rate_limit,
+ "product": instance.product,
+ "product_version": instance.product_version,
+ }

- try:
- self._sdk = WorkspaceClient(**build_kwargs)
- except ValueError as e:
- if "cannot configure default credentials" in str(e) and self.auth_type is None:
- last_error = e
+ build_kwargs = {k: v for k, v in kwargs.items() if v is not None}

- auth_types = ["runtime"] if self.is_in_databricks_environment() else ["external-browser"]
+ try:
+ instance._sdk = WorkspaceClient(**build_kwargs)
+ except ValueError as e:
+ if "cannot configure default credentials" in str(e) and instance.auth_type is None:
+ last_error = e

- for auth_type in auth_types:
- build_kwargs["auth_type"] = auth_type
+ auth_types = ["runtime"] if instance.is_in_databricks_environment() else ["external-browser"]
+
+ for auth_type in auth_types:
+ build_kwargs["auth_type"] = auth_type
+
+ try:
+ instance._sdk = WorkspaceClient(**build_kwargs)
+ break
+ except Exception as se:
+ last_error = se
+ build_kwargs.pop("auth_type")
+
+ if instance._sdk is None:
+ if instance.is_in_databricks_environment() and instance._cached_token:
+ build_kwargs["token"] = instance._cached_token

  try:
- self._sdk = WorkspaceClient(**build_kwargs)
- break
+ instance._sdk = WorkspaceClient(**build_kwargs)
  except Exception as se:
  last_error = se
- build_kwargs.pop("auth_type")
-
- if self._sdk is None:
- if self.is_in_databricks_environment() and self._cached_token:
- build_kwargs["token"] = self._cached_token

- try:
- self._sdk = WorkspaceClient(**build_kwargs)
- except Exception as se:
- last_error = se
-
- if self._sdk is None:
- raise last_error
- else:
- raise e
+ if instance._sdk is None:
+ raise last_error
+ else:
+ raise e

- # backfill resolved config values
- for key in list(kwargs.keys()):
- if getattr(self, key, None) is None:
- v = getattr(self._sdk.config, key, None)
- if v is not None:
- setattr(self, key, v)
+ # backfill resolved config values
+ for key in list(kwargs.keys()):
+ if getattr(instance, key, None) is None:
+ v = getattr(instance._sdk.config, key, None)
+ if v is not None:
+ setattr(instance, key, v)

- logger.info("Connected %s", self)
+ logger.info("Connected %s", instance)

- return self
+ return instance

  # ------------------------------------------------------------------ #
  # Context manager + lifecycle
@@ -308,17 +297,44 @@ class Workspace:
  # ------------------------------------------------------------------ #
  # Path helpers
  # ------------------------------------------------------------------ #
- def path(self, *parts, workspace: Optional["Workspace"] = None, **kwargs):
+ def filesytem(
+ self,
+ workspace: Optional["Workspace"] = None,
+ ):
+ from .filesytem import DatabricksFileSystem, DatabricksFileSystemHandler
+
+ handler = DatabricksFileSystemHandler(
+ workspace=self if workspace is None else workspace
+ )
+
+ return DatabricksFileSystem(
+ handler=handler
+ )
+
+ def dbfs_path(
+ self,
+ parts: Union[List[str], str],
+ kind: Optional[DatabricksPathKind] = None,
+ workspace: Optional["Workspace"] = None
+ ):
+ workspace = self if workspace is None else workspace
+
+ if kind is None or isinstance(parts, str):
+ return DatabricksPath.parse(
+ obj=parts,
+ workspace=workspace
+ )
+
  return DatabricksPath(
- *parts,
- workspace=self if workspace is None else workspace,
- **kwargs
+ kind=kind,
+ parts=parts,
+ _workspace=workspace
  )

- @staticmethod
  def shared_cache_path(
+ self,
  suffix: Optional[str] = None
- ) -> str:
+ ) -> DatabricksPath:
  """
  Shared cache base under Volumes for the current user.
  """
@@ -328,31 +344,7 @@ class Workspace:
  return base

  suffix = suffix.lstrip("/")
- return f"{base}/{suffix}"
-
- def temp_volume_folder(
- self,
- suffix: Optional[str] = None,
- catalog_name: Optional[str] = None,
- schema_name: Optional[str] = None,
- volume_name: Optional[str] = None,
- ) -> str:
- """
- Temporary folder either under a UC Volume or dbfs:/FileStore/.ygg/tmp/<user>.
- """
- if volume_name:
- catalog_name = catalog_name or os.getenv("DATABRICKS_CATALOG_NAME")
- schema_name = schema_name or os.getenv("DATABRICKS_SCHEMA_NAME")
-
- base = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}"
- else:
- base = f"dbfs:/FileStore/.ygg/tmp/{self.current_user.user_name}"
-
- if not suffix:
- return base
-
- suffix = suffix.lstrip("/")
- return f"{base}/{suffix}"
+ return self.dbfs_path(f"{base}/{suffix}")

  # ------------------------------------------------------------------ #
  # SDK access / connection
@@ -361,400 +353,6 @@ class Workspace:
  def sdk(self) -> "WorkspaceClient":
  return self.connect()._sdk

- # ------------------------------------------------------------------ #
- # UC volume + directory management
- # ------------------------------------------------------------------ #
-
- def ensure_uc_volume_and_dir(
- self,
- target_path: str,
- ) -> None:
- """
- Ensure catalog, schema, volume exist for a UC volume path
- like /Volumes/<catalog>/<schema>/<volume>/...,
- then create the directory.
- """
- sdk = self.sdk()
- parts = target_path.split("/")
-
- # basic sanity check
- if len(parts) < 5 or parts[1] != "Volumes":
- raise ValueError(
- f"Unexpected UC volume path: {target_path!r}. "
- "Expected /Volumes/<catalog>/<schema>/<volume>/..."
- )
-
- # /Volumes/<catalog>/<schema>/<volume>/...
- _, _, catalog_name, schema_name, volume_name, *subpath = parts
-
- # 1) ensure catalog
- try:
- sdk.catalogs.get(name=catalog_name)
- except NotFound:
- sdk.catalogs.create(name=catalog_name)
-
- # 2) ensure schema
- schema_full_name = f"{catalog_name}.{schema_name}"
- try:
- sdk.schemas.get(full_name=schema_full_name)
- except NotFound:
- sdk.schemas.create(name=schema_name, catalog_name=catalog_name)
-
- # 3) ensure volume (managed volume is simplest)
- volume_full_name = f"{catalog_name}.{schema_name}.{volume_name}"
- try:
- sdk.volumes.read(name=volume_full_name)
- except NotFound:
- sdk.volumes.create(
- catalog_name=catalog_name,
- schema_name=schema_name,
- name=volume_name,
- volume_type=catalog_svc.VolumeType.MANAGED,
- )
-
- # 4) finally create the directory path itself
- sdk.files.create_directory(target_path)
-
- # ------------------------------------------------------------------ #
- # Upload helpers
- # ------------------------------------------------------------------ #
- def upload_file_content(
- self,
- content: Union[bytes, BinaryIO],
- target_path: str,
- makedirs: bool = True,
- overwrite: bool = True,
- only_if_size_diff: bool = False,
- parallel_pool: Optional[ThreadPoolExecutor] = None,
- ):
- """
- Upload a single content blob into Databricks (Workspace / Volumes / DBFS).
-
- content:
- bytes or a binary file-like object.
-
- target_path:
- - "dbfs:/..." → DBFS via dbfs.put
- - "/Volumes/..." → Unity Catalog Volumes via files.upload
- - anything else → Workspace via workspace.upload
-
- If parallel_pool is provided, this schedules the upload on the pool
- and returns a Future. The underlying call is non-parallel (no nested pool).
-
- If only_if_size_diff=True, it will:
- - compute local content size (len(bytes))
- - fetch remote size (best-effort)
- - skip upload if sizes match.
- """
- # If we're doing this in a pool, normalize content to bytes *before*
- # submitting so we don't share a live file handle across threads.
- if parallel_pool is not None:
- if hasattr(content, "read"):
- data = content.read()
- else:
- data = content
-
- # use a cloned workspace so clients don't collide across threads
- return parallel_pool.submit(
- self.clone().upload_file_content,
- content=data,
- target_path=target_path,
- makedirs=makedirs,
- overwrite=overwrite,
- only_if_size_diff=only_if_size_diff,
- parallel_pool=None,
- )
-
- with self.connect() as connected:
- sdk = connected.sdk()
-
- # Normalize content to bytes once
- if hasattr(content, "read"): # BinaryIO
- data = content.read()
- else:
- data = content
-
- if not isinstance(data, (bytes, bytearray)):
- if isinstance(data, str):
- data = data.encode()
- else:
- raise TypeError(
- f"content must be bytes or BinaryIO, got {type(content)!r}"
- )
-
- data_bytes = bytes(data)
- local_size = len(data_bytes)
-
- # Only-if-size-diff: check remote size and bail early if equal
- if only_if_size_diff:
- remote_size = _get_remote_size(sdk, target_path)
- if remote_size is not None and remote_size == local_size:
- # Same size remotely -> skip upload
- return None
-
- # Ensure parent directory if requested
- parent = os.path.dirname(target_path)
-
- if target_path.startswith("dbfs:/"):
- # --- DBFS path ---
- if makedirs and parent and parent != "dbfs:/":
- sdk.dbfs.mkdirs(parent)
-
- data_str = base64.b64encode(data_bytes).decode("utf-8")
- sdk.dbfs.put(
- path=target_path,
- contents=data_str,
- overwrite=overwrite,
- )
-
- elif target_path.startswith("/Volumes"):
- # --- Unity Catalog Volumes path ---
- if makedirs and parent and parent != "/":
- try:
- sdk.files.create_directory(parent)
- except NotFound:
- connected.ensure_uc_volume_and_dir(parent)
-
- sdk.files.upload(
- file_path=target_path,
- contents=io.BytesIO(data_bytes),
- overwrite=overwrite,
- )
-
- else:
- # --- Workspace Files / Notebooks ---
- if makedirs and parent:
- sdk.workspace.mkdirs(parent)
-
- sdk.workspace.upload(
- path=target_path,
- format=ImportFormat.RAW,
- content=data_bytes,
- overwrite=overwrite,
- )
-
- def upload_local_path(
- self,
- local_path: str,
- target_path: str,
- makedirs: bool = True,
- overwrite: bool = True,
- only_if_size_diff: bool = False,
- parallel_pool: Optional[ThreadPoolExecutor] = None,
- ):
- if os.path.isfile(local_path):
- return self.upload_local_file(
- local_path=local_path,
- target_path=target_path,
- makedirs=makedirs,
- overwrite=overwrite,
- only_if_size_diff=only_if_size_diff,
- parallel_pool=parallel_pool
- )
- else:
- return self.upload_local_folder(
- local_path=local_path,
- target_path=target_path,
- makedirs=makedirs,
- only_if_size_diff=only_if_size_diff,
- parallel_pool=parallel_pool
- )
-
- def upload_local_file(
- self,
- local_path: str,
- target_path: str,
- makedirs: bool = True,
- overwrite: bool = True,
- only_if_size_diff: bool = False,
- parallel_pool: Optional[ThreadPoolExecutor] = None,
- ):
- """
- Upload a single local file into Databricks.
-
- If parallel_pool is provided, this schedules the upload on the pool
- and returns a Future.
-
- If only_if_size_diff=True, it will:
- - For large files (>4 MiB), check remote file status
- - Skip upload if remote size == local size
- """
- if parallel_pool is not None:
- # Submit a *non-parallel* variant into the pool
- return parallel_pool.submit(
- self.upload_local_file,
- local_path=local_path,
- target_path=target_path,
- makedirs=makedirs,
- overwrite=overwrite,
- only_if_size_diff=only_if_size_diff,
- parallel_pool=None,
- )
-
- sdk = self.sdk()
-
- local_size = os.path.getsize(local_path)
- large_threshold = 32 * 1024
-
- if only_if_size_diff and local_size > large_threshold:
- try:
- info = sdk.workspace.get_status(path=target_path)
- remote_size = getattr(info, "size", None)
-
- if remote_size is not None and remote_size == local_size:
- return
- except ResourceDoesNotExist:
- # Doesn't exist → upload below
- pass
-
- with open(local_path, "rb") as f:
- content = f.read()
-
- return self.upload_file_content(
- content=content,
- target_path=target_path,
- makedirs=makedirs,
- overwrite=overwrite,
- only_if_size_diff=False,
- parallel_pool=parallel_pool,
- )
-
- def upload_local_folder(
- self,
- local_path: str,
- target_path: str,
- makedirs: bool = True,
- only_if_size_diff: bool = True,
- exclude_dir_names: Optional[List[str]] = None,
- exclude_hidden: bool = True,
- parallel_pool: Optional[Union[ThreadPoolExecutor, int]] = None,
- ):
- """
- Recursively upload a local folder into Databricks Workspace Files.
-
- - Traverses subdirectories recursively.
- - Optionally skips files that match size/mtime of remote entries.
- - Can upload files in parallel using a ThreadPoolExecutor.
-
- Args:
- local_path: Local directory to upload from.
- target_path: Workspace path to upload into.
- makedirs: Create remote directories as needed.
- only_if_size_diff: Skip upload if remote file exists with same size and newer mtime.
- exclude_dir_names: Directory names to skip entirely.
- exclude_hidden: Skip dot-prefixed files/directories.
- parallel_pool: None | ThreadPoolExecutor | int (max_workers).
- """
- sdk = self.sdk()
- local_path = os.path.abspath(local_path)
- exclude_dirs_set = set(exclude_dir_names or [])
-
- try:
- existing_objs = list(sdk.workspace.list(target_path))
- except ResourceDoesNotExist:
- existing_objs = []
-
- # --- setup pool semantics ---
- created_pool: Optional[ThreadPoolExecutor] = None
- if isinstance(parallel_pool, int):
- created_pool = ThreadPoolExecutor(max_workers=parallel_pool)
- pool: Optional[ThreadPoolExecutor] = created_pool
- elif isinstance(parallel_pool, ThreadPoolExecutor):
- pool = parallel_pool
- else:
- pool = None
-
- futures = []
-
- def _upload_dir(local_root: str, remote_root: str, ensure_dir: bool):
- # Ensure remote directory exists if requested
- existing_remote_root_obj = [
- _ for _ in existing_objs
- if _.path.startswith(remote_root)
- ]
-
- if ensure_dir and not existing_remote_root_obj:
- sdk.workspace.mkdirs(remote_root)
-
- try:
- local_entries = list(os.scandir(local_root))
- except FileNotFoundError:
- return
-
- local_files = []
- local_dirs = []
-
- for local_entry in local_entries:
- # Skip hidden if requested
- if exclude_hidden and local_entry.name.startswith("."):
- continue
-
- if local_entry.is_dir():
- if local_entry.name in exclude_dirs_set:
- continue
- local_dirs.append(local_entry)
- elif existing_objs:
- found_same_remote = None
- for exiting_obj in existing_objs:
- existing_obj_name = os.path.basename(exiting_obj.path)
- if existing_obj_name == local_entry.name:
- found_same_remote = exiting_obj
- break
-
- if found_same_remote:
- found_same_remote_epoch = found_same_remote.modified_at / 1000
- local_stats = local_entry.stat()
-
- if (
- only_if_size_diff
- and found_same_remote.size
- and found_same_remote.size != local_stats.st_size
- ):
- pass # size diff -> upload
- elif local_stats.st_mtime < found_same_remote_epoch:
- # remote is newer -> skip
- continue
- else:
- local_files.append(local_entry)
- else:
- local_files.append(local_entry)
- else:
- local_files.append(local_entry)
-
- # ---- upload files in this directory ----
- for local_entry in local_files:
- remote_path = posixpath.join(remote_root, local_entry.name)
-
- entry_fut = self.upload_local_file(
- local_path=local_entry.path,
- target_path=remote_path,
- makedirs=False,
- overwrite=True,
- only_if_size_diff=False,
- parallel_pool=pool,
- )
-
- if pool is not None:
- futures.append(entry_fut)
-
- # ---- recurse into subdirectories ----
- for local_entry in local_dirs:
- _upload_dir(
- local_entry.path,
- posixpath.join(remote_root, local_entry.name),
- ensure_dir=makedirs,
- )
-
- try:
- _upload_dir(local_path, target_path, ensure_dir=makedirs)
-
- if pool is not None:
- for fut in as_completed(futures):
- fut.result()
- finally:
- if created_pool is not None:
- created_pool.shutdown(wait=True)
-
  # ------------------------------------------------------------------ #
  # List / open / delete / SQL
  # ------------------------------------------------------------------ #
@@ -834,31 +432,8 @@ class Workspace:

  # Workspace path
  fmt = workspace_format or ExportFormat.AUTO
- return sdk.workspace.download(path=path, format=fmt)
-
- def delete_path(
- self,
- target_path: str,
- recursive: bool = True,
- ignore_missing: bool = True,
- ) -> None:
- """
- Delete a path in Databricks Workspace (file or directory).
-
- - If recursive=True and target_path is a directory, deletes entire tree.
- - If ignore_missing=True, missing paths won't raise.
- """
- sdk = self.sdk()

- try:
- sdk.workspace.delete(
- path=target_path,
- recursive=recursive,
- )
- except ResourceDoesNotExist:
- if ignore_missing:
- return
- raise
+ return sdk.workspace.download(path=path, format=fmt)

  @staticmethod
  def is_in_databricks_environment():
@@ -895,15 +470,15 @@ class Workspace:
  **kwargs
  )

- def cluster(self, **kwargs):
- from ..compute.cluster import Cluster
-
- return Cluster(workspace=self, **kwargs)
-
- def clusters(self, **kwargs):
+ def clusters(
+ self,
+ cluster_id: Optional[str] = None,
+ cluster_name: Optional[str] = None,
+ **kwargs
+ ) -> "Cluster":
  from ..compute.cluster import Cluster

- return Cluster(workspace=self, **kwargs)
+ return Cluster(workspace=self, cluster_id=cluster_id, cluster_name=cluster_name, **kwargs)


  # ---------------------------------------------------------------------------
@@ -935,8 +510,17 @@ class WorkspaceService(ABC):
  self.workspace = self.workspace.connect()
  return self

- def path(self, *parts, workspace: Optional["Workspace"] = None, **kwargs):
- return self.workspace.path(*parts, workspace=workspace, **kwargs)
+ def dbfs_path(
+ self,
+ parts: Union[List[str], str],
+ kind: Optional[DatabricksPathKind] = None,
+ workspace: Optional["Workspace"] = None
+ ):
+ return self.workspace.dbfs_path(
+ kind=kind,
+ parts=parts,
+ workspace=workspace
+ )

  def sdk(self):
  return self.workspace.sdk()
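
For orientation, a minimal usage sketch of the API surface after this change, inferred only from the signatures visible in the diff above; the import path, profile name, and the literal paths are assumptions, not part of the package's documented interface.

    # Hypothetical example; module path and argument values are assumed.
    from ygg import Workspace  # adjust to wherever Workspace is actually exported

    ws = Workspace(profile="DEFAULT")

    # connect() now returns the connected instance; clone=True connects a
    # clone_instance() copy instead of mutating the original object.
    connected = ws.connect(clone=True)
    assert connected.connected  # new read-only property

    # path() has been replaced by dbfs_path(); a plain string is routed through
    # DatabricksPath.parse(), and shared_cache_path() now returns a DatabricksPath.
    p = connected.dbfs_path("/Volumes/main/default/cache/data.txt")
    cache = connected.shared_cache_path("models")

    # clusters() now accepts cluster_id / cluster_name and returns a Cluster handle.
    cluster = connected.clusters(cluster_name="my-cluster")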