ygg 0.1.29__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in their respective public registries; it is provided for informational purposes only.
@@ -1,29 +1,28 @@
- import base64
  import dataclasses
- import io
  import logging
  import os
  import posixpath
  from abc import ABC
- from concurrent.futures import ThreadPoolExecutor, as_completed
  from dataclasses import dataclass
  from pathlib import Path
  from typing import (
      Any,
      BinaryIO,
      Iterator,
-     List,
      Optional,
-     Union
+     Union, TYPE_CHECKING, List
  )

- from .databricks_path import DatabricksPath
+ if TYPE_CHECKING:
+     from ..compute.cluster import Cluster
+
+ from .databricks_path import DatabricksPath, DatabricksPathKind
  from ...libs.databrickslib import require_databricks_sdk, databricks_sdk

  if databricks_sdk is not None:
      from databricks.sdk import WorkspaceClient
      from databricks.sdk.errors import ResourceDoesNotExist, NotFound
-     from databricks.sdk.service.workspace import ImportFormat, ExportFormat, ObjectInfo
+     from databricks.sdk.service.workspace import ExportFormat, ObjectInfo
      from databricks.sdk.service import catalog as catalog_svc
      from databricks.sdk.dbutils import FileInfo
      from databricks.sdk.service.files import DirectoryEntry
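
Note: the Cluster import is now guarded by typing.TYPE_CHECKING, so type checkers still see the name while nothing is imported at runtime (the usual motive for this pattern is avoiding an import cycle or a heavy import). A minimal, generic sketch of the pattern; the decimal module stands in for the real dependency and is not part of ygg:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated only by static type checkers, never at runtime.
        from decimal import Decimal

    def to_number(text: str) -> "Decimal":
        # The string annotation is resolved lazily, so no module-level import is needed.
        from decimal import Decimal
        return Decimal(text)

    print(to_number("1.5"))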
@@ -62,31 +61,8 @@ def _get_env_product_tag():
      v = os.getenv("DATABRICKS_PRODUCT_TAG")

      if not v:
-         return "default"
-
-     return v.strip().lower()
-
-
- def _get_remote_size(sdk, target_path: str) -> Optional[int]:
-     """
-     Best-effort fetch remote file size for target_path across
-     DBFS, Volumes, and Workspace. Returns None if not found.
-     """
-     try:
-         if target_path.startswith("dbfs:/"):
-             st = sdk.dbfs.get_status(target_path)
-             return getattr(st, "file_size", None)
-
-         if target_path.startswith("/Volumes"):
-             st = sdk.files.get_status(file_path=target_path)
-             return getattr(st, "file_size", None)
-
-         # Workspace path
-         st = sdk.workspace.get_status(target_path)
-         return getattr(st, "size", None)
-
-     except ResourceDoesNotExist:
          return None
+     return v.strip().lower()


  @dataclass
@@ -140,9 +116,7 @@ class Workspace:
          state = self.__dict__.copy()
          state.pop("_sdk", None)

-         was_connected = self._sdk is not None
-
-         state["_was_connected"] = was_connected
+         state["_was_connected"] = self._sdk is not None
          state["_cached_token"] = self.current_token()

          return state
@@ -169,8 +143,13 @@ class Workspace:
      # -------------------------
      # Clone
      # -------------------------
-     def clone(self) -> "Workspace":
-         return Workspace().__setstate__(self.__getstate__())
+     def clone(
+         self,
+         **kwargs
+     ) -> "Workspace":
+         state = self.__getstate__()
+         state.update(kwargs)
+         return Workspace().__setstate__(state)

      # -------------------------
      # SDK connection
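
Note: clone() now accepts keyword overrides that are applied to the pickled state before it is restored into a fresh Workspace. A minimal sketch of that clone-with-overrides pattern on a generic class (not the actual Workspace; the token/host fields are made up):

    class Cloneable:
        def __init__(self, token=None, host=None):
            self.token, self.host = token, host

        def __getstate__(self):
            return self.__dict__.copy()

        def __setstate__(self, state):
            self.__dict__.update(state)
            return self  # returning self lets clone() chain the call

        def clone(self, **overrides):
            state = self.__getstate__()
            state.update(overrides)      # same idea as Workspace.clone(**kwargs)
            return type(self)().__setstate__(state)

    a = Cloneable(token="t1", host="h1")
    b = a.clone(token="t2")              # copy of a with one field overridden
    print(b.token, b.host)               # -> t2 h1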
@@ -308,17 +287,30 @@ class Workspace:
      # ------------------------------------------------------------------ #
      # Path helpers
      # ------------------------------------------------------------------ #
-     def path(self, *parts, workspace: Optional["Workspace"] = None, **kwargs):
+     def dbfs_path(
+         self,
+         parts: Union[List[str], str],
+         kind: Optional[DatabricksPathKind] = None,
+         workspace: Optional["Workspace"] = None
+     ):
+         workspace = self if workspace is None else workspace
+
+         if kind is None or isinstance(parts, str):
+             return DatabricksPath.parse(
+                 parts=parts,
+                 workspace=workspace
+             )
+
          return DatabricksPath(
-             *parts,
-             workspace=self if workspace is None else workspace,
-             **kwargs
+             kind=kind,
+             parts=parts,
+             workspace=workspace
          )

-     @staticmethod
      def shared_cache_path(
+         self,
          suffix: Optional[str] = None
-     ) -> str:
+     ) -> DatabricksPath:
          """
          Shared cache base under Volumes for the current user.
          """
@@ -328,31 +320,7 @@ class Workspace:
              return base

          suffix = suffix.lstrip("/")
-         return f"{base}/{suffix}"
-
-     def temp_volume_folder(
-         self,
-         suffix: Optional[str] = None,
-         catalog_name: Optional[str] = None,
-         schema_name: Optional[str] = None,
-         volume_name: Optional[str] = None,
-     ) -> str:
-         """
-         Temporary folder either under a UC Volume or dbfs:/FileStore/.ygg/tmp/<user>.
-         """
-         if volume_name:
-             catalog_name = catalog_name or os.getenv("DATABRICKS_CATALOG_NAME")
-             schema_name = schema_name or os.getenv("DATABRICKS_SCHEMA_NAME")
-
-             base = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}"
-         else:
-             base = f"dbfs:/FileStore/.ygg/tmp/{self.current_user.user_name}"
-
-         if not suffix:
-             return base
-
-         suffix = suffix.lstrip("/")
-         return f"{base}/{suffix}"
+         return self.dbfs_path(f"{base}/{suffix}")

      # ------------------------------------------------------------------ #
      # SDK access / connection
@@ -415,346 +383,6 @@ class Workspace:
          # 4) finally create the directory path itself
          sdk.files.create_directory(target_path)

-     # ------------------------------------------------------------------ #
-     # Upload helpers
-     # ------------------------------------------------------------------ #
-     def upload_file_content(
-         self,
-         content: Union[bytes, BinaryIO],
-         target_path: str,
-         makedirs: bool = True,
-         overwrite: bool = True,
-         only_if_size_diff: bool = False,
-         parallel_pool: Optional[ThreadPoolExecutor] = None,
-     ):
-         """
-         Upload a single content blob into Databricks (Workspace / Volumes / DBFS).
-
-         content:
-             bytes or a binary file-like object.
-
-         target_path:
-             - "dbfs:/..." → DBFS via dbfs.put
-             - "/Volumes/..." → Unity Catalog Volumes via files.upload
-             - anything else → Workspace via workspace.upload
-
-         If parallel_pool is provided, this schedules the upload on the pool
-         and returns a Future. The underlying call is non-parallel (no nested pool).
-
-         If only_if_size_diff=True, it will:
-             - compute local content size (len(bytes))
-             - fetch remote size (best-effort)
-             - skip upload if sizes match.
-         """
-         # If we're doing this in a pool, normalize content to bytes *before*
-         # submitting so we don't share a live file handle across threads.
-         if parallel_pool is not None:
-             if hasattr(content, "read"):
-                 data = content.read()
-             else:
-                 data = content
-
-             # use a cloned workspace so clients don't collide across threads
-             return parallel_pool.submit(
-                 self.clone().upload_file_content,
-                 content=data,
-                 target_path=target_path,
-                 makedirs=makedirs,
-                 overwrite=overwrite,
-                 only_if_size_diff=only_if_size_diff,
-                 parallel_pool=None,
-             )
-
-         with self.connect() as connected:
-             sdk = connected.sdk()
-
-             # Normalize content to bytes once
-             if hasattr(content, "read"):  # BinaryIO
-                 data = content.read()
-             else:
-                 data = content
-
-             if not isinstance(data, (bytes, bytearray)):
-                 if isinstance(data, str):
-                     data = data.encode()
-                 else:
-                     raise TypeError(
-                         f"content must be bytes or BinaryIO, got {type(content)!r}"
-                     )
-
-             data_bytes = bytes(data)
-             local_size = len(data_bytes)
-
-             # Only-if-size-diff: check remote size and bail early if equal
-             if only_if_size_diff:
-                 remote_size = _get_remote_size(sdk, target_path)
-                 if remote_size is not None and remote_size == local_size:
-                     # Same size remotely -> skip upload
-                     return None
-
-             # Ensure parent directory if requested
-             parent = os.path.dirname(target_path)
-
-             if target_path.startswith("dbfs:/"):
-                 # --- DBFS path ---
-                 if makedirs and parent and parent != "dbfs:/":
-                     sdk.dbfs.mkdirs(parent)
-
-                 data_str = base64.b64encode(data_bytes).decode("utf-8")
-                 sdk.dbfs.put(
-                     path=target_path,
-                     contents=data_str,
-                     overwrite=overwrite,
-                 )
-
-             elif target_path.startswith("/Volumes"):
-                 # --- Unity Catalog Volumes path ---
-                 if makedirs and parent and parent != "/":
-                     try:
-                         sdk.files.create_directory(parent)
-                     except NotFound:
-                         connected.ensure_uc_volume_and_dir(parent)
-
-                 sdk.files.upload(
-                     file_path=target_path,
-                     contents=io.BytesIO(data_bytes),
-                     overwrite=overwrite,
-                 )
-
-             else:
-                 # --- Workspace Files / Notebooks ---
-                 if makedirs and parent:
-                     sdk.workspace.mkdirs(parent)
-
-                 sdk.workspace.upload(
-                     path=target_path,
-                     format=ImportFormat.RAW,
-                     content=data_bytes,
-                     overwrite=overwrite,
-                 )
-
-     def upload_local_path(
-         self,
-         local_path: str,
-         target_path: str,
-         makedirs: bool = True,
-         overwrite: bool = True,
-         only_if_size_diff: bool = False,
-         parallel_pool: Optional[ThreadPoolExecutor] = None,
-     ):
-         if os.path.isfile(local_path):
-             return self.upload_local_file(
-                 local_path=local_path,
-                 target_path=target_path,
-                 makedirs=makedirs,
-                 overwrite=overwrite,
-                 only_if_size_diff=only_if_size_diff,
-                 parallel_pool=parallel_pool
-             )
-         else:
-             return self.upload_local_folder(
-                 local_path=local_path,
-                 target_path=target_path,
-                 makedirs=makedirs,
-                 only_if_size_diff=only_if_size_diff,
-                 parallel_pool=parallel_pool
-             )
-
-     def upload_local_file(
-         self,
-         local_path: str,
-         target_path: str,
-         makedirs: bool = True,
-         overwrite: bool = True,
-         only_if_size_diff: bool = False,
-         parallel_pool: Optional[ThreadPoolExecutor] = None,
-     ):
-         """
-         Upload a single local file into Databricks.
-
-         If parallel_pool is provided, this schedules the upload on the pool
-         and returns a Future.
-
-         If only_if_size_diff=True, it will:
-             - For large files (>4 MiB), check remote file status
-             - Skip upload if remote size == local size
-         """
-         if parallel_pool is not None:
-             # Submit a *non-parallel* variant into the pool
-             return parallel_pool.submit(
-                 self.upload_local_file,
-                 local_path=local_path,
-                 target_path=target_path,
-                 makedirs=makedirs,
-                 overwrite=overwrite,
-                 only_if_size_diff=only_if_size_diff,
-                 parallel_pool=None,
-             )
-
-         sdk = self.sdk()
-
-         local_size = os.path.getsize(local_path)
-         large_threshold = 32 * 1024
-
-         if only_if_size_diff and local_size > large_threshold:
-             try:
-                 info = sdk.workspace.get_status(path=target_path)
-                 remote_size = getattr(info, "size", None)
-
-                 if remote_size is not None and remote_size == local_size:
-                     return
-             except ResourceDoesNotExist:
-                 # Doesn't exist → upload below
-                 pass
-
-         with open(local_path, "rb") as f:
-             content = f.read()
-
-         return self.upload_file_content(
-             content=content,
-             target_path=target_path,
-             makedirs=makedirs,
-             overwrite=overwrite,
-             only_if_size_diff=False,
-             parallel_pool=parallel_pool,
-         )
-
-     def upload_local_folder(
-         self,
-         local_path: str,
-         target_path: str,
-         makedirs: bool = True,
-         only_if_size_diff: bool = True,
-         exclude_dir_names: Optional[List[str]] = None,
-         exclude_hidden: bool = True,
-         parallel_pool: Optional[Union[ThreadPoolExecutor, int]] = None,
-     ):
-         """
-         Recursively upload a local folder into Databricks Workspace Files.
-
-         - Traverses subdirectories recursively.
-         - Optionally skips files that match size/mtime of remote entries.
-         - Can upload files in parallel using a ThreadPoolExecutor.
-
-         Args:
-             local_path: Local directory to upload from.
-             target_path: Workspace path to upload into.
-             makedirs: Create remote directories as needed.
-             only_if_size_diff: Skip upload if remote file exists with same size and newer mtime.
-             exclude_dir_names: Directory names to skip entirely.
-             exclude_hidden: Skip dot-prefixed files/directories.
-             parallel_pool: None | ThreadPoolExecutor | int (max_workers).
-         """
-         sdk = self.sdk()
-         local_path = os.path.abspath(local_path)
-         exclude_dirs_set = set(exclude_dir_names or [])
-
-         try:
-             existing_objs = list(sdk.workspace.list(target_path))
-         except ResourceDoesNotExist:
-             existing_objs = []
-
-         # --- setup pool semantics ---
-         created_pool: Optional[ThreadPoolExecutor] = None
-         if isinstance(parallel_pool, int):
-             created_pool = ThreadPoolExecutor(max_workers=parallel_pool)
-             pool: Optional[ThreadPoolExecutor] = created_pool
-         elif isinstance(parallel_pool, ThreadPoolExecutor):
-             pool = parallel_pool
-         else:
-             pool = None
-
-         futures = []
-
-         def _upload_dir(local_root: str, remote_root: str, ensure_dir: bool):
-             # Ensure remote directory exists if requested
-             existing_remote_root_obj = [
-                 _ for _ in existing_objs
-                 if _.path.startswith(remote_root)
-             ]
-
-             if ensure_dir and not existing_remote_root_obj:
-                 sdk.workspace.mkdirs(remote_root)
-
-             try:
-                 local_entries = list(os.scandir(local_root))
-             except FileNotFoundError:
-                 return
-
-             local_files = []
-             local_dirs = []
-
-             for local_entry in local_entries:
-                 # Skip hidden if requested
-                 if exclude_hidden and local_entry.name.startswith("."):
-                     continue
-
-                 if local_entry.is_dir():
-                     if local_entry.name in exclude_dirs_set:
-                         continue
-                     local_dirs.append(local_entry)
-                 elif existing_objs:
-                     found_same_remote = None
-                     for exiting_obj in existing_objs:
-                         existing_obj_name = os.path.basename(exiting_obj.path)
-                         if existing_obj_name == local_entry.name:
-                             found_same_remote = exiting_obj
-                             break
-
-                     if found_same_remote:
-                         found_same_remote_epoch = found_same_remote.modified_at / 1000
-                         local_stats = local_entry.stat()
-
-                         if (
-                             only_if_size_diff
-                             and found_same_remote.size
-                             and found_same_remote.size != local_stats.st_size
-                         ):
-                             pass  # size diff -> upload
-                         elif local_stats.st_mtime < found_same_remote_epoch:
-                             # remote is newer -> skip
-                             continue
-                         else:
-                             local_files.append(local_entry)
-                     else:
-                         local_files.append(local_entry)
-                 else:
-                     local_files.append(local_entry)
-
-             # ---- upload files in this directory ----
-             for local_entry in local_files:
-                 remote_path = posixpath.join(remote_root, local_entry.name)
-
-                 entry_fut = self.upload_local_file(
-                     local_path=local_entry.path,
-                     target_path=remote_path,
-                     makedirs=False,
-                     overwrite=True,
-                     only_if_size_diff=False,
-                     parallel_pool=pool,
-                 )
-
-                 if pool is not None:
-                     futures.append(entry_fut)
-
-             # ---- recurse into subdirectories ----
-             for local_entry in local_dirs:
-                 _upload_dir(
-                     local_entry.path,
-                     posixpath.join(remote_root, local_entry.name),
-                     ensure_dir=makedirs,
-                 )
-
-         try:
-             _upload_dir(local_path, target_path, ensure_dir=makedirs)
-
-             if pool is not None:
-                 for fut in as_completed(futures):
-                     fut.result()
-         finally:
-             if created_pool is not None:
-                 created_pool.shutdown(wait=True)
-
      # ------------------------------------------------------------------ #
      # List / open / delete / SQL
      # ------------------------------------------------------------------ #
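
Note: the upload_file_content / upload_local_path / upload_local_file / upload_local_folder helpers are removed in 0.1.30, together with the base64, io and ThreadPoolExecutor imports that backed them. Code that relied on them can call the Databricks SDK directly, as the removed helper did internally; a minimal sketch for a Unity Catalog Volumes target, assuming standard SDK authentication and a hypothetical path:

    import io
    from databricks.sdk import WorkspaceClient

    w = WorkspaceClient()  # picks up the usual Databricks auth (env vars / config profile)

    data = b"hello world"
    w.files.upload(
        file_path="/Volumes/main/default/tmp/hello.txt",  # hypothetical UC Volume path
        contents=io.BytesIO(data),
        overwrite=True,
    )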
@@ -895,15 +523,15 @@ class Workspace:
              **kwargs
          )

-     def cluster(self, **kwargs):
-         from ..compute.cluster import Cluster
-
-         return Cluster(workspace=self, **kwargs)
-
-     def clusters(self, **kwargs):
+     def clusters(
+         self,
+         cluster_id: Optional[str] = None,
+         cluster_name: Optional[str] = None,
+         **kwargs
+     ) -> "Cluster":
          from ..compute.cluster import Cluster

-         return Cluster(workspace=self, **kwargs)
+         return Cluster(workspace=self, cluster_id=cluster_id, cluster_name=cluster_name, **kwargs)


  # ---------------------------------------------------------------------------
@@ -935,8 +563,17 @@ class WorkspaceService(ABC):
          self.workspace = self.workspace.connect()
          return self

-     def path(self, *parts, workspace: Optional["Workspace"] = None, **kwargs):
-         return self.workspace.path(*parts, workspace=workspace, **kwargs)
+     def dbfs_path(
+         self,
+         parts: Union[List[str], str],
+         kind: Optional[DatabricksPathKind] = None,
+         workspace: Optional["Workspace"] = None
+     ):
+         return self.workspace.dbfs_path(
+             kind=kind,
+             parts=parts,
+             workspace=workspace
+         )

      def sdk(self):
          return self.workspace.sdk()
@@ -423,6 +423,7 @@ class CallableSerde:
          Prints one line: "{result_tag}:{base64(blob)}"
          where blob is raw dill bytes or framed+zlib.
          """
+         args = args or ()
          kwargs = kwargs or {}

          serde_dict = self.dump(
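
Note: the added args = args or () mirrors the existing kwargs = kwargs or {} guard, so callers may pass None for either argument. A tiny standalone illustration of the normalization:

    def call_with(fn, args=None, kwargs=None):
        # Treat None as "no arguments" before unpacking.
        args = args or ()
        kwargs = kwargs or {}
        return fn(*args, **kwargs)

    print(call_with(len, args=(["a", "b"],)))   # -> 2
    print(call_with(dict))                      # -> {}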
@@ -236,7 +236,7 @@ class PipIndexSettings:
      @property
      def extra_index_url(self):
          if self.extra_index_urls:
-             return self.extra_index_urls[0]
+             return " ".join(self.extra_index_urls)
          return None

      def as_dict(self) -> dict:
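
Note: extra_index_url previously returned only the first configured URL; it now space-joins all of them, which is the space-separated form pip accepts in environment variables such as PIP_EXTRA_INDEX_URL (whether that is the intended consumer is an assumption, not stated in the diff). A standalone sketch of the new behaviour:

    from dataclasses import dataclass, field
    from typing import List, Optional

    @dataclass
    class IndexSettings:  # simplified stand-in for PipIndexSettings
        extra_index_urls: List[str] = field(default_factory=list)

        @property
        def extra_index_url(self) -> Optional[str]:
            if self.extra_index_urls:
                return " ".join(self.extra_index_urls)   # 0.1.30: all URLs, space-joined
            return None

    s = IndexSettings(["https://a.example/simple", "https://b.example/simple"])
    print(s.extra_index_url)   # -> https://a.example/simple https://b.example/simple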