ygg 0.1.48__py3-none-any.whl → 0.1.50__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ygg
- Version: 0.1.48
+ Version: 0.1.50
  Summary: Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks
  Author: Yggdrasil contributors
  License: Apache License
@@ -235,6 +235,8 @@ Requires-Dist: pytest-asyncio; extra == "dev"
  Requires-Dist: black; extra == "dev"
  Requires-Dist: ruff; extra == "dev"
  Requires-Dist: mypy; extra == "dev"
+ Requires-Dist: build; extra == "dev"
+ Requires-Dist: twine; extra == "dev"
  Dynamic: license-file

  # Yggdrasil (Python)
@@ -1,31 +1,31 @@
- ygg-0.1.48.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+ ygg-0.1.50.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
  yggdrasil/__init__.py,sha256=4-ghPak2S6zfMqmnlxW2GCgPb5s79znpKa2hGEGXcE4,24
- yggdrasil/version.py,sha256=GL56LdSW6fsXlq5LHiGjsIVgyhxVQeeDdO3Sd6nzZYc,22
+ yggdrasil/version.py,sha256=pMWaMbj0sqJPaN27zeKuthOtJ3nuofEVeTxWuJmKhTw,22
  yggdrasil/databricks/__init__.py,sha256=skctY2c8W-hI81upx9F_PWRe5ishL3hrdiTuizgDjdw,152
  yggdrasil/databricks/compute/__init__.py,sha256=NvdzmaJSNYY1uJthv1hHdBuNu3bD_-Z65DWnaJt9yXg,289
- yggdrasil/databricks/compute/cluster.py,sha256=0QjYHlaXSMgYqzMRy1Jypm2j7xoGRkPdwURZsQn_73U,43228
- yggdrasil/databricks/compute/execution_context.py,sha256=anOxfNms83dZ5FTknbfT8uj889LjheMqEx9W5NtJC9E,23094
- yggdrasil/databricks/compute/remote.py,sha256=nEN_Fr1Ouul_iKOf4B5QjEGscYAcl7nHjGsl2toRzrU,2874
+ yggdrasil/databricks/compute/cluster.py,sha256=YomLfvB0oxbgl6WDgBRxI1UXsxwlEbR6gq3FUbPHscY,44199
+ yggdrasil/databricks/compute/execution_context.py,sha256=jIV6uru2NeX3O5lg-3KEqmXtLxxq45CFgkBQgQIIOHQ,23327
+ yggdrasil/databricks/compute/remote.py,sha256=yicEhyQypssRa2ByscO36s3cBkEgORFsRME9aaq91Pc,3045
  yggdrasil/databricks/jobs/__init__.py,sha256=snxGSJb0M5I39v0y3IR-uEeSlZR248cQ_4DJ1sYs-h8,154
  yggdrasil/databricks/jobs/config.py,sha256=9LGeHD04hbfy0xt8_6oobC4moKJh4_DTjZiK4Q2Tqjk,11557
  yggdrasil/databricks/sql/__init__.py,sha256=Vp_1cFaX1l-JGzCknvkbiB8CBFX2fQbBNntIeVn3lEg,231
  yggdrasil/databricks/sql/engine.py,sha256=K5WmGKpXU78JA3UdK8dLxBD_GXKidZJFe7hytuC5UHg,41029
  yggdrasil/databricks/sql/exceptions.py,sha256=uC-BoG0u0LtORKUS1X3iLID8nc-0TV5MQN3M8RXHsO4,1495
- yggdrasil/databricks/sql/statement_result.py,sha256=kMBvpwyRv3_JUZSvxMS0c9Vqlh6LtCRJvXsDpu9RIAs,16137
+ yggdrasil/databricks/sql/statement_result.py,sha256=GZyVhhrUK5opNo-8HGqsMx0Rp9fa_0zqvn8McSHPQ8U,16310
  yggdrasil/databricks/sql/types.py,sha256=5G-BM9_eOsRKEMzeDTWUsWW5g4Idvs-czVCpOCrMhdA,6412
  yggdrasil/databricks/sql/warehouse.py,sha256=1J0dyQLJb-OS1_1xU1eAVZ4CoL2-FhFeowKSvU3RzFc,9773
  yggdrasil/databricks/workspaces/__init__.py,sha256=dv2zotoFVhNFlTCdRq6gwf5bEzeZkOZszoNZMs0k59g,114
  yggdrasil/databricks/workspaces/filesytem.py,sha256=Z8JXU7_XUEbw9fpTQT1avRQKi-IAP2KemXBMPkUoY4w,9805
- yggdrasil/databricks/workspaces/io.py,sha256=Tdde4LaGNJNT50R11OkEYZyNacyIW9QrOXMAicAlIr4,32208
- yggdrasil/databricks/workspaces/path.py,sha256=-XnCD9p42who3DAwnITVE1KyrZUSoXDKHA8iZi-7wk4,47743
+ yggdrasil/databricks/workspaces/io.py,sha256=D-B31roMGEJesAtUWl-O30lZJfgo-oFdK6KExzFc13I,33260
+ yggdrasil/databricks/workspaces/path.py,sha256=BAzaxEL2mWJ_6EnETnQdsPj06zkrbTO2f3reruR439k,49265
  yggdrasil/databricks/workspaces/path_kind.py,sha256=Xc319NysH8_6E9C0Q8nCxDHYG07_SnzyUVKHe0dNdDQ,305
- yggdrasil/databricks/workspaces/workspace.py,sha256=c6CBBun2BskEnsP74pbLVOe_TKXZs4L4r4gPQtIzlQE,23821
+ yggdrasil/databricks/workspaces/workspace.py,sha256=zBlQdYNT_xKwUCYo3O4Q4g-8pfMvff3I26efyCfY_TY,24961
  yggdrasil/dataclasses/__init__.py,sha256=_RkhfF3KC1eSORby1dzvBXQ0-UGG3u6wyUQWX2jq1Pc,108
  yggdrasil/dataclasses/dataclass.py,sha256=LxrCjwvmBnb8yRI_N-c31RHHxB4XoJPixmKg9iBIuaI,1148
  yggdrasil/libs/__init__.py,sha256=zdC9OU0Xy36CLY9mg2drxN6S7isPR8aTLzJA6xVIeLE,91
  yggdrasil/libs/databrickslib.py,sha256=NHJeUViHhZc8LI5oDVfi1axRyUy_pDJLy4hjD0KZEBQ,980
- yggdrasil/libs/pandaslib.py,sha256=Edm3SXgvr8qe2wsojuRvD1ewNB-Sff0RWoTqaddVruI,509
- yggdrasil/libs/polarslib.py,sha256=7EWP5iS8F9cW79M6d8Yg5ysjnOY3w4_k7TW-5DCRACw,511
+ yggdrasil/libs/pandaslib.py,sha256=GoUjh9dxZAFLe9hs8-6RliLD3jsH_BexYW1w-8BZzb0,618
+ yggdrasil/libs/polarslib.py,sha256=hnL8x6ygsyIoiJyIUMaeoji3fRzab4lBiHcMqa29C_Q,618
  yggdrasil/libs/sparklib.py,sha256=FQ3W1iz2EIpQreorOiQuFt15rdhq2QhGEAWp8Zrbl9A,10177
  yggdrasil/libs/extensions/__init__.py,sha256=mcXW5Li3Cbprbs4Ci-b5A0Ju0wmLcfvEiFusTx6xNjU,117
  yggdrasil/libs/extensions/polars_extensions.py,sha256=RTkGi8llhPJjX7x9egix7-yXWo2X24zIAPSKXV37SSA,12397
@@ -37,7 +37,7 @@ yggdrasil/pyutils/exceptions.py,sha256=ssKNm-rjhavHUOZmGA7_1Gq9tSHDrb2EFI-cnBuWg
  yggdrasil/pyutils/expiring_dict.py,sha256=pr2u25LGwPVbLfsLptiHGovUtYRRo0AMjaJtCtJl7nQ,8477
  yggdrasil/pyutils/modules.py,sha256=B7IP99YqUMW6-DIESFzBx8-09V1d0a8qrIJUDFhhL2g,11424
  yggdrasil/pyutils/parallel.py,sha256=ubuq2m9dJzWYUyKCga4Y_9bpaeMYUrleYxdp49CHr44,6781
- yggdrasil/pyutils/python_env.py,sha256=tuglnjdqHQjNh18qDladVoSEOjCD0RcnMEPYJ0tArOs,50985
+ yggdrasil/pyutils/python_env.py,sha256=Gh5geFK9ABpyWEfyegGUfIJUoPxKwcH0pqLBiMrW9Rw,51103
  yggdrasil/pyutils/retry.py,sha256=n5sr-Zu7fYrdLbjJ4WifK2lk0gEGmHv5FYt2HaCm1Qc,11916
  yggdrasil/requests/__init__.py,sha256=dMesyzq97_DmI765x0TwaDPEfsxFtgGNgchk8LvEN-o,103
  yggdrasil/requests/msal.py,sha256=s2GCyzbgFdgdlJ1JqMrZ4qYVbmoG46-ZOTcaVQhZ-sQ,9220
@@ -49,14 +49,14 @@ yggdrasil/types/cast/__init__.py,sha256=Oft3pTs2bRM5hT7YqJAuOKTYYk-SACLaMOXUVdaf
  yggdrasil/types/cast/arrow_cast.py,sha256=_OMYc4t5GlgE4ztlWaCoK8Jnba09rgDbmHVP-QXhOL0,41523
  yggdrasil/types/cast/cast_options.py,sha256=nDaEvCCs7TBamhTWyDrYf3LVaBWzioIP2Q5_LXrChF4,15532
  yggdrasil/types/cast/pandas_cast.py,sha256=I3xu0sZ59ZbK3NDcQ2dslzdeKzhpFV5zR02ZEixd5hI,8713
- yggdrasil/types/cast/polars_cast.py,sha256=K2nnQ7bexArneYEhUPgV_6er4JNq6N5RmbMUhw-2_Xw,28766
+ yggdrasil/types/cast/polars_cast.py,sha256=RILcbfL4o1XDMp5H-06c0BMrDal5pehOT7ACiItDB6E,28791
  yggdrasil/types/cast/polars_pandas_cast.py,sha256=CS0P7teVv15IdX5g7v40RfkH1VMg6b-HM0V_gOfacm8,5071
  yggdrasil/types/cast/registry.py,sha256=_zdFGmUBB7P-e_LIcJlOxMcxAkXoA-UXB6HqLMgTokg,21491
  yggdrasil/types/cast/spark_cast.py,sha256=_KAsl1DqmKMSfWxqhVE7gosjYdgiL1C5bDQv6eP3HtA,24926
  yggdrasil/types/cast/spark_pandas_cast.py,sha256=BuTiWrdCANZCdD_p2MAytqm74eq-rdRXd-LGojBRrfU,5023
  yggdrasil/types/cast/spark_polars_cast.py,sha256=btmZNHXn2NSt3fUuB4xg7coaE0RezIBdZD92H8NK0Jw,9073
- ygg-0.1.48.dist-info/METADATA,sha256=gpScM9WWu0y7C5ebXB6gsJBe9VbehZEU__E7HfWp8hk,18452
- ygg-0.1.48.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- ygg-0.1.48.dist-info/entry_points.txt,sha256=6q-vpWG3kvw2dhctQ0LALdatoeefkN855Ev02I1dKGY,70
- ygg-0.1.48.dist-info/top_level.txt,sha256=iBe9Kk4VIVbLpgv_p8OZUIfxgj4dgJ5wBg6vO3rigso,10
- ygg-0.1.48.dist-info/RECORD,,
+ ygg-0.1.50.dist-info/METADATA,sha256=ygOCZJjNIbuKuD-qKLnttguy71qIBxR0KnHDJE_XPSU,18528
+ ygg-0.1.50.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ ygg-0.1.50.dist-info/entry_points.txt,sha256=6q-vpWG3kvw2dhctQ0LALdatoeefkN855Ev02I1dKGY,70
+ ygg-0.1.50.dist-info/top_level.txt,sha256=iBe9Kk4VIVbLpgv_p8OZUIfxgj4dgJ5wBg6vO3rigso,10
+ ygg-0.1.50.dist-info/RECORD,,
yggdrasil/databricks/compute/cluster.py CHANGED
@@ -144,6 +144,7 @@ class Cluster(WorkspaceService):
  single_user_name: Optional[str] = None,
  runtime_engine: Optional["RuntimeEngine"] = None,
  libraries: Optional[list[str]] = None,
+ update_timeout: Optional[Union[float, dt.timedelta]] = dt.timedelta(minutes=20),
  **kwargs
  ) -> "Cluster":
  """Create or reuse a cluster that mirrors the current Python environment.
@@ -152,9 +153,10 @@ class Cluster(WorkspaceService):
  workspace: Workspace to use for the cluster.
  cluster_id: Optional cluster id to reuse.
  cluster_name: Optional cluster name to reuse.
- single_user_name: Optional user name for single-user clusters.
+ single_user_name: Optional username for single-user clusters.
  runtime_engine: Optional Databricks runtime engine.
  libraries: Optional list of libraries to install.
+ update_timeout: wait timeout, if None it will not wait completion
  **kwargs: Additional cluster specification overrides.

  Returns:
@@ -176,6 +178,7 @@ class Cluster(WorkspaceService):
  single_user_name=single_user_name,
  runtime_engine=runtime_engine,
  libraries=libraries,
+ update_timeout=update_timeout,
  **kwargs
  )
  )
@@ -190,6 +193,7 @@ class Cluster(WorkspaceService):
  single_user_name: Optional[str] = "current",
  runtime_engine: Optional["RuntimeEngine"] = None,
  libraries: Optional[list[str]] = None,
+ update_timeout: Optional[Union[float, dt.timedelta]] = dt.timedelta(minutes=20),
  **kwargs
  ) -> "Cluster":
  """Create/update a cluster to match the local Python environment.
@@ -198,9 +202,10 @@ class Cluster(WorkspaceService):
  source: Optional PythonEnv to mirror (defaults to current).
  cluster_id: Optional cluster id to update.
  cluster_name: Optional cluster name to update.
- single_user_name: Optional single user name for the cluster.
+ single_user_name: Optional single username for the cluster.
  runtime_engine: Optional runtime engine selection.
  libraries: Optional list of libraries to install.
+ update_timeout: wait timeout, if None it will not wait completion
  **kwargs: Additional cluster specification overrides.

  Returns:
@@ -242,6 +247,7 @@ class Cluster(WorkspaceService):
  single_user_name=single_user_name,
  runtime_engine=runtime_engine or RuntimeEngine.PHOTON,
  libraries=libraries,
+ update_timeout=update_timeout,
  **kwargs
  )

@@ -380,7 +386,9 @@ class Cluster(WorkspaceService):
  start = time.time()
  sleep_time = tick

- if isinstance(timeout, dt.timedelta):
+ if not timeout:
+ timeout = 20 * 60.0
+ elif isinstance(timeout, dt.timedelta):
  timeout = timeout.total_seconds()

  while self.is_pending:
@@ -412,12 +420,14 @@ class Cluster(WorkspaceService):
  # Extract "major.minor" from strings like "17.3.x-scala2.13-ml-gpu"
  v = self.spark_version

- if v is None:
+ if not v:
  return None

  parts = v.split(".")
+
  if len(parts) < 2:
  return None
+
  return ".".join(parts[:2]) # e.g. "17.3"

  @property
@@ -428,8 +438,10 @@ class Cluster(WorkspaceService):
  When the runtime can't be mapped, returns ``None``.
  """
  v = self.runtime_version
- if v is None:
+
+ if not v:
  return None
+
  return _PYTHON_BY_DBR.get(v)

  # ------------------------------------------------------------------ #
@@ -586,6 +598,7 @@ class Cluster(WorkspaceService):
  cluster_id: Optional[str] = None,
  cluster_name: Optional[str] = None,
  libraries: Optional[List[Union[str, "Library"]]] = None,
+ update_timeout: Optional[Union[float, dt.timedelta]] = dt.timedelta(minutes=20),
  **cluster_spec: Any
  ):
  """Create a new cluster or update an existing one.
@@ -594,6 +607,7 @@ class Cluster(WorkspaceService):
  cluster_id: Optional cluster id to update.
  cluster_name: Optional cluster name to update or create.
  libraries: Optional libraries to install.
+ update_timeout: wait timeout, if None it will not wait completion
  **cluster_spec: Cluster specification overrides.

  Returns:
@@ -609,24 +623,28 @@ class Cluster(WorkspaceService):
  return found.update(
  cluster_name=cluster_name,
  libraries=libraries,
+ wait_timeout=update_timeout,
  **cluster_spec
  )

  return self.create(
  cluster_name=cluster_name,
  libraries=libraries,
+ wait_timeout=update_timeout,
  **cluster_spec
  )

  def create(
  self,
  libraries: Optional[List[Union[str, "Library"]]] = None,
+ wait_timeout: Union[float, dt.timedelta] = dt.timedelta(minutes=20),
  **cluster_spec: Any
  ) -> str:
  """Create a new cluster and optionally install libraries.

  Args:
  libraries: Optional list of libraries to install after creation.
+ wait_timeout: wait timeout, if None it will not wait completion
  **cluster_spec: Cluster specification overrides.

  Returns:
@@ -646,14 +664,17 @@ class Cluster(WorkspaceService):
  update_details,
  )

- self.details = self.clusters_client().create_and_wait(**update_details)
+ self.details = self.clusters_client().create(**update_details)

  LOGGER.info(
  "Created %s",
  self
  )

- self.install_libraries(libraries=libraries, raise_error=False)
+ self.install_libraries(libraries=libraries, raise_error=False, wait_timeout=None)
+
+ if wait_timeout:
+ self.wait_for_status(timeout=wait_timeout)

  return self

@@ -661,7 +682,7 @@ class Cluster(WorkspaceService):
  self,
  libraries: Optional[List[Union[str, "Library"]]] = None,
  access_control_list: Optional[List["ClusterAccessControlRequest"]] = None,
- wait_timeout: Union[float, dt.timedelta] = dt.timedelta(minutes=20),
+ wait_timeout: Optional[Union[float, dt.timedelta]] = dt.timedelta(minutes=20),
  **cluster_spec: Any
  ) -> "Cluster":
  """Update cluster configuration and optionally install libraries.
@@ -708,7 +729,7 @@ class Cluster(WorkspaceService):
  self, diff
  )

- self.wait_for_status()
+ self.wait_for_status(timeout=wait_timeout)
  self.clusters_client().edit(**update_details)
  self.update_permissions(access_control_list=access_control_list)

@@ -727,7 +748,7 @@ class Cluster(WorkspaceService):
  access_control_list: Optional[List["ClusterAccessControlRequest"]] = None,
  ):
  if not access_control_list:
- access_control_list = self.default_permissions()
+ return self

  access_control_list = self._check_permission(access_control_list)

@@ -745,6 +766,7 @@ class Cluster(WorkspaceService):
  permission_level=ClusterPermissionLevel.CAN_MANAGE
  )
  for name in current_groups
+ if name not in {"users"}
  ]

  def _check_permission(
@@ -862,18 +884,22 @@ class Cluster(WorkspaceService):
  Returns:
  The current Cluster instance.
  """
+ if self.is_running:
+ return self
+
  self.wait_for_status()

- if not self.is_running:
- LOGGER.debug("Starting %s", self)
+ if self.is_running:
+ return self

- if wait_timeout:
- self.clusters_client().start(cluster_id=self.cluster_id)
- self.wait_for_status(timeout=wait_timeout.total_seconds())
- else:
- self.clusters_client().start(cluster_id=self.cluster_id)
+ LOGGER.debug("Starting %s", self)
+
+ self.clusters_client().start(cluster_id=self.cluster_id)

- LOGGER.info("Started %s", self)
+ LOGGER.info("Started %s", self)
+
+ if wait_timeout:
+ self.wait_for_status(timeout=wait_timeout.total_seconds())

  return self

@@ -889,7 +915,7 @@

  if self.is_running:
  self.details = self.clusters_client().restart_and_wait(cluster_id=self.cluster_id)
- return self.wait_for_status()
+ return self

  return self.start()

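The update_timeout plumbing above turns cluster creation into a non-blocking create() followed by an optional wait_for_status(). A minimal usage sketch, assuming an already-configured Workspace instance named workspace and an illustrative cluster name (neither is part of this diff):

    import datetime as dt

    clusters = workspace.clusters()  # "workspace" is assumed to be a connected Workspace

    # Wait up to 5 minutes for the cluster to become ready; passing
    # update_timeout=None returns as soon as the create/update request is submitted.
    cluster = clusters.replicated_current_environment(
        workspace=workspace,
        cluster_name="ygg-dev-cluster",          # illustrative name
        update_timeout=dt.timedelta(minutes=5),
    )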
yggdrasil/databricks/compute/execution_context.py CHANGED
@@ -180,7 +180,7 @@ print(json.dumps(meta))"""
  """
  return self.cluster.workspace.sdk()

- def _create_command(
+ def create_command(
  self,
  language: "Language",
  ) -> any:
@@ -192,17 +192,29 @@ print(json.dumps(meta))"""
  Returns:
  The created command execution context response.
  """
- self.cluster.ensure_running()
-
  LOGGER.debug(
  "Creating Databricks command execution context for %s",
  self.cluster
  )

- created = self._workspace_client().command_execution.create_and_wait(
- cluster_id=self.cluster.cluster_id,
- language=language,
+ try:
+ created = self._workspace_client().command_execution.create_and_wait(
+ cluster_id=self.cluster.cluster_id,
+ language=language,
+ )
+ except:
+ self.cluster.ensure_running()
+
+ created = self._workspace_client().command_execution.create_and_wait(
+ cluster_id=self.cluster.cluster_id,
+ language=language,
+ )
+
+ LOGGER.info(
+ "Created Databricks command execution context %s",
+ self
  )
+
  created = getattr(created, "response", created)

  return created
@@ -220,10 +232,6 @@ print(json.dumps(meta))"""
  The connected ExecutionContext instance.
  """
  if self.context_id is not None:
- LOGGER.debug(
- "Execution context already open for %s",
- self
- )
  return self

  self.language = language or self.language
@@ -231,7 +239,7 @@ print(json.dumps(meta))"""
  if self.language is None:
  self.language = Language.PYTHON

- ctx = self._create_command(language=self.language)
+ ctx = self.create_command(language=self.language)

  context_id = ctx.id
  if not context_id:
yggdrasil/databricks/compute/remote.py CHANGED
@@ -39,6 +39,7 @@ def databricks_remote_compute(
  timeout: Optional[dt.timedelta] = None,
  env_keys: Optional[List[str]] = None,
  force_local: bool = False,
+ update_timeout: Optional[Union[float, dt.timedelta]] = None,
  **options
  ) -> Callable[[Callable[..., ReturnType]], Callable[..., ReturnType]]:
  """Return a decorator that executes functions on a remote cluster.
@@ -52,6 +53,7 @@ def databricks_remote_compute(
  timeout: Optional execution timeout for remote calls.
  env_keys: Optional environment variable names to forward.
  force_local: Force local execution
+ update_timeout: creation or update wait timeout
  **options: Extra options forwarded to the execution decorator.

  Returns:
@@ -82,7 +84,8 @@ def databricks_remote_compute(
  cluster = workspace.clusters().replicated_current_environment(
  workspace=workspace,
  cluster_name=cluster_name,
- single_user_name=workspace.current_user.user_name
+ single_user_name=workspace.current_user.user_name,
+ update_timeout=update_timeout
  )

  cluster.ensure_running(wait_timeout=None)
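For decorator users, the new argument is simply forwarded to the cluster replication step shown above. A hedged sketch of how it might be called (the decorated function and cluster name are illustrative; the import path is inferred from the RECORD listing):

    import datetime as dt

    from yggdrasil.databricks.compute.remote import databricks_remote_compute

    @databricks_remote_compute(
        cluster_name="ygg-remote",                 # illustrative
        update_timeout=dt.timedelta(minutes=10),   # bound the cluster create/update wait; None skips waiting
    )
    def row_count(n: int) -> int:
        # Body runs on the replicated Databricks cluster unless force_local=True.
        return sum(1 for _ in range(n))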
yggdrasil/databricks/sql/statement_result.py CHANGED
@@ -344,10 +344,17 @@ class StatementResult:
  if self.persisted:
  if self._arrow_table is not None:
  return self._arrow_table.schema
- return spark_schema_to_arrow_schema(self._spark_df.schema)
+ elif self._spark_df is not None:
+ return spark_schema_to_arrow_schema(self._spark_df.schema)
+ raise NotImplementedError("")
+
+ manifest = self.manifest
+
+ if manifest is None:
+ return pa.schema([])

  fields = [
- column_info_to_arrow_field(_) for _ in self.manifest.schema.columns
+ column_info_to_arrow_field(_) for _ in manifest.schema.columns
  ]

  return pa.schema(fields)
@@ -362,7 +369,7 @@ class StatementResult:
  An Arrow Table containing all rows.
  """
  if self.persisted:
- if self._arrow_table:
+ if self._arrow_table is not None:
  return self._arrow_table
  else:
  return self._spark_df.toArrow()
@@ -370,7 +377,6 @@ class StatementResult:
  batches = list(self.to_arrow_batches(parallel_pool=parallel_pool))

  if not batches:
- # empty table with no columns
  return pa.Table.from_batches([], schema=self.arrow_schema())

  return pa.Table.from_batches(batches)
@@ -501,8 +507,9 @@ class StatementResult:
  Returns:
  A Spark DataFrame with the result rows.
  """
- if self._spark_df:
+ if self._spark_df is not None:
  return self._spark_df

  self._spark_df = arrow_table_to_spark_dataframe(self.to_arrow_table())
+
  return self._spark_df
yggdrasil/databricks/workspaces/io.py CHANGED
@@ -13,8 +13,8 @@ from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat

  from .path_kind import DatabricksPathKind
  from ...libs.databrickslib import databricks
- from ...types.cast.pandas_cast import PandasDataFrame
- from ...types.cast.polars_pandas_cast import PolarsDataFrame
+ from ...libs.pandaslib import pandas, PandasDataFrame
+ from ...libs.polarslib import polars, PolarsDataFrame
  from ...types.cast.registry import convert

  if databricks is not None:
@@ -42,7 +42,6 @@ class DatabricksIO(ABC, IO):
  path: "DatabricksPath",
  mode: str,
  encoding: Optional[str] = None,
- compression: Optional[str] = "detect",
  position: int = 0,
  buffer: Optional[io.BytesIO] = None,
  ):
@@ -50,7 +49,6 @@ class DatabricksIO(ABC, IO):

  self.encoding = encoding
  self.mode = mode
- self.compression = compression

  self.path = path

@@ -111,7 +109,6 @@ class DatabricksIO(ABC, IO):
  path=path,
  mode=mode,
  encoding=encoding,
- compression=compression,
  position=position,
  buffer=buffer,
  )
@@ -120,7 +117,6 @@ class DatabricksIO(ABC, IO):
  path=path,
  mode=mode,
  encoding=encoding,
- compression=compression,
  position=position,
  buffer=buffer,
  )
@@ -129,7 +125,6 @@ class DatabricksIO(ABC, IO):
  path=path,
  mode=mode,
  encoding=encoding,
- compression=compression,
  position=position,
  buffer=buffer,
  )
@@ -226,7 +221,6 @@ class DatabricksIO(ABC, IO):
  path=kwargs.get("path", self.path),
  mode=kwargs.get("mode", self.mode),
  encoding=kwargs.get("encoding", self.encoding),
- compression=kwargs.get("compression", self.compression),
  position=kwargs.get("position", self.position),
  buffer=kwargs.get("buffer", self._buffer),
  )
@@ -264,8 +258,7 @@ class DatabricksIO(ABC, IO):
  None.
  """
  self.flush()
- if self._buffer is not None:
- self._buffer.close()
+ self.clear_buffer()

  def fileno(self):
  """Return a pseudo file descriptor based on object hash.
@@ -403,9 +396,6 @@ class DatabricksIO(ABC, IO):
  Returns:
  The read bytes or string depending on mode.
  """
- if not self.readable():
- raise IOError("File not open for reading")
-
  current_position = self.position
  all_data = self.read_all_bytes(use_cache=use_cache)

@@ -431,9 +421,6 @@ class DatabricksIO(ABC, IO):
  Returns:
  The next line as bytes or string.
  """
- if not self.readable():
- raise IOError("File not open for reading")
-
  if self.encoding:
  # Text-mode: accumulate characters
  out_chars = []
@@ -475,9 +462,6 @@ class DatabricksIO(ABC, IO):
  Returns:
  A list of lines.
  """
- if not self.readable():
- raise IOError("File not open for reading")
-
  lines = []
  total = 0

@@ -492,14 +476,6 @@ class DatabricksIO(ABC, IO):

  return lines

- def appendable(self):
- """Return True when the file is open in append mode.
-
- Returns:
- True if in append mode.
- """
- return "a" in self.mode
-
  def writable(self):
  """Return True to indicate write support.

@@ -561,9 +537,6 @@ class DatabricksIO(ABC, IO):
  Returns:
  The number of bytes written.
  """
- if not self.writable():
- raise IOError("File not open for writing")
-
  if isinstance(data, str):
  data = data.encode(self.encoding or "utf-8")

@@ -664,8 +637,12 @@ class DatabricksIO(ABC, IO):
  return self.write_polars(table, file_format=file_format, batch_size=batch_size, **kwargs)
  elif isinstance(table, PandasDataFrame):
  return self.write_pandas(table, file_format=file_format, batch_size=batch_size, **kwargs)
- else:
- raise ValueError(f"Cannot write {type(table)} to {self.path}")
+
+ return self.write_arrow(
+ table=table,
+ file_format=file_format,
+ batch_size=batch_size
+ )

  # ---- Arrow ----

@@ -689,16 +666,18 @@ class DatabricksIO(ABC, IO):
  self.seek(0)

  if isinstance(file_format, ParquetFileFormat):
- return pq.read_table(self, **kwargs)
+ pq.read_table(self, **kwargs)

- if isinstance(file_format, CsvFileFormat):
- return pcsv.read_csv(self, parse_options=file_format.parse_options)
+ elif isinstance(file_format, CsvFileFormat):
+ pcsv.read_csv(self, parse_options=file_format.parse_options)

- raise ValueError(f"Unsupported file format for Arrow table: {file_format}")
+ else:
+ ValueError(f"Unsupported file format for Arrow table: {file_format}")

  def write_arrow(
  self,
  table: Union[pa.Table, pa.RecordBatch],
+ file_format: Optional[FileFormat] = None,
  batch_size: Optional[int] = None,
  **kwargs
  ):
@@ -706,6 +685,7 @@ class DatabricksIO(ABC, IO):

  Args:
  table: Arrow table or batch to write.
+ file_format: Optional file format override.
  batch_size: Optional batch size for writes.
  **kwargs: Format-specific options.

@@ -717,6 +697,7 @@ class DatabricksIO(ABC, IO):

  return self.write_arrow_table(
  table=table,
+ file_format=file_format,
  batch_size=batch_size,
  **kwargs
  )
@@ -776,12 +757,14 @@ class DatabricksIO(ABC, IO):

  def read_arrow_batches(
  self,
+ file_format: Optional[FileFormat] = None,
  batch_size: Optional[int] = None,
  **kwargs
  ):
  """Yield Arrow record batches from the file.

  Args:
+ file_format: Optional file format override.
  batch_size: Optional batch size for reads.
  **kwargs: Format-specific options.

@@ -790,7 +773,11 @@ class DatabricksIO(ABC, IO):
  """
  return (
  self
- .read_arrow_table(batch_size=batch_size, **kwargs)
+ .read_arrow_table(
+ file_format=file_format,
+ batch_size=batch_size,
+ **kwargs
+ )
  .to_batches(max_chunksize=batch_size)
  )

@@ -798,23 +785,36 @@ class DatabricksIO(ABC, IO):

  def read_pandas(
  self,
+ file_format: Optional[FileFormat] = None,
  batch_size: Optional[int] = None,
  **kwargs
  ):
  """Read the file into a pandas DataFrame.

  Args:
+ file_format: Optional file format override.
  batch_size: Optional batch size for reads.
  **kwargs: Format-specific options.

  Returns:
  A pandas DataFrame with the file contents.
  """
- return self.read_arrow_table(batch_size=batch_size, **kwargs).to_pandas()
+ file_format = self.path.file_format if file_format is None else file_format
+ self.seek(0)
+
+ if isinstance(file_format, ParquetFileFormat):
+ pandas.read_parquet(self, **kwargs)
+
+ elif isinstance(file_format, CsvFileFormat):
+ pandas.read_csv(self, **kwargs)
+
+ else:
+ raise ValueError(f"Unsupported file format for Pandas DataFrame: {file_format}")

  def write_pandas(
  self,
- df,
+ df: PandasDataFrame,
+ file_format: Optional[FileFormat] = None,
  batch_size: Optional[int] = None,
  **kwargs
  ):
@@ -822,13 +822,26 @@ class DatabricksIO(ABC, IO):

  Args:
  df: pandas DataFrame to write.
+ file_format: Optional file format override.
  batch_size: Optional batch size for writes.
  **kwargs: Format-specific options.

  Returns:
  None.
  """
- self.write_arrow_table(pa.table(df), batch_size=batch_size, **kwargs)
+ file_format = self.path.file_format if file_format is None else FileFormat
+ buffer = io.BytesIO()
+
+ if isinstance(file_format, ParquetFileFormat):
+ df.to_parquet(buffer, **kwargs)
+
+ elif isinstance(file_format, CsvFileFormat):
+ df.to_csv(buffer, **kwargs)
+
+ else:
+ raise ValueError(f"Unsupported file format for Pandas DataFrame: {file_format}")
+
+ self.write_all_bytes(data=buffer.getvalue())

  # ---- Polars ----

@@ -848,22 +861,21 @@ class DatabricksIO(ABC, IO):
  Returns:
  A polars DataFrame with the file contents.
  """
- import polars as pl
-
  file_format = self.path.file_format if file_format is None else file_format
  self.seek(0)

  if isinstance(file_format, ParquetFileFormat):
- return pl.read_parquet(self, **kwargs)
+ polars.read_parquet(self, **kwargs)

- if isinstance(file_format, CsvFileFormat):
- return pl.read_csv(self, **kwargs)
+ elif isinstance(file_format, CsvFileFormat):
+ polars.read_csv(self, **kwargs)

- raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")
+ else:
+ raise ValueError(f"Unsupported file format for Polars DataFrame: {file_format}")

  def write_polars(
  self,
- df,
+ df: PolarsDataFrame,
  file_format: Optional[FileFormat] = None,
  batch_size: Optional[int] = None,
  **kwargs
@@ -975,28 +987,40 @@ class DatabricksVolumeIO(DatabricksIO):
  """Read bytes from a volume file.

  Args:
- start: Starting byte offset.
+ start: Starting byte offset (0-based).
  length: Number of bytes to read.
  allow_not_found: Whether to suppress missing-path errors.

  Returns:
  Bytes read from the file.
  """
- if length == 0:
+ if length <= 0:
  return b""
+ if start < 0:
+ raise ValueError(f"start must be >= 0, got {start}")
+ if length < 0:
+ raise ValueError(f"length must be >= 0, got {length}")

  sdk = self.workspace.sdk()
  client = sdk.files
  full_path = self.path.files_full_path()

- resp = client.download(full_path)
- result = (
- resp.contents
- .seek(start, io.SEEK_SET)
- .read(length)
- )
+ try:
+ resp = client.download(full_path)
+ except Exception as e:
+ # Databricks SDK exceptions vary a bit by version; keep it pragmatic.
+ if allow_not_found and any(s in str(e).lower() for s in ("not found", "not exist", "404")):
+ return b""
+ raise

- return result
+ data = resp.contents.read()
+
+ # If start is past EOF, return empty (common file-like behavior).
+ if start >= len(data):
+ return b""
+
+ end = start + length
+ return data[start:end]

  def write_all_bytes(self, data: bytes):
  """Write bytes to a volume file.
yggdrasil/databricks/workspaces/path.py CHANGED
@@ -12,17 +12,18 @@ from pathlib import PurePosixPath
  from typing import Optional, Tuple, Union, TYPE_CHECKING, List, Iterable

  import pyarrow as pa
+ import pyarrow.dataset as ds
  from pyarrow.dataset import FileFormat, ParquetFileFormat, CsvFileFormat, JsonFileFormat
  from pyarrow.fs import FileInfo, FileType, FileSystem
- import pyarrow.dataset as ds

  from .io import DatabricksIO
  from .path_kind import DatabricksPathKind
  from ...libs.databrickslib import databricks
- from ...types import cast_arrow_tabular, cast_polars_dataframe
+ from ...libs.pandaslib import PandasDataFrame
+ from ...libs.polarslib import polars, PolarsDataFrame
+ from ...types.cast.arrow_cast import cast_arrow_tabular
  from ...types.cast.cast_options import CastOptions
- from ...types.cast.polars_cast import polars_converter
- from ...types.cast.polars_pandas_cast import PolarsDataFrame
+ from ...types.cast.polars_cast import polars_converter, cast_polars_dataframe
  from ...types.cast.registry import convert, register_converter

  if databricks is not None:
@@ -494,13 +495,17 @@ class DatabricksPath:

  try:
  info = sdk.files.get_directory_metadata(full_path)
- mtime = (
- dt.datetime.strptime(info.last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=dt.timezone.utc)
- if info.last_modified
- else None
- )

- return self.reset_metadata(is_file=False, is_dir=True, size=info, mtime=mtime)
+ if info is None:
+ mtime = dt.datetime.now(tz=dt.timezone.utc)
+ else:
+ mtime = (
+ dt.datetime.strptime(info.last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=dt.timezone.utc)
+ if info.last_modified
+ else None
+ )
+
+ return self.reset_metadata(is_file=False, is_dir=True, size=0, mtime=mtime)
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
  pass

@@ -635,22 +640,12 @@ class DatabricksPath:
  Returns:
  The DatabricksPath instance.
  """
- try:
- if self.kind == DatabricksPathKind.WORKSPACE:
- self.make_workspace_dir(parents=parents, exist_ok=exist_ok)
- elif self.kind == DatabricksPathKind.VOLUME:
- self.make_volume_dir(parents=parents, exist_ok=exist_ok)
- elif self.kind == DatabricksPathKind.DBFS:
- self.make_dbfs_dir(parents=parents, exist_ok=exist_ok)
- except (NotFound, ResourceDoesNotExist):
- if not parents or self.parent == self:
- raise
-
- self.parent.mkdir(parents=True, exist_ok=True)
- self.mkdir(parents=False, exist_ok=exist_ok)
- except (AlreadyExists, ResourceAlreadyExists):
- if not exist_ok:
- raise
+ if self.kind == DatabricksPathKind.WORKSPACE:
+ self.make_workspace_dir(parents=parents, exist_ok=exist_ok)
+ elif self.kind == DatabricksPathKind.VOLUME:
+ self.make_volume_dir(parents=parents, exist_ok=exist_ok)
+ elif self.kind == DatabricksPathKind.DBFS:
+ self.make_dbfs_dir(parents=parents, exist_ok=exist_ok)

  return self

@@ -766,15 +761,13 @@ class DatabricksPath:
  Returns:
  The DatabricksPath instance.
  """
- try:
- if self.kind == DatabricksPathKind.VOLUME:
- return self._remove_volume_file()
- elif self.kind == DatabricksPathKind.WORKSPACE:
- return self._remove_workspace_file()
- elif self.kind == DatabricksPathKind.DBFS:
- return self._remove_dbfs_file()
- finally:
- self.reset_metadata()
+ if self.kind == DatabricksPathKind.VOLUME:
+ return self._remove_volume_file()
+ elif self.kind == DatabricksPathKind.WORKSPACE:
+ return self._remove_workspace_file()
+ elif self.kind == DatabricksPathKind.DBFS:
+ return self._remove_dbfs_file()
+
  return self

  def _remove_volume_file(self):
@@ -783,6 +776,9 @@ class DatabricksPath:
  sdk.files.delete(self.files_full_path())
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
  pass
+ finally:
+ self.reset_metadata()
+
  return self

  def _remove_workspace_file(self):
@@ -791,6 +787,9 @@ class DatabricksPath:
  sdk.workspace.delete(self.workspace_full_path(), recursive=True)
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
  pass
+ finally:
+ self.reset_metadata()
+
  return self

  def _remove_dbfs_file(self):
@@ -799,6 +798,9 @@ class DatabricksPath:
  sdk.dbfs.delete(self.dbfs_full_path(), recursive=True)
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
  pass
+ finally:
+ self.reset_metadata()
+
  return self

  def rmdir(self, recursive: bool = True):
@@ -823,7 +825,9 @@ class DatabricksPath:
  sdk.workspace.delete(self.workspace_full_path(), recursive=recursive)
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
  pass
- self.reset_metadata()
+ finally:
+ self.reset_metadata()
+
  return self

  def _remove_dbfs_dir(self, recursive: bool = True):
@@ -832,7 +836,9 @@ class DatabricksPath:
  sdk.dbfs.delete(self.dbfs_full_path(), recursive=recursive)
  except (NotFound, ResourceDoesNotExist, BadRequest, PermissionDenied):
  pass
- self.reset_metadata()
+ finally:
+ self.reset_metadata()
+
  return self

  def _remove_volume_dir(self, recursive: bool = True):
@@ -1038,7 +1044,7 @@ class DatabricksPath:
  Returns:
  None.
  """
- if self.is_file() and dest.is_file():
+ if self.is_file():
  with self.open(mode="rb") as src:
  src.copy_to(dest=dest)

@@ -1063,6 +1069,13 @@ class DatabricksPath:
  else:
  raise FileNotFoundError(f"Path {self} does not exist, or dest is not same file or folder type")

+ def write_bytes(self, data: bytes):
+ if hasattr(data, "read"):
+ data = data.read()
+
+ with self.open("wb") as f:
+ f.write_all_bytes(data=data)
+
  # -------------------------
  # Data ops (Arrow / Pandas / Polars)
  # -------------------------
@@ -1206,6 +1219,7 @@ class DatabricksPath:

  def read_pandas(
  self,
+ file_format: Optional[FileFormat] = None,
  batch_size: Optional[int] = None,
  concat: bool = True,
  **kwargs
@@ -1213,6 +1227,7 @@ class DatabricksPath:
  """Read the path into a pandas DataFrame.

  Args:
+ file_format: Optional file format override.
  batch_size: Optional batch size for reads.
  concat: Whether to concatenate results for directories.
  **kwargs: Format-specific options.
@@ -1221,14 +1236,26 @@ class DatabricksPath:
  A pandas DataFrame or list of DataFrames if concat=False.
  """
  if concat:
- return self.read_arrow_table(batch_size=batch_size, concat=True, **kwargs).to_pandas()
+ return self.read_arrow_table(
+ file_format=file_format,
+ batch_size=batch_size,
+ concat=True,
+ **kwargs
+ ).to_pandas()
+
+ tables = self.read_arrow_table(
+ batch_size=batch_size,
+ file_format=file_format,
+ concat=False,
+ **kwargs
+ )

- tables = self.read_arrow_table(batch_size=batch_size, concat=False, **kwargs)
  return [t.to_pandas() for t in tables] # type: ignore[arg-type]

  def write_pandas(
  self,
- df,
+ df: PandasDataFrame,
+ file_format: Optional[FileFormat] = None,
  batch_size: Optional[int] = None,
  **kwargs
  ):
@@ -1236,13 +1263,41 @@ class DatabricksPath:

  Args:
  df: pandas DataFrame to write.
+ file_format: Optional file format override.
  batch_size: Optional batch size for writes.
  **kwargs: Format-specific options.

  Returns:
  The DatabricksPath instance.
  """
- return self.write_arrow_table(pa.table(df), batch_size=batch_size, **kwargs)
+ with self.connect(clone=False) as connected:
+ if connected.is_dir_sink():
+ seed = int(time.time() * 1000)
+
+ def df_batches(pdf, bs: int):
+ for start in range(0, len(pdf), batch_size):
+ yield pdf.iloc[start:start + batch_size]
+
+ for i, batch in enumerate(df_batches(df, batch_size)):
+ part_path = connected / f"{seed}-{i:05d}-{_rand_str(4)}.parquet"
+
+ with part_path.open(mode="wb", clone=False) as f:
+ f.write_pandas(
+ batch,
+ file_format=file_format,
+ batch_size=batch_size,
+ **kwargs
+ )
+ else:
+ with connected.open(mode="wb", clone=False) as f:
+ f.write_pandas(
+ df,
+ file_format=file_format,
+ batch_size=batch_size,
+ **kwargs
+ )
+
+ return self

  def read_polars(
  self,
@@ -1264,8 +1319,6 @@ class DatabricksPath:
  Returns:
  A polars DataFrame or list of DataFrames if concat=False.
  """
- import polars as pl
-
  if self.is_file():
  with self.open("rb") as f:
  return f.read_polars(batch_size=batch_size, **kwargs)
@@ -1278,10 +1331,10 @@ class DatabricksPath:
  dfs.append(f.read_polars(batch_size=batch_size, **kwargs))

  if not dfs:
- return pl.DataFrame()
+ return polars.DataFrame()

  if concat:
- return pl.concat(dfs, how=how, rechunk=rechunk)
+ return polars.concat(dfs, how=how, rechunk=rechunk)
  return dfs # type: ignore[return-value]

  raise FileNotFoundError(f"Path does not exist: {self}")
@@ -1312,12 +1365,10 @@ class DatabricksPath:
  Notes:
  - If `df` is a LazyFrame, we collect it first (optionally streaming).
  """
- import polars as pl
-
- if isinstance(df, pl.LazyFrame):
+ if isinstance(df, polars.LazyFrame):
  df = df.collect()

- if not isinstance(df, pl.DataFrame):
+ if not isinstance(df, polars.DataFrame):
  raise TypeError(f"write_polars expects pl.DataFrame or pl.LazyFrame, got {type(df)!r}")

  with self.connect() as connected:
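Two of the additions above are worth calling out: DatabricksPath.write_bytes accepts raw bytes or a file-like object, and write_pandas now fans a DataFrame out into parquet part files when the target resolves to a directory sink. A hedged sketch of both, assuming path and directory are already-constructed DatabricksPath objects (their construction is not shown in this diff):

    import io
    import pandas as pd

    # write_bytes drains file-like inputs via .read() before delegating to write_all_bytes.
    path.write_bytes(b"raw payload")
    path.write_bytes(io.BytesIO(b"streamed payload"))

    # Directory targets are split into <seed>-<index>-<rand>.parquet part files,
    # one per batch_size rows; single-file targets are written in one shot.
    df = pd.DataFrame({"id": range(10), "value": list("abcdefghij")})
    directory.write_pandas(df, batch_size=4)
    round_trip = directory.read_pandas(concat=True)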
yggdrasil/databricks/workspaces/workspace.py CHANGED
@@ -8,7 +8,6 @@ from abc import ABC
  from dataclasses import dataclass
  from pathlib import Path
  from typing import (
- Any,
  BinaryIO,
  Iterator,
  Optional,
@@ -55,7 +54,9 @@ def _get_env_product_version():
  v = os.getenv("DATABRICKS_PRODUCT_VERSION")

  if not v:
- return YGGDRASIL_VERSION
+ if _get_env_product() == "yggdrasil":
+ return YGGDRASIL_VERSION
+ return None
  return v.strip().lower()


@@ -106,11 +107,12 @@ class Workspace:
  product: Optional[str] = dataclasses.field(default_factory=_get_env_product, repr=False)
  product_version: Optional[str] = dataclasses.field(default_factory=_get_env_product_version, repr=False)
  product_tag: Optional[str] = dataclasses.field(default_factory=_get_env_product_tag, repr=False)
+ custom_tags: Optional[dict] = dataclasses.field(default=None, repr=False)

  # Runtime cache (never serialized)
- _sdk: Any = dataclasses.field(init=False, default=None, repr=False, compare=False, hash=False)
- _was_connected: bool = dataclasses.field(init=False, default=False, repr=False, compare=False)
- _cached_token: Optional[str] = dataclasses.field(init=False, default=None, repr=False, compare=False)
+ _sdk: Optional["WorkspaceClient"] = dataclasses.field(default=None, repr=False, compare=False, hash=False)
+ _was_connected: bool = dataclasses.field(default=None, repr=False, compare=False, hash=False)
+ _cached_token: Optional[str] = dataclasses.field(default=None, repr=False, compare=False, hash=False)

  # -------------------------
  # Pickle support
@@ -175,19 +177,43 @@ class Workspace:
  # -------------------------
  def clone_instance(
  self,
- **kwargs
  ) -> "Workspace":
  """Clone the workspace config with overrides.

- Args:
- **kwargs: Field overrides for the clone.
-
  Returns:
  A new Workspace instance with updated fields.
  """
- state = self.__getstate__()
- state.update(kwargs)
- return Workspace().__setstate__(state)
+ return Workspace(
+ host = self.host,
+ account_id = self.account_id,
+ token = self.token,
+ client_id = self.client_id,
+ client_secret = self.client_secret,
+ token_audience = self.token_audience,
+ azure_workspace_resource_id = self.azure_workspace_resource_id,
+ azure_use_msi = self.azure_use_msi,
+ azure_client_secret = self.azure_client_secret,
+ azure_client_id = self.azure_client_id,
+ azure_tenant_id = self.azure_tenant_id,
+ azure_environment = self.azure_environment,
+ google_credentials = self.google_credentials,
+ google_service_account = self.google_service_account,
+ profile = self.profile,
+ config_file = self.config_file,
+ auth_type = self.auth_type,
+ http_timeout_seconds = self.http_timeout_seconds,
+ retry_timeout_seconds = self.retry_timeout_seconds,
+ debug_truncate_bytes = self.debug_truncate_bytes,
+ debug_headers = self.debug_headers,
+ rate_limit = self.rate_limit,
+ product = self.product,
+ product_version = self.product_version,
+ product_tag = self.product_tag,
+ custom_tags = self.custom_tags,
+ _sdk = self._sdk,
+ _was_connected = self._was_connected,
+ _cached_token = self._cached_token,
+ )

  # -------------------------
  # SDK connection
@@ -300,8 +326,9 @@ class Workspace:
  Drop the cached WorkspaceClient (no actual close needed, but this
  avoids reusing stale config).
  """
- self._sdk = None
- self._was_connected = False
+ if self._sdk is not None:
+ self._sdk = None
+ self._was_connected = False

  # ------------------------------------------------------------------ #
  # Properties
@@ -561,28 +588,19 @@ class Workspace:
  Returns:
  A dict of default tags.
  """
- return {
+ base = {
  k: v
  for k, v in (
  ("Product", self.product),
- ("ProductVersion", self.product_version),
  ("ProductTag", self.product_tag),
- ("ProductUser", self.current_user.user_name)
  )
  if v
  }

- def merge_tags(self, existing: dict | None = None):
- """Merge default tags with an existing set.
-
- Args:
- existing: Optional existing tags.
+ if self.custom_tags:
+ base.update(self.custom_tags)

- Returns:
- A dict of merged tags.
- """
- if existing:
- return self.default_tags()
+ return base

  def sql(
  self,
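The net effect of the tag changes: default_tags now emits only Product and ProductTag and then layers the new custom_tags field on top, while merge_tags is gone. A small sketch of the resulting behavior, with field values chosen purely for illustration and the import path inferred from the RECORD listing:

    from yggdrasil.databricks.workspaces.workspace import Workspace

    ws = Workspace(
        host="https://adb-1234567890.12.azuredatabricks.net",  # illustrative
        token="dapi...",                                        # illustrative
        custom_tags={"CostCenter": "data-eng", "Team": "platform"},
    )

    # Yields something like {"Product": ..., "ProductTag": ..., "CostCenter": "data-eng", "Team": "platform"},
    # with falsy Product/ProductTag values dropped before the custom tags are merged in.
    tags = ws.default_tags()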
yggdrasil/libs/pandaslib.py CHANGED
@@ -3,9 +3,14 @@
  try:
  import pandas # type: ignore
  pandas = pandas
+
+ PandasDataFrame = pandas.DataFrame
  except ImportError:
  pandas = None

+ class PandasDataFrame:
+ pass
+


  def require_pandas():
@@ -23,4 +28,5 @@ def require_pandas():
  __all__ = [
  "pandas",
  "require_pandas",
+ "PandasDataFrame"
  ]
yggdrasil/libs/polarslib.py CHANGED
@@ -4,13 +4,18 @@ try:
  import polars # type: ignore

  polars = polars
+
+ PolarsDataFrame = polars.DataFrame
  except ImportError:
  polars = None

+ class PolarsDataFrame:
+ pass

  __all__ = [
  "polars",
  "require_polars",
+ "PolarsDataFrame"
  ]

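Both shims now follow the same optional-dependency pattern: export the real DataFrame class when the library imports, otherwise a stand-in class so isinstance checks elsewhere (for example in DatabricksIO.write) simply evaluate to False instead of raising. A condensed sketch of the idea rather than the exact module contents:

    try:
        import polars
        PolarsDataFrame = polars.DataFrame   # real class when polars is installed
    except ImportError:
        polars = None

        class PolarsDataFrame:               # placeholder type: isinstance(x, PolarsDataFrame) is always False
            pass

    def describe(obj) -> str:
        # Safe to call whether or not polars is installed.
        return "polars frame" if isinstance(obj, PolarsDataFrame) else "something else"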
yggdrasil/pyutils/python_env.py CHANGED
@@ -16,7 +16,7 @@ import sys
  import tempfile
  import threading
  from contextlib import contextmanager
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
  from pathlib import Path
  from typing import Any, Iterable, Iterator, Mapping, MutableMapping, Optional, Union, List, Tuple

@@ -415,11 +415,13 @@ def _locked_env(root: Path):
  # PythonEnv
  # -----------------------

- @dataclass(frozen=True)
+ @dataclass
  class PythonEnv:
  """Represent a managed Python environment rooted at a filesystem path."""
  root: Path

+ _version: Optional[str] = field(default=None, repr=False)
+
  def __post_init__(self) -> None:
  """Normalize the root path after dataclass initialization.

@@ -862,8 +864,9 @@ class PythonEnv:
  Returns:
  Version string.
  """
- out = self.exec_code("import sys; print(sys.version.split()[0])", check=True)
- return out.strip()
+ if self._version is None:
+ self._version = self.exec_code("import sys; print(sys.version.split()[0])", check=True).strip()
+ return self._version

  @property
  def version_info(self) -> tuple[int, int, int]:
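The interpreter-version lookup is now memoized on the PythonEnv instance, so repeated access shells out to the environment's Python only once. Sketch of the observable effect, assuming the value is exposed as the env's version property (the property name is not visible in this hunk) and using the import path from the RECORD listing:

    from pathlib import Path

    from yggdrasil.pyutils.python_env import PythonEnv

    env = PythonEnv(root=Path(".venv"))   # root is the dataclass's only required field
    first = env.version                   # runs `import sys; print(sys.version.split()[0])` in that env
    second = env.version                  # served from the cached _version field, no second subprocess
    assert first == second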
yggdrasil/types/cast/polars_cast.py CHANGED
@@ -15,6 +15,7 @@ from ..python_defaults import default_arrow_scalar
  from ...libs.polarslib import polars

  __all__ = [
+ "polars_converter",
  "cast_polars_array",
  "cast_polars_dataframe",
  "arrow_type_to_polars_type",
yggdrasil/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.1.48"
+ __version__ = "0.1.50"