ygg 0.1.30__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -0,0 +1,10 @@
+from enum import Enum
+
+
+__all__ = ["DatabricksPathKind"]
+
+
+class DatabricksPathKind(str, Enum):
+    WORKSPACE = "workspace"
+    VOLUME = "volume"
+    DBFS = "dbfs"
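The new module adds a small string-valued enum for the three path kinds. A minimal sketch of how such an enum behaves; the class is restated here rather than imported, since the module's import path is not shown in this diff:

    from enum import Enum

    class DatabricksPathKind(str, Enum):
        # mirrors the enum introduced in 0.1.31
        WORKSPACE = "workspace"
        VOLUME = "volume"
        DBFS = "dbfs"

    # str-backed members compare equal to their literal values, so existing
    # string-based checks keep working:
    kind = DatabricksPathKind.VOLUME
    assert kind == "volume"
    assert DatabricksPathKind("dbfs") is DatabricksPathKind.DBFS

Because the enum subclasses str, a member can be passed anywhere a plain "workspace" / "volume" / "dbfs" string was previously accepted.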
@@ -16,7 +16,7 @@ from typing import (
 if TYPE_CHECKING:
     from ..compute.cluster import Cluster

-    from .databricks_path import DatabricksPath, DatabricksPathKind
+    from .path import DatabricksPath, DatabricksPathKind
 from ...libs.databrickslib import require_databricks_sdk, databricks_sdk

 if databricks_sdk is not None:
@@ -133,17 +133,19 @@ class Workspace:

     def __enter__(self) -> "Workspace":
         self._was_connected = self._sdk is not None
-        self.connect()
-        return self
+        return self.connect()

     def __exit__(self, exc_type, exc_val, exc_tb) -> None:
         if not self._was_connected:
             self.close()

+    def __del__(self):
+        self.close()
+
     # -------------------------
     # Clone
     # -------------------------
-    def clone(
+    def clone_instance(
         self,
         **kwargs
     ) -> "Workspace":
@@ -154,86 +156,94 @@ class Workspace:
     # -------------------------
     # SDK connection
     # -------------------------
-    def connect(self, reset: bool = False) -> "Workspace":
+    @property
+    def connected(self):
+        return self._sdk is not None
+
+    def connect(self, reset: bool = False, clone: bool = False) -> "Workspace":
         if reset:
             self._sdk = None

-        if self._sdk is None:
-            require_databricks_sdk()
-            logger.debug("Connecting %s", self)
-
-            # Build Config from config_dict if available, else from fields.
-            kwargs = {
-                "host": self.host,
-                "account_id": self.account_id,
-                "token": self.token,
-                "client_id": self.client_id,
-                "client_secret": self.client_secret,
-                "token_audience": self.token_audience,
-                "azure_workspace_resource_id": self.azure_workspace_resource_id,
-                "azure_use_msi": self.azure_use_msi,
-                "azure_client_secret": self.azure_client_secret,
-                "azure_client_id": self.azure_client_id,
-                "azure_tenant_id": self.azure_tenant_id,
-                "azure_environment": self.azure_environment,
-                "google_credentials": self.google_credentials,
-                "google_service_account": self.google_service_account,
-                "profile": self.profile,
-                "config_file": self.config_file,
-                "auth_type": self.auth_type,
-                "http_timeout_seconds": self.http_timeout_seconds,
-                "retry_timeout_seconds": self.retry_timeout_seconds,
-                "debug_truncate_bytes": self.debug_truncate_bytes,
-                "debug_headers": self.debug_headers,
-                "rate_limit": self.rate_limit,
-                "product": self.product,
-                "product_version": self.product_version,
-            }
-
-            build_kwargs = {k: v for k, v in kwargs.items() if v is not None}
+        if self._sdk is not None:
+            return self
+
+        instance = self.clone_instance() if clone else self
+
+        require_databricks_sdk()
+        logger.debug("Connecting %s", self)
+
+        # Build Config from config_dict if available, else from fields.
+        kwargs = {
+            "host": instance.host,
+            "account_id": instance.account_id,
+            "token": instance.token,
+            "client_id": instance.client_id,
+            "client_secret": instance.client_secret,
+            "token_audience": instance.token_audience,
+            "azure_workspace_resource_id": instance.azure_workspace_resource_id,
+            "azure_use_msi": instance.azure_use_msi,
+            "azure_client_secret": instance.azure_client_secret,
+            "azure_client_id": instance.azure_client_id,
+            "azure_tenant_id": instance.azure_tenant_id,
+            "azure_environment": instance.azure_environment,
+            "google_credentials": instance.google_credentials,
+            "google_service_account": instance.google_service_account,
+            "profile": instance.profile,
+            "config_file": instance.config_file,
+            "auth_type": instance.auth_type,
+            "http_timeout_seconds": instance.http_timeout_seconds,
+            "retry_timeout_seconds": instance.retry_timeout_seconds,
+            "debug_truncate_bytes": instance.debug_truncate_bytes,
+            "debug_headers": instance.debug_headers,
+            "rate_limit": instance.rate_limit,
+            "product": instance.product,
+            "product_version": instance.product_version,
+        }

-            try:
-                self._sdk = WorkspaceClient(**build_kwargs)
-            except ValueError as e:
-                if "cannot configure default credentials" in str(e) and self.auth_type is None:
-                    last_error = e
+        build_kwargs = {k: v for k, v in kwargs.items() if v is not None}
+
+        try:
+            instance._sdk = WorkspaceClient(**build_kwargs)
+        except ValueError as e:
+            if "cannot configure default credentials" in str(e) and instance.auth_type is None:
+                last_error = e
+
+                auth_types = ["runtime"] if instance.is_in_databricks_environment() else ["external-browser"]

-                    auth_types = ["runtime"] if self.is_in_databricks_environment() else ["external-browser"]
+                for auth_type in auth_types:
+                    build_kwargs["auth_type"] = auth_type

-                    for auth_type in auth_types:
-                        build_kwargs["auth_type"] = auth_type
+                    try:
+                        instance._sdk = WorkspaceClient(**build_kwargs)
+                        break
+                    except Exception as se:
+                        last_error = se
+                        build_kwargs.pop("auth_type")
+
+                if instance._sdk is None:
+                    if instance.is_in_databricks_environment() and instance._cached_token:
+                        build_kwargs["token"] = instance._cached_token

                         try:
-                            self._sdk = WorkspaceClient(**build_kwargs)
-                            break
+                            instance._sdk = WorkspaceClient(**build_kwargs)
                         except Exception as se:
                             last_error = se
-                            build_kwargs.pop("auth_type")
-
-                    if self._sdk is None:
-                        if self.is_in_databricks_environment() and self._cached_token:
-                            build_kwargs["token"] = self._cached_token

-                            try:
-                                self._sdk = WorkspaceClient(**build_kwargs)
-                            except Exception as se:
-                                last_error = se
+                if instance._sdk is None:
+                    raise last_error
+            else:
+                raise e

-                        if self._sdk is None:
-                            raise last_error
-                else:
-                    raise e
+        # backfill resolved config values
+        for key in list(kwargs.keys()):
+            if getattr(instance, key, None) is None:
+                v = getattr(instance._sdk.config, key, None)
+                if v is not None:
+                    setattr(instance, key, v)

-            # backfill resolved config values
-            for key in list(kwargs.keys()):
-                if getattr(self, key, None) is None:
-                    v = getattr(self._sdk.config, key, None)
-                    if v is not None:
-                        setattr(self, key, v)
+        logger.info("Connected %s", instance)

-            logger.info("Connected %s", self)
-
-            return self
+        return instance

     # ------------------------------------------------------------------ #
     # Context manager + lifecycle
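The rewritten connect() bails out early when a client already exists, can connect a clone_instance() copy instead of mutating the receiver, and the new connected property reports whether an SDK client is attached. A sketch of the observable difference, continuing the hypothetical setup above:

    ws = Workspace(profile="my-profile")
    assert not ws.connected              # no SDK client yet

    same = ws.connect()                  # connects in place and returns ws itself
    assert same is ws and ws.connected

    other = Workspace(profile="my-profile")
    clone = other.connect(clone=True)    # connects a clone_instance() copy
    assert clone.connected and not other.connected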
@@ -287,6 +297,20 @@ class Workspace:
     # ------------------------------------------------------------------ #
     # Path helpers
     # ------------------------------------------------------------------ #
+    def filesytem(
+        self,
+        workspace: Optional["Workspace"] = None,
+    ):
+        from .filesytem import DatabricksFileSystem, DatabricksFileSystemHandler
+
+        handler = DatabricksFileSystemHandler(
+            workspace=self if workspace is None else workspace
+        )
+
+        return DatabricksFileSystem(
+            handler=handler
+        )
+
     def dbfs_path(
         self,
         parts: Union[List[str], str],
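The new filesytem() helper (spelled as in the package) wraps the workspace in a DatabricksFileSystemHandler and returns a DatabricksFileSystem backed by it. A minimal sketch, continuing the hypothetical setup above:

    fs = ws.filesytem()                        # filesystem backed by ws itself
    other = Workspace(profile="other-profile") # hypothetical second workspace
    other_fs = ws.filesytem(workspace=other)   # filesystem backed by `other` instead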
@@ -297,14 +321,14 @@

         if kind is None or isinstance(parts, str):
             return DatabricksPath.parse(
-                parts=parts,
+                obj=parts,
                 workspace=workspace
             )

         return DatabricksPath(
             kind=kind,
             parts=parts,
-            workspace=workspace
+            _workspace=workspace
         )

     def shared_cache_path(
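dbfs_path() now forwards strings to DatabricksPath.parse(obj=...) and, when building a DatabricksPath directly, passes the workspace through the private _workspace field. Assuming the method also accepts a kind argument, as its body suggests, usage looks roughly like this (paths are made up):

    p1 = ws.dbfs_path("/Volumes/main/default/raw/data.csv")    # parsed from a string
    p2 = ws.dbfs_path(["main", "default", "raw", "data.csv"],
                      kind=DatabricksPathKind.VOLUME)           # built from parts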
@@ -329,60 +353,6 @@
     def sdk(self) -> "WorkspaceClient":
         return self.connect()._sdk

-    # ------------------------------------------------------------------ #
-    # UC volume + directory management
-    # ------------------------------------------------------------------ #
-
-    def ensure_uc_volume_and_dir(
-        self,
-        target_path: str,
-    ) -> None:
-        """
-        Ensure catalog, schema, volume exist for a UC volume path
-        like /Volumes/<catalog>/<schema>/<volume>/...,
-        then create the directory.
-        """
-        sdk = self.sdk()
-        parts = target_path.split("/")
-
-        # basic sanity check
-        if len(parts) < 5 or parts[1] != "Volumes":
-            raise ValueError(
-                f"Unexpected UC volume path: {target_path!r}. "
-                "Expected /Volumes/<catalog>/<schema>/<volume>/..."
-            )
-
-        # /Volumes/<catalog>/<schema>/<volume>/...
-        _, _, catalog_name, schema_name, volume_name, *subpath = parts
-
-        # 1) ensure catalog
-        try:
-            sdk.catalogs.get(name=catalog_name)
-        except NotFound:
-            sdk.catalogs.create(name=catalog_name)
-
-        # 2) ensure schema
-        schema_full_name = f"{catalog_name}.{schema_name}"
-        try:
-            sdk.schemas.get(full_name=schema_full_name)
-        except NotFound:
-            sdk.schemas.create(name=schema_name, catalog_name=catalog_name)
-
-        # 3) ensure volume (managed volume is simplest)
-        volume_full_name = f"{catalog_name}.{schema_name}.{volume_name}"
-        try:
-            sdk.volumes.read(name=volume_full_name)
-        except NotFound:
-            sdk.volumes.create(
-                catalog_name=catalog_name,
-                schema_name=schema_name,
-                name=volume_name,
-                volume_type=catalog_svc.VolumeType.MANAGED,
-            )
-
-        # 4) finally create the directory path itself
-        sdk.files.create_directory(target_path)
-
     # ------------------------------------------------------------------ #
     # List / open / delete / SQL
     # ------------------------------------------------------------------ #
@@ -462,31 +432,8 @@ class Workspace:

         # Workspace path
         fmt = workspace_format or ExportFormat.AUTO
-        return sdk.workspace.download(path=path, format=fmt)
-
-    def delete_path(
-        self,
-        target_path: str,
-        recursive: bool = True,
-        ignore_missing: bool = True,
-    ) -> None:
-        """
-        Delete a path in Databricks Workspace (file or directory).
-
-        - If recursive=True and target_path is a directory, deletes entire tree.
-        - If ignore_missing=True, missing paths won't raise.
-        """
-        sdk = self.sdk()

-        try:
-            sdk.workspace.delete(
-                path=target_path,
-                recursive=recursive,
-            )
-        except ResourceDoesNotExist:
-            if ignore_missing:
-                return
-            raise
+        return sdk.workspace.download(path=path, format=fmt)

     @staticmethod
     def is_in_databricks_environment():
@@ -6,6 +6,7 @@ from typing import Optional, Union, List, Tuple, Any

 import pyarrow as pa
 import pyarrow.compute as pc
+import pyarrow.dataset as pds

 from .cast_options import CastOptions
 from .registry import register_converter
@@ -1095,6 +1096,14 @@
     return table_to_record_batch(table, options)


+@register_converter(pds.Dataset, pa.Table)
+def arrow_dataset_to_table(
+    data: pds.Dataset,
+    options: Optional[CastOptions] = None,
+) -> pa.Field:
+    table = data.to_table()
+    return cast_arrow_tabular(table, options)
+
 # ---------------------------------------------------------------------------
 # Field / Schema converters
 # ---------------------------------------------------------------------------
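The new converter simply materializes a pyarrow Dataset into a Table via to_table() and then reuses the existing tabular cast path. A standalone pyarrow sketch of that first step (file names and data are made up):

    import pyarrow as pa
    import pyarrow.dataset as pds

    # Write a tiny parquet dataset so there is something to scan.
    pa_table = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})
    pds.write_dataset(pa_table, "example_dataset", format="parquet")

    ds = pds.dataset("example_dataset", format="parquet")
    materialized = ds.to_table()     # what arrow_dataset_to_table() does first
    assert materialized.num_rows == 3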