ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
  2. ygg-0.1.32.dist-info/RECORD +60 -0
  3. yggdrasil/__init__.py +2 -0
  4. yggdrasil/databricks/__init__.py +2 -0
  5. yggdrasil/databricks/compute/__init__.py +2 -0
  6. yggdrasil/databricks/compute/cluster.py +241 -2
  7. yggdrasil/databricks/compute/execution_context.py +100 -11
  8. yggdrasil/databricks/compute/remote.py +16 -0
  9. yggdrasil/databricks/jobs/__init__.py +5 -0
  10. yggdrasil/databricks/jobs/config.py +31 -34
  11. yggdrasil/databricks/sql/__init__.py +2 -0
  12. yggdrasil/databricks/sql/engine.py +217 -36
  13. yggdrasil/databricks/sql/exceptions.py +1 -0
  14. yggdrasil/databricks/sql/statement_result.py +148 -1
  15. yggdrasil/databricks/sql/types.py +49 -1
  16. yggdrasil/databricks/workspaces/__init__.py +4 -1
  17. yggdrasil/databricks/workspaces/filesytem.py +344 -0
  18. yggdrasil/databricks/workspaces/io.py +1123 -0
  19. yggdrasil/databricks/workspaces/path.py +1415 -0
  20. yggdrasil/databricks/workspaces/path_kind.py +13 -0
  21. yggdrasil/databricks/workspaces/workspace.py +298 -154
  22. yggdrasil/dataclasses/__init__.py +2 -0
  23. yggdrasil/dataclasses/dataclass.py +42 -1
  24. yggdrasil/libs/__init__.py +2 -0
  25. yggdrasil/libs/databrickslib.py +9 -0
  26. yggdrasil/libs/extensions/__init__.py +2 -0
  27. yggdrasil/libs/extensions/polars_extensions.py +72 -0
  28. yggdrasil/libs/extensions/spark_extensions.py +116 -0
  29. yggdrasil/libs/pandaslib.py +7 -0
  30. yggdrasil/libs/polarslib.py +7 -0
  31. yggdrasil/libs/sparklib.py +41 -0
  32. yggdrasil/pyutils/__init__.py +4 -0
  33. yggdrasil/pyutils/callable_serde.py +106 -0
  34. yggdrasil/pyutils/exceptions.py +16 -0
  35. yggdrasil/pyutils/modules.py +44 -1
  36. yggdrasil/pyutils/parallel.py +29 -0
  37. yggdrasil/pyutils/python_env.py +301 -0
  38. yggdrasil/pyutils/retry.py +57 -0
  39. yggdrasil/requests/__init__.py +4 -0
  40. yggdrasil/requests/msal.py +124 -3
  41. yggdrasil/requests/session.py +18 -0
  42. yggdrasil/types/__init__.py +2 -0
  43. yggdrasil/types/cast/__init__.py +2 -1
  44. yggdrasil/types/cast/arrow_cast.py +131 -0
  45. yggdrasil/types/cast/cast_options.py +119 -1
  46. yggdrasil/types/cast/pandas_cast.py +29 -0
  47. yggdrasil/types/cast/polars_cast.py +47 -0
  48. yggdrasil/types/cast/polars_pandas_cast.py +29 -0
  49. yggdrasil/types/cast/registry.py +176 -0
  50. yggdrasil/types/cast/spark_cast.py +76 -0
  51. yggdrasil/types/cast/spark_pandas_cast.py +29 -0
  52. yggdrasil/types/cast/spark_polars_cast.py +28 -0
  53. yggdrasil/types/libs.py +2 -0
  54. yggdrasil/types/python_arrow.py +191 -0
  55. yggdrasil/types/python_defaults.py +73 -0
  56. yggdrasil/version.py +1 -0
  57. ygg-0.1.30.dist-info/RECORD +0 -56
  58. yggdrasil/databricks/workspaces/databricks_path.py +0 -784
  59. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
  60. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
  61. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
  62. {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/sql/statement_result.py
@@ -1,3 +1,5 @@
+ """Result wrapper for Databricks SQL statement execution."""
+
  import dataclasses
  import threading
  import time
@@ -49,6 +51,7 @@ __all__ = [

  @dataclasses.dataclass
  class StatementResult:
+     """Container for statement responses, data extraction, and conversions."""
      engine: "SQLEngine"
      statement_id: str
      disposition: "Disposition"
@@ -60,6 +63,11 @@ class StatementResult:
      _arrow_table: Optional[pa.Table] = dataclasses.field(default=None, repr=False)

      def __getstate__(self):
+         """Serialize statement results, converting Spark dataframes to Arrow.
+
+         Returns:
+             A pickle-ready state dictionary.
+         """
          state = self.__dict__.copy()

          _spark_df = state.pop("_spark_df", None)
@@ -70,38 +78,71 @@ class StatementResult:
          return state

      def __setstate__(self, state):
+         """Restore statement result state, rehydrating cached data.
+
+         Args:
+             state: Serialized state dictionary.
+         """
          _spark_df = state.pop("_spark_df")

      def __iter__(self):
+         """Iterate over Arrow record batches."""
          return self.to_arrow_batches()

      @property
      def is_spark_sql(self):
+         """Return True when this result was produced by Spark SQL."""
          return self._spark_df is not None

      @property
      def response(self):
+         """Return the latest statement response, refreshing when needed.
+
+         Returns:
+             The current StatementResponse object.
+         """
          if self._response is None and not self.is_spark_sql:
              self.response = self.workspace.sdk().statement_execution.get_statement(self.statement_id)
          return self._response

      @response.setter
      def response(self, value: "StatementResponse"):
+         """Update the cached response and refresh timestamp.
+
+         Args:
+             value: StatementResponse to cache.
+         """
          self._response = value
          self._response_refresh_time = time.time()

          self.statement_id = self._response.statement_id

      def fresh_response(self, delay: float):
+         """Refresh the response if it is older than ``delay`` seconds.
+
+         Args:
+             delay: Minimum age in seconds before refreshing.
+
+         Returns:
+             The refreshed StatementResponse object.
+         """
          if self.is_spark_sql:
              return self._response

-         if not self.done and self.statement_id and time.time() - self._response_refresh_time > delay:
+         if self.statement_id and not self.done and time.time() - self._response_refresh_time > delay:
              self.response = self.workspace.sdk().statement_execution.get_statement(self.statement_id)

          return self._response

      def result_data_at(self, chunk_index: int):
+         """Fetch a specific result chunk by index.
+
+         Args:
+             chunk_index: Result chunk index to retrieve.
+
+         Returns:
+             The SDK result chunk response.
+         """
          sdk = self.workspace.sdk()

          return sdk.statement_execution.get_statement_result_chunk_n(
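The only behavioral change in this hunk is the reordered guard in fresh_response: the statement_id check now runs before self.done, and the cached response is re-fetched only when the statement is still running and the cached copy is older than delay seconds. A minimal self-contained sketch of that refresh-if-stale pattern follows; the names are illustrative, not the package's API:

import time
from dataclasses import dataclass
from typing import Any, Callable, Optional

@dataclass
class StaleResponseCache:
    # Illustrative stand-in for the SDK call used by StatementResult.
    fetch: Callable[[], Any]
    _response: Optional[Any] = None
    _refresh_time: float = 0.0

    def fresh(self, delay: float, done: bool = False) -> Optional[Any]:
        # Re-fetch only when the work is not finished and the cache is stale.
        if not done and time.time() - self._refresh_time > delay:
            self._response = self.fetch()
            self._refresh_time = time.time()
        return self._response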
@@ -111,10 +152,20 @@

      @property
      def workspace(self):
+         """Expose the underlying workspace from the engine.
+
+         Returns:
+             The Workspace instance backing this statement.
+         """
          return self.engine.workspace

      @property
      def status(self):
+         """Return the statement status, handling persisted data.
+
+         Returns:
+             A StatementStatus object.
+         """
          if self.persisted:
              return StatementStatus(
                  state=StatementState.SUCCEEDED
@@ -129,20 +180,40 @@

      @property
      def state(self):
+         """Return the statement state.
+
+         Returns:
+             The StatementState enum value.
+         """
          return self.status.state

      @property
      def manifest(self):
+         """Return the SQL result manifest, if available.
+
+         Returns:
+             The result manifest or None for Spark SQL results.
+         """
          if self.is_spark_sql:
              return None
          return self.response.manifest

      @property
      def result(self):
+         """Return the raw statement result object.
+
+         Returns:
+             The statement result payload from the API.
+         """
          return self.response.result

      @property
      def done(self):
+         """Return True when the statement is in a terminal state.
+
+         Returns:
+             True if the statement is done, otherwise False.
+         """
          if self.persisted:
              return True

@@ -155,6 +226,11 @@

      @property
      def failed(self):
+         """Return True when the statement failed or was cancelled.
+
+         Returns:
+             True if the statement failed or was cancelled.
+         """
          if self.persisted:
              return True

@@ -165,14 +241,29 @@

      @property
      def persisted(self):
+         """Return True when data is cached locally.
+
+         Returns:
+             True when cached Arrow or Spark data is present.
+         """
          return self._spark_df is not None or self._arrow_table is not None

      def persist(self):
+         """Cache the statement result locally as Arrow data.
+
+         Returns:
+             The current StatementResult instance.
+         """
          if not self.persisted:
              self._arrow_table = self.to_arrow_table()
          return self

      def external_links(self):
+         """Yield external result links for EXTERNAL_LINKS dispositions.
+
+         Yields:
+             External link objects in result order.
+         """
          assert self.disposition == Disposition.EXTERNAL_LINKS, "Cannot get from %s, disposition %s != %s" % (
              self, self.disposition, Disposition.EXTERNAL_LINKS
          )
@@ -222,6 +313,11 @@
          )

      def raise_for_status(self):
+         """Raise a ValueError if the statement failed.
+
+         Returns:
+             None.
+         """
          if self.failed:
              # grab error info if present
              err = self.status.error
@@ -244,6 +340,15 @@
          timeout: Optional[int] = None,
          poll_interval: Optional[float] = None
      ):
+         """Wait for statement completion with optional timeout.
+
+         Args:
+             timeout: Maximum seconds to wait.
+             poll_interval: Initial poll interval in seconds.
+
+         Returns:
+             The current StatementResult instance.
+         """
          if self.done:
              return self

@@ -265,6 +370,11 @@
          return current

      def arrow_schema(self):
+         """Return the Arrow schema for the result.
+
+         Returns:
+             An Arrow Schema instance.
+         """
          if self.persisted:
              if self._arrow_table is not None:
                  return self._arrow_table.schema
@@ -277,6 +387,14 @@
          return pa.schema(fields)

      def to_arrow_table(self, parallel_pool: Optional[int] = 4) -> pa.Table:
+         """Collect the statement result into a single Arrow table.
+
+         Args:
+             parallel_pool: Maximum parallel fetch workers.
+
+         Returns:
+             An Arrow Table containing all rows.
+         """
          if self.persisted:
              if self._arrow_table:
                  return self._arrow_table
@@ -295,6 +413,14 @@
          self,
          parallel_pool: Optional[int] = 4
      ) -> Iterator[pa.RecordBatch]:
+         """Stream the result as Arrow record batches.
+
+         Args:
+             parallel_pool: Maximum parallel fetch workers.
+
+         Yields:
+             Arrow RecordBatch objects.
+         """
          if self.persisted:
              if self._arrow_table is not None:
                  for batch in self._arrow_table.to_batches(max_chunksize=64 * 1024):
@@ -379,15 +505,36 @@
          self,
          parallel_pool: Optional[int] = 4
      ) -> "pandas.DataFrame":
+         """Return the result as a pandas DataFrame.
+
+         Args:
+             parallel_pool: Maximum parallel fetch workers.
+
+         Returns:
+             A pandas DataFrame with the result rows.
+         """
          return self.to_arrow_table(parallel_pool=parallel_pool).to_pandas()

      def to_polars(
          self,
          parallel_pool: Optional[int] = 4
      ) -> "polars.DataFrame":
+         """Return the result as a polars DataFrame.
+
+         Args:
+             parallel_pool: Maximum parallel fetch workers.
+
+         Returns:
+             A polars DataFrame with the result rows.
+         """
          return polars.from_arrow(self.to_arrow_table(parallel_pool=parallel_pool))

      def to_spark(self):
+         """Return the result as a Spark DataFrame, caching it locally.
+
+         Returns:
+             A Spark DataFrame with the result rows.
+         """
          if self._spark_df:
              return self._spark_df

yggdrasil/databricks/sql/types.py
@@ -1,3 +1,5 @@
+ """Type utilities for Databricks SQL metadata and Arrow."""
+
  import json
  import re
  from typing import Union
@@ -86,6 +88,14 @@ _struct_re = re.compile(r"^STRUCT\s*<\s*(.+)\s*>$", re.IGNORECASE)


  def _split_top_level_commas(s: str):
+     """Split a type string by commas, respecting nested angle brackets.
+
+     Args:
+         s: Type string to split.
+
+     Returns:
+         A list of top-level comma-separated parts.
+     """
      parts, cur, depth = [], [], 0
      for ch in s:
          if ch == '<':
@@ -102,12 +112,38 @@ def _split_top_level_commas(s: str):
      return parts


+ def _safe_bytes(obj):
+     """Convert an object to UTF-8 bytes, with safe handling for None.
+
+     Args:
+         obj: Value to convert.
+
+     Returns:
+         UTF-8 encoded bytes.
+     """
+     if not isinstance(obj, bytes):
+         if not obj:
+             return b""
+
+         if not isinstance(obj, str):
+             obj = str(obj)
+
+         return obj.encode("utf-8")
+     return obj
+
+
  def parse_sql_type_to_pa(type_str: str) -> pa.DataType:
      """
      Adapted parser that:
      - looks up base types in STRING_TYPE_MAP (expects uppercase keys)
      - supports DECIMAL(p,s), ARRAY<...>, MAP<k,v>, STRUCT<...> recursively
      - raises ValueError if it cannot map the provided type string
+
+     Args:
+         type_str: SQL type string to parse.
+
+     Returns:
+         The corresponding Arrow DataType.
      """
      if not type_str:
          raise ValueError("Empty type string")
@@ -165,11 +201,23 @@ def parse_sql_type_to_pa(type_str: str) -> pa.DataType:


  def column_info_to_arrow_field(col: Union[SQLColumnInfo, CatalogColumnInfo]):
+     """Convert Databricks SQL/Catalog column info into an Arrow field.
+
+     Args:
+         col: ColumnInfo from SQL or Catalog APIs.
+
+     Returns:
+         An Arrow Field for the column.
+     """
      arrow_type = parse_sql_type_to_pa(col.type_text)

      if isinstance(col, CatalogColumnInfo):
          parsed = json.loads(col.type_json)
          md = parsed.get("metadata", {}) or {}
+         md = {
+             _safe_bytes(k): _safe_bytes(v)
+             for k, v in md.items()
+         }
          nullable = col.nullable
      elif isinstance(col, SQLColumnInfo):
          md = {}
@@ -182,4 +230,4 @@ def column_info_to_arrow_field(col: Union[SQLColumnInfo, CatalogColumnInfo]):
          arrow_type,
          nullable=nullable,
          metadata=md
-     )
+     )
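The new _safe_bytes helper exists because Arrow field metadata is ultimately stored as byte strings, while the metadata parsed from type_json can contain None or non-string values. A small standalone pyarrow sketch of the same coercion follows; the helper mirrors the diff above, and the sample metadata values are invented:

import pyarrow as pa

def safe_bytes(obj):
    # Mirror of the _safe_bytes helper added above: coerce any value to UTF-8 bytes.
    if not isinstance(obj, bytes):
        if not obj:
            return b""
        if not isinstance(obj, str):
            obj = str(obj)
        return obj.encode("utf-8")
    return obj

md = {"comment": "primary key", "mask": None, "scale": 2}   # invented example metadata
field = pa.field(
    "id",
    pa.int64(),
    nullable=False,
    metadata={safe_bytes(k): safe_bytes(v) for k, v in md.items()},
)
print(field.metadata)   # {b'comment': b'primary key', b'mask': b'', b'scale': b'2'}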
yggdrasil/databricks/workspaces/__init__.py
@@ -1,2 +1,5 @@
+ """Workspace, filesystem, and path utilities for Databricks."""
+
  from .workspace import *
- from .databricks_path import *
+ from .path import *
+ from .io import *