ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
- ygg-0.1.32.dist-info/RECORD +60 -0
- yggdrasil/__init__.py +2 -0
- yggdrasil/databricks/__init__.py +2 -0
- yggdrasil/databricks/compute/__init__.py +2 -0
- yggdrasil/databricks/compute/cluster.py +241 -2
- yggdrasil/databricks/compute/execution_context.py +100 -11
- yggdrasil/databricks/compute/remote.py +16 -0
- yggdrasil/databricks/jobs/__init__.py +5 -0
- yggdrasil/databricks/jobs/config.py +31 -34
- yggdrasil/databricks/sql/__init__.py +2 -0
- yggdrasil/databricks/sql/engine.py +217 -36
- yggdrasil/databricks/sql/exceptions.py +1 -0
- yggdrasil/databricks/sql/statement_result.py +148 -1
- yggdrasil/databricks/sql/types.py +49 -1
- yggdrasil/databricks/workspaces/__init__.py +4 -1
- yggdrasil/databricks/workspaces/filesytem.py +344 -0
- yggdrasil/databricks/workspaces/io.py +1123 -0
- yggdrasil/databricks/workspaces/path.py +1415 -0
- yggdrasil/databricks/workspaces/path_kind.py +13 -0
- yggdrasil/databricks/workspaces/workspace.py +298 -154
- yggdrasil/dataclasses/__init__.py +2 -0
- yggdrasil/dataclasses/dataclass.py +42 -1
- yggdrasil/libs/__init__.py +2 -0
- yggdrasil/libs/databrickslib.py +9 -0
- yggdrasil/libs/extensions/__init__.py +2 -0
- yggdrasil/libs/extensions/polars_extensions.py +72 -0
- yggdrasil/libs/extensions/spark_extensions.py +116 -0
- yggdrasil/libs/pandaslib.py +7 -0
- yggdrasil/libs/polarslib.py +7 -0
- yggdrasil/libs/sparklib.py +41 -0
- yggdrasil/pyutils/__init__.py +4 -0
- yggdrasil/pyutils/callable_serde.py +106 -0
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +44 -1
- yggdrasil/pyutils/parallel.py +29 -0
- yggdrasil/pyutils/python_env.py +301 -0
- yggdrasil/pyutils/retry.py +57 -0
- yggdrasil/requests/__init__.py +4 -0
- yggdrasil/requests/msal.py +124 -3
- yggdrasil/requests/session.py +18 -0
- yggdrasil/types/__init__.py +2 -0
- yggdrasil/types/cast/__init__.py +2 -1
- yggdrasil/types/cast/arrow_cast.py +131 -0
- yggdrasil/types/cast/cast_options.py +119 -1
- yggdrasil/types/cast/pandas_cast.py +29 -0
- yggdrasil/types/cast/polars_cast.py +47 -0
- yggdrasil/types/cast/polars_pandas_cast.py +29 -0
- yggdrasil/types/cast/registry.py +176 -0
- yggdrasil/types/cast/spark_cast.py +76 -0
- yggdrasil/types/cast/spark_pandas_cast.py +29 -0
- yggdrasil/types/cast/spark_polars_cast.py +28 -0
- yggdrasil/types/libs.py +2 -0
- yggdrasil/types/python_arrow.py +191 -0
- yggdrasil/types/python_defaults.py +73 -0
- yggdrasil/version.py +1 -0
- ygg-0.1.30.dist-info/RECORD +0 -56
- yggdrasil/databricks/workspaces/databricks_path.py +0 -784
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/sql/statement_result.py

```diff
@@ -1,3 +1,5 @@
+"""Result wrapper for Databricks SQL statement execution."""
+
 import dataclasses
 import threading
 import time
@@ -49,6 +51,7 @@ __all__ = [
 
 @dataclasses.dataclass
 class StatementResult:
+    """Container for statement responses, data extraction, and conversions."""
     engine: "SQLEngine"
     statement_id: str
     disposition: "Disposition"
@@ -60,6 +63,11 @@ class StatementResult:
     _arrow_table: Optional[pa.Table] = dataclasses.field(default=None, repr=False)
 
     def __getstate__(self):
+        """Serialize statement results, converting Spark dataframes to Arrow.
+
+        Returns:
+            A pickle-ready state dictionary.
+        """
         state = self.__dict__.copy()
 
         _spark_df = state.pop("_spark_df", None)
@@ -70,38 +78,71 @@ class StatementResult:
         return state
 
     def __setstate__(self, state):
+        """Restore statement result state, rehydrating cached data.
+
+        Args:
+            state: Serialized state dictionary.
+        """
         _spark_df = state.pop("_spark_df")
 
     def __iter__(self):
+        """Iterate over Arrow record batches."""
         return self.to_arrow_batches()
 
     @property
     def is_spark_sql(self):
+        """Return True when this result was produced by Spark SQL."""
         return self._spark_df is not None
 
     @property
     def response(self):
+        """Return the latest statement response, refreshing when needed.
+
+        Returns:
+            The current StatementResponse object.
+        """
         if self._response is None and not self.is_spark_sql:
             self.response = self.workspace.sdk().statement_execution.get_statement(self.statement_id)
         return self._response
 
     @response.setter
     def response(self, value: "StatementResponse"):
+        """Update the cached response and refresh timestamp.
+
+        Args:
+            value: StatementResponse to cache.
+        """
         self._response = value
         self._response_refresh_time = time.time()
 
         self.statement_id = self._response.statement_id
 
     def fresh_response(self, delay: float):
+        """Refresh the response if it is older than ``delay`` seconds.
+
+        Args:
+            delay: Minimum age in seconds before refreshing.
+
+        Returns:
+            The refreshed StatementResponse object.
+        """
         if self.is_spark_sql:
             return self._response
 
-        if
+        if self.statement_id and not self.done and time.time() - self._response_refresh_time > delay:
             self.response = self.workspace.sdk().statement_execution.get_statement(self.statement_id)
 
         return self._response
 
     def result_data_at(self, chunk_index: int):
+        """Fetch a specific result chunk by index.
+
+        Args:
+            chunk_index: Result chunk index to retrieve.
+
+        Returns:
+            The SDK result chunk response.
+        """
         sdk = self.workspace.sdk()
 
         return sdk.statement_execution.get_statement_result_chunk_n(
@@ -111,10 +152,20 @@ class StatementResult:
 
     @property
     def workspace(self):
+        """Expose the underlying workspace from the engine.
+
+        Returns:
+            The Workspace instance backing this statement.
+        """
         return self.engine.workspace
 
     @property
     def status(self):
+        """Return the statement status, handling persisted data.
+
+        Returns:
+            A StatementStatus object.
+        """
         if self.persisted:
             return StatementStatus(
                 state=StatementState.SUCCEEDED
@@ -129,20 +180,40 @@ class StatementResult:
 
     @property
     def state(self):
+        """Return the statement state.
+
+        Returns:
+            The StatementState enum value.
+        """
         return self.status.state
 
     @property
     def manifest(self):
+        """Return the SQL result manifest, if available.
+
+        Returns:
+            The result manifest or None for Spark SQL results.
+        """
         if self.is_spark_sql:
             return None
         return self.response.manifest
 
     @property
     def result(self):
+        """Return the raw statement result object.
+
+        Returns:
+            The statement result payload from the API.
+        """
         return self.response.result
 
     @property
     def done(self):
+        """Return True when the statement is in a terminal state.
+
+        Returns:
+            True if the statement is done, otherwise False.
+        """
         if self.persisted:
             return True
 
@@ -155,6 +226,11 @@ class StatementResult:
 
     @property
     def failed(self):
+        """Return True when the statement failed or was cancelled.
+
+        Returns:
+            True if the statement failed or was cancelled.
+        """
         if self.persisted:
             return True
 
@@ -165,14 +241,29 @@ class StatementResult:
 
     @property
     def persisted(self):
+        """Return True when data is cached locally.
+
+        Returns:
+            True when cached Arrow or Spark data is present.
+        """
         return self._spark_df is not None or self._arrow_table is not None
 
     def persist(self):
+        """Cache the statement result locally as Arrow data.
+
+        Returns:
+            The current StatementResult instance.
+        """
         if not self.persisted:
             self._arrow_table = self.to_arrow_table()
         return self
 
     def external_links(self):
+        """Yield external result links for EXTERNAL_LINKS dispositions.
+
+        Yields:
+            External link objects in result order.
+        """
         assert self.disposition == Disposition.EXTERNAL_LINKS, "Cannot get from %s, disposition %s != %s" % (
             self, self.disposition, Disposition.EXTERNAL_LINKS
         )
@@ -222,6 +313,11 @@ class StatementResult:
         )
 
     def raise_for_status(self):
+        """Raise a ValueError if the statement failed.
+
+        Returns:
+            None.
+        """
         if self.failed:
             # grab error info if present
             err = self.status.error
@@ -244,6 +340,15 @@ class StatementResult:
         timeout: Optional[int] = None,
         poll_interval: Optional[float] = None
     ):
+        """Wait for statement completion with optional timeout.
+
+        Args:
+            timeout: Maximum seconds to wait.
+            poll_interval: Initial poll interval in seconds.
+
+        Returns:
+            The current StatementResult instance.
+        """
         if self.done:
             return self
 
@@ -265,6 +370,11 @@ class StatementResult:
             return current
 
     def arrow_schema(self):
+        """Return the Arrow schema for the result.
+
+        Returns:
+            An Arrow Schema instance.
+        """
         if self.persisted:
             if self._arrow_table is not None:
                 return self._arrow_table.schema
@@ -277,6 +387,14 @@ class StatementResult:
         return pa.schema(fields)
 
     def to_arrow_table(self, parallel_pool: Optional[int] = 4) -> pa.Table:
+        """Collect the statement result into a single Arrow table.
+
+        Args:
+            parallel_pool: Maximum parallel fetch workers.
+
+        Returns:
+            An Arrow Table containing all rows.
+        """
         if self.persisted:
             if self._arrow_table:
                 return self._arrow_table
@@ -295,6 +413,14 @@ class StatementResult:
         self,
         parallel_pool: Optional[int] = 4
     ) -> Iterator[pa.RecordBatch]:
+        """Stream the result as Arrow record batches.
+
+        Args:
+            parallel_pool: Maximum parallel fetch workers.
+
+        Yields:
+            Arrow RecordBatch objects.
+        """
         if self.persisted:
             if self._arrow_table is not None:
                 for batch in self._arrow_table.to_batches(max_chunksize=64 * 1024):
@@ -379,15 +505,36 @@ class StatementResult:
         self,
         parallel_pool: Optional[int] = 4
     ) -> "pandas.DataFrame":
+        """Return the result as a pandas DataFrame.
+
+        Args:
+            parallel_pool: Maximum parallel fetch workers.
+
+        Returns:
+            A pandas DataFrame with the result rows.
+        """
         return self.to_arrow_table(parallel_pool=parallel_pool).to_pandas()
 
     def to_polars(
         self,
         parallel_pool: Optional[int] = 4
     ) -> "polars.DataFrame":
+        """Return the result as a polars DataFrame.
+
+        Args:
+            parallel_pool: Maximum parallel fetch workers.
+
+        Returns:
+            A polars DataFrame with the result rows.
+        """
         return polars.from_arrow(self.to_arrow_table(parallel_pool=parallel_pool))
 
     def to_spark(self):
+        """Return the result as a Spark DataFrame, caching it locally.
+
+        Returns:
+            A Spark DataFrame with the result rows.
+        """
        if self._spark_df:
            return self._spark_df
 
```
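The docstrings added above describe a fetch, wait, convert flow. As a minimal sketch only: the `SQLEngine` construction and its `execute(...)` entry point are assumptions (they are not shown in this diff), while the `StatementResult` members used below are exactly the ones documented above.

```python
import time

# Assumed import: engine.py exists per the file list, but SQLEngine's
# construction and execute() signature are not visible in this diff.
from yggdrasil.databricks.sql.engine import SQLEngine

engine = SQLEngine(...)                     # constructor arguments intentionally elided
result = engine.execute("SELECT 1 AS x")    # assumed to return a StatementResult

while not result.done:                      # `done` reports a terminal statement state
    result.fresh_response(1.0)              # refresh the cached response at most once per second
    time.sleep(1.0)

result.raise_for_status()                   # raises ValueError if the statement failed

table = result.to_arrow_table(parallel_pool=4)   # all rows as a single Arrow table
pdf = result.to_pandas()                         # pandas conversion
pldf = result.to_polars()                        # polars conversion

result.persist()   # cache Arrow data locally so the object pickles without re-fetching
```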
yggdrasil/databricks/sql/types.py

```diff
@@ -1,3 +1,5 @@
+"""Type utilities for Databricks SQL metadata and Arrow."""
+
 import json
 import re
 from typing import Union
@@ -86,6 +88,14 @@ _struct_re = re.compile(r"^STRUCT\s*<\s*(.+)\s*>$", re.IGNORECASE)
 
 
 def _split_top_level_commas(s: str):
+    """Split a type string by commas, respecting nested angle brackets.
+
+    Args:
+        s: Type string to split.
+
+    Returns:
+        A list of top-level comma-separated parts.
+    """
     parts, cur, depth = [], [], 0
     for ch in s:
         if ch == '<':
@@ -102,12 +112,38 @@ def _split_top_level_commas(s: str):
     return parts
 
 
+def _safe_bytes(obj):
+    """Convert an object to UTF-8 bytes, with safe handling for None.
+
+    Args:
+        obj: Value to convert.
+
+    Returns:
+        UTF-8 encoded bytes.
+    """
+    if not isinstance(obj, bytes):
+        if not obj:
+            return b""
+
+        if not isinstance(obj, str):
+            obj = str(obj)
+
+        return obj.encode("utf-8")
+    return obj
+
+
 def parse_sql_type_to_pa(type_str: str) -> pa.DataType:
     """
     Adapted parser that:
     - looks up base types in STRING_TYPE_MAP (expects uppercase keys)
     - supports DECIMAL(p,s), ARRAY<...>, MAP<k,v>, STRUCT<...> recursively
     - raises ValueError if it cannot map the provided type string
+
+    Args:
+        type_str: SQL type string to parse.
+
+    Returns:
+        The corresponding Arrow DataType.
     """
     if not type_str:
         raise ValueError("Empty type string")
@@ -165,11 +201,23 @@ def parse_sql_type_to_pa(type_str: str) -> pa.DataType:
 
 
 def column_info_to_arrow_field(col: Union[SQLColumnInfo, CatalogColumnInfo]):
+    """Convert Databricks SQL/Catalog column info into an Arrow field.
+
+    Args:
+        col: ColumnInfo from SQL or Catalog APIs.
+
+    Returns:
+        An Arrow Field for the column.
+    """
     arrow_type = parse_sql_type_to_pa(col.type_text)
 
     if isinstance(col, CatalogColumnInfo):
         parsed = json.loads(col.type_json)
         md = parsed.get("metadata", {}) or {}
+        md = {
+            _safe_bytes(k): _safe_bytes(v)
+            for k, v in md.items()
+        }
         nullable = col.nullable
     elif isinstance(col, SQLColumnInfo):
         md = {}
@@ -182,4 +230,4 @@ def column_info_to_arrow_field(col: Union[SQLColumnInfo, CatalogColumnInfo]):
         arrow_type,
         nullable=nullable,
         metadata=md
-    )
+    )
```
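The metadata normalization added in `column_info_to_arrow_field` matters because pyarrow stores field metadata as bytes. A small illustrative sketch follows; the sample column name and metadata are made up, and importing `_safe_bytes` assumes the helper stays module-level in types.py.

```python
import pyarrow as pa

# Helper introduced in this diff (assumed importable from the module above).
from yggdrasil.databricks.sql.types import _safe_bytes

# Made-up metadata resembling what a Catalog column's type_json might carry.
md = {"comment": "trip distance in miles", "deprecated": None}

# Normalize keys and values to UTF-8 bytes; None becomes b"".
md_bytes = {_safe_bytes(k): _safe_bytes(v) for k, v in md.items()}

field = pa.field("trip_distance", pa.float64(), nullable=True, metadata=md_bytes)
print(field.metadata)  # {b'comment': b'trip distance in miles', b'deprecated': b''}
```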