ygg 0.1.30__py3-none-any.whl → 0.1.32__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/METADATA +1 -1
- ygg-0.1.32.dist-info/RECORD +60 -0
- yggdrasil/__init__.py +2 -0
- yggdrasil/databricks/__init__.py +2 -0
- yggdrasil/databricks/compute/__init__.py +2 -0
- yggdrasil/databricks/compute/cluster.py +241 -2
- yggdrasil/databricks/compute/execution_context.py +100 -11
- yggdrasil/databricks/compute/remote.py +16 -0
- yggdrasil/databricks/jobs/__init__.py +5 -0
- yggdrasil/databricks/jobs/config.py +31 -34
- yggdrasil/databricks/sql/__init__.py +2 -0
- yggdrasil/databricks/sql/engine.py +217 -36
- yggdrasil/databricks/sql/exceptions.py +1 -0
- yggdrasil/databricks/sql/statement_result.py +148 -1
- yggdrasil/databricks/sql/types.py +49 -1
- yggdrasil/databricks/workspaces/__init__.py +4 -1
- yggdrasil/databricks/workspaces/filesytem.py +344 -0
- yggdrasil/databricks/workspaces/io.py +1123 -0
- yggdrasil/databricks/workspaces/path.py +1415 -0
- yggdrasil/databricks/workspaces/path_kind.py +13 -0
- yggdrasil/databricks/workspaces/workspace.py +298 -154
- yggdrasil/dataclasses/__init__.py +2 -0
- yggdrasil/dataclasses/dataclass.py +42 -1
- yggdrasil/libs/__init__.py +2 -0
- yggdrasil/libs/databrickslib.py +9 -0
- yggdrasil/libs/extensions/__init__.py +2 -0
- yggdrasil/libs/extensions/polars_extensions.py +72 -0
- yggdrasil/libs/extensions/spark_extensions.py +116 -0
- yggdrasil/libs/pandaslib.py +7 -0
- yggdrasil/libs/polarslib.py +7 -0
- yggdrasil/libs/sparklib.py +41 -0
- yggdrasil/pyutils/__init__.py +4 -0
- yggdrasil/pyutils/callable_serde.py +106 -0
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +44 -1
- yggdrasil/pyutils/parallel.py +29 -0
- yggdrasil/pyutils/python_env.py +301 -0
- yggdrasil/pyutils/retry.py +57 -0
- yggdrasil/requests/__init__.py +4 -0
- yggdrasil/requests/msal.py +124 -3
- yggdrasil/requests/session.py +18 -0
- yggdrasil/types/__init__.py +2 -0
- yggdrasil/types/cast/__init__.py +2 -1
- yggdrasil/types/cast/arrow_cast.py +131 -0
- yggdrasil/types/cast/cast_options.py +119 -1
- yggdrasil/types/cast/pandas_cast.py +29 -0
- yggdrasil/types/cast/polars_cast.py +47 -0
- yggdrasil/types/cast/polars_pandas_cast.py +29 -0
- yggdrasil/types/cast/registry.py +176 -0
- yggdrasil/types/cast/spark_cast.py +76 -0
- yggdrasil/types/cast/spark_pandas_cast.py +29 -0
- yggdrasil/types/cast/spark_polars_cast.py +28 -0
- yggdrasil/types/libs.py +2 -0
- yggdrasil/types/python_arrow.py +191 -0
- yggdrasil/types/python_defaults.py +73 -0
- yggdrasil/version.py +1 -0
- ygg-0.1.30.dist-info/RECORD +0 -56
- yggdrasil/databricks/workspaces/databricks_path.py +0 -784
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/WHEEL +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.30.dist-info → ygg-0.1.32.dist-info}/top_level.txt +0 -0
yggdrasil/dataclasses/dataclass.py
CHANGED

@@ -1,3 +1,5 @@
+"""Dataclass helpers that integrate with Arrow schemas and safe casting."""
+
 import dataclasses
 from inspect import isclass
 from typing import Any, Iterable, Mapping, Tuple
@@ -18,6 +20,7 @@ def is_yggdataclass(cls_or_instance: Any) -> bool:
 
     Args:
         cls_or_instance: The class or instance to check.
+
     Returns:
         True if the class or instance
         is a yggdrasil dataclass, False otherwise.
@@ -26,6 +29,14 @@ def is_yggdataclass(cls_or_instance: Any) -> bool:
 
 
 def get_dataclass_arrow_field(cls_or_instance: Any) -> pa.Field:
+    """Return a cached Arrow Field describing the dataclass type.
+
+    Args:
+        cls_or_instance: Dataclass class or instance.
+
+    Returns:
+        Arrow field describing the dataclass schema.
+    """
     if is_yggdataclass(cls_or_instance):
         return cls_or_instance.__arrow_field__()
 
@@ -58,7 +69,7 @@ def yggdataclass(
     kw_only=False, slots=False,
     weakref_slot=False
 ):
-    """
+    """Decorate a class with dataclass behavior plus Arrow helpers.
 
     Examines PEP 526 __annotations__ to determine fields.
 
@@ -73,7 +84,24 @@ def yggdataclass(
     """
 
     def wrap(c):
+        """Wrap a class with yggdrasil dataclass enhancements.
+
+        Args:
+            c: Class to decorate.
+
+        Returns:
+            Decorated dataclass type.
+        """
+
         def _init_public_fields(cls):
+            """Return init-enabled, public dataclass fields.
+
+            Args:
+                cls: Dataclass type.
+
+            Returns:
+                List of dataclasses.Field objects.
+            """
             return [
                 field
                 for field in dataclasses.fields(cls)
@@ -83,6 +111,11 @@ def yggdataclass(
         if not hasattr(c, "default_instance"):
             @classmethod
             def default_instance(cls):
+                """Return a default instance built from type defaults.
+
+                Returns:
+                    Default instance of the dataclass.
+                """
                 from yggdrasil.types import default_scalar
 
                 if not hasattr(cls, "__default_instance__"):
@@ -135,6 +168,14 @@ def yggdataclass(
         if not hasattr(c, "__arrow_field__"):
             @classmethod
             def __arrow_field__(cls, name: str | None = None):
+                """Return an Arrow field representing the dataclass schema.
+
+                Args:
+                    name: Optional override for the field name.
+
+                Returns:
+                    Arrow field describing the dataclass schema.
+                """
                 from yggdrasil.types.python_arrow import arrow_field_from_hint
 
                 return arrow_field_from_hint(cls, name=name)
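The dataclass changes are documentation-only, but they spell out the module's contract: `yggdataclass` layers Arrow-schema helpers (`__arrow_field__`, `default_instance`) onto a standard dataclass, and `get_dataclass_arrow_field` exposes that schema as a `pa.Field`. A minimal usage sketch under the docstrings' assumptions — the `Reading` class and its fields are invented, and the decorator is assumed to support bare use like `dataclasses.dataclass`:

```python
# Hypothetical example; `Reading` and its fields are invented for illustration.
import pyarrow as pa

from yggdrasil.dataclasses.dataclass import get_dataclass_arrow_field, yggdataclass


@yggdataclass  # assumed to support bare use, like dataclasses.dataclass
class Reading:
    sensor: str
    value: float


# Per the new docstrings, this returns a pa.Field describing the schema.
field = get_dataclass_arrow_field(Reading)
assert isinstance(field, pa.Field)
print(field.type)  # expected: a struct type derived from the annotations
```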
yggdrasil/libs/__init__.py
CHANGED
yggdrasil/libs/databrickslib.py
CHANGED
@@ -1,3 +1,5 @@
+"""Optional Databricks SDK dependency helpers."""
+
 try:
     import databricks
     import databricks.sdk  # type: ignore
@@ -6,7 +8,9 @@ try:
     databricks_sdk = databricks.sdk
 except ImportError:
     class _DatabricksDummy:
+        """Placeholder object that raises if Databricks SDK is required."""
         def __getattr__(self, item):
+            """Raise an error when accessing missing Databricks SDK attributes."""
             require_databricks_sdk()
 
     databricks = _DatabricksDummy
@@ -14,6 +18,11 @@ except ImportError:
 
 
 def require_databricks_sdk():
+    """Ensure the Databricks SDK is available before use.
+
+    Returns:
+        None.
+    """
    if databricks_sdk is None:
        raise ImportError(
            "databricks_sdk is required to use this function. "
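databrickslib.py — like pandaslib.py and polarslib.py below — follows one optional-dependency pattern: import inside try/except at module load, fall back to a None sentinel (or a dummy attribute-trap class), and raise a descriptive ImportError only when the dependency is actually exercised. A self-contained sketch of the same pattern; the install hint is illustrative:

```python
# Sketch of the optional-dependency pattern used across yggdrasil.libs.
try:
    import databricks.sdk as databricks_sdk  # type: ignore
except ImportError:
    databricks_sdk = None  # sentinel checked before any SDK use


def require_databricks_sdk() -> None:
    """Raise a descriptive error if the Databricks SDK is missing."""
    if databricks_sdk is None:
        raise ImportError(
            "databricks_sdk is required to use this function. "
            "Install it with `pip install databricks-sdk`."  # illustrative hint
        )
```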
yggdrasil/libs/extensions/polars_extensions.py
CHANGED

@@ -1,3 +1,5 @@
+"""Polars DataFrame extension helpers for joins and resampling."""
+
 from __future__ import annotations
 
 import datetime
@@ -39,6 +41,14 @@ def join_coalesced(
 
 
 def _normalize_group_by(group_by: str | Sequence[str] | None) -> list[str] | None:
+    """Normalize group_by inputs into a list or None.
+
+    Args:
+        group_by: Grouping column or columns.
+
+    Returns:
+        List of column names or None.
+    """
     if group_by is None:
         return None
     if isinstance(group_by, str):
@@ -57,6 +67,15 @@ def _filter_kwargs_for_callable(fn: object, kwargs: dict[str, Any]) -> dict[str,
 
 
 def _expr_from_agg(col: str, agg: Any) -> "pl.Expr":
+    """Build a Polars expression from an aggregation spec.
+
+    Args:
+        col: Column name to aggregate.
+        agg: Aggregation spec (expr, callable, or string).
+
+    Returns:
+        Polars expression.
+    """
     base = pl.col(col)
 
     if isinstance(agg, pl.Expr):
@@ -80,6 +99,14 @@ def _expr_from_agg(col: str, agg: Any) -> "pl.Expr":
 
 
 def _normalize_aggs(agg: AggSpec) -> list["pl.Expr"]:
+    """Normalize aggregation specs into a list of Polars expressions.
+
+    Args:
+        agg: Mapping or sequence of aggregation specs.
+
+    Returns:
+        List of Polars expressions.
+    """
     if isinstance(agg, Mapping):
         return [_expr_from_agg(col, spec) for col, spec in agg.items()]
 
@@ -91,11 +118,27 @@ def _normalize_aggs(agg: AggSpec) -> list["pl.Expr"]:
 
 
 def _is_datetime(dtype: object) -> bool:
+    """Return True when the dtype is a Polars datetime.
+
+    Args:
+        dtype: Polars dtype to inspect.
+
+    Returns:
+        True if dtype is Polars Datetime.
+    """
     # Datetime-only inference (per requirement), version-safe.
     return isinstance(dtype, pl.Datetime)
 
 
 def _infer_time_col(df: "pl.DataFrame") -> str:
+    """Infer the first datetime-like column name from a DataFrame.
+
+    Args:
+        df: Polars DataFrame to inspect.
+
+    Returns:
+        Column name of the first datetime field.
+    """
     # Find first Datetime column in schema order; ignore Date columns.
     for name, dtype in df.schema.items():
         if _is_datetime(dtype):
@@ -106,6 +149,15 @@ def _infer_time_col(df: "pl.DataFrame") -> str:
 
 
 def _ensure_datetime_like(df: "pl.DataFrame", time_col: str) -> "pl.DataFrame":
+    """Ensure a time column is cast to datetime for resampling.
+
+    Args:
+        df: Polars DataFrame.
+        time_col: Column name to validate.
+
+    Returns:
+        DataFrame with time column cast to datetime if needed.
+    """
     dtype = df.schema.get(time_col)
     if dtype is None:
         raise KeyError(f"resample: time_col '{time_col}' not found in DataFrame columns.")
@@ -151,6 +203,14 @@ def _timedelta_to_polars_duration(td: datetime.timedelta) -> str:
 
 
 def _normalize_duration(v: str | datetime.timedelta | None) -> str | None:
+    """Normalize duration inputs to a Polars duration string.
+
+    Args:
+        v: Duration string, timedelta, or None.
+
+    Returns:
+        Normalized duration string or None.
+    """
     if v is None:
         return None
     if isinstance(v, str):
@@ -168,6 +228,18 @@ def _upsample_single(
     offset: str | datetime.timedelta | None,
     keep_group_order: bool,
 ) -> "pl.DataFrame":
+    """Upsample a single DataFrame with normalized duration arguments.
+
+    Args:
+        df: Polars DataFrame to upsample.
+        time_col: Name of the time column.
+        every: Sampling interval.
+        offset: Optional offset interval.
+        keep_group_order: Preserve input order when grouping.
+
+    Returns:
+        Upsampled Polars DataFrame.
+    """
     df = df.sort(time_col)
 
     every_n = _normalize_duration(every)
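The new docstrings make the resampling pipeline explicit: infer or validate a datetime column, normalize `every`/`offset` (strings or `datetime.timedelta`) into Polars duration strings, then delegate to Polars' own upsampling. A small sketch of that normalization step, assuming a recent Polars where `DataFrame.upsample` takes a sorted frame and a duration string:

```python
# Sketch of duration normalization + upsample; assumes a recent Polars
# where DataFrame.upsample(time_column=..., every=...) is available.
import datetime

import polars as pl

df = pl.DataFrame(
    {
        "ts": [
            datetime.datetime(2024, 1, 1, 0, 0),
            datetime.datetime(2024, 1, 1, 0, 3),
        ],
        "value": [1.0, 2.0],
    }
)

# One plausible timedelta -> duration-string normalization: whole seconds.
td = datetime.timedelta(minutes=1)
every = f"{int(td.total_seconds())}s"  # "60s"

out = df.sort("ts").upsample(time_column="ts", every=every)
print(out)  # rows at 00:00 .. 00:03, with nulls in the gap rows
```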
yggdrasil/libs/extensions/spark_extensions.py
CHANGED

@@ -1,3 +1,5 @@
+"""Spark DataFrame extension helpers for aliases and resampling."""
+
 import datetime
 import inspect
 import re
@@ -30,6 +32,15 @@ _COL_RE = re.compile(r"Column<\s*['\"]?`?(.+?)`?['\"]?\s*>")
 
 
 def _require_pyspark(fn_name: str) -> None:
+    """Raise when PySpark is unavailable for a requested helper."""
+    """Raise when PySpark is unavailable for a requested helper.
+
+    Args:
+        fn_name: Name of the calling function.
+
+    Returns:
+        None.
+    """
     if pyspark is None or F is None or T is None:
         raise RuntimeError(
             f"{fn_name} requires PySpark to be available. "
@@ -41,6 +52,15 @@ def getAliases(
     obj: Union[SparkDataFrame, SparkColumn, str, Iterable[Union[SparkDataFrame, SparkColumn, str]]],
     full: bool = True,
 ) -> list[str]:
+    """Return aliases for Spark columns/dataframes or collections.
+
+    Args:
+        obj: Spark DataFrame/Column, string, or iterable of these.
+        full: Whether to return full qualified names.
+
+    Returns:
+        List of alias strings.
+    """
     if obj is None:
         return []
 
@@ -92,6 +112,16 @@ def latest(
     partitionBy: List[Union[str, SparkColumn]],
     orderBy: List[Union[str, SparkColumn]],
 ) -> SparkDataFrame:
+    """Return the latest rows per partition based on ordering.
+
+    Args:
+        df: Spark DataFrame.
+        partitionBy: Columns to partition by.
+        orderBy: Columns to order by.
+
+    Returns:
+        Spark DataFrame with latest rows per partition.
+    """
     _require_pyspark("latest")
 
     partition_col_names = getAliases(partitionBy)
@@ -123,12 +153,30 @@ def _infer_time_col_spark(df: "pyspark.sql.DataFrame") -> str:
 
 
 def _filter_kwargs_for_callable(fn: object, kwargs: dict[str, Any]) -> dict[str, Any]:
+    """Filter kwargs to only those accepted by the callable.
+
+    Args:
+        fn: Callable to inspect.
+        kwargs: Candidate keyword arguments.
+
+    Returns:
+        Filtered keyword arguments.
+    """
     sig = inspect.signature(fn)  # type: ignore[arg-type]
     allowed = set(sig.parameters.keys())
     return {k: v for k, v in kwargs.items() if (k in allowed and v is not None)}
 
 
 def _append_drop_col_to_spark_schema(schema: "T.StructType", drop_col: str) -> "T.StructType":
+    """Ensure the drop column exists in the Spark schema.
+
+    Args:
+        schema: Spark schema to augment.
+        drop_col: Column name to add if missing.
+
+    Returns:
+        Updated Spark schema.
+    """
     _require_pyspark("_append_drop_col_to_spark_schema")
     if drop_col in schema.fieldNames():
         return schema
@@ -169,6 +217,14 @@ def upsample(
     spark_schema = arrow_field_to_spark_field(options.target_field)
 
     def within_group(tb: pa.Table) -> pa.Table:
+        """Apply upsample logic to a grouped Arrow table.
+
+        Args:
+            tb: Arrow table for a grouped partition.
+
+        Returns:
+            Arrow table with upsampled data.
+        """
         res = (
             arrow_table_to_polars_dataframe(tb, options)
             .sort(time_col_name)
@@ -277,6 +333,14 @@ def resample(
     out_options = CastOptions.check_arg(out_arrow_field)
 
     def within_group(tb: pa.Table) -> pa.Table:
+        """Apply resample logic to a grouped Arrow table.
+
+        Args:
+            tb: Arrow table for a grouped partition.
+
+        Returns:
+            Arrow table with resampled data.
+        """
         from .polars_extensions import resample
 
         pdf = arrow_table_to_polars_dataframe(tb, in_options)
@@ -329,6 +393,18 @@ def checkJoin(
     *args,
     **kwargs,
 ):
+    """Join two DataFrames with schema-aware column casting.
+
+    Args:
+        df: Left Spark DataFrame.
+        other: Right Spark DataFrame.
+        on: Join keys or mapping.
+        *args: Positional args passed to join.
+        **kwargs: Keyword args passed to join.
+
+    Returns:
+        Joined Spark DataFrame.
+    """
     _require_pyspark("checkJoin")
 
     other = convert(other, SparkDataFrame)
@@ -371,12 +447,32 @@ def checkMapInArrow(
     *args,
     **kwargs,
 ):
+    """Wrap mapInArrow to enforce output schema conversion.
+
+    Args:
+        df: Spark DataFrame.
+        func: Generator function yielding RecordBatches.
+        schema: Output schema (Spark StructType or DDL string).
+        *args: Positional args passed to mapInArrow.
+        **kwargs: Keyword args passed to mapInArrow.
+
+    Returns:
+        Spark DataFrame with enforced schema.
+    """
     _require_pyspark("mapInArrow")
 
     spark_schema = convert(schema, T.StructType)
     arrow_schema = convert(schema, pa.Field)
 
     def patched(batches: Iterable[pa.RecordBatch]):
+        """Convert batches yielded by user function to the target schema.
+
+        Args:
+            batches: Input RecordBatch iterable.
+
+        Yields:
+            RecordBatch instances conforming to the output schema.
+        """
         for src in func(batches):
             yield convert(src, pa.RecordBatch, arrow_schema)
 
@@ -395,6 +491,18 @@ def checkMapInPandas(
     *args,
     **kwargs,
 ):
+    """Wrap mapInPandas to enforce output schema conversion.
+
+    Args:
+        df: Spark DataFrame.
+        func: Generator function yielding pandas DataFrames.
+        schema: Output schema (Spark StructType or DDL string).
+        *args: Positional args passed to mapInPandas.
+        **kwargs: Keyword args passed to mapInPandas.
+
+    Returns:
+        Spark DataFrame with enforced schema.
+    """
     _require_pyspark("mapInPandas")
 
     import pandas as _pd  # local import so we don't shadow the ..pandas module
@@ -403,6 +511,14 @@ def checkMapInPandas(
     arrow_schema = convert(schema, pa.Field)
 
     def patched(batches: Iterable[_pd.DataFrame]):
+        """Convert pandas batches yielded by user function to the target schema.
+
+        Args:
+            batches: Input pandas DataFrame iterable.
+
+        Yields:
+            pandas DataFrames conforming to the output schema.
+        """
         for src in func(batches):
             yield convert(src, _pd.DataFrame, arrow_schema)
 
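checkMapInArrow and checkMapInPandas apply the same trick: wrap the user's generator so every yielded batch is converted to the declared output schema before Spark consumes it, turning silent schema mismatches into explicit casts. A standalone sketch of that coercion using only pyarrow — yggdrasil's `convert` helper is replaced with a plain Table cast, and the target schema is invented:

```python
# Standalone sketch of the batch-coercion wrapper; uses pyarrow only.
from typing import Iterable, Iterator

import pyarrow as pa

target = pa.schema([("id", pa.int64()), ("name", pa.string())])  # invented


def coerce_batches(batches: Iterable[pa.RecordBatch]) -> Iterator[pa.RecordBatch]:
    """Yield the input batches cast to the target schema."""
    for src in batches:
        # Route through a Table to cast; RecordBatch.cast only exists in
        # newer pyarrow releases.
        yield from pa.Table.from_batches([src]).cast(target).to_batches()


src = pa.RecordBatch.from_pydict(
    {"id": pa.array([1, 2], type=pa.int32()), "name": ["a", "b"]}
)
out = next(coerce_batches([src]))
print(out.schema)  # id: int64, name: string
```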
yggdrasil/libs/pandaslib.py
CHANGED
@@ -1,3 +1,5 @@
+"""Optional pandas dependency helpers."""
+
 try:
     import pandas  # type: ignore
     pandas = pandas
@@ -6,6 +8,11 @@ except ImportError:
 
 
 def require_pandas():
+    """Ensure pandas is available before using pandas helpers.
+
+    Returns:
+        None.
+    """
     if pandas is None:
         raise ImportError(
             "pandas is required to use this function. "
yggdrasil/libs/polarslib.py
CHANGED
@@ -1,3 +1,5 @@
+"""Optional Polars dependency helpers."""
+
 try:
     import polars  # type: ignore
 
@@ -13,6 +15,11 @@ __all__ = [
 
 
 def require_polars():
+    """Ensure polars is available before using polars helpers.
+
+    Returns:
+        None.
+    """
     if polars is None:
         raise ImportError(
             "polars is required to use this function. "
yggdrasil/libs/sparklib.py
CHANGED
@@ -1,3 +1,5 @@
+"""Optional Spark dependency helpers and Arrow/Spark type conversions."""
+
 from typing import Any
 
 import pyarrow as pa
@@ -51,18 +53,23 @@ except ImportError:  # pragma: no cover - Spark not available
     pyspark = None
 
     class SparkSession:
+        """Fallback SparkSession placeholder when pyspark is unavailable."""
 
         @classmethod
         def getActiveSession(cls):
+            """Return None to indicate no active session is available."""
             return None
 
     class SparkDataFrame:
+        """Fallback DataFrame placeholder when pyspark is unavailable."""
         pass
 
     class SparkColumn:
+        """Fallback Column placeholder when pyspark is unavailable."""
         pass
 
     class SparkDataType:
+        """Fallback DataType placeholder when pyspark is unavailable."""
         pass
 
     ARROW_TO_SPARK = {}
@@ -91,6 +98,12 @@ __all__ = [
 def require_pyspark(active_session: bool = False):
     """
     Optionally enforce that pyspark (and an active SparkSession) exists.
+
+    Args:
+        active_session: Require an active SparkSession if True.
+
+    Returns:
+        None.
     """
     if pyspark is None:
         raise ImportError(
@@ -116,6 +129,13 @@ def arrow_type_to_spark_type(
 ) -> "T.DataType":
     """
     Convert a pyarrow.DataType to a pyspark.sql.types.DataType.
+
+    Args:
+        arrow_type: Arrow data type to convert.
+        cast_options: Optional casting options.
+
+    Returns:
+        Spark SQL data type.
     """
     require_pyspark()
 
@@ -191,6 +211,13 @@ def arrow_field_to_spark_field(
 ) -> "T.StructField":
     """
     Convert a pyarrow.Field to a pyspark StructField.
+
+    Args:
+        field: Arrow field to convert.
+        cast_options: Optional casting options.
+
+    Returns:
+        Spark StructField representation.
     """
     spark_type = arrow_type_to_spark_type(field.type, cast_options)
 
@@ -208,6 +235,13 @@ def spark_type_to_arrow_type(
 ) -> pa.DataType:
     """
     Convert a pyspark.sql.types.DataType to a pyarrow.DataType.
+
+    Args:
+        spark_type: Spark SQL data type to convert.
+        cast_options: Optional casting options.
+
+    Returns:
+        Arrow data type.
     """
     require_pyspark()
     from pyspark.sql.types import (
@@ -287,6 +321,13 @@ def spark_field_to_arrow_field(
 ) -> pa.Field:
     """
     Convert a pyspark StructField to a pyarrow.Field.
+
+    Args:
+        field: Spark StructField to convert.
+        cast_options: Optional casting options.
+
+    Returns:
+        Arrow field.
     """
     arrow_type = spark_type_to_arrow_type(field.dataType, cast_options)
 
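sparklib.py centers on a bidirectional Arrow/Spark type mapping plus field-level wrappers that carry name and nullability across. A minimal sketch of the forward direction; the mapping below covers only a few primitives and is illustrative, not the module's actual ARROW_TO_SPARK table:

```python
# Illustrative primitive-only Arrow -> Spark mapping; not the real table.
import pyarrow as pa
from pyspark.sql import types as T

ARROW_TO_SPARK_SAMPLE = {
    pa.int32(): T.IntegerType(),
    pa.int64(): T.LongType(),
    pa.float64(): T.DoubleType(),
    pa.string(): T.StringType(),
    pa.bool_(): T.BooleanType(),
}


def arrow_field_to_spark_field_sketch(field: pa.Field) -> T.StructField:
    """Mirror arrow_field_to_spark_field for primitive types only."""
    spark_type = ARROW_TO_SPARK_SAMPLE[field.type]  # KeyError on nested types
    return T.StructField(field.name, spark_type, nullable=field.nullable)


print(arrow_field_to_spark_field_sketch(pa.field("amount", pa.float64())))
# StructField('amount', DoubleType(), True)
```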
yggdrasil/pyutils/__init__.py
CHANGED
@@ -1,4 +1,8 @@
+"""Python utility helpers for retries, parallelism, and environment management."""
+
 from .retry import retry
 from .parallel import parallelize
 from .python_env import PythonEnv
 from .callable_serde import CallableSerde
+
+__all__ = ["retry", "parallelize", "PythonEnv", "CallableSerde"]