FlowerPower 0.11.5.8__py3-none-any.whl → 0.11.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/fs/ext.py +140 -30
- flowerpower/pipeline/base.py +3 -1
- flowerpower/pipeline/registry.py +9 -9
- flowerpower/plugins/io/base.py +13 -7
- flowerpower/plugins/io/helpers/polars.py +346 -124
- flowerpower/plugins/io/helpers/pyarrow.py +406 -0
- flowerpower/settings/general.py +1 -1
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.dist-info}/METADATA +1 -1
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.dist-info}/RECORD +13 -12
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,346 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
import pandas as pd
|
1
|
+
import numpy as np
|
4
2
|
import polars as pl
|
5
3
|
import polars.selectors as cs
|
6
4
|
|
7
5
|
from .datetime import get_timedelta_str, get_timestamp_column
|
8
6
|
|
9
|
-
#
|
7
|
+
# Regex pattern strings used for content-based type detection (plain strings, not compiled)
|
8
|
+
INTEGER_REGEX = r"^[-+]?\d+$"
|
9
|
+
FLOAT_REGEX = r"^[-+]?(?:\d*[.,])?\d+(?:[eE][-+]?\d+)?$"
|
10
|
+
BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n)$"
|
11
|
+
BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j)$"
|
12
|
+
DATETIME_REGEX = (
|
13
|
+
r"^("
|
14
|
+
r"\d{4}-\d{2}-\d{2}" # ISO: 2023-12-31
|
15
|
+
r"|"
|
16
|
+
r"\d{2}/\d{2}/\d{4}" # US: 12/31/2023
|
17
|
+
r"|"
|
18
|
+
r"\d{2}\.\d{2}\.\d{4}" # German: 31.12.2023
|
19
|
+
r"|"
|
20
|
+
r"\d{8}" # Compact: 20231231
|
21
|
+
r")"
|
22
|
+
r"([ T]\d{2}:\d{2}(:\d{2}(\.\d{1,6})?)?)?" # Optional time: 23:59[:59[.123456]]
|
23
|
+
r"([+-]\d{2}:?\d{2}|Z)?" # Optional timezone: +01:00, -0500, Z
|
24
|
+
r"$"
|
25
|
+
)
|
26
|
+
|
27
|
+
# Float32 range limits
|
28
|
+
F32_MIN = float(np.finfo(np.float32).min)
|
29
|
+
F32_MAX = float(np.finfo(np.float32).max)
|
30
|
+
|
31
|
+
|
32
|
+
def _clean_string_expr(col_name: str) -> pl.Expr:
    """Build an expression that trims a string column and nulls out placeholders.

    Args:
        col_name: Name of the column the expression refers to.

    Returns:
        An expression that strips surrounding characters via
        ``str.strip_chars()`` and then maps the values ``"-"``, ``""`` and
        ``"None"`` to null.
    """
    null_markers = {"-": None, "": None, "None": None}
    trimmed = pl.col(col_name).str.strip_chars()
    return trimmed.replace(null_markers)
|
37
|
+
|
38
|
+
|
39
|
+
def _can_downcast_to_float32(series: pl.Series) -> bool:
    """Tell whether every finite value of ``series`` lies within Float32 range.

    Only the magnitude is checked (against ``F32_MIN`` / ``F32_MAX``); loss of
    precision is not considered.

    Args:
        series: Series whose values are inspected.

    Returns:
        ``True`` if there are no finite values at all, or if the smallest and
        largest finite values both fall inside ``[F32_MIN, F32_MAX]``.
    """
    finite_mask = series.is_finite()
    finite_only = series.filter(finite_mask)
    if finite_only.is_empty():
        # Nothing but nulls / NaN / infinities: no finite value can overflow.
        return True

    lowest = finite_only.min()
    highest = finite_only.max()
    return F32_MIN <= lowest and highest <= F32_MAX
|
47
|
+
|
48
|
+
|
49
|
+
def _optimize_numeric_column(series: pl.Series, col_name: str, shrink: bool) -> pl.Expr:
    """Return the expression used to optimize an already-numeric column.

    Args:
        series: The column's data, used to decide whether shrinking is safe.
        col_name: Name of the column the expression refers to.
        shrink: When ``False`` the column is passed through unchanged.

    Returns:
        ``pl.col(col_name)`` unchanged when shrinking is disabled or when a
        Float64 column holds values outside the Float32 range; otherwise the
        column with ``shrink_dtype()`` applied.
    """
    column = pl.col(col_name)
    if not shrink:
        return column

    # Float64 columns are only shrunk when their finite values fit Float32.
    must_stay_float64 = series.dtype == pl.Float64 and not _can_downcast_to_float32(
        series
    )
    return column if must_stay_float64 else column.shrink_dtype()
|
58
|
+
|
59
|
+
|
60
|
+
def _optimize_string_column(
    series: pl.Series,
    col_name: str,
    shrink_numerics: bool,
    time_zone: str | None = None,
) -> pl.Expr:
    """Pick a target dtype for a string column by inspecting its values.

    The decision is made eagerly on ``series``; the returned expression does
    the actual conversion when it is applied to the frame. Checks run in this
    order: placeholder-only -> boolean -> integer -> float -> datetime. The
    first one that every non-null value satisfies wins.

    Args:
        series: The string column's data.
        col_name: Name of the column the expression refers to.
        shrink_numerics: Whether numeric results are passed through
            ``shrink_dtype()``.
        time_zone: Time zone forwarded to ``str.to_datetime``.

    Returns:
        An expression aliased to ``col_name`` that converts the column, or
        ``pl.col(col_name)`` unchanged when no conversion applies.
    """
    null_markers = {"-": None, "": None, "None": None}
    cleaned_expr = _clean_string_expr(col_name)

    # Analyse exactly the values the expression will produce: strip first,
    # then null out placeholders, then drop the nulls. Replacing before
    # stripping misses padded placeholders such as " - ", and replacing after
    # drop_nulls() re-introduces nulls, so the emptiness check never fires.
    non_null = series.str.strip_chars().replace(null_markers).drop_nulls()
    if non_null.is_empty():
        # Cast the *cleaned* expression: raw placeholder strings such as "-"
        # cannot be cast to Int8 with a strict cast.
        return cleaned_expr.cast(pl.Int8).alias(col_name)

    # Boolean-like strings (checked case-insensitively).
    if non_null.str.to_lowercase().str.contains(BOOLEAN_REGEX).all():
        return (
            cleaned_expr.str.to_lowercase()
            .str.contains(BOOLEAN_TRUE_REGEX)
            .alias(col_name)
        )

    # Integer-like strings.
    if non_null.str.contains(INTEGER_REGEX).all():
        int_expr = cleaned_expr.cast(pl.Int64)
        if shrink_numerics:
            int_expr = int_expr.shrink_dtype()
        return int_expr.alias(col_name)

    # Float-like strings; a decimal comma is normalised to a dot first.
    if non_null.str.contains(FLOAT_REGEX).all():
        float_expr = cleaned_expr.str.replace_all(",", ".").cast(pl.Float64)
        if shrink_numerics:
            # Only shrink when the parsed values fit into the Float32 range.
            parsed_floats = non_null.str.replace_all(",", ".").cast(
                pl.Float64, strict=False
            )
            if _can_downcast_to_float32(parsed_floats):
                float_expr = float_expr.shrink_dtype()
        return float_expr.alias(col_name)

    # Datetime-like strings. The regex only checks the shape, so do a trial
    # parse on the data: with strict=False unparseable values become null,
    # and converting anyway would silently discard them. The trial parse also
    # makes the except clause effective, since building the lazy expression
    # alone never raises a parsing error.
    try:
        if non_null.str.contains(DATETIME_REGEX).all():
            trial = non_null.str.to_datetime(
                strict=False, time_unit="us", time_zone=time_zone
            )
            if trial.null_count() == 0:
                return cleaned_expr.str.to_datetime(
                    strict=False, time_unit="us", time_zone=time_zone
                ).alias(col_name)
    except pl.exceptions.PolarsError:
        # Best effort: if parsing is not possible, keep the column as-is.
        pass

    # Keep original if no conversion applies
    return pl.col(col_name)
|
116
|
+
|
117
|
+
|
118
|
+
def _get_column_expr(
    df: pl.DataFrame, col_name: str, shrink_numerics: bool, time_zone: str | None = None
) -> pl.Expr:
    """Choose the optimization expression for one column of ``df``.

    Args:
        df: Frame that contains the column.
        col_name: Name of the column to inspect.
        shrink_numerics: Forwarded to the numeric / string optimizers.
        time_zone: Forwarded to the string optimizer for datetime parsing.

    Returns:
        A cast to ``pl.Int8`` for a column whose values are all null, the
        result of the numeric or string optimizer for numeric and ``pl.Utf8``
        columns respectively, and ``pl.col(col_name)`` for every other dtype.
    """
    column = df[col_name]

    # A column without any non-null value carries no type information.
    if column.is_null().all():
        return pl.col(col_name).cast(pl.Int8)

    dtype = column.dtype
    if dtype.is_numeric():
        return _optimize_numeric_column(column, col_name, shrink_numerics)
    if dtype == pl.Utf8:
        return _optimize_string_column(column, col_name, shrink_numerics, time_zone)

    # Any other dtype is left untouched.
    return pl.col(col_name)
|
136
|
+
|
137
|
+
|
138
|
+
def opt_dtype(
    df: pl.DataFrame,
    include: str | list[str] | None = None,
    exclude: str | list[str] | None = None,
    time_zone: str | None = None,
    shrink_numerics: bool = True,
) -> pl.DataFrame:
    """
    Optimize the column data types of a Polars DataFrame.

    Each selected column is inspected and replaced by an expression that
    converts it to a more specific type (string columns) or a smaller type
    (numeric columns). All expressions are applied in a single
    ``with_columns`` call.

    Args:
        df: DataFrame to optimize
        include: Column(s) to optimize; names not present in ``df`` are
            ignored. When empty or ``None``, all columns are considered.
        exclude: Column(s) to leave untouched
        time_zone: Optional time zone for datetime parsing
        shrink_numerics: Whether to downcast numeric types when possible

    Returns:
        DataFrame with optimized data types; ``df`` itself when no column is
        selected for processing.
    """
    # Accept a single column name as shorthand for a one-element list.
    include_cols = [include] if isinstance(include, str) else include
    exclude_cols = [exclude] if isinstance(exclude, str) else exclude

    # Start from all columns, then narrow down by include / exclude.
    targets = df.columns
    if include_cols:
        targets = [name for name in include_cols if name in df.columns]
    if exclude_cols:
        targets = [name for name in targets if name not in exclude_cols]

    exprs = [
        _get_column_expr(df, name, shrink_numerics, time_zone) for name in targets
    ]
    if not exprs:
        return df
    return df.with_columns(exprs)
|
183
|
+
|
184
|
+
|
185
|
+
# def opt_dtype(
|
186
|
+
# df: pl.DataFrame,
|
187
|
+
# include: str | list[str] | None = None,
|
188
|
+
# exclude: str | list[str] | None = None,
|
189
|
+
# time_zone: str | None = None,
|
190
|
+
# shrink_numerics: bool = True,
|
191
|
+
# ) -> pl.DataFrame:
|
192
|
+
# """
|
193
|
+
# Analyzes and optimizes the data types of a Polars DataFrame for performance
|
194
|
+
# and memory efficiency.
|
195
|
+
|
196
|
+
# This version includes:
|
197
|
+
# - Robust numeric, boolean, and datetime casting from strings.
|
198
|
+
# - Handling of whitespace and common null-like string values.
|
199
|
+
# - Casting of columns containing only nulls to pl.Int8.
|
200
|
+
# - Optional shrinking of numeric columns to the smallest possible type.
|
201
|
+
|
202
|
+
# Args:
|
203
|
+
# df: The DataFrame to optimize.
|
204
|
+
# include: A list of columns to forcefully include in the optimization.
|
205
|
+
# exclude: A list of columns to exclude from the optimization.
|
206
|
+
# time_zone: Optional time zone for datetime parsing.
|
207
|
+
# shrink_numerics: If True, numeric columns (both existing and newly converted from strings)
|
208
|
+
# will be downcast to the smallest possible type that can hold their values (e.g., Int64 to Int32, Float64 to Float32),
|
209
|
+
# similar to Polars' shrink_dtype() behavior. If False, this shrinking step is skipped.
|
210
|
+
|
211
|
+
# Returns:
|
212
|
+
# An optimized Polars DataFrame with improved data types.
|
213
|
+
# """
|
214
|
+
# # Phase 1: Analysis - Determine columns to process and build a list of
|
215
|
+
# # transformation expressions without executing them immediately.
|
216
|
+
# if isinstance(include, str):
|
217
|
+
# include = [include]
|
218
|
+
# if isinstance(exclude, str):
|
219
|
+
# exclude = [exclude]
|
220
|
+
|
221
|
+
# cols_to_process = df.columns
|
222
|
+
# if include:
|
223
|
+
# cols_to_process = [col for col in include if col in df.columns]
|
224
|
+
# if exclude:
|
225
|
+
# cols_to_process = [col for col in cols_to_process if col not in exclude]
|
226
|
+
|
227
|
+
# expressions = []
|
228
|
+
# for col_name in cols_to_process:
|
229
|
+
# s = df[col_name]
|
230
|
+
|
231
|
+
# # NEW: If a column is entirely null, cast it to Int8 and skip other checks.
|
232
|
+
# if s.is_null().all():
|
233
|
+
# expressions.append(pl.col(col_name).cast(pl.Int8))
|
234
|
+
# continue
|
235
|
+
|
236
|
+
# dtype = s.dtype
|
237
|
+
|
238
|
+
# # 1. Optimize numeric columns by shrinking their size
|
239
|
+
# if dtype.is_numeric():
|
240
|
+
# if shrink_numerics:
|
241
|
+
# if dtype == pl.Float64:
|
242
|
+
# column_series = df[col_name]
|
243
|
+
# finite_values_series = column_series.filter(
|
244
|
+
# column_series.is_finite()
|
245
|
+
# )
|
246
|
+
# can_shrink = True
|
247
|
+
# if not finite_values_series.is_empty():
|
248
|
+
# min_finite_val = finite_values_series.min()
|
249
|
+
# max_finite_val = finite_values_series.max()
|
250
|
+
# if (min_finite_val < F32_MIN_FINITE) or (
|
251
|
+
# max_finite_val > F32_MAX_FINITE
|
252
|
+
# ):
|
253
|
+
# can_shrink = False
|
254
|
+
# if can_shrink:
|
255
|
+
# expressions.append(pl.col(col_name).shrink_dtype())
|
256
|
+
# else:
|
257
|
+
# expressions.append(pl.col(col_name))
|
258
|
+
# else:
|
259
|
+
# expressions.append(pl.col(col_name).shrink_dtype())
|
260
|
+
# else:
|
261
|
+
# expressions.append(pl.col(col_name))
|
262
|
+
# continue
|
263
|
+
|
264
|
+
# # 2. Optimize string columns by casting to more specific types
|
265
|
+
# if dtype == pl.Utf8:
|
266
|
+
# # Create a cleaned column expression that first strips whitespace, then
|
267
|
+
# # replaces common null-like strings.
|
268
|
+
# cleaned_col = (
|
269
|
+
# pl.col(col_name)
|
270
|
+
# .str.strip_chars()
|
271
|
+
# .replace({"-": None, "": None, "None": None})
|
272
|
+
# )
|
273
|
+
|
274
|
+
# # Analyze a stripped, non-null version of the series to decide the cast type
|
275
|
+
# s_non_null = s.drop_nulls()
|
276
|
+
# if len(s_non_null) == 0:
|
277
|
+
# # The column only contains nulls or null-like strings.
|
278
|
+
# # Cast to Int8 as requested for all-null columns.
|
279
|
+
# expressions.append(pl.col(col_name).cast(pl.Int8))
|
280
|
+
# continue
|
281
|
+
|
282
|
+
# s_stripped_non_null = s_non_null.str.strip_chars()
|
283
|
+
|
284
|
+
# # Check for boolean type
|
285
|
+
# if s_stripped_non_null.str.to_lowercase().str.contains(BOOLEAN_REGEX).all():
|
286
|
+
# expr = cleaned_col.str.to_lowercase().str.contains(BOOLEAN_TRUE_REGEX)
|
287
|
+
# expressions.append(expr.alias(col_name))
|
288
|
+
# continue
|
289
|
+
|
290
|
+
# # Check for numeric type
|
291
|
+
# if s_stripped_non_null.str.contains(NUMERIC_REGEX).all():
|
292
|
+
# is_float = s_stripped_non_null.str.contains(r"[.,eE]").any()
|
293
|
+
# numeric_col = cleaned_col.str.replace_all(",", ".")
|
294
|
+
# if is_float:
|
295
|
+
# if shrink_numerics:
|
296
|
+
# temp_float_series = s_stripped_non_null.str.replace_all(
|
297
|
+
# ",", "."
|
298
|
+
# ).cast(pl.Float64, strict=False)
|
299
|
+
# finite_values_series = temp_float_series.filter(
|
300
|
+
# temp_float_series.is_finite()
|
301
|
+
# )
|
302
|
+
# can_shrink = True
|
303
|
+
# if not finite_values_series.is_empty():
|
304
|
+
# min_finite_val = finite_values_series.min()
|
305
|
+
# max_finite_val = finite_values_series.max()
|
306
|
+
# if (min_finite_val < F32_MIN_FINITE) or (
|
307
|
+
# max_finite_val > F32_MAX_FINITE
|
308
|
+
# ):
|
309
|
+
# can_shrink = False
|
310
|
+
# base_expr = numeric_col.cast(pl.Float64)
|
311
|
+
# if can_shrink:
|
312
|
+
# expressions.append(base_expr.shrink_dtype().alias(col_name))
|
313
|
+
# else:
|
314
|
+
# expressions.append(base_expr.alias(col_name))
|
315
|
+
# else:
|
316
|
+
# expressions.append(numeric_col.cast(pl.Float64).alias(col_name))
|
317
|
+
# else:
|
318
|
+
# if shrink_numerics:
|
319
|
+
# expressions.append(
|
320
|
+
# numeric_col.cast(pl.Int64).shrink_dtype().alias(col_name)
|
321
|
+
# )
|
322
|
+
# else:
|
323
|
+
# expressions.append(numeric_col.cast(pl.Int64).alias(col_name))
|
324
|
+
# continue
|
325
|
+
|
326
|
+
# # Check for datetime type using a fast heuristic
|
327
|
+
# try:
|
328
|
+
# if s_stripped_non_null.str.contains(DATETIME_REGEX).all():
|
329
|
+
# expressions.append(
|
330
|
+
# cleaned_col.str.to_datetime(
|
331
|
+
# strict=False, time_unit="us", time_zone=time_zone
|
332
|
+
# ).alias(col_name)
|
333
|
+
# )
|
334
|
+
# continue
|
335
|
+
# except pl.exceptions.PolarsError:
|
336
|
+
# pass
|
337
|
+
|
338
|
+
# # Phase 2: Execution - If any optimizations were identified, apply them
|
339
|
+
# # all at once for maximum parallelism and performance.
|
340
|
+
# if not expressions:
|
341
|
+
# return df
|
342
|
+
|
343
|
+
# return df.with_columns(expressions)
|
10
344
|
|
11
345
|
|
12
346
|
def unnest_all(df: pl.DataFrame, seperator="_", fields: list[str] | None = None):
|
@@ -45,119 +379,6 @@ def unnest_all(df: pl.DataFrame, seperator="_", fields: list[str] | None = None)
|
|
45
379
|
return df
|
46
380
|
|
47
381
|
|
48
|
-
def _opt_dtype(
|
49
|
-
s: pl.Series, strict: bool = True, shrink_dtype: bool = True
|
50
|
-
) -> pl.Series:
|
51
|
-
if s.dtype == pl.Utf8():
|
52
|
-
try:
|
53
|
-
s = s.set(s == "-", None).set(s == "", None).set(s == "None", None)
|
54
|
-
|
55
|
-
# cast string numbers to int or float
|
56
|
-
if (
|
57
|
-
s.str.contains(r"^[-+]?[0-9]*[.,]?[0-9]+([eE][-+]?[0-9]+)?$")
|
58
|
-
| s.is_null()
|
59
|
-
| s.str.contains(r"^$")
|
60
|
-
).all():
|
61
|
-
s = (
|
62
|
-
s.str.replace_all(",", ".")
|
63
|
-
# .str.replace_all("^0{1,}$", "+0")
|
64
|
-
# .str.strip_chars_start("0")
|
65
|
-
.str.replace_all(r"\.0*$", "")
|
66
|
-
)
|
67
|
-
s = s.set(s == "-", None).set(s == "", None).set(s == "None", None)
|
68
|
-
if s.str.contains(r"\.").any() | s.str.contains("NaN").any():
|
69
|
-
s = s.cast(pl.Float64(), strict=True)
|
70
|
-
if shrink_dtype:
|
71
|
-
try:
|
72
|
-
if s.min() >= -16777216 and s.max() <= 16777216:
|
73
|
-
s = s.cast(pl.Float32(), strict=True)
|
74
|
-
except TypeError:
|
75
|
-
# if min or max is None, we cannot cast to Float32
|
76
|
-
pass
|
77
|
-
else:
|
78
|
-
s = s.cast(pl.Int64(), strict=True)
|
79
|
-
if shrink_dtype:
|
80
|
-
s = s.shrink_dtype()
|
81
|
-
|
82
|
-
# cast str to datetime
|
83
|
-
|
84
|
-
elif (
|
85
|
-
s.str.contains(r"^\d{4}-\d{2}-\d{2}$")
|
86
|
-
| s.str.contains(r"^\d{1,2}\/\d{1,2}\/\d{4}$")
|
87
|
-
| s.str.contains(
|
88
|
-
r"^\d{4}-\d{2}-\d{2}T{0,1}\s{0,1}\d{2}:\d{2}(:\d{2})?.\d{0,}$"
|
89
|
-
)
|
90
|
-
| s.str.contains(
|
91
|
-
r"^\d{4}-\d{2}-\d{2}T{0,1}\s{0,1}\d{2}:\d{2}(:\d{2})?\.\d{0,}$"
|
92
|
-
)
|
93
|
-
| s.str.contains(
|
94
|
-
r"^\d{4}-\d{2}-\d{2}T{0,1}\s{0,1}\d{2}:\d{2}(:\d{2})?\.\d{1,}\w{0,1}\+\d{0,2}:\d{0,2}:\d{0,2}$"
|
95
|
-
)
|
96
|
-
| s.is_null()
|
97
|
-
| s.str.contains("^$")
|
98
|
-
).all():
|
99
|
-
s = pl.Series(
|
100
|
-
name=s.name, values=pd.to_datetime(s, format="mixed")
|
101
|
-
).cast(pl.Datetime("us"))
|
102
|
-
|
103
|
-
# cast str to bool
|
104
|
-
elif (
|
105
|
-
s.str.to_lowercase()
|
106
|
-
.str.contains("^(true|false|1|0|wahr|falsch|nein|nok|ok|ja)$")
|
107
|
-
.all()
|
108
|
-
):
|
109
|
-
s = s.str.to_lowercase().str.contains(
|
110
|
-
"^(true|1|wahr|ja|ok)$", strict=True
|
111
|
-
)
|
112
|
-
|
113
|
-
except Exception as e:
|
114
|
-
if strict:
|
115
|
-
e.add_note(
|
116
|
-
"if you were trying to cast Utf8 to temporal dtypes, consider setting `strict=False`"
|
117
|
-
)
|
118
|
-
raise e
|
119
|
-
else:
|
120
|
-
if shrink_dtype:
|
121
|
-
if s.dtype == pl.Float64():
|
122
|
-
try:
|
123
|
-
if s.min() >= -16777216 and s.max() <= 16777216:
|
124
|
-
s = s.cast(pl.Float32(), strict=True)
|
125
|
-
except TypeError:
|
126
|
-
# if min or max is None, we cannot cast to Float32
|
127
|
-
pass
|
128
|
-
|
129
|
-
else:
|
130
|
-
s = s.shrink_dtype()
|
131
|
-
|
132
|
-
return s
|
133
|
-
|
134
|
-
|
135
|
-
def opt_dtype(
|
136
|
-
df: pl.DataFrame,
|
137
|
-
exclude: str | list[str] | None = None,
|
138
|
-
strict: bool = True,
|
139
|
-
include: str | list[str] | None = None,
|
140
|
-
shrink_dtype: bool = True,
|
141
|
-
) -> pl.DataFrame:
|
142
|
-
_opt_dtype_strict = partial(_opt_dtype, strict=strict, shrink_dtype=shrink_dtype)
|
143
|
-
_opt_dtype_not_strict = partial(_opt_dtype, strict=False, shrink_dtype=shrink_dtype)
|
144
|
-
if include is not None:
|
145
|
-
if isinstance(include, str):
|
146
|
-
include = [include]
|
147
|
-
exclude = [col for col in df.columns if col not in include]
|
148
|
-
return (
|
149
|
-
df.with_columns(
|
150
|
-
pl.all()
|
151
|
-
.exclude(exclude)
|
152
|
-
.map_batches(_opt_dtype_strict if strict else _opt_dtype_not_strict)
|
153
|
-
)
|
154
|
-
if exclude is not None
|
155
|
-
else df.with_columns(
|
156
|
-
pl.all().map_batches(_opt_dtype_strict if strict else _opt_dtype_not_strict)
|
157
|
-
)
|
158
|
-
)
|
159
|
-
|
160
|
-
|
161
382
|
def explode_all(df: pl.DataFrame | pl.LazyFrame):
|
162
383
|
list_columns = [col for col in df.columns if df[col].dtype == pl.List]
|
163
384
|
for col in list_columns:
|
@@ -364,8 +585,13 @@ def with_row_count(
|
|
364
585
|
# return df
|
365
586
|
|
366
587
|
|
367
|
-
def
|
368
|
-
|
588
|
+
def drop_null_columns(df: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame | pl.LazyFrame:
    """Remove columns with all null values from the DataFrame.

    Args:
        df: Eager or lazy frame to filter.

    Returns:
        A frame of the same kind as ``df`` containing only the columns that
        hold at least one non-null value. A frame without rows therefore
        keeps no columns.
    """
    if isinstance(df, pl.LazyFrame):
        # A LazyFrame supports neither column indexing nor ``.height``, so
        # compute one "has any non-null value" flag per column in a single
        # aggregate query and select based on that.
        names = df.collect_schema().names()
        if not names:
            return df
        has_values = df.select(pl.all().is_not_null().any()).collect()
        return df.select([name for name in names if has_values[name][0]])

    return df.select([col for col in df.columns if df[col].null_count() < df.height])
|
591
|
+
|
592
|
+
|
593
|
+
def unify_schemas(dfs: list[pl.DataFrame | pl.LazyFrame]) -> pl.Schema:
    """Compute the common schema of several frames.

    The frames are concatenated lazily with ``how="diagonal_relaxed"`` and the
    schema of the result is returned; only the schema is resolved.

    Args:
        dfs: Eager and/or lazy frames whose schemas should be unified.

    Returns:
        The schema of the diagonal-relaxed concatenation of ``dfs``.
    """
    # Every input is converted with .lazy(), so the concatenation is always a
    # LazyFrame; the former eager fallback (``df.schema``) was unreachable.
    combined = pl.concat([frame.lazy() for frame in dfs], how="diagonal_relaxed")
    return combined.collect_schema()
|
@@ -411,7 +637,7 @@ def delta(
|
|
411
637
|
df1 = df1.lazy()
|
412
638
|
|
413
639
|
# cast to equal schema
|
414
|
-
unified_schema =
|
640
|
+
unified_schema = unify_schemas([df1.select(subset), df2.select(subset)])
|
415
641
|
|
416
642
|
df1 = df1.cast_relaxed(unified_schema)
|
417
643
|
df2 = df2.cast_relaxed(unified_schema)
|
@@ -544,10 +770,6 @@ def partition_by(
|
|
544
770
|
return [({}, df)]
|
545
771
|
|
546
772
|
|
547
|
-
def drop_null_columns(df: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame | pl.LazyFrame:
|
548
|
-
return df.select([col for col in df.columns if not df[col].is_null().all()])
|
549
|
-
|
550
|
-
|
551
773
|
pl.DataFrame.unnest_all = unnest_all
|
552
774
|
pl.DataFrame.explode_all = explode_all
|
553
775
|
pl.DataFrame.opt_dtype = opt_dtype
|