FlowerPower 0.11.6.19__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
- flowerpower/cfg/__init__.py +3 -3
- flowerpower/cfg/pipeline/__init__.py +5 -3
- flowerpower/cfg/project/__init__.py +3 -3
- flowerpower/cfg/project/job_queue.py +1 -128
- flowerpower/cli/__init__.py +5 -5
- flowerpower/cli/cfg.py +0 -3
- flowerpower/cli/job_queue.py +401 -133
- flowerpower/cli/pipeline.py +14 -413
- flowerpower/cli/utils.py +0 -1
- flowerpower/flowerpower.py +537 -28
- flowerpower/job_queue/__init__.py +5 -94
- flowerpower/job_queue/base.py +201 -3
- flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -3
- flowerpower/job_queue/rq/manager.py +388 -77
- flowerpower/pipeline/__init__.py +2 -0
- flowerpower/pipeline/base.py +2 -2
- flowerpower/pipeline/io.py +14 -16
- flowerpower/pipeline/manager.py +21 -642
- flowerpower/pipeline/pipeline.py +571 -0
- flowerpower/pipeline/registry.py +242 -10
- flowerpower/pipeline/visualizer.py +1 -2
- flowerpower/plugins/_io/__init__.py +8 -0
- flowerpower/plugins/mqtt/manager.py +6 -6
- flowerpower/settings/backend.py +0 -2
- flowerpower/settings/job_queue.py +1 -57
- flowerpower/utils/misc.py +0 -256
- flowerpower/utils/monkey.py +1 -83
- {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/METADATA +308 -152
- flowerpower-0.20.0.dist-info/RECORD +58 -0
- flowerpower/fs/__init__.py +0 -29
- flowerpower/fs/base.py +0 -662
- flowerpower/fs/ext.py +0 -2143
- flowerpower/fs/storage_options.py +0 -1420
- flowerpower/job_queue/apscheduler/__init__.py +0 -11
- flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
- flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
- flowerpower/job_queue/apscheduler/manager.py +0 -1051
- flowerpower/job_queue/apscheduler/setup.py +0 -554
- flowerpower/job_queue/apscheduler/trigger.py +0 -169
- flowerpower/job_queue/apscheduler/utils.py +0 -311
- flowerpower/pipeline/job_queue.py +0 -583
- flowerpower/pipeline/runner.py +0 -603
- flowerpower/plugins/io/base.py +0 -2520
- flowerpower/plugins/io/helpers/datetime.py +0 -298
- flowerpower/plugins/io/helpers/polars.py +0 -875
- flowerpower/plugins/io/helpers/pyarrow.py +0 -570
- flowerpower/plugins/io/helpers/sql.py +0 -202
- flowerpower/plugins/io/loader/__init__.py +0 -28
- flowerpower/plugins/io/loader/csv.py +0 -37
- flowerpower/plugins/io/loader/deltatable.py +0 -190
- flowerpower/plugins/io/loader/duckdb.py +0 -19
- flowerpower/plugins/io/loader/json.py +0 -37
- flowerpower/plugins/io/loader/mqtt.py +0 -159
- flowerpower/plugins/io/loader/mssql.py +0 -26
- flowerpower/plugins/io/loader/mysql.py +0 -26
- flowerpower/plugins/io/loader/oracle.py +0 -26
- flowerpower/plugins/io/loader/parquet.py +0 -35
- flowerpower/plugins/io/loader/postgres.py +0 -26
- flowerpower/plugins/io/loader/pydala.py +0 -19
- flowerpower/plugins/io/loader/sqlite.py +0 -23
- flowerpower/plugins/io/metadata.py +0 -244
- flowerpower/plugins/io/saver/__init__.py +0 -28
- flowerpower/plugins/io/saver/csv.py +0 -36
- flowerpower/plugins/io/saver/deltatable.py +0 -186
- flowerpower/plugins/io/saver/duckdb.py +0 -19
- flowerpower/plugins/io/saver/json.py +0 -36
- flowerpower/plugins/io/saver/mqtt.py +0 -28
- flowerpower/plugins/io/saver/mssql.py +0 -26
- flowerpower/plugins/io/saver/mysql.py +0 -26
- flowerpower/plugins/io/saver/oracle.py +0 -26
- flowerpower/plugins/io/saver/parquet.py +0 -36
- flowerpower/plugins/io/saver/postgres.py +0 -26
- flowerpower/plugins/io/saver/pydala.py +0 -20
- flowerpower/plugins/io/saver/sqlite.py +0 -24
- flowerpower/utils/scheduler.py +0 -311
- flowerpower-0.11.6.19.dist-info/RECORD +0 -102
- {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/top_level.txt +0 -0
flowerpower/plugins/io/helpers/polars.py
@@ -1,875 +0,0 @@
-import numpy as np
-import polars as pl
-import polars.selectors as cs
-
-from .datetime import get_timedelta_str, get_timestamp_column
-
-# Pre-compiled regex patterns (identical to original)
-INTEGER_REGEX = r"^[-+]?\d+$"
-FLOAT_REGEX = r"^[-+]?(?:\d*[.,])?\d+(?:[eE][-+]?\d+)?$"
-BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n|ok|nok)$"
-BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j|ok)$"
-DATETIME_REGEX = (
-    r"^("
-    r"\d{4}-\d{2}-\d{2}"  # ISO: 2023-12-31
-    r"|"
-    r"\d{2}/\d{2}/\d{4}"  # US: 12/31/2023
-    r"|"
-    r"\d{2}\.\d{2}\.\d{4}"  # German: 31.12.2023
-    r"|"
-    r"\d{8}"  # Compact: 20231231
-    r")"
-    r"([ T]\d{2}:\d{2}(:\d{2}(\.\d{1,6})?)?)?"  # Optional time: 23:59[:59[.123456]]
-    r"([+-]\d{2}:?\d{2}|Z)?"  # Optional timezone: +01:00, -0500, Z
-    r"$"
-)
-
-# Float32 range limits
-F32_MIN = float(np.finfo(np.float32).min)
-F32_MAX = float(np.finfo(np.float32).max)
-
-
-def _clean_string_expr(col_name: str) -> pl.Expr:
-    """Create expression to clean string values."""
-    return (
-        pl.col(col_name)
-        .str.strip_chars()
-        .replace({
-            "-": None,
-            "": None,
-            "None": None,
-            "none": None,
-            "NONE": None,
-            "NaN": None,
-            "Nan": None,
-            "nan": None,
-            "NAN": None,
-            "N/A": None,
-            "n/a": None,
-            "null": None,
-            "Null": None,
-            "NULL": None,
-        })
-    )
-
-
-def _can_downcast_to_float32(series: pl.Series) -> bool:
-    """Check if float values are within Float32 range."""
-    finite_values = series.filter(series.is_finite())
-    if finite_values.is_empty():
-        return True
-
-    min_val, max_val = finite_values.min(), finite_values.max()
-    return F32_MIN <= min_val <= max_val <= F32_MAX
-
-
-def _optimize_numeric_column(
-    series: pl.Series,
-    shrink: bool,
-    allow_unsigned: bool = True,
-    allow_null: bool = True,
-) -> pl.Expr:
-    """Optimize numeric column types, optionally converting to unsigned if all values >= 0."""
-    col_name = series.name
-    expr = pl.col(col_name)
-    dtype = series.dtype
-    if series.is_null().all():
-        # If all values are null, cast to Null type if allow_null is True
-        if allow_null:
-            return expr.cast(pl.Null())
-
-    if not allow_unsigned:
-        # If unsigned types are not allowed, ensure we use signed integer types
-        if dtype.is_integer() and not dtype.is_signed_integer():
-            return expr.cast(pl.Int64)
-
-    if (
-        allow_unsigned
-        and dtype.is_integer()
-        and (series.min() is not None)
-        and series.min() >= 0
-    ):
-        # Convert to unsigned integer type, shrink if requested
-        if shrink:
-            return expr.cast(pl.UInt64).shrink_dtype()
-        else:
-            return expr.cast(pl.UInt64)
-
-    if not shrink:
-        return expr
-
-    if dtype == pl.Float64 and not _can_downcast_to_float32(series):
-        return expr
-
-    return expr.shrink_dtype()
-
-
-def _optimize_string_column(
-    series: pl.Series,
-    shrink_numerics: bool,
-    time_zone: str | None = None,
-    allow_null: bool = True,
-    allow_unsigned: bool = True,
-) -> pl.Expr:
-    """Convert string column to appropriate type based on content analysis."""
-    # Return early for empty or null-only series
-    col_name = series.name
-    cleaned_expr = _clean_string_expr(col_name)
-    non_null = series.drop_nulls()
-    if non_null.is_empty():
-        if allow_null:
-            return pl.col(col_name).cast(pl.Null())
-        else:
-            return pl.col(col_name).cast(series.dtype)
-
-    stripped = non_null.str.strip_chars()
-    lowercase = stripped.str.to_lowercase()
-
-    # Check for boolean values
-    if lowercase.str.contains(BOOLEAN_REGEX).all(ignore_nulls=False):
-        return (
-            cleaned_expr.str.to_lowercase()
-            .str.contains(BOOLEAN_TRUE_REGEX)
-            .alias(col_name)
-        )
-
-    elif stripped.str.contains(INTEGER_REGEX).all(ignore_nulls=False):
-        int_expr = cleaned_expr.cast(pl.Int64).alias(col_name)
-        return (
-            int_expr.shrink_dtype().alias(col_name)
-            if shrink_numerics
-            else int_expr.alias(col_name)
-        )
-
-    # Check for numeric values
-    elif stripped.str.contains(FLOAT_REGEX).all(ignore_nulls=False):
-        float_expr = (
-            cleaned_expr.str.replace_all(",", ".").cast(pl.Float64).alias(col_name)
-        )
-
-        if shrink_numerics:
-            # Check if values can fit in Float32
-            temp_floats = stripped.str.replace_all(",", ".").cast(
-                pl.Float64, strict=False
-            )
-            if _can_downcast_to_float32(temp_floats):
-                return float_expr.shrink_dtype().alias(col_name)
-
-        return float_expr
-
-    try:
-        if stripped.str.contains(DATETIME_REGEX).all(ignore_nulls=False):
-            return cleaned_expr.str.to_datetime(
-                strict=False, time_unit="us", time_zone=time_zone
-            ).alias(col_name)
-    except pl.exceptions.PolarsError:
-        pass
-
-    # Keep original if no conversion applies
-    return pl.col(col_name)
-
-
-def _get_column_expr(
-    df: pl.DataFrame,
-    col_name: str,
-    shrink_numerics: bool = True,
-    allow_unsigned: bool = True,
-    allow_null: bool = True,
-    time_zone: str | None = None,
-) -> pl.Expr:
-    """Generate optimization expression for a single column."""
-    series = df[col_name]
-
-    # Handle all-null columns
-    if series.is_null().all():
-        if allow_null:
-            # If all values are null, cast to Null type if allow_null is True
-            return pl.col(col_name).cast(pl.Null())
-
-    # Process based on current type
-    if series.dtype.is_numeric():
-        return _optimize_numeric_column(
-            series, shrink_numerics, allow_unsigned, allow_null
-        )
-    elif series.dtype == pl.Utf8:
-        return _optimize_string_column(
-            series, shrink_numerics, time_zone, allow_null, allow_unsigned
-        )
-
-    # Keep original for other types
-    return pl.col(col_name)
-
-
-def opt_dtype(
-    df: pl.DataFrame,
-    include: str | list[str] | None = None,
-    exclude: str | list[str] | None = None,
-    time_zone: str | None = None,
-    shrink_numerics: bool = True,
-    allow_unsigned: bool = True,
-    allow_null: bool = True,
-    strict: bool = False,
-) -> pl.DataFrame:
-    """
-    Optimize data types of a Polars DataFrame for performance and memory efficiency.
-
-    This function analyzes each column and converts it to the most appropriate
-    data type based on content, handling string-to-type conversions and
-    numeric type downcasting.
-
-    Args:
-        df: DataFrame to optimize
-        include: Column(s) to include in optimization (default: all columns)
-        exclude: Column(s) to exclude from optimization
-        time_zone: Optional time zone for datetime parsing
-        shrink_numerics: Whether to downcast numeric types when possible
-        allow_unsigned: Whether to allow unsigned integer types
-        allow_null: Whether to allow columns with all null values to be cast to Null type
-        strict: If True, will raise an error if any column cannot be optimized
-
-    Returns:
-        DataFrame with optimized data types
-    """
-    # Normalize include/exclude parameters
-    if isinstance(include, str):
-        include = [include]
-    if isinstance(exclude, str):
-        exclude = [exclude]
-
-    # Determine columns to process
-    cols_to_process = df.columns
-    if include:
-        cols_to_process = [col for col in include if col in df.columns]
-    if exclude:
-        cols_to_process = [col for col in cols_to_process if col not in exclude]
-
-    # Generate optimization expressions for all columns
-    expressions = []
-    for col_name in cols_to_process:
-        try:
-            expressions.append(
-                _get_column_expr(
-                    df, col_name, shrink_numerics, allow_unsigned, allow_null, time_zone
-                )
-            )
-        except Exception as e:
-            if strict:
-                raise e
-            # If strict mode is off, just keep the original column
-            continue
-
-    # Apply all transformations at once if any exist
-    return df if not expressions else df.with_columns(expressions)
-
-
-# def opt_dtype(
-#     df: pl.DataFrame,
-#     include: str | list[str] | None = None,
-#     exclude: str | list[str] | None = None,
-#     time_zone: str | None = None,
-#     shrink_numerics: bool = True,
-# ) -> pl.DataFrame:
-#     """
-#     Analyzes and optimizes the data types of a Polars DataFrame for performance
-#     and memory efficiency.
-
-#     This version includes:
-#     - Robust numeric, boolean, and datetime casting from strings.
-#     - Handling of whitespace and common null-like string values.
-#     - Casting of columns containing only nulls to pl.Int8.
-#     - Optional shrinking of numeric columns to the smallest possible type.
-
-#     Args:
-#         df: The DataFrame to optimize.
-#         include: A list of columns to forcefully include in the optimization.
-#         exclude: A list of columns to exclude from the optimization.
-#         time_zone: Optional time zone for datetime parsing.
-#         shrink_numerics: If True, numeric columns (both existing and newly converted from strings)
-#             will be downcast to the smallest possible type that can hold their values (e.g., Int64 to Int32, Float64 to Float32),
-#             similar to Polars' shrink_dtype() behavior. If False, this shrinking step is skipped.
-
-#     Returns:
-#         An optimized Polars DataFrame with improved data types.
-#     """
-#     # Phase 1: Analysis - Determine columns to process and build a list of
-#     # transformation expressions without executing them immediately.
-#     if isinstance(include, str):
-#         include = [include]
-#     if isinstance(exclude, str):
-#         exclude = [exclude]
-
-#     cols_to_process = df.columns
-#     if include:
-#         cols_to_process = [col for col in include if col in df.columns]
-#     if exclude:
-#         cols_to_process = [col for col in cols_to_process if col not in exclude]
-
-#     expressions = []
-#     for col_name in cols_to_process:
-#         s = df[col_name]
-
-#         # NEW: If a column is entirely null, cast it to Int8 and skip other checks.
-#         if s.is_null().all():
-#             expressions.append(pl.col(col_name).cast(pl.Int8))
-#             continue
-
-#         dtype = s.dtype
-
-#         # 1. Optimize numeric columns by shrinking their size
-#         if dtype.is_numeric():
-#             if shrink_numerics:
-#                 if dtype == pl.Float64:
-#                     column_series = df[col_name]
-#                     finite_values_series = column_series.filter(
-#                         column_series.is_finite()
-#                     )
-#                     can_shrink = True
-#                     if not finite_values_series.is_empty():
-#                         min_finite_val = finite_values_series.min()
-#                         max_finite_val = finite_values_series.max()
-#                         if (min_finite_val < F32_MIN_FINITE) or (
-#                             max_finite_val > F32_MAX_FINITE
-#                         ):
-#                             can_shrink = False
-#                     if can_shrink:
-#                         expressions.append(pl.col(col_name).shrink_dtype())
-#                     else:
-#                         expressions.append(pl.col(col_name))
-#                 else:
-#                     expressions.append(pl.col(col_name).shrink_dtype())
-#             else:
-#                 expressions.append(pl.col(col_name))
-#             continue
-
-#         # 2. Optimize string columns by casting to more specific types
-#         if dtype == pl.Utf8:
-#             # Create a cleaned column expression that first strips whitespace, then
-#             # replaces common null-like strings.
-#             cleaned_col = (
-#                 pl.col(col_name)
-#                 .str.strip_chars()
-#                 .replace({"-": None, "": None, "None": None})
-#             )
-
-#             # Analyze a stripped, non-null version of the series to decide the cast type
-#             s_non_null = s.drop_nulls()
-#             if len(s_non_null) == 0:
-#                 # The column only contains nulls or null-like strings.
-#                 # Cast to Int8 as requested for all-null columns.
-#                 expressions.append(pl.col(col_name).cast(pl.Int8))
-#                 continue
-
-#             s_stripped_non_null = s_non_null.str.strip_chars()
-
-#             # Check for boolean type
-#             if s_stripped_non_null.str.to_lowercase().str.contains(BOOLEAN_REGEX).all():
-#                 expr = cleaned_col.str.to_lowercase().str.contains(BOOLEAN_TRUE_REGEX)
-#                 expressions.append(expr.alias(col_name))
-#                 continue
-
-#             # Check for numeric type
-#             if s_stripped_non_null.str.contains(NUMERIC_REGEX).all():
-#                 is_float = s_stripped_non_null.str.contains(r"[.,eE]").any()
-#                 numeric_col = cleaned_col.str.replace_all(",", ".")
-#                 if is_float:
-#                     if shrink_numerics:
-#                         temp_float_series = s_stripped_non_null.str.replace_all(
-#                             ",", "."
-#                         ).cast(pl.Float64, strict=False)
-#                         finite_values_series = temp_float_series.filter(
-#                             temp_float_series.is_finite()
-#                         )
-#                         can_shrink = True
-#                         if not finite_values_series.is_empty():
-#                             min_finite_val = finite_values_series.min()
-#                             max_finite_val = finite_values_series.max()
-#                             if (min_finite_val < F32_MIN_FINITE) or (
-#                                 max_finite_val > F32_MAX_FINITE
-#                             ):
-#                                 can_shrink = False
-#                         base_expr = numeric_col.cast(pl.Float64)
-#                         if can_shrink:
-#                             expressions.append(base_expr.shrink_dtype().alias(col_name))
-#                         else:
-#                             expressions.append(base_expr.alias(col_name))
-#                     else:
-#                         expressions.append(numeric_col.cast(pl.Float64).alias(col_name))
-#                 else:
-#                     if shrink_numerics:
-#                         expressions.append(
-#                             numeric_col.cast(pl.Int64).shrink_dtype().alias(col_name)
-#                         )
-#                     else:
-#                         expressions.append(numeric_col.cast(pl.Int64).alias(col_name))
-#                 continue
-
-#             # Check for datetime type using a fast heuristic
-#             try:
-#                 if s_stripped_non_null.str.contains(DATETIME_REGEX).all():
-#                     expressions.append(
-#                         cleaned_col.str.to_datetime(
-#                             strict=False, time_unit="us", time_zone=time_zone
-#                         ).alias(col_name)
-#                     )
-#                     continue
-#             except pl.exceptions.PolarsError:
-#                 pass
-
-#     # Phase 2: Execution - If any optimizations were identified, apply them
-#     # all at once for maximum parallelism and performance.
-#     if not expressions:
-#         return df
-
-#     return df.with_columns(expressions)
-
-
-def unnest_all(df: pl.DataFrame, seperator="_", fields: list[str] | None = None):
-    def _unnest_all(struct_columns):
-        if fields is not None:
-            return (
-                df.with_columns([
-                    pl.col(col).struct.rename_fields([
-                        f"{col}{seperator}{field_name}"
-                        for field_name in df[col].struct.fields
-                    ])
-                    for col in struct_columns
-                ])
-                .unnest(struct_columns)
-                .select(
-                    list(set(df.columns) - set(struct_columns))
-                    + sorted([
-                        f"{col}{seperator}{field_name}"
-                        for field_name in fields
-                        for col in struct_columns
-                    ])
-                )
-            )
-
-        return df.with_columns([
-            pl.col(col).struct.rename_fields([
-                f"{col}{seperator}{field_name}" for field_name in df[col].struct.fields
-            ])
-            for col in struct_columns
-        ]).unnest(struct_columns)
-
-    struct_columns = [col for col in df.columns if df[col].dtype == pl.Struct]  # noqa: F821
-    while len(struct_columns):
-        df = _unnest_all(struct_columns=struct_columns)
-        struct_columns = [col for col in df.columns if df[col].dtype == pl.Struct]
-    return df
-
-
-def explode_all(df: pl.DataFrame | pl.LazyFrame):
-    list_columns = [col for col in df.columns if df[col].dtype == pl.List]
-    for col in list_columns:
-        df = df.explode(col)
-    return df
-
-
-def with_strftime_columns(
-    df: pl.DataFrame | pl.LazyFrame,
-    strftime: str | list[str],
-    timestamp_column: str = "auto",
-    column_names: str | list[str] | None = None,
-):
-    if timestamp_column is None or timestamp_column == "auto":
-        timestamp_column = get_timestamp_column(df)
-        if len(timestamp_column):
-            timestamp_column = timestamp_column[0]
-
-    if timestamp_column is None:
-        raise ValueError("timestamp_column is not specified nor found in the dataframe")
-
-    if isinstance(strftime, str):
-        strftime = [strftime]
-    if isinstance(column_names, str):
-        column_names = [column_names]
-
-    if column_names is None:
-        column_names = [
-            f"_strftime_{strftime_.replace('%', '').replace('-', '_')}_"
-            for strftime_ in strftime
-        ]
-    # print("timestamp_column, with_strftime_columns", timestamp_column)
-    return opt_dtype(
-        df.with_columns([
-            pl.col(timestamp_column)
-            .dt.strftime(strftime_)
-            .fill_null(0)
-            .alias(column_name)
-            for strftime_, column_name in zip(strftime, column_names)
-        ]),
-        include=column_names,
-        strict=False,
-    )
-
-
-def with_truncated_columns(
-    df: pl.DataFrame | pl.LazyFrame,
-    truncate_by: str | list[str],
-    timestamp_column: str = "auto",
-    column_names: str | list[str] | None = None,
-):
-    if timestamp_column is None or timestamp_column == "auto":
-        timestamp_column = get_timestamp_column(df)
-        if len(timestamp_column):
-            timestamp_column = timestamp_column[0]
-
-    if timestamp_column is None:
-        raise ValueError(
-            "timestamp_column is not specified nor found in the dataframe"
-        )
-    if isinstance(truncate_by, str):
-        truncate_by = [truncate_by]
-
-    if isinstance(column_names, str):
-        column_names = [column_names]
-
-    if column_names is None:
-        column_names = [
-            f"_truncated_{truncate_.replace(' ', '_')}_" for truncate_ in truncate_by
-        ]
-
-    truncate_by = [
-        get_timedelta_str(truncate_, to="polars") for truncate_ in truncate_by
-    ]
-    return df.with_columns([
-        pl.col(timestamp_column).dt.truncate(truncate_).alias(column_name)
-        for truncate_, column_name in zip(truncate_by, column_names)
-    ])
-
-
-def with_datepart_columns(
-    df: pl.DataFrame | pl.LazyFrame,
-    timestamp_column: str = "auto",
-    year: bool = False,
-    month: bool = False,
-    week: bool = False,
-    yearday: bool = False,
-    monthday: bool = False,
-    day: bool = False,
-    weekday: bool = False,
-    hour: bool = False,
-    minute: bool = False,
-    strftime: str | None = None,
-):
-    if strftime:
-        if isinstance(strftime, str):
-            strftime = [strftime]
-        column_names = [
-            f"_strftime_{strftime_.replace('%', '').replace('-', '_')}_"
-            for strftime_ in strftime
-        ]
-    else:
-        strftime = []
-        column_names = []
-
-    if year:
-        strftime.append("%Y")
-        column_names.append("year")
-    if month:
-        strftime.append("%m")
-        column_names.append("month")
-    if week:
-        strftime.append("%W")
-        column_names.append("week")
-    if yearday:
-        strftime.append("%j")
-        column_names.append("year_day")
-    if monthday:
-        strftime.append("%d")
-        column_names.append("day")
-    if day:
-        strftime.append("%d")
-        column_names.append("day")
-    if weekday:
-        strftime.append("%a")
-        column_names.append("week_day")
-    if hour:
-        strftime.append("%H")
-        column_names.append("hour")
-    if minute:
-        strftime.append("%M")
-        column_names.append("minute")
-
-    column_names = [col for col in column_names if col not in df.columns]
-    # print("timestamp_column, with_datepart_columns", timestamp_column)
-    return with_strftime_columns(
-        df=df,
-        timestamp_column=timestamp_column,
-        strftime=strftime,
-        column_names=column_names,
-    )
-
-
-def with_row_count(
-    df: pl.DataFrame | pl.LazyFrame,
-    over: str | list[str] | None = None,
-):
-    if over:
-        if len(over) == 0:
-            over = None
-
-    if isinstance(over, str):
-        over = [over]
-
-    if over:
-        return df.with_columns(pl.lit(1).alias("row_nr")).with_columns(
-            pl.col("row_nr").cum_sum().over(over)
-        )
-    else:
-        return df.with_columns(pl.lit(1).alias("row_nr")).with_columns(
-            pl.col("row_nr").cum_sum()
-        )
-
-
-# def delta(
-#     df1: pl.DataFrame | pl.LazyFrame,
-#     df2: pl.DataFrame | pl.LazyFrame,
-#     subset: str | list[str] | None = None,
-#     eager: bool = False,
-# ) -> pl.LazyFrame:
-#     columns = sorted(set(df1.columns) & set(df2.columns))
-
-#     if subset is None:
-#         subset = columns
-#     if isinstance(subset, str):
-#         subset = [subset]
-
-#     subset = sorted(set(columns) & set(subset))
-
-#     if isinstance(df1, pl.LazyFrame) and isinstance(df2, pl.DataFrame):
-#         df2 = df2.lazy()
-
-#     elif isinstance(df1, pl.DataFrame) and isinstance(df2, pl.LazyFrame):
-#         df1 = df1.lazy()
-
-#     df = (
-#         pl.concat(
-#             [
-#                 df1.select(columns)
-#                 .with_columns(pl.lit(1).alias("df"))
-#                 .with_row_count(),
-#                 df2.select(columns)
-#                 .with_columns(pl.lit(2).alias("df"))
-#                 .with_row_count(),
-#             ],
-#             how="vertical_relaxed",
-#         )
-#         .filter((pl.count().over(subset) == 1) & (pl.col("df") == 1))
-#         .select(pl.exclude(["df", "row_nr"]))
-#     )
-
-#     if eager and isinstance(df, pl.LazyFrame):
-#         return df.collect()
-#     return df
-
-
-def drop_null_columns(df: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame | pl.LazyFrame:
-    """Remove columns with all null values from the DataFrame."""
-    return df.select([col for col in df.columns if df[col].null_count() < df.height])
-
-
-def unify_schemas(dfs: list[pl.DataFrame | pl.LazyFrame]) -> pl.Schema:
-    df = pl.concat([df.lazy() for df in dfs], how="diagonal_relaxed")
-    if isinstance(df, pl.LazyFrame):
-        return df.collect_schema()
-    return df.schema
-
-
-def cast_relaxed(
-    df: pl.DataFrame | pl.LazyFrame, schema: pl.Schema
-) -> pl.DataFrame | pl.LazyFrame:
-    if isinstance(df, pl.LazyFrame):
-        columns = df.collect_schema().names()
-    else:
-        columns = df.schema.names()
-    new_columns = [col for col in schema.names() if col not in columns]
-    if len(new_columns):
-        return df.with_columns([
-            pl.lit(None).alias(new_col) for new_col in new_columns
-        ]).cast(schema)
-    return df.cast(schema)
-
-
-def delta(
-    df1: pl.DataFrame | pl.LazyFrame,
-    df2: pl.DataFrame | pl.LazyFrame,
-    subset: list[str] | None = None,
-    eager: bool = False,
-) -> pl.DataFrame:
-    s1 = df1.select(~cs.by_dtype(pl.Null())).collect_schema()
-    s2 = df2.select(~cs.by_dtype(pl.Null())).collect_schema()
-
-    columns = sorted(set(s1.names()) & set(s2.names()))
-
-    if subset is None:
-        subset = df1.columns
-    if isinstance(subset, str):
-        subset = [subset]
-
-    subset = sorted(set(columns) & set(subset))
-
-    if isinstance(df1, pl.LazyFrame) and isinstance(df2, pl.DataFrame):
-        df2 = df2.lazy()
-
-    elif isinstance(df1, pl.DataFrame) and isinstance(df2, pl.LazyFrame):
-        df1 = df1.lazy()
-
-    # cast to equal schema
-    unified_schema = unify_schemas([df1.select(subset), df2.select(subset)])
-
-    df1 = df1.cast_relaxed(unified_schema)
-    df2 = df2.cast_relaxed(unified_schema)
-
-    df = df1.join(df2, on=subset, how="anti", join_nulls=True)
-
-    if eager and isinstance(df, pl.LazyFrame):
-        return df.collect()
-
-    return df
-
-
-def partition_by(
-    df: pl.DataFrame | pl.LazyFrame,
-    timestamp_column: str | None = None,
-    columns: str | list[str] | None = None,
-    strftime: str | list[str] | None = None,
-    timedelta: str | list[str] | None = None,
-    num_rows: int | None = None,
-) -> list[tuple[dict, pl.DataFrame | pl.LazyFrame]]:
-    if columns is not None:
-        if isinstance(columns, str):
-            columns = [columns]
-        columns_ = columns.copy()
-    else:
-        columns_ = []
-
-    drop_columns = columns_.copy()
-
-    if strftime is not None:
-        if isinstance(strftime, str):
-            strftime = [strftime]
-
-        df = df.with_strftime_columns(
-            timestamp_column=timestamp_column, strftime=strftime
-        )
-        strftime_columns = [
-            f"_strftime_{strftime_.replace('%', '').replace('-', '_')}_"
-            for strftime_ in strftime
-        ]
-        columns_ += strftime_columns
-        drop_columns += strftime_columns
-
-    if timedelta is not None:
-        if isinstance(timedelta, str):
-            timedelta = [timedelta]
-
-        df = df.with_duration_columns(
-            timestamp_column=timestamp_column, timedelta=timedelta
-        )
-        timedelta_columns = [f"_timedelta_{timedelta_}_" for timedelta_ in timedelta]
-        columns_ += timedelta_columns
-        drop_columns += timedelta_columns
-
-    if columns_:
-        # datetime_columns = {
-        #     col: col in [col.lower() for col in columns_]
-        #     for col in [
-        #         "year",
-        #         "month",
-        #         "week",
-        #         "yearday",
-        #         "monthday",
-        #         "weekday",
-        #         "strftime",
-        #     ]
-        #     if col not in [table_col.lower() for table_col in df.columns]
-        # }
-        datetime_columns = [
-            col.lower()
-            for col in columns_
-            if col
-            in [
-                "year",
-                "month",
-                "week",
-                "yearday",
-                "monthday",
-                "weekday",
-                "day",
-                "hour",
-                "minute",
-                "strftime",
-            ]
-            and col not in df.columns
-        ]
-
-        datetime_columns = {
-            col: col in datetime_columns
-            for col in [
-                "year",
-                "month",
-                "week",
-                "yearday",
-                "monthday",
-                "weekday",
-                "day",
-                "hour",
-                "minute",
-                "strftime",
-            ]
-        }
-        if any(datetime_columns.values()):
-            df = df.with_datepart_columns(
-                timestamp_column=timestamp_column, **datetime_columns
-            )
-
-    if isinstance(df, pl.LazyFrame):
-        df = df.collect()
-    columns_ = [col for col in columns_ if col in df.columns]
-
-    if num_rows is not None:
-        df = df.with_row_count_ext(over=columns_).with_columns(
-            (pl.col("row_nr") - 1) // num_rows
-        )
-        columns_ += ["row_nr"]
-        drop_columns += ["row_nr"]
-
-    if columns_:
-        partitions = [
-            (p.select(columns_).unique().to_dicts()[0], p.drop(drop_columns))
-            for p in df.partition_by(
-                by=columns_,
-                as_dict=False,
-                maintain_order=True,
-            )
-        ]
-
-        return partitions
-
-    return [({}, df)]
-
-
-pl.DataFrame.unnest_all = unnest_all
-pl.DataFrame.explode_all = explode_all
-pl.DataFrame.opt_dtype = opt_dtype
-pl.DataFrame.with_row_count_ext = with_row_count
-pl.DataFrame.with_datepart_columns = with_datepart_columns
-pl.DataFrame.with_duration_columns = with_truncated_columns
-pl.DataFrame.with_strftime_columns = with_strftime_columns
-pl.DataFrame.cast_relaxed = cast_relaxed
-pl.DataFrame.delta = delta
-pl.DataFrame.partition_by_ext = partition_by
-pl.DataFrame.drop_null_columns = drop_null_columns
-
-pl.LazyFrame.unnest_all = unnest_all
-pl.LazyFrame.explode_all = explode_all
-pl.LazyFrame.opt_dtype = opt_dtype
-pl.LazyFrame.with_row_count_ext = with_row_count
-pl.LazyFrame.with_datepart_columns = with_datepart_columns
-pl.LazyFrame.with_duration_columns = with_truncated_columns
-pl.LazyFrame.with_strftime_columns = with_strftime_columns
-pl.LazyFrame.delta = delta
-pl.LazyFrame.cast_relaxed = cast_relaxed
-pl.LazyFrame.partition_by_ext = partition_by
-pl.LazyFrame.drop_null_columns = drop_null_columns