FlowerPower 0.11.6.19__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. flowerpower/cfg/__init__.py +3 -3
  2. flowerpower/cfg/pipeline/__init__.py +5 -3
  3. flowerpower/cfg/project/__init__.py +3 -3
  4. flowerpower/cfg/project/job_queue.py +1 -128
  5. flowerpower/cli/__init__.py +5 -5
  6. flowerpower/cli/cfg.py +0 -3
  7. flowerpower/cli/job_queue.py +401 -133
  8. flowerpower/cli/pipeline.py +14 -413
  9. flowerpower/cli/utils.py +0 -1
  10. flowerpower/flowerpower.py +537 -28
  11. flowerpower/job_queue/__init__.py +5 -94
  12. flowerpower/job_queue/base.py +201 -3
  13. flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -3
  14. flowerpower/job_queue/rq/manager.py +388 -77
  15. flowerpower/pipeline/__init__.py +2 -0
  16. flowerpower/pipeline/base.py +2 -2
  17. flowerpower/pipeline/io.py +14 -16
  18. flowerpower/pipeline/manager.py +21 -642
  19. flowerpower/pipeline/pipeline.py +571 -0
  20. flowerpower/pipeline/registry.py +242 -10
  21. flowerpower/pipeline/visualizer.py +1 -2
  22. flowerpower/plugins/_io/__init__.py +8 -0
  23. flowerpower/plugins/mqtt/manager.py +6 -6
  24. flowerpower/settings/backend.py +0 -2
  25. flowerpower/settings/job_queue.py +1 -57
  26. flowerpower/utils/misc.py +0 -256
  27. flowerpower/utils/monkey.py +1 -83
  28. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/METADATA +308 -152
  29. flowerpower-0.20.0.dist-info/RECORD +58 -0
  30. flowerpower/fs/__init__.py +0 -29
  31. flowerpower/fs/base.py +0 -662
  32. flowerpower/fs/ext.py +0 -2143
  33. flowerpower/fs/storage_options.py +0 -1420
  34. flowerpower/job_queue/apscheduler/__init__.py +0 -11
  35. flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
  36. flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
  37. flowerpower/job_queue/apscheduler/manager.py +0 -1051
  38. flowerpower/job_queue/apscheduler/setup.py +0 -554
  39. flowerpower/job_queue/apscheduler/trigger.py +0 -169
  40. flowerpower/job_queue/apscheduler/utils.py +0 -311
  41. flowerpower/pipeline/job_queue.py +0 -583
  42. flowerpower/pipeline/runner.py +0 -603
  43. flowerpower/plugins/io/base.py +0 -2520
  44. flowerpower/plugins/io/helpers/datetime.py +0 -298
  45. flowerpower/plugins/io/helpers/polars.py +0 -875
  46. flowerpower/plugins/io/helpers/pyarrow.py +0 -570
  47. flowerpower/plugins/io/helpers/sql.py +0 -202
  48. flowerpower/plugins/io/loader/__init__.py +0 -28
  49. flowerpower/plugins/io/loader/csv.py +0 -37
  50. flowerpower/plugins/io/loader/deltatable.py +0 -190
  51. flowerpower/plugins/io/loader/duckdb.py +0 -19
  52. flowerpower/plugins/io/loader/json.py +0 -37
  53. flowerpower/plugins/io/loader/mqtt.py +0 -159
  54. flowerpower/plugins/io/loader/mssql.py +0 -26
  55. flowerpower/plugins/io/loader/mysql.py +0 -26
  56. flowerpower/plugins/io/loader/oracle.py +0 -26
  57. flowerpower/plugins/io/loader/parquet.py +0 -35
  58. flowerpower/plugins/io/loader/postgres.py +0 -26
  59. flowerpower/plugins/io/loader/pydala.py +0 -19
  60. flowerpower/plugins/io/loader/sqlite.py +0 -23
  61. flowerpower/plugins/io/metadata.py +0 -244
  62. flowerpower/plugins/io/saver/__init__.py +0 -28
  63. flowerpower/plugins/io/saver/csv.py +0 -36
  64. flowerpower/plugins/io/saver/deltatable.py +0 -186
  65. flowerpower/plugins/io/saver/duckdb.py +0 -19
  66. flowerpower/plugins/io/saver/json.py +0 -36
  67. flowerpower/plugins/io/saver/mqtt.py +0 -28
  68. flowerpower/plugins/io/saver/mssql.py +0 -26
  69. flowerpower/plugins/io/saver/mysql.py +0 -26
  70. flowerpower/plugins/io/saver/oracle.py +0 -26
  71. flowerpower/plugins/io/saver/parquet.py +0 -36
  72. flowerpower/plugins/io/saver/postgres.py +0 -26
  73. flowerpower/plugins/io/saver/pydala.py +0 -20
  74. flowerpower/plugins/io/saver/sqlite.py +0 -24
  75. flowerpower/utils/scheduler.py +0 -311
  76. flowerpower-0.11.6.19.dist-info/RECORD +0 -102
  77. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/WHEEL +0 -0
  78. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/entry_points.txt +0 -0
  79. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/licenses/LICENSE +0 -0
  80. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/top_level.txt +0 -0
flowerpower/plugins/io/helpers/polars.py (file removed)
@@ -1,875 +0,0 @@
- import numpy as np
- import polars as pl
- import polars.selectors as cs
-
- from .datetime import get_timedelta_str, get_timestamp_column
-
- # Regex patterns used for string-content detection
- INTEGER_REGEX = r"^[-+]?\d+$"
- FLOAT_REGEX = r"^[-+]?(?:\d*[.,])?\d+(?:[eE][-+]?\d+)?$"
- BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n|ok|nok)$"
- BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j|ok)$"
- DATETIME_REGEX = (
-     r"^("
-     r"\d{4}-\d{2}-\d{2}"  # ISO: 2023-12-31
-     r"|"
-     r"\d{2}/\d{2}/\d{4}"  # US: 12/31/2023
-     r"|"
-     r"\d{2}\.\d{2}\.\d{4}"  # German: 31.12.2023
-     r"|"
-     r"\d{8}"  # Compact: 20231231
-     r")"
-     r"([ T]\d{2}:\d{2}(:\d{2}(\.\d{1,6})?)?)?"  # Optional time: 23:59[:59[.123456]]
-     r"([+-]\d{2}:?\d{2}|Z)?"  # Optional timezone: +01:00, -0500, Z
-     r"$"
- )
-
- # Float32 range limits
- F32_MIN = float(np.finfo(np.float32).min)
- F32_MAX = float(np.finfo(np.float32).max)
-
-
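For orientation, a few hypothetical values these patterns accept (a standalone sketch; assumes the constants above are in scope):

import re

assert re.match(INTEGER_REGEX, "-42")
assert re.match(FLOAT_REGEX, "3,14e-2")  # comma decimal separators are accepted
assert re.match(BOOLEAN_REGEX, "nein")   # the module lowercases values before matching
assert re.match(DATETIME_REGEX, "2023-12-31T23:59:59.123456+01:00")
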
- def _clean_string_expr(col_name: str) -> pl.Expr:
-     """Create expression to clean string values."""
-     return (
-         pl.col(col_name)
-         .str.strip_chars()
-         .replace({
-             "-": None,
-             "": None,
-             "None": None,
-             "none": None,
-             "NONE": None,
-             "NaN": None,
-             "Nan": None,
-             "nan": None,
-             "NAN": None,
-             "N/A": None,
-             "n/a": None,
-             "null": None,
-             "Null": None,
-             "NULL": None,
-         })
-     )
-
-
- def _can_downcast_to_float32(series: pl.Series) -> bool:
-     """Check if float values are within Float32 range."""
-     finite_values = series.filter(series.is_finite())
-     if finite_values.is_empty():
-         return True
-
-     min_val, max_val = finite_values.min(), finite_values.max()
-     return F32_MIN <= min_val <= max_val <= F32_MAX
-
-
- def _optimize_numeric_column(
-     series: pl.Series,
-     shrink: bool,
-     allow_unsigned: bool = True,
-     allow_null: bool = True,
- ) -> pl.Expr:
-     """Optimize numeric column types, optionally converting to unsigned if all values >= 0."""
-     col_name = series.name
-     expr = pl.col(col_name)
-     dtype = series.dtype
-     if series.is_null().all():
-         # If all values are null, cast to Null type if allow_null is True
-         if allow_null:
-             return expr.cast(pl.Null())
-
-     if not allow_unsigned:
-         # If unsigned types are not allowed, ensure we use signed integer types
-         if dtype.is_integer() and not dtype.is_signed_integer():
-             return expr.cast(pl.Int64)
-
-     if (
-         allow_unsigned
-         and dtype.is_integer()
-         and (series.min() is not None)
-         and series.min() >= 0
-     ):
-         # Convert to unsigned integer type, shrink if requested
-         if shrink:
-             return expr.cast(pl.UInt64).shrink_dtype()
-         else:
-             return expr.cast(pl.UInt64)
-
-     if not shrink:
-         return expr
-
-     if dtype == pl.Float64 and not _can_downcast_to_float32(series):
-         return expr
-
-     return expr.shrink_dtype()
-
-
- def _optimize_string_column(
-     series: pl.Series,
-     shrink_numerics: bool,
-     time_zone: str | None = None,
-     allow_null: bool = True,
-     allow_unsigned: bool = True,
- ) -> pl.Expr:
-     """Convert string column to appropriate type based on content analysis."""
-     # Return early for empty or null-only series
-     col_name = series.name
-     cleaned_expr = _clean_string_expr(col_name)
-     non_null = series.drop_nulls()
-     if non_null.is_empty():
-         if allow_null:
-             return pl.col(col_name).cast(pl.Null())
-         else:
-             return pl.col(col_name).cast(series.dtype)
-
-     stripped = non_null.str.strip_chars()
-     lowercase = stripped.str.to_lowercase()
-
-     # Check for boolean values
-     if lowercase.str.contains(BOOLEAN_REGEX).all(ignore_nulls=False):
-         return (
-             cleaned_expr.str.to_lowercase()
-             .str.contains(BOOLEAN_TRUE_REGEX)
-             .alias(col_name)
-         )
-
-     # Check for integer values
-     elif stripped.str.contains(INTEGER_REGEX).all(ignore_nulls=False):
-         int_expr = cleaned_expr.cast(pl.Int64).alias(col_name)
-         return (
-             int_expr.shrink_dtype().alias(col_name)
-             if shrink_numerics
-             else int_expr.alias(col_name)
-         )
-
-     # Check for float values
-     elif stripped.str.contains(FLOAT_REGEX).all(ignore_nulls=False):
-         float_expr = (
-             cleaned_expr.str.replace_all(",", ".").cast(pl.Float64).alias(col_name)
-         )
-
-         if shrink_numerics:
-             # Check if values can fit in Float32
-             temp_floats = stripped.str.replace_all(",", ".").cast(
-                 pl.Float64, strict=False
-             )
-             if _can_downcast_to_float32(temp_floats):
-                 return float_expr.shrink_dtype().alias(col_name)
-
-         return float_expr
-
-     # Check for datetime values
-     try:
-         if stripped.str.contains(DATETIME_REGEX).all(ignore_nulls=False):
-             return cleaned_expr.str.to_datetime(
-                 strict=False, time_unit="us", time_zone=time_zone
-             ).alias(col_name)
-     except pl.exceptions.PolarsError:
-         pass
-
-     # Keep original if no conversion applies
-     return pl.col(col_name)
-
-
- def _get_column_expr(
-     df: pl.DataFrame,
-     col_name: str,
-     shrink_numerics: bool = True,
-     allow_unsigned: bool = True,
-     allow_null: bool = True,
-     time_zone: str | None = None,
- ) -> pl.Expr:
-     """Generate optimization expression for a single column."""
-     series = df[col_name]
-
-     # Handle all-null columns
-     if series.is_null().all():
-         if allow_null:
-             # If all values are null, cast to Null type if allow_null is True
-             return pl.col(col_name).cast(pl.Null())
-
-     # Process based on current type
-     if series.dtype.is_numeric():
-         return _optimize_numeric_column(
-             series, shrink_numerics, allow_unsigned, allow_null
-         )
-     elif series.dtype == pl.Utf8:
-         return _optimize_string_column(
-             series, shrink_numerics, time_zone, allow_null, allow_unsigned
-         )
-
-     # Keep original for other types
-     return pl.col(col_name)
-
-
- def opt_dtype(
-     df: pl.DataFrame,
-     include: str | list[str] | None = None,
-     exclude: str | list[str] | None = None,
-     time_zone: str | None = None,
-     shrink_numerics: bool = True,
-     allow_unsigned: bool = True,
-     allow_null: bool = True,
-     strict: bool = False,
- ) -> pl.DataFrame:
-     """
-     Optimize data types of a Polars DataFrame for performance and memory efficiency.
-
-     This function analyzes each column and converts it to the most appropriate
-     data type based on content, handling string-to-type conversions and
-     numeric type downcasting.
-
-     Args:
-         df: DataFrame to optimize
-         include: Column(s) to include in optimization (default: all columns)
-         exclude: Column(s) to exclude from optimization
-         time_zone: Optional time zone for datetime parsing
-         shrink_numerics: Whether to downcast numeric types when possible
-         allow_unsigned: Whether to allow unsigned integer types
-         allow_null: Whether to allow columns with all null values to be cast to Null type
-         strict: If True, will raise an error if any column cannot be optimized
-
-     Returns:
-         DataFrame with optimized data types
-     """
-     # Normalize include/exclude parameters
-     if isinstance(include, str):
-         include = [include]
-     if isinstance(exclude, str):
-         exclude = [exclude]
-
-     # Determine columns to process
-     cols_to_process = df.columns
-     if include:
-         cols_to_process = [col for col in include if col in df.columns]
-     if exclude:
-         cols_to_process = [col for col in cols_to_process if col not in exclude]
-
-     # Generate optimization expressions for all columns
-     expressions = []
-     for col_name in cols_to_process:
-         try:
-             expressions.append(
-                 _get_column_expr(
-                     df, col_name, shrink_numerics, allow_unsigned, allow_null, time_zone
-                 )
-             )
-         except Exception as e:
-             if strict:
-                 raise e
-             # If strict mode is off, just keep the original column
-             continue
-
-     # Apply all transformations at once if any exist
-     return df if not expressions else df.with_columns(expressions)
-
-
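A minimal usage sketch for opt_dtype (hypothetical data; the resulting dtypes follow from the conversion rules above):

import polars as pl

df = pl.DataFrame({
    "id": ["1", "2", "3"],            # integer-like strings
    "price": ["1,5", "2,0", "3,25"],  # comma-decimal floats
    "active": ["yes", "no", "ja"],    # boolean-like strings
})
optimized = opt_dtype(df)
# id -> a shrunken integer dtype, price -> a float dtype, active -> Boolean
print(optimized.schema)
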
-
265
- # def opt_dtype(
266
- # df: pl.DataFrame,
267
- # include: str | list[str] | None = None,
268
- # exclude: str | list[str] | None = None,
269
- # time_zone: str | None = None,
270
- # shrink_numerics: bool = True,
271
- # ) -> pl.DataFrame:
272
- # """
273
- # Analyzes and optimizes the data types of a Polars DataFrame for performance
274
- # and memory efficiency.
275
-
276
- # This version includes:
277
- # - Robust numeric, boolean, and datetime casting from strings.
278
- # - Handling of whitespace and common null-like string values.
279
- # - Casting of columns containing only nulls to pl.Int8.
280
- # - Optional shrinking of numeric columns to the smallest possible type.
281
-
282
- # Args:
283
- # df: The DataFrame to optimize.
284
- # include: A list of columns to forcefully include in the optimization.
285
- # exclude: A list of columns to exclude from the optimization.
286
- # time_zone: Optional time zone for datetime parsing.
287
- # shrink_numerics: If True, numeric columns (both existing and newly converted from strings)
288
- # will be downcast to the smallest possible type that can hold their values (e.g., Int64 to Int32, Float64 to Float32),
289
- # similar to Polars' shrink_dtype() behavior. If False, this shrinking step is skipped.
290
-
291
- # Returns:
292
- # An optimized Polars DataFrame with improved data types.
293
- # """
294
- # # Phase 1: Analysis - Determine columns to process and build a list of
295
- # # transformation expressions without executing them immediately.
296
- # if isinstance(include, str):
297
- # include = [include]
298
- # if isinstance(exclude, str):
299
- # exclude = [exclude]
300
-
301
- # cols_to_process = df.columns
302
- # if include:
303
- # cols_to_process = [col for col in include if col in df.columns]
304
- # if exclude:
305
- # cols_to_process = [col for col in cols_to_process if col not in exclude]
306
-
307
- # expressions = []
308
- # for col_name in cols_to_process:
309
- # s = df[col_name]
310
-
311
- # # NEW: If a column is entirely null, cast it to Int8 and skip other checks.
312
- # if s.is_null().all():
313
- # expressions.append(pl.col(col_name).cast(pl.Int8))
314
- # continue
315
-
316
- # dtype = s.dtype
317
-
318
- # # 1. Optimize numeric columns by shrinking their size
319
- # if dtype.is_numeric():
320
- # if shrink_numerics:
321
- # if dtype == pl.Float64:
322
- # column_series = df[col_name]
323
- # finite_values_series = column_series.filter(
324
- # column_series.is_finite()
325
- # )
326
- # can_shrink = True
327
- # if not finite_values_series.is_empty():
328
- # min_finite_val = finite_values_series.min()
329
- # max_finite_val = finite_values_series.max()
330
- # if (min_finite_val < F32_MIN_FINITE) or (
331
- # max_finite_val > F32_MAX_FINITE
332
- # ):
333
- # can_shrink = False
334
- # if can_shrink:
335
- # expressions.append(pl.col(col_name).shrink_dtype())
336
- # else:
337
- # expressions.append(pl.col(col_name))
338
- # else:
339
- # expressions.append(pl.col(col_name).shrink_dtype())
340
- # else:
341
- # expressions.append(pl.col(col_name))
342
- # continue
343
-
344
- # # 2. Optimize string columns by casting to more specific types
345
- # if dtype == pl.Utf8:
346
- # # Create a cleaned column expression that first strips whitespace, then
347
- # # replaces common null-like strings.
348
- # cleaned_col = (
349
- # pl.col(col_name)
350
- # .str.strip_chars()
351
- # .replace({"-": None, "": None, "None": None})
352
- # )
353
-
354
- # # Analyze a stripped, non-null version of the series to decide the cast type
355
- # s_non_null = s.drop_nulls()
356
- # if len(s_non_null) == 0:
357
- # # The column only contains nulls or null-like strings.
358
- # # Cast to Int8 as requested for all-null columns.
359
- # expressions.append(pl.col(col_name).cast(pl.Int8))
360
- # continue
361
-
362
- # s_stripped_non_null = s_non_null.str.strip_chars()
363
-
364
- # # Check for boolean type
365
- # if s_stripped_non_null.str.to_lowercase().str.contains(BOOLEAN_REGEX).all():
366
- # expr = cleaned_col.str.to_lowercase().str.contains(BOOLEAN_TRUE_REGEX)
367
- # expressions.append(expr.alias(col_name))
368
- # continue
369
-
370
- # # Check for numeric type
371
- # if s_stripped_non_null.str.contains(NUMERIC_REGEX).all():
372
- # is_float = s_stripped_non_null.str.contains(r"[.,eE]").any()
373
- # numeric_col = cleaned_col.str.replace_all(",", ".")
374
- # if is_float:
375
- # if shrink_numerics:
376
- # temp_float_series = s_stripped_non_null.str.replace_all(
377
- # ",", "."
378
- # ).cast(pl.Float64, strict=False)
379
- # finite_values_series = temp_float_series.filter(
380
- # temp_float_series.is_finite()
381
- # )
382
- # can_shrink = True
383
- # if not finite_values_series.is_empty():
384
- # min_finite_val = finite_values_series.min()
385
- # max_finite_val = finite_values_series.max()
386
- # if (min_finite_val < F32_MIN_FINITE) or (
387
- # max_finite_val > F32_MAX_FINITE
388
- # ):
389
- # can_shrink = False
390
- # base_expr = numeric_col.cast(pl.Float64)
391
- # if can_shrink:
392
- # expressions.append(base_expr.shrink_dtype().alias(col_name))
393
- # else:
394
- # expressions.append(base_expr.alias(col_name))
395
- # else:
396
- # expressions.append(numeric_col.cast(pl.Float64).alias(col_name))
397
- # else:
398
- # if shrink_numerics:
399
- # expressions.append(
400
- # numeric_col.cast(pl.Int64).shrink_dtype().alias(col_name)
401
- # )
402
- # else:
403
- # expressions.append(numeric_col.cast(pl.Int64).alias(col_name))
404
- # continue
405
-
406
- # # Check for datetime type using a fast heuristic
407
- # try:
408
- # if s_stripped_non_null.str.contains(DATETIME_REGEX).all():
409
- # expressions.append(
410
- # cleaned_col.str.to_datetime(
411
- # strict=False, time_unit="us", time_zone=time_zone
412
- # ).alias(col_name)
413
- # )
414
- # continue
415
- # except pl.exceptions.PolarsError:
416
- # pass
417
-
418
- # # Phase 2: Execution - If any optimizations were identified, apply them
419
- # # all at once for maximum parallelism and performance.
420
- # if not expressions:
421
- # return df
422
-
423
- # return df.with_columns(expressions)
424
-
425
-
426
- def unnest_all(df: pl.DataFrame, separator="_", fields: list[str] | None = None):
-     def _unnest_all(struct_columns):
-         if fields is not None:
-             return (
-                 df.with_columns([
-                     pl.col(col).struct.rename_fields([
-                         f"{col}{separator}{field_name}"
-                         for field_name in df[col].struct.fields
-                     ])
-                     for col in struct_columns
-                 ])
-                 .unnest(struct_columns)
-                 .select(
-                     list(set(df.columns) - set(struct_columns))
-                     + sorted([
-                         f"{col}{separator}{field_name}"
-                         for field_name in fields
-                         for col in struct_columns
-                     ])
-                 )
-             )
-
-         return df.with_columns([
-             pl.col(col).struct.rename_fields([
-                 f"{col}{separator}{field_name}" for field_name in df[col].struct.fields
-             ])
-             for col in struct_columns
-         ]).unnest(struct_columns)
-
-     struct_columns = [col for col in df.columns if df[col].dtype == pl.Struct]
-     while len(struct_columns):
-         df = _unnest_all(struct_columns=struct_columns)
-         struct_columns = [col for col in df.columns if df[col].dtype == pl.Struct]
-     return df
-
-
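A short sketch of unnest_all on nested struct data (hypothetical frame):

import polars as pl

df = pl.DataFrame({"a": [1, 2], "meta": [{"x": 1, "y": "u"}, {"x": 2, "y": "v"}]})
flat = unnest_all(df)
print(flat.columns)  # ['a', 'meta_x', 'meta_y']
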
- def explode_all(df: pl.DataFrame | pl.LazyFrame):
-     # Explode every list-typed column, one at a time (column introspection via
-     # df[col] assumes an eager DataFrame).
-     list_columns = [col for col in df.columns if df[col].dtype == pl.List]
-     for col in list_columns:
-         df = df.explode(col)
-     return df
-
-
- def with_strftime_columns(
-     df: pl.DataFrame | pl.LazyFrame,
-     strftime: str | list[str],
-     timestamp_column: str = "auto",
-     column_names: str | list[str] | None = None,
- ):
-     if timestamp_column is None or timestamp_column == "auto":
-         timestamp_column = get_timestamp_column(df)
-         if len(timestamp_column):
-             timestamp_column = timestamp_column[0]
-
-     if not timestamp_column:
-         raise ValueError("timestamp_column is not specified nor found in the dataframe")
-
-     if isinstance(strftime, str):
-         strftime = [strftime]
-     if isinstance(column_names, str):
-         column_names = [column_names]
-
-     if column_names is None:
-         column_names = [
-             f"_strftime_{strftime_.replace('%', '').replace('-', '_')}_"
-             for strftime_ in strftime
-         ]
-     return opt_dtype(
-         df.with_columns([
-             pl.col(timestamp_column)
-             .dt.strftime(strftime_)
-             .fill_null(0)
-             .alias(column_name)
-             for strftime_, column_name in zip(strftime, column_names)
-         ]),
-         include=column_names,
-         strict=False,
-     )
-
-
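For illustration, the derived column names strip '%' and map '-' to '_' (hypothetical data; timestamp_column passed explicitly to sidestep auto-detection):

import polars as pl
from datetime import datetime

df = pl.DataFrame({"ts": [datetime(2023, 12, 31), datetime(2024, 1, 15)]})
out = with_strftime_columns(df, strftime="%Y-%m", timestamp_column="ts")
print(out.columns)  # ['ts', '_strftime_Y_m_']
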
- def with_truncated_columns(
-     df: pl.DataFrame | pl.LazyFrame,
-     truncate_by: str | list[str],
-     timestamp_column: str = "auto",
-     column_names: str | list[str] | None = None,
- ):
-     if timestamp_column is None or timestamp_column == "auto":
-         timestamp_column = get_timestamp_column(df)
-         if len(timestamp_column):
-             timestamp_column = timestamp_column[0]
-
-     if not timestamp_column:
-         raise ValueError("timestamp_column is not specified nor found in the dataframe")
-
-     if isinstance(truncate_by, str):
-         truncate_by = [truncate_by]
-
-     if isinstance(column_names, str):
-         column_names = [column_names]
-
-     if column_names is None:
-         column_names = [
-             f"_truncated_{truncate_.replace(' ', '_')}_" for truncate_ in truncate_by
-         ]
-
-     truncate_by = [
-         get_timedelta_str(truncate_, to="polars") for truncate_ in truncate_by
-     ]
-     return df.with_columns([
-         pl.col(timestamp_column).dt.truncate(truncate_).alias(column_name)
-         for truncate_, column_name in zip(truncate_by, column_names)
-     ])
-
-
- def with_datepart_columns(
-     df: pl.DataFrame | pl.LazyFrame,
-     timestamp_column: str = "auto",
-     year: bool = False,
-     month: bool = False,
-     week: bool = False,
-     yearday: bool = False,
-     monthday: bool = False,
-     day: bool = False,
-     weekday: bool = False,
-     hour: bool = False,
-     minute: bool = False,
-     strftime: str | list[str] | None = None,
- ):
-     if strftime:
-         if isinstance(strftime, str):
-             strftime = [strftime]
-         column_names = [
-             f"_strftime_{strftime_.replace('%', '').replace('-', '_')}_"
-             for strftime_ in strftime
-         ]
-     else:
-         strftime = []
-         column_names = []
-
-     if year:
-         strftime.append("%Y")
-         column_names.append("year")
-     if month:
-         strftime.append("%m")
-         column_names.append("month")
-     if week:
-         strftime.append("%W")
-         column_names.append("week")
-     if yearday:
-         strftime.append("%j")
-         column_names.append("year_day")
-     if monthday:
-         strftime.append("%d")
-         column_names.append("day")
-     if day:
-         strftime.append("%d")
-         column_names.append("day")
-     if weekday:
-         strftime.append("%a")
-         column_names.append("week_day")
-     if hour:
-         strftime.append("%H")
-         column_names.append("hour")
-     if minute:
-         strftime.append("%M")
-         column_names.append("minute")
-
-     # Drop parts that already exist in the frame, keeping the strftime/name
-     # pairs aligned for the zip in with_strftime_columns.
-     pairs = [
-         (strftime_, name)
-         for strftime_, name in zip(strftime, column_names)
-         if name not in df.columns
-     ]
-     strftime = [strftime_ for strftime_, _ in pairs]
-     column_names = [name for _, name in pairs]
-     return with_strftime_columns(
-         df=df,
-         timestamp_column=timestamp_column,
-         strftime=strftime,
-         column_names=column_names,
-     )
-
-
- def with_row_count(
-     df: pl.DataFrame | pl.LazyFrame,
-     over: str | list[str] | None = None,
- ):
-     # Treat an empty list the same as no grouping at all.
-     if over is not None and len(over) == 0:
-         over = None
-
-     if isinstance(over, str):
-         over = [over]
-
-     if over:
-         return df.with_columns(pl.lit(1).alias("row_nr")).with_columns(
-             pl.col("row_nr").cum_sum().over(over)
-         )
-     else:
-         return df.with_columns(pl.lit(1).alias("row_nr")).with_columns(
-             pl.col("row_nr").cum_sum()
-         )
-
-
- def drop_null_columns(df: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame | pl.LazyFrame:
-     """Remove columns with all null values from the DataFrame."""
-     # Note: null_count()/height require an eager DataFrame; collect a LazyFrame first.
-     return df.select([col for col in df.columns if df[col].null_count() < df.height])
-
-
- def unify_schemas(dfs: list[pl.DataFrame | pl.LazyFrame]) -> pl.Schema:
-     df = pl.concat([df.lazy() for df in dfs], how="diagonal_relaxed")
-     if isinstance(df, pl.LazyFrame):
-         return df.collect_schema()
-     return df.schema
-
-
- def cast_relaxed(
-     df: pl.DataFrame | pl.LazyFrame, schema: pl.Schema
- ) -> pl.DataFrame | pl.LazyFrame:
-     if isinstance(df, pl.LazyFrame):
-         columns = df.collect_schema().names()
-     else:
-         columns = df.schema.names()
-     new_columns = [col for col in schema.names() if col not in columns]
-     if len(new_columns):
-         return df.with_columns([
-             pl.lit(None).alias(new_col) for new_col in new_columns
-         ]).cast(schema)
-     return df.cast(schema)
-
-
- def delta(
-     df1: pl.DataFrame | pl.LazyFrame,
-     df2: pl.DataFrame | pl.LazyFrame,
-     subset: list[str] | None = None,
-     eager: bool = False,
- ) -> pl.DataFrame | pl.LazyFrame:
-     s1 = df1.select(~cs.by_dtype(pl.Null())).collect_schema()
-     s2 = df2.select(~cs.by_dtype(pl.Null())).collect_schema()
-
-     columns = sorted(set(s1.names()) & set(s2.names()))
-
-     if subset is None:
-         subset = df1.columns
-     if isinstance(subset, str):
-         subset = [subset]
-
-     subset = sorted(set(columns) & set(subset))
-
-     if isinstance(df1, pl.LazyFrame) and isinstance(df2, pl.DataFrame):
-         df2 = df2.lazy()
-
-     elif isinstance(df1, pl.DataFrame) and isinstance(df2, pl.LazyFrame):
-         df1 = df1.lazy()
-
-     # Cast both frames to a common schema before the anti-join
-     unified_schema = unify_schemas([df1.select(subset), df2.select(subset)])
-
-     df1 = df1.cast_relaxed(unified_schema)
-     df2 = df2.cast_relaxed(unified_schema)
-
-     df = df1.join(df2, on=subset, how="anti", join_nulls=True)
-
-     if eager and isinstance(df, pl.LazyFrame):
-         return df.collect()
-
-     return df
-
-
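A sketch of delta as an anti-join between two snapshots (hypothetical frames; assumes the module is imported so the cast_relaxed patch below is in place):

import polars as pl

df_new = pl.DataFrame({"id": [1, 2, 3], "v": ["a", "b", "c"]})
df_old = pl.DataFrame({"id": [1, 2], "v": ["a", "b"]})
print(delta(df_new, df_old))  # only the row (3, "c"), which df_old lacks
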
- def partition_by(
-     df: pl.DataFrame | pl.LazyFrame,
-     timestamp_column: str | None = None,
-     columns: str | list[str] | None = None,
-     strftime: str | list[str] | None = None,
-     timedelta: str | list[str] | None = None,
-     num_rows: int | None = None,
- ) -> list[tuple[dict, pl.DataFrame | pl.LazyFrame]]:
-     if columns is not None:
-         if isinstance(columns, str):
-             columns = [columns]
-         columns_ = columns.copy()
-     else:
-         columns_ = []
-
-     drop_columns = columns_.copy()
-
-     if strftime is not None:
-         if isinstance(strftime, str):
-             strftime = [strftime]
-
-         df = df.with_strftime_columns(
-             timestamp_column=timestamp_column, strftime=strftime
-         )
-         strftime_columns = [
-             f"_strftime_{strftime_.replace('%', '').replace('-', '_')}_"
-             for strftime_ in strftime
-         ]
-         columns_ += strftime_columns
-         drop_columns += strftime_columns
-
-     if timedelta is not None:
-         if isinstance(timedelta, str):
-             timedelta = [timedelta]
-
-         df = df.with_duration_columns(
-             timestamp_column=timestamp_column, truncate_by=timedelta
-         )
-         timedelta_columns = [
-             f"_truncated_{timedelta_.replace(' ', '_')}_" for timedelta_ in timedelta
-         ]
-         columns_ += timedelta_columns
-         drop_columns += timedelta_columns
-
-     if columns_:
-         datetime_columns = [
-             col.lower()
-             for col in columns_
-             if col
-             in [
-                 "year",
-                 "month",
-                 "week",
-                 "yearday",
-                 "monthday",
-                 "weekday",
-                 "day",
-                 "hour",
-                 "minute",
-                 "strftime",
-             ]
-             and col not in df.columns
-         ]
-
-         datetime_columns = {
-             col: col in datetime_columns
-             for col in [
-                 "year",
-                 "month",
-                 "week",
-                 "yearday",
-                 "monthday",
-                 "weekday",
-                 "day",
-                 "hour",
-                 "minute",
-                 "strftime",
-             ]
-         }
-         if any(datetime_columns.values()):
-             df = df.with_datepart_columns(
-                 timestamp_column=timestamp_column, **datetime_columns
-             )
-
-     if isinstance(df, pl.LazyFrame):
-         df = df.collect()
-     columns_ = [col for col in columns_ if col in df.columns]
-
-     if num_rows is not None:
-         df = df.with_row_count_ext(over=columns_).with_columns(
-             (pl.col("row_nr") - 1) // num_rows
-         )
-         columns_ += ["row_nr"]
-         drop_columns += ["row_nr"]
-
-     if columns_:
-         partitions = [
-             (p.select(columns_).unique().to_dicts()[0], p.drop(drop_columns))
-             for p in df.partition_by(
-                 by=columns_,
-                 as_dict=False,
-                 maintain_order=True,
-             )
-         ]
-
-         return partitions
-
-     return [({}, df)]
-
-
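And a sketch of partition_by deriving a date-part key (hypothetical data; relies on the method registrations below being applied):

import polars as pl
from datetime import datetime

df = pl.DataFrame({
    "ts": [datetime(2023, 1, 1), datetime(2023, 1, 2), datetime(2023, 2, 1)],
    "v": [1, 2, 3],
})
for key, part in partition_by(df, timestamp_column="ts", columns=["month"]):
    print(key, part.shape)  # {'month': 1} (2, 2), then {'month': 2} (1, 2)
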
- # Register the helpers as methods on DataFrame and LazyFrame
- pl.DataFrame.unnest_all = unnest_all
- pl.DataFrame.explode_all = explode_all
- pl.DataFrame.opt_dtype = opt_dtype
- pl.DataFrame.with_row_count_ext = with_row_count
- pl.DataFrame.with_datepart_columns = with_datepart_columns
- pl.DataFrame.with_duration_columns = with_truncated_columns
- pl.DataFrame.with_strftime_columns = with_strftime_columns
- pl.DataFrame.cast_relaxed = cast_relaxed
- pl.DataFrame.delta = delta
- pl.DataFrame.partition_by_ext = partition_by
- pl.DataFrame.drop_null_columns = drop_null_columns
-
- pl.LazyFrame.unnest_all = unnest_all
- pl.LazyFrame.explode_all = explode_all
- pl.LazyFrame.opt_dtype = opt_dtype
- pl.LazyFrame.with_row_count_ext = with_row_count
- pl.LazyFrame.with_datepart_columns = with_datepart_columns
- pl.LazyFrame.with_duration_columns = with_truncated_columns
- pl.LazyFrame.with_strftime_columns = with_strftime_columns
- pl.LazyFrame.delta = delta
- pl.LazyFrame.cast_relaxed = cast_relaxed
- pl.LazyFrame.partition_by_ext = partition_by
- pl.LazyFrame.drop_null_columns = drop_null_columns
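Once the module is imported, the helpers read as ordinary methods, e.g. (hypothetical frame):

import polars as pl

df = pl.DataFrame({"a": [None, None], "b": ["1", "2"], "c": [[1, 2], [3]]})
df = df.drop_null_columns()  # drops 'a' (all null)
df = df.explode_all()        # explodes the list column 'c'
df = df.opt_dtype()          # 'b' becomes an integer column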