FlowerPower 0.11.5.8__py3-none-any.whl → 0.11.6__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
@@ -1,12 +1,346 @@
- from functools import partial
-
- import pandas as pd
+ import numpy as np
  import polars as pl
  import polars.selectors as cs

  from .datetime import get_timedelta_str, get_timestamp_column

- # import string
+ # Pre-compiled regex patterns (identical to original)
+ INTEGER_REGEX = r"^[-+]?\d+$"
+ FLOAT_REGEX = r"^[-+]?(?:\d*[.,])?\d+(?:[eE][-+]?\d+)?$"
+ BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n)$"
+ BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j)$"
+ DATETIME_REGEX = (
+     r"^("
+     r"\d{4}-\d{2}-\d{2}"  # ISO: 2023-12-31
+     r"|"
+     r"\d{2}/\d{2}/\d{4}"  # US: 12/31/2023
+     r"|"
+     r"\d{2}\.\d{2}\.\d{4}"  # German: 31.12.2023
+     r"|"
+     r"\d{8}"  # Compact: 20231231
+     r")"
+     r"([ T]\d{2}:\d{2}(:\d{2}(\.\d{1,6})?)?)?"  # Optional time: 23:59[:59[.123456]]
+     r"([+-]\d{2}:?\d{2}|Z)?"  # Optional timezone: +01:00, -0500, Z
+     r"$"
+ )
+
+ # Float32 range limits
+ F32_MIN = float(np.finfo(np.float32).min)
+ F32_MAX = float(np.finfo(np.float32).max)
+
+
+ def _clean_string_expr(col_name: str) -> pl.Expr:
+     """Create expression to clean string values."""
+     return (
+         pl.col(col_name).str.strip_chars().replace({"-": None, "": None, "None": None})
+     )
+
+
+ def _can_downcast_to_float32(series: pl.Series) -> bool:
+     """Check if float values are within Float32 range."""
+     finite_values = series.filter(series.is_finite())
+     if finite_values.is_empty():
+         return True
+
+     min_val, max_val = finite_values.min(), finite_values.max()
+     return F32_MIN <= min_val <= max_val <= F32_MAX
+
+
+ def _optimize_numeric_column(series: pl.Series, col_name: str, shrink: bool) -> pl.Expr:
+     """Optimize numeric column types."""
+     if not shrink:
+         return pl.col(col_name)
+
+     if series.dtype == pl.Float64 and not _can_downcast_to_float32(series):
+         return pl.col(col_name)
+
+     return pl.col(col_name).shrink_dtype()
+
+
+ def _optimize_string_column(
+     series: pl.Series,
+     col_name: str,
+     shrink_numerics: bool,
+     time_zone: str | None = None,
+ ) -> pl.Expr:
+     """Convert string column to appropriate type based on content analysis."""
+     # Return early for empty or null-only series
+     cleaned_expr = _clean_string_expr(col_name)
+     non_null = series.drop_nulls().replace({"-": None, "": None, "None": None})
+     if len(non_null) == 0:
+         return pl.col(col_name).cast(pl.Int8)
+
+     stripped = non_null.str.strip_chars()
+     lowercase = stripped.str.to_lowercase()
+
+     # Check for boolean values
+     if lowercase.str.contains(BOOLEAN_REGEX).all():
+         return (
+             cleaned_expr.str.to_lowercase()
+             .str.contains(BOOLEAN_TRUE_REGEX)
+             .alias(col_name)
+         )
+
+     elif stripped.str.contains(INTEGER_REGEX).all():
+         int_expr = cleaned_expr.cast(pl.Int64)
+         return (
+             int_expr.shrink_dtype().alias(col_name)
+             if shrink_numerics
+             else int_expr.alias(col_name)
+         )
+
+     # Check for numeric values
+     elif stripped.str.contains(FLOAT_REGEX).all():
+         float_expr = cleaned_expr.str.replace_all(",", ".").cast(pl.Float64)
+
+         if shrink_numerics:
+             # Check if values can fit in Float32
+             temp_floats = stripped.str.replace_all(",", ".").cast(
+                 pl.Float64, strict=False
+             )
+             if _can_downcast_to_float32(temp_floats):
+                 return float_expr.shrink_dtype().alias(col_name)
+
+         return float_expr.alias(col_name)
+
+     try:
+         if stripped.str.contains(DATETIME_REGEX).all():
+             return cleaned_expr.str.to_datetime(
+                 strict=False, time_unit="us", time_zone=time_zone
+             ).alias(col_name)
+     except pl.exceptions.PolarsError:
+         pass
+
+     # Keep original if no conversion applies
+     return pl.col(col_name)
+
+
+ def _get_column_expr(
+     df: pl.DataFrame, col_name: str, shrink_numerics: bool, time_zone: str | None = None
+ ) -> pl.Expr:
+     """Generate optimization expression for a single column."""
+     series = df[col_name]
+
+     # Handle all-null columns
+     if series.is_null().all():
+         return pl.col(col_name).cast(pl.Int8)
+
+     # Process based on current type
+     if series.dtype.is_numeric():
+         return _optimize_numeric_column(series, col_name, shrink_numerics)
+     elif series.dtype == pl.Utf8:
+         return _optimize_string_column(series, col_name, shrink_numerics, time_zone)
+
+     # Keep original for other types
+     return pl.col(col_name)
+
+
+ def opt_dtype(
+     df: pl.DataFrame,
+     include: str | list[str] | None = None,
+     exclude: str | list[str] | None = None,
+     time_zone: str | None = None,
+     shrink_numerics: bool = True,
+ ) -> pl.DataFrame:
+     """
+     Optimize data types of a Polars DataFrame for performance and memory efficiency.
+
+     This function analyzes each column and converts it to the most appropriate
+     data type based on content, handling string-to-type conversions and
+     numeric type downcasting.
+
+     Args:
+         df: DataFrame to optimize
+         include: Column(s) to include in optimization (default: all columns)
+         exclude: Column(s) to exclude from optimization
+         time_zone: Optional time zone for datetime parsing
+         shrink_numerics: Whether to downcast numeric types when possible
+
+     Returns:
+         DataFrame with optimized data types
+     """
+     # Normalize include/exclude parameters
+     if isinstance(include, str):
+         include = [include]
+     if isinstance(exclude, str):
+         exclude = [exclude]
+
+     # Determine columns to process
+     cols_to_process = df.columns
+     if include:
+         cols_to_process = [col for col in include if col in df.columns]
+     if exclude:
+         cols_to_process = [col for col in cols_to_process if col not in exclude]
+
+     # Generate optimization expressions for all columns
+     expressions = [
+         _get_column_expr(df, col_name, shrink_numerics, time_zone)
+         for col_name in cols_to_process
+     ]
+
+     # Apply all transformations at once if any exist
+     return df if not expressions else df.with_columns(expressions)
+
+
+ # def opt_dtype(
+ #     df: pl.DataFrame,
+ #     include: str | list[str] | None = None,
+ #     exclude: str | list[str] | None = None,
+ #     time_zone: str | None = None,
+ #     shrink_numerics: bool = True,
+ # ) -> pl.DataFrame:
+ #     """
+ #     Analyzes and optimizes the data types of a Polars DataFrame for performance
+ #     and memory efficiency.
+
+ #     This version includes:
+ #     - Robust numeric, boolean, and datetime casting from strings.
+ #     - Handling of whitespace and common null-like string values.
+ #     - Casting of columns containing only nulls to pl.Int8.
+ #     - Optional shrinking of numeric columns to the smallest possible type.
+
+ #     Args:
+ #         df: The DataFrame to optimize.
+ #         include: A list of columns to forcefully include in the optimization.
+ #         exclude: A list of columns to exclude from the optimization.
+ #         time_zone: Optional time zone for datetime parsing.
+ #         shrink_numerics: If True, numeric columns (both existing and newly converted from strings)
+ #             will be downcast to the smallest possible type that can hold their values (e.g., Int64 to Int32, Float64 to Float32),
+ #             similar to Polars' shrink_dtype() behavior. If False, this shrinking step is skipped.
+
+ #     Returns:
+ #         An optimized Polars DataFrame with improved data types.
+ #     """
+ #     # Phase 1: Analysis - Determine columns to process and build a list of
+ #     # transformation expressions without executing them immediately.
+ #     if isinstance(include, str):
+ #         include = [include]
+ #     if isinstance(exclude, str):
+ #         exclude = [exclude]
+
+ #     cols_to_process = df.columns
+ #     if include:
+ #         cols_to_process = [col for col in include if col in df.columns]
+ #     if exclude:
+ #         cols_to_process = [col for col in cols_to_process if col not in exclude]
+
+ #     expressions = []
+ #     for col_name in cols_to_process:
+ #         s = df[col_name]
+
+ #         # NEW: If a column is entirely null, cast it to Int8 and skip other checks.
+ #         if s.is_null().all():
+ #             expressions.append(pl.col(col_name).cast(pl.Int8))
+ #             continue
+
+ #         dtype = s.dtype
+
+ #         # 1. Optimize numeric columns by shrinking their size
+ #         if dtype.is_numeric():
+ #             if shrink_numerics:
+ #                 if dtype == pl.Float64:
+ #                     column_series = df[col_name]
+ #                     finite_values_series = column_series.filter(
+ #                         column_series.is_finite()
+ #                     )
+ #                     can_shrink = True
+ #                     if not finite_values_series.is_empty():
+ #                         min_finite_val = finite_values_series.min()
+ #                         max_finite_val = finite_values_series.max()
+ #                         if (min_finite_val < F32_MIN_FINITE) or (
+ #                             max_finite_val > F32_MAX_FINITE
+ #                         ):
+ #                             can_shrink = False
+ #                     if can_shrink:
+ #                         expressions.append(pl.col(col_name).shrink_dtype())
+ #                     else:
+ #                         expressions.append(pl.col(col_name))
+ #                 else:
+ #                     expressions.append(pl.col(col_name).shrink_dtype())
+ #             else:
+ #                 expressions.append(pl.col(col_name))
+ #             continue
+
+ #         # 2. Optimize string columns by casting to more specific types
+ #         if dtype == pl.Utf8:
+ #             # Create a cleaned column expression that first strips whitespace, then
+ #             # replaces common null-like strings.
+ #             cleaned_col = (
+ #                 pl.col(col_name)
+ #                 .str.strip_chars()
+ #                 .replace({"-": None, "": None, "None": None})
+ #             )
+
+ #             # Analyze a stripped, non-null version of the series to decide the cast type
+ #             s_non_null = s.drop_nulls()
+ #             if len(s_non_null) == 0:
+ #                 # The column only contains nulls or null-like strings.
+ #                 # Cast to Int8 as requested for all-null columns.
+ #                 expressions.append(pl.col(col_name).cast(pl.Int8))
+ #                 continue
+
+ #             s_stripped_non_null = s_non_null.str.strip_chars()
+
+ #             # Check for boolean type
+ #             if s_stripped_non_null.str.to_lowercase().str.contains(BOOLEAN_REGEX).all():
+ #                 expr = cleaned_col.str.to_lowercase().str.contains(BOOLEAN_TRUE_REGEX)
+ #                 expressions.append(expr.alias(col_name))
+ #                 continue
+
+ #             # Check for numeric type
+ #             if s_stripped_non_null.str.contains(NUMERIC_REGEX).all():
+ #                 is_float = s_stripped_non_null.str.contains(r"[.,eE]").any()
+ #                 numeric_col = cleaned_col.str.replace_all(",", ".")
+ #                 if is_float:
+ #                     if shrink_numerics:
+ #                         temp_float_series = s_stripped_non_null.str.replace_all(
+ #                             ",", "."
+ #                         ).cast(pl.Float64, strict=False)
+ #                         finite_values_series = temp_float_series.filter(
+ #                             temp_float_series.is_finite()
+ #                         )
+ #                         can_shrink = True
+ #                         if not finite_values_series.is_empty():
+ #                             min_finite_val = finite_values_series.min()
+ #                             max_finite_val = finite_values_series.max()
+ #                             if (min_finite_val < F32_MIN_FINITE) or (
+ #                                 max_finite_val > F32_MAX_FINITE
+ #                             ):
+ #                                 can_shrink = False
+ #                         base_expr = numeric_col.cast(pl.Float64)
+ #                         if can_shrink:
+ #                             expressions.append(base_expr.shrink_dtype().alias(col_name))
+ #                         else:
+ #                             expressions.append(base_expr.alias(col_name))
+ #                     else:
+ #                         expressions.append(numeric_col.cast(pl.Float64).alias(col_name))
+ #                 else:
+ #                     if shrink_numerics:
+ #                         expressions.append(
+ #                             numeric_col.cast(pl.Int64).shrink_dtype().alias(col_name)
+ #                         )
+ #                     else:
+ #                         expressions.append(numeric_col.cast(pl.Int64).alias(col_name))
+ #                 continue
+
+ #             # Check for datetime type using a fast heuristic
+ #             try:
+ #                 if s_stripped_non_null.str.contains(DATETIME_REGEX).all():
+ #                     expressions.append(
+ #                         cleaned_col.str.to_datetime(
+ #                             strict=False, time_unit="us", time_zone=time_zone
+ #                         ).alias(col_name)
+ #                     )
+ #                     continue
+ #             except pl.exceptions.PolarsError:
+ #                 pass
+
+ #     # Phase 2: Execution - If any optimizations were identified, apply them
+ #     # all at once for maximum parallelism and performance.
+ #     if not expressions:
+ #         return df
+
+ #     return df.with_columns(expressions)


  def unnest_all(df: pl.DataFrame, seperator="_", fields: list[str] | None = None):
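Note: the rewritten opt_dtype above builds one expression per column and applies them all in a single with_columns pass. The following minimal usage sketch is illustrative only; the sample frame and the expected result dtypes are assumptions, not part of the diff:

    import polars as pl
    # assumes opt_dtype has been imported from this module

    df = pl.DataFrame({
        "count": ["1", "2", "3"],              # matches INTEGER_REGEX -> Int64, shrunk when possible
        "price": ["1,5", "2.25", "-3e2"],      # matches FLOAT_REGEX (decimal comma tolerated) -> Float64/Float32
        "active": ["ja", "no", "TRUE"],        # matches BOOLEAN_REGEX -> Boolean via BOOLEAN_TRUE_REGEX
        "seen": ["2023-12-31", "2024-01-01 12:30:00", None],  # matches DATETIME_REGEX -> Datetime("us")
        "empty": [None, None, None],           # all-null column -> Int8
    })

    optimized = opt_dtype(df, shrink_numerics=True)
    print(optimized.schema)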
@@ -45,119 +379,6 @@ def unnest_all(df: pl.DataFrame, seperator="_", fields: list[str] | None = None)
      return df


- def _opt_dtype(
-     s: pl.Series, strict: bool = True, shrink_dtype: bool = True
- ) -> pl.Series:
-     if s.dtype == pl.Utf8():
-         try:
-             s = s.set(s == "-", None).set(s == "", None).set(s == "None", None)
-
-             # cast string numbers to int or float
-             if (
-                 s.str.contains(r"^[-+]?[0-9]*[.,]?[0-9]+([eE][-+]?[0-9]+)?$")
-                 | s.is_null()
-                 | s.str.contains(r"^$")
-             ).all():
-                 s = (
-                     s.str.replace_all(",", ".")
-                     # .str.replace_all("^0{1,}$", "+0")
-                     # .str.strip_chars_start("0")
-                     .str.replace_all(r"\.0*$", "")
-                 )
-                 s = s.set(s == "-", None).set(s == "", None).set(s == "None", None)
-                 if s.str.contains(r"\.").any() | s.str.contains("NaN").any():
-                     s = s.cast(pl.Float64(), strict=True)
-                     if shrink_dtype:
-                         try:
-                             if s.min() >= -16777216 and s.max() <= 16777216:
-                                 s = s.cast(pl.Float32(), strict=True)
-                         except TypeError:
-                             # if min or max is None, we cannot cast to Float32
-                             pass
-                 else:
-                     s = s.cast(pl.Int64(), strict=True)
-                     if shrink_dtype:
-                         s = s.shrink_dtype()
-
-             # cast str to datetime
-
-             elif (
-                 s.str.contains(r"^\d{4}-\d{2}-\d{2}$")
-                 | s.str.contains(r"^\d{1,2}\/\d{1,2}\/\d{4}$")
-                 | s.str.contains(
-                     r"^\d{4}-\d{2}-\d{2}T{0,1}\s{0,1}\d{2}:\d{2}(:\d{2})?.\d{0,}$"
-                 )
-                 | s.str.contains(
-                     r"^\d{4}-\d{2}-\d{2}T{0,1}\s{0,1}\d{2}:\d{2}(:\d{2})?\.\d{0,}$"
-                 )
-                 | s.str.contains(
-                     r"^\d{4}-\d{2}-\d{2}T{0,1}\s{0,1}\d{2}:\d{2}(:\d{2})?\.\d{1,}\w{0,1}\+\d{0,2}:\d{0,2}:\d{0,2}$"
-                 )
-                 | s.is_null()
-                 | s.str.contains("^$")
-             ).all():
-                 s = pl.Series(
-                     name=s.name, values=pd.to_datetime(s, format="mixed")
-                 ).cast(pl.Datetime("us"))
-
-             # cast str to bool
-             elif (
-                 s.str.to_lowercase()
-                 .str.contains("^(true|false|1|0|wahr|falsch|nein|nok|ok|ja)$")
-                 .all()
-             ):
-                 s = s.str.to_lowercase().str.contains(
-                     "^(true|1|wahr|ja|ok)$", strict=True
-                 )
-
-         except Exception as e:
-             if strict:
-                 e.add_note(
-                     "if you were trying to cast Utf8 to temporal dtypes, consider setting `strict=False`"
-                 )
-                 raise e
-     else:
-         if shrink_dtype:
-             if s.dtype == pl.Float64():
-                 try:
-                     if s.min() >= -16777216 and s.max() <= 16777216:
-                         s = s.cast(pl.Float32(), strict=True)
-                 except TypeError:
-                     # if min or max is None, we cannot cast to Float32
-                     pass
-
-             else:
-                 s = s.shrink_dtype()
-
-     return s
-
-
- def opt_dtype(
-     df: pl.DataFrame,
-     exclude: str | list[str] | None = None,
-     strict: bool = True,
-     include: str | list[str] | None = None,
-     shrink_dtype: bool = True,
- ) -> pl.DataFrame:
-     _opt_dtype_strict = partial(_opt_dtype, strict=strict, shrink_dtype=shrink_dtype)
-     _opt_dtype_not_strict = partial(_opt_dtype, strict=False, shrink_dtype=shrink_dtype)
-     if include is not None:
-         if isinstance(include, str):
-             include = [include]
-         exclude = [col for col in df.columns if col not in include]
-     return (
-         df.with_columns(
-             pl.all()
-             .exclude(exclude)
-             .map_batches(_opt_dtype_strict if strict else _opt_dtype_not_strict)
-         )
-         if exclude is not None
-         else df.with_columns(
-             pl.all().map_batches(_opt_dtype_strict if strict else _opt_dtype_not_strict)
-         )
-     )
-
-
  def explode_all(df: pl.DataFrame | pl.LazyFrame):
      list_columns = [col for col in df.columns if df[col].dtype == pl.List]
      for col in list_columns:
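Note: the block removed above is the 0.11.5.8 implementation, which processed each column eagerly through map_batches and fell back to pandas for datetime parsing. Its strict and shrink_dtype keyword arguments do not exist on the new opt_dtype, so a caller would need updating along these lines (hypothetical call, column name assumed for illustration):

    # 0.11.5.8 (removed):
    # df = opt_dtype(df, exclude=["raw"], strict=False, shrink_dtype=True)
    # 0.11.6 (string parsing is now lenient by default; no strict flag):
    df = opt_dtype(df, exclude=["raw"], shrink_numerics=True)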
@@ -364,8 +585,13 @@ def with_row_count(
      # return df


- def unify_schema(dfs: list[pl.DataFrame | pl.LazyFrame]) -> pl.Schema:
-     df = pl.concat(dfs, how="diagonal_relaxed")
+ def drop_null_columns(df: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame | pl.LazyFrame:
+     """Remove columns with all null values from the DataFrame."""
+     return df.select([col for col in df.columns if df[col].null_count() < df.height])
+
+
+ def unify_schemas(dfs: list[pl.DataFrame | pl.LazyFrame]) -> pl.Schema:
+     df = pl.concat([df.lazy() for df in dfs], how="diagonal_relaxed")
      if isinstance(df, pl.LazyFrame):
          return df.collect_schema()
      return df.schema
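Note: unify_schemas (renamed from unify_schema) now concatenates lazily before reading the schema, and drop_null_columns moved here with a null_count-based check. A small sketch with assumed sample frames, not taken from the diff:

    import polars as pl

    a = pl.DataFrame({"x": [1, 2], "y": ["u", "v"]})
    b = pl.DataFrame({"x": [3.0], "z": [True]})
    print(unify_schemas([a, b]))  # x relaxed to Float64; y and z both present

    c = pl.DataFrame({"keep": [1, None], "dead": [None, None]})
    print(drop_null_columns(c).columns)  # ["keep"]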
@@ -411,7 +637,7 @@ def delta(
      df1 = df1.lazy()

      # cast to equal schema
-     unified_schema = unify_schema([df1.select(subset), df2.select(subset)])
+     unified_schema = unify_schemas([df1.select(subset), df2.select(subset)])

      df1 = df1.cast_relaxed(unified_schema)
      df2 = df2.cast_relaxed(unified_schema)
@@ -544,10 +770,6 @@ def partition_by(
      return [({}, df)]


- def drop_null_columns(df: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame | pl.LazyFrame:
-     return df.select([col for col in df.columns if not df[col].is_null().all()])
-
-
  pl.DataFrame.unnest_all = unnest_all
  pl.DataFrame.explode_all = explode_all
  pl.DataFrame.opt_dtype = opt_dtype
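Note: because the module attaches these helpers onto pl.DataFrame (as shown in the context lines above), they are also callable method-style. A sketch, assuming the module has been imported so the assignments have run (the module's import path is not visible in this diff):

    df = pl.DataFrame({"a": ["1", "2"]})
    df = df.opt_dtype()   # equivalent to opt_dtype(df)
    df = df.unnest_all()  # no-op here: no struct columns to flatten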