FlowerPower: flowerpower-0.11.6.16-py3-none-any.whl → flowerpower-0.11.6.17-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/plugins/io/helpers/polars.py +61 -17
- flowerpower/plugins/io/helpers/pyarrow.py +162 -133
- {flowerpower-0.11.6.16.dist-info → flowerpower-0.11.6.17.dist-info}/METADATA +1 -1
- {flowerpower-0.11.6.16.dist-info → flowerpower-0.11.6.17.dist-info}/RECORD +8 -8
- {flowerpower-0.11.6.16.dist-info → flowerpower-0.11.6.17.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.6.16.dist-info → flowerpower-0.11.6.17.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.6.16.dist-info → flowerpower-0.11.6.17.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.6.16.dist-info → flowerpower-0.11.6.17.dist-info}/top_level.txt +0 -0
flowerpower/plugins/io/helpers/polars.py

@@ -7,8 +7,8 @@ from .datetime import get_timedelta_str, get_timestamp_column
 # Pre-compiled regex patterns (identical to original)
 INTEGER_REGEX = r"^[-+]?\d+$"
 FLOAT_REGEX = r"^[-+]?(?:\d*[.,])?\d+(?:[eE][-+]?\d+)?$"
-BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n)$"
-BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j)$"
+BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n|ok|nok)$"
+BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j|ok)$"
 DATETIME_REGEX = (
     r"^("
     r"\d{4}-\d{2}-\d{2}"  # ISO: 2023-12-31
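
The only functional change in this hunk is the boolean vocabulary: "ok" and "nok" are now accepted as boolean tokens, with "ok" mapping to true. A minimal sketch of what the updated patterns accept, using plain `re` with the case-insensitive matching both helper modules apply:

    import re

    # Updated patterns from this release; the helpers match case-insensitively,
    # so we compile with re.IGNORECASE here as well.
    BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n|ok|nok)$"
    BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j|ok)$"

    for token in ["ok", "NOK", "yes", "maybe"]:
        is_bool = bool(re.match(BOOLEAN_REGEX, token, re.IGNORECASE))
        as_true = bool(re.match(BOOLEAN_TRUE_REGEX, token, re.IGNORECASE))
        print(token, is_bool, as_true)
    # ok True True / NOK True False / yes True True / maybe False False
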
@@ -32,7 +32,24 @@ F32_MAX = float(np.finfo(np.float32).max)
 def _clean_string_expr(col_name: str) -> pl.Expr:
     """Create expression to clean string values."""
     return (
-        pl.col(col_name)
+        pl.col(col_name)
+        .str.strip_chars()
+        .replace({
+            "-": None,
+            "": None,
+            "None": None,
+            "none": None,
+            "NONE": None,
+            "NaN": None,
+            "Nan": None,
+            "nan": None,
+            "NAN": None,
+            "N/A": None,
+            "n/a": None,
+            "null": None,
+            "Null": None,
+            "NULL": None,
+        })
     )
 
 
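
The cleaning expression now strips whitespace and maps common null-like tokens to real nulls before any type detection runs. A small standalone sketch of the same pattern (hypothetical column name `s`, abbreviated token map):

    import polars as pl

    df = pl.DataFrame({"s": [" 7 ", "-", "N/A", "null", "x"]})
    cleaned = df.with_columns(
        pl.col("s").str.strip_chars().replace({"-": None, "N/A": None, "null": None})
    )
    print(cleaned["s"].to_list())  # ['7', None, None, None, 'x']
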
@@ -47,11 +64,24 @@ def _can_downcast_to_float32(series: pl.Series) -> bool:
 
 
 def _optimize_numeric_column(
-    series: pl.Series,
+    series: pl.Series,
+    shrink: bool,
+    allow_unsigned: bool = True,
+    allow_null: bool = True,
 ) -> pl.Expr:
     """Optimize numeric column types, optionally converting to unsigned if all values >= 0."""
+    col_name = series.name
     expr = pl.col(col_name)
     dtype = series.dtype
+    if series.is_null().all():
+        # If all values are null, cast to Null type if allow_null is True
+        if allow_null:
+            return expr.cast(pl.Null())
+
+    if not allow_unsigned:
+        # If unsigned types are not allowed, ensure we use signed integer types
+        if dtype.is_integer() and not dtype.is_signed_integer():
+            return expr.cast(pl.Int64)
 
     if (
         allow_unsigned
@@ -76,16 +106,21 @@ def _optimize_numeric_column(
 
 def _optimize_string_column(
     series: pl.Series,
-    col_name: str,
     shrink_numerics: bool,
     time_zone: str | None = None,
+    allow_null: bool = True,
+    allow_unsigned: bool = True,
 ) -> pl.Expr:
     """Convert string column to appropriate type based on content analysis."""
     # Return early for empty or null-only series
+    col_name = series.name
     cleaned_expr = _clean_string_expr(col_name)
-    non_null = series.drop_nulls()
-    if
-
+    non_null = series.drop_nulls()
+    if non_null.is_empty():
+        if allow_null:
+            return pl.col(col_name).cast(pl.Null())
+        else:
+            return pl.col(col_name).cast(series.dtype)
 
     stripped = non_null.str.strip_chars()
     lowercase = stripped.str.to_lowercase()
@@ -99,7 +134,7 @@ def _optimize_string_column(
         )
 
     elif stripped.str.contains(INTEGER_REGEX).all(ignore_nulls=False):
-        int_expr = cleaned_expr.cast(pl.Int64)
+        int_expr = cleaned_expr.cast(pl.Int64).alias(col_name)
         return (
             int_expr.shrink_dtype().alias(col_name)
             if shrink_numerics
@@ -108,7 +143,9 @@ def _optimize_string_column(
 
     # Check for numeric values
     elif stripped.str.contains(FLOAT_REGEX).all(ignore_nulls=False):
-        float_expr =
+        float_expr = (
+            cleaned_expr.str.replace_all(",", ".").cast(pl.Float64).alias(col_name)
+        )
 
         if shrink_numerics:
             # Check if values can fit in Float32
@@ -118,7 +155,7 @@ def _optimize_string_column(
             if _can_downcast_to_float32(temp_floats):
                 return float_expr.shrink_dtype().alias(col_name)
 
-        return float_expr
+        return float_expr
 
     try:
         if stripped.str.contains(DATETIME_REGEX).all(ignore_nulls=False):
@@ -135,8 +172,9 @@ def _optimize_string_column(
 def _get_column_expr(
     df: pl.DataFrame,
     col_name: str,
-    shrink_numerics: bool,
-    allow_unsigned: bool,
+    shrink_numerics: bool = True,
+    allow_unsigned: bool = True,
+    allow_null: bool = True,
     time_zone: str | None = None,
 ) -> pl.Expr:
     """Generate optimization expression for a single column."""
@@ -144,15 +182,19 @@ def _get_column_expr(
 
     # Handle all-null columns
     if series.is_null().all():
-
+        if allow_null:
+            # If all values are null, cast to Null type if allow_null is True
+            return pl.col(col_name).cast(pl.Null())
 
     # Process based on current type
     if series.dtype.is_numeric():
         return _optimize_numeric_column(
-            series,
+            series, shrink_numerics, allow_unsigned, allow_null
         )
     elif series.dtype == pl.Utf8:
-        return _optimize_string_column(
+        return _optimize_string_column(
+            series, col_name, shrink_numerics, time_zone, allow_null
+        )
 
     # Keep original for other types
     return pl.col(col_name)
@@ -165,6 +207,7 @@ def opt_dtype(
     time_zone: str | None = None,
     shrink_numerics: bool = True,
     allow_unsigned: bool = True,
+    allow_null: bool = True,
     strict: bool = False,
 ) -> pl.DataFrame:
     """
@@ -181,6 +224,7 @@ def opt_dtype(
         time_zone: Optional time zone for datetime parsing
         shrink_numerics: Whether to downcast numeric types when possible
         allow_unsigned: Whether to allow unsigned integer types
+        allow_null: Whether to allow columns with all null values to be cast to Null type
         strict: If True, will raise an error if any column cannot be optimized
 
     Returns:
@@ -205,7 +249,7 @@ def opt_dtype(
         try:
             expressions.append(
                 _get_column_expr(
-                    df, col_name, shrink_numerics, allow_unsigned, time_zone
+                    df, col_name, shrink_numerics, allow_unsigned, allow_null, time_zone
                 )
             )
         except Exception as e:
flowerpower/plugins/io/helpers/pyarrow.py

@@ -1,3 +1,5 @@
+import concurrent.futures
+
 import numpy as np
 import polars as pl
 import pyarrow as pa
@@ -6,8 +8,8 @@ import pyarrow.compute as pc
 # Pre-compiled regex patterns (identical to original)
 INTEGER_REGEX = r"^[-+]?\d+$"
 FLOAT_REGEX = r"^[-+]?(?:\d*[.,])?\d+(?:[eE][-+]?\d+)?$"
-BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n)$"
-BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j)$"
+BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n|ok|nok)$"
+BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j|ok)$"
 DATETIME_REGEX = (
     r"^("
     r"\d{4}-\d{2}-\d{2}"  # ISO: 2023-12-31
@@ -260,12 +262,13 @@ def convert_large_types_to_normal(schema: pa.Schema) -> pa.Schema:
     return pa.schema(new_fields)
 
 
-def _clean_string_array(array: pa.Array) -> pa.Array:
+def _clean_string_array(array: pa.Array) -> pa.DataType:
     """
     Clean string values in a PyArrow array using vectorized operations.
+    Returns the optimal dtype after cleaning.
     """
     if len(array) == 0 or array.null_count == len(array):
-        return array
+        return array.type
 
     # Trim whitespace using compute functions
     trimmed = pc.utf8_trim_whitespace(array)
@@ -273,12 +276,28 @@ def _clean_string_array(array: pa.Array) -> pa.Array:
     # Create mask for values to convert to null
     empty_mask = pc.equal(trimmed, "")
     dash_mask = pc.equal(trimmed, "-")
-    none_mask = pc.
+    none_mask = pc.or_(
+        pc.equal(trimmed, "None"),
+        pc.equal(trimmed, "none"),
+        pc.equal(trimmed, "NONE"),
+        pc.equal(trimmed, "Nan"),
+        pc.equal(trimmed, "N/A"),
+        pc.equal(trimmed, "n/a"),
+        pc.equal(trimmed, "NaN"),
+        pc.equal(trimmed, "nan"),
+        pc.equal(trimmed, "NAN"),
+        pc.equal(trimmed, "Null"),
+        pc.equal(trimmed, "NULL"),
+        pc.equal(trimmed, "null"),
+    )
 
     null_mask = pc.or_(pc.or_(empty_mask, dash_mask), none_mask)
 
-    #
-
+    # If all values are null after cleaning, return null type
+    if pc.all(null_mask).as_py():
+        return pa.null()
+
+    return array.type  # Default: keep string type if not all null
 
 
 def _can_downcast_to_float32(array: pa.Array) -> bool:
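
The same null-token masking idea, sketched directly with `pyarrow.compute`:

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.array([" x ", "-", "None", ""])
    trimmed = pc.utf8_trim_whitespace(arr)
    # Values equal to "" or "-" after trimming are flagged as null-like.
    null_mask = pc.or_(pc.equal(trimmed, ""), pc.equal(trimmed, "-"))
    print(null_mask)  # [false, true, false, true]
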
@@ -288,37 +307,35 @@ def _can_downcast_to_float32(array: pa.Array) -> bool:
     if len(array) == 0 or array.null_count == len(array):
         return True
 
-    # Use compute functions to filter finite values and calculate min/max
     is_finite = pc.is_finite(array)
-
-    # Skip if no finite values
     if not pc.any(is_finite).as_py():
         return True
 
-    # Filter out non-finite values
     finite_array = pc.filter(array, is_finite)
-
     min_val = pc.min(finite_array).as_py()
     max_val = pc.max(finite_array).as_py()
 
     return F32_MIN <= min_val <= max_val <= F32_MAX
 
 
-def _get_optimal_int_type(array: pa.Array, allow_unsigned: bool) -> pa.DataType:
+def _get_optimal_int_type(
+    array: pa.Array, allow_unsigned: bool, allow_null: bool = True
+) -> pa.DataType:
     """
     Determine the most efficient integer type based on data range.
     """
-    # Handle empty or all-null arrays
     if len(array) == 0 or array.null_count == len(array):
-
+        if allow_null:
+            return pa.null()
+        else:
+            # If all values are null and allow_null is False, default to int8
+            return pa.int8()
 
-    # Use compute functions to get min and max values
     min_max = pc.min_max(array)
     min_val = min_max["min"].as_py()
     max_val = min_max["max"].as_py()
 
     if allow_unsigned and min_val >= 0:
-        # If allow_unsigned is True, check for unsigned types
         if max_val <= 255:
             return pa.uint8()
         elif max_val <= 65535:
@@ -327,8 +344,7 @@ def _get_optimal_int_type(array: pa.Array, allow_unsigned: bool) -> pa.DataType:
             return pa.uint32()
         else:
             return pa.uint64()
-
-    else:  # Signed
+    else:
         if -128 <= min_val and max_val <= 127:
             return pa.int8()
         elif -32768 <= min_val and max_val <= 32767:
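
The range-based type selection can be exercised directly; a sketch that calls the private helper for illustration (the signature matches the hunk above, and the function only inspects min/max):

    import pyarrow as pa
    from flowerpower.plugins.io.helpers.pyarrow import _get_optimal_int_type

    print(_get_optimal_int_type(pa.array([0, 250]), allow_unsigned=True))   # uint8
    print(_get_optimal_int_type(pa.array([0, 250]), allow_unsigned=False))  # int16
    print(_get_optimal_int_type(pa.array([-5, 100]), allow_unsigned=True))  # int8
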
@@ -340,141 +356,159 @@ def _get_optimal_int_type(array: pa.Array, allow_unsigned: bool) -> pa.DataType:
 
 
 def _optimize_numeric_array(
-    array: pa.Array, shrink: bool, allow_unsigned: bool = True
-) -> pa.
+    array: pa.Array, shrink: bool, allow_unsigned: bool = True, allow_null: bool = True
+) -> pa.DataType:
     """
     Optimize numeric PyArrow array by downcasting when possible.
-
+    Returns the optimal dtype.
     """
+
     if not shrink or len(array) == 0 or array.null_count == len(array):
-
+        if allow_null:
+            return pa.null()
+        else:
+            return array.type
 
-    # Handle floating point types
     if pa.types.is_floating(array.type):
         if array.type == pa.float64() and _can_downcast_to_float32(array):
-            return
-        return array
+            return pa.float32()
+        return array.type
 
-    # Handle integer types
     if pa.types.is_integer(array.type):
-
-        if array.type in [pa.int8(), pa.uint8()]:
-            return array
+        return _get_optimal_int_type(array, allow_unsigned, allow_null)
 
-
-        return pc.cast(array, optimal_type)
-
-    # Default: return unchanged
-    return array
+    return array.type
 
 
 def _all_match_regex(array: pa.Array, pattern: str) -> bool:
     """
     Check if all non-null values in array match regex pattern.
-    Uses pyarrow.compute.match_substring_regex for vectorized evaluation.
     """
     if len(array) == 0 or array.null_count == len(array):
         return False
-
-    # Check if al values match the pattern
     return pc.all(pc.match_substring_regex(array, pattern, ignore_case=True)).as_py()
 
 
 def _optimize_string_array(
-    array: pa.Array,
-
+    array: pa.Array,
+    col_name: str,
+    shrink_numerics: bool,
+    time_zone: str | None = None,
+    allow_unsigned: bool = True,
+    allow_null: bool = True,
+) -> pa.DataType:
     """
     Convert string PyArrow array to appropriate type based on content analysis.
-
+    Returns the optimal dtype.
     """
-
-
-
-
-
+    if len(array) == 0 or array.null_count == len(array):
+        if allow_null:
+            return pa.null()
+        else:
+            return array.type
 
-
-
+    cleaned_array = _clean_string_array(
+        array, allow_null
+    )  # pc.utf8_trim_whitespace(array)
 
     try:
-        # Check for boolean values
         if _all_match_regex(cleaned_array, BOOLEAN_REGEX):
-
-            true_matches = pc.match_substring_regex(
-                array, BOOLEAN_TRUE_REGEX, ignore_case=True
-            )
-
-            # Convert to boolean type
-            return pc.cast(true_matches, pa.bool_())
-
+            return pa.bool_()
         elif _all_match_regex(cleaned_array, INTEGER_REGEX):
-            # Convert to integer
-            # First replace commas with periods in Polars, then cast
             int_array = pc.cast(
                 pc.replace_substring(cleaned_array, ",", "."), pa.int64()
             )
-
-
-
-            return pc.cast(int_array, optimal_type)
-
-            return int_array
-
-            # Check for numeric values
+            return _optimize_numeric_array(
+                int_array, allow_unsigned=allow_unsigned, allow_null=allow_null
+            )
         elif _all_match_regex(cleaned_array, FLOAT_REGEX):
-            # Convert to float
-            # First replace commas with periods in Polars
             float_array = pc.cast(
                 pc.replace_substring(cleaned_array, ",", "."), pa.float64()
            )
-
-
-
-
-
-
+            return _optimize_numeric_array(
+                float_array,
+                shrink_numerics,
+                allow_unsigned=allow_unsigned,
+                allow_null=allow_null,
+            )
         elif _all_match_regex(cleaned_array, DATETIME_REGEX):
-            # Convert via polars
-
             pl_series = pl.Series(col_name, cleaned_array)
             converted = pl_series.str.to_datetime(
                 strict=False, time_unit="us", time_zone=time_zone
             )
-            #
-
-
+            # Get the Arrow dtype from Polars
+            arrow_dtype = converted.to_arrow().type
+            return arrow_dtype
     except Exception:
-
-        return cleaned_array
+        return pa.string()
 
-
-    return cleaned_array
+    return pa.string()
 
 
 def _process_column(
-    table: pa.Table,
+    # table: pa.Table,
+    # col_name: str,
+    array: pa.Array,
     col_name: str,
     shrink_numerics: bool,
     allow_unsigned: bool,
     time_zone: str | None = None,
-) -> pa.
+) -> pa.Field:
     """
     Process a single column for type optimization.
+    Returns a pyarrow.Field with the optimal dtype.
     """
-    array = table[col_name]
-
-    # Handle all-null columns
+    # array = table[col_name]
     if array.null_count == len(array):
-        return pa.
+        return pa.field(col_name, pa.null())
 
-    # Process based on current type
     if pa.types.is_floating(array.type) or pa.types.is_integer(array.type):
-
+        dtype = _optimize_numeric_array(array, shrink_numerics, allow_unsigned)
+        return pa.field(col_name, dtype, nullable=array.null_count > 0)
     elif pa.types.is_string(array.type):
-
-
-
-        return array
+        dtype = _optimize_string_array(array, col_name, shrink_numerics, time_zone)
+        return pa.field(col_name, dtype, nullable=array.null_count > 0)
+
+    return pa.field(col_name, array.type, nullable=array.null_count > 0)
+
+
+def _process_column_for_opt_dtype(args):
+    (
+        array,
+        col_name,
+        cols_to_process,
+        shrink_numerics,
+        allow_unsigned,
+        time_zone,
+        strict,
+        allow_null,
+    ) = args
+    try:
+        if col_name in cols_to_process:
+            field = _process_column(
+                array, col_name, shrink_numerics, allow_unsigned, time_zone
+            )
+            if pa.types.is_null(field.type):
+                if allow_null:
+                    array = pa.nulls(array.length(), type=pa.null())
+                    return (col_name, field, array)
+                else:
+                    orig_type = array.type
+                    # array = table[col_name]
+                    field = pa.field(col_name, orig_type, nullable=True)
+                    return (col_name, field, array)
+            else:
+                array = array.cast(field.type)
+                return (col_name, field, array)
+        else:
+            field = pa.field(col_name, array.type, nullable=True)
+            # array = table[col_name]
+            return (col_name, field, array)
+    except Exception as e:
+        if strict:
+            raise e
+        field = pa.field(col_name, array.type, nullable=True)
+        return (col_name, field, array)
 
 
 def opt_dtype(
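
Each worker returns a `(col_name, field, array)` tuple, and the table is rebuilt from those parts, as the next hunk shows. A minimal sketch of that reassembly pattern with hypothetical results:

    import pyarrow as pa

    # Tuples in the same shape the helper returns.
    results = [
        ("a", pa.field("a", pa.uint8(), nullable=False), pa.array([1, 2], type=pa.uint8())),
        ("b", pa.field("b", pa.string(), nullable=True), pa.array(["x", None])),
    ]
    schema = pa.schema([field for _, field, _ in results])
    table = pa.Table.from_arrays([arr for _, _, arr in results], schema=schema)
    print(table.schema)  # a: uint8 not null, b: string
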
@@ -484,58 +518,53 @@ def opt_dtype(
     time_zone: str | None = None,
     shrink_numerics: bool = True,
     allow_unsigned: bool = True,
+    use_large_dtypes: bool = False,
     strict: bool = False,
+    allow_null: bool = True,
 ) -> pa.Table:
     """
     Optimize data types of a PyArrow Table for performance and memory efficiency.
-
-    This function analyzes each column and converts it to the most appropriate
-    data type based on content, handling string-to-type conversions and
-    numeric type downcasting. It is the PyArrow equivalent of the Polars
-    `opt_dtype` function.
+    Returns a new table casted to the optimal schema.
 
     Args:
-
-        include: Column(s) to include in optimization (default: all columns)
-        exclude: Column(s) to exclude from optimization
-        time_zone: Optional time zone for datetime parsing
-        shrink_numerics: Whether to downcast numeric types when possible
-        allow_unsigned: Whether to allow unsigned types
-        strict: If True, will raise an error if any column cannot be optimized
-
-    Returns:
-        PyArrow Table with optimized data types
+        allow_null (bool): If False, columns that only hold null-like values will not be converted to pyarrow.null().
     """
-    # Normalize include/exclude parameters
     if isinstance(include, str):
         include = [include]
     if isinstance(exclude, str):
         exclude = [exclude]
 
-    # Determine columns to process
     cols_to_process = table.column_names
     if include:
         cols_to_process = [col for col in include if col in table.column_names]
     if exclude:
         cols_to_process = [col for col in cols_to_process if col not in exclude]
 
-    # (original implementation, old lines 522-541, truncated in this diff view)
+    # Prepare arguments for parallel processing
+    args_list = [
+        (
+            table[col_name],
+            col_name,
+            cols_to_process,
+            shrink_numerics,
+            allow_unsigned,
+            time_zone,
+            strict,
+            allow_null,
+        )
+        for col_name in table.column_names
+    ]
+
+    # Parallelize column processing
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        results = list(executor.map(_process_column_for_opt_dtype, args_list))
+
+    # Sort results to preserve column order
+    results.sort(key=lambda x: table.column_names.index(x[0]))
+    fields = [field for _, field, _ in results]
+    arrays = [array for _, _, array in results]
+
+    schema = pa.schema(fields)
+    if use_large_dtypes:
+        schema = convert_large_types_to_normal(schema)
+    return pa.Table.from_arrays(arrays, schema=schema)
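
Usage sketch for the PyArrow variant (the table-first call shape is an assumption inferred from the internal `table.column_names` usage); column analysis now runs in a thread pool, but the call itself is unchanged:

    import pyarrow as pa
    from flowerpower.plugins.io.helpers.pyarrow import opt_dtype

    table = pa.table({"num": ["1", "2", "3"], "flag": ["ok", "nok", "ok"]})
    optimized = opt_dtype(table, shrink_numerics=True, allow_null=True)
    print(optimized.schema)  # inspect which columns were narrowed
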
{flowerpower-0.11.6.16.dist-info → flowerpower-0.11.6.17.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: FlowerPower
-Version: 0.11.6.16
+Version: 0.11.6.17
 Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
 Author-email: "Volker L." <ligno.blades@gmail.com>
 Project-URL: Homepage, https://github.com/legout/flowerpower
{flowerpower-0.11.6.16.dist-info → flowerpower-0.11.6.17.dist-info}/RECORD

@@ -47,8 +47,8 @@ flowerpower/pipeline/visualizer.py,sha256=amjMrl5NetErE198HzZBPWVZBi_t5jj9ydxWpu
 flowerpower/plugins/io/base.py,sha256=oGxTKobs0M19hPV842EelAeJ01EBz6kDdGv_4GTyFzk,97098
 flowerpower/plugins/io/metadata.py,sha256=PCrepLilXRWKDsB5BKFF_-OFs712s1zBeitW-84lDLQ,7005
 flowerpower/plugins/io/helpers/datetime.py,sha256=1WBUg2ywcsodJQwoF6JiIGc9yhVobvE2IErWp4i95m4,10649
-flowerpower/plugins/io/helpers/polars.py,sha256=
-flowerpower/plugins/io/helpers/pyarrow.py,sha256=
+flowerpower/plugins/io/helpers/polars.py,sha256=2U0XMIDGWr7b7GcD7Qc0v-tSdZz_IiNo1Ws5kSOju6U,29359
+flowerpower/plugins/io/helpers/pyarrow.py,sha256=uFaradEOTReqiDGQy5QyXCC1hY4_Vp_R-3FoosaUJBY,19070
 flowerpower/plugins/io/helpers/sql.py,sha256=BPIxjarKF3p93EdtUu-md8KislE9q8IWNSeZ5toFU6U,7298
 flowerpower/plugins/io/loader/__init__.py,sha256=MKH42nvVokaWas0wFgX1yrpU5iLpvHjRqqF-KzwLHCg,780
 flowerpower/plugins/io/loader/csv.py,sha256=Q5bmcbbr530sT1kQ2YiJwvsMUPqi0VcZWsLOygmzRyI,827
@@ -94,9 +94,9 @@ flowerpower/utils/monkey.py,sha256=VPl3yimoWhwD9kI05BFsjNvtyQiDyLfY4Q85Bb6Ma0w,2
 flowerpower/utils/open_telemetry.py,sha256=fQWJWbIQFtKIxMBjAWeF12NGnqT0isO3A3j-DSOv_vE,949
 flowerpower/utils/scheduler.py,sha256=2zJ_xmLXpvXUQNF1XS2Gqm3Ogo907ctZ50GtvQB_rhE,9354
 flowerpower/utils/templates.py,sha256=ouyEeSDqa9PjW8c32fGpcINlpC0WToawRFZkMPtwsLE,1591
-flowerpower-0.11.6.16.dist-info/licenses/LICENSE,sha256=
-flowerpower-0.11.6.16.dist-info/METADATA,sha256=
-flowerpower-0.11.6.16.dist-info/WHEEL,sha256=
-flowerpower-0.11.6.16.dist-info/entry_points.txt,sha256=
-flowerpower-0.11.6.16.dist-info/top_level.txt,sha256=
-flowerpower-0.11.6.16.dist-info/RECORD,,
+flowerpower-0.11.6.17.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
+flowerpower-0.11.6.17.dist-info/METADATA,sha256=l84AN7Bq8Dj4z1A6f69KmmIoIQEUQ9ZiTAjA51AuZJM,21613
+flowerpower-0.11.6.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+flowerpower-0.11.6.17.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
+flowerpower-0.11.6.17.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
+flowerpower-0.11.6.17.dist-info/RECORD,,