FlowerPower 0.11.6.16-py3-none-any.whl → 0.11.6.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
flowerpower/fs/ext.py CHANGED
@@ -949,7 +949,7 @@ def _read_parquet(
     if concat:
         # Unify schemas before concatenation if opt_dtypes or multiple tables
         if isinstance(tables, list):
-            if len(tables) > 1:
+            if len(tables) > 0:
                 schemas = [t.schema for t in tables]
                 unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
                 tables = [cast_schema(t, unified_schema) for t in tables]
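For context: the schema-unification step that now runs for any non-empty list of tables can be sketched with plain PyArrow. unify_schemas_pa and cast_schema are internal helpers not shown in this diff; pa.unify_schemas and Table.cast stand in for them here, assuming a recent PyArrow.

import pyarrow as pa

# Two parquet fragments whose "b" column disagrees (typed vs. all-null).
t1 = pa.table({"a": [1, 2], "b": ["x", "y"]})
t2 = pa.table({"a": [3], "b": pa.array([None], type=pa.null())})

# Unify the schemas (a null column unifies with a typed one), then cast
# every fragment to the unified schema before concatenating.
unified = pa.unify_schemas([t1.schema, t2.schema])
tables = [t.cast(unified) for t in (t1, t2)]
combined = pa.concat_tables(tables)
print(combined.schema)  # a: int64, b: string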
flowerpower/plugins/io/helpers/polars.py CHANGED
@@ -7,8 +7,8 @@ from .datetime import get_timedelta_str, get_timestamp_column
 # Pre-compiled regex patterns (identical to original)
 INTEGER_REGEX = r"^[-+]?\d+$"
 FLOAT_REGEX = r"^[-+]?(?:\d*[.,])?\d+(?:[eE][-+]?\d+)?$"
-BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n)$"
-BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j)$"
+BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n|ok|nok)$"
+BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j|ok)$"
 DATETIME_REGEX = (
     r"^("
     r"\d{4}-\d{2}-\d{2}"  # ISO: 2023-12-31
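The same two patterns are edited again in the PyArrow helper further down. A minimal sketch of how they classify a column, assuming the strip/lowercase preprocessing visible later in this file; because the regexes are anchored, "nok" cannot half-match "ok":

import polars as pl

BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n|ok|nok)$"
BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j|ok)$"

s = pl.Series("flag", [" OK ", "nok", "Yes", "nein"])
lowercase = s.str.strip_chars().str.to_lowercase()
if lowercase.str.contains(BOOLEAN_REGEX).all():
    # "ok" maps to True, "nok" to False
    print(lowercase.str.contains(BOOLEAN_TRUE_REGEX).to_list())
    # [True, False, True, False]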
@@ -32,7 +32,24 @@ F32_MAX = float(np.finfo(np.float32).max)
 def _clean_string_expr(col_name: str) -> pl.Expr:
     """Create expression to clean string values."""
     return (
-        pl.col(col_name).str.strip_chars().replace({"-": None, "": None, "None": None})
+        pl.col(col_name)
+        .str.strip_chars()
+        .replace({
+            "-": None,
+            "": None,
+            "None": None,
+            "none": None,
+            "NONE": None,
+            "NaN": None,
+            "Nan": None,
+            "nan": None,
+            "NAN": None,
+            "N/A": None,
+            "n/a": None,
+            "null": None,
+            "Null": None,
+            "NULL": None,
+        })
     )
 
 
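A standalone sketch of what the widened sentinel list does to a string column:

import polars as pl

df = pl.DataFrame({"v": [" 42 ", "N/A", "null", "Nan", "-", ""]})
out = df.with_columns(
    pl.col("v")
    .str.strip_chars()
    .replace({"-": None, "": None, "N/A": None, "null": None, "Nan": None})
)
print(out["v"].to_list())  # ['42', None, None, None, None, None]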
@@ -47,11 +64,24 @@ def _can_downcast_to_float32(series: pl.Series) -> bool:
 
 
 def _optimize_numeric_column(
-    series: pl.Series, col_name: str, shrink: bool, allow_unsigned: bool = False
+    series: pl.Series,
+    shrink: bool,
+    allow_unsigned: bool = True,
+    allow_null: bool = True,
 ) -> pl.Expr:
     """Optimize numeric column types, optionally converting to unsigned if all values >= 0."""
+    col_name = series.name
     expr = pl.col(col_name)
     dtype = series.dtype
+    if series.is_null().all():
+        # If all values are null, cast to Null type if allow_null is True
+        if allow_null:
+            return expr.cast(pl.Null())
+
+    if not allow_unsigned:
+        # If unsigned types are not allowed, ensure we use signed integer types
+        if dtype.is_integer() and not dtype.is_signed_integer():
+            return expr.cast(pl.Int64)
 
     if (
         allow_unsigned
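The `if (allow_unsigned ...` branch continues beyond this hunk; the downcasting it guards is Polars' built-in shrink_dtype. A small sketch:

import polars as pl

df = pl.DataFrame({"n": pl.Series([0, 120, 250], dtype=pl.Int64)})
# shrink_dtype picks the smallest dtype that can represent the values
print(df.select(pl.col("n").shrink_dtype()).schema)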
@@ -76,16 +106,21 @@ def _optimize_numeric_column(
 
 
 def _optimize_string_column(
     series: pl.Series,
-    col_name: str,
     shrink_numerics: bool,
     time_zone: str | None = None,
+    allow_null: bool = True,
+    allow_unsigned: bool = True,
 ) -> pl.Expr:
     """Convert string column to appropriate type based on content analysis."""
     # Return early for empty or null-only series
+    col_name = series.name
     cleaned_expr = _clean_string_expr(col_name)
-    non_null = series.drop_nulls().replace({"-": None, "": None, "None": None})
-    if len(non_null) == 0:
-        return pl.col(col_name).cast(pl.Null())  # Fix: Cast to Null type
+    non_null = series.drop_nulls()
+    if non_null.is_empty():
+        if allow_null:
+            return pl.col(col_name).cast(pl.Null())
+        else:
+            return pl.col(col_name).cast(series.dtype)
 
     stripped = non_null.str.strip_chars()
     lowercase = stripped.str.to_lowercase()
@@ -99,7 +134,7 @@ def _optimize_string_column(
         )
 
     elif stripped.str.contains(INTEGER_REGEX).all(ignore_nulls=False):
-        int_expr = cleaned_expr.cast(pl.Int64)
+        int_expr = cleaned_expr.cast(pl.Int64).alias(col_name)
         return (
             int_expr.shrink_dtype().alias(col_name)
             if shrink_numerics
@@ -108,7 +143,9 @@ def _optimize_string_column(
         )
 
     # Check for numeric values
     elif stripped.str.contains(FLOAT_REGEX).all(ignore_nulls=False):
-        float_expr = cleaned_expr.str.replace_all(",", ".").cast(pl.Float64)
+        float_expr = (
+            cleaned_expr.str.replace_all(",", ".").cast(pl.Float64).alias(col_name)
+        )
 
         if shrink_numerics:
             # Check if values can fit in Float32
@@ -118,7 +155,7 @@ def _optimize_string_column(
             if _can_downcast_to_float32(temp_floats):
                 return float_expr.shrink_dtype().alias(col_name)
 
-        return float_expr.alias(col_name)
+        return float_expr
 
     try:
         if stripped.str.contains(DATETIME_REGEX).all(ignore_nulls=False):
@@ -135,8 +172,9 @@ def _optimize_string_column(
 def _get_column_expr(
     df: pl.DataFrame,
     col_name: str,
-    shrink_numerics: bool,
-    allow_unsigned: bool,
+    shrink_numerics: bool = True,
+    allow_unsigned: bool = True,
+    allow_null: bool = True,
     time_zone: str | None = None,
 ) -> pl.Expr:
     """Generate optimization expression for a single column."""
@@ -144,15 +182,19 @@ def _get_column_expr(
 
     # Handle all-null columns
     if series.is_null().all():
-        return pl.col(col_name).cast(pl.Null())
+        if allow_null:
+            # If all values are null, cast to Null type if allow_null is True
+            return pl.col(col_name).cast(pl.Null())
 
     # Process based on current type
     if series.dtype.is_numeric():
         return _optimize_numeric_column(
-            series, col_name, shrink_numerics, allow_unsigned
+            series, shrink_numerics, allow_unsigned, allow_null
         )
     elif series.dtype == pl.Utf8:
-        return _optimize_string_column(series, col_name, shrink_numerics, time_zone)
+        return _optimize_string_column(
+            series, col_name, shrink_numerics, time_zone, allow_null
+        )
 
     # Keep original for other types
     return pl.col(col_name)
@@ -165,6 +207,7 @@ def opt_dtype(
     time_zone: str | None = None,
     shrink_numerics: bool = True,
     allow_unsigned: bool = True,
+    allow_null: bool = True,
    strict: bool = False,
 ) -> pl.DataFrame:
     """
@@ -181,6 +224,7 @@ def opt_dtype(
         time_zone: Optional time zone for datetime parsing
         shrink_numerics: Whether to downcast numeric types when possible
         allow_unsigned: Whether to allow unsigned integer types
+        allow_null: Whether to allow columns with all null values to be cast to Null type
         strict: If True, will raise an error if any column cannot be optimized
 
     Returns:
@@ -205,7 +249,7 @@ def opt_dtype(
         try:
             expressions.append(
                 _get_column_expr(
-                    df, col_name, shrink_numerics, allow_unsigned, time_zone
+                    df, col_name, shrink_numerics, allow_unsigned, allow_null, time_zone
                 )
             )
         except Exception as e:
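Taken together, the Polars-side edits surface as the new allow_null flag on opt_dtype. A hypothetical usage sketch; the import path is inferred from the RECORD listing at the end of this diff:

import polars as pl

from flowerpower.plugins.io.helpers.polars import opt_dtype  # assumed path

df = pl.DataFrame({
    "num": ["1", "2", "250"],
    "flag": ["ok", "nok", "ok"],
    "empty": [None, None, None],
})
# allow_null=False keeps all-null columns at their original dtype
# instead of collapsing them to pl.Null
out = opt_dtype(df, allow_null=False)
print(out.schema)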
flowerpower/plugins/io/helpers/pyarrow.py CHANGED
@@ -1,3 +1,5 @@
+import concurrent.futures
+
 import numpy as np
 import polars as pl
 import pyarrow as pa
@@ -6,8 +8,8 @@ import pyarrow.compute as pc
 # Pre-compiled regex patterns (identical to original)
 INTEGER_REGEX = r"^[-+]?\d+$"
 FLOAT_REGEX = r"^[-+]?(?:\d*[.,])?\d+(?:[eE][-+]?\d+)?$"
-BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n)$"
-BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j)$"
+BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n|ok|nok)$"
+BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j|ok)$"
 DATETIME_REGEX = (
     r"^("
     r"\d{4}-\d{2}-\d{2}"  # ISO: 2023-12-31
@@ -260,12 +262,13 @@ def convert_large_types_to_normal(schema: pa.Schema) -> pa.Schema:
     return pa.schema(new_fields)
 
 
-def _clean_string_array(array: pa.Array) -> pa.Array:
+def _clean_string_array(array: pa.Array) -> pa.DataType:
     """
     Clean string values in a PyArrow array using vectorized operations.
+    Returns the optimal dtype after cleaning.
     """
     if len(array) == 0 or array.null_count == len(array):
-        return array
+        return array.type
 
     # Trim whitespace using compute functions
     trimmed = pc.utf8_trim_whitespace(array)
@@ -273,12 +276,28 @@ def _clean_string_array(array: pa.Array) -> pa.Array:
     # Create mask for values to convert to null
     empty_mask = pc.equal(trimmed, "")
     dash_mask = pc.equal(trimmed, "-")
-    none_mask = pc.equal(trimmed, "None")
+    none_mask = pc.or_(
+        pc.equal(trimmed, "None"),
+        pc.equal(trimmed, "none"),
+        pc.equal(trimmed, "NONE"),
+        pc.equal(trimmed, "Nan"),
+        pc.equal(trimmed, "N/A"),
+        pc.equal(trimmed, "n/a"),
+        pc.equal(trimmed, "NaN"),
+        pc.equal(trimmed, "nan"),
+        pc.equal(trimmed, "NAN"),
+        pc.equal(trimmed, "Null"),
+        pc.equal(trimmed, "NULL"),
+        pc.equal(trimmed, "null"),
+    )
 
     null_mask = pc.or_(pc.or_(empty_mask, dash_mask), none_mask)
 
-    # Apply the mask to set matching values to null
-    return pc.if_else(null_mask, None, trimmed)
+    # If all values are null after cleaning, return null type
+    if pc.all(null_mask).as_py():
+        return pa.null()
+
+    return array.type  # Default: keep string type if not all null
 
 
 def _can_downcast_to_float32(array: pa.Array) -> bool:
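pc.or_ is a binary kernel, so an equivalent standalone version of this mask can fold it pairwise over the sentinel list; a minimal sketch assuming a plain string array:

from functools import reduce

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([" 1 ", "N/A", "null", "-", "x"])
trimmed = pc.utf8_trim_whitespace(arr)

sentinels = ["", "-", "None", "none", "NONE", "NaN", "Nan", "nan",
             "NAN", "N/A", "n/a", "null", "Null", "NULL"]
null_mask = reduce(pc.or_, (pc.equal(trimmed, s) for s in sentinels))

print(pc.all(null_mask).as_py())  # False: "1" and "x" survive cleaning
# Null out only the sentinel values:
print(pc.if_else(null_mask, pa.scalar(None, pa.string()), trimmed))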
@@ -288,37 +307,35 @@ def _can_downcast_to_float32(array: pa.Array) -> bool:
     if len(array) == 0 or array.null_count == len(array):
         return True
 
-    # Use compute functions to filter finite values and calculate min/max
     is_finite = pc.is_finite(array)
-
-    # Skip if no finite values
     if not pc.any(is_finite).as_py():
         return True
 
-    # Filter out non-finite values
     finite_array = pc.filter(array, is_finite)
-
     min_val = pc.min(finite_array).as_py()
     max_val = pc.max(finite_array).as_py()
 
     return F32_MIN <= min_val <= max_val <= F32_MAX
 
 
-def _get_optimal_int_type(array: pa.Array, allow_unsigned: bool) -> pa.DataType:
+def _get_optimal_int_type(
+    array: pa.Array, allow_unsigned: bool, allow_null: bool = True
+) -> pa.DataType:
     """
     Determine the most efficient integer type based on data range.
     """
-    # Handle empty or all-null arrays
     if len(array) == 0 or array.null_count == len(array):
-        return pa.int8()
+        if allow_null:
+            return pa.null()
+        else:
+            # If all values are null and allow_null is False, default to int8
+            return pa.int8()
 
-    # Use compute functions to get min and max values
     min_max = pc.min_max(array)
     min_val = min_max["min"].as_py()
     max_val = min_max["max"].as_py()
 
     if allow_unsigned and min_val >= 0:
-        # If allow_unsigned is True, check for unsigned types
         if max_val <= 255:
             return pa.uint8()
         elif max_val <= 65535:
@@ -327,8 +344,7 @@ def _get_optimal_int_type(array: pa.Array, allow_unsigned: bool) -> pa.DataType:
             return pa.uint32()
         else:
             return pa.uint64()
-
-    else:  # Signed
+    else:
         if -128 <= min_val and max_val <= 127:
             return pa.int8()
         elif -32768 <= min_val and max_val <= 32767:
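The integer range test is easy to exercise on its own; a small sketch mirroring the unsigned branch:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([0, 200, 65000])
mm = pc.min_max(arr)
lo, hi = mm["min"].as_py(), mm["max"].as_py()
# non-negative and within uint16 range -> uint16, as in the unsigned branch
dtype = pa.uint16() if lo >= 0 and hi <= 65535 else pa.int64()
print(arr.cast(dtype).type)  # uint16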
@@ -340,141 +356,159 @@ def _get_optimal_int_type(array: pa.Array, allow_unsigned: bool) -> pa.DataType:
 
 
 def _optimize_numeric_array(
-    array: pa.Array, shrink: bool, allow_unsigned: bool = True
-) -> pa.Array:
+    array: pa.Array, shrink: bool, allow_unsigned: bool = True, allow_null: bool = True
+) -> pa.DataType:
     """
     Optimize numeric PyArrow array by downcasting when possible.
-    Uses vectorized operations for efficiency.
+    Returns the optimal dtype.
     """
+
     if not shrink or len(array) == 0 or array.null_count == len(array):
-        return array if len(array) > 0 else pa.array([], type=pa.int8())
+        if allow_null:
+            return pa.null()
+        else:
+            return array.type
 
-    # Handle floating point types
     if pa.types.is_floating(array.type):
         if array.type == pa.float64() and _can_downcast_to_float32(array):
-            return pc.cast(array, pa.float32())
-        return array
+            return pa.float32()
+        return array.type
 
-    # Handle integer types
     if pa.types.is_integer(array.type):
-        # Skip if already optimized to smallest types
-        if array.type in [pa.int8(), pa.uint8()]:
-            return array
+        return _get_optimal_int_type(array, allow_unsigned, allow_null)
 
-        optimal_type = _get_optimal_int_type(array, allow_unsigned)
-        return pc.cast(array, optimal_type)
-
-    # Default: return unchanged
-    return array
+    return array.type
 
 
 def _all_match_regex(array: pa.Array, pattern: str) -> bool:
     """
     Check if all non-null values in array match regex pattern.
-    Uses pyarrow.compute.match_substring_regex for vectorized evaluation.
     """
     if len(array) == 0 or array.null_count == len(array):
         return False
-
-    # Check if al values match the pattern
     return pc.all(pc.match_substring_regex(array, pattern, ignore_case=True)).as_py()
 
 
 def _optimize_string_array(
-    array: pa.Array, col_name: str, shrink_numerics: bool, time_zone: str | None = None
-) -> pa.Array:
+    array: pa.Array,
+    col_name: str,
+    shrink_numerics: bool,
+    time_zone: str | None = None,
+    allow_unsigned: bool = True,
+    allow_null: bool = True,
+) -> pa.DataType:
     """
     Convert string PyArrow array to appropriate type based on content analysis.
-    Uses fully vectorized operations wherever possible.
+    Returns the optimal dtype.
     """
-    # Handle empty or all-null arrays
-    if len(array) == 0:
-        return pa.array([], type=pa.int8())
-    if array.null_count == len(array):
-        return pa.array([None] * len(array), type=pa.null())
+    if len(array) == 0 or array.null_count == len(array):
+        if allow_null:
+            return pa.null()
+        else:
+            return array.type
 
-    # Clean string values
-    cleaned_array = _clean_string_array(array)
+    cleaned_array = _clean_string_array(
+        array, allow_null
+    )  # pc.utf8_trim_whitespace(array)
 
     try:
-        # Check for boolean values
         if _all_match_regex(cleaned_array, BOOLEAN_REGEX):
-            # Match with TRUE pattern
-            true_matches = pc.match_substring_regex(
-                array, BOOLEAN_TRUE_REGEX, ignore_case=True
-            )
-
-            # Convert to boolean type
-            return pc.cast(true_matches, pa.bool_())
-
+            return pa.bool_()
         elif _all_match_regex(cleaned_array, INTEGER_REGEX):
-            # Convert to integer
-            # First replace commas with periods in Polars, then cast
             int_array = pc.cast(
                 pc.replace_substring(cleaned_array, ",", "."), pa.int64()
             )
-
-            if shrink_numerics:
-                optimal_type = _get_optimal_int_type(int_array)
-                return pc.cast(int_array, optimal_type)
-
-            return int_array
-
-        # Check for numeric values
+            return _optimize_numeric_array(
+                int_array, allow_unsigned=allow_unsigned, allow_null=allow_null
+            )
         elif _all_match_regex(cleaned_array, FLOAT_REGEX):
-            # Convert to float
-            # First replace commas with periods in Polars
             float_array = pc.cast(
                 pc.replace_substring(cleaned_array, ",", "."), pa.float64()
             )
-            if shrink_numerics and _can_downcast_to_float32(float_array):
-                return pc.cast(float_array, pa.float32())
-
-            return float_array
-
-        # Check for datetime values - use polars for conversion as specified
+            return _optimize_numeric_array(
+                float_array,
+                shrink_numerics,
+                allow_unsigned=allow_unsigned,
+                allow_null=allow_null,
+            )
         elif _all_match_regex(cleaned_array, DATETIME_REGEX):
-            # Convert via polars
-
             pl_series = pl.Series(col_name, cleaned_array)
             converted = pl_series.str.to_datetime(
                 strict=False, time_unit="us", time_zone=time_zone
             )
-            # Convert polars datetime back to pyarrow
-            return converted.to_arrow()
-
+            # Get the Arrow dtype from Polars
+            arrow_dtype = converted.to_arrow().type
+            return arrow_dtype
     except Exception:
-        # Fallback: return cleaned strings on any error
-        return cleaned_array
+        return pa.string()
 
-    # Default: return cleaned strings
-    return cleaned_array
+    return pa.string()
 
 
 def _process_column(
-    table: pa.Table,
+    # table: pa.Table,
+    # col_name: str,
+    array: pa.Array,
     col_name: str,
     shrink_numerics: bool,
     allow_unsigned: bool,
     time_zone: str | None = None,
-) -> pa.Array:
+) -> pa.Field:
     """
     Process a single column for type optimization.
+    Returns a pyarrow.Field with the optimal dtype.
     """
-    array = table[col_name]
-
-    # Handle all-null columns
+    # array = table[col_name]
     if array.null_count == len(array):
-        return pa.array([None] * len(array), type=pa.null())
+        return pa.field(col_name, pa.null())
 
-    # Process based on current type
     if pa.types.is_floating(array.type) or pa.types.is_integer(array.type):
-        return _optimize_numeric_array(array, shrink_numerics, allow_unsigned)
+        dtype = _optimize_numeric_array(array, shrink_numerics, allow_unsigned)
+        return pa.field(col_name, dtype, nullable=array.null_count > 0)
     elif pa.types.is_string(array.type):
-        return _optimize_string_array(array, col_name, shrink_numerics, time_zone)
-
-    # Keep original for other types
-    return array
+        dtype = _optimize_string_array(array, col_name, shrink_numerics, time_zone)
+        return pa.field(col_name, dtype, nullable=array.null_count > 0)
+
+    return pa.field(col_name, array.type, nullable=array.null_count > 0)
+
+
+def _process_column_for_opt_dtype(args):
+    (
+        array,
+        col_name,
+        cols_to_process,
+        shrink_numerics,
+        allow_unsigned,
+        time_zone,
+        strict,
+        allow_null,
+    ) = args
+    try:
+        if col_name in cols_to_process:
+            field = _process_column(
+                array, col_name, shrink_numerics, allow_unsigned, time_zone
+            )
+            if pa.types.is_null(field.type):
+                if allow_null:
+                    array = pa.nulls(array.length(), type=pa.null())
+                    return (col_name, field, array)
+                else:
+                    orig_type = array.type
+                    # array = table[col_name]
+                    field = pa.field(col_name, orig_type, nullable=True)
+                    return (col_name, field, array)
+            else:
+                array = array.cast(field.type)
+                return (col_name, field, array)
+        else:
+            field = pa.field(col_name, array.type, nullable=True)
+            # array = table[col_name]
+            return (col_name, field, array)
+    except Exception as e:
+        if strict:
+            raise e
+        field = pa.field(col_name, array.type, nullable=True)
+        return (col_name, field, array)
 
 
 def opt_dtype(
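Each column can be inspected independently, which is why a plain thread pool suffices for the fan-out above. The mapping pattern in isolation, with a trivial stand-in for _process_column:

import concurrent.futures

import pyarrow as pa

table = pa.table({"a": ["1", "2"], "b": ["x", None]})

def infer_field(col_name: str) -> pa.Field:
    # stand-in for _process_column: inspect a column, return its target field
    arr = table[col_name]
    return pa.field(col_name, arr.type, nullable=arr.null_count > 0)

with concurrent.futures.ThreadPoolExecutor() as pool:
    fields = list(pool.map(infer_field, table.column_names))

print(pa.schema(fields))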
@@ -484,58 +518,53 @@ def opt_dtype(
     time_zone: str | None = None,
     shrink_numerics: bool = True,
     allow_unsigned: bool = True,
+    use_large_dtypes: bool = False,
     strict: bool = False,
+    allow_null: bool = True,
 ) -> pa.Table:
     """
     Optimize data types of a PyArrow Table for performance and memory efficiency.
-
-    This function analyzes each column and converts it to the most appropriate
-    data type based on content, handling string-to-type conversions and
-    numeric type downcasting. It is the PyArrow equivalent of the Polars
-    `opt_dtype` function.
+    Returns a new table casted to the optimal schema.
 
     Args:
-        table: PyArrow Table to optimize
-        include: Column(s) to include in optimization (default: all columns)
-        exclude: Column(s) to exclude from optimization
-        time_zone: Optional time zone for datetime parsing
-        shrink_numerics: Whether to downcast numeric types when possible
-        allow_unsigned: Whether to allow unsigned types
-        strict: If True, will raise an error if any column cannot be optimized
-
-    Returns:
-        PyArrow Table with optimized data types
+        allow_null (bool): If False, columns that only hold null-like values will not be converted to pyarrow.null().
     """
-    # Normalize include/exclude parameters
     if isinstance(include, str):
         include = [include]
     if isinstance(exclude, str):
         exclude = [exclude]
 
-    # Determine columns to process
     cols_to_process = table.column_names
     if include:
         cols_to_process = [col for col in include if col in table.column_names]
     if exclude:
         cols_to_process = [col for col in cols_to_process if col not in exclude]
 
-    # Process each column and build a new table
-    new_columns = []
-    for col_name in table.column_names:
-        if col_name in cols_to_process:
-            try:
-                # Process column for optimization
-                new_columns.append(
-                    _process_column(
-                        table, col_name, shrink_numerics, allow_unsigned, time_zone
-                    )
-                )
-            except Exception as e:
-                if strict:
-                    raise e
-                new_columns.append(table[col_name])
-        else:
-            new_columns.append(table[col_name])
-
-    # Create a new table with the optimized columns
-    return pa.Table.from_arrays(new_columns, names=table.column_names)
+    # Prepare arguments for parallel processing
+    args_list = [
+        (
+            table[col_name],
+            col_name,
+            cols_to_process,
+            shrink_numerics,
+            allow_unsigned,
+            time_zone,
+            strict,
+            allow_null,
+        )
+        for col_name in table.column_names
+    ]
+
+    # Parallelize column processing
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        results = list(executor.map(_process_column_for_opt_dtype, args_list))
+
+    # Sort results to preserve column order
+    results.sort(key=lambda x: table.column_names.index(x[0]))
+    fields = [field for _, field, _ in results]
+    arrays = [array for _, _, array in results]
+
+    schema = pa.schema(fields)
+    if use_large_dtypes:
+        schema = convert_large_types_to_normal(schema)
+    return pa.Table.from_arrays(arrays, schema=schema)
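And a hypothetical end-to-end call of the PyArrow variant; the import path is again inferred from the RECORD listing below, and actual results depend on the helper internals:

import pyarrow as pa

from flowerpower.plugins.io.helpers.pyarrow import opt_dtype  # assumed path

table = pa.table({
    "num": ["1", "2", "3"],
    "flag": ["ok", "nok", "ok"],
    "empty": pa.array([None, None, None], type=pa.string()),
})
# allow_null=False keeps "empty" as string instead of pa.null()
optimized = opt_dtype(table, allow_null=False)
print(optimized.schema)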
flowerpower-0.11.6.18.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: FlowerPower
-Version: 0.11.6.16
+Version: 0.11.6.18
 Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
 Author-email: "Volker L." <ligno.blades@gmail.com>
 Project-URL: Homepage, https://github.com/legout/flowerpower
flowerpower-0.11.6.18.dist-info/RECORD CHANGED
@@ -18,7 +18,7 @@ flowerpower/cli/pipeline.py,sha256=60P6u_QOSgp0jJXEMxazEEo5Sh7-SWFo-Kkuaz21YuI,3
 flowerpower/cli/utils.py,sha256=nDSSj_1nlYlMmj252kRZeohhFqHv9yvdgDEduQCyWOc,5152
 flowerpower/fs/__init__.py,sha256=uZaPXErEfQqQRbKRIjkB9yiygd45X5_psYn9-VVrBTQ,910
 flowerpower/fs/base.py,sha256=TqgqBsaFj13O1NpAr8kHuGJ9CTlaSWViMB8Ai_iuCjs,22761
-flowerpower/fs/ext.py,sha256=2-BkLdNFORW-OtrmlCXMmUJtYxxhmTmhrzxVtbbsDSw,70604
+flowerpower/fs/ext.py,sha256=7Ue70LL-ktGH_A_bi_nyI0P1WY_fboY5HqdPpqh8a6c,70604
 flowerpower/fs/storage_options.py,sha256=msq5TpxAU8tcE_Bxjw6SyxaFa75UjdYnR4-O9U2wmbk,48034
 flowerpower/job_queue/__init__.py,sha256=a25hIqv2xoFKb4JZlyUukS0ppZ9-2sJKH3XAvbk3rlk,10788
 flowerpower/job_queue/base.py,sha256=YwLunDQSyqkSU_vJ69C5SSybJeJP1bAiZ3teUtOchxA,13640
@@ -47,8 +47,8 @@ flowerpower/pipeline/visualizer.py,sha256=amjMrl5NetErE198HzZBPWVZBi_t5jj9ydxWpu
 flowerpower/plugins/io/base.py,sha256=oGxTKobs0M19hPV842EelAeJ01EBz6kDdGv_4GTyFzk,97098
 flowerpower/plugins/io/metadata.py,sha256=PCrepLilXRWKDsB5BKFF_-OFs712s1zBeitW-84lDLQ,7005
 flowerpower/plugins/io/helpers/datetime.py,sha256=1WBUg2ywcsodJQwoF6JiIGc9yhVobvE2IErWp4i95m4,10649
-flowerpower/plugins/io/helpers/polars.py,sha256=ESA2YHZmWCmyhjwNULQO9_7G_sFcgF9x3q-IGatbP-0,28124
-flowerpower/plugins/io/helpers/pyarrow.py,sha256=lYZHbPklzYvd7L5XqDjoTUV42cHi_c9Wh8xf1HYtS2M,18592
+flowerpower/plugins/io/helpers/polars.py,sha256=2U0XMIDGWr7b7GcD7Qc0v-tSdZz_IiNo1Ws5kSOju6U,29359
+flowerpower/plugins/io/helpers/pyarrow.py,sha256=uFaradEOTReqiDGQy5QyXCC1hY4_Vp_R-3FoosaUJBY,19070
 flowerpower/plugins/io/helpers/sql.py,sha256=BPIxjarKF3p93EdtUu-md8KislE9q8IWNSeZ5toFU6U,7298
 flowerpower/plugins/io/loader/__init__.py,sha256=MKH42nvVokaWas0wFgX1yrpU5iLpvHjRqqF-KzwLHCg,780
 flowerpower/plugins/io/loader/csv.py,sha256=Q5bmcbbr530sT1kQ2YiJwvsMUPqi0VcZWsLOygmzRyI,827
@@ -94,9 +94,9 @@ flowerpower/utils/monkey.py,sha256=VPl3yimoWhwD9kI05BFsjNvtyQiDyLfY4Q85Bb6Ma0w,2
 flowerpower/utils/open_telemetry.py,sha256=fQWJWbIQFtKIxMBjAWeF12NGnqT0isO3A3j-DSOv_vE,949
 flowerpower/utils/scheduler.py,sha256=2zJ_xmLXpvXUQNF1XS2Gqm3Ogo907ctZ50GtvQB_rhE,9354
 flowerpower/utils/templates.py,sha256=ouyEeSDqa9PjW8c32fGpcINlpC0WToawRFZkMPtwsLE,1591
-flowerpower-0.11.6.16.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
-flowerpower-0.11.6.16.dist-info/METADATA,sha256=X1bvOqBHxJgYBRCceoHrE3huch9jf2VB3bwj-5LMLbE,21613
-flowerpower-0.11.6.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-flowerpower-0.11.6.16.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
-flowerpower-0.11.6.16.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
-flowerpower-0.11.6.16.dist-info/RECORD,,
+flowerpower-0.11.6.18.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
+flowerpower-0.11.6.18.dist-info/METADATA,sha256=MzbJqxKAyuDTm0h_qdp26mhhyEfAF8MOmyX_b4kcYjU,21613
+flowerpower-0.11.6.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+flowerpower-0.11.6.18.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
+flowerpower-0.11.6.18.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
+flowerpower-0.11.6.18.dist-info/RECORD,,