FlowerPower 0.11.6.10__py3-none-any.whl → 0.11.6.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
flowerpower/fs/ext.py CHANGED
@@ -951,10 +951,15 @@ def _read_parquet(
951
951
  if isinstance(tables, list):
952
952
  if len(tables) > 1:
953
953
  schemas = [t.schema for t in tables]
954
- unified_schema = unify_schemas_pa(schemas)
954
+ unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
955
955
  tables = [cast_schema(t, unified_schema) for t in tables]
956
+
957
+ tables = [table for table in tables if table.num_rows > 0]
958
+ if not tables:
959
+ return unified_schema.empty_table()
960
+
956
961
  result = pa.concat_tables(
957
- [table for table in tables if table.num_rows > 0],
962
+ tables,
958
963
  promote_options="permissive",
959
964
  )
960
965
  # if opt_dtypes:
@@ -965,8 +970,12 @@ def _read_parquet(
965
970
  # tables = opt_dtype_pa(tables, strict=False)
966
971
  return tables
967
972
  else:
968
- return pa.concat_tables(
969
- [table for table in tables if table.num_rows > 0],
973
+ tables = [table for table in tables if table.num_rows > 0]
974
+ if not tables:
975
+ return unified_schema.empty_table()
976
+
977
+ result = pa.concat_tables(
978
+ tables,
970
979
  promote_options="permissive",
971
980
  )
972
981
  return tables
@@ -1086,10 +1095,13 @@ def _read_parquet_batches(
1086
1095
  # Unify schemas before concatenation
1087
1096
  if len(batch_tables) > 1:
1088
1097
  schemas = [t.schema for t in batch_tables]
1089
- unified_schema = unify_schemas_pa(schemas)
1098
+ unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
1090
1099
  batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
1100
+ batch_tables = [table for table in batch_tables if table.num_rows > 0]
1101
+ if not batch_tables:
1102
+ yield unified_schema.empty_table()
1091
1103
  batch_table = pa.concat_tables(
1092
- [table for table in batch_tables if table.num_rows > 0],
1104
+ batch_tables,
1093
1105
  promote_options="permissive",
1094
1106
  )
1095
1107
  # if opt_dtypes:
@@ -46,15 +46,27 @@ def _can_downcast_to_float32(series: pl.Series) -> bool:
46
46
  return F32_MIN <= min_val <= max_val <= F32_MAX
47
47
 
48
48
 
49
- def _optimize_numeric_column(series: pl.Series, col_name: str, shrink: bool) -> pl.Expr:
50
- """Optimize numeric column types."""
49
+ def _optimize_numeric_column(
50
+ series: pl.Series, col_name: str, shrink: bool, allow_unsigned: bool = False
51
+ ) -> pl.Expr:
52
+ """Optimize numeric column types, optionally converting to unsigned if all values >= 0."""
53
+ expr = pl.col(col_name)
54
+ dtype = series.dtype
55
+
56
+ if allow_unsigned and dtype.is_integer() and (series.min() is not None) and series.min() >= 0:
57
+ # Convert to unsigned integer type, shrink if requested
58
+ if shrink:
59
+ return expr.cast(pl.UInt64).shrink_dtype()
60
+ else:
61
+ return expr.cast(pl.UInt64)
62
+
51
63
  if not shrink:
52
- return pl.col(col_name)
64
+ return expr
53
65
 
54
- if series.dtype == pl.Float64 and not _can_downcast_to_float32(series):
55
- return pl.col(col_name)
66
+ if dtype == pl.Float64 and not _can_downcast_to_float32(series):
67
+ return expr
56
68
 
57
- return pl.col(col_name).shrink_dtype()
69
+ return expr.shrink_dtype()
58
70
 
59
71
 
60
72
  def _optimize_string_column(
@@ -68,7 +80,7 @@ def _optimize_string_column(
68
80
  cleaned_expr = _clean_string_expr(col_name)
69
81
  non_null = series.drop_nulls().replace({"-": None, "": None, "None": None})
70
82
  if len(non_null) == 0:
71
- return pl.col(col_name).cast(series.dtype)
83
+ return pl.col(col_name).cast(pl.Null()) # Fix: Cast to Null type
72
84
 
73
85
  stripped = non_null.str.strip_chars()
74
86
  lowercase = stripped.str.to_lowercase()
@@ -123,7 +135,7 @@ def _get_column_expr(
123
135
 
124
136
  # Handle all-null columns
125
137
  if series.is_null().all():
126
- return pl.col(col_name).cast(series.dtype)
138
+ return pl.col(col_name).cast(pl.Null())
127
139
 
128
140
  # Process based on current type
129
141
  if series.dtype.is_numeric():
@@ -28,18 +28,133 @@ F32_MIN = float(np.finfo(np.float32).min)
28
28
  F32_MAX = float(np.finfo(np.float32).max)
29
29
 
30
30
 
31
+ def dominant_timezone_per_column(
32
+ schemas: list[pa.Schema],
33
+ ) -> dict[str, tuple[str | None, str | None]]:
34
+ """
35
+ For each timestamp column (by name) across all schemas, detect the most frequent timezone (including None).
36
+ If None and a timezone are tied, prefer the timezone.
37
+ Returns a dict: {column_name: dominant_timezone}
38
+ """
39
+ from collections import Counter, defaultdict
40
+
41
+ tz_counts = defaultdict(Counter)
42
+ units = {}
43
+
44
+ for schema in schemas:
45
+ for field in schema:
46
+ if pa.types.is_timestamp(field.type):
47
+ tz = field.type.tz
48
+ name = field.name
49
+ tz_counts[name][tz] += 1
50
+ # Track unit for each column (assume consistent)
51
+ if name not in units:
52
+ units[name] = field.type.unit
53
+
54
+ dominant = {}
55
+ for name, counter in tz_counts.items():
56
+ most_common = counter.most_common()
57
+ if not most_common:
58
+ continue
59
+ top_count = most_common[0][1]
60
+ # Find all with top_count
61
+ top_tzs = [tz for tz, cnt in most_common if cnt == top_count]
62
+ # If tie and one is not None, prefer not-None
63
+ if len(top_tzs) > 1 and any(tz is not None for tz in top_tzs):
64
+ tz = next(tz for tz in top_tzs if tz is not None)
65
+ else:
66
+ tz = most_common[0][0]
67
+ dominant[name] = (units[name], tz)
68
+ return dominant
69
+
70
+
71
+ def standardize_schema_timezones_by_majority(
72
+ schemas: list[pa.Schema],
73
+ ) -> list[pa.Schema]:
74
+ """
75
+ For each timestamp column (by name) across all schemas, set the timezone to the most frequent (with tie-breaking).
76
+ Returns a new list of schemas with updated timestamp timezones.
77
+ """
78
+ dom = dominant_timezone_per_column(schemas)
79
+ new_schemas = []
80
+ for schema in schemas:
81
+ fields = []
82
+ for field in schema:
83
+ if pa.types.is_timestamp(field.type) and field.name in dom:
84
+ unit, tz = dom[field.name]
85
+ fields.append(
86
+ pa.field(
87
+ field.name,
88
+ pa.timestamp(unit, tz),
89
+ field.nullable,
90
+ field.metadata,
91
+ )
92
+ )
93
+ else:
94
+ fields.append(field)
95
+ new_schemas.append(pa.schema(fields, schema.metadata))
96
+ return new_schemas
97
+
98
+
99
+ def standardize_schema_timezones(
100
+ schemas: list[pa.Schema], timezone: str | None = None
101
+ ) -> list[pa.Schema]:
102
+ """
103
+ Standardize timezone info for all timestamp columns in a list of PyArrow schemas.
104
+
105
+ Args:
106
+ schemas (list of pa.Schema): List of PyArrow schemas.
107
+ timezone (str or None): If None, remove timezone from all timestamp columns.
108
+ If str, set this timezone for all timestamp columns.
109
+ If "auto", use the most frequent timezone across schemas.
110
+
111
+ Returns:
112
+ list of pa.Schema: New schemas with standardized timezone info.
113
+ """
114
+ if timezone == "auto":
115
+ # Use the most frequent timezone for each column
116
+ return standardize_schema_timezones_by_majority(schemas)
117
+ new_schemas = []
118
+ for schema in schemas:
119
+ fields = []
120
+ for field in schema:
121
+ if pa.types.is_timestamp(field.type):
122
+ fields.append(
123
+ pa.field(
124
+ field.name,
125
+ pa.timestamp(field.type.unit, timezone),
126
+ field.nullable,
127
+ field.metadata,
128
+ )
129
+ )
130
+ else:
131
+ fields.append(field)
132
+ new_schemas.append(pa.schema(fields, schema.metadata))
133
+ return new_schemas
134
+
135
+
31
136
  def unify_schemas(
32
- schemas: list[pa.Schema], use_large_dtypes: bool = False
137
+ schemas: list[pa.Schema],
138
+ use_large_dtypes: bool = False,
139
+ timezone: str | None = None,
140
+ standardize_timezones: bool = True,
33
141
  ) -> pa.Schema:
34
142
  """
35
143
  Unify a list of PyArrow schemas into a single schema.
36
144
 
37
145
  Args:
38
146
  schemas (list[pa.Schema]): List of PyArrow schemas to unify.
147
+ use_large_dtypes (bool): If True, keep large types like large_string.
148
+ timezone (str | None): If specified, standardize all timestamp columns to this timezone.
149
+ If "auto", use the most frequent timezone across schemas.
150
+ If None, remove timezone from all timestamp columns.
151
+ standardize_timezones (bool): If True, standardize all timestamp columns to the most frequent timezone.
39
152
 
40
153
  Returns:
41
154
  pa.Schema: A unified PyArrow schema.
42
155
  """
156
+ if standardize_timezones:
157
+ schemas = standardize_schema_timezones(schemas, timezone)
43
158
  try:
44
159
  return pa.unify_schemas(schemas, promote_options="permissive")
45
160
  except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError) as e:
@@ -189,7 +304,7 @@ def _can_downcast_to_float32(array: pa.Array) -> bool:
189
304
  return F32_MIN <= min_val <= max_val <= F32_MAX
190
305
 
191
306
 
192
- def _get_optimal_int_type(array: pa.Array) -> pa.DataType:
307
+ def _get_optimal_int_type(array: pa.Array, allow_unsigned: bool) -> pa.DataType:
193
308
  """
194
309
  Determine the most efficient integer type based on data range.
195
310
  """
@@ -202,7 +317,8 @@ def _get_optimal_int_type(array: pa.Array) -> pa.DataType:
202
317
  min_val = min_max["min"].as_py()
203
318
  max_val = min_max["max"].as_py()
204
319
 
205
- if min_val >= 0: # Unsigned
320
+ if allow_unsigned and min_val >= 0:
321
+ # If allow_unsigned is True, check for unsigned types
206
322
  if max_val <= 255:
207
323
  return pa.uint8()
208
324
  elif max_val <= 65535:
@@ -211,6 +327,7 @@ def _get_optimal_int_type(array: pa.Array) -> pa.DataType:
211
327
  return pa.uint32()
212
328
  else:
213
329
  return pa.uint64()
330
+
214
331
  else: # Signed
215
332
  if -128 <= min_val and max_val <= 127:
216
333
  return pa.int8()
@@ -222,7 +339,9 @@ def _get_optimal_int_type(array: pa.Array) -> pa.DataType:
222
339
  return pa.int64()
223
340
 
224
341
 
225
- def _optimize_numeric_array(array: pa.Array, shrink: bool) -> pa.Array:
342
+ def _optimize_numeric_array(
343
+ array: pa.Array, shrink: bool, allow_unsigned: bool = True
344
+ ) -> pa.Array:
226
345
  """
227
346
  Optimize numeric PyArrow array by downcasting when possible.
228
347
  Uses vectorized operations for efficiency.
@@ -242,7 +361,7 @@ def _optimize_numeric_array(array: pa.Array, shrink: bool) -> pa.Array:
242
361
  if array.type in [pa.int8(), pa.uint8()]:
243
362
  return array
244
363
 
245
- optimal_type = _get_optimal_int_type(array)
364
+ optimal_type = _get_optimal_int_type(array, allow_unsigned)
246
365
  return pc.cast(array, optimal_type)
247
366
 
248
367
  # Default: return unchanged
@@ -272,7 +391,7 @@ def _optimize_string_array(
272
391
  if len(array) == 0:
273
392
  return pa.array([], type=pa.int8())
274
393
  if array.null_count == len(array):
275
- return pa.array([None] * len(array), type=array.type)
394
+ return pa.array([None] * len(array), type=pa.null())
276
395
 
277
396
  # Clean string values
278
397
  cleaned_array = _clean_string_array(array)
@@ -333,7 +452,11 @@ def _optimize_string_array(
333
452
 
334
453
 
335
454
  def _process_column(
336
- table: pa.Table, col_name: str, shrink_numerics: bool, time_zone: str | None = None
455
+ table: pa.Table,
456
+ col_name: str,
457
+ shrink_numerics: bool,
458
+ allow_unsigned: bool,
459
+ time_zone: str | None = None,
337
460
  ) -> pa.Array:
338
461
  """
339
462
  Process a single column for type optimization.
@@ -342,11 +465,11 @@ def _process_column(
342
465
 
343
466
  # Handle all-null columns
344
467
  if array.null_count == len(array):
345
- return pa.array([None] * len(array), type=array.type)
468
+ return pa.array([None] * len(array), type=pa.null())
346
469
 
347
470
  # Process based on current type
348
471
  if pa.types.is_floating(array.type) or pa.types.is_integer(array.type):
349
- return _optimize_numeric_array(array, shrink_numerics)
472
+ return _optimize_numeric_array(array, shrink_numerics, allow_unsigned)
350
473
  elif pa.types.is_string(array.type):
351
474
  return _optimize_string_array(array, col_name, shrink_numerics, time_zone)
352
475
 
@@ -360,6 +483,7 @@ def opt_dtype(
360
483
  exclude: str | list[str] | None = None,
361
484
  time_zone: str | None = None,
362
485
  shrink_numerics: bool = True,
486
+ allow_unsigned: bool = True,
363
487
  strict: bool = False,
364
488
  ) -> pa.Table:
365
489
  """
@@ -376,6 +500,7 @@ def opt_dtype(
376
500
  exclude: Column(s) to exclude from optimization
377
501
  time_zone: Optional time zone for datetime parsing
378
502
  shrink_numerics: Whether to downcast numeric types when possible
503
+ allow_unsigned: Whether to allow unsigned types
379
504
  strict: If True, will raise an error if any column cannot be optimized
380
505
 
381
506
  Returns:
@@ -401,7 +526,9 @@ def opt_dtype(
401
526
  try:
402
527
  # Process column for optimization
403
528
  new_columns.append(
404
- _process_column(table, col_name, shrink_numerics, time_zone)
529
+ _process_column(
530
+ table, col_name, shrink_numerics, allow_unsigned, time_zone
531
+ )
405
532
  )
406
533
  except Exception as e:
407
534
  if strict:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: FlowerPower
3
- Version: 0.11.6.10
3
+ Version: 0.11.6.15
4
4
  Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
5
5
  Author-email: "Volker L." <ligno.blades@gmail.com>
6
6
  Project-URL: Homepage, https://github.com/legout/flowerpower
@@ -18,7 +18,7 @@ flowerpower/cli/pipeline.py,sha256=60P6u_QOSgp0jJXEMxazEEo5Sh7-SWFo-Kkuaz21YuI,3
18
18
  flowerpower/cli/utils.py,sha256=nDSSj_1nlYlMmj252kRZeohhFqHv9yvdgDEduQCyWOc,5152
19
19
  flowerpower/fs/__init__.py,sha256=uZaPXErEfQqQRbKRIjkB9yiygd45X5_psYn9-VVrBTQ,910
20
20
  flowerpower/fs/base.py,sha256=TqgqBsaFj13O1NpAr8kHuGJ9CTlaSWViMB8Ai_iuCjs,22761
21
- flowerpower/fs/ext.py,sha256=jshry-nBIjasijqrZZsqVn6QyB8Zh3amt8v1lWk0hEo,70205
21
+ flowerpower/fs/ext.py,sha256=2-BkLdNFORW-OtrmlCXMmUJtYxxhmTmhrzxVtbbsDSw,70604
22
22
  flowerpower/fs/storage_options.py,sha256=msq5TpxAU8tcE_Bxjw6SyxaFa75UjdYnR4-O9U2wmbk,48034
23
23
  flowerpower/job_queue/__init__.py,sha256=a25hIqv2xoFKb4JZlyUukS0ppZ9-2sJKH3XAvbk3rlk,10788
24
24
  flowerpower/job_queue/base.py,sha256=YwLunDQSyqkSU_vJ69C5SSybJeJP1bAiZ3teUtOchxA,13640
@@ -47,8 +47,8 @@ flowerpower/pipeline/visualizer.py,sha256=amjMrl5NetErE198HzZBPWVZBi_t5jj9ydxWpu
47
47
  flowerpower/plugins/io/base.py,sha256=oGxTKobs0M19hPV842EelAeJ01EBz6kDdGv_4GTyFzk,97098
48
48
  flowerpower/plugins/io/metadata.py,sha256=PCrepLilXRWKDsB5BKFF_-OFs712s1zBeitW-84lDLQ,7005
49
49
  flowerpower/plugins/io/helpers/datetime.py,sha256=1WBUg2ywcsodJQwoF6JiIGc9yhVobvE2IErWp4i95m4,10649
50
- flowerpower/plugins/io/helpers/polars.py,sha256=cuzMby0a90AMFXhNEycf53UOwdHw4uxnx322l3m7jB0,27443
51
- flowerpower/plugins/io/helpers/pyarrow.py,sha256=NwA2NAPMIcGmaFE3gx1jKYW_-6gAxQ8Oczdgk4Av-s8,13903
50
+ flowerpower/plugins/io/helpers/polars.py,sha256=B4eg0GZUWh5Mbd1auC8SMmkCznR07q3sHDcgnwRmSNU,27856
51
+ flowerpower/plugins/io/helpers/pyarrow.py,sha256=lYZHbPklzYvd7L5XqDjoTUV42cHi_c9Wh8xf1HYtS2M,18592
52
52
  flowerpower/plugins/io/helpers/sql.py,sha256=BPIxjarKF3p93EdtUu-md8KislE9q8IWNSeZ5toFU6U,7298
53
53
  flowerpower/plugins/io/loader/__init__.py,sha256=MKH42nvVokaWas0wFgX1yrpU5iLpvHjRqqF-KzwLHCg,780
54
54
  flowerpower/plugins/io/loader/csv.py,sha256=Q5bmcbbr530sT1kQ2YiJwvsMUPqi0VcZWsLOygmzRyI,827
@@ -94,9 +94,9 @@ flowerpower/utils/monkey.py,sha256=VPl3yimoWhwD9kI05BFsjNvtyQiDyLfY4Q85Bb6Ma0w,2
94
94
  flowerpower/utils/open_telemetry.py,sha256=fQWJWbIQFtKIxMBjAWeF12NGnqT0isO3A3j-DSOv_vE,949
95
95
  flowerpower/utils/scheduler.py,sha256=2zJ_xmLXpvXUQNF1XS2Gqm3Ogo907ctZ50GtvQB_rhE,9354
96
96
  flowerpower/utils/templates.py,sha256=ouyEeSDqa9PjW8c32fGpcINlpC0WToawRFZkMPtwsLE,1591
97
- flowerpower-0.11.6.10.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
98
- flowerpower-0.11.6.10.dist-info/METADATA,sha256=sSuLHj16hjgnT_mUWWZLs6Wf5IE-Fz4hTRmBy0A9s7s,21613
99
- flowerpower-0.11.6.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
100
- flowerpower-0.11.6.10.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
101
- flowerpower-0.11.6.10.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
102
- flowerpower-0.11.6.10.dist-info/RECORD,,
97
+ flowerpower-0.11.6.15.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
98
+ flowerpower-0.11.6.15.dist-info/METADATA,sha256=QIkBDvwIdawIRdDJDxx0a9xSbBUzX4X-vjQwKjY8Fh8,21613
99
+ flowerpower-0.11.6.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
100
+ flowerpower-0.11.6.15.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
101
+ flowerpower-0.11.6.15.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
102
+ flowerpower-0.11.6.15.dist-info/RECORD,,