FlowerPower 0.11.6.12__py3-none-any.whl → 0.11.6.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
flowerpower/fs/ext.py CHANGED
@@ -951,7 +951,7 @@ def _read_parquet(
951
951
  if isinstance(tables, list):
952
952
  if len(tables) > 1:
953
953
  schemas = [t.schema for t in tables]
954
- unified_schema = unify_schemas_pa(schemas)
954
+ unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
955
955
  tables = [cast_schema(t, unified_schema) for t in tables]
956
956
 
957
957
  tables = [table for table in tables if table.num_rows > 0]
@@ -1095,7 +1095,7 @@ def _read_parquet_batches(
1095
1095
  # Unify schemas before concatenation
1096
1096
  if len(batch_tables) > 1:
1097
1097
  schemas = [t.schema for t in batch_tables]
1098
- unified_schema = unify_schemas_pa(schemas)
1098
+ unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
1099
1099
  batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
1100
1100
  batch_tables = [table for table in batch_tables if table.num_rows > 0]
1101
1101
  if not batch_tables:
@@ -28,18 +28,133 @@ F32_MIN = float(np.finfo(np.float32).min)
28
28
  F32_MAX = float(np.finfo(np.float32).max)
29
29
 
30
30
 
31
def dominant_timezone_per_column(
    schemas: list[pa.Schema],
) -> dict[str, tuple[str | None, str | None]]:
    """
    Determine the dominant timezone (and unit) of every timestamp column.

    Timestamp columns are matched by name across all schemas. For each
    column the timezone that occurs most often is selected; a missing
    timezone (None) is counted like any other value. When several
    timezones share the highest count, a real timezone is preferred over
    None, and among several real timezones the one seen first wins.

    Args:
        schemas (list[pa.Schema]): Schemas to inspect.

    Returns:
        dict: {column_name: (unit, dominant_timezone)}. ``unit`` is the
            timestamp unit of the first occurrence of the column.
            Columns that never appear with a timestamp type are omitted.
    """
    from collections import Counter, defaultdict

    tz_counts = defaultdict(Counter)
    units = {}

    for schema in schemas:
        for field in schema:
            if not pa.types.is_timestamp(field.type):
                continue
            tz_counts[field.name][field.type.tz] += 1
            # Only the first unit seen is kept; units are assumed to be
            # consistent across schemas for a given column name.
            units.setdefault(field.name, field.type.unit)

    dominant = {}
    for name, counter in tz_counts.items():
        # A counter is only created by the increment above, so it always
        # holds at least one entry and max() cannot fail here.
        top_count = max(counter.values())
        # Counter preserves insertion order, so tied timezones are listed
        # in the order they were first encountered.
        top_tzs = [tz for tz, cnt in counter.items() if cnt == top_count]
        # Prefer a real timezone; fall back to None only when None is the
        # sole top entry (keys are unique, so a tie always has a non-None).
        tz = next((tz for tz in top_tzs if tz is not None), None)
        dominant[name] = (units[name], tz)
    return dominant
69
+
70
+
71
def standardize_schema_timezones_by_majority(
    schemas: list[pa.Schema],
) -> list[pa.Schema]:
    """
    Rewrite timestamp columns so each uses its majority timezone.

    The target unit and timezone per column name come from
    ``dominant_timezone_per_column``. Non-timestamp fields are passed
    through unchanged; field nullability, field metadata and schema
    metadata are carried over.

    Args:
        schemas (list[pa.Schema]): Schemas to standardize.

    Returns:
        list[pa.Schema]: New schemas, in the same order as the input.
    """
    majority = dominant_timezone_per_column(schemas)

    def _retype(field):
        # Leave anything that is not a known timestamp column untouched.
        if not (pa.types.is_timestamp(field.type) and field.name in majority):
            return field
        unit, tz = majority[field.name]
        return pa.field(
            field.name,
            pa.timestamp(unit, tz),
            nullable=field.nullable,
            metadata=field.metadata,
        )

    return [
        pa.schema([_retype(field) for field in schema], metadata=schema.metadata)
        for schema in schemas
    ]
97
+
98
+
99
def standardize_schema_timezones(
    schemas: list[pa.Schema], timezone: str | None = None
) -> list[pa.Schema]:
    """
    Apply one timezone policy to every timestamp column of the schemas.

    Args:
        schemas (list[pa.Schema]): Schemas to standardize.
        timezone (str | None): Policy to apply.
            None strips the timezone from all timestamp columns.
            "auto" delegates to ``standardize_schema_timezones_by_majority``.
            Any other string is set as the timezone of all timestamp columns.

    Returns:
        list[pa.Schema]: New schemas, in the same order as the input. Each
            timestamp column keeps its own unit, nullability and metadata.
    """
    if timezone == "auto":
        # Per-column majority vote instead of one fixed timezone.
        return standardize_schema_timezones_by_majority(schemas)

    def _with_timezone(field):
        # Non-timestamp fields pass through unchanged.
        if not pa.types.is_timestamp(field.type):
            return field
        return pa.field(
            field.name,
            pa.timestamp(field.type.unit, timezone),
            nullable=field.nullable,
            metadata=field.metadata,
        )

    return [
        pa.schema(
            [_with_timezone(field) for field in schema],
            metadata=schema.metadata,
        )
        for schema in schemas
    ]
134
+
135
+
31
136
  def unify_schemas(
32
- schemas: list[pa.Schema], use_large_dtypes: bool = False
137
+ schemas: list[pa.Schema],
138
+ use_large_dtypes: bool = False,
139
+ timezone: str | None = None,
140
+ standardize_timezones: bool = True,
33
141
  ) -> pa.Schema:
34
142
  """
35
143
  Unify a list of PyArrow schemas into a single schema.
36
144
 
37
145
  Args:
38
146
  schemas (list[pa.Schema]): List of PyArrow schemas to unify.
147
+ use_large_dtypes (bool): If True, keep large types like large_string.
148
+ timezone (str | None): If specified, standardize all timestamp columns to this timezone.
149
+ If "auto", use the most frequent timezone across schemas.
150
+ If None, remove timezone from all timestamp columns.
151
+ standardize_timezones (bool): If True, standardize all timestamp columns to the most frequent timezone.
39
152
 
40
153
  Returns:
41
154
  pa.Schema: A unified PyArrow schema.
42
155
  """
156
+ if standardize_timezones:
157
+ schemas = standardize_schema_timezones(schemas, timezone)
43
158
  try:
44
159
  return pa.unify_schemas(schemas, promote_options="permissive")
45
160
  except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError) as e:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: FlowerPower
3
- Version: 0.11.6.12
3
+ Version: 0.11.6.13
4
4
  Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
5
5
  Author-email: "Volker L." <ligno.blades@gmail.com>
6
6
  Project-URL: Homepage, https://github.com/legout/flowerpower
@@ -18,7 +18,7 @@ flowerpower/cli/pipeline.py,sha256=60P6u_QOSgp0jJXEMxazEEo5Sh7-SWFo-Kkuaz21YuI,3
18
18
  flowerpower/cli/utils.py,sha256=nDSSj_1nlYlMmj252kRZeohhFqHv9yvdgDEduQCyWOc,5152
19
19
  flowerpower/fs/__init__.py,sha256=uZaPXErEfQqQRbKRIjkB9yiygd45X5_psYn9-VVrBTQ,910
20
20
  flowerpower/fs/base.py,sha256=TqgqBsaFj13O1NpAr8kHuGJ9CTlaSWViMB8Ai_iuCjs,22761
21
- flowerpower/fs/ext.py,sha256=gVmca4UVC1y33Y-8BIstO5a7oyw20bcSxtA19F0Limk,70548
21
+ flowerpower/fs/ext.py,sha256=2-BkLdNFORW-OtrmlCXMmUJtYxxhmTmhrzxVtbbsDSw,70604
22
22
  flowerpower/fs/storage_options.py,sha256=msq5TpxAU8tcE_Bxjw6SyxaFa75UjdYnR4-O9U2wmbk,48034
23
23
  flowerpower/job_queue/__init__.py,sha256=a25hIqv2xoFKb4JZlyUukS0ppZ9-2sJKH3XAvbk3rlk,10788
24
24
  flowerpower/job_queue/base.py,sha256=YwLunDQSyqkSU_vJ69C5SSybJeJP1bAiZ3teUtOchxA,13640
@@ -48,7 +48,7 @@ flowerpower/plugins/io/base.py,sha256=oGxTKobs0M19hPV842EelAeJ01EBz6kDdGv_4GTyFz
48
48
  flowerpower/plugins/io/metadata.py,sha256=PCrepLilXRWKDsB5BKFF_-OFs712s1zBeitW-84lDLQ,7005
49
49
  flowerpower/plugins/io/helpers/datetime.py,sha256=1WBUg2ywcsodJQwoF6JiIGc9yhVobvE2IErWp4i95m4,10649
50
50
  flowerpower/plugins/io/helpers/polars.py,sha256=6YbPg1UDeZaWLSnXatgvzCNJI8Ui2GhTegYsbV5VgrM,27463
51
- flowerpower/plugins/io/helpers/pyarrow.py,sha256=r8JNCp_BSte2ly41hpk0Z9Ik02-IouazgNp98GcNCb8,13901
51
+ flowerpower/plugins/io/helpers/pyarrow.py,sha256=umgmM2hZQ-tfbZTl8rYo158K6P0SsAOfm7oe-N5cc_M,18243
52
52
  flowerpower/plugins/io/helpers/sql.py,sha256=BPIxjarKF3p93EdtUu-md8KislE9q8IWNSeZ5toFU6U,7298
53
53
  flowerpower/plugins/io/loader/__init__.py,sha256=MKH42nvVokaWas0wFgX1yrpU5iLpvHjRqqF-KzwLHCg,780
54
54
  flowerpower/plugins/io/loader/csv.py,sha256=Q5bmcbbr530sT1kQ2YiJwvsMUPqi0VcZWsLOygmzRyI,827
@@ -94,9 +94,9 @@ flowerpower/utils/monkey.py,sha256=VPl3yimoWhwD9kI05BFsjNvtyQiDyLfY4Q85Bb6Ma0w,2
94
94
  flowerpower/utils/open_telemetry.py,sha256=fQWJWbIQFtKIxMBjAWeF12NGnqT0isO3A3j-DSOv_vE,949
95
95
  flowerpower/utils/scheduler.py,sha256=2zJ_xmLXpvXUQNF1XS2Gqm3Ogo907ctZ50GtvQB_rhE,9354
96
96
  flowerpower/utils/templates.py,sha256=ouyEeSDqa9PjW8c32fGpcINlpC0WToawRFZkMPtwsLE,1591
97
- flowerpower-0.11.6.12.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
98
- flowerpower-0.11.6.12.dist-info/METADATA,sha256=2_9L9FZ9Chw2Z5IQJXNdkUnfdmrv91cPiq7KRijnrw0,21613
99
- flowerpower-0.11.6.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
100
- flowerpower-0.11.6.12.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
101
- flowerpower-0.11.6.12.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
102
- flowerpower-0.11.6.12.dist-info/RECORD,,
97
+ flowerpower-0.11.6.13.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
98
+ flowerpower-0.11.6.13.dist-info/METADATA,sha256=aDlOA-x27j2YjpomvE0xrtb7MzOPo7L7ljz-rSMLE6c,21613
99
+ flowerpower-0.11.6.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
100
+ flowerpower-0.11.6.13.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
101
+ flowerpower-0.11.6.13.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
102
+ flowerpower-0.11.6.13.dist-info/RECORD,,