FlowerPower 0.11.6.11__py3-none-any.whl → 0.11.6.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/fs/ext.py +18 -6
- flowerpower/plugins/io/helpers/pyarrow.py +116 -1
- {flowerpower-0.11.6.11.dist-info → flowerpower-0.11.6.13.dist-info}/METADATA +1 -1
- {flowerpower-0.11.6.11.dist-info → flowerpower-0.11.6.13.dist-info}/RECORD +8 -8
- {flowerpower-0.11.6.11.dist-info → flowerpower-0.11.6.13.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.6.11.dist-info → flowerpower-0.11.6.13.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.6.11.dist-info → flowerpower-0.11.6.13.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.6.11.dist-info → flowerpower-0.11.6.13.dist-info}/top_level.txt +0 -0
flowerpower/fs/ext.py
CHANGED
@@ -951,10 +951,15 @@ def _read_parquet(
|
|
951
951
|
if isinstance(tables, list):
|
952
952
|
if len(tables) > 1:
|
953
953
|
schemas = [t.schema for t in tables]
|
954
|
-
unified_schema = unify_schemas_pa(schemas)
|
954
|
+
unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
|
955
955
|
tables = [cast_schema(t, unified_schema) for t in tables]
|
956
|
+
|
957
|
+
tables = [table for table in tables if table.num_rows > 0]
|
958
|
+
if not tables:
|
959
|
+
return unified_schema.empty_table()
|
960
|
+
|
956
961
|
result = pa.concat_tables(
|
957
|
-
|
962
|
+
tables,
|
958
963
|
promote_options="permissive",
|
959
964
|
)
|
960
965
|
# if opt_dtypes:
|
@@ -965,8 +970,12 @@ def _read_parquet(
|
|
965
970
|
# tables = opt_dtype_pa(tables, strict=False)
|
966
971
|
return tables
|
967
972
|
else:
|
968
|
-
|
969
|
-
|
973
|
+
tables = [table for table in tables if table.num_rows > 0]
|
974
|
+
if not tables:
|
975
|
+
return unified_schema.empty_table()
|
976
|
+
|
977
|
+
result = pa.concat_tables(
|
978
|
+
tables,
|
970
979
|
promote_options="permissive",
|
971
980
|
)
|
972
981
|
return tables
|
@@ -1086,10 +1095,13 @@ def _read_parquet_batches(
|
|
1086
1095
|
# Unify schemas before concatenation
|
1087
1096
|
if len(batch_tables) > 1:
|
1088
1097
|
schemas = [t.schema for t in batch_tables]
|
1089
|
-
unified_schema = unify_schemas_pa(schemas)
|
1098
|
+
unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
|
1090
1099
|
batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
|
1100
|
+
batch_tables = [table for table in batch_tables if table.num_rows > 0]
|
1101
|
+
if not batch_tables:
|
1102
|
+
yield unified_schema.empty_table()
|
1091
1103
|
batch_table = pa.concat_tables(
|
1092
|
-
|
1104
|
+
batch_tables,
|
1093
1105
|
promote_options="permissive",
|
1094
1106
|
)
|
1095
1107
|
# if opt_dtypes:
|
@@ -28,18 +28,133 @@ F32_MIN = float(np.finfo(np.float32).min)
|
|
28
28
|
F32_MAX = float(np.finfo(np.float32).max)
|
29
29
|
|
30
30
|
|
31
|
+
def dominant_timezone_per_column(
    schemas: list[pa.Schema],
) -> dict[str, tuple[str | None, str | None]]:
    """
    Detect the dominant timezone of every timestamp column across schemas.

    Columns are matched by name. For each timestamp column the timezone that
    occurs in the most schemas wins; a timezone-naive column is counted under
    ``None``. When several timezones share the highest count, the first
    non-None timezone (in order of first appearance) is preferred over
    ``None``.

    Args:
        schemas (list[pa.Schema]): Schemas to inspect.

    Returns:
        dict: ``{column_name: (unit, dominant_timezone)}``. ``unit`` is the
            timestamp unit of the first schema in which the column appears as
            a timestamp; differing units in later schemas are not reconciled.
            An empty input yields an empty dict.
    """
    from collections import Counter, defaultdict

    tz_counts = defaultdict(Counter)
    units = {}

    for schema in schemas:
        for field in schema:
            if not pa.types.is_timestamp(field.type):
                continue
            tz_counts[field.name][field.type.tz] += 1
            # Only the first unit seen per column is kept (assumed consistent).
            units.setdefault(field.name, field.type.unit)

    dominant = {}
    for name, counter in tz_counts.items():
        # A counter only exists once it has been incremented, so `ranked` is
        # never empty here.
        ranked = counter.most_common()
        top_count = ranked[0][1]
        top_tzs = [tz for tz, cnt in ranked if cnt == top_count]
        # most_common() keeps first-seen order among equal counts, so this
        # picks the earliest non-None leader and falls back to None only when
        # None is the sole leader.
        tz = next((tz for tz in top_tzs if tz is not None), None)
        dominant[name] = (units[name], tz)
    return dominant
|
69
|
+
|
70
|
+
|
71
|
+
def standardize_schema_timezones_by_majority(
    schemas: list[pa.Schema],
) -> list[pa.Schema]:
    """
    Align timestamp timezones across schemas using a per-column majority vote.

    The target ``(unit, timezone)`` for each timestamp column name is taken
    from ``dominant_timezone_per_column``. Every timestamp field with a target
    is rebuilt with that unit and timezone, keeping its name, nullability and
    metadata. All other fields are passed through unchanged.

    Args:
        schemas (list[pa.Schema]): Schemas to standardize.

    Returns:
        list[pa.Schema]: New schemas, in input order, with schema-level
            metadata carried over.
    """
    targets = dominant_timezone_per_column(schemas)

    def _retarget(field):
        # Leave non-timestamp fields and columns without a target untouched.
        target = targets.get(field.name)
        if target is None or not pa.types.is_timestamp(field.type):
            return field
        unit, tz = target
        return pa.field(
            field.name,
            pa.timestamp(unit, tz),
            nullable=field.nullable,
            metadata=field.metadata,
        )

    return [
        pa.schema([_retarget(field) for field in schema], metadata=schema.metadata)
        for schema in schemas
    ]
|
97
|
+
|
98
|
+
|
99
|
+
def standardize_schema_timezones(
    schemas: list[pa.Schema], timezone: str | None = None
) -> list[pa.Schema]:
    """
    Standardize the timezone of all timestamp columns in a list of schemas.

    Args:
        schemas (list of pa.Schema): List of PyArrow schemas.
        timezone (str or None): If None, timestamp columns become
            timezone-naive. If "auto", the per-column majority timezone is
            used (delegates to ``standardize_schema_timezones_by_majority``).
            Any other string is applied as the timezone of every timestamp
            column.

    Returns:
        list of pa.Schema: New schemas, in input order. Each timestamp field
            keeps its name, unit, nullability and metadata; non-timestamp
            fields and schema-level metadata are carried over unchanged.
    """
    if timezone == "auto":
        return standardize_schema_timezones_by_majority(schemas)

    def _with_timezone(field):
        # Only timestamp fields are rebuilt; everything else passes through.
        if not pa.types.is_timestamp(field.type):
            return field
        return pa.field(
            field.name,
            pa.timestamp(field.type.unit, timezone),
            nullable=field.nullable,
            metadata=field.metadata,
        )

    return [
        pa.schema(
            [_with_timezone(field) for field in schema],
            metadata=schema.metadata,
        )
        for schema in schemas
    ]
|
134
|
+
|
135
|
+
|
31
136
|
def unify_schemas(
|
32
|
-
schemas: list[pa.Schema],
|
137
|
+
schemas: list[pa.Schema],
|
138
|
+
use_large_dtypes: bool = False,
|
139
|
+
timezone: str | None = None,
|
140
|
+
standardize_timezones: bool = True,
|
33
141
|
) -> pa.Schema:
|
34
142
|
"""
|
35
143
|
Unify a list of PyArrow schemas into a single schema.
|
36
144
|
|
37
145
|
Args:
|
38
146
|
schemas (list[pa.Schema]): List of PyArrow schemas to unify.
|
147
|
+
use_large_dtypes (bool): If True, keep large types like large_string.
|
148
|
+
timezone (str | None): If specified, standardize all timestamp columns to this timezone.
|
149
|
+
If "auto", use the most frequent timezone across schemas.
|
150
|
+
If None, remove timezone from all timestamp columns.
|
151
|
+
standardize_timezones (bool): If True, standardize all timestamp columns to the most frequent timezone.
|
39
152
|
|
40
153
|
Returns:
|
41
154
|
pa.Schema: A unified PyArrow schema.
|
42
155
|
"""
|
156
|
+
if standardize_timezones:
|
157
|
+
schemas = standardize_schema_timezones(schemas, timezone)
|
43
158
|
try:
|
44
159
|
return pa.unify_schemas(schemas, promote_options="permissive")
|
45
160
|
except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError) as e:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: FlowerPower
|
3
|
-
Version: 0.11.6.11
|
3
|
+
Version: 0.11.6.13
|
4
4
|
Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
|
5
5
|
Author-email: "Volker L." <ligno.blades@gmail.com>
|
6
6
|
Project-URL: Homepage, https://github.com/legout/flowerpower
|
@@ -18,7 +18,7 @@ flowerpower/cli/pipeline.py,sha256=60P6u_QOSgp0jJXEMxazEEo5Sh7-SWFo-Kkuaz21YuI,3
|
|
18
18
|
flowerpower/cli/utils.py,sha256=nDSSj_1nlYlMmj252kRZeohhFqHv9yvdgDEduQCyWOc,5152
|
19
19
|
flowerpower/fs/__init__.py,sha256=uZaPXErEfQqQRbKRIjkB9yiygd45X5_psYn9-VVrBTQ,910
|
20
20
|
flowerpower/fs/base.py,sha256=TqgqBsaFj13O1NpAr8kHuGJ9CTlaSWViMB8Ai_iuCjs,22761
|
21
|
-
flowerpower/fs/ext.py,sha256=
|
21
|
+
flowerpower/fs/ext.py,sha256=2-BkLdNFORW-OtrmlCXMmUJtYxxhmTmhrzxVtbbsDSw,70604
|
22
22
|
flowerpower/fs/storage_options.py,sha256=msq5TpxAU8tcE_Bxjw6SyxaFa75UjdYnR4-O9U2wmbk,48034
|
23
23
|
flowerpower/job_queue/__init__.py,sha256=a25hIqv2xoFKb4JZlyUukS0ppZ9-2sJKH3XAvbk3rlk,10788
|
24
24
|
flowerpower/job_queue/base.py,sha256=YwLunDQSyqkSU_vJ69C5SSybJeJP1bAiZ3teUtOchxA,13640
|
@@ -48,7 +48,7 @@ flowerpower/plugins/io/base.py,sha256=oGxTKobs0M19hPV842EelAeJ01EBz6kDdGv_4GTyFz
|
|
48
48
|
flowerpower/plugins/io/metadata.py,sha256=PCrepLilXRWKDsB5BKFF_-OFs712s1zBeitW-84lDLQ,7005
|
49
49
|
flowerpower/plugins/io/helpers/datetime.py,sha256=1WBUg2ywcsodJQwoF6JiIGc9yhVobvE2IErWp4i95m4,10649
|
50
50
|
flowerpower/plugins/io/helpers/polars.py,sha256=6YbPg1UDeZaWLSnXatgvzCNJI8Ui2GhTegYsbV5VgrM,27463
|
51
|
-
flowerpower/plugins/io/helpers/pyarrow.py,sha256=
|
51
|
+
flowerpower/plugins/io/helpers/pyarrow.py,sha256=umgmM2hZQ-tfbZTl8rYo158K6P0SsAOfm7oe-N5cc_M,18243
|
52
52
|
flowerpower/plugins/io/helpers/sql.py,sha256=BPIxjarKF3p93EdtUu-md8KislE9q8IWNSeZ5toFU6U,7298
|
53
53
|
flowerpower/plugins/io/loader/__init__.py,sha256=MKH42nvVokaWas0wFgX1yrpU5iLpvHjRqqF-KzwLHCg,780
|
54
54
|
flowerpower/plugins/io/loader/csv.py,sha256=Q5bmcbbr530sT1kQ2YiJwvsMUPqi0VcZWsLOygmzRyI,827
|
@@ -94,9 +94,9 @@ flowerpower/utils/monkey.py,sha256=VPl3yimoWhwD9kI05BFsjNvtyQiDyLfY4Q85Bb6Ma0w,2
|
|
94
94
|
flowerpower/utils/open_telemetry.py,sha256=fQWJWbIQFtKIxMBjAWeF12NGnqT0isO3A3j-DSOv_vE,949
|
95
95
|
flowerpower/utils/scheduler.py,sha256=2zJ_xmLXpvXUQNF1XS2Gqm3Ogo907ctZ50GtvQB_rhE,9354
|
96
96
|
flowerpower/utils/templates.py,sha256=ouyEeSDqa9PjW8c32fGpcINlpC0WToawRFZkMPtwsLE,1591
|
97
|
-
flowerpower-0.11.6.
|
98
|
-
flowerpower-0.11.6.
|
99
|
-
flowerpower-0.11.6.
|
100
|
-
flowerpower-0.11.6.
|
101
|
-
flowerpower-0.11.6.
|
102
|
-
flowerpower-0.11.6.
|
97
|
+
flowerpower-0.11.6.13.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
|
98
|
+
flowerpower-0.11.6.13.dist-info/METADATA,sha256=aDlOA-x27j2YjpomvE0xrtb7MzOPo7L7ljz-rSMLE6c,21613
|
99
|
+
flowerpower-0.11.6.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
100
|
+
flowerpower-0.11.6.13.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
|
101
|
+
flowerpower-0.11.6.13.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
|
102
|
+
flowerpower-0.11.6.13.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|