PyPI - FlowerPower - Versions diffs - 0.11.6.11__py3-none-any.whl → 0.11.6.13__py3-none-any.whl - Mend

FlowerPower 0.11.6.11__py3-none-any.whl → 0.11.6.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

flowerpower/fs/ext.py CHANGED Viewed

@@ -951,10 +951,15 @@ def _read_parquet(
         if isinstance(tables, list):
             if len(tables) > 1:
                 schemas = [t.schema for t in tables]
-                unified_schema = unify_schemas_pa(schemas)
+                unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
                 tables = [cast_schema(t, unified_schema) for t in tables]
+            tables = [table for table in tables if table.num_rows > 0]
+            if not tables:
+                return unified_schema.empty_table()
             result = pa.concat_tables(
-                [table for table in tables if table.num_rows > 0],
+                tables,
                 promote_options="permissive",
             )
             # if opt_dtypes:
@@ -965,8 +970,12 @@ def _read_parquet(
             #    tables = opt_dtype_pa(tables, strict=False)
             return tables
         else:
-            return pa.concat_tables(
-                [table for table in tables if table.num_rows > 0],
+            tables = [table for table in tables if table.num_rows > 0]
+            if not tables:
+                return unified_schema.empty_table()
+            result = pa.concat_tables(
+                tables,
                 promote_options="permissive",
             )
     return tables
@@ -1086,10 +1095,13 @@ def _read_parquet_batches(
             # Unify schemas before concatenation
             if len(batch_tables) > 1:
                 schemas = [t.schema for t in batch_tables]
-                unified_schema = unify_schemas_pa(schemas)
+                unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
                 batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
+            batch_tables = [table for table in batch_tables if table.num_rows > 0]
+            if not batch_tables:
+                yield unified_schema.empty_table()
             batch_table = pa.concat_tables(
-                [table for table in batch_tables if table.num_rows > 0],
+                batch_tables,
                 promote_options="permissive",
             )
             # if opt_dtypes:

flowerpower/plugins/io/helpers/pyarrow.py CHANGED Viewed

@@ -28,18 +28,133 @@ F32_MIN = float(np.finfo(np.float32).min)
 F32_MAX = float(np.finfo(np.float32).max)
+def dominant_timezone_per_column(
+    schemas: list[pa.Schema],
+) -> dict[str, tuple[str | None, str | None]]:
+    """
+    For each timestamp column (by name) across all schemas, detect the most frequent timezone (including None).
+    If None and a timezone are tied, prefer the timezone.
+    Returns a dict: {column_name: dominant_timezone}
+    """
+    from collections import Counter, defaultdict
+    tz_counts = defaultdict(Counter)
+    units = {}
+    for schema in schemas:
+        for field in schema:
+            if pa.types.is_timestamp(field.type):
+                tz = field.type.tz
+                name = field.name
+                tz_counts[name][tz] += 1
+                # Track unit for each column (assume consistent)
+                if name not in units:
+                    units[name] = field.type.unit
+    dominant = {}
+    for name, counter in tz_counts.items():
+        most_common = counter.most_common()
+        if not most_common:
+            continue
+        top_count = most_common[0][1]
+        # Find all with top_count
+        top_tzs = [tz for tz, cnt in most_common if cnt == top_count]
+        # If tie and one is not None, prefer not-None
+        if len(top_tzs) > 1 and any(tz is not None for tz in top_tzs):
+            tz = next(tz for tz in top_tzs if tz is not None)
+        else:
+            tz = most_common[0][0]
+        dominant[name] = (units[name], tz)
+    return dominant
+def standardize_schema_timezones_by_majority(
+    schemas: list[pa.Schema],
+) -> list[pa.Schema]:
+    """
+    For each timestamp column (by name) across all schemas, set the timezone to the most frequent (with tie-breaking).
+    Returns a new list of schemas with updated timestamp timezones.
+    """
+    dom = dominant_timezone_per_column(schemas)
+    new_schemas = []
+    for schema in schemas:
+        fields = []
+        for field in schema:
+            if pa.types.is_timestamp(field.type) and field.name in dom:
+                unit, tz = dom[field.name]
+                fields.append(
+                    pa.field(
+                        field.name,
+                        pa.timestamp(unit, tz),
+                        field.nullable,
+                        field.metadata,
+                    )
+                )
+            else:
+                fields.append(field)
+        new_schemas.append(pa.schema(fields, schema.metadata))
+    return new_schemas
+def standardize_schema_timezones(
+    schemas: list[pa.Schema], timezone: str | None = None
+) -> list[pa.Schema]:
+    """
+    Standardize timezone info for all timestamp columns in a list of PyArrow schemas.
+    Args:
+        schemas (list of pa.Schema): List of PyArrow schemas.
+        timezone (str or None): If None, remove timezone from all timestamp columns.
+                                If str, set this timezone for all timestamp columns.
+                                If "auto", use the most frequent timezone across schemas.
+    Returns:
+        list of pa.Schema: New schemas with standardized timezone info.
+    """
+    if timezone == "auto":
+        # Use the most frequent timezone for each column
+        return standardize_schema_timezones_by_majority(schemas)
+    new_schemas = []
+    for schema in schemas:
+        fields = []
+        for field in schema:
+            if pa.types.is_timestamp(field.type):
+                fields.append(
+                    pa.field(
+                        field.name,
+                        pa.timestamp(field.type.unit, timezone),
+                        field.nullable,
+                        field.metadata,
+                    )
+                )
+            else:
+                fields.append(field)
+        new_schemas.append(pa.schema(fields, schema.metadata))
+    return new_schemas
 def unify_schemas(
-    schemas: list[pa.Schema], use_large_dtypes: bool = False
+    schemas: list[pa.Schema],
+    use_large_dtypes: bool = False,
+    timezone: str | None = None,
+    standardize_timezones: bool = True,
 ) -> pa.Schema:
     """
     Unify a list of PyArrow schemas into a single schema.
     Args:
         schemas (list[pa.Schema]): List of PyArrow schemas to unify.
+        use_large_dtypes (bool): If True, keep large types like large_string.
+        timezone (str | None): If specified, standardize all timestamp columns to this timezone.
+            If "auto", use the most frequent timezone across schemas.
+            If None, remove timezone from all timestamp columns.
+        standardize_timezones (bool): If True, standardize all timestamp columns to the most frequent timezone.
     Returns:
         pa.Schema: A unified PyArrow schema.
     """
+    if standardize_timezones:
+        schemas = standardize_schema_timezones(schemas, timezone)
     try:
         return pa.unify_schemas(schemas, promote_options="permissive")
     except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError) as e:

{flowerpower-0.11.6.11.dist-info → flowerpower-0.11.6.13.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: FlowerPower
-Version: 0.11.6.11
+Version: 0.11.6.13
 Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
 Author-email: "Volker L." <ligno.blades@gmail.com>
 Project-URL: Homepage, https://github.com/legout/flowerpower

{flowerpower-0.11.6.11.dist-info → flowerpower-0.11.6.13.dist-info}/RECORD RENAMED Viewed

@@ -18,7 +18,7 @@ flowerpower/cli/pipeline.py,sha256=60P6u_QOSgp0jJXEMxazEEo5Sh7-SWFo-Kkuaz21YuI,3
 flowerpower/cli/utils.py,sha256=nDSSj_1nlYlMmj252kRZeohhFqHv9yvdgDEduQCyWOc,5152
 flowerpower/fs/__init__.py,sha256=uZaPXErEfQqQRbKRIjkB9yiygd45X5_psYn9-VVrBTQ,910
 flowerpower/fs/base.py,sha256=TqgqBsaFj13O1NpAr8kHuGJ9CTlaSWViMB8Ai_iuCjs,22761
-flowerpower/fs/ext.py,sha256=jshry-nBIjasijqrZZsqVn6QyB8Zh3amt8v1lWk0hEo,70205
+flowerpower/fs/ext.py,sha256=2-BkLdNFORW-OtrmlCXMmUJtYxxhmTmhrzxVtbbsDSw,70604
 flowerpower/fs/storage_options.py,sha256=msq5TpxAU8tcE_Bxjw6SyxaFa75UjdYnR4-O9U2wmbk,48034
 flowerpower/job_queue/__init__.py,sha256=a25hIqv2xoFKb4JZlyUukS0ppZ9-2sJKH3XAvbk3rlk,10788
 flowerpower/job_queue/base.py,sha256=YwLunDQSyqkSU_vJ69C5SSybJeJP1bAiZ3teUtOchxA,13640
@@ -48,7 +48,7 @@ flowerpower/plugins/io/base.py,sha256=oGxTKobs0M19hPV842EelAeJ01EBz6kDdGv_4GTyFz
 flowerpower/plugins/io/metadata.py,sha256=PCrepLilXRWKDsB5BKFF_-OFs712s1zBeitW-84lDLQ,7005
 flowerpower/plugins/io/helpers/datetime.py,sha256=1WBUg2ywcsodJQwoF6JiIGc9yhVobvE2IErWp4i95m4,10649
 flowerpower/plugins/io/helpers/polars.py,sha256=6YbPg1UDeZaWLSnXatgvzCNJI8Ui2GhTegYsbV5VgrM,27463
-flowerpower/plugins/io/helpers/pyarrow.py,sha256=r8JNCp_BSte2ly41hpk0Z9Ik02-IouazgNp98GcNCb8,13901
+flowerpower/plugins/io/helpers/pyarrow.py,sha256=umgmM2hZQ-tfbZTl8rYo158K6P0SsAOfm7oe-N5cc_M,18243
 flowerpower/plugins/io/helpers/sql.py,sha256=BPIxjarKF3p93EdtUu-md8KislE9q8IWNSeZ5toFU6U,7298
 flowerpower/plugins/io/loader/__init__.py,sha256=MKH42nvVokaWas0wFgX1yrpU5iLpvHjRqqF-KzwLHCg,780
 flowerpower/plugins/io/loader/csv.py,sha256=Q5bmcbbr530sT1kQ2YiJwvsMUPqi0VcZWsLOygmzRyI,827
@@ -94,9 +94,9 @@ flowerpower/utils/monkey.py,sha256=VPl3yimoWhwD9kI05BFsjNvtyQiDyLfY4Q85Bb6Ma0w,2
 flowerpower/utils/open_telemetry.py,sha256=fQWJWbIQFtKIxMBjAWeF12NGnqT0isO3A3j-DSOv_vE,949
 flowerpower/utils/scheduler.py,sha256=2zJ_xmLXpvXUQNF1XS2Gqm3Ogo907ctZ50GtvQB_rhE,9354
 flowerpower/utils/templates.py,sha256=ouyEeSDqa9PjW8c32fGpcINlpC0WToawRFZkMPtwsLE,1591
-flowerpower-0.11.6.11.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
-flowerpower-0.11.6.11.dist-info/METADATA,sha256=T9-JZpIYcZhxzHz2YwcuVfEhXFof_HVuVDsDm917QwU,21613
-flowerpower-0.11.6.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-flowerpower-0.11.6.11.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
-flowerpower-0.11.6.11.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
-flowerpower-0.11.6.11.dist-info/RECORD,,
+flowerpower-0.11.6.13.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
+flowerpower-0.11.6.13.dist-info/METADATA,sha256=aDlOA-x27j2YjpomvE0xrtb7MzOPo7L7ljz-rSMLE6c,21613
+flowerpower-0.11.6.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+flowerpower-0.11.6.13.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
+flowerpower-0.11.6.13.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
+flowerpower-0.11.6.13.dist-info/RECORD,,

{flowerpower-0.11.6.11.dist-info → flowerpower-0.11.6.13.dist-info}/WHEEL RENAMED Viewed

File without changes

{flowerpower-0.11.6.11.dist-info → flowerpower-0.11.6.13.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{flowerpower-0.11.6.11.dist-info → flowerpower-0.11.6.13.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{flowerpower-0.11.6.11.dist-info → flowerpower-0.11.6.13.dist-info}/top_level.txt RENAMED Viewed

File without changes

FlowerPower 0.11.6.11__py3-none-any.whl → 0.11.6.13__py3-none-any.whl

FlowerPower 0.11.6.11__py3-none-any.whl → 0.11.6.13__py3-none-any.whl