PyPI - FlowerPower - Versions diffs - 0.11.6.10__py3-none-any.whl → 0.11.6.15__py3-none-any.whl - Mend

FlowerPower 0.11.6.10__py3-none-any.whl → 0.11.6.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

flowerpower/fs/ext.py CHANGED Viewed

@@ -951,10 +951,15 @@ def _read_parquet(
         if isinstance(tables, list):
             if len(tables) > 1:
                 schemas = [t.schema for t in tables]
-                unified_schema = unify_schemas_pa(schemas)
+                unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
                 tables = [cast_schema(t, unified_schema) for t in tables]
+            tables = [table for table in tables if table.num_rows > 0]
+            if not tables:
+                return unified_schema.empty_table()
             result = pa.concat_tables(
-                [table for table in tables if table.num_rows > 0],
+                tables,
                 promote_options="permissive",
             )
             # if opt_dtypes:
@@ -965,8 +970,12 @@ def _read_parquet(
             #    tables = opt_dtype_pa(tables, strict=False)
             return tables
         else:
-            return pa.concat_tables(
-                [table for table in tables if table.num_rows > 0],
+            tables = [table for table in tables if table.num_rows > 0]
+            if not tables:
+                return unified_schema.empty_table()
+            result = pa.concat_tables(
+                tables,
                 promote_options="permissive",
             )
     return tables
@@ -1086,10 +1095,13 @@ def _read_parquet_batches(
             # Unify schemas before concatenation
             if len(batch_tables) > 1:
                 schemas = [t.schema for t in batch_tables]
-                unified_schema = unify_schemas_pa(schemas)
+                unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
                 batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
+            batch_tables = [table for table in batch_tables if table.num_rows > 0]
+            if not batch_tables:
+                yield unified_schema.empty_table()
             batch_table = pa.concat_tables(
-                [table for table in batch_tables if table.num_rows > 0],
+                batch_tables,
                 promote_options="permissive",
             )
             # if opt_dtypes:

flowerpower/plugins/io/helpers/polars.py CHANGED Viewed

@@ -46,15 +46,27 @@ def _can_downcast_to_float32(series: pl.Series) -> bool:
     return F32_MIN <= min_val <= max_val <= F32_MAX
-def _optimize_numeric_column(series: pl.Series, col_name: str, shrink: bool) -> pl.Expr:
-    """Optimize numeric column types."""
+def _optimize_numeric_column(
+    series: pl.Series, col_name: str, shrink: bool, allow_unsigned: bool = False
+) -> pl.Expr:
+    """Optimize numeric column types, optionally converting to unsigned if all values >= 0."""
+    expr = pl.col(col_name)
+    dtype = series.dtype
+    if allow_unsigned and dtype.is_integer() and (series.min() is not None) and series.min() >= 0:
+        # Convert to unsigned integer type, shrink if requested
+        if shrink:
+            return expr.cast(pl.UInt64).shrink_dtype()
+        else:
+            return expr.cast(pl.UInt64)
     if not shrink:
-        return pl.col(col_name)
+        return expr
-    if series.dtype == pl.Float64 and not _can_downcast_to_float32(series):
-        return pl.col(col_name)
+    if dtype == pl.Float64 and not _can_downcast_to_float32(series):
+        return expr
-    return pl.col(col_name).shrink_dtype()
+    return expr.shrink_dtype()
 def _optimize_string_column(
@@ -68,7 +80,7 @@ def _optimize_string_column(
     cleaned_expr = _clean_string_expr(col_name)
     non_null = series.drop_nulls().replace({"-": None, "": None, "None": None})
     if len(non_null) == 0:
-        return pl.col(col_name).cast(series.dtype)
+        return pl.col(col_name).cast(pl.Null())  # Fix: Cast to Null type
     stripped = non_null.str.strip_chars()
     lowercase = stripped.str.to_lowercase()
@@ -123,7 +135,7 @@ def _get_column_expr(
     # Handle all-null columns
     if series.is_null().all():
-        return pl.col(col_name).cast(series.dtype)
+        return pl.col(col_name).cast(pl.Null())
     # Process based on current type
     if series.dtype.is_numeric():

flowerpower/plugins/io/helpers/pyarrow.py CHANGED Viewed

@@ -28,18 +28,133 @@ F32_MIN = float(np.finfo(np.float32).min)
 F32_MAX = float(np.finfo(np.float32).max)
+def dominant_timezone_per_column(
+    schemas: list[pa.Schema],
+) -> dict[str, tuple[str | None, str | None]]:
+    """
+    For each timestamp column (by name) across all schemas, detect the most frequent timezone (including None).
+    If None and a timezone are tied, prefer the timezone.
+    Returns a dict: {column_name: dominant_timezone}
+    """
+    from collections import Counter, defaultdict
+    tz_counts = defaultdict(Counter)
+    units = {}
+    for schema in schemas:
+        for field in schema:
+            if pa.types.is_timestamp(field.type):
+                tz = field.type.tz
+                name = field.name
+                tz_counts[name][tz] += 1
+                # Track unit for each column (assume consistent)
+                if name not in units:
+                    units[name] = field.type.unit
+    dominant = {}
+    for name, counter in tz_counts.items():
+        most_common = counter.most_common()
+        if not most_common:
+            continue
+        top_count = most_common[0][1]
+        # Find all with top_count
+        top_tzs = [tz for tz, cnt in most_common if cnt == top_count]
+        # If tie and one is not None, prefer not-None
+        if len(top_tzs) > 1 and any(tz is not None for tz in top_tzs):
+            tz = next(tz for tz in top_tzs if tz is not None)
+        else:
+            tz = most_common[0][0]
+        dominant[name] = (units[name], tz)
+    return dominant
+def standardize_schema_timezones_by_majority(
+    schemas: list[pa.Schema],
+) -> list[pa.Schema]:
+    """
+    For each timestamp column (by name) across all schemas, set the timezone to the most frequent (with tie-breaking).
+    Returns a new list of schemas with updated timestamp timezones.
+    """
+    dom = dominant_timezone_per_column(schemas)
+    new_schemas = []
+    for schema in schemas:
+        fields = []
+        for field in schema:
+            if pa.types.is_timestamp(field.type) and field.name in dom:
+                unit, tz = dom[field.name]
+                fields.append(
+                    pa.field(
+                        field.name,
+                        pa.timestamp(unit, tz),
+                        field.nullable,
+                        field.metadata,
+                    )
+                )
+            else:
+                fields.append(field)
+        new_schemas.append(pa.schema(fields, schema.metadata))
+    return new_schemas
+def standardize_schema_timezones(
+    schemas: list[pa.Schema], timezone: str | None = None
+) -> list[pa.Schema]:
+    """
+    Standardize timezone info for all timestamp columns in a list of PyArrow schemas.
+    Args:
+        schemas (list of pa.Schema): List of PyArrow schemas.
+        timezone (str or None): If None, remove timezone from all timestamp columns.
+                                If str, set this timezone for all timestamp columns.
+                                If "auto", use the most frequent timezone across schemas.
+    Returns:
+        list of pa.Schema: New schemas with standardized timezone info.
+    """
+    if timezone == "auto":
+        # Use the most frequent timezone for each column
+        return standardize_schema_timezones_by_majority(schemas)
+    new_schemas = []
+    for schema in schemas:
+        fields = []
+        for field in schema:
+            if pa.types.is_timestamp(field.type):
+                fields.append(
+                    pa.field(
+                        field.name,
+                        pa.timestamp(field.type.unit, timezone),
+                        field.nullable,
+                        field.metadata,
+                    )
+                )
+            else:
+                fields.append(field)
+        new_schemas.append(pa.schema(fields, schema.metadata))
+    return new_schemas
 def unify_schemas(
-    schemas: list[pa.Schema], use_large_dtypes: bool = False
+    schemas: list[pa.Schema],
+    use_large_dtypes: bool = False,
+    timezone: str | None = None,
+    standardize_timezones: bool = True,
 ) -> pa.Schema:
     """
     Unify a list of PyArrow schemas into a single schema.
     Args:
         schemas (list[pa.Schema]): List of PyArrow schemas to unify.
+        use_large_dtypes (bool): If True, keep large types like large_string.
+        timezone (str | None): If specified, standardize all timestamp columns to this timezone.
+            If "auto", use the most frequent timezone across schemas.
+            If None, remove timezone from all timestamp columns.
+        standardize_timezones (bool): If True, standardize all timestamp columns to the most frequent timezone.
     Returns:
         pa.Schema: A unified PyArrow schema.
     """
+    if standardize_timezones:
+        schemas = standardize_schema_timezones(schemas, timezone)
     try:
         return pa.unify_schemas(schemas, promote_options="permissive")
     except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError) as e:
@@ -189,7 +304,7 @@ def _can_downcast_to_float32(array: pa.Array) -> bool:
     return F32_MIN <= min_val <= max_val <= F32_MAX
-def _get_optimal_int_type(array: pa.Array) -> pa.DataType:
+def _get_optimal_int_type(array: pa.Array, allow_unsigned: bool) -> pa.DataType:
     """
     Determine the most efficient integer type based on data range.
     """
@@ -202,7 +317,8 @@ def _get_optimal_int_type(array: pa.Array) -> pa.DataType:
     min_val = min_max["min"].as_py()
     max_val = min_max["max"].as_py()
-    if min_val >= 0:  # Unsigned
+    if allow_unsigned and min_val >= 0:
+        # If allow_unsigned is True, check for unsigned types
         if max_val <= 255:
             return pa.uint8()
         elif max_val <= 65535:
@@ -211,6 +327,7 @@ def _get_optimal_int_type(array: pa.Array) -> pa.DataType:
             return pa.uint32()
         else:
             return pa.uint64()
     else:  # Signed
         if -128 <= min_val and max_val <= 127:
             return pa.int8()
@@ -222,7 +339,9 @@ def _get_optimal_int_type(array: pa.Array) -> pa.DataType:
             return pa.int64()
-def _optimize_numeric_array(array: pa.Array, shrink: bool) -> pa.Array:
+def _optimize_numeric_array(
+    array: pa.Array, shrink: bool, allow_unsigned: bool = True
+) -> pa.Array:
     """
     Optimize numeric PyArrow array by downcasting when possible.
     Uses vectorized operations for efficiency.
@@ -242,7 +361,7 @@ def _optimize_numeric_array(array: pa.Array, shrink: bool) -> pa.Array:
         if array.type in [pa.int8(), pa.uint8()]:
             return array
-        optimal_type = _get_optimal_int_type(array)
+        optimal_type = _get_optimal_int_type(array, allow_unsigned)
         return pc.cast(array, optimal_type)
     # Default: return unchanged
@@ -272,7 +391,7 @@ def _optimize_string_array(
     if len(array) == 0:
         return pa.array([], type=pa.int8())
     if array.null_count == len(array):
-        return pa.array([None] * len(array), type=array.type)
+        return pa.array([None] * len(array), type=pa.null())
     # Clean string values
     cleaned_array = _clean_string_array(array)
@@ -333,7 +452,11 @@ def _optimize_string_array(
 def _process_column(
-    table: pa.Table, col_name: str, shrink_numerics: bool, time_zone: str | None = None
+    table: pa.Table,
+    col_name: str,
+    shrink_numerics: bool,
+    allow_unsigned: bool,
+    time_zone: str | None = None,
 ) -> pa.Array:
     """
     Process a single column for type optimization.
@@ -342,11 +465,11 @@ def _process_column(
     # Handle all-null columns
     if array.null_count == len(array):
-        return pa.array([None] * len(array), type=array.type)
+        return pa.array([None] * len(array), type=pa.null())
     # Process based on current type
     if pa.types.is_floating(array.type) or pa.types.is_integer(array.type):
-        return _optimize_numeric_array(array, shrink_numerics)
+        return _optimize_numeric_array(array, shrink_numerics, allow_unsigned)
     elif pa.types.is_string(array.type):
         return _optimize_string_array(array, col_name, shrink_numerics, time_zone)
@@ -360,6 +483,7 @@ def opt_dtype(
     exclude: str | list[str] | None = None,
     time_zone: str | None = None,
     shrink_numerics: bool = True,
+    allow_unsigned: bool = True,
     strict: bool = False,
 ) -> pa.Table:
     """
@@ -376,6 +500,7 @@ def opt_dtype(
         exclude: Column(s) to exclude from optimization
         time_zone: Optional time zone for datetime parsing
         shrink_numerics: Whether to downcast numeric types when possible
+        allow_unsigned: Whether to allow unsigned types
         strict: If True, will raise an error if any column cannot be optimized
     Returns:
@@ -401,7 +526,9 @@ def opt_dtype(
             try:
                 # Process column for optimization
                 new_columns.append(
-                    _process_column(table, col_name, shrink_numerics, time_zone)
+                    _process_column(
+                        table, col_name, shrink_numerics, allow_unsigned, time_zone
+                    )
                 )
             except Exception as e:
                 if strict:

{flowerpower-0.11.6.10.dist-info → flowerpower-0.11.6.15.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: FlowerPower
-Version: 0.11.6.10
+Version: 0.11.6.15
 Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
 Author-email: "Volker L." <ligno.blades@gmail.com>
 Project-URL: Homepage, https://github.com/legout/flowerpower

{flowerpower-0.11.6.10.dist-info → flowerpower-0.11.6.15.dist-info}/RECORD RENAMED Viewed

@@ -18,7 +18,7 @@ flowerpower/cli/pipeline.py,sha256=60P6u_QOSgp0jJXEMxazEEo5Sh7-SWFo-Kkuaz21YuI,3
 flowerpower/cli/utils.py,sha256=nDSSj_1nlYlMmj252kRZeohhFqHv9yvdgDEduQCyWOc,5152
 flowerpower/fs/__init__.py,sha256=uZaPXErEfQqQRbKRIjkB9yiygd45X5_psYn9-VVrBTQ,910
 flowerpower/fs/base.py,sha256=TqgqBsaFj13O1NpAr8kHuGJ9CTlaSWViMB8Ai_iuCjs,22761
-flowerpower/fs/ext.py,sha256=jshry-nBIjasijqrZZsqVn6QyB8Zh3amt8v1lWk0hEo,70205
+flowerpower/fs/ext.py,sha256=2-BkLdNFORW-OtrmlCXMmUJtYxxhmTmhrzxVtbbsDSw,70604
 flowerpower/fs/storage_options.py,sha256=msq5TpxAU8tcE_Bxjw6SyxaFa75UjdYnR4-O9U2wmbk,48034
 flowerpower/job_queue/__init__.py,sha256=a25hIqv2xoFKb4JZlyUukS0ppZ9-2sJKH3XAvbk3rlk,10788
 flowerpower/job_queue/base.py,sha256=YwLunDQSyqkSU_vJ69C5SSybJeJP1bAiZ3teUtOchxA,13640
@@ -47,8 +47,8 @@ flowerpower/pipeline/visualizer.py,sha256=amjMrl5NetErE198HzZBPWVZBi_t5jj9ydxWpu
 flowerpower/plugins/io/base.py,sha256=oGxTKobs0M19hPV842EelAeJ01EBz6kDdGv_4GTyFzk,97098
 flowerpower/plugins/io/metadata.py,sha256=PCrepLilXRWKDsB5BKFF_-OFs712s1zBeitW-84lDLQ,7005
 flowerpower/plugins/io/helpers/datetime.py,sha256=1WBUg2ywcsodJQwoF6JiIGc9yhVobvE2IErWp4i95m4,10649
-flowerpower/plugins/io/helpers/polars.py,sha256=cuzMby0a90AMFXhNEycf53UOwdHw4uxnx322l3m7jB0,27443
-flowerpower/plugins/io/helpers/pyarrow.py,sha256=NwA2NAPMIcGmaFE3gx1jKYW_-6gAxQ8Oczdgk4Av-s8,13903
+flowerpower/plugins/io/helpers/polars.py,sha256=B4eg0GZUWh5Mbd1auC8SMmkCznR07q3sHDcgnwRmSNU,27856
+flowerpower/plugins/io/helpers/pyarrow.py,sha256=lYZHbPklzYvd7L5XqDjoTUV42cHi_c9Wh8xf1HYtS2M,18592
 flowerpower/plugins/io/helpers/sql.py,sha256=BPIxjarKF3p93EdtUu-md8KislE9q8IWNSeZ5toFU6U,7298
 flowerpower/plugins/io/loader/__init__.py,sha256=MKH42nvVokaWas0wFgX1yrpU5iLpvHjRqqF-KzwLHCg,780
 flowerpower/plugins/io/loader/csv.py,sha256=Q5bmcbbr530sT1kQ2YiJwvsMUPqi0VcZWsLOygmzRyI,827
@@ -94,9 +94,9 @@ flowerpower/utils/monkey.py,sha256=VPl3yimoWhwD9kI05BFsjNvtyQiDyLfY4Q85Bb6Ma0w,2
 flowerpower/utils/open_telemetry.py,sha256=fQWJWbIQFtKIxMBjAWeF12NGnqT0isO3A3j-DSOv_vE,949
 flowerpower/utils/scheduler.py,sha256=2zJ_xmLXpvXUQNF1XS2Gqm3Ogo907ctZ50GtvQB_rhE,9354
 flowerpower/utils/templates.py,sha256=ouyEeSDqa9PjW8c32fGpcINlpC0WToawRFZkMPtwsLE,1591
-flowerpower-0.11.6.10.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
-flowerpower-0.11.6.10.dist-info/METADATA,sha256=sSuLHj16hjgnT_mUWWZLs6Wf5IE-Fz4hTRmBy0A9s7s,21613
-flowerpower-0.11.6.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-flowerpower-0.11.6.10.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
-flowerpower-0.11.6.10.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
-flowerpower-0.11.6.10.dist-info/RECORD,,
+flowerpower-0.11.6.15.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
+flowerpower-0.11.6.15.dist-info/METADATA,sha256=QIkBDvwIdawIRdDJDxx0a9xSbBUzX4X-vjQwKjY8Fh8,21613
+flowerpower-0.11.6.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+flowerpower-0.11.6.15.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
+flowerpower-0.11.6.15.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
+flowerpower-0.11.6.15.dist-info/RECORD,,

{flowerpower-0.11.6.10.dist-info → flowerpower-0.11.6.15.dist-info}/WHEEL RENAMED Viewed

File without changes

{flowerpower-0.11.6.10.dist-info → flowerpower-0.11.6.15.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{flowerpower-0.11.6.10.dist-info → flowerpower-0.11.6.15.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{flowerpower-0.11.6.10.dist-info → flowerpower-0.11.6.15.dist-info}/top_level.txt RENAMED Viewed

File without changes

FlowerPower 0.11.6.10__py3-none-any.whl → 0.11.6.15__py3-none-any.whl

FlowerPower 0.11.6.10__py3-none-any.whl → 0.11.6.15__py3-none-any.whl