datupapi-1.112.2-py3-none-any.whl → datupapi-1.114.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -192,6 +192,81 @@ class Errors(Config):
         return wmape_capped


+    def compute_wmape_by_date(self, target_col, forecast_col, date_col, target_sum_dict):
+        """
+        Calculate WMAPE for a single row where the weight is the sum of all targets
+        for the same date value.
+
+        This function is optimized to use a pre-calculated dictionary of target sums
+        by date, making it much faster than filtering the DataFrame on each iteration.
+        Dates are normalized to 'YYYY-MM-DD' string format.
+
+        :param target_col: Target value(s) of the row
+        :param forecast_col: Forecast value(s) of the row
+        :param date_col: Date of the row, as a 'YYYY-MM-DD' string matching the dictionary keys
+        :param target_sum_dict: Dictionary with date string as key and sum of targets as value
+        :return: WMAPE value for that row, weighted by the date total
+
+        Example usage:
+        >>> # First, create the dictionary of target sums by date
+        >>> target_sum_dict = Errors.create_target_sum_dict(
+        ...     Errors, df=df, target_col='Target', date_col='date'
+        ... )
+        >>>
+        >>> # Then apply the WMAPE calculation row by row
+        >>> df['WMAPE'] = df.apply(lambda row: compute_wmape_by_date(target_col=row['Target'], forecast_col=row[forecast_col], date_col=row['Date'], target_sum_dict=target_sum_dict), axis=1
+        ... )
+        """
+        try:
+            # Get the total target sum for this date from the dictionary
+            target_sum = target_sum_dict.get(date_col, 0)
+
+            target = np.array(target_col, dtype=float)
+            forecast = np.array(forecast_col, dtype=float)
+            wmape_capped = 0

+            # Calculate the absolute error for the current row
+
+            e = target - forecast
+            wmape = 100 * (target * np.divide(abs(e), abs(target),
+                                              out=np.ones_like(target),
+                                              where=target != 0)).sum() / target_sum

+            wmape_capped = wmape if wmape <= 100 else 100

+        except ZeroDivisionError as err:
+            self.logger.exception(f'Division by zero. Error set to 0 by default: {err}')
+            wmape_capped = 0
+
+        return wmape_capped
+
+
+    def create_target_sum_dict(self, df, target_col, date_col):
+        """
+        Create a dictionary with the sum of target values for each unique date.
+
+        This pre-calculation significantly improves performance when computing
+        WMAPE row by row, as it avoids filtering the DataFrame repeatedly.
+        Dates are always normalized to 'YYYY-MM-DD' string format.
+
+        :param df: DataFrame containing the data
+        :param target_col: Name of the target column
+        :param date_col: Name of the date column
+        :return: Dictionary with date string (YYYY-MM-DD) as key and sum of targets as value
+
+        Example:
+        >>> target_sum_dict = Errors.create_target_sum_dict(
+        ...     Errors, df=df, target_col='Target', date_col='date'
+        ... )
+        >>> # Returns: {'2024-01-01': 450, '2024-01-02': 320, ...}
+        """
+        try:
+            # Convert dates to string format YYYY-MM-DD for dictionary keys
+            df_copy = df.copy()
+            df_copy[date_col] = pd.to_datetime(df_copy[date_col]).dt.strftime('%Y-%m-%d')
+            target_sum_dict = df_copy.groupby(date_col)[target_col].sum().to_dict()
+
+            return target_sum_dict
+        except Exception as err:
+            self.logger.exception(f'Error creating target sum dictionary: {err}')
+            return {}
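
The two new Errors methods are designed to be used together: create_target_sum_dict pre-computes the per-date target totals once, and compute_wmape_by_date then looks up each row's weight instead of re-filtering the DataFrame. Below is a minimal sketch of that flow; the column names, sample values, and the pre-configured 'errors' instance are illustrative assumptions, not part of the diff. Because the dictionary keys are 'YYYY-MM-DD' strings, the date passed for each row must be normalized the same way before the lookup.

import pandas as pd

# Hypothetical data; 'errors' is assumed to be an already-configured Errors instance.
df = pd.DataFrame({
    'Date': ['2024-01-01', '2024-01-01', '2024-01-02'],
    'Target': [100.0, 350.0, 320.0],
    'Forecast': [90.0, 400.0, 300.0],
})

# Pre-compute the per-date target totals once.
target_sum_dict = errors.create_target_sum_dict(df=df, target_col='Target', date_col='Date')

# Dictionary keys are 'YYYY-MM-DD' strings, so normalize the row date before the lookup.
df['DateKey'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')

df['WMAPE'] = df.apply(
    lambda row: errors.compute_wmape_by_date(
        target_col=row['Target'],
        forecast_col=row['Forecast'],
        date_col=row['DateKey'],
        target_sum_dict=target_sum_dict,
    ),
    axis=1,
)

The second hunk adds the new module datupapi/prepare/format_opt.py (listed in the RECORD changes below), which reimplements the Format resampling methods on top of Polars.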
@@ -0,0 +1,400 @@
+import os
+import polars as pl
+import pandas as pd
+import re
+from datupapi.configure.config import Config
+
+
+class FormatOptimized(Config):
+    """
+    Optimized Format class using Polars for efficient data resampling operations.
+    This class provides the same interface as Format but with improved performance
+    through Polars' efficient processing capabilities.
+    """
+
+    def __init__(self, config_file, logfile, log_path, *args, **kwargs):
+        Config.__init__(self, config_file=config_file, logfile=logfile)
+        self.log_path = log_path
+
+    def _convert_frequency_to_polars(self, frequency: str) -> str:
+        """
+        Convert pandas frequency notation to Polars notation.
+
+        :param frequency: Pandas frequency string (e.g., 'M', 'W', 'D', 'Q', '2M', '3W')
+        :return: Polars frequency string (e.g., '1mo', '1w', '1d', '1q', '2mo', '3w')
+        """
+        # Mapping of pandas frequency codes to Polars
+        freq_map = {
+            'D': 'd',    # Day
+            'W': 'w',    # Week
+            'M': 'mo',   # Month
+            'Q': 'q',    # Quarter
+            'Y': 'y',    # Year
+            'H': 'h',    # Hour
+            'T': 'm',    # Minute (T in pandas, m in Polars)
+            'S': 's',    # Second
+        }
+
+        # Extract the number prefix if it exists (e.g., '2M' -> '2', 'M')
+        match = re.match(r'^(\d*)([A-Z]+)$', frequency.upper())
+
+        if not match:
+            raise ValueError(f"Invalid frequency format: {frequency}")
+
+        number = match.group(1) or '1'
+        freq_code = match.group(2)
+
+        if freq_code not in freq_map:
+            raise ValueError(f"Unsupported frequency code: {freq_code}")
+
+        polars_freq = freq_map[freq_code]
+
+        return f"{number}{polars_freq}"
+
+    def reorder_cols(self, df, first_cols):
+        """
+        Return a dataframe with the columns specified in first_cols at the leading positions.
+
+        :param df: Dataframe to reorder
+        :param first_cols: Leading columns to appear in the dataframe
+        :return df: Dataframe reordered
+
+        >>> df = reorder_cols(df, first_cols)
+        >>> df =
+                var1    var2    var3
+        idx0    1       2       3
+        """
+        cols = list(df.columns)
+        for col in reversed(first_cols):
+            if col in cols:
+                cols.remove(col)
+                cols.insert(0, col)
+        df = df[cols]
+        return df
+
+    def resample_dataset(self, df, date_col=None, item_col=None, frequency=None, agg_dict=None, use_lazy=True):
+        """
+        Return a dataframe resampling the date dimension to the specified frequency using Polars.
+
+        This optimized version:
+        - Converts pandas to Polars for faster processing
+        - Uses lazy evaluation for optimal query planning (when use_lazy=True)
+        - Uses group_by_dynamic for efficient resampling
+        - Fills missing date ranges with 0
+        - Adjusts dates to the last day of each month
+        - Returns a pandas DataFrame
+
+        :param df: Pandas DataFrame to be resampled
+        :param date_col: Name of the date column
+        :param item_col: Name of the item column
+        :param frequency: Target frequency to resample the data (e.g., 'M' for monthly, 'W' for weekly)
+        :param agg_dict: Aggregation dictionary including column as key and operation as value
+        :param use_lazy: Use lazy evaluation for better performance (default: True)
+        :return df_out: Pandas DataFrame resampled
+
+        >>> df_out = resample_dataset(df, date_col='timestamp', item_col='item_id',
+        ...                           frequency='M', agg_dict={'demand': 'sum'})
+        >>> df_out =
+               timestamp  item_id  demand
+        0     2021-01-31     sku1      23
+        1     2021-02-28     sku1     543
+        """
+        try:
+            # Convert pandas frequency to Polars frequency
+            polars_frequency = self._convert_frequency_to_polars(frequency)
+
+            # Convert pandas DataFrame to Polars (lazy if requested)
+            if use_lazy:
+                df_pl = pl.from_pandas(df).lazy()
+            else:
+                df_pl = pl.from_pandas(df)
+
+            # Build the lazy query
+            df_lazy = (
+                df_pl
+                # Ensure date column is datetime type
+                .with_columns(
+                    pl.col(date_col).cast(pl.Datetime)
+                )
+                # Sort by date column
+                .sort(date_col)
+            )
+
+            # Collect to perform group_by_dynamic (not supported in lazy mode)
+            if use_lazy:
+                df_collected = df_lazy.collect()
+            else:
+                df_collected = df_lazy
+
+            # Perform dynamic grouping and resampling
+            df_resampled = (
+                df_collected.group_by_dynamic(
+                    index_column=date_col,
+                    every=polars_frequency,
+                    closed="left",  # Left-closed interval
+                    by=[item_col]
+                )
+                .agg([getattr(pl.col(col), func)().alias(col) for col, func in agg_dict.items()])
+            )
+
+            # Continue with lazy operations
+            if use_lazy:
+                df_out_lazy = df_resampled.lazy()
+            else:
+                df_out_lazy = df_resampled
+
+            # Adjust to the last day of the month
+            df_out_lazy = df_out_lazy.with_columns(
+                pl.col(date_col).dt.month_end().alias(date_col)
+            )
+
+            # Collect to get min/max dates for range creation
+            if use_lazy:
+                df_temp = df_out_lazy.collect()
+            else:
+                df_temp = df_out_lazy
+
+            # Fill missing date ranges with 0
+            # Get all unique items
+            items = df_temp.select(item_col).unique()
+
+            # Get date range from min to max
+            min_date = df_temp.select(pl.col(date_col).min()).item()
+            max_date = df_temp.select(pl.col(date_col).max()).item()
+
+            # Create complete date range at month end
+            date_range = pl.datetime_range(
+                min_date,
+                max_date,
+                interval=polars_frequency,
+                eager=True
+            ).dt.month_end()
+
+            # Create a complete grid of dates and items
+            complete_grid = items.join(
+                pl.DataFrame({date_col: date_range}),
+                how="cross"
+            )
+
+            # Build final lazy query for joins and fills
+            if use_lazy:
+                complete_grid_lazy = complete_grid.lazy()
+                df_temp_lazy = df_temp.lazy()
+
+                df_out_lazy = (
+                    complete_grid_lazy
+                    .join(
+                        df_temp_lazy,
+                        on=[date_col, item_col],
+                        how="left"
+                    )
+                )
+
+                # Fill null values with 0 for aggregated columns
+                for col in agg_dict.keys():
+                    df_out_lazy = df_out_lazy.with_columns(
+                        pl.col(col).fill_null(0)
+                    )
+
+                # Reorder columns: date_col, item_col, then others
+                other_cols = [c for c in df_temp.columns if c not in [date_col, item_col]]
+                df_out_lazy = df_out_lazy.select(
+                    [pl.col(date_col), pl.col(item_col)] + [pl.col(c) for c in other_cols]
+                )
+
+                # Collect the final result
+                df_out = df_out_lazy.collect()
+            else:
+                # Join with resampled data and fill nulls with 0
+                df_out = complete_grid.join(
+                    df_temp,
+                    on=[date_col, item_col],
+                    how="left"
+                )
+
+                # Fill null values with 0 for aggregated columns
+                for col in agg_dict.keys():
+                    df_out = df_out.with_columns(
+                        pl.col(col).fill_null(0)
+                    )
+
+                # Reorder columns: date_col, item_col, then others
+                other_cols = [c for c in df_out.columns if c not in [date_col, item_col]]
+                df_out = df_out.select(
+                    [pl.col(date_col), pl.col(item_col)] + [pl.col(c) for c in other_cols]
+                )
+
+            # Convert back to pandas
+            df_pandas = df_out.to_pandas()
+
+            # Reorder columns using the class method
+            df_pandas = self.reorder_cols(df_pandas, first_cols=[date_col, item_col])
+
+        except KeyError as err:
+            self.logger.exception(f'Columns for index, item or qty not found. Please check spelling: {err}')
+            raise
+
+        return df_pandas
+
+    def resample_dataset_with_location(self, df, date_col_=None, item_col_=None, location_col_=None, frequency_=None, agg_dict_=None, use_lazy=True):
+        """
+        Return a dataframe resampling the date dimension to the specified frequency using Polars,
+        including location grouping.
+
+        This optimized version:
+        - Converts pandas to Polars for faster processing
+        - Uses lazy evaluation for optimal query planning (when use_lazy=True)
+        - Uses group_by_dynamic for efficient resampling with location
+        - Fills missing date ranges with 0
+        - Adjusts dates to the last day of each month
+        - Returns a pandas DataFrame
+
+        :param df: Pandas DataFrame to be resampled
+        :param date_col_: Name of the date column
+        :param item_col_: Name of the item column
+        :param location_col_: Name of the location column
+        :param frequency_: Target frequency to resample the data (e.g., 'M' for monthly, 'W' for weekly)
+        :param agg_dict_: Aggregation dictionary including column as key and operation as value
+        :param use_lazy: Use lazy evaluation for better performance (default: True)
+        :return df_out: Pandas DataFrame resampled
+
+        >>> df_out = resample_dataset_with_location(df, date_col_='timestamp',
+        ...                                         item_col_='item_id', location_col_='location',
+        ...                                         frequency_='M', agg_dict_={'demand': 'sum'})
+        """
+        try:
+            # Convert pandas frequency to Polars frequency
+            polars_frequency = self._convert_frequency_to_polars(frequency_)
+
+            # Convert pandas DataFrame to Polars (lazy if requested)
+            if use_lazy:
+                df_pl = pl.from_pandas(df).lazy()
+            else:
+                df_pl = pl.from_pandas(df)
+
+            # Build the lazy query
+            df_lazy = (
+                df_pl
+                # Ensure date column is datetime type
+                .with_columns(
+                    pl.col(date_col_).cast(pl.Datetime)
+                )
+                # Sort by date column
+                .sort(date_col_)
+            )
+
+            # Collect to perform group_by_dynamic (not supported in lazy mode)
+            if use_lazy:
+                df_collected = df_lazy.collect()
+            else:
+                df_collected = df_lazy
+
+            # Perform dynamic grouping and resampling
+            df_resampled = (
+                df_collected.group_by_dynamic(
+                    index_column=date_col_,
+                    every=polars_frequency,
+                    closed="left",  # Left-closed interval
+                    by=[location_col_, item_col_]
+                )
+                .agg([getattr(pl.col(col), func)().alias(col) for col, func in agg_dict_.items()])
+            )
+
+            # Continue with lazy operations
+            if use_lazy:
+                df_out_lazy = df_resampled.lazy()
+            else:
+                df_out_lazy = df_resampled
+
+            # Adjust to the last day of the month
+            df_out_lazy = df_out_lazy.with_columns(
+                pl.col(date_col_).dt.month_end().alias(date_col_)
+            )
+
+            # Collect to get min/max dates for range creation
+            if use_lazy:
+                df_temp = df_out_lazy.collect()
+            else:
+                df_temp = df_out_lazy
+
+            # Fill missing date ranges with 0
+            # Get all unique combinations of location and item
+            location_items = df_temp.select([location_col_, item_col_]).unique()
+
+            # Get date range from min to max
+            min_date = df_temp.select(pl.col(date_col_).min()).item()
+            max_date = df_temp.select(pl.col(date_col_).max()).item()
+
+            # Create complete date range at month end
+            date_range = pl.datetime_range(
+                min_date,
+                max_date,
+                interval=polars_frequency,
+                eager=True
+            ).dt.month_end()
+
+            # Create a complete grid of dates, locations, and items
+            complete_grid = location_items.join(
+                pl.DataFrame({date_col_: date_range}),
+                how="cross"
+            )
+
+            # Build final lazy query for joins and fills
+            if use_lazy:
+                complete_grid_lazy = complete_grid.lazy()
+                df_temp_lazy = df_temp.lazy()
+
+                df_out_lazy = (
+                    complete_grid_lazy
+                    .join(
+                        df_temp_lazy,
+                        on=[date_col_, location_col_, item_col_],
+                        how="left"
+                    )
+                )
+
+                # Fill null values with 0 for aggregated columns
+                for col in agg_dict_.keys():
+                    df_out_lazy = df_out_lazy.with_columns(
+                        pl.col(col).fill_null(0)
+                    )
+
+                # Reorder columns: date_col, item_col, location_col, then others
+                other_cols = [c for c in df_temp.columns if c not in [date_col_, item_col_, location_col_]]
+                df_out_lazy = df_out_lazy.select(
+                    [pl.col(date_col_), pl.col(item_col_), pl.col(location_col_)] + [pl.col(c) for c in other_cols]
+                )
+
+                # Collect the final result
+                df_out = df_out_lazy.collect()
+            else:
+                # Join with resampled data and fill nulls with 0
+                df_out = complete_grid.join(
+                    df_temp,
+                    on=[date_col_, location_col_, item_col_],
+                    how="left"
+                )
+
+                # Fill null values with 0 for aggregated columns
+                for col in agg_dict_.keys():
+                    df_out = df_out.with_columns(
+                        pl.col(col).fill_null(0)
+                    )
+
+                # Reorder columns: date_col, item_col, location_col, then others
+                other_cols = [c for c in df_out.columns if c not in [date_col_, item_col_, location_col_]]
+                df_out = df_out.select(
+                    [pl.col(date_col_), pl.col(item_col_), pl.col(location_col_)] + [pl.col(c) for c in other_cols]
+                )
+
+            # Convert back to pandas
+            df_pandas = df_out.to_pandas()
+
+            # Reorder columns using the class method
+            df_pandas = self.reorder_cols(df_pandas, first_cols=[date_col_, item_col_, location_col_])
+
+        except KeyError as err:
+            self.logger.exception(f'Columns for index, item or qty not found. Please check spelling: {err}')
+            raise
+
+        return df_pandas
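
For the new FormatOptimized class, a minimal usage sketch under stated assumptions follows: the sample frame, column names, and the config and log paths passed to the constructor are placeholders, while the constructor and resample_dataset signatures come from the hunk above. Internally, the pandas frequency 'M' is translated to the Polars interval '1mo' by _convert_frequency_to_polars.

import pandas as pd
from datupapi.prepare.format_opt import FormatOptimized

# Hypothetical daily demand data.
df = pd.DataFrame({
    'timestamp': pd.to_datetime(['2021-01-05', '2021-01-20', '2021-02-10']),
    'item_id': ['sku1', 'sku1', 'sku1'],
    'demand': [10, 13, 543],
})

# Placeholder config file and log locations.
fmt = FormatOptimized(config_file='config.yml', logfile='data_fmt', log_path='output/logs')

# Monthly totals per item: dates snapped to month end, missing months filled with 0.
df_monthly = fmt.resample_dataset(
    df,
    date_col='timestamp',
    item_col='item_id',
    frequency='M',
    agg_dict={'demand': 'sum'},
    use_lazy=True,
)

resample_dataset_with_location works the same way, with an additional location_col_ argument and the trailing-underscore parameter names shown above. The remaining hunks update the package version and the RECORD checksums accordingly.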
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datupapi
-Version: 1.112.2
+Version: 1.114.0
 Summary: Utility library to support Datup AI MLOps processes
 Author: Datup AI
 Author-email: ramiro@datup.ai
@@ -8,7 +8,7 @@ datupapi/distribution/src/Format/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 datupapi/distribution/src/Format/distribution_format.py,sha256=CFqUHTk9StDvaOvlR3yLr3NZiFY2Ao1yVXoY-IsrNWE,3964
 datupapi/evaluate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datupapi/evaluate/anomaly.py,sha256=fjIDAvEPGBJdZjVXhz7Rk90WKCR5t3Hbe6zeTKVXFlw,33506
-datupapi/evaluate/errors.py,sha256=9SRYAjwRDfEdP1EnBbfA7zoQEi4xU4qI16vBE8-jkeA,7039
+datupapi/evaluate/errors.py,sha256=Nd4bCKOQsRzAvTmovuJjMbs_4Y8ojc9xWxzbQ5Cf7YQ,10582
 datupapi/extract/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datupapi/extract/io.py,sha256=fYPXf-SmYyw4ywbN3SjQsdl6qBQvQz1K3i9kbpiEkkA,84343
 datupapi/extract/io_citrix.py,sha256=txq6VklpZcMgRcd0AFb6iMgX_rRW_eapqvPyXr9tyHY,9345
@@ -38,6 +38,7 @@ datupapi/prepare/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 datupapi/prepare/cleanse.py,sha256=alujVHYfN83_mFoIuCPe0TkREglFOpZO_2225-HRHCg,1922
 datupapi/prepare/format.py,sha256=6XoeIBv4ovIqgAy6b-4sM9rcQ5VICDiTlzdNFdGCIwo,20841
 datupapi/prepare/format_dask.py,sha256=m4xdGpTB8Jeu9we8-nitEWHX1YLtEvraC5revYxPxZE,4800
+datupapi/prepare/format_opt.py,sha256=9WPmJEsy613kDoO9edyzYtLSTznwIw2sPBJJi2M7X6A,16206
 datupapi/training/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datupapi/training/attup.py,sha256=DalY7JtE5t_pPwt-JD9hP6CFcpGTzHblj-6hAlEYA-U,25158
 datupapi/training/deepar.py,sha256=ivaQkZt071LBV5uwXZVcqPUhUFVF79sa2CECAivbWss,31654
@@ -48,7 +49,7 @@ datupapi/transform/forecasting.py,sha256=OboiVyErzWXJAv6R4fCXiPNaoVp5dNAP9F53EDq
 datupapi/transform/ranking.py,sha256=XOI0XqMx9Cy52Xjc4LCzJCNUsJZNjgrPky7nrpELr-U,7943
 datupapi/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datupapi/utils/utils.py,sha256=pU3mXPupm-1gvODI-kPlIpOdMHa2F9lEXvqBn6t3ajc,4637
-datupapi-1.112.2.dist-info/METADATA,sha256=q_XO4eLpCV8aICr_WBnDnAHiDBs7LJjnxbTVcUNShUs,1516
-datupapi-1.112.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datupapi-1.112.2.dist-info/top_level.txt,sha256=oERwtRZu8xq2u1TDGwJwuWK0iJbH4p7x9kYECAL5So0,9
-datupapi-1.112.2.dist-info/RECORD,,
+datupapi-1.114.0.dist-info/METADATA,sha256=gudex0xIUJevCkb-UjuYbgu5-5Hd4Su4MBOTFpP0xt0,1516
+datupapi-1.114.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datupapi-1.114.0.dist-info/top_level.txt,sha256=oERwtRZu8xq2u1TDGwJwuWK0iJbH4p7x9kYECAL5So0,9
+datupapi-1.114.0.dist-info/RECORD,,