datupapi-1.112.2-py3-none-any.whl → datupapi-1.114.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -192,6 +192,81 @@ class Errors(Config):
         return wmape_capped


+    def compute_wmape_by_date(self, target_col, forecast_col, date_col, target_sum_dict):
+        """
+        Calculate WMAPE for a single row where the weight is the sum of all targets
+        for the same date value.
+
+        This function is optimized to use a pre-calculated dictionary of target sums
+        by date, making it much faster than filtering the DataFrame on each iteration.
+        Dates are normalized to 'YYYY-MM-DD' string format.
+
+        :param target_col: Target value(s) of the row
+        :param forecast_col: Forecast value(s) of the row
+        :param date_col: Date of the row, as a 'YYYY-MM-DD' string matching the dictionary keys
+        :param target_sum_dict: Dictionary with date string as key and sum of targets as value
+        :return: WMAPE value for that row, weighted by the date total
+
+        Example usage:
+        >>> # First, create the dictionary of target sums by date
+        >>> target_sum_dict = Errors.create_target_sum_dict(
+        ...     Errors, df=df, target_col='Target', date_col='date'
+        ... )
+        >>>
+        >>> # Then apply the WMAPE calculation row by row
+        >>> df['WMAPE'] = df.apply(lambda row: compute_wmape_by_date(target_col=row['Target'], forecast_col=row[forecast_col], date_col=row['Date'], target_sum_dict=target_sum_dict), axis=1
+        ... )
+        """
+        try:
+            # Get the total target sum for this date from the dictionary
+            target_sum = target_sum_dict.get(date_col, 0)
+
+            target = np.array(target_col, dtype=float)
+            forecast = np.array(forecast_col, dtype=float)
+            wmape_capped = 0

+            # Calculate the absolute error for the current row
+
+            e = target - forecast
+            wmape = 100 * (target * np.divide(abs(e), abs(target),
+                                              out=np.ones_like(target),
+                                              where=target != 0)).sum() / target_sum

+            wmape_capped = wmape if wmape <= 100 else 100

+        except ZeroDivisionError as err:
+            self.logger.exception(f'Division by zero. Error set to 0 by default: {err}')
+            wmape_capped = 0
+
+        return wmape_capped
+
+
+    def create_target_sum_dict(self, df, target_col, date_col):
+        """
+        Create a dictionary with the sum of target values for each unique date.
+
+        This pre-calculation significantly improves performance when computing
+        WMAPE row by row, as it avoids filtering the DataFrame repeatedly.
+        Dates are always normalized to 'YYYY-MM-DD' string format.
+
+        :param df: DataFrame containing the data
+        :param target_col: Name of the target column
+        :param date_col: Name of the date column
+        :return: Dictionary with date string (YYYY-MM-DD) as key and sum of targets as value
+
+        Example:
+        >>> target_sum_dict = Errors.create_target_sum_dict(
+        ...     Errors, df=df, target_col='Target', date_col='date'
+        ... )
+        >>> # Returns: {'2024-01-01': 450, '2024-01-02': 320, ...}
+        """
+        try:
+            # Convert dates to string format YYYY-MM-DD for dictionary keys
+            df_copy = df.copy()
+            df_copy[date_col] = pd.to_datetime(df_copy[date_col]).dt.strftime('%Y-%m-%d')
+            target_sum_dict = df_copy.groupby(date_col)[target_col].sum().to_dict()
+
+            return target_sum_dict
+        except Exception as err:
+            self.logger.exception(f'Error creating target sum dictionary: {err}')
+            return {}
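
The two new Errors methods are designed to be used together: create_target_sum_dict pre-computes the per-date target totals once, and compute_wmape_by_date then looks up each row's weight instead of re-filtering the DataFrame. Below is a minimal sketch of that flow; the column names, sample values, and the pre-configured 'errors' instance are illustrative assumptions, not part of the diff. Because the dictionary keys are 'YYYY-MM-DD' strings, the date passed for each row must be normalized the same way before the lookup.

import pandas as pd

# Hypothetical data; 'errors' is assumed to be an already-configured Errors instance.
df = pd.DataFrame({
    'Date': ['2024-01-01', '2024-01-01', '2024-01-02'],
    'Target': [100.0, 350.0, 320.0],
    'Forecast': [90.0, 400.0, 300.0],
})

# Pre-compute the per-date target totals once.
target_sum_dict = errors.create_target_sum_dict(df=df, target_col='Target', date_col='Date')

# Dictionary keys are 'YYYY-MM-DD' strings, so normalize the row date before the lookup.
df['DateKey'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')

df['WMAPE'] = df.apply(
    lambda row: errors.compute_wmape_by_date(
        target_col=row['Target'],
        forecast_col=row['Forecast'],
        date_col=row['DateKey'],
        target_sum_dict=target_sum_dict,
    ),
    axis=1,
)

The second hunk adds the new module datupapi/prepare/format_opt.py (listed in the RECORD changes below), which reimplements the Format resampling methods on top of Polars.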
@@ -0,0 +1,400 @@
+import os
+import polars as pl
+import pandas as pd
+import re
+from datupapi.configure.config import Config
+
+
+class FormatOptimized(Config):
+    """
+    Optimized Format class using Polars for efficient data resampling operations.
+    This class provides the same interface as Format but with improved performance
+    through Polars' efficient processing capabilities.
+    """
+
+    def __init__(self, config_file, logfile, log_path, *args, **kwargs):
+        Config.__init__(self, config_file=config_file, logfile=logfile)
+        self.log_path = log_path
+
+    def _convert_frequency_to_polars(self, frequency: str) -> str:
+        """
+        Convert pandas frequency notation to Polars notation.
+
+        :param frequency: Pandas frequency string (e.g., 'M', 'W', 'D', 'Q', '2M', '3W')
+        :return: Polars frequency string (e.g., '1mo', '1w', '1d', '1q', '2mo', '3w')
+        """
+        # Mapping of pandas frequency codes to Polars
+        freq_map = {
+            'D': 'd',    # Day
+            'W': 'w',    # Week
+            'M': 'mo',   # Month
+            'Q': 'q',    # Quarter
+            'Y': 'y',    # Year
+            'H': 'h',    # Hour
+            'T': 'm',    # Minute (T in pandas, m in Polars)
+            'S': 's',    # Second
+        }
+
+        # Extract the number prefix if it exists (e.g., '2M' -> '2', 'M')
+        match = re.match(r'^(\d*)([A-Z]+)$', frequency.upper())
+
+        if not match:
+            raise ValueError(f"Invalid frequency format: {frequency}")
+
+        number = match.group(1) or '1'
+        freq_code = match.group(2)
+
+        if freq_code not in freq_map:
+            raise ValueError(f"Unsupported frequency code: {freq_code}")
+
+        polars_freq = freq_map[freq_code]
+
+        return f"{number}{polars_freq}"
+
+    def reorder_cols(self, df, first_cols):
+        """
+        Return a dataframe with the columns specified in first_cols at the leading positions.
+
+        :param df: Dataframe to reorder
+        :param first_cols: Leading columns to appear in the dataframe
+        :return df: Dataframe reordered
+
+        >>> df = reorder_cols(df, first_cols)
+        >>> df =
+                var1    var2    var3
+        idx0    1       2       3
+        """
+        cols = list(df.columns)
+        for col in reversed(first_cols):
+            if col in cols:
+                cols.remove(col)
+                cols.insert(0, col)
+        df = df[cols]
+        return df
+
+    def resample_dataset(self, df, date_col=None, item_col=None, frequency=None, agg_dict=None, use_lazy=True):
+        """
+        Return a dataframe resampling the date dimension to the specified frequency using Polars.
+
+        This optimized version:
+        - Converts pandas to Polars for faster processing
+        - Uses lazy evaluation for optimal query planning (when use_lazy=True)
+        - Uses group_by_dynamic for efficient resampling
+        - Fills missing date ranges with 0
+        - Adjusts dates to the last day of each month
+        - Returns a pandas DataFrame
+
+        :param df: Pandas DataFrame to be resampled
+        :param date_col: Name of the date column
+        :param item_col: Name of the item column
+        :param frequency: Target frequency to resample the data (e.g., 'M' for monthly, 'W' for weekly)
+        :param agg_dict: Aggregation dictionary including column as key and operation as value
+        :param use_lazy: Use lazy evaluation for better performance (default: True)
+        :return df_out: Pandas DataFrame resampled
+
+        >>> df_out = resample_dataset(df, date_col='timestamp', item_col='item_id',
+        ...                           frequency='M', agg_dict={'demand': 'sum'})
+        >>> df_out =
+               timestamp  item_id  demand
+        0     2021-01-31     sku1      23
+        1     2021-02-28     sku1     543
+        """
+        try:
+            # Convert pandas frequency to Polars frequency
+            polars_frequency = self._convert_frequency_to_polars(frequency)
+
+            # Convert pandas DataFrame to Polars (lazy if requested)
+            if use_lazy:
+                df_pl = pl.from_pandas(df).lazy()
+            else:
+                df_pl = pl.from_pandas(df)
+
+            # Build the lazy query
+            df_lazy = (
+                df_pl
+                # Ensure date column is datetime type
+                .with_columns(
+                    pl.col(date_col).cast(pl.Datetime)
+                )
+                # Sort by date column
+                .sort(date_col)
+            )
+
+            # Collect to perform group_by_dynamic (not supported in lazy mode)
+            if use_lazy:
+                df_collected = df_lazy.collect()
+            else:
+                df_collected = df_lazy
+
+            # Perform dynamic grouping and resampling
+            df_resampled = (
+                df_collected.group_by_dynamic(
+                    index_column=date_col,
+                    every=polars_frequency,
+                    closed="left",  # Left-closed interval
+                    by=[item_col]
+                )
+                .agg([getattr(pl.col(col), func)().alias(col) for col, func in agg_dict.items()])
+            )
+
+            # Continue with lazy operations
+            if use_lazy:
+                df_out_lazy = df_resampled.lazy()
+            else:
+                df_out_lazy = df_resampled
+
+            # Adjust to the last day of the month
+            df_out_lazy = df_out_lazy.with_columns(
+                pl.col(date_col).dt.month_end().alias(date_col)
+            )
+
+            # Collect to get min/max dates for range creation
+            if use_lazy:
+                df_temp = df_out_lazy.collect()
+            else:
+                df_temp = df_out_lazy
+
+            # Fill missing date ranges with 0
+            # Get all unique items
+            items = df_temp.select(item_col).unique()
+
+            # Get date range from min to max
+            min_date = df_temp.select(pl.col(date_col).min()).item()
+            max_date = df_temp.select(pl.col(date_col).max()).item()
+
+            # Create complete date range at month end
+            date_range = pl.datetime_range(
+                min_date,
+                max_date,
+                interval=polars_frequency,
+                eager=True
+            ).dt.month_end()
+
+            # Create a complete grid of dates and items
+            complete_grid = items.join(
+                pl.DataFrame({date_col: date_range}),
+                how="cross"
+            )
+
+            # Build final lazy query for joins and fills
+            if use_lazy:
+                complete_grid_lazy = complete_grid.lazy()
+                df_temp_lazy = df_temp.lazy()
+
+                df_out_lazy = (
+                    complete_grid_lazy
+                    .join(
+                        df_temp_lazy,
+                        on=[date_col, item_col],
+                        how="left"
+                    )
+                )
+
+                # Fill null values with 0 for aggregated columns
+                for col in agg_dict.keys():
+                    df_out_lazy = df_out_lazy.with_columns(
+                        pl.col(col).fill_null(0)
+                    )
+
+                # Reorder columns: date_col, item_col, then others
+                other_cols = [c for c in df_temp.columns if c not in [date_col, item_col]]
+                df_out_lazy = df_out_lazy.select(
+                    [pl.col(date_col), pl.col(item_col)] + [pl.col(c) for c in other_cols]
+                )
+
+                # Collect the final result
+                df_out = df_out_lazy.collect()
+            else:
+                # Join with resampled data and fill nulls with 0
+                df_out = complete_grid.join(
+                    df_temp,
+                    on=[date_col, item_col],
+                    how="left"
+                )
+
+                # Fill null values with 0 for aggregated columns
+                for col in agg_dict.keys():
+                    df_out = df_out.with_columns(
+                        pl.col(col).fill_null(0)
+                    )
+
+                # Reorder columns: date_col, item_col, then others
+                other_cols = [c for c in df_out.columns if c not in [date_col, item_col]]
+                df_out = df_out.select(
+                    [pl.col(date_col), pl.col(item_col)] + [pl.col(c) for c in other_cols]
+                )
+
+            # Convert back to pandas
+            df_pandas = df_out.to_pandas()
+
+            # Reorder columns using the class method
+            df_pandas = self.reorder_cols(df_pandas, first_cols=[date_col, item_col])
+
+        except KeyError as err:
+            self.logger.exception(f'Columns for index, item or qty not found. Please check spelling: {err}')
+            raise
+
+        return df_pandas
+
+    def resample_dataset_with_location(self, df, date_col_=None, item_col_=None, location_col_=None, frequency_=None, agg_dict_=None, use_lazy=True):
+        """
+        Return a dataframe resampling the date dimension to the specified frequency using Polars,
+        including location grouping.
+
+        This optimized version:
+        - Converts pandas to Polars for faster processing
+        - Uses lazy evaluation for optimal query planning (when use_lazy=True)
+        - Uses group_by_dynamic for efficient resampling with location
+        - Fills missing date ranges with 0
+        - Adjusts dates to the last day of each month
+        - Returns a pandas DataFrame
+
+        :param df: Pandas DataFrame to be resampled
+        :param date_col_: Name of the date column
+        :param item_col_: Name of the item column
+        :param location_col_: Name of the location column
+        :param frequency_: Target frequency to resample the data (e.g., 'M' for monthly, 'W' for weekly)
+        :param agg_dict_: Aggregation dictionary including column as key and operation as value
+        :param use_lazy: Use lazy evaluation for better performance (default: True)
+        :return df_out: Pandas DataFrame resampled
+
+        >>> df_out = resample_dataset_with_location(df, date_col_='timestamp',
+        ...                                         item_col_='item_id', location_col_='location',
+        ...                                         frequency_='M', agg_dict_={'demand': 'sum'})
+        """
+        try:
+            # Convert pandas frequency to Polars frequency
+            polars_frequency = self._convert_frequency_to_polars(frequency_)
+
+            # Convert pandas DataFrame to Polars (lazy if requested)
+            if use_lazy:
+                df_pl = pl.from_pandas(df).lazy()
+            else:
+                df_pl = pl.from_pandas(df)
+
+            # Build the lazy query
+            df_lazy = (
+                df_pl
+                # Ensure date column is datetime type
+                .with_columns(
+                    pl.col(date_col_).cast(pl.Datetime)
+                )
+                # Sort by date column
+                .sort(date_col_)
+            )
+
+            # Collect to perform group_by_dynamic (not supported in lazy mode)
+            if use_lazy:
+                df_collected = df_lazy.collect()
+            else:
+                df_collected = df_lazy
+
+            # Perform dynamic grouping and resampling
+            df_resampled = (
+                df_collected.group_by_dynamic(
+                    index_column=date_col_,
+                    every=polars_frequency,
+                    closed="left",  # Left-closed interval
+                    by=[location_col_, item_col_]
+                )
+                .agg([getattr(pl.col(col), func)().alias(col) for col, func in agg_dict_.items()])
+            )
+
+            # Continue with lazy operations
+            if use_lazy:
+                df_out_lazy = df_resampled.lazy()
+            else:
+                df_out_lazy = df_resampled
+
+            # Adjust to the last day of the month
+            df_out_lazy = df_out_lazy.with_columns(
+                pl.col(date_col_).dt.month_end().alias(date_col_)
+            )
+
+            # Collect to get min/max dates for range creation
+            if use_lazy:
+                df_temp = df_out_lazy.collect()
+            else:
+                df_temp = df_out_lazy
+
+            # Fill missing date ranges with 0
+            # Get all unique combinations of location and item
+            location_items = df_temp.select([location_col_, item_col_]).unique()
+
+            # Get date range from min to max
+            min_date = df_temp.select(pl.col(date_col_).min()).item()
+            max_date = df_temp.select(pl.col(date_col_).max()).item()
+
+            # Create complete date range at month end
+            date_range = pl.datetime_range(
+                min_date,
+                max_date,
+                interval=polars_frequency,
+                eager=True
+            ).dt.month_end()
+
+            # Create a complete grid of dates, locations, and items
+            complete_grid = location_items.join(
+                pl.DataFrame({date_col_: date_range}),
+                how="cross"
+            )
+
+            # Build final lazy query for joins and fills
+            if use_lazy:
+                complete_grid_lazy = complete_grid.lazy()
+                df_temp_lazy = df_temp.lazy()
+
+                df_out_lazy = (
+                    complete_grid_lazy
+                    .join(
+                        df_temp_lazy,
+                        on=[date_col_, location_col_, item_col_],
+                        how="left"
+                    )
+                )
+
+                # Fill null values with 0 for aggregated columns
+                for col in agg_dict_.keys():
+                    df_out_lazy = df_out_lazy.with_columns(
+                        pl.col(col).fill_null(0)
+                    )
+
+                # Reorder columns: date_col, item_col, location_col, then others
+                other_cols = [c for c in df_temp.columns if c not in [date_col_, item_col_, location_col_]]
+                df_out_lazy = df_out_lazy.select(
+                    [pl.col(date_col_), pl.col(item_col_), pl.col(location_col_)] + [pl.col(c) for c in other_cols]
+                )
+
+                # Collect the final result
+                df_out = df_out_lazy.collect()
+            else:
+                # Join with resampled data and fill nulls with 0
+                df_out = complete_grid.join(
+                    df_temp,
+                    on=[date_col_, location_col_, item_col_],
+                    how="left"
+                )
+
+                # Fill null values with 0 for aggregated columns
+                for col in agg_dict_.keys():
+                    df_out = df_out.with_columns(
+                        pl.col(col).fill_null(0)
+                    )
+
+                # Reorder columns: date_col, item_col, location_col, then others
+                other_cols = [c for c in df_out.columns if c not in [date_col_, item_col_, location_col_]]
+                df_out = df_out.select(
+                    [pl.col(date_col_), pl.col(item_col_), pl.col(location_col_)] + [pl.col(c) for c in other_cols]
+                )
+
+            # Convert back to pandas
+            df_pandas = df_out.to_pandas()
+
+            # Reorder columns using the class method
+            df_pandas = self.reorder_cols(df_pandas, first_cols=[date_col_, item_col_, location_col_])
+
+        except KeyError as err:
+            self.logger.exception(f'Columns for index, item or qty not found. Please check spelling: {err}')
+            raise
+
+        return df_pandas
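
For the new FormatOptimized class, a minimal usage sketch under stated assumptions follows: the sample frame, column names, and the config and log paths passed to the constructor are placeholders, while the constructor and resample_dataset signatures come from the hunk above. Internally, the pandas frequency 'M' is translated to the Polars interval '1mo' by _convert_frequency_to_polars.

import pandas as pd
from datupapi.prepare.format_opt import FormatOptimized

# Hypothetical daily demand data.
df = pd.DataFrame({
    'timestamp': pd.to_datetime(['2021-01-05', '2021-01-20', '2021-02-10']),
    'item_id': ['sku1', 'sku1', 'sku1'],
    'demand': [10, 13, 543],
})

# Placeholder config file and log locations.
fmt = FormatOptimized(config_file='config.yml', logfile='data_fmt', log_path='output/logs')

# Monthly totals per item: dates snapped to month end, missing months filled with 0.
df_monthly = fmt.resample_dataset(
    df,
    date_col='timestamp',
    item_col='item_id',
    frequency='M',
    agg_dict={'demand': 'sum'},
    use_lazy=True,
)

resample_dataset_with_location works the same way, with an additional location_col_ argument and the trailing-underscore parameter names shown above. The remaining hunks update the package version and the RECORD checksums accordingly.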
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datupapi
-Version: 1.112.2
+Version: 1.114.0
 Summary: Utility library to support Datup AI MLOps processes
 Author: Datup AI
 Author-email: ramiro@datup.ai
@@ -8,7 +8,7 @@ datupapi/distribution/src/Format/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 datupapi/distribution/src/Format/distribution_format.py,sha256=CFqUHTk9StDvaOvlR3yLr3NZiFY2Ao1yVXoY-IsrNWE,3964
 datupapi/evaluate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datupapi/evaluate/anomaly.py,sha256=fjIDAvEPGBJdZjVXhz7Rk90WKCR5t3Hbe6zeTKVXFlw,33506
-datupapi/evaluate/errors.py,sha256=9SRYAjwRDfEdP1EnBbfA7zoQEi4xU4qI16vBE8-jkeA,7039
+datupapi/evaluate/errors.py,sha256=Nd4bCKOQsRzAvTmovuJjMbs_4Y8ojc9xWxzbQ5Cf7YQ,10582
 datupapi/extract/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datupapi/extract/io.py,sha256=fYPXf-SmYyw4ywbN3SjQsdl6qBQvQz1K3i9kbpiEkkA,84343
 datupapi/extract/io_citrix.py,sha256=txq6VklpZcMgRcd0AFb6iMgX_rRW_eapqvPyXr9tyHY,9345
@@ -38,6 +38,7 @@ datupapi/prepare/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 datupapi/prepare/cleanse.py,sha256=alujVHYfN83_mFoIuCPe0TkREglFOpZO_2225-HRHCg,1922
 datupapi/prepare/format.py,sha256=6XoeIBv4ovIqgAy6b-4sM9rcQ5VICDiTlzdNFdGCIwo,20841
 datupapi/prepare/format_dask.py,sha256=m4xdGpTB8Jeu9we8-nitEWHX1YLtEvraC5revYxPxZE,4800
+datupapi/prepare/format_opt.py,sha256=9WPmJEsy613kDoO9edyzYtLSTznwIw2sPBJJi2M7X6A,16206
 datupapi/training/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datupapi/training/attup.py,sha256=DalY7JtE5t_pPwt-JD9hP6CFcpGTzHblj-6hAlEYA-U,25158
 datupapi/training/deepar.py,sha256=ivaQkZt071LBV5uwXZVcqPUhUFVF79sa2CECAivbWss,31654
@@ -48,7 +49,7 @@ datupapi/transform/forecasting.py,sha256=OboiVyErzWXJAv6R4fCXiPNaoVp5dNAP9F53EDq
 datupapi/transform/ranking.py,sha256=XOI0XqMx9Cy52Xjc4LCzJCNUsJZNjgrPky7nrpELr-U,7943
 datupapi/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datupapi/utils/utils.py,sha256=pU3mXPupm-1gvODI-kPlIpOdMHa2F9lEXvqBn6t3ajc,4637
-datupapi-1.112.2.dist-info/METADATA,sha256=q_XO4eLpCV8aICr_WBnDnAHiDBs7LJjnxbTVcUNShUs,1516
-datupapi-1.112.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datupapi-1.112.2.dist-info/top_level.txt,sha256=oERwtRZu8xq2u1TDGwJwuWK0iJbH4p7x9kYECAL5So0,9
-datupapi-1.112.2.dist-info/RECORD,,
+datupapi-1.114.0.dist-info/METADATA,sha256=gudex0xIUJevCkb-UjuYbgu5-5Hd4Su4MBOTFpP0xt0,1516
+datupapi-1.114.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datupapi-1.114.0.dist-info/top_level.txt,sha256=oERwtRZu8xq2u1TDGwJwuWK0iJbH4p7x9kYECAL5So0,9
+datupapi-1.114.0.dist-info/RECORD,,