utilsds-models 0.0.2__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: utilsds-models
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: Solution for specific models
5
5
  Author-email: DS Team <ds@sts.pl>
6
6
  License: MIT License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "utilsds-models"
7
- version = "0.0.2"
7
+ version = "0.0.4"
8
8
  description = "Solution for specific models"
9
9
  readme = {file = "docs/ALTERNATIVE_README.md", content-type = "text/markdown"}
10
10
  requires-python = ">=3.12"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: utilsds-models
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: Solution for specific models
5
5
  Author-email: DS Team <ds@sts.pl>
6
6
  License: MIT License
@@ -0,0 +1,8 @@
1
+ README.md
2
+ pyproject.toml
3
+ docs/ALTERNATIVE_README.md
4
+ utilsds_models.egg-info/PKG-INFO
5
+ utilsds_models.egg-info/SOURCES.txt
6
+ utilsds_models.egg-info/dependency_links.txt
7
+ utilsds_models.egg-info/requires.txt
8
+ utilsds_models.egg-info/top_level.txt
File without changes
@@ -1,626 +0,0 @@
1
- from typing import Any, Callable, Optional, Tuple
2
-
3
- import numpy as np
4
- import pandas as pd
5
-
6
-
7
class EvalMetric:
    """
    Evaluation metrics for time-series forecasting with time-based weighting.

    Two families of metrics are provided:

    * cohort-level: ``y_true`` / ``y_pred`` are first aggregated per
      ``days_since_ftd`` value, then the error of the aggregates is averaged
      using the segmented time-decay weights;
    * sample-level: the error of every individual row is averaged using the
      same weights, without aggregation.

    Every metric call rebuilds ``base_X`` from its own arguments, so one
    instance can be reused for inputs of different lengths.

    Attributes
    ----------
    base_X : pd.DataFrame
        Frame of the most recent call: days_since_ftd, y_true, y_pred
    daily_agg : pd.DataFrame
        Per-day aggregates (and their weights) of the most recent cohort call
    weights : None
        Initialised to None; not written by any method of this class
    agg_preds : None
        Initialised to None; not written by any method of this class
    agg_true : None
        Initialised to None; not written by any method of this class
    """

    # (inclusive upper bound in days, weight); the first band whose bound is
    # >= the day value wins. Days above the last bound get _TAIL_WEIGHT.
    _WEIGHT_BANDS: Tuple[Tuple[int, float], ...] = (
        (7, 0.05),
        (14, 0.85),
        (45, 1.00),
        (90, 0.90),
        (180, 0.60),
        (270, 0.40),
    )
    _TAIL_WEIGHT: float = 0.30

    def __init__(self) -> None:
        """Initialize the EvalMetric class with empty dataframes."""
        self.base_X = pd.DataFrame(columns=["days_since_ftd", "y_true", "y_pred"])
        self.daily_agg = pd.DataFrame(
            columns=["days_since_ftd", "time_decay_weight", "daily_agg_target", "daily_agg_preds"]
        )
        self.weights = None
        self.agg_preds = None
        self.agg_true = None

    # ========================================================================
    # SHARED HELPERS
    # ========================================================================

    @classmethod
    def _segmented_weights(cls, days: Any) -> np.ndarray:
        """
        Map day values to their segmented time-decay weight.

        Weight scheme:
        - Days 0-7: 0.05
        - Days 8-14: 0.85
        - Days 15-45: 1.00
        - Days 46-90: 0.90
        - Days 91-180: 0.60
        - Days 181-270: 0.40
        - Days 271+: 0.30

        Parameters
        ----------
        days : array-like
            Day values (days_since_ftd)

        Returns
        -------
        np.ndarray
            Float array of weights, same shape as ``days``
        """
        day_values = np.asarray(days)
        weights = np.full(day_values.shape, cls._TAIL_WEIGHT, dtype=float)
        # Apply the widest band first so narrower bands overwrite it; this is
        # the same precedence as a nested np.where chain. Values that fail
        # every comparison (e.g. NaN) keep the tail weight.
        for upper_bound, weight in reversed(cls._WEIGHT_BANDS):
            weights = np.where(day_values <= upper_bound, weight, weights)
        return weights

    def _load_frame(self, X: Any, y_true: Any, y_pred: Any) -> None:
        """Rebuild ``base_X`` from the days column of ``X`` plus targets and predictions."""
        # reset_index so rows pair positionally with the flattened y arrays.
        self.base_X = X[["days_since_ftd"]].copy().reset_index(drop=True)
        self._set_y_true_and_pred(y_true, y_pred)

    def _prepare_cohort(self, X: Any, y_true: Any, y_pred: Any) -> None:
        """Load the inputs, then compute per-day means and their time-decay weights."""
        self._load_frame(X, y_true, y_pred)
        self.get_daily_agg()
        self.get_time_decay_weight()

    def get_daily_agg(self, agg_func: str = "mean") -> pd.DataFrame:
        """
        Calculate the daily aggregated target and predictions grouped by days_since_ftd.

        Parameters
        ----------
        agg_func : str, default='mean'
            Aggregation function to use. Options: 'sum' or 'mean'

        Returns
        -------
        pd.DataFrame
            DataFrame with columns: days_since_ftd, daily_agg_target, daily_agg_preds
        """
        grouped = self.base_X.groupby("days_since_ftd")
        self.daily_agg = grouped.agg(
            daily_agg_target=("y_true", agg_func),
            daily_agg_preds=("y_pred", agg_func),
        ).reset_index()
        return self.daily_agg

    def get_time_decay_weight(self) -> Any:
        """
        Add the segmented time-decay weight of every day to ``daily_agg``.

        The weight bands are those listed in ``_segmented_weights``.

        Returns
        -------
        pd.Series
            The ``time_decay_weight`` column of ``daily_agg``
        """
        self.daily_agg["time_decay_weight"] = self._segmented_weights(self.daily_agg["days_since_ftd"])
        return self.daily_agg["time_decay_weight"]

    def _set_y_true_and_pred(self, y_true: Any, y_pred: Any) -> None:
        """
        Set the true and predicted values in base_X dataframe.

        Inputs (pandas objects, numpy arrays, lists, ...) are converted to
        flat 1-D arrays and assigned positionally.

        Parameters
        ----------
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values
        """
        # np.asarray handles pandas objects and plain sequences alike, so no
        # ".values" duck-typing is needed; flatten() always returns a copy.
        self.base_X["y_true"] = np.asarray(y_true).flatten()
        self.base_X["y_pred"] = np.asarray(y_pred).flatten()

    # ========================================================================
    # COHORT-LEVEL METRICS (aggregated by days_since_ftd)
    # ========================================================================

    def cohort_weighted_mape(self, X: Any, y_true: Any, y_pred: Any) -> float:
        """
        Calculate the cohort-level weighted mean absolute percentage error.

        Aggregates predictions and targets by days_since_ftd, then calculates
        MAPE with time decay weighting. Excludes days where target is zero.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values

        Returns
        -------
        float
            Weighted mean absolute percentage error at cohort level

        Raises
        ------
        ZeroDivisionError
            Raised by ``np.average`` when every daily target is zero, because
            no weights remain after filtering
        """
        self._prepare_cohort(X, y_true, y_pred)

        # Keep only days with a non-zero aggregated target (the denominator).
        usable = self.daily_agg.loc[self.daily_agg["daily_agg_target"] != 0]
        target = usable["daily_agg_target"]
        percentage_errors = np.abs((target - usable["daily_agg_preds"]) / target)

        return float(np.average(percentage_errors, weights=usable["time_decay_weight"]))

    def cohort_weighted_mae(self, X: Any, y_true: Any, y_pred: Any) -> float:
        """
        Calculate the cohort-level weighted mean absolute error.

        Aggregates predictions and targets by days_since_ftd, then calculates
        MAE with time decay weighting.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values

        Returns
        -------
        float
            Weighted mean absolute error at cohort level
        """
        self._prepare_cohort(X, y_true, y_pred)

        residuals = self.daily_agg["daily_agg_target"] - self.daily_agg["daily_agg_preds"]
        return float(np.average(np.abs(residuals), weights=self.daily_agg["time_decay_weight"]))

    def cohort_weighted_mse(self, X: Any, y_true: Any, y_pred: Any) -> float:
        """
        Calculate the cohort-level weighted mean squared error.

        Aggregates predictions and targets by days_since_ftd, then calculates
        MSE with time decay weighting.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values

        Returns
        -------
        float
            Weighted mean squared error at cohort level
        """
        self._prepare_cohort(X, y_true, y_pred)

        residuals = self.daily_agg["daily_agg_target"] - self.daily_agg["daily_agg_preds"]
        return float(np.average(residuals**2, weights=self.daily_agg["time_decay_weight"]))

    # ========================================================================
    # SAMPLE-LEVEL METRICS (on individual samples)
    # ========================================================================

    def _get_sample_weights(self, X: Any) -> np.ndarray:
        """
        Calculate time-based weights for individual samples.

        Uses the same weight scheme as cohort-level metrics but applies
        to individual samples without aggregation.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column

        Returns
        -------
        np.ndarray
            Array of weights for each sample
        """
        return self._segmented_weights(X["days_since_ftd"].values)

    def sample_weighted_mae(self, X: Any, y_true: Any, y_pred: Any) -> float:
        """
        Calculate the sample-level weighted mean absolute error.

        Calculates MAE on individual samples with time decay weighting,
        without aggregation by days_since_ftd.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values

        Returns
        -------
        float
            Weighted mean absolute error at sample level
        """
        # Rebuild base_X from this call's X; reusing the frame left by an
        # earlier call fails as soon as the input length changes.
        self._load_frame(X, y_true, y_pred)

        weights = self._get_sample_weights(self.base_X)
        abs_errors = np.abs(self.base_X["y_true"] - self.base_X["y_pred"])
        return float(np.average(abs_errors, weights=weights))

    def sample_weighted_mse(self, X: Any, y_true: Any, y_pred: Any) -> float:
        """
        Calculate the sample-level weighted mean squared error.

        Calculates MSE on individual samples with time decay weighting,
        without aggregation by days_since_ftd.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values

        Returns
        -------
        float
            Weighted mean squared error at sample level
        """
        # Rebuild base_X from this call's X (see sample_weighted_mae).
        self._load_frame(X, y_true, y_pred)

        weights = self._get_sample_weights(self.base_X)
        squared_errors = (self.base_X["y_true"] - self.base_X["y_pred"]) ** 2
        return float(np.average(squared_errors, weights=weights))

    def sample_weighted_mape(self, X: Any, y_true: Any, y_pred: Any) -> float:
        """
        Calculate the sample-level weighted mean absolute percentage error.

        Calculates MAPE on individual samples with time decay weighting,
        without aggregation by days_since_ftd. Excludes samples where y_true is zero.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values

        Returns
        -------
        float
            Weighted mean absolute percentage error at sample level

        Raises
        ------
        ZeroDivisionError
            Raised by ``np.average`` when every y_true is zero, because no
            weights remain after filtering
        """
        # Rebuild base_X from this call's X (see sample_weighted_mae).
        self._load_frame(X, y_true, y_pred)

        weights = self._get_sample_weights(self.base_X)

        # Rows with a zero target cannot contribute a percentage error.
        nonzero = (self.base_X["y_true"] != 0).to_numpy()
        actual = self.base_X["y_true"][nonzero]
        predicted = self.base_X["y_pred"][nonzero]
        percentage_errors = np.abs((actual - predicted) / actual)

        return float(np.average(percentage_errors, weights=weights[nonzero]))

    def create_lgb_metric(
        self, X_eval: pd.DataFrame, metric_type: str = "mae", level: str = "cohort"
    ) -> Callable[[np.ndarray, np.ndarray], Tuple[str, float, bool]]:
        """
        Creates a LightGBM-compatible evaluation metric with access to X via closure.

        The returned callable has the ``(y_true, y_pred)`` signature used by the
        ``eval_metric`` argument of LightGBM's scikit-learn estimators. It keeps
        a copy of the ``days_since_ftd`` column of ``X_eval`` in its closure, so
        the rows of ``X_eval`` must line up with the evaluation set it scores.

        Parameters
        ----------
        X_eval : pd.DataFrame
            DataFrame containing 'days_since_ftd' column for evaluation set
        metric_type : str, default='mae'
            Type of metric: 'mae', 'mse', or 'mape'
        level : str, default='cohort'
            Level of aggregation:
            - 'cohort': aggregated by days_since_ftd
            - 'sample': individual samples

        Returns
        -------
        Callable
            Custom eval metric for LightGBM with signature:
            (y_true, y_pred) -> (metric_name, metric_value, is_higher_better).
            If the metric computation raises, the error and traceback are
            printed and the value 999999 is returned instead of propagating.

        Raises
        ------
        ValueError
            If invalid combination of level and metric_type is provided

        Examples
        --------
        >>> eval_metric = EvalMetric()
        >>> lgb_metric = eval_metric.create_lgb_metric(X_val, metric_type='mae', level='cohort')
        >>> model = lgb.LGBMRegressor()
        >>> model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=lgb_metric)
        """
        X_eval_reset = X_eval[["days_since_ftd"]].copy().reset_index(drop=True)

        # Mapping: (level, metric_type) -> (computing method, name reported to LightGBM)
        metrics_map = {
            ("cohort", "mae"): (self.cohort_weighted_mae, "cohort_wmae"),
            ("cohort", "mse"): (self.cohort_weighted_mse, "cohort_wmse"),
            ("cohort", "mape"): (self.cohort_weighted_mape, "cohort_wmape"),
            ("sample", "mae"): (self.sample_weighted_mae, "sample_wmae"),
            ("sample", "mse"): (self.sample_weighted_mse, "sample_wmse"),
            ("sample", "mape"): (self.sample_weighted_mape, "sample_wmape"),
        }

        # Validate the request and pick the metric
        key = (level, metric_type)
        if key not in metrics_map:
            valid_combinations = ", ".join(f"{lvl}_{typ}" for lvl, typ in metrics_map)
            raise ValueError(
                f"Invalid combination: level={level!r}, metric_type={metric_type!r}. "
                f"Valid combinations: {valid_combinations}"
            )

        metric_func, metric_name = metrics_map[key]

        # Wrapper handed to LightGBM
        def weighted_metric(y_true: Any, y_pred: Any) -> Tuple[str, float, bool]:
            try:
                # Convert to pandas Series if needed
                if isinstance(y_true, np.ndarray):
                    y_true = pd.Series(y_true).reset_index(drop=True)
                if isinstance(y_pred, np.ndarray):
                    y_pred = pd.Series(y_pred).reset_index(drop=True)

                # Compute the metric with the selected method of this class
                metric_value = metric_func(X_eval_reset, y_true, y_pred)

                # Return (name, value, is_higher_better)
                return metric_name, metric_value, False

            except Exception as e:
                # Deliberate best effort: a failing metric must not abort
                # training, so report it loudly and return a sentinel value.
                print(f"❌ Error in {metric_name}: {e}")
                import traceback

                traceback.print_exc()
                return metric_name, 999999, False

        return weighted_metric
452
-
453
-
454
class DaysWeightedObjective:
    """
    Custom time-decayed objective function for LightGBM.

    Instances are callables with the ``(y_true, y_pred) -> (grad, hess)``
    signature. Gradients and hessians are multiplied by a per-sample time
    weight derived from the days column attached via ``set_data``. The
    current weight scheme is uniform (every weight is 1.0).

    Attributes
    ----------
    epsilon : float
        Small constant for numerical stability in calculations
    X : pd.DataFrame or None
        Single-column copy of the days column, set by ``set_data``
    days_col : str
        Name of the column containing days information
    agg_level : str
        Aggregation level ('sample' only currently supported)
    mode : str
        Loss function mode: 'mae', 'mse', or 'mape'
    time_decay : np.ndarray or None
        Time decay weights computed by ``get_time_decay_weight``
    """

    def __init__(
        self, days_col: str = "days_since_ftd", agg_level: str = "sample", mode: str = "mse", epsilon: float = 1e-3
    ) -> None:
        """
        Initialize the custom objective function.

        Parameters
        ----------
        days_col : str, default='days_since_ftd'
            Name of the column containing days since first transaction
        agg_level : str, default='sample'
            Aggregation level (currently only 'sample' is supported);
            stored lower-cased and validated when the objective is called
        mode : str, default='mse'
            Loss function mode: 'mae', 'mse', or 'mape'; stored lower-cased
            and validated when the objective is called
        epsilon : float, default=1e-3
            Small constant for numerical stability
        """
        self.epsilon = epsilon
        self.X: Optional[pd.DataFrame] = None
        self.days_col = days_col
        self.agg_level = agg_level.lower()
        self.mode = mode.lower()
        self.time_decay: Optional[np.ndarray] = None

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Compute gradients and hessians for LightGBM custom objective.

        Parameters
        ----------
        y_true : np.ndarray
            True target values
        y_pred : np.ndarray
            Predicted values

        Returns
        -------
        tuple of np.ndarray
            (gradients, hessians) for LightGBM optimization

        Raises
        ------
        ValueError
            If agg_level is not 'sample'
        """
        if self.agg_level != "sample":
            raise ValueError("agg_level must be 'sample'")
        return self.sample_level(y_true, y_pred)

    def set_data(self, X: pd.DataFrame) -> None:
        """
        Attach feature matrix for objective function computation.

        Must be called before using the objective in LightGBM training
        to provide access to the days column. Only that column is copied.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing the ``days_col`` column
        """
        self.X = X[[self.days_col]].copy()

    def get_time_decay_weight(self, X: pd.DataFrame) -> None:
        """
        Calculates uniform time-based weights (currently all set to 1).

        Stores one float weight of 1.0 per row of ``X`` in ``time_decay``.
        This is the hook to modify if different business value periods
        should receive different weights.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing the ``days_col`` column
        """
        days = X[self.days_col].values
        # Float dtype so that gradients/hessians built from it are floats.
        self.time_decay = np.ones(np.shape(days), dtype=float)

    def sample_level(self, y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Compute weighted gradients and hessians for sample-level optimization.

        Calculates first and second derivatives of the loss function with
        time decay weighting applied. Supports MAE, MSE, and MAPE modes.
        The MAPE mode uses the smoothed absolute error
        ``sqrt(error**2 + epsilon**2)`` so the hessian stays positive, and
        ``|y_true| + epsilon`` as denominator to avoid division by zero.

        Parameters
        ----------
        y_true : np.ndarray
            True target values
        y_pred : np.ndarray
            Predicted values

        Returns
        -------
        tuple of np.ndarray
            (grad, hess) - Gradients and Hessians for LightGBM

        Raises
        ------
        ValueError
            If X has not been set via set_data(), if the attached X does not
            have the same shape as the predictions, or if mode is invalid
        """
        if self.X is None:
            raise ValueError("Feature matrix X has not been set. Use set_data(X) before fitting.")

        self.get_time_decay_weight(self.X)
        weights = self.time_decay

        error = np.asarray(y_pred) - np.asarray(y_true)
        # Guard against silent broadcasting (e.g. a one-row X against n labels).
        if error.shape != weights.shape:
            raise ValueError(
                f"Shape mismatch: data set via set_data() yields weights of shape {weights.shape}, "
                f"but y_pred - y_true has shape {error.shape}."
            )

        if self.mode == "mape":
            denom = np.abs(y_true) + self.epsilon
            smooth = np.sqrt(error**2 + self.epsilon**2)

            grad = weights * error / (smooth * denom)
            hess = weights * (self.epsilon**2) / (smooth**3 * denom)

        elif self.mode == "mae":
            grad = weights * np.sign(error)
            # The true second derivative of |error| is zero; the weight is
            # used as a constant stand-in. Copy so callers cannot mutate
            # self.time_decay through the returned array.
            hess = weights.copy()

        elif self.mode == "mse":
            grad = 2 * weights * error
            hess = 2 * weights * np.ones_like(error)

        else:
            raise ValueError("mode must be 'mae' or 'mape' or 'mse'")

        return grad, hess