utilsds-models 0.0.4__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {utilsds_models-0.0.4 → utilsds_models-0.0.5}/PKG-INFO +1 -1
- {utilsds_models-0.0.4 → utilsds_models-0.0.5}/pyproject.toml +2 -2
- utilsds_models-0.0.5/utilsds_models/__init__.py +0 -0
- utilsds_models-0.0.5/utilsds_models/custom_metrics.py +626 -0
- utilsds_models-0.0.5/utilsds_models/data_processing.py +396 -0
- utilsds_models-0.0.5/utilsds_models/evip_dynamic.py +124 -0
- utilsds_models-0.0.5/utilsds_models/metrics.py +179 -0
- utilsds_models-0.0.5/utilsds_models/visualization.py +179 -0
- {utilsds_models-0.0.4 → utilsds_models-0.0.5}/utilsds_models.egg-info/PKG-INFO +1 -1
- utilsds_models-0.0.5/utilsds_models.egg-info/SOURCES.txt +14 -0
- utilsds_models-0.0.5/utilsds_models.egg-info/top_level.txt +1 -0
- utilsds_models-0.0.4/utilsds_models.egg-info/SOURCES.txt +0 -8
- utilsds_models-0.0.4/utilsds_models.egg-info/top_level.txt +0 -1
- {utilsds_models-0.0.4 → utilsds_models-0.0.5}/README.md +0 -0
- {utilsds_models-0.0.4 → utilsds_models-0.0.5}/docs/ALTERNATIVE_README.md +0 -0
- {utilsds_models-0.0.4 → utilsds_models-0.0.5}/setup.cfg +0 -0
- {utilsds_models-0.0.4 → utilsds_models-0.0.5}/utilsds_models.egg-info/dependency_links.txt +0 -0
- {utilsds_models-0.0.4 → utilsds_models-0.0.5}/utilsds_models.egg-info/requires.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "utilsds-models"
|
|
7
|
-
version = "0.0.4"
|
|
7
|
+
version = "0.0.5"
|
|
8
8
|
description = "Solution for specific models"
|
|
9
9
|
readme = {file = "docs/ALTERNATIVE_README.md", content-type = "text/markdown"}
|
|
10
10
|
requires-python = ">=3.12"
|
|
@@ -31,5 +31,5 @@ dev = [
|
|
|
31
31
|
|
|
32
32
|
[tool.setuptools.packages.find]
|
|
33
33
|
where = ["."]
|
|
34
|
-
include = ["
|
|
34
|
+
include = ["utilsds_models*"]
|
|
35
35
|
exclude = ["tests*", "docs*"]
|
|
File without changes
|
|
@@ -0,0 +1,626 @@
|
|
|
1
|
+
from typing import Any, Callable, Optional, Tuple
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class EvalMetric:
    """
    Evaluation metrics for time-series forecasting with time-based weighting.

    Provides cohort-level metrics (targets and predictions are aggregated per
    ``days_since_ftd`` value before scoring) and sample-level metrics (scored
    row by row). Both levels use the same stepwise weight per ``days_since_ftd``.

    Every metric call overwrites ``base_X`` (and, for cohort metrics,
    ``daily_agg``) with the data it was given, so an instance only holds the
    state of the most recent call.

    Attributes
    ----------
    base_X : pd.DataFrame
        Base dataframe containing days_since_ftd, y_true, and y_pred
    daily_agg : pd.DataFrame
        Daily aggregated metrics with time decay weights
    weights : np.ndarray or None
        Initialised to None; not updated by any method of this class
    agg_preds : float or None
        Initialised to None; not updated by any method of this class
    agg_true : float or None
        Initialised to None; not updated by any method of this class
    """

    # Inclusive upper bound of each days_since_ftd segment, in ascending order,
    # and the weight applied to days falling in that segment.
    _DAY_BOUNDS = (7, 14, 45, 90, 180, 270)
    _BOUND_WEIGHTS = (0.05, 0.85, 1.00, 0.90, 0.60, 0.40)
    # Weight for days beyond the last bound.
    _TAIL_WEIGHT = 0.30

    def __init__(self) -> None:
        """Initialize the EvalMetric class with empty dataframes."""
        self.base_X = pd.DataFrame(columns=["days_since_ftd", "y_true", "y_pred"])
        self.daily_agg = pd.DataFrame(
            columns=["days_since_ftd", "time_decay_weight", "daily_agg_target", "daily_agg_preds"]
        )
        self.weights = None
        self.agg_preds = None
        self.agg_true = None

    # ========================================================================
    # SHARED HELPERS
    # ========================================================================

    @classmethod
    def _weights_for_days(cls, days: Any) -> np.ndarray:
        """
        Map days_since_ftd values to their segment weight.

        The first segment whose upper bound is >= the day wins; days above
        every bound receive the tail weight.

        Parameters
        ----------
        days : array-like
            days_since_ftd values

        Returns
        -------
        np.ndarray
            Float array of weights, one per input value
        """
        days_arr = np.asarray(days)
        # np.select picks the first matching condition, which reproduces a
        # chain of "<= bound" checks evaluated from the smallest bound upwards.
        conditions = [days_arr <= bound for bound in cls._DAY_BOUNDS]
        return np.select(conditions, list(cls._BOUND_WEIGHTS), default=cls._TAIL_WEIGHT)

    @staticmethod
    def _flatten(values: Any) -> np.ndarray:
        """Return ``values`` (pandas object or array-like) as a 1-D numpy array."""
        raw = values.values if hasattr(values, "values") else values
        return np.asarray(raw).ravel()

    def _load(self, X: Any, y_true: Any, y_pred: Any) -> None:
        """
        Replace base_X with the days column of ``X`` plus y_true / y_pred.

        The index is reset so the positional y arrays line up with the rows
        of ``X`` regardless of the index ``X`` arrived with.
        """
        self.base_X = X[["days_since_ftd"]].copy().reset_index(drop=True)
        self._set_y_true_and_pred(y_true, y_pred)

    def _cohort_frame(self, X: Any, y_true: Any, y_pred: Any) -> pd.DataFrame:
        """Load the inputs, aggregate them per day and attach the day weights."""
        self._load(X, y_true, y_pred)
        self.get_daily_agg()
        self.get_time_decay_weight()
        return self.daily_agg

    def get_daily_agg(self, agg_func: str = "mean") -> pd.DataFrame:
        """
        Calculate the daily aggregated target and predictions grouped by days_since_ftd.

        Parameters
        ----------
        agg_func : str, default='mean'
            Aggregation function to use. Options: 'sum' or 'mean'

        Returns
        -------
        pd.DataFrame
            DataFrame with columns: days_since_ftd, daily_agg_target, daily_agg_preds
        """
        grouped = self.base_X.groupby("days_since_ftd")
        self.daily_agg = grouped.agg(
            daily_agg_target=("y_true", agg_func),
            daily_agg_preds=("y_pred", agg_func),
        ).reset_index()

        return self.daily_agg

    def get_time_decay_weight(self) -> pd.Series:
        """
        Calculates segmented time-based weights based on business value periods.

        Weight scheme:
        - Days 0-7: 0.05 (very low weight for initial period)
        - Days 8-14: 0.85 (high weight for early engagement)
        - Days 15-45: 1.00 (maximum weight for peak value period)
        - Days 46-90: 0.90 (high weight)
        - Days 91-180: 0.60 (medium weight)
        - Days 181-270: 0.40 (lower weight)
        - Days 270+: 0.30 (lowest weight)

        The weights are written to ``daily_agg['time_decay_weight']``.

        Returns
        -------
        pd.Series
            Series of time decay weights for each day
        """
        self.daily_agg["time_decay_weight"] = self._weights_for_days(self.daily_agg["days_since_ftd"])

        return self.daily_agg["time_decay_weight"]

    def _set_y_true_and_pred(self, y_true: Any, y_pred: Any) -> None:
        """
        Set the true and predicted values in base_X dataframe.

        Handles conversion from pandas Series, numpy arrays, or other array-like objects
        to flattened arrays.

        Parameters
        ----------
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values
        """
        self.base_X["y_true"] = self._flatten(y_true)
        self.base_X["y_pred"] = self._flatten(y_pred)

    # ========================================================================
    # COHORT-LEVEL METRICS (aggregated by days_since_ftd)
    # ========================================================================

    def cohort_weighted_mape(self, X: Any, y_true: Any, y_pred: Any) -> float:
        """
        Calculate the cohort-level weighted mean absolute percentage error.

        Aggregates predictions and targets by days_since_ftd, then calculates
        MAPE with time decay weighting. Excludes days where target is zero.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values

        Returns
        -------
        float
            Weighted mean absolute percentage error at cohort level

        Raises
        ------
        ZeroDivisionError
            Raised by ``np.average`` when every aggregated target is zero,
            because no weights remain to normalise by
        """
        daily = self._cohort_frame(X, y_true, y_pred)
        # Days with a zero aggregated target would divide by zero; drop them
        # from the score only (self.daily_agg itself stays unfiltered).
        scored = daily[daily["daily_agg_target"] != 0]

        target = scored["daily_agg_target"].to_numpy()
        preds = scored["daily_agg_preds"].to_numpy()
        percentage_errors = np.abs((target - preds) / target)

        return float(np.average(percentage_errors, weights=scored["time_decay_weight"].to_numpy()))

    def cohort_weighted_mae(self, X: Any, y_true: Any, y_pred: Any) -> float:
        """
        Calculate the cohort-level weighted mean absolute error.

        Aggregates predictions and targets by days_since_ftd, then calculates
        MAE with time decay weighting.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values

        Returns
        -------
        float
            Weighted mean absolute error at cohort level
        """
        daily = self._cohort_frame(X, y_true, y_pred)
        abs_errors = np.abs(daily["daily_agg_target"] - daily["daily_agg_preds"])

        return float(np.average(abs_errors, weights=daily["time_decay_weight"]))

    def cohort_weighted_mse(self, X: Any, y_true: Any, y_pred: Any) -> float:
        """
        Calculate the cohort-level weighted mean squared error.

        Aggregates predictions and targets by days_since_ftd, then calculates
        MSE with time decay weighting.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values

        Returns
        -------
        float
            Weighted mean squared error at cohort level
        """
        daily = self._cohort_frame(X, y_true, y_pred)
        squared_errors = (daily["daily_agg_target"] - daily["daily_agg_preds"]) ** 2

        return float(np.average(squared_errors, weights=daily["time_decay_weight"]))

    # ========================================================================
    # SAMPLE-LEVEL METRICS (on individual samples)
    # ========================================================================

    def _get_sample_weights(self, X: Any) -> np.ndarray:
        """
        Calculate time-based weights for individual samples.

        Uses the same weight scheme as cohort-level metrics but applies
        to individual samples without aggregation.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column

        Returns
        -------
        np.ndarray
            Array of weights for each sample
        """
        return self._weights_for_days(X["days_since_ftd"].values)

    def sample_weighted_mae(self, X: Any, y_true: Any, y_pred: Any) -> float:
        """
        Calculate the sample-level weighted mean absolute error.

        Calculates MAE on individual samples with time decay weighting,
        without aggregation by days_since_ftd.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values

        Returns
        -------
        float
            Weighted mean absolute error at sample level
        """
        # base_X is rebuilt from X so rows left over from an earlier call
        # (possibly of a different length) cannot leak into this one.
        self._load(X, y_true, y_pred)

        weights = self._get_sample_weights(self.base_X)
        abs_errors = np.abs(self.base_X["y_true"] - self.base_X["y_pred"])

        return float(np.average(abs_errors, weights=weights))

    def sample_weighted_mse(self, X: Any, y_true: Any, y_pred: Any) -> float:
        """
        Calculate the sample-level weighted mean squared error.

        Calculates MSE on individual samples with time decay weighting,
        without aggregation by days_since_ftd.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values

        Returns
        -------
        float
            Weighted mean squared error at sample level
        """
        # See sample_weighted_mae: base_X must reflect this call's X.
        self._load(X, y_true, y_pred)

        weights = self._get_sample_weights(self.base_X)
        squared_errors = (self.base_X["y_true"] - self.base_X["y_pred"]) ** 2

        return float(np.average(squared_errors, weights=weights))

    def sample_weighted_mape(self, X: Any, y_true: Any, y_pred: Any) -> float:
        """
        Calculate the sample-level weighted mean absolute percentage error.

        Calculates MAPE on individual samples with time decay weighting,
        without aggregation by days_since_ftd. Excludes samples where y_true is zero.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing 'days_since_ftd' column
        y_true : array-like
            True target values
        y_pred : array-like
            Predicted values

        Returns
        -------
        float
            Weighted mean absolute percentage error at sample level

        Raises
        ------
        ZeroDivisionError
            Raised by ``np.average`` when every y_true value is zero,
            because no weights remain to normalise by
        """
        # See sample_weighted_mae: base_X must reflect this call's X.
        self._load(X, y_true, y_pred)

        weights = self._get_sample_weights(self.base_X)
        truth = self.base_X["y_true"].to_numpy()
        preds = self.base_X["y_pred"].to_numpy()

        # Samples with a zero target would divide by zero; leave them out.
        nonzero = truth != 0
        percentage_errors = np.abs((truth[nonzero] - preds[nonzero]) / truth[nonzero])

        return float(np.average(percentage_errors, weights=weights[nonzero]))

    def create_lgb_metric(
        self, X_eval: pd.DataFrame, metric_type: str = "mae", level: str = "cohort"
    ) -> Callable[[np.ndarray, np.ndarray], Tuple[str, float, bool]]:
        """
        Creates a LightGBM-compatible evaluation metric with access to X via closure.

        The returned function follows the ``(y_true, y_pred)`` callable form
        accepted by the ``eval_metric`` argument of LightGBM's scikit-learn
        estimators. It keeps the evaluation features (X_eval) in a closure so
        the time-weighted metrics can look up days_since_ftd. It is not a
        drop-in ``feval`` for ``lgb.train``, whose callables receive
        ``(preds, eval_data)`` instead.

        The same X_eval is used on every invocation, so the metric is only
        meaningful for an evaluation set whose rows line up with X_eval.

        Parameters
        ----------
        X_eval : pd.DataFrame
            DataFrame containing 'days_since_ftd' column for evaluation set
        metric_type : str, default='mae'
            Type of metric: 'mae', 'mse', or 'mape'
        level : str, default='cohort'
            Level of aggregation:
            - 'cohort': aggregated by days_since_ftd
            - 'sample': individual samples

        Returns
        -------
        Callable
            Custom eval metric for LightGBM with signature:
            (y_true, y_pred) -> (metric_name, metric_value, is_higher_better)

        Raises
        ------
        ValueError
            If invalid combination of level and metric_type is provided

        Examples
        --------
        >>> eval_metric = EvalMetric()
        >>> lgb_metric = eval_metric.create_lgb_metric(X_val, metric_type='mae', level='cohort')
        >>> model = lgb.LGBMRegressor()
        >>> model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=lgb_metric)
        """
        X_eval_reset = X_eval[["days_since_ftd"]].copy().reset_index(drop=True)

        # Mapping: (level, metric_type) -> (computing method, name reported to LightGBM)
        metrics_map = {
            ("cohort", "mae"): (self.cohort_weighted_mae, "cohort_wmae"),
            ("cohort", "mse"): (self.cohort_weighted_mse, "cohort_wmse"),
            ("cohort", "mape"): (self.cohort_weighted_mape, "cohort_wmape"),
            ("sample", "mae"): (self.sample_weighted_mae, "sample_wmae"),
            ("sample", "mse"): (self.sample_weighted_mse, "sample_wmse"),
            ("sample", "mape"): (self.sample_weighted_mape, "sample_wmape"),
        }

        # Validate the requested combination and pick the metric
        key = (level, metric_type)
        if key not in metrics_map:
            valid_combinations = ", ".join(f"{lvl}_{typ}" for lvl, typ in metrics_map)
            raise ValueError(
                f"Invalid combination: level={level!r}, metric_type={metric_type!r}. "
                f"Valid combinations: {valid_combinations}"
            )

        metric_func, metric_name = metrics_map[key]

        # Wrapper handed to LightGBM
        def weighted_metric(y_true: Any, y_pred: Any) -> Tuple[str, float, bool]:
            try:
                # The metric methods flatten arrays / Series themselves.
                metric_value = metric_func(X_eval_reset, y_true, y_pred)
            except Exception as e:
                # Best effort: report the failure and return a large sentinel
                # instead of raising (presumably so training is not aborted).
                print(f"❌ Error in {metric_name}: {e}")
                import traceback

                traceback.print_exc()
                return metric_name, 999999, False

            # (name, value, is_higher_better) - lower is better for all of these
            return metric_name, metric_value, False

        return weighted_metric
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
class DaysWeightedObjective:
    """
    Custom time-decayed objective function for LightGBM.

    Instances are callables taking ``(y_true, y_pred)`` and returning the
    per-sample gradient and Hessian of the selected loss, each multiplied by
    a per-sample time weight derived from the ``days_col`` column of the
    feature matrix attached with :meth:`set_data`.

    Attributes
    ----------
    epsilon : float
        Small constant for numerical stability in calculations
    X : pd.DataFrame or None
        Feature matrix containing days_since_ftd column
    days_col : str
        Name of the column containing days information
    agg_level : str
        Aggregation level ('sample' only currently supported)
    mode : str
        Loss function mode: 'mae', 'mse', or 'mape'
    time_decay : np.ndarray or None
        Computed time decay weights (None until the objective is first evaluated)
    """

    def __init__(
        self, days_col: str = "days_since_ftd", agg_level: str = "sample", mode: str = "mse", epsilon: float = 1e-3
    ) -> None:
        """
        Initialize the custom objective function.

        ``agg_level`` and ``mode`` are lower-cased on storage; they are
        validated when the objective is evaluated, not here.

        Parameters
        ----------
        days_col : str, default='days_since_ftd'
            Name of the column containing days since first transaction
        agg_level : str, default='sample'
            Aggregation level (currently only 'sample' is supported)
        mode : str, default='mse'
            Loss function mode: 'mae', 'mse', or 'mape'
        epsilon : float, default=1e-3
            Small constant for numerical stability
        """
        self.epsilon = epsilon
        self.X: Optional[pd.DataFrame] = None
        self.days_col = days_col
        self.agg_level = agg_level.lower()
        self.mode = mode.lower()
        self.time_decay: Optional[np.ndarray] = None

    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Compute gradients and hessians for LightGBM custom objective.

        Parameters
        ----------
        y_true : np.ndarray
            True target values
        y_pred : np.ndarray
            Predicted values

        Returns
        -------
        tuple of np.ndarray
            (gradients, hessians) for LightGBM optimization

        Raises
        ------
        ValueError
            If agg_level is not 'sample'
        """
        if self.agg_level != "sample":
            raise ValueError("agg_level must be 'sample'")
        return self.sample_level(y_true, y_pred)

    def set_data(self, X: pd.DataFrame) -> None:
        """
        Attach feature matrix for objective function computation.

        Must be called before using the objective in LightGBM training
        to provide access to the days_since_ftd column. Only the
        ``days_col`` column is copied and kept.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing days_since_ftd column
        """
        self.X = X[[self.days_col]].copy()

    def get_time_decay_weight(self, X: pd.DataFrame) -> None:
        """
        Calculates uniform time-based weights (currently all set to 1).

        Stores one weight per row of ``X`` in ``self.time_decay``. Every
        segment of days_since_ftd currently maps to 1.0, so the result does
        not depend on the day values. This is the hook to change if different
        business value periods should be weighted differently; an exponential
        decay such as ``exp(-days / 365)`` is one alternative that was
        previously sketched here.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix containing days_since_ftd column
        """
        days = X[self.days_col].values
        self.time_decay = np.ones(np.shape(days), dtype=np.float64)

    def sample_level(self, y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Compute weighted gradients and hessians for sample-level optimization.

        Calculates first and second derivatives of the loss function with
        time decay weighting applied. Supports MAE, MSE, and MAPE modes.

        Parameters
        ----------
        y_true : np.ndarray
            True target values
        y_pred : np.ndarray
            Predicted values

        Returns
        -------
        tuple of np.ndarray
            (grad, hess) - Gradients and Hessians for LightGBM

        Raises
        ------
        ValueError
            If X has not been set via set_data(), if mode is invalid, or if
            the attached X does not have one row per prediction
        """
        if self.X is None:
            raise ValueError("Feature matrix X has not been set. Use set_data(X) before fitting.")
        if self.mode not in ("mae", "mse", "mape"):
            raise ValueError("mode must be 'mae' or 'mape' or 'mse'")

        self.get_time_decay_weight(self.X)
        weights = self.time_decay

        truth = np.asarray(y_true)
        error = np.asarray(y_pred) - truth

        # Without this check a mismatched X either fails with an opaque
        # broadcasting error or (for a 1-row X) silently broadcasts.
        if np.shape(weights) != error.shape:
            raise ValueError(
                f"Feature matrix set via set_data(X) yields weights of shape {np.shape(weights)}, "
                f"but y_true/y_pred have shape {error.shape}."
            )

        if self.mode == "mape":
            # |error| is smoothed as sqrt(error^2 + eps^2) so that it has a
            # finite second derivative at error == 0; eps in the denominator
            # guards against y_true == 0.
            denom = np.abs(truth) + self.epsilon
            smooth = np.sqrt(error**2 + self.epsilon**2)

            grad = weights * error / (smooth * denom)
            hess = weights * (self.epsilon**2) / (smooth**3 * denom)

        elif self.mode == "mae":
            grad = weights * np.sign(error)
            # The weight itself is used as the Hessian. A copy is returned so
            # the caller cannot mutate self.time_decay through it.
            hess = np.array(weights, copy=True)

        else:  # "mse"
            grad = 2 * weights * error
            hess = 2 * weights * np.ones_like(error)

        return grad, hess
|