utilsds-models 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- utilsds-models/__init__.py +0 -0
- utilsds-models/custom_metrics.py +626 -0
- utilsds-models/data_processing.py +396 -0
- utilsds-models/evip_dynamic.py +124 -0
- utilsds-models/metrics.py +179 -0
- utilsds-models/visualization.py +179 -0
- utilsds_models-0.0.1.dist-info/METADATA +106 -0
- utilsds_models-0.0.1.dist-info/RECORD +10 -0
- utilsds_models-0.0.1.dist-info/WHEEL +5 -0
- utilsds_models-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
"""Data processing classes compatible with scikit-learn pipelines"""
|
|
2
|
+
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
9
|
+
from sklearn.preprocessing import LabelEncoder
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ColumnCopyImputer(BaseEstimator, TransformerMixin):  # type: ignore[misc]
    """
    Impute missing values in a column by copying them from another column.

    Parameters
    ----------
    copy_mapping : dict
        Mapping: {'target_column': 'source_column'}. For every row where the
        target column is null, the value of the source column is copied in.
        Pairs whose target or source column is absent from the input frame
        are silently skipped.
    """

    def __init__(self, copy_mapping: Dict[str, str]):
        self.copy_mapping = copy_mapping

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None) -> "ColumnCopyImputer":
        """Nothing is learned from the data; only marks the transformer as fitted."""
        self.is_fitted_ = True
        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """
        Return a copy of ``X`` with null target values replaced by source values.

        Mappings are applied in dictionary order on the working copy, so a
        column filled by an earlier mapping can act as the source of a later one.
        """
        X_transformed = X.copy()

        for target_col, source_col in self.copy_mapping.items():
            if target_col not in X_transformed.columns or source_col not in X_transformed.columns:
                continue

            null_mask = X_transformed[target_col].isnull()
            values_to_copy = X_transformed.loc[null_mask, source_col]

            if pd.api.types.is_integer_dtype(values_to_copy.dtype):
                # Integer sources (including nullable Int64) are converted to
                # float before being written into the target column.
                # float64 is used rather than float32: float32 cannot represent
                # integers above 2**24 exactly and would silently alter them.
                values_to_copy = values_to_copy.astype("float64")

            X_transformed.loc[null_mask, target_col] = values_to_copy

        return X_transformed
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class NullImputerWithFlags(BaseEstimator, TransformerMixin):  # type: ignore[misc]
    """
    Imputes null values and creates flag columns for selected columns.

    Parameters
    ----------
    columns : list
        List of columns to impute
    values : dict or any
        Values for imputation. If dict, maps columns to values.
        If single value, uses it for all columns.
    flag_columns : list, optional
        List of columns for which to create flags. A flag is 1 where the
        value was null before imputation and 0 otherwise.
    flag_suffix : str, default='_isnull_flag'
        Suffix for flag column names
    strategy : str, optional
        Imputation strategy: 'mean', 'median', 'mode' (overrides values)

    Raises
    ------
    ValueError
        If neither ``values`` nor ``strategy`` is given, or if ``strategy``
        is not one of the supported names.
    """

    def __init__(
        self,
        columns: List[str],
        values: Union[Dict[str, Any], Any] = None,
        flag_columns: Optional[List[str]] = None,
        flag_suffix: str = "_isnull_flag",
        strategy: Optional[str] = None,
    ):

        self.columns = columns
        self.values = values
        # Stored exactly as passed. Replacing None with a fresh [] here makes
        # sklearn.base.clone fail its parameter identity check; the default
        # is resolved in transform() instead.
        self.flag_columns = flag_columns
        self.flag_suffix = flag_suffix
        self.strategy = strategy
        self._fitted_values: Dict[str, Any] = {}

        # Validation
        if self.values is None and self.strategy is None:
            raise ValueError("Either 'values' or 'strategy' must be provided")

        if self.strategy and self.strategy not in ["mean", "median", "mode"]:
            raise ValueError("strategy must be one of: 'mean', 'median', 'mode'")

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None) -> "NullImputerWithFlags":
        """
        Learn values for imputation (if using strategy).

        Columns listed in ``columns`` but absent from ``X`` are skipped.
        """
        # Start from an empty mapping so refitting never keeps statistics
        # learned from an earlier dataset.
        fitted_values: Dict[str, Any] = {}

        if self.strategy is not None:
            for column in self.columns:
                if column not in X.columns:
                    continue

                if self.strategy == "mean":
                    fitted_values[column] = X[column].mean()
                elif self.strategy == "median":
                    fitted_values[column] = X[column].median()
                elif self.strategy == "mode":
                    mode_result = X[column].mode()
                    # An all-null column has no mode; fall back to 0.
                    fitted_values[column] = mode_result[0] if len(mode_result) > 0 else 0

        self._fitted_values = fitted_values
        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """
        Impute values and create flags.

        Returns a copy of ``X``; the input frame is not modified.
        """
        X_transformed = X.copy()

        # 1. Create the flags FIRST (before imputation erases the nulls).
        for column in self.flag_columns or []:
            if column in X_transformed.columns:
                flag_name = f"{column}{self.flag_suffix}"
                X_transformed[flag_name] = X_transformed[column].isnull().astype(int)

        # 2. Then impute the values.
        for column in self.columns:
            if column not in X_transformed.columns:
                continue

            # Pick the fill value: learned statistic first, then `values`.
            if self.strategy is not None and column in self._fitted_values:
                fill_value = self._fitted_values[column]
            elif isinstance(self.values, dict):
                if column not in self.values:
                    continue  # no value configured for this column
                fill_value = self.values[column]
            else:
                fill_value = self.values

            if fill_value is None:
                # Nothing to impute with (e.g. the column was absent during
                # fit and no `values` fallback exists); fillna(None) would raise.
                continue

            X_transformed[column] = X_transformed[column].fillna(fill_value)

        return X_transformed
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class MaxMultiplierImputer(BaseEstimator, TransformerMixin):  # type: ignore[misc]
    """
    Imputes values based on maximum * multiplier from training data.
    Optionally creates flags marking which values were missing before imputation.

    Parameters
    ----------
    columns : list
        List of columns to impute
    multiplier : float, default=1.5
        Multiplier for maximum value (max * multiplier)
    flag_columns : list, optional
        List of columns for which to create flags. A flag is 1 where the
        value was null before imputation and 0 otherwise.
        If None, no flags are created
    flag_suffix : str, default='_flag'
        Suffix for flag column names
    """

    def __init__(
        self,
        columns: List[str],
        multiplier: float = 1.5,
        flag_columns: Optional[List[str]] = None,
        flag_suffix: str = "_flag",
    ):

        self.columns = columns
        self.multiplier = multiplier
        # Stored exactly as passed. Replacing None with a fresh [] here makes
        # sklearn.base.clone fail its parameter identity check; the default
        # is resolved in transform() instead.
        self.flag_columns = flag_columns
        self.flag_suffix = flag_suffix
        self._max_values: Dict[str, float] = {}

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None) -> "MaxMultiplierImputer":
        """
        Learn maximum values from training data.

        For every configured column present in ``X`` the fill value is
        ``max(non-null values) * multiplier``. A column with no non-null
        values gets the fixed fallback 365.
        """
        # Built fresh on every call so refitting drops columns that are no
        # longer present instead of keeping stale fill values.
        max_values: Dict[str, float] = {}

        for column in self.columns:
            if column not in X.columns:
                continue

            existing_values = X[column].dropna()
            if len(existing_values) > 0:
                max_values[column] = existing_values.max() * self.multiplier
            else:
                max_values[column] = 365  # fallback

        self._max_values = max_values
        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """
        Create flags and impute values as max * multiplier.

        Returns a copy of ``X``; the input frame is not modified.
        """
        X_transformed = X.copy()

        # 1. Create the flags FIRST (before imputation erases the nulls).
        for column in self.flag_columns or []:
            if column in X_transformed.columns:
                flag_name = f"{column}{self.flag_suffix}"
                X_transformed[flag_name] = X_transformed[column].isnull().astype(int)

        # 2. Then impute the values learned during fit.
        for column in self.columns:
            if column in X_transformed.columns and column in self._max_values:
                X_transformed[column] = X_transformed[column].fillna(self._max_values[column])

        return X_transformed
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class SportsHybridEncoder(BaseEstimator, TransformerMixin):  # type: ignore[misc]
    """
    Encodes top N sports as binary features + aggregations for the rest.

    Each configured column is expected to hold one ``numpy.ndarray`` of sport
    names per row; cells of any other type are treated as "no sports".

    Parameters
    ----------
    sport_columns : list, optional
        Columns to encode. Defaults to
        ``["dominant_sport", "first_ticket_sport"]``.
    top_n : int, default=5
        Number of most frequent sports that get their own binary column.
    """

    def __init__(self, sport_columns: Optional[List[str]] = None, top_n: int = 5):
        self.sport_columns = sport_columns or ["dominant_sport", "first_ticket_sport"]
        self.top_n = top_n
        self.sport_mappings_: Dict[str, List[str]] = {}

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None) -> "SportsHybridEncoder":
        """Learn the ``top_n`` most frequent sports of every configured column present in ``X``."""
        # Reset so a refit never keeps mappings for columns missing from the new data.
        self.sport_mappings_ = {}

        for col in self.sport_columns:
            if col not in X.columns:
                continue

            sport_counts: Counter = Counter()
            for sports_array in X[col]:
                if isinstance(sports_array, np.ndarray):
                    sport_counts.update(sports_array.tolist())

            top_sports = [sport for sport, _ in sport_counts.most_common(self.top_n)]
            self.sport_mappings_[col] = top_sports

            print(f"{col}: top {len(top_sports)} sports selected")

        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """
        Replace each fitted sport column with derived numeric features.

        For a column whose name minus ``"_sport"`` is ``<prefix>`` this adds:

        - ``has_<prefix>_<sport>``: 1 if the row contains that top sport, else 0
        - ``num_<prefix>_sports``: number of sports in the row
        - ``num_<prefix>_niche``: number of sports in the row outside the top N
        - ``pct_<prefix>_popular``: share of the row's sports that are top-N
          (0 when the row has no sports)

        The original column is dropped. Columns that were not seen during
        ``fit`` are left untouched.
        """
        X_result = X.copy()

        for col in self.sport_columns:
            # Skip columns without a learned mapping instead of raising
            # KeyError, matching how LabelEncoderTransformer treats them.
            if col not in X.columns or col not in self.sport_mappings_:
                continue

            prefix = col.replace("_sport", "")
            top_sports = self.sport_mappings_[col]
            sports_per_row = X[col]

            # Binary indicators only for the top N sports.
            for sport in top_sports:
                safe_name = sport.lower().replace(" ", "_").replace("ł", "l").replace("ą", "a")
                col_name = f"has_{prefix}_{safe_name}"

                X_result[col_name] = sports_per_row.apply(
                    lambda x, s=sport: 1 if isinstance(x, np.ndarray) and s in x else 0
                )

            # Aggregations.
            num_sports = sports_per_row.apply(lambda x: len(x) if isinstance(x, np.ndarray) else 0)
            num_niche = sports_per_row.apply(
                lambda x: sum(1 for s in x if s not in top_sports) if isinstance(x, np.ndarray) else 0
            )

            X_result[f"num_{prefix}_sports"] = num_sports
            X_result[f"num_{prefix}_niche"] = num_niche

            # Computed from the two count Series directly rather than with a
            # row-wise DataFrame.apply over every column; rows without sports
            # (0 / 0) are set to 0.
            X_result[f"pct_{prefix}_popular"] = ((num_sports - num_niche) / num_sports).where(num_sports > 0, 0.0)

            # Drop the original array column.
            X_result = X_result.drop(col, axis=1)

        return X_result
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):  # type: ignore[misc]
    """
    Label encoding for categorical variables (for LightGBM).

    Values are converted to strings before encoding, so nulls become a
    string category as well. Categories not seen during ``fit`` are encoded
    as -1.

    Parameters
    ----------
    columns : list
        Columns to encode. Columns absent from the data are skipped.
    """

    def __init__(self, columns: List[str]):
        self.columns = columns
        self.label_encoders_: Dict[str, LabelEncoder] = {}

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None) -> "LabelEncoderTransformer":
        """Fit one ``LabelEncoder`` per configured column present in ``X``."""
        # Reset so a refit never keeps encoders for columns missing from the new data.
        self.label_encoders_ = {}

        for col in self.columns:
            if col not in X.columns:
                continue

            le = LabelEncoder()
            le.fit(X[col].astype(str))
            self.label_encoders_[col] = le

            print(f"{col}: {len(le.classes_)} categories")

        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """
        Return a copy of ``X`` with the fitted columns replaced by integer codes.

        The encoded columns are cast to the pandas ``category`` dtype so that
        LightGBM treats them as categorical features.
        """
        X_result = X.copy()

        for col in self.columns:
            if col not in X.columns or col not in self.label_encoders_:
                continue

            le = self.label_encoders_[col]

            # A class's code is its position in `classes_`. One dictionary
            # lookup per value replaces a membership scan plus a
            # `le.transform([val])` call per row.
            code_by_class = {cls: code for code, cls in enumerate(le.classes_)}

            # Unknown categories map to NaN and are then encoded as -1.
            codes = X[col].astype(str).map(code_by_class).fillna(-1).astype("int64")

            # Mark as category for LightGBM.
            X_result[col] = codes.astype("category")

        return X_result
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
class DerivedFeatureCreator(BaseEstimator, TransformerMixin):  # type: ignore[misc]
    """
    Adds columns computed by dividing an existing column by a constant.

    Parameters
    ----------
    derived_features : dict
        Mapping: {'new_column': ('source_column', operation)}
        Operation can be:
        - 'divide_7' - divide by 7
        - 'divide_30' - divide by 30
        - int or float - divide by given number
    """

    def __init__(self, derived_features: Dict[str, Tuple[str, Any]]):
        self.derived_features = derived_features

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None) -> "DerivedFeatureCreator":
        """Stateless: only records that fit was called."""
        self.is_fitted_ = True
        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """
        Return a copy of ``X`` extended with the configured derived columns.

        A feature whose source column is missing, or whose operation is not
        recognised, is skipped after printing a warning. Features are built
        in dictionary order on the working copy, so an earlier derived column
        may be the source of a later one.
        """
        result = X.copy()

        for new_col, spec in self.derived_features.items():
            source_col, operation = spec

            if source_col not in result.columns:
                print(f"WARNING: Source column {source_col!r} not found, skipping {new_col!r}")
                continue

            # Resolve the operation to a divisor so there is one division site.
            if operation == "divide_7":
                divisor = 7
            elif operation == "divide_30":
                divisor = 30
            elif isinstance(operation, (int, float)):
                divisor = operation
            else:
                print(f"WARNING: Unknown operation {operation!r} for column {new_col!r}")
                continue

            result[new_col] = result[source_col] / divisor

        return result
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
def combine_test_data(X: pd.DataFrame, y: pd.Series, y_pred: pd.Series, id_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merge test features, actual targets, predictions and metadata into one frame.

    Parameters
    ----------
    X : pd.DataFrame
        Features dataframe
    y : pd.Series
        Actual target values (remaining_ngr)
    y_pred : pd.Series
        Predicted target values
    id_df : pd.DataFrame
        DataFrame containing metadata; its 'ngr_after_1_year' column is used

    Returns
    -------
    pd.DataFrame
        A copy of ``X`` (the input is not modified) with three extra columns,
        in this order:
        - 'ngr_after_1_year' from id_df
        - 'remaining_ngr' (actual values)
        - 'remaining_ngr_pred' (predicted values)

    Examples
    --------
    >>> df = combine_test_data(X, y, y_pred, id_df)
    >>> df.head()
    """
    combined = X.copy()

    # Insertion order of this dict fixes the order of the appended columns.
    extra_columns = {
        "ngr_after_1_year": id_df["ngr_after_1_year"],
        "remaining_ngr": y,
        "remaining_ngr_pred": y_pred,
    }
    for column_name, column_values in extra_columns.items():
        combined[column_name] = column_values

    return combined
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EVIP dynamic classification metrics.
|
|
3
|
+
|
|
4
|
+
This module provides recall-based metrics with false-positive budget constraints
|
|
5
|
+
for binary and multi-class premium classification use cases.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def recall_with_fp_cap(y_true, y_pred, fp_budget_rate=0.03017, alpha=2.0):
    """
    Compute recall with a penalty when the false-positive budget is exceeded.

    Standard recall is returned when the number of false positives stays within
    the allowed budget. Otherwise, recall is scaled down linearly according to
    how much the budget was exceeded.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth binary labels (0 or 1).
    y_pred : array-like of shape (n_samples,)
        Predicted binary labels (0 or 1).
    fp_budget_rate : float, default=0.03017
        Maximum allowed false positives as a fraction of the sample size
        (``fp_budget = fp_budget_rate * n_samples``).
    alpha : float, default=2.0
        Penalty multiplier applied to the FP budget excess ratio. The final
        penalty is capped at 1.0.

    Returns
    -------
    float
        Recall when ``fp <= fp_budget``, otherwise
        ``recall * (1 - min(alpha * excess_ratio, 1.0))`` where
        ``excess_ratio = (fp - fp_budget) / fp_budget``.
        When the budget is zero (or negative) and at least one false positive
        occurs, the excess ratio is unbounded and ``0.0`` is returned.

    Notes
    -----
    Recall is computed as ``tp / max(tp + fn, 1)``.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    fp_budget = fp_budget_rate * len(y_true)

    actual_positive = y_true == 1
    predicted_positive = y_pred == 1

    tp = int((predicted_positive & actual_positive).sum())
    fp = int((predicted_positive & (y_true == 0)).sum())
    fn = int(((y_pred == 0) & actual_positive).sum())

    # max(..., 1) keeps recall defined (0.0) when there are no positives.
    recall = tp / max(tp + fn, 1)

    if fp <= fp_budget:
        return recall

    if fp_budget <= 0:
        # No false positives are tolerated at all, so any false positive is
        # an unbounded overshoot: apply the full (capped) penalty instead of
        # dividing by zero.
        return 0.0

    excess_ratio = (fp - fp_budget) / fp_budget
    penalty = min(alpha * excess_ratio, 1.0)
    return recall * (1.0 - penalty)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def weighted_premium_recall_with_fp_cap(
    y_true, y_pred, premium_fpr_budget=0.20, alpha=2.0, w_class_2=2.0
):
    """
    Compute weighted premium recall with a penalty for excessive false-positive rate.

    Classes 1 and 2 are treated as premium targets, with class 2 weighted more
    heavily in recall. Any other true label counts as negative. A false
    positive is a negative sample predicted as premium (class 1 or 2).

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels (0 = non-premium, 1 or 2 = premium).
    y_pred : array-like of shape (n_samples,)
        Predicted labels (0, 1, or 2).
    premium_fpr_budget : float, default=0.20
        Maximum allowed false-positive rate among non-premium samples.
    alpha : float, default=2.0
        Penalty multiplier applied to the FPR excess ratio. The final penalty
        is capped at 1.0.
    w_class_2 : float, default=2.0
        Weight applied to true positives from class 2 in the weighted recall.

    Returns
    -------
    float
        Weighted premium recall when ``fpr <= premium_fpr_budget``, otherwise
        ``weighted_recall * (1 - min(alpha * excess_ratio, 1.0))`` where
        ``excess_ratio = (fpr - premium_fpr_budget) / premium_fpr_budget``.
        Returns ``0.0`` when there are no premium samples in ``y_true``, and
        when the budget is zero (or negative) while the false-positive rate
        is above it.

    Notes
    -----
    Weighted recall is computed as
    ``(tp_1 + tp_2 * w_class_2) / (n_1 + n_2 * w_class_2)``.
    A premium sample predicted as the *other* premium class is not a true
    positive for either class.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    true_is_1 = y_true == 1
    true_is_2 = y_true == 2

    tp_1 = int((true_is_1 & (y_pred == 1)).sum())
    tp_2 = int((true_is_2 & (y_pred == 2)).sum())
    n_1 = int(true_is_1.sum())
    n_2 = int(true_is_2.sum())

    max_score = n_1 + n_2 * w_class_2
    if max_score == 0:
        return 0.0

    weighted_recall = (tp_1 + tp_2 * w_class_2) / max_score

    is_premium_true = true_is_1 | true_is_2
    is_premium_pred = (y_pred == 1) | (y_pred == 2)
    fp = int((~is_premium_true & is_premium_pred).sum())
    n_negative = int((~is_premium_true).sum())
    # max(..., 1) keeps the rate defined (0.0) when there are no negatives.
    fpr = fp / max(n_negative, 1)

    if fpr <= premium_fpr_budget:
        return weighted_recall

    if premium_fpr_budget <= 0:
        # A zero budget tolerates no false positives, so any overshoot is
        # unbounded: apply the full (capped) penalty instead of dividing by zero.
        return 0.0

    excess_ratio = (fpr - premium_fpr_budget) / premium_fpr_budget
    penalty = min(alpha * excess_ratio, 1.0)
    return weighted_recall * (1.0 - penalty)
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NGR Metrics Calculation Module
|
|
3
|
+
|
|
4
|
+
This module provides functions for calculating various error metrics
|
|
5
|
+
for NGR (Net Gaming Revenue) predictions with business-optimal weighting.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Any, Dict, Optional
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def calculate_ngr_metrics(
    df: pd.DataFrame, late_stage_day: Optional[int] = None, late_stage_correction: Optional[float] = None
) -> Dict[str, Any]:
    """
    Calculate NGR metrics with business-optimal weighting.

    Actual and predicted values are first averaged per ``days_since_ftd``
    (days 365 and later are dropped); all metrics are then computed on those
    per-day means. Optionally applies a late-stage correction to the averaged
    predictions.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing prediction results with columns:
        - 'days_since_ftd': int, days since first time deposit
        - 'remaining_ngr': float, actual remaining NGR
        - 'remaining_ngr_pred': float, predicted remaining NGR
    late_stage_day : int, optional
        Day threshold after which to apply correction
    late_stage_correction : float, optional
        Correction factor reached at day 364. Predictions after
        ``late_stage_day`` are multiplied by a factor that moves linearly
        from 1.0 at ``late_stage_day`` to ``late_stage_correction`` at day 364.
        Ignored unless ``late_stage_day`` is also given.

    Returns
    -------
    dict
        Dictionary containing calculated metrics:
        - 'Standard MAE': Mean Absolute Error
        - 'Standard MAPE (%)': Mean Absolute Percentage Error
        - 'Standard ME': Mean Error (actual minus predicted)
        - 'Standard MPE (%)': Mean Percentage Error
        - 'Business Optimal MAE': Weighted MAE
        - 'Business Optimal MAPE (%)': Weighted MAPE
        - 'Business Optimal ME': Weighted ME
        - 'Business Optimal MPE (%)': Weighted MPE
        - 'mean_abs_error_by_bin': DataFrame with errors by period
        - 'mean_values': DataFrame with aggregated values by day
        - 'weights': Array of business weights by day

    Notes
    -----
    Business-optimal weights by day ranges:
    - Days 1-7: 0.05 (learning period - insufficient data)
    - Days 8-14: 0.85 (early signals emerging)
    - Days 15-45: 1.00 (SUPER CRITICAL - optimal intervention window)
    - Days 46-90: 0.90 (confirmation period - high value)
    - Days 91-180: 0.60 (established patterns - moderate value)
    - Days 181-270: 0.40 (mature behavior - operational value)
    - Days 271+: 0.30 (end-game precision - tactical value)

    Examples
    --------
    >>> metrics = calculate_ngr_metrics(
    ...     df,
    ...     late_stage_day=320,
    ...     late_stage_correction=0.95
    ... )
    >>> print(f"Business MAE: {metrics['Business Optimal MAE']:.2f}")
    """

    # Calculate mean values grouped by days_since_ftd
    mean_values = (
        df.groupby("days_since_ftd")
        .agg(
            {
                "remaining_ngr": "mean",
                "remaining_ngr_pred": "mean",
            }
        )
        .reset_index()
    )
    # .copy() makes the filtered frame independent, so the correction below
    # is a plain column assignment rather than a write into a slice.
    mean_values = mean_values[mean_values["days_since_ftd"] < 365].copy()

    # Apply late stage correction if specified
    if late_stage_day is not None and late_stage_correction is not None:
        max_days = 364 - late_stage_day
        # With late_stage_day >= 364 no remaining day lies after the
        # threshold, so there is nothing to correct (and max_days would be
        # a zero or negative divisor).
        if max_days > 0:
            days_array = mean_values["days_since_ftd"].values
            progress = (days_array - late_stage_day) / max_days

            correction_factor = np.where(
                days_array <= late_stage_day,
                1.0,  # Up to late_stage_day: no changes
                1.0 - (1.0 - late_stage_correction) * progress,  # Linear transition
            )
            mean_values["remaining_ngr_pred"] = mean_values["remaining_ngr_pred"] * correction_factor

    days = mean_values["days_since_ftd"].values.astype(np.float64)

    # Business-optimal weighting: upper day bound (inclusive) of each band,
    # followed by the weight of each band; the last weight covers days 271+.
    band_upper_days = np.array([7, 14, 45, 90, 180, 270])
    band_weights = np.array(
        [
            0.05,  # Learning period - insufficient data
            0.85,  # Early signals emerging
            1.00,  # SUPER CRITICAL - optimal intervention window
            0.90,  # Confirmation period - high value
            0.60,  # Established patterns - moderate value
            0.40,  # Mature behavior - operational value
            0.30,  # End-game precision - tactical value
        ]
    )
    # side="left" returns the first band whose upper bound is >= day, which
    # is exactly the chain "day <= 7, else day <= 14, ...".
    weights = band_weights[np.searchsorted(band_upper_days, days, side="left")]

    errors = mean_values["remaining_ngr"] - mean_values["remaining_ngr_pred"]
    abs_errors = np.abs(errors)

    # Define bins for error analysis by period
    bins = [0, 7, 14, 45, 90, 180, 270, 364]
    labels = ["1-7", "8-14", "15-45", "46-90", "91-180", "181-270", "271-364"]

    # Calculate absolute-scale metrics (weighted and unweighted)
    total_weight = np.sum(weights)
    business_optimal_tmae = np.sum(abs_errors * weights) / total_weight
    business_optimal_me = np.sum(errors * weights) / total_weight
    standard_mae = round(np.mean(abs_errors), 4)
    standard_me = round(np.mean(errors), 4)

    # Calculate percentage-based metrics
    # Avoid division by zero - use small epsilon for very small values
    epsilon = 1e-6
    safe_remaining_ngr = np.where(np.abs(mean_values["remaining_ngr"]) < epsilon, epsilon, mean_values["remaining_ngr"])

    errors_percentage = (errors / safe_remaining_ngr) * 100
    percentage_abs_errors = np.abs(errors / safe_remaining_ngr) * 100

    # Assign each day to a period bin
    day_bins = pd.cut(mean_values["days_since_ftd"], bins=bins, labels=labels, right=True, include_lowest=True)

    # Calculate mean errors by period (NaN for periods without any day)
    mean_abs_error_by_bin = pd.DataFrame(
        {
            "bin": labels,
            "mean_abs_error": [abs_errors[day_bins == label].mean() for label in labels],
            "mean_error": [errors[day_bins == label].mean() for label in labels],
            "mape": [percentage_abs_errors[day_bins == label].mean() for label in labels],
        }
    )

    # Calculate business-optimal percentage metrics
    business_optimal_mape = np.sum(percentage_abs_errors * weights) / total_weight
    standard_mape = np.mean(percentage_abs_errors)

    business_optimal_mpe = np.sum(errors_percentage * weights) / total_weight
    standard_mpe = np.mean(errors_percentage)

    # Return dict with all metrics
    return {
        "Standard MAE": standard_mae,
        "Standard MAPE (%)": round(standard_mape, 1),
        "Standard ME": standard_me,
        "Standard MPE (%)": round(standard_mpe, 1),
        "Business Optimal MAE": round(business_optimal_tmae, 2),
        "Business Optimal MAPE (%)": round(business_optimal_mape, 1),
        "Business Optimal ME": round(business_optimal_me, 2),
        "Business Optimal MPE (%)": round(business_optimal_mpe, 1),
        "mean_abs_error_by_bin": mean_abs_error_by_bin,
        "mean_values": mean_values,
        "weights": weights,
    }
|