utilsds-models 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,396 @@
1
+ """Data processing classes compatible with scikit-learn pipelines"""
2
+
3
+ from collections import Counter
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from sklearn.base import BaseEstimator, TransformerMixin
9
+ from sklearn.preprocessing import LabelEncoder
10
+
11
+
12
class ColumnCopyImputer(BaseEstimator, TransformerMixin):  # type: ignore[misc]
    """
    Imputes values by copying from other columns.

    For every ``target -> source`` pair, rows where the target column is null
    receive the value found in the source column of the same row. Pairs whose
    target or source column is absent from the input frame are skipped.

    Parameters
    ----------
    copy_mapping : dict
        Mapping: {'target_column': 'source_column'}
    """

    def __init__(self, copy_mapping: Dict[str, str]):
        self.copy_mapping = copy_mapping

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None) -> "ColumnCopyImputer":
        """Nothing is learned from the data; only marks the transformer as fitted."""
        self.is_fitted_ = True
        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """
        Return a copy of ``X`` with null targets filled from their source columns.

        The input frame is never modified.
        """
        X_transformed = X.copy()

        for target_col, source_col in self.copy_mapping.items():
            if target_col not in X_transformed.columns or source_col not in X_transformed.columns:
                continue

            null_mask = X_transformed[target_col].isnull()
            if not null_mask.any():
                # Nothing to fill for this pair.
                continue

            values_to_copy = X_transformed.loc[null_mask, source_col]

            if pd.api.types.is_integer_dtype(values_to_copy.dtype):
                # Integer sources (including nullable Int64) are converted to
                # float before assignment. float64 is used rather than float32:
                # float32 has a 24-bit mantissa and silently corrupts integers
                # above 2**24 (16_777_217 would become 16_777_216).
                values_to_copy = values_to_copy.astype("float64")

            X_transformed.loc[null_mask, target_col] = values_to_copy

        return X_transformed
44
+
45
+
46
class NullImputerWithFlags(BaseEstimator, TransformerMixin):  # type: ignore[misc]
    """
    Imputes null values and creates flag columns for selected columns.

    Parameters
    ----------
    columns : list
        List of columns to impute
    values : dict or any
        Values for imputation. If dict, maps columns to values.
        If single value, uses it for all columns.
    flag_columns : list, optional
        List of columns for which to create flags. A flag is 1 where the value
        was null before imputation and 0 otherwise.
    flag_suffix : str, default='_isnull_flag'
        Suffix for flag column names
    strategy : str, optional
        Imputation strategy: 'mean', 'median', 'mode' (overrides values)

    Raises
    ------
    ValueError
        If neither ``values`` nor ``strategy`` is given, or if ``strategy`` is
        not one of the supported names.
    """

    def __init__(
        self,
        columns: List[str],
        values: Union[Dict[str, Any], Any] = None,
        flag_columns: Optional[List[str]] = None,
        flag_suffix: str = "_isnull_flag",
        strategy: Optional[str] = None,
    ):

        self.columns = columns
        self.values = values
        # Stored exactly as passed. Replacing None/[] with a fresh list here
        # makes sklearn.base.clone fail its "constructor must not modify
        # parameters" identity check; the None case is resolved in transform.
        self.flag_columns = flag_columns
        self.flag_suffix = flag_suffix
        self.strategy = strategy
        self._fitted_values: Dict[str, Any] = {}

        # Validation
        if self.values is None and self.strategy is None:
            raise ValueError("Either 'values' or 'strategy' must be provided")

        if self.strategy and self.strategy not in ["mean", "median", "mode"]:
            raise ValueError("strategy must be one of: 'mean', 'median', 'mode'")

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None) -> "NullImputerWithFlags":
        """
        Learn values for imputation (if using strategy).

        Columns listed in ``columns`` but missing from ``X`` are skipped.
        """
        # Start from a clean slate so a refit never keeps statistics that were
        # learned from a previous dataset.
        self._fitted_values = {}

        if self.strategy is not None:
            for column in self.columns:
                if column not in X.columns:
                    continue
                if self.strategy == "mean":
                    self._fitted_values[column] = X[column].mean()
                elif self.strategy == "median":
                    self._fitted_values[column] = X[column].median()
                elif self.strategy == "mode":
                    mode_result = X[column].mode()
                    # An all-null column has no mode; 0 is used as the fallback.
                    self._fitted_values[column] = mode_result.iloc[0] if len(mode_result) > 0 else 0

        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """
        Impute values and create flags.

        Returns a copy of ``X``; the input frame is not modified.
        """
        X_transformed = X.copy()

        # 1. Create the flags FIRST (before imputation, otherwise they would all be 0).
        for column in self.flag_columns or []:
            if column in X_transformed.columns:
                flag_name = f"{column}{self.flag_suffix}"
                X_transformed[flag_name] = X_transformed[column].isnull().astype(int)

        # 2. Then impute the values.
        for column in self.columns:
            if column not in X_transformed.columns:
                continue

            # Determine the fill value: a fitted statistic wins over `values`.
            if self.strategy is not None and column in self._fitted_values:
                fill_value = self._fitted_values[column]
            elif isinstance(self.values, dict):
                if column not in self.values:
                    continue  # No value configured for this column
                fill_value = self.values[column]
            else:
                fill_value = self.values

            if fill_value is None:
                # Happens e.g. when a strategy is configured but this column was
                # not seen during fit. fillna(None) raises, so leave it as is.
                continue

            X_transformed[column] = X_transformed[column].fillna(fill_value)

        return X_transformed
135
+
136
+
137
class MaxMultiplierImputer(BaseEstimator, TransformerMixin):  # type: ignore[misc]
    """
    Imputes values based on maximum * multiplier from training data.
    Optionally creates flags marking which values were missing before imputation.

    Parameters
    ----------
    columns : list
        List of columns to impute
    multiplier : float, default=1.5
        Multiplier for maximum value (max * multiplier)
    flag_columns : list, optional
        List of columns for which to create flags. A flag is 1 where the value
        was null before imputation and 0 otherwise.
        If None, no flags are created
    flag_suffix : str, default='_flag'
        Suffix for flag column names
    fallback_value : float, default=365
        Fill value used for a column that has no non-null values during fit.
    """

    def __init__(
        self,
        columns: List[str],
        multiplier: float = 1.5,
        flag_columns: Optional[List[str]] = None,
        flag_suffix: str = "_flag",
        fallback_value: float = 365,
    ):

        self.columns = columns
        self.multiplier = multiplier
        # Stored exactly as passed. Replacing None/[] with a fresh list here
        # makes sklearn.base.clone fail its "constructor must not modify
        # parameters" identity check; the None case is resolved in transform.
        self.flag_columns = flag_columns
        self.flag_suffix = flag_suffix
        self.fallback_value = fallback_value
        self._max_values: Dict[str, float] = {}

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None) -> "MaxMultiplierImputer":
        """
        Learn maximum values from training data.

        Columns listed in ``columns`` but missing from ``X`` are skipped.
        """
        # Reset so a refit never keeps maxima learned from a previous dataset.
        self._max_values = {}

        for column in self.columns:
            if column not in X.columns:
                continue
            existing_values = X[column].dropna()
            if len(existing_values) > 0:
                self._max_values[column] = existing_values.max() * self.multiplier
            else:
                # No observed value to scale: use the configured fallback.
                self._max_values[column] = self.fallback_value

        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """
        Create flags and impute values as max * multiplier.

        Returns a copy of ``X``; the input frame is not modified.
        """
        X_transformed = X.copy()

        # 1. Create the flags FIRST (before imputation, otherwise they would all be 0).
        for column in self.flag_columns or []:
            if column in X_transformed.columns:
                flag_name = f"{column}{self.flag_suffix}"
                X_transformed[flag_name] = X_transformed[column].isnull().astype(int)

        # 2. Then impute the values for columns that were seen during fit.
        for column in self.columns:
            if column in X_transformed.columns and column in self._max_values:
                X_transformed[column] = X_transformed[column].fillna(self._max_values[column])

        return X_transformed
201
+
202
+
203
class SportsHybridEncoder(BaseEstimator, TransformerMixin):  # type: ignore[misc]
    """
    Encodes top N sports as binary features + aggregations for the rest.

    Each configured column is expected to hold one ``numpy.ndarray`` of sport
    names per row; any other cell value is treated as "no sports".

    Parameters
    ----------
    sport_columns : list, optional
        Columns to encode. Defaults to ``["dominant_sport", "first_ticket_sport"]``.
    top_n : int, default=5
        Number of most frequent sports that get their own binary column.
    """

    def __init__(self, sport_columns: Optional[List[str]] = None, top_n: int = 5):
        self.sport_columns = sport_columns or ["dominant_sport", "first_ticket_sport"]
        self.top_n = top_n
        self.sport_mappings_: Dict[str, List[str]] = {}

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None) -> "SportsHybridEncoder":
        """Learn the ``top_n`` most frequent sports of every configured column present in ``X``."""
        # Reset so a refit never keeps mappings learned from a previous dataset.
        self.sport_mappings_ = {}

        for col in self.sport_columns:
            if col not in X.columns:
                continue

            sport_counts: Counter = Counter()
            for sports_array in X[col]:
                if isinstance(sports_array, np.ndarray):
                    sport_counts.update(sports_array.tolist())

            top_sports = [sport for sport, _ in sport_counts.most_common(self.top_n)]
            self.sport_mappings_[col] = top_sports

            print(f"{col}: top {len(top_sports)} sports selected")

        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """
        Replace every fitted sport column with binary and aggregate features.

        For a column ``<prefix>_sport`` the output gains ``has_<prefix>_<sport>``
        for each top sport, ``num_<prefix>_sports``, ``num_<prefix>_niche`` and
        ``pct_<prefix>_popular``; the original column is dropped. Columns that
        are missing from ``X`` or were not seen during fit are left untouched.
        """
        X_result = X.copy()

        for col in self.sport_columns:
            # A column absent during fit has no mapping; skip it instead of
            # failing with a KeyError.
            if col not in X.columns or col not in self.sport_mappings_:
                continue

            prefix = col.replace("_sport", "")
            top_sports = self.sport_mappings_[col]

            # Binary indicators only for the top N sports.
            for sport in top_sports:
                safe_name = sport.lower().replace(" ", "_").replace("ł", "l").replace("ą", "a")
                col_name = f"has_{prefix}_{safe_name}"

                X_result[col_name] = X[col].apply(lambda x, s=sport: 1 if isinstance(x, np.ndarray) and s in x else 0)

            # Aggregations
            total_col = f"num_{prefix}_sports"
            niche_col = f"num_{prefix}_niche"

            X_result[total_col] = X[col].apply(lambda x: len(x) if isinstance(x, np.ndarray) else 0)

            X_result[niche_col] = X[col].apply(
                lambda x: sum(1 for s in x if s not in top_sports) if isinstance(x, np.ndarray) else 0
            )

            # Share of a row's sports that belong to the top N, 0 for rows with
            # no sports. Computed column-wise instead of a row-wise apply; rows
            # with a zero total become NaN in the division and are then set to 0.
            # The result is always float.
            total = X_result[total_col]
            popular_share = (total - X_result[niche_col]) / total.where(total > 0)
            X_result[f"pct_{prefix}_popular"] = popular_share.fillna(0)

            # Drop the original column
            X_result = X_result.drop(col, axis=1)

        return X_result
269
+
270
+
271
class LabelEncoderTransformer(BaseEstimator, TransformerMixin):  # type: ignore[misc]
    """
    Label encoding for categorical variables (for LightGBM).

    Values are converted to strings before encoding. Categories that were not
    seen during fit are encoded as -1.

    Parameters
    ----------
    columns : list
        Columns to encode. Columns missing from the data are skipped.
    """

    def __init__(self, columns: List[str]):
        self.columns = columns
        self.label_encoders_: Dict[str, LabelEncoder] = {}

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None) -> "LabelEncoderTransformer":
        """Fit one ``LabelEncoder`` per configured column present in ``X``."""
        # Reset so a refit never keeps encoders learned from a previous dataset.
        self.label_encoders_ = {}

        for col in self.columns:
            if col not in X.columns:
                continue

            le = LabelEncoder()
            le.fit(X[col].astype(str))
            self.label_encoders_[col] = le

            print(f"{col}: {len(le.classes_)} categories")

        return self

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """
        Return a copy of ``X`` with the fitted columns replaced by integer codes.

        The encoded columns are cast to pandas ``category`` dtype.
        """
        X_result = X.copy()

        for col in self.columns:
            if col not in X.columns or col not in self.label_encoders_:
                continue

            le = self.label_encoders_[col]

            # LabelEncoder codes are positions in `classes_`, so one dictionary
            # lookup per value gives the same codes as `le.transform` without
            # scanning `classes_` and calling the encoder once per row.
            code_by_class = {cls: code for code, cls in enumerate(le.classes_)}
            codes = X[col].astype(str).map(code_by_class)

            # Unknown categories map to NaN above; encode them as -1.
            codes = codes.fillna(-1).astype("int64")

            # Mark as category for LightGBM
            X_result[col] = codes.astype("category")

        return X_result
319
+
320
+
321
class DerivedFeatureCreator(BaseEstimator, TransformerMixin):  # type: ignore[misc]
    """
    Creates new columns that are derived from existing columns.

    Parameters
    ----------
    derived_features : dict
        Mapping: {'new_column': ('source_column', operation)}
        Operation can be:
        - 'divide_<number>' - divide by that number, e.g. 'divide_7', 'divide_30'
        - int, float or NumPy numeric scalar - divide by given number
    """

    def __init__(self, derived_features: Dict[str, Tuple[str, Any]]):
        self.derived_features = derived_features

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None) -> "DerivedFeatureCreator":
        """Nothing is learned from the data; only marks the transformer as fitted."""
        self.is_fitted_ = True
        return self

    @staticmethod
    def _resolve_divisor(operation: Any) -> Optional[Any]:
        """Return the divisor encoded by ``operation``, or None if it is not recognised."""
        # np.number covers NumPy scalars such as np.int64, which is not an `int`.
        if isinstance(operation, (int, float, np.number)):
            return operation

        if isinstance(operation, str) and operation.startswith("divide_"):
            try:
                divisor = float(operation[len("divide_"):])
            except ValueError:
                return None
            if not np.isfinite(divisor):
                # 'divide_nan' / 'divide_inf' parse as floats but are not meaningful.
                return None
            # Keep whole numbers as int so 'divide_7' divides by the integer 7.
            return int(divisor) if divisor.is_integer() else divisor

        return None

    def transform(self, X: pd.DataFrame, y: Optional[Any] = None) -> pd.DataFrame:
        """
        Return a copy of ``X`` with the derived columns added.

        Entries whose source column is missing or whose operation is not
        recognised are skipped with a printed warning.
        """
        X_transformed = X.copy()

        for new_col, (source_col, operation) in self.derived_features.items():
            if source_col not in X_transformed.columns:
                print(f"WARNING: Source column {source_col!r} not found, skipping {new_col!r}")
                continue

            divisor = self._resolve_divisor(operation)
            if divisor is None:
                print(f"WARNING: Unknown operation {operation!r} for column {new_col!r}")
                continue

            X_transformed[new_col] = X_transformed[source_col] / divisor

        return X_transformed
360
+
361
+
362
def combine_test_data(X: pd.DataFrame, y: pd.Series, y_pred: pd.Series, id_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build one DataFrame holding test features, metadata, targets and predictions.

    The input frame ``X`` is copied, so it is never modified. The extra columns
    are attached by plain column assignment, which means pandas objects are
    aligned on the index of ``X`` while array-like inputs are attached by
    position.

    Parameters
    ----------
    X : pd.DataFrame
        Features dataframe
    y : pd.Series
        Actual target values (remaining_ngr)
    y_pred : pd.Series
        Predicted target values
    id_df : pd.DataFrame
        DataFrame containing metadata; its 'ngr_after_1_year' column is used

    Returns
    -------
    pd.DataFrame
        Copy of ``X`` followed by three extra columns, in this order:
        - 'ngr_after_1_year' from id_df
        - 'remaining_ngr' (actual values)
        - 'remaining_ngr_pred' (predicted values)

    Examples
    --------
    >>> df = combine_test_data(X, y, y_pred, id_df)
    >>> df.head()
    """
    combined = X.copy()

    # Insertion order of this dict fixes the order of the appended columns.
    extra_columns = {
        "ngr_after_1_year": id_df["ngr_after_1_year"],
        "remaining_ngr": y,
        "remaining_ngr_pred": y_pred,
    }
    for column_name, column_values in extra_columns.items():
        combined[column_name] = column_values

    return combined
@@ -0,0 +1,124 @@
1
+ """
2
+ EVIP dynamic classification metrics.
3
+
4
+ This module provides recall-based metrics with false-positive budget constraints
5
+ for binary and multi-class premium classification use cases.
6
+ """
7
+
8
+ import numpy as np
9
+
10
+
11
def recall_with_fp_cap(y_true, y_pred, fp_budget_rate=0.03017, alpha=2.0):
    """
    Compute recall with a penalty when the false-positive budget is exceeded.

    Standard recall is returned when the number of false positives stays within
    the allowed budget. Otherwise, recall is scaled down linearly according to
    how much the budget was exceeded.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth binary labels (0 or 1).
    y_pred : array-like of shape (n_samples,)
        Predicted binary labels (0 or 1).
    fp_budget_rate : float, default=0.03017
        Maximum allowed false positives as a fraction of the sample size
        (``fp_budget = fp_budget_rate * n_samples``).
    alpha : float, default=2.0
        Penalty multiplier applied to the FP budget excess ratio. The final
        penalty is capped at 1.0.

    Returns
    -------
    float
        Recall when ``fp <= fp_budget``, otherwise
        ``recall * (1 - min(alpha * excess_ratio, 1.0))`` where
        ``excess_ratio = (fp - fp_budget) / fp_budget``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of samples.

    Notes
    -----
    Recall is computed as ``tp / max(tp + fn, 1)``.

    A zero budget makes the excess ratio unbounded, so any false positive
    receives the maximum penalty (1.0) when ``alpha > 0``.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # Without this check a length-1 input would broadcast silently.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of samples, got {y_true.size} and {y_pred.size}"
        )

    fp_budget = fp_budget_rate * len(y_true)

    tp = int(((y_pred == 1) & (y_true == 1)).sum())
    fp = int(((y_pred == 1) & (y_true == 0)).sum())
    fn = int(((y_pred == 0) & (y_true == 1)).sum())

    recall = tp / max(tp + fn, 1)

    if fp <= fp_budget:
        return recall

    if fp_budget <= 0:
        # The excess ratio would divide by zero; treat it as unbounded.
        penalty = 1.0 if alpha > 0 else 0.0
    else:
        excess_ratio = (fp - fp_budget) / fp_budget
        penalty = min(alpha * excess_ratio, 1.0)
    return recall * (1.0 - penalty)
60
+
61
+
62
def weighted_premium_recall_with_fp_cap(
    y_true, y_pred, premium_fpr_budget=0.20, alpha=2.0, w_class_2=2.0
):
    """
    Compute weighted premium recall with a penalty for excessive false-positive rate.

    Classes 1 and 2 are treated as premium targets, with class 2 weighted more
    heavily in recall. Class 0 is the negative class. A false positive is any
    sample with true label 0 predicted as premium (class 1 or 2).

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels (0 = non-premium, 1 or 2 = premium).
    y_pred : array-like of shape (n_samples,)
        Predicted labels (0, 1, or 2).
    premium_fpr_budget : float, default=0.20
        Maximum allowed false-positive rate among non-premium samples.
    alpha : float, default=2.0
        Penalty multiplier applied to the FPR excess ratio. The final penalty
        is capped at 1.0.
    w_class_2 : float, default=2.0
        Weight applied to true positives from class 2 in the weighted recall.

    Returns
    -------
    float
        Weighted premium recall when ``fpr <= premium_fpr_budget``, otherwise
        ``weighted_recall * (1 - min(alpha * excess_ratio, 1.0))`` where
        ``excess_ratio = (fpr - premium_fpr_budget) / premium_fpr_budget``.
        Returns ``0.0`` when there are no premium samples in ``y_true``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of samples.

    Notes
    -----
    Weighted recall is computed as
    ``(tp_1 + tp_2 * w_class_2) / (n_1 + n_2 * w_class_2)``.

    A zero budget makes the excess ratio unbounded, so any false positive
    receives the maximum penalty (1.0) when ``alpha > 0``.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # Without this check a length-1 input would broadcast silently.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of samples, got {y_true.size} and {y_pred.size}"
        )

    tp_1 = int(((y_true == 1) & (y_pred == 1)).sum())
    tp_2 = int(((y_true == 2) & (y_pred == 2)).sum())
    n_1 = int((y_true == 1).sum())
    n_2 = int((y_true == 2).sum())

    max_score = n_1 + n_2 * w_class_2
    if max_score == 0:
        return 0.0

    weighted_recall = (tp_1 + tp_2 * w_class_2) / max_score

    is_premium_true = (y_true == 1) | (y_true == 2)
    is_premium_pred = (y_pred == 1) | (y_pred == 2)
    fp = int((~is_premium_true & is_premium_pred).sum())
    n_negative = int((~is_premium_true).sum())
    fpr = fp / max(n_negative, 1)

    if fpr <= premium_fpr_budget:
        return weighted_recall

    if premium_fpr_budget <= 0:
        # The excess ratio would divide by zero; treat it as unbounded.
        penalty = 1.0 if alpha > 0 else 0.0
    else:
        excess_ratio = (fpr - premium_fpr_budget) / premium_fpr_budget
        penalty = min(alpha * excess_ratio, 1.0)
    return weighted_recall * (1.0 - penalty)
@@ -0,0 +1,179 @@
1
+ """
2
+ NGR Metrics Calculation Module
3
+
4
+ This module provides functions for calculating various error metrics
5
+ for NGR (Net Gaming Revenue) predictions with business-optimal weighting.
6
+ """
7
+
8
+ from typing import Any, Dict, Optional
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+
13
+
14
def calculate_ngr_metrics(
    df: pd.DataFrame, late_stage_day: Optional[int] = None, late_stage_correction: Optional[float] = None
) -> Dict[str, Any]:
    """
    Calculate NGR metrics with business-optimal weighting.

    Actual and predicted values are first averaged per ``days_since_ftd``
    (days 365 and later are dropped); every metric is then computed on those
    per-day means. Optionally applies a late-stage correction to predictions.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing prediction results with columns:
        - 'days_since_ftd': int, days since first time deposit
        - 'remaining_ngr': float, actual remaining NGR
        - 'remaining_ngr_pred': float, predicted remaining NGR
    late_stage_day : int, optional
        Day threshold after which to apply correction
    late_stage_correction : float, optional
        Correction factor to apply (0.0-1.0) linearly after late_stage_day.
        The multiplier applied to predictions goes from 1.0 at
        ``late_stage_day`` to ``late_stage_correction`` at day 364. The
        correction is applied only when both arguments are given and
        ``late_stage_day < 364``.

    Returns
    -------
    dict
        Dictionary containing calculated metrics:
        - 'Standard MAE': Mean Absolute Error
        - 'Standard MAPE (%)': Mean Absolute Percentage Error
        - 'Standard ME': Mean Error
        - 'Standard MPE (%)': Mean Percentage Error
        - 'Business Optimal MAE': Weighted MAE
        - 'Business Optimal MAPE (%)': Weighted MAPE
        - 'Business Optimal ME': Weighted ME
        - 'Business Optimal MPE (%)': Weighted MPE
        - 'mean_abs_error_by_bin': DataFrame with errors by period
        - 'mean_values': DataFrame with aggregated values by day
        - 'weights': Array of business weights by day

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.

    Notes
    -----
    Errors are computed as ``actual - predicted``.

    Business-optimal weights by day ranges:
    - Days 1-7: 0.05 (learning period - insufficient data)
    - Days 8-14: 0.85 (early signals emerging)
    - Days 15-45: 1.00 (SUPER CRITICAL - optimal intervention window)
    - Days 46-90: 0.90 (confirmation period - high value)
    - Days 91-180: 0.60 (established patterns - moderate value)
    - Days 181-270: 0.40 (mature behavior - operational value)
    - Days 271+: 0.30 (end-game precision - tactical value)

    Examples
    --------
    >>> metrics = calculate_ngr_metrics(
    ...     df,
    ...     late_stage_day=320,
    ...     late_stage_correction=0.95
    ... )
    >>> print(f"Business MAE: {metrics['Business Optimal MAE']:.2f}")
    """
    required_columns = ("days_since_ftd", "remaining_ngr", "remaining_ngr_pred")
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        raise KeyError(f"Missing required columns: {missing_columns}")

    # Calculate mean values grouped by days_since_ftd
    mean_values = (
        df.groupby("days_since_ftd")
        .agg(
            {
                "remaining_ngr": "mean",
                "remaining_ngr_pred": "mean",
            }
        )
        .reset_index()
    )
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not trigger pandas' SettingWithCopyWarning.
    mean_values = mean_values[mean_values["days_since_ftd"] < 365].copy()

    # Apply late stage correction if specified
    if late_stage_day is not None and late_stage_correction is not None:
        max_days = 364 - late_stage_day
        # With late_stage_day >= 364 no retained integer day lies past the
        # threshold and the ramp length would be zero or negative, so skip.
        if max_days > 0:
            days_array = mean_values["days_since_ftd"].values
            days_from_start = days_array - late_stage_day

            correction_factor = np.where(
                days_array <= late_stage_day,
                1.0,  # Before late_stage_day: no changes
                1.0 - (1.0 - late_stage_correction) * (days_from_start / max_days),  # Linear transition
            )
            mean_values["remaining_ngr_pred"] = mean_values["remaining_ngr_pred"] * correction_factor

    days = mean_values["days_since_ftd"].values.astype(np.float64)

    # Business-optimal weighting. Upper day bound (inclusive) of each tier and
    # its weight; np.select takes the first tier whose bound is not exceeded,
    # anything past the last bound gets the end-game weight.
    tier_upper_bounds = [7, 14, 45, 90, 180, 270]
    tier_weights = [
        0.05,  # Learning period - insufficient data
        0.85,  # Early signals emerging
        1.00,  # SUPER CRITICAL - optimal intervention window
        0.90,  # Confirmation period - high value
        0.60,  # Established patterns - moderate value
        0.40,  # Mature behavior - operational value
    ]
    end_game_weight = 0.30  # End-game precision - tactical value
    weights = np.select([days <= bound for bound in tier_upper_bounds], tier_weights, default=end_game_weight)

    errors = mean_values["remaining_ngr"] - mean_values["remaining_ngr_pred"]
    abs_errors = np.abs(errors)

    # Define bins for error analysis by period (same edges as the weight tiers)
    bins = [0] + tier_upper_bounds + [364]
    labels = ["1-7", "8-14", "15-45", "46-90", "91-180", "181-270", "271-364"]

    # Calculate absolute-value metrics
    total_weight = np.sum(weights)
    business_optimal_tmae = np.sum(abs_errors * weights) / total_weight
    business_optimal_me = np.sum(errors * weights) / total_weight
    standard_mae = round(np.mean(abs_errors), 4)
    standard_me = round(np.mean(errors), 4)

    # Calculate percentage-based metrics
    # Avoid division by zero - use small epsilon for very small values
    epsilon = 1e-6
    safe_remaining_ngr = np.where(np.abs(mean_values["remaining_ngr"]) < epsilon, epsilon, mean_values["remaining_ngr"])

    errors_percentage = (errors / safe_remaining_ngr) * 100
    percentage_abs_errors = np.abs(errors_percentage)

    # Assign each day to a period bin
    day_bins = pd.cut(mean_values["days_since_ftd"], bins=bins, labels=labels, right=True, include_lowest=True)

    # Calculate mean errors by period
    mean_abs_error_by_bin = pd.DataFrame(
        {
            "bin": labels,
            "mean_abs_error": [abs_errors[day_bins == label].mean() for label in labels],
            "mean_error": [errors[day_bins == label].mean() for label in labels],
            "mape": [percentage_abs_errors[day_bins == label].mean() for label in labels],
        }
    )

    # Calculate business-optimal percentage metrics
    business_optimal_mape = np.sum(percentage_abs_errors * weights) / total_weight
    standard_mape = np.mean(percentage_abs_errors)

    business_optimal_mpe = np.sum(errors_percentage * weights) / total_weight
    standard_mpe = np.mean(errors_percentage)

    # Return dict with all metrics
    return {
        "Standard MAE": standard_mae,
        "Standard MAPE (%)": round(standard_mape, 1),
        "Standard ME": standard_me,
        "Standard MPE (%)": round(standard_mpe, 1),
        "Business Optimal MAE": round(business_optimal_tmae, 2),
        "Business Optimal MAPE (%)": round(business_optimal_mape, 1),
        "Business Optimal ME": round(business_optimal_me, 2),
        "Business Optimal MPE (%)": round(business_optimal_mpe, 1),
        "mean_abs_error_by_bin": mean_abs_error_by_bin,
        "mean_values": mean_values,
        "weights": weights,
    }