ml-analytics-tools 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ml_analytics/__init__.py +53 -0
- ml_analytics/aws_auth.py +169 -0
- ml_analytics/cli.py +58 -0
- ml_analytics/data_connector.py +2615 -0
- ml_analytics/gsheet_connector.py +1646 -0
- ml_analytics/model_manager.py +1208 -0
- ml_analytics/model_tools.py +990 -0
- ml_analytics/s3_connector.py +1381 -0
- ml_analytics/slack_connector.py +637 -0
- ml_analytics/tunnel_manager.py +277 -0
- ml_analytics/utils.py +673 -0
- ml_analytics_tools-0.2.0.dist-info/METADATA +231 -0
- ml_analytics_tools-0.2.0.dist-info/RECORD +17 -0
- ml_analytics_tools-0.2.0.dist-info/WHEEL +5 -0
- ml_analytics_tools-0.2.0.dist-info/entry_points.txt +4 -0
- ml_analytics_tools-0.2.0.dist-info/licenses/LICENSE +21 -0
- ml_analytics_tools-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,990 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Set of utility functions for training/testing models
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import matplotlib.dates as mdates
|
|
6
|
+
import matplotlib.pyplot as plt
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import seaborn as sns
|
|
10
|
+
import shap
|
|
11
|
+
from catboost import CatBoostClassifier, CatBoostRegressor, EFeaturesSelectionAlgorithm, EShapCalcType, Pool
|
|
12
|
+
from lifelines import KaplanMeierFitter
|
|
13
|
+
from lifelines.utils import concordance_index
|
|
14
|
+
from sklearn.metrics import (
|
|
15
|
+
auc,
|
|
16
|
+
brier_score_loss,
|
|
17
|
+
f1_score,
|
|
18
|
+
precision_recall_curve,
|
|
19
|
+
precision_score,
|
|
20
|
+
recall_score,
|
|
21
|
+
roc_auc_score,
|
|
22
|
+
)
|
|
23
|
+
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold, train_test_split
|
|
24
|
+
|
|
25
|
+
from ml_analytics.utils import get_logger, log_and_raise_error
|
|
26
|
+
|
|
27
|
+
logger = get_logger("modeling-tools")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def prepare_catboost_data(
|
|
31
|
+
df: pd.DataFrame,
|
|
32
|
+
cat_features: list[str],
|
|
33
|
+
feature_list: list[str] | None = None,
|
|
34
|
+
) -> pd.DataFrame:
|
|
35
|
+
"""
|
|
36
|
+
Prepare a DataFrame for CatBoost training or inference.
|
|
37
|
+
|
|
38
|
+
Runs ``infer_objects`` to downcast object dtypes, then converts every
|
|
39
|
+
categorical column to ``str`` so CatBoost never receives Python ``None``
|
|
40
|
+
or ``np.nan`` values in categorical columns (which would raise an error).
|
|
41
|
+
Numeric columns are left untouched.
|
|
42
|
+
|
|
43
|
+
Parameters
|
|
44
|
+
----------
|
|
45
|
+
df : pd.DataFrame
|
|
46
|
+
Input DataFrame (will be copied internally).
|
|
47
|
+
cat_features : list[str]
|
|
48
|
+
Columns to treat as categorical.
|
|
49
|
+
feature_list : list[str], optional
|
|
50
|
+
Subset of columns to check for missing values (for logging only).
|
|
51
|
+
If None, all columns are checked.
|
|
52
|
+
|
|
53
|
+
Returns
|
|
54
|
+
-------
|
|
55
|
+
pd.DataFrame
|
|
56
|
+
Pre-processed copy of ``df``.
|
|
57
|
+
"""
|
|
58
|
+
df = df.copy()
|
|
59
|
+
df = df.infer_objects(copy=False)
|
|
60
|
+
|
|
61
|
+
check_cols = feature_list if feature_list is not None else df.columns.tolist()
|
|
62
|
+
missing = df[check_cols].isnull().sum()
|
|
63
|
+
n_missing = missing[missing > 0].shape[0]
|
|
64
|
+
logger.info(f"Features with missing values: {n_missing}")
|
|
65
|
+
|
|
66
|
+
for col in cat_features:
|
|
67
|
+
if col in df.columns:
|
|
68
|
+
df[col] = df[col].astype(str)
|
|
69
|
+
|
|
70
|
+
return df
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def make_catboost_pool(
    df: pd.DataFrame,
    feature_list: list[str],
    cat_features: list[str],
    label=None,
    **pool_kwargs,
) -> Pool:
    """
    Create a CatBoost ``Pool`` from a DataFrame, preprocessing it first.

    The frame is run through :func:`prepare_catboost_data` before the pool
    is built, so categorical columns are already cast to ``str`` when
    CatBoost sees them.

    Parameters
    ----------
    df : pd.DataFrame
        Source DataFrame; ``prepare_catboost_data`` works on a copy of it.
    feature_list : list[str]
        Feature columns placed in the pool, in this order.
    cat_features : list[str]
        Names of categorical columns. Only those that also appear in
        ``feature_list`` are declared to CatBoost.
    label : array-like or str, optional
        Target values. A string is interpreted as a column name and the
        column is read from ``df`` before preprocessing.
    **pool_kwargs :
        Extra keyword arguments passed straight to ``catboost.Pool``
        (e.g. ``group_id``, ``weight``).

    Returns
    -------
    catboost.Pool
    """
    # Resolve a column-name label against the caller's frame.
    target = df[label] if isinstance(label, str) else label

    prepared = prepare_catboost_data(df, cat_features=cat_features, feature_list=feature_list)

    # Categorical columns are passed to CatBoost as positions within ``feature_list``.
    cat_positions = [feature_list.index(name) for name in cat_features if name in feature_list]

    features = prepared[feature_list]
    return Pool(
        data=features,
        label=target,
        feature_names=feature_list,
        cat_features=cat_positions,
        **pool_kwargs,
    )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def get_features(df, target_col=None):
    """
    Split the columns of a DataFrame into categorical and numerical features.

    Columns with dtype ``object`` or ``category`` are reported as
    categorical; columns with dtype ``int32``, ``int64``, ``float32`` or
    ``float64`` are reported as numerical. Columns of any other dtype
    (e.g. ``bool``) appear in neither list.

    Args:
        df (pd.DataFrame): DataFrame to inspect. It is not modified.
        target_col (str, optional): Column to exclude from both lists. A name
            that is not present in ``df`` is ignored. Defaults to None.

    Returns:
        tuple: ``(categorical_features, numerical_features)``, two lists of
        column names.
    """
    frame = df if target_col is None else df.drop(columns=[target_col], errors="ignore")

    categorical_dtypes = ["object", "category"]
    numerical_dtypes = ["int32", "int64", "float64", "float32"]

    categorical = frame.select_dtypes(include=categorical_dtypes).columns.tolist()
    numerical = frame.select_dtypes(include=numerical_dtypes).columns.tolist()
    return categorical, numerical
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def get_balanced_accuracy(y_true, y_pred):
    """
    Return the balanced accuracy for binary labels encoded as 0 and 1.

    Computed as the mean of the recall for class 0 and the recall for class 1.

    Args:
        y_true (array-like): True binary labels.
        y_pred (array-like): Predicted binary labels.

    Returns:
        float: Average of the two per-class recalls.
    """
    recall_negative = recall_score(y_true, y_pred, pos_label=0)
    recall_positive = recall_score(y_true, y_pred, pos_label=1)
    return (recall_negative + recall_positive) / 2
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def pr_auc_score(y_true, y_pred_proba):
    """
    Return the area under the precision-recall curve.

    The curve comes from ``precision_recall_curve`` and the area is
    integrated with ``auc`` using recall as the x-axis.

    Args:
        y_true (array-like): True binary labels.
        y_pred_proba (array-like): Predicted scores for the positive class.

    Returns:
        float: Area under the precision-recall curve.
    """
    precision_values, recall_values, _thresholds = precision_recall_curve(y_true, y_pred_proba)
    area = auc(recall_values, precision_values)
    return area
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def brier_score(y_true, y_pred_proba):
    """
    Return the Brier score for binary classification.

    Thin wrapper around ``sklearn.metrics.brier_score_loss``. The Brier score
    is the mean squared difference between the predicted probability and the
    observed outcome. Lower is better: 0 is a perfect score and 1 is the
    worst possible one; always predicting 0.5 yields 0.25.

    Args:
        y_true (array-like): True binary labels (0 or 1).
        y_pred_proba (array-like): Predicted probabilities for the positive class.

    Returns:
        float: Brier score (lower is better).
    """
    score = brier_score_loss(y_true, y_pred_proba)
    return score
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def expected_calibration_error(y_true, y_pred_proba, n_bins=10):
|
|
169
|
+
"""
|
|
170
|
+
Calculate the Expected Calibration Error (ECE) for binary classification.
|
|
171
|
+
|
|
172
|
+
ECE measures the difference between predicted probabilities and actual outcomes
|
|
173
|
+
across different confidence bins. It indicates how well-calibrated the model's
|
|
174
|
+
probability estimates are.
|
|
175
|
+
|
|
176
|
+
Args:
|
|
177
|
+
y_true (array-like): True binary labels (0 or 1).
|
|
178
|
+
y_pred_proba (array-like): Predicted probabilities for the positive class.
|
|
179
|
+
n_bins (int): Number of bins to use for calibration curve. Default is 10.
|
|
180
|
+
|
|
181
|
+
Returns:
|
|
182
|
+
float: Expected Calibration Error (lower is better, 0 is perfect calibration).
|
|
183
|
+
"""
|
|
184
|
+
try:
|
|
185
|
+
# Calculate bin boundaries
|
|
186
|
+
bin_boundaries = np.linspace(0, 1, n_bins + 1)
|
|
187
|
+
bin_lowers = bin_boundaries[:-1]
|
|
188
|
+
bin_uppers = bin_boundaries[1:]
|
|
189
|
+
|
|
190
|
+
# Calculate ECE
|
|
191
|
+
ece = 0.0
|
|
192
|
+
for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=True):
|
|
193
|
+
# Find samples in this bin
|
|
194
|
+
in_bin = (y_pred_proba > bin_lower) & (y_pred_proba <= bin_upper)
|
|
195
|
+
prop_in_bin = in_bin.mean()
|
|
196
|
+
|
|
197
|
+
if prop_in_bin > 0:
|
|
198
|
+
# Calculate accuracy and confidence for this bin
|
|
199
|
+
accuracy_in_bin = y_true[in_bin].mean() if in_bin.sum() > 0 else 0
|
|
200
|
+
avg_confidence_in_bin = y_pred_proba[in_bin].mean() if in_bin.sum() > 0 else 0
|
|
201
|
+
|
|
202
|
+
# Add weighted calibration error for this bin
|
|
203
|
+
ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
|
|
204
|
+
|
|
205
|
+
return ece
|
|
206
|
+
|
|
207
|
+
except Exception as e:
|
|
208
|
+
logger.warning(f"Error calculating ECE: {e}. Returning NaN.")
|
|
209
|
+
return np.nan
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def mcc_score(y_true, y_pred):
    """
    Calculate Matthews correlation coefficient (MCC) for binary classification.

    Labels are expected to be encoded as 0 / 1 (booleans work as well).
    Inputs are converted to flat NumPy arrays, so lists are accepted and
    pandas objects are compared by position rather than by index.

    Args:
        y_true (array-like): True binary labels.
        y_pred (array-like): Predicted binary labels.

    Returns:
        float: Matthews correlation coefficient in [-1, 1]. Returns 0.0 when
        the coefficient is undefined (any row or column of the confusion
        matrix sums to zero).
    """
    truth = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    # Confusion-matrix cells as Python ints: the product of the four marginal
    # sums below easily exceeds the int64 range on large datasets, and
    # fixed-width NumPy integers would silently wrap around.
    tp = int(np.count_nonzero((truth == 1) & (predicted == 1)))
    tn = int(np.count_nonzero((truth == 0) & (predicted == 0)))
    fp = int(np.count_nonzero((truth == 0) & (predicted == 1)))
    fn = int(np.count_nonzero((truth == 1) & (predicted == 0)))

    numerator = (tp * tn) - (fp * fn)
    denominator_squared = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)

    if denominator_squared == 0:
        return 0.0

    return float(numerator / np.sqrt(float(denominator_squared)))
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def survival_mae(observed_time, event_indicator, predicted_time):
    """
    Calculate a Mean Absolute Error (MAE) suitable for survival data,
    taking censored observations into account.

    Per-sample errors are defined as follows:

    * event observed: ``|observed_time - predicted_time|``;
    * censored and predicted earlier than the censoring time:
      ``observed_time - predicted_time`` (the prediction is known to be at
      least that much too early);
    * censored and predicted at or after the censoring time: ``0``.

    All inputs are converted to flat NumPy arrays, so samples are matched
    by position and pandas index alignment cannot interfere.

    Args:
        observed_time (array-like): Observed (event or censoring) times.
        event_indicator (array-like): 1 / True if the event was observed,
            0 / False if the observation is censored.
        predicted_time (array-like): Predicted event times.

    Returns:
        float: The mean of the per-sample errors.
    """
    observed = np.asarray(observed_time, dtype=float).ravel()
    predicted = np.asarray(predicted_time, dtype=float).ravel()
    events = np.asarray(event_indicator).ravel()

    # ``== 1`` and ``== 0`` also match boolean True / False element-wise.
    event_mask = events == 1
    predicted_early_mask = (events == 0) & (predicted < observed)

    errors = np.zeros_like(observed, dtype=float)
    errors[event_mask] = np.abs(observed[event_mask] - predicted[event_mask])
    errors[predicted_early_mask] = observed[predicted_early_mask] - predicted[predicted_early_mask]

    return np.mean(errors)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def get_metrics(y_obs, y_pred, y_prob, prefix=None):
    """
    Compute the binary-classification metrics used across this module.

    Args:
        y_obs (array-like): Observed binary labels.
        y_pred (array-like): Predicted binary labels.
        y_prob (array-like): Predicted probabilities for the positive class.
        prefix (str, optional): When given, every key is prefixed with
            ``"<prefix>_"``. Defaults to None (no prefix).

    Returns:
        dict: Metric name -> value, with the keys ``balanced_accuracy``,
        ``mcc``, ``precision``, ``recall``, ``f1``, ``pr_auc``, ``roc_auc``,
        ``brier_score`` and ``ece`` (each optionally prefixed).
    """
    tag = "" if prefix is None else prefix + "_"

    # Label-based metrics first, then probability-based ones.
    return {
        tag + "balanced_accuracy": get_balanced_accuracy(y_obs, y_pred),
        tag + "mcc": mcc_score(y_obs, y_pred),
        tag + "precision": precision_score(y_obs, y_pred, zero_division=0),
        tag + "recall": recall_score(y_obs, y_pred, zero_division=0),
        tag + "f1": f1_score(y_obs, y_pred, zero_division=0),
        tag + "pr_auc": pr_auc_score(y_obs, y_prob),
        tag + "roc_auc": roc_auc_score(y_obs, y_prob),
        tag + "brier_score": brier_score(y_obs, y_prob),
        tag + "ece": expected_calibration_error(y_obs, y_prob),
    }
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def get_performance(data, target_col, pred_col, prob_col, grouping_cols=None, test_size_minimum_per_group=50):
|
|
284
|
+
"""
|
|
285
|
+
Calculates performance metrics for each combination of grouping columns.
|
|
286
|
+
|
|
287
|
+
Args:
|
|
288
|
+
data (pd.DataFrame): DataFrame containing target, predictions, probabilities, and grouping columns.
|
|
289
|
+
target_col (str): Name of the target column.
|
|
290
|
+
pred_col (str): Name of the prediction column (0 or 1).
|
|
291
|
+
prob_col (str): Name of the probability column (probability of class 1).
|
|
292
|
+
grouping_cols (list, optional): List of column names to group by.
|
|
293
|
+
Defaults to ['product', 'event_type', 'country_code'].
|
|
294
|
+
test_size_minimum_per_group (int, optional): Minimum number of observations required in each group to calculate metrics.
|
|
295
|
+
Defaults to 50.
|
|
296
|
+
|
|
297
|
+
Returns:
|
|
298
|
+
pd.DataFrame: DataFrame with grouping columns, number of observations, and performance metrics for each group.
|
|
299
|
+
""" # noqa: E501
|
|
300
|
+
|
|
301
|
+
data = data.copy()
|
|
302
|
+
if grouping_cols is None:
|
|
303
|
+
data["fake_group"] = 1
|
|
304
|
+
grouping_cols = ["fake_group"]
|
|
305
|
+
else:
|
|
306
|
+
grouping_cols = [col for col in grouping_cols if col in data.columns]
|
|
307
|
+
if not grouping_cols:
|
|
308
|
+
logger.error("No valid grouping columns found!")
|
|
309
|
+
|
|
310
|
+
results = []
|
|
311
|
+
for name, group in data.groupby(grouping_cols):
|
|
312
|
+
if not isinstance(name, tuple):
|
|
313
|
+
name = (name,)
|
|
314
|
+
n_observations = len(group)
|
|
315
|
+
|
|
316
|
+
# Skip group if too small early on
|
|
317
|
+
if n_observations == 0:
|
|
318
|
+
logger.warning(f"Skipping empty group {name}.")
|
|
319
|
+
continue
|
|
320
|
+
|
|
321
|
+
if n_observations > test_size_minimum_per_group:
|
|
322
|
+
y_true = group[target_col].values.ravel()
|
|
323
|
+
y_pred = group[pred_col].values.ravel()
|
|
324
|
+
y_prob = group[prob_col].values.ravel()
|
|
325
|
+
|
|
326
|
+
tp = ((y_true == 1) & (y_pred == 1)).sum() / n_observations
|
|
327
|
+
tn = ((y_true == 0) & (y_pred == 0)).sum() / n_observations
|
|
328
|
+
fp = ((y_true == 0) & (y_pred == 1)).sum() / n_observations
|
|
329
|
+
fn = ((y_true == 1) & (y_pred == 0)).sum() / n_observations
|
|
330
|
+
|
|
331
|
+
# Calculate rank correlation
|
|
332
|
+
rank_correlation = np.nan # Default to NaN
|
|
333
|
+
binning_agg = pd.DataFrame(columns=["bin", "target_rate"])
|
|
334
|
+
|
|
335
|
+
try:
|
|
336
|
+
bins_labels = pd.qcut(y_prob, q=10, labels=False, duplicates="drop")
|
|
337
|
+
except ValueError:
|
|
338
|
+
try:
|
|
339
|
+
bins_labels = pd.cut(y_prob, bins=10, labels=False, include_lowest=True)
|
|
340
|
+
except ValueError:
|
|
341
|
+
logger.warning(f"Could not compute bins for group {name}. Setting rank_correlation to NaN.")
|
|
342
|
+
bins_labels = None
|
|
343
|
+
|
|
344
|
+
if bins_labels is not None:
|
|
345
|
+
try:
|
|
346
|
+
binning_df = pd.DataFrame({"bin": bins_labels, "target_rate": y_true})
|
|
347
|
+
binning_agg = binning_df.groupby("bin")["target_rate"].mean().reset_index()
|
|
348
|
+
if len(binning_agg) > 4:
|
|
349
|
+
rank_correlation = binning_agg[["bin", "target_rate"]].corr(method="spearman").iloc[0, 1]
|
|
350
|
+
target_bin_correlation = binning_agg[["bin", "target_rate"]].corr(method="pearson").iloc[0, 1]
|
|
351
|
+
else:
|
|
352
|
+
logger.warning(
|
|
353
|
+
f"Not enough bins with data to calculate correlation for group {name}. Setting rank_correlation to NaN." # noqa: E501
|
|
354
|
+
) # noqa: E501
|
|
355
|
+
rank_correlation = np.nan
|
|
356
|
+
except Exception as e:
|
|
357
|
+
logger.error(f"Error during binning aggregation or correlation for group {name}: {e}")
|
|
358
|
+
|
|
359
|
+
if len(np.unique(y_true)) > 1:
|
|
360
|
+
metrics = get_metrics(y_true, y_pred, y_prob)
|
|
361
|
+
|
|
362
|
+
binning_agg = binning_agg.sort_values(by="bin", ascending=False)
|
|
363
|
+
group_result = dict(zip(grouping_cols, name, strict=False))
|
|
364
|
+
group_result["n_observations"] = n_observations
|
|
365
|
+
group_result["target_rate"] = np.mean(y_true)
|
|
366
|
+
group_result["score_average"] = np.mean(y_prob)
|
|
367
|
+
group_result["score_std"] = np.std(y_prob)
|
|
368
|
+
group_result["rank_correlation"] = rank_correlation
|
|
369
|
+
group_result["target_bin_correlation"] = target_bin_correlation
|
|
370
|
+
group_result["number_deciles"] = len(binning_agg)
|
|
371
|
+
group_result["tp_perc"] = tp * 100
|
|
372
|
+
group_result["tn_perc"] = tn * 100
|
|
373
|
+
group_result["fp_perc"] = fp * 100
|
|
374
|
+
group_result["fn_perc"] = fn * 100
|
|
375
|
+
group_result.update(metrics)
|
|
376
|
+
group_result["first_decile_target_rate"] = binning_agg.iloc[0, 1] if len(binning_agg) > 0 else np.nan
|
|
377
|
+
group_result["second_decile_target_rate"] = binning_agg.iloc[1, 1] if len(binning_agg) > 1 else np.nan
|
|
378
|
+
group_result["third_decile_target_rate"] = binning_agg.iloc[2, 1] if len(binning_agg) > 2 else np.nan
|
|
379
|
+
results.append(group_result)
|
|
380
|
+
else:
|
|
381
|
+
logger.warning(f"Group {name} has too few observations ({n_observations}). Skipping.")
|
|
382
|
+
continue
|
|
383
|
+
|
|
384
|
+
if not results:
|
|
385
|
+
logger.warning("No groups found or processed. Returning empty DataFrame.")
|
|
386
|
+
return None
|
|
387
|
+
else:
|
|
388
|
+
performance_df = pd.DataFrame(results)
|
|
389
|
+
|
|
390
|
+
def create_improvement_column(df, new_column, baseline, new_value):
|
|
391
|
+
df[new_column] = np.where(
|
|
392
|
+
(df[baseline].notna()) & (df[baseline] != 0) & (df[new_value].notna()),
|
|
393
|
+
(df[new_value] - df[baseline]) / df[baseline],
|
|
394
|
+
np.nan,
|
|
395
|
+
)
|
|
396
|
+
return df
|
|
397
|
+
|
|
398
|
+
performance_df = create_improvement_column(performance_df, "pr_auc_improvement", "target_rate", "pr_auc")
|
|
399
|
+
performance_df = create_improvement_column(
|
|
400
|
+
performance_df, "first_decile_improvement", "target_rate", "first_decile_target_rate"
|
|
401
|
+
) # noqa: E501
|
|
402
|
+
performance_df = create_improvement_column(
|
|
403
|
+
performance_df, "second_decile_improvement", "target_rate", "second_decile_target_rate"
|
|
404
|
+
) # noqa: E501
|
|
405
|
+
|
|
406
|
+
performance_df = performance_df.round(3)
|
|
407
|
+
return performance_df
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def get_metrics_surv(observed_time, event_indicator, predicted_time, prefix=None):
    """
    Compute the survival-analysis metrics used across this module.

    Args:
        observed_time (array-like): Observed times.
        event_indicator (array-like): Event indicators (1 if event occurred, 0 if censored).
        predicted_time (array-like): Predicted times.
        prefix (str, optional): When given, every key is prefixed with
            ``"<prefix>_"``. Defaults to None (no prefix).

    Returns:
        dict: ``mae`` (censoring-aware mean absolute error from
        ``survival_mae``) and ``c_index`` (``lifelines`` concordance index),
        each optionally prefixed.
    """
    tag = "" if prefix is None else prefix + "_"

    return {
        tag + "mae": survival_mae(observed_time, event_indicator, predicted_time),
        tag + "c_index": concordance_index(observed_time, predicted_time, event_indicator),
    }
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def get_performance_surv(data, time_col, event_col, pred_col, grouping_cols=None, test_size_minimum_per_group=50):
    """
    Calculates survival performance metrics for each combination of grouping columns.

    For every evaluated group a Kaplan-Meier estimator is fitted on the observed
    durations, and the metrics from ``get_metrics_surv`` are computed on the
    predictions.

    Args:
        data (pd.DataFrame): DataFrame containing durations, event flags, predictions and grouping columns.
            It is copied and never modified.
        time_col (str): Name of the time (duration) column.
        event_col (str): Name of the event column.
        pred_col (str): Name of the prediction column (predicted survival time).
        grouping_cols (list, optional): List of column names to group by. Names that are not columns of
            ``data`` are ignored. Defaults to None, in which case all rows form a single group exposed
            through a constant ``fake_group`` column.
        test_size_minimum_per_group (int, optional): Groups are evaluated only when they contain strictly
            more observations than this value; smaller groups are skipped with a warning. Defaults to 50.

    Returns:
        pd.DataFrame | None: One row per evaluated group with the grouping columns, number of observations,
        duration statistics (the ``*_months`` names suggest durations are expressed in months; the code does
        not enforce a unit), event rate and the survival metrics, rounded to 3 decimals. Returns None when
        no group could be evaluated.

    Raises:
        ValueError: If ``grouping_cols`` is given but none of its names is a column of ``data``.
    """  # noqa: E501

    kmf = KaplanMeierFitter()

    data = data.copy()
    if grouping_cols is None:
        data["fake_group"] = 1
        grouping_cols = ["fake_group"]
    else:
        grouping_cols = [col for col in grouping_cols if col in data.columns]
        if not grouping_cols:
            # Grouping by an empty key list fails inside pandas with a
            # ValueError anyway; fail here with an explicit message instead.
            logger.error("No valid grouping columns found!")
            raise ValueError("No valid grouping columns found!")

    results = []
    for name, group in data.groupby(grouping_cols):
        if not isinstance(name, tuple):
            name = (name,)
        n_observations = len(group)

        if n_observations == 0:
            logger.warning(f"Skipping empty group {name}.")
            continue

        if n_observations <= test_size_minimum_per_group:
            logger.warning(f"Group {name} has too few observations ({n_observations}). Skipping.")
            continue

        # Refit the shared estimator on this group's observed durations.
        kmf.fit(durations=group[time_col], event_observed=group[event_col])
        metrics = get_metrics_surv(group[time_col], group[event_col], group[pred_col])

        group_result = dict(zip(grouping_cols, name, strict=False))
        group_result["n_observations"] = n_observations
        group_result["avg_months"] = np.mean(group[time_col])
        group_result["median_months"] = np.median(group[time_col])
        group_result["km_median_surv_months"] = kmf.median_survival_time_
        group_result["event_rate"] = np.mean(group[event_col])
        group_result["avg_predicted_months"] = np.mean(group[pred_col])
        group_result["median_predicted_months"] = np.median(group[pred_col])
        group_result.update(metrics)
        results.append(group_result)

    if not results:
        logger.warning("No groups found or processed. Returning None.")
        return None

    performance_df = pd.DataFrame(results)
    performance_df = performance_df.round(3)
    return performance_df
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def time_split(data, date_column, test_ratio=0.3, max_date=None):
    """
    Splits the data into training and testing sets based on a date column.

    Rows are ordered by date and the cutoff is the ``1 - test_ratio``
    quantile of the dates: rows strictly before the cutoff go to the training
    set, rows on or after it go to the test set. Because the split is made on
    whole dates, the actual share of test rows can differ from ``test_ratio``.

    The input DataFrame is never modified. In the returned frames the date
    column holds ``datetime.date`` objects.

    Parameters:
    data (pd.DataFrame): The input dataframe to be split.
    date_column (str): The name of the date column to use for splitting.
    test_ratio (float): The ratio of the data to be used for testing. Must be
        strictly between 0 and 1. Default is 0.3.
    max_date (str): The maximum date to consider for the split; only rows with
        a date strictly before it are kept. If None, all data is used.
        Default is None.

    Returns:
    pd.DataFrame, pd.DataFrame: The training and testing dataframes.
    """

    if test_ratio <= 0 or test_ratio >= 1:
        # NOTE(review): ``log_and_raise_error`` is expected to raise; confirm in ml_analytics.utils.
        log_and_raise_error(logger=logger, message="test_ratio must be between 0 and 1!")

    # Always work on a copy: the date column is overwritten below and the
    # caller's frame must not change as a side effect.
    if max_date is not None:
        data = data[data[date_column] < max_date].copy()
    else:
        data = data.copy()

    timestamps = pd.to_datetime(data[date_column])
    data[date_column] = timestamps.dt.date

    data = data.sort_values(by=date_column)

    # Compute the quantile on real datetimes (truncated to midnight) rather
    # than on the object-dtype ``datetime.date`` column, so it does not rely
    # on arithmetic over Python date objects; then truncate back to a date.
    cutoff_date = timestamps.dt.normalize().quantile(1 - test_ratio).date()

    # Split the data into training and testing sets
    train_data = data[data[date_column] < cutoff_date]
    test_data = data[data[date_column] >= cutoff_date]

    return train_data, test_data
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def catboost_feature_selection(
    train_df: pd.DataFrame,
    feature_list: list[str],
    target_column: str | list[str],  # single target, or ['y_lower', 'y_upper'] for survival
    cat_features: list[str],
    id_col: str = "id_col",
    num_features_to_select: int | None = None,
    define_best_num_features: bool = False,
    model: CatBoostClassifier | CatBoostRegressor | None = None,
    model_task: str = "classification",
    algorithm: EFeaturesSelectionAlgorithm | None = None,
    shap_calc_type: EShapCalcType = EShapCalcType.Regular,
    force_to_include: list[str] | None = None,
    steps: int = 15,
    iterations: int = 100,
    random_seed: int = 42,
    logging_level: str = "Verbose",
    min_count_binary: int = 30,
    sample_ratio: float | None = None,
) -> list[str]:
    """
    Perform feature selection using CatBoost's built-in ``select_features`` on a group-aware train/validation split.
    Supports classification, regression and survival tasks.

    The rows are split into 3 folds grouped by ``id_col`` (stratified for classification and survival);
    two folds are used for training and the remaining fold for validation, so no ID appears on both sides.

    Parameters:
    - train_df: pd.DataFrame. The training DataFrame.
    - feature_list: list[str]. List of features to consider for selection. Not modified.
    - target_column: str or list[str]. The target column name(s). For survival: ['y_lower', 'y_upper']
    - cat_features: list[str]. List of categorical feature names for CatBoost.
    - id_col: str. Column name for grouping (default: 'id_col').
    - num_features_to_select: int. Number of features to keep. Ignored when define_best_num_features is True.
    - define_best_num_features: bool. If True (or if num_features_to_select is None), a first selection pass down to
      one feature is run and the feature count with the lowest validation loss is used for the final pass.
    - model: CatBoostClassifier or CatBoostRegressor instance. If None, creates appropriate model based on model_task.
    - model_task: str. Type of model task: 'classification', 'regression', or 'survival'.
    - algorithm: EFeaturesSelectionAlgorithm. If None, defaults to RecursiveByShapValues.
    - shap_calc_type: EShapCalcType. SHAP calculation type forwarded to ``select_features``.
    - force_to_include: list[str]. Features to always include in final set.
    - steps: int. Number of steps for feature selection.
    - iterations: int. Number of iterations for the CatBoost model (only used when model is None).
    - random_seed: int. Random seed for reproducibility.
    - logging_level: str. Logging level for CatBoost.
    - min_count_binary: int. Minimum number of 1-values a 0/1 feature needs to be kept.
    - sample_ratio: float. If provided (e.g., 0.3), sample this ratio of unique IDs for faster iteration. Sampling is
      stratified by target for classification and by event indicator for survival; it is not stratified for regression.

    Returns:
    - list[str]. Features chosen by the algorithm followed by the forced features.
    """  # noqa: E501
    if model_task is None:
        log_and_raise_error(logger, "model_task must be specified as 'classification', 'regression', or 'survival'!")
    if model_task not in ["classification", "regression", "survival"]:
        log_and_raise_error(logger, "model_task must be one of 'classification', 'regression', or 'survival'!")

    feature_list = feature_list.copy()
    force_to_include = [] if force_to_include is None else list(force_to_include)
    train_df = prepare_catboost_data(train_df, cat_features=cat_features, feature_list=feature_list)

    if algorithm is None:
        algorithm = EFeaturesSelectionAlgorithm.RecursiveByShapValues

    # Sample data by unique IDs if sample_ratio is specified
    if sample_ratio is not None:
        if sample_ratio <= 0 or sample_ratio >= 1:
            log_and_raise_error(logger, "sample_ratio must be between 0 and 1!")

        # One row per ID: the first observed target value stands in for the whole group.
        first_target = target_column[0] if isinstance(target_column, list) else target_column
        unique_ids_df = train_df.groupby(id_col, as_index=False)[first_target].first()

        if model_task == "regression":
            # Stratifying on a continuous target makes train_test_split raise
            # (classes with a single member), so regression samples IDs at random.
            stratify_col = None
        elif model_task == "survival" and isinstance(target_column, list):
            # Second target column equal to -1 is treated as "no event" for stratification.
            unique_ids_df[target_column[1]] = train_df.groupby(id_col)[target_column[1]].first().values
            stratify_col = (unique_ids_df[target_column[1]] != -1).astype(int)
        else:
            stratify_col = unique_ids_df[first_target]

        sampled_ids, _ = train_test_split(
            unique_ids_df[id_col], train_size=sample_ratio, stratify=stratify_col, random_state=random_seed
        )

        # Filter dataframe to keep only sampled IDs
        original_size = len(train_df)
        train_df = train_df[train_df[id_col].isin(sampled_ids)].copy()
        logger.info(f"Sampled {len(sampled_ids)} unique IDs ({sample_ratio:.1%}) from {len(unique_ids_df)} total IDs")
        logger.info(
            f"Reduced dataset from {original_size:,} to {len(train_df):,} rows ({len(train_df) / original_size:.1%}) for faster feature selection iteration"  # noqa: E501
        )

    if model is None:
        if model_task == "regression":
            model = CatBoostRegressor(iterations=iterations, loss_function="RMSE", verbose=0, random_seed=random_seed)
        elif model_task == "survival":
            # Survival analysis with AFT loss
            model = CatBoostRegressor(
                iterations=iterations, loss_function="SurvivalAft:dist=Normal", verbose=0, random_seed=random_seed
            )
        else:
            model = CatBoostClassifier(
                iterations=iterations, loss_function="Logloss", verbose=0, random_seed=random_seed
            )

    # An explicit request to search for the best count, or no count at all, both mean "search".
    if define_best_num_features or num_features_to_select is None:
        define_best_num_features = True
        num_features_to_select = None

    # Remove features with only one (non-null) value
    feature_list = [col for col in feature_list if train_df[col].dropna().nunique() != 1]

    # Remove 0/1 features whose positive class is too rare
    kept_features = []
    for col in feature_list:
        non_null = train_df[col].dropna()
        is_rare_binary = (
            non_null.nunique() == 2
            and set(non_null.unique()) == {0, 1}
            and train_df[col].value_counts().get(1, 0) < min_count_binary
        )
        if not is_rare_binary:
            kept_features.append(col)
    feature_list = kept_features

    # Count numeric feature pairs with |correlation| > 0.8 (strict lower triangle, so each pair once)
    num_features = [f for f in feature_list if f not in cat_features]
    corr_values = train_df[num_features].corr().to_numpy(dtype=float)
    n_high_corr_pairs = int((np.tril(np.abs(corr_values), k=-1) > 0.8).sum())
    if n_high_corr_pairs:
        logger.info(f"Found {n_high_corr_pairs} highly correlated feature pairs (|correlation| > 0.8)")
    else:
        logger.info("No highly correlated feature pairs found (|correlation| > 0.8).")

    # Only exclude forced features from selection - let algorithm decide on categorical features
    selection_feature_list = [f for f in feature_list if f not in force_to_include]
    all_features_for_pool = selection_feature_list + force_to_include
    # NOTE(review): ncat_features is only reported in the log below; the pools further down are
    # still built with the full cat_features list - confirm make_catboost_pool filters it.
    ncat_features = [c for c in cat_features if c in all_features_for_pool]

    logger.info(
        f"Feature selection: {len(selection_feature_list)} selectable, {len(force_to_include)} forced, {len(ncat_features)} categorical"  # noqa: E501
    )

    groups = train_df[id_col]
    if model_task == "regression":
        splits = list(GroupKFold(n_splits=3).split(train_df, groups=groups))
    else:
        if model_task == "survival":
            # For survival, use event indicator for stratification approximation
            stratify_labels = (train_df[target_column[1]] != -1).astype(int)
        else:
            stratify_labels = train_df[target_column]
        splits = list(StratifiedGroupKFold(n_splits=3).split(train_df, stratify_labels, groups=groups))

    # Each split is (train_indices, test_indices). Use the last split as-is: its train part is
    # the other two folds and its test part is the held-out fold. Concatenating the *train*
    # parts of the first two splits would put every validation row into the training set.
    train_idx, val_idx = splits[-1]

    train_fold = train_df.iloc[train_idx]
    val_fold = train_df.iloc[val_idx]

    train_pool = make_catboost_pool(train_fold, all_features_for_pool, cat_features, label=train_fold[target_column])
    eval_pool = make_catboost_pool(val_fold, all_features_for_pool, cat_features, label=val_fold[target_column])

    # Arguments shared by every select_features call
    select_kwargs = {
        "eval_set": eval_pool,
        "features_for_select": selection_feature_list,
        "steps": steps,
        "algorithm": algorithm,
        "shap_calc_type": shap_calc_type,
        "train_final_model": False,
        "logging_level": logging_level,
        "plot": False,
    }

    # Perform feature selection
    selected_features_algo = []
    try:
        if define_best_num_features:
            # Exploratory pass: eliminate down to a single feature to obtain the full loss curve.
            fsummary = model.select_features(train_pool, num_features_to_select=1, **select_kwargs)

            loss_features = pd.DataFrame(
                {
                    "Features_Removed": fsummary["loss_graph"]["removed_features_count"],
                    "Loss": fsummary["loss_graph"]["loss_values"],
                }
            )

            best_idx = loss_features["Loss"].idxmin()
            best_removed = loss_features.loc[best_idx, "Features_Removed"]
            best_features_kept = int(len(selection_feature_list) - best_removed)
            best_loss_value = loss_features.loc[best_idx, "Loss"]

            logger.info(
                f"Optimal features: {best_features_kept} selected + {len(force_to_include)} forced = {best_features_kept + len(force_to_include)} total (loss: {best_loss_value:.6f})"  # noqa: E501
            )

            _plot_feature_selection_loss(loss_features, best_removed)
            num_features_to_select = best_features_kept

        summary = model.select_features(train_pool, num_features_to_select=num_features_to_select, **select_kwargs)
        selected_features_algo = summary["selected_features_names"]
        logger.info(f"Selected {len(selected_features_algo)} features by algorithm")

    except Exception as e:
        logger.error(f"Error during feature selection: {e}")
        raise

    # Combine selected features with forced features
    final_features = selected_features_algo + force_to_include

    logger.info(
        f"Final feature set: {len(final_features)} features ({len(selected_features_algo)} selected, {len(force_to_include)} forced)"  # noqa: E501
    )

    return final_features


def _plot_feature_selection_loss(loss_features, best_removed):
    """Show validation loss against the number of removed features, marking the best count."""
    plt.figure(figsize=(8, 5))
    plt.plot(
        loss_features["Features_Removed"],
        loss_features["Loss"],
        marker="o",
        linestyle="-",
        color="b",
    )
    plt.axvline(
        x=best_removed,
        color="r",
        linestyle="--",
        label="Best number of features",
    )
    plt.title("Feature Selection Loss Graph")
    plt.xlabel("Number of Features Removed")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid()
    plt.show()
|
|
790
|
+
|
|
791
|
+
|
|
792
|
+
def plot_score_bins(
    df,
    prob_col,
    target_col,
    bins=10,
    show_plot=True,
    figsize=(10, 6),
    title_placeholder=None,
    xlabel=None,
    ylabel=None,
    save_path=None,
):
    """
    Bin rows by quantiles of the predicted score and plot the average target per bin as a bar plot.

    A 'bins' column (1 = lowest scores) is added to a copy of the input; the input
    DataFrame itself is not modified. A dashed line marks the overall target mean,
    and the title reports the relative lift of the highest bin over that mean.

    Parameters:
        df (pd.DataFrame): DataFrame containing the score and target columns.
        prob_col (str): Name of the column containing predicted probabilities.
        target_col (str): Name of the target column to aggregate.
        bins (int): Number of quantile bins requested. Duplicate bin edges are dropped,
            so fewer bins may be produced. Default is 10.
        show_plot (bool): Whether to display the plot immediately. Default is True.
        figsize (tuple): Size of the plot. Default is (10, 6).
        title_placeholder (str): Extra text appended to the title. If None, nothing is appended.
        xlabel (str): Label for the x-axis. Defaults to "Score bin".
        ylabel (str): Name used in the y-axis label ("Average <ylabel>") and in the title.
            Defaults to a title-cased version of target_col.
        save_path (str): Path to save the plot. If None, the plot is not saved.

    Returns:
        pd.DataFrame: Aggregated DataFrame with bin means for target and propensity.
    """
    title_suffix = "" if title_placeholder is None else f" - {title_placeholder}"

    df = df.copy()
    df["bins"] = pd.qcut(df[prob_col], q=bins, labels=False, duplicates="drop") + 1
    tab = df.groupby("bins").agg({target_col: "mean", prob_col: "mean"}).reset_index()
    overall_rate = df[target_col].mean()

    _, ax = plt.subplots(figsize=figsize)
    sns.barplot(x=tab["bins"], y=tab[target_col], ax=ax)

    ax.set_xlabel(xlabel if xlabel is not None else "Score bin")
    if ylabel is not None:
        ax.set_ylabel(f"Average {ylabel}")
    else:
        ylabel = f"Average {target_col.replace('_', ' ').title()}"
        ax.set_ylabel(ylabel)

    # Bars are drawn at categorical positions 0..k-1 in the row order of `tab`,
    # so each value can be annotated directly above its own bar.
    for position, value in enumerate(tab[target_col]):
        ax.text(
            position,
            value,
            f"{value:.2f}",
            ha="center",
            va="bottom",
            color="black",
            fontweight="bold",
        )

    ax.axhline(y=overall_rate, color="r", linestyle="--", label="Overall target rate")
    ax.legend()  # without this the reference line's label is never displayed

    # compute improvement (last bin versus average rate)
    improvement = (tab[target_col].iloc[-1] - overall_rate) / overall_rate

    ax.set_title(
        f"{ylabel} by score bin ({bins} bins) {title_suffix}\n"
        f"Improvement (highest bin vs average) : {improvement:.2%}"
    )

    if save_path is not None:
        plt.savefig(save_path)

    if show_plot:
        plt.show()
    else:
        plt.close()

    return tab
|
|
884
|
+
|
|
885
|
+
|
|
886
|
+
def shap_plot(pipeline, data, features, output_path="shap_summary.png", show_plot=True):
    """
    Create and save a SHAP summary plot using a scikit-learn pipeline.

    The data is passed through the pipeline's 'preprocessor' step, SHAP values are
    computed for the 'classifier' step on the transformed features, and the summary
    plot is written to output_path.

    Parameters:
        pipeline: scikit-learn Pipeline containing a 'preprocessor' and a 'classifier' step
        data: DataFrame used for computing SHAP values
        features: list of feature column names to select from data
        output_path: file path where the plot will be saved
        show_plot: whether to also display the plot after saving it

    Returns:
        None. The plot is saved to output_path.
    """
    preprocessor = pipeline.named_steps["preprocessor"]
    data_transformed = preprocessor.transform(data[features])
    # Some transformers return a scipy sparse matrix, which cannot be wrapped in a
    # DataFrame with column names directly; densify it first.
    if hasattr(data_transformed, "toarray"):
        data_transformed = data_transformed.toarray()
    feature_names = preprocessor.get_feature_names_out()

    df = pd.DataFrame(data_transformed, columns=feature_names)

    model = pipeline.named_steps["classifier"]
    explainer = shap.Explainer(model)
    shap_values = explainer.shap_values(df)

    plt.figure()
    # Always draw with show=False: letting summary_plot call plt.show() before
    # savefig can leave an empty figure to be written to disk.
    shap.summary_plot(shap_values, df, show=False)
    plt.tight_layout()
    plt.savefig(output_path)
    if show_plot:
        plt.show()
    plt.close()
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
def plot_trend(
    data,
    x_col,
    y_col,
    hue_col,
    xlabel="",
    ylabel=None,
    title=None,
    figsize=(12, 6),
    rotation=65,
    vline_date=None,
    show_grid=False,
    show_plot=True,
    save_path=None,
):
    """
    Generates a line plot for time series data, grouped by a hue column.

    The x-axis uses monthly major ticks formatted as 'YYYY-MM'. The input
    DataFrame is not modified.

    Args:
        data (pd.DataFrame): DataFrame containing the data to plot.
        x_col (str): Name of the column for the x-axis. May hold pandas Periods
            (converted to timestamps) or anything ``pd.to_datetime`` accepts.
        y_col (str): Name of the column for the y-axis.
        hue_col (str): Name of the column to group and color lines by.
        xlabel (str, optional): Label for the x-axis. Defaults to ''.
        ylabel (str, optional): Label for the y-axis. Defaults to y_col name.
        title (str, optional): Title for the plot. No title is set if None.
        figsize (tuple, optional): Figure size. Defaults to (12, 6).
        rotation (int, optional): Rotation angle for x-axis labels. Defaults to 65.
        vline_date (str, optional): Date string (e.g., 'YYYY-MM-DD') to draw a
            vertical line. Defaults to None.
        show_grid (bool, optional): Whether to display the background grid. Defaults to False.
        show_plot (bool, optional): Whether to display the plot using plt.show(). Defaults to True.
        save_path (str, optional): Path to save the figure. If None, figure is not saved. Defaults to None.
    """
    if ylabel is None:
        ylabel = y_col

    data = data.copy()
    x_values = data[x_col]
    if isinstance(x_values.dtype, pd.PeriodDtype):
        data[x_col] = x_values.dt.to_timestamp()
    else:
        # .dt.to_timestamp() only exists for Period data; plain datetimes (or
        # date strings) are normalised with to_datetime instead of failing.
        data[x_col] = pd.to_datetime(x_values)

    plt.figure(figsize=figsize)

    ax = sns.lineplot(data=data, x=x_col, y=y_col, hue=hue_col, marker="o")

    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))

    # Add vertical line if specified
    if vline_date:
        try:
            vline_dt = pd.to_datetime(vline_date)
            plt.axvline(vline_dt, color="red", linestyle="--")
        except ValueError:
            # An unparseable marker date should not prevent the plot from being drawn.
            print(f"Warning: Could not parse vline_date '{vline_date}'. Skipping vertical line.")

    plt.grid(show_grid)
    plt.xticks(rotation=rotation, ha="right")
    plt.legend(title=None)
    if title:
        plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()  # Adjust layout

    if save_path:
        plt.savefig(save_path)
        print(f"Plot saved to {save_path}")

    if show_plot:
        plt.show()
    else:
        plt.close()  # Close the plot if not showing to free memory
|