ml-analytics-tools 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,990 @@
1
+ """
2
+ Set of utility functions for training/testing models
3
+ """
4
+
5
+ import matplotlib.dates as mdates
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import pandas as pd
9
+ import seaborn as sns
10
+ import shap
11
+ from catboost import CatBoostClassifier, CatBoostRegressor, EFeaturesSelectionAlgorithm, EShapCalcType, Pool
12
+ from lifelines import KaplanMeierFitter
13
+ from lifelines.utils import concordance_index
14
+ from sklearn.metrics import (
15
+ auc,
16
+ brier_score_loss,
17
+ f1_score,
18
+ precision_recall_curve,
19
+ precision_score,
20
+ recall_score,
21
+ roc_auc_score,
22
+ )
23
+ from sklearn.model_selection import GroupKFold, StratifiedGroupKFold, train_test_split
24
+
25
+ from ml_analytics.utils import get_logger, log_and_raise_error
26
+
27
+ logger = get_logger("modeling-tools")
28
+
29
+
30
def prepare_catboost_data(
    df: pd.DataFrame,
    cat_features: list[str],
    feature_list: list[str] | None = None,
) -> pd.DataFrame:
    """
    Return a copy of ``df`` that is ready to be handed to CatBoost.

    The copy is passed through ``infer_objects`` and every column named in
    ``cat_features`` that exists in the frame is cast with ``astype(str)``.
    All other columns are returned as they are. The number of checked columns
    that contain at least one missing value is logged at INFO level; the
    count is informational only and does not change the returned data.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame. It is never modified; all work happens on a copy.
    cat_features : list[str]
        Names of the categorical columns. Names that are not columns of
        ``df`` are ignored.
    feature_list : list[str], optional
        Columns inspected for the missing-value log message. When None, every
        column of ``df`` is inspected.

    Returns
    -------
    pd.DataFrame
        The pre-processed copy of ``df``.
    """
    prepared = df.copy().infer_objects(copy=False)

    # The missing-value count is computed before the string cast below.
    columns_to_check = prepared.columns.tolist() if feature_list is None else feature_list
    null_counts = prepared[columns_to_check].isnull().sum()
    n_missing = int((null_counts > 0).sum())
    logger.info(f"Features with missing values: {n_missing}")

    categorical_present = [name for name in cat_features if name in prepared.columns]
    for name in categorical_present:
        prepared[name] = prepared[name].astype(str)

    return prepared
71
+
72
+
73
def make_catboost_pool(
    df: pd.DataFrame,
    feature_list: list[str],
    cat_features: list[str],
    label=None,
    **pool_kwargs,
) -> Pool:
    """
    Create a CatBoost ``Pool`` from a DataFrame, preprocessing it on the way.

    The frame is first run through :func:`prepare_catboost_data`, so callers
    do not have to cast categorical columns themselves.

    Parameters
    ----------
    df : pd.DataFrame
        Source DataFrame; it is not modified.
    feature_list : list[str]
        Feature columns placed in the pool, in this order.
    cat_features : list[str]
        Categorical column names. Only those that appear in ``feature_list``
        are declared as categorical to the pool.
    label : array-like or str, optional
        Target values, or the name of the target column in ``df``. A column
        name is resolved against the original ``df`` before preprocessing.
    **pool_kwargs :
        Extra keyword arguments forwarded unchanged to ``catboost.Pool``
        (e.g. ``group_id``, ``weight``).

    Returns
    -------
    catboost.Pool
    """
    target = df[label] if isinstance(label, str) else label

    prepared = prepare_catboost_data(df, cat_features=cat_features, feature_list=feature_list)

    # Position of the first occurrence of every feature name in feature_list.
    first_position = {}
    for position, feature_name in enumerate(feature_list):
        first_position.setdefault(feature_name, position)

    # Categorical features are declared to the pool by positional index.
    cat_indices = [first_position[name] for name in cat_features if name in first_position]

    return Pool(
        data=prepared[feature_list],
        label=target,
        feature_names=feature_list,
        cat_features=cat_indices,
        **pool_kwargs,
    )
121
+
122
+
123
def get_features(df, target_col=None):
    """
    Split the columns of a DataFrame into categorical and numerical features.

    Columns of dtype ``object`` or ``category`` count as categorical; columns
    of dtype ``int32``, ``int64``, ``float32`` or ``float64`` count as
    numerical. Columns of any other dtype appear in neither list.

    Args:
        df (pd.DataFrame): DataFrame to extract features from.
        target_col (str, optional): Name of the target column to leave out.
            It is ignored when it is not a column of ``df``. Defaults to None.
    Returns:
        tuple: ``(categorical_features, numerical_features)``, two lists of
            column names in the order they appear in ``df``.
    """
    frame = df
    if target_col is not None:
        frame = frame.drop(columns=[target_col], errors="ignore")

    categorical_dtypes = ["object", "category"]
    numerical_dtypes = ["int32", "int64", "float64", "float32"]

    categorical_features = list(frame.select_dtypes(include=categorical_dtypes).columns)
    numerical_features = list(frame.select_dtypes(include=numerical_dtypes).columns)
    return categorical_features, numerical_features
139
+
140
+
141
def get_balanced_accuracy(y_true, y_pred):
    """
    Return the balanced accuracy of binary predictions.

    It is computed as the average of the recall obtained with class 0 as the
    positive label and the recall obtained with class 1 as the positive label.

    Args:
        y_true (array-like): True binary labels (0 or 1).
        y_pred (array-like): Predicted binary labels (0 or 1).

    Returns:
        float: Mean of the two per-class recalls.
    """
    per_class_recall = [recall_score(y_true, y_pred, pos_label=cls) for cls in (0, 1)]
    return sum(per_class_recall) / 2
144
+
145
+
146
def pr_auc_score(y_true, y_pred_proba):
    """
    Return the area under the precision-recall curve.

    Args:
        y_true (array-like): True binary labels.
        y_pred_proba (array-like): Predicted scores for the positive class.

    Returns:
        float: Area under the curve with recall on the x-axis and precision
            on the y-axis, as integrated by ``sklearn.metrics.auc``.
    """
    curve = precision_recall_curve(y_true, y_pred_proba)
    precision_values, recall_values = curve[0], curve[1]
    return auc(recall_values, precision_values)
149
+
150
+
151
def brier_score(y_true, y_pred_proba):
    """
    Calculate the Brier score for binary classification.

    This is a thin wrapper around ``sklearn.metrics.brier_score_loss``. The
    Brier score is the mean squared difference between the predicted
    probability and the observed 0/1 outcome, so lower is better: 0 is a
    perfect score and 1 is the worst possible one. A model that always
    predicts 0.5 scores 0.25.

    Args:
        y_true (array-like): True binary labels (0 or 1).
        y_pred_proba (array-like): Predicted probabilities for the positive class.

    Returns:
        float: Brier score (lower is better).
    """
    score = brier_score_loss(y_true, y_pred_proba)
    return score
166
+
167
+
168
+ def expected_calibration_error(y_true, y_pred_proba, n_bins=10):
169
+ """
170
+ Calculate the Expected Calibration Error (ECE) for binary classification.
171
+
172
+ ECE measures the difference between predicted probabilities and actual outcomes
173
+ across different confidence bins. It indicates how well-calibrated the model's
174
+ probability estimates are.
175
+
176
+ Args:
177
+ y_true (array-like): True binary labels (0 or 1).
178
+ y_pred_proba (array-like): Predicted probabilities for the positive class.
179
+ n_bins (int): Number of bins to use for calibration curve. Default is 10.
180
+
181
+ Returns:
182
+ float: Expected Calibration Error (lower is better, 0 is perfect calibration).
183
+ """
184
+ try:
185
+ # Calculate bin boundaries
186
+ bin_boundaries = np.linspace(0, 1, n_bins + 1)
187
+ bin_lowers = bin_boundaries[:-1]
188
+ bin_uppers = bin_boundaries[1:]
189
+
190
+ # Calculate ECE
191
+ ece = 0.0
192
+ for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=True):
193
+ # Find samples in this bin
194
+ in_bin = (y_pred_proba > bin_lower) & (y_pred_proba <= bin_upper)
195
+ prop_in_bin = in_bin.mean()
196
+
197
+ if prop_in_bin > 0:
198
+ # Calculate accuracy and confidence for this bin
199
+ accuracy_in_bin = y_true[in_bin].mean() if in_bin.sum() > 0 else 0
200
+ avg_confidence_in_bin = y_pred_proba[in_bin].mean() if in_bin.sum() > 0 else 0
201
+
202
+ # Add weighted calibration error for this bin
203
+ ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
204
+
205
+ return ece
206
+
207
+ except Exception as e:
208
+ logger.warning(f"Error calculating ECE: {e}. Returning NaN.")
209
+ return np.nan
210
+
211
+
212
def mcc_score(y_true, y_pred):
    """
    Calculate Matthews correlation coefficient (MCC) for binary classification.

    Args:
        y_true (array-like): True binary labels (0 or 1).
        y_pred (array-like): Predicted binary labels (0 or 1).

    Returns:
        float: Matthews correlation coefficient in [-1, 1]. 0.0 is returned
            when the denominator is zero, i.e. when an entire row or column
            of the confusion matrix is empty.
    """
    # Convert up front so lists and pandas objects are compared element-wise
    # and positionally.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # Counts are held as Python floats: the denominator multiplies four
    # count sums, which overflows int64 from roughly 1e5 samples per class.
    tp = float(np.count_nonzero((y_true == 1) & (y_pred == 1)))
    tn = float(np.count_nonzero((y_true == 0) & (y_pred == 0)))
    fp = float(np.count_nonzero((y_true == 0) & (y_pred == 1)))
    fn = float(np.count_nonzero((y_true == 1) & (y_pred == 0)))

    numerator = (tp * tn) - (fp * fn)
    denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    if denominator == 0:
        return 0.0

    return float(numerator / denominator)
235
+
236
+
237
def survival_mae(observed_time, event_indicator, predicted_time):
    """
    Calculate a Mean Absolute Error (MAE) suitable for survival data,
    taking censored observations into account.

    Per-sample errors are defined as follows:

    * event observed (indicator equals 1 / True): ``|observed - predicted|``
    * censored (indicator equals 0 / False): ``observed - predicted`` when the
      prediction is earlier than the observed time, otherwise 0
    * any other indicator value: 0

    All inputs are converted to 1-D NumPy arrays first, so values are matched
    by position and a pandas index is ignored.

    Args:
        observed_time (array-like): Observed times.
        event_indicator (array-like): 1/True if the event was observed,
            0/False if the observation is censored.
        predicted_time (array-like): Predicted event times.

    Returns:
        float: The mean of the per-sample errors (NaN for empty input).
    """
    observed = np.asarray(observed_time, dtype=float).ravel()
    predicted = np.asarray(predicted_time, dtype=float).ravel()
    events = np.asarray(event_indicator).ravel()

    # `== 1` / `== 0` also match boolean indicators element-wise.
    event_mask = events == 1
    censored_early_mask = (events == 0) & (predicted < observed)

    errors = np.zeros(observed.shape, dtype=float)
    errors[event_mask] = np.abs(observed[event_mask] - predicted[event_mask])
    # A censored sample is only penalised when the model predicts the event
    # before the last time the subject was known to be event-free.
    errors[censored_early_mask] = observed[censored_early_mask] - predicted[censored_early_mask]

    return float(np.mean(errors))
261
+
262
+
263
def get_metrics(y_obs, y_pred, y_prob, prefix=None):
    """
    Compute the standard set of binary-classification metrics.

    Args:
        y_obs (array-like): True binary labels.
        y_pred (array-like): Predicted binary labels.
        y_prob (array-like): Predicted probabilities for the positive class.
        prefix (str, optional): When given, every key is prefixed with
            ``"<prefix>_"``. Defaults to None (no prefix).

    Returns:
        dict: Metric name -> value, in this order: balanced_accuracy, mcc,
            precision, recall, f1, pr_auc, roc_auc, brier_score, ece.
    """
    tag = "" if prefix is None else prefix + "_"

    # Label-based metrics first, then the probability-based ones; the dict
    # keeps this insertion order.
    return {
        tag + "balanced_accuracy": get_balanced_accuracy(y_obs, y_pred),
        tag + "mcc": mcc_score(y_obs, y_pred),
        tag + "precision": precision_score(y_obs, y_pred, zero_division=0),
        tag + "recall": recall_score(y_obs, y_pred, zero_division=0),
        tag + "f1": f1_score(y_obs, y_pred, zero_division=0),
        tag + "pr_auc": pr_auc_score(y_obs, y_prob),
        tag + "roc_auc": roc_auc_score(y_obs, y_prob),
        tag + "brier_score": brier_score(y_obs, y_prob),
        tag + "ece": expected_calibration_error(y_obs, y_prob),
    }
281
+
282
+
283
def get_performance(data, target_col, pred_col, prob_col, grouping_cols=None, test_size_minimum_per_group=50):
    """
    Calculates performance metrics for each combination of grouping columns.

    Groups are skipped (with a warning) when they do not contain more than
    ``test_size_minimum_per_group`` rows or when their target holds a single
    class, since several metrics are undefined in that case.

    Besides the metrics from :func:`get_metrics`, each row reports confusion
    matrix shares, score statistics and a score-decile analysis: the scores
    are cut into up to ten quantile bins, and the correlation between bin
    rank and bin target rate is reported when at least five bins exist
    (NaN otherwise).

    Args:
        data (pd.DataFrame): DataFrame containing target, predictions, probabilities, and grouping columns.
            It is not modified.
        target_col (str): Name of the target column (0 or 1).
        pred_col (str): Name of the prediction column (0 or 1).
        prob_col (str): Name of the probability column (probability of class 1).
        grouping_cols (list, optional): List of column names to group by. Names that are not columns of
            ``data`` are dropped. Defaults to None, in which case the whole frame is evaluated as one
            group identified by a constant ``fake_group`` column.
        test_size_minimum_per_group (int, optional): A group is evaluated only when it has more than this
            many observations. Defaults to 50.

    Returns:
        pd.DataFrame | None: One row per evaluated group with the grouping columns, number of observations
            and performance metrics, rounded to 3 decimals. None when no group could be evaluated.

    Raises:
        ValueError: If ``grouping_cols`` is given but none of its names is a column of ``data``.
    """  # noqa: E501

    data = data.copy()
    if grouping_cols is None:
        # No grouping requested: evaluate everything as one artificial group.
        data["fake_group"] = 1
        grouping_cols = ["fake_group"]
    else:
        grouping_cols = [col for col in grouping_cols if col in data.columns]
        if not grouping_cols:
            # Grouping by an empty key list fails inside pandas with a
            # ValueError anyway; fail here with a message naming the cause.
            logger.error("No valid grouping columns found!")
            raise ValueError("No valid grouping columns found!")

    results = []
    for name, group in data.groupby(grouping_cols):
        # Depending on the pandas version a single key may arrive as a scalar.
        if not isinstance(name, tuple):
            name = (name,)
        n_observations = len(group)

        if n_observations == 0:
            logger.warning(f"Skipping empty group {name}.")
            continue

        if n_observations <= test_size_minimum_per_group:
            logger.warning(f"Group {name} has too few observations ({n_observations}). Skipping.")
            continue

        y_true = group[target_col].values.ravel()
        y_pred = group[pred_col].values.ravel()
        y_prob = group[prob_col].values.ravel()

        if len(np.unique(y_true)) <= 1:
            logger.warning(f"Group {name} has a single target class. Skipping.")
            continue

        # Confusion-matrix cells as shares of the group.
        tp = ((y_true == 1) & (y_pred == 1)).sum() / n_observations
        tn = ((y_true == 0) & (y_pred == 0)).sum() / n_observations
        fp = ((y_true == 0) & (y_pred == 1)).sum() / n_observations
        fn = ((y_true == 1) & (y_pred == 0)).sum() / n_observations

        # Both correlations default to NaN so they are always defined, even
        # when binning fails or yields too few bins.
        rank_correlation = np.nan
        target_bin_correlation = np.nan
        binning_agg = pd.DataFrame(columns=["bin", "target_rate"])

        # Quantile bins first; fall back to equal-width bins if that fails.
        try:
            bins_labels = pd.qcut(y_prob, q=10, labels=False, duplicates="drop")
        except ValueError:
            try:
                bins_labels = pd.cut(y_prob, bins=10, labels=False, include_lowest=True)
            except ValueError:
                logger.warning(f"Could not compute bins for group {name}. Setting rank_correlation to NaN.")
                bins_labels = None

        if bins_labels is not None:
            try:
                binning_df = pd.DataFrame({"bin": bins_labels, "target_rate": y_true})
                binning_agg = binning_df.groupby("bin")["target_rate"].mean().reset_index()
                if len(binning_agg) > 4:
                    bin_vs_rate = binning_agg[["bin", "target_rate"]]
                    rank_correlation = bin_vs_rate.corr(method="spearman").iloc[0, 1]
                    target_bin_correlation = bin_vs_rate.corr(method="pearson").iloc[0, 1]
                else:
                    logger.warning(
                        f"Not enough bins with data to calculate correlation for group {name}. Setting rank_correlation to NaN."  # noqa: E501
                    )
            except Exception as e:
                # Best effort: the decile analysis must not prevent the main
                # metrics of this group from being reported.
                logger.error(f"Error during binning aggregation or correlation for group {name}: {e}")

        metrics = get_metrics(y_true, y_pred, y_prob)

        # Highest-score bin first, so row 0 is the top "decile".
        binning_agg = binning_agg.sort_values(by="bin", ascending=False)
        group_result = dict(zip(grouping_cols, name, strict=False))
        group_result["n_observations"] = n_observations
        group_result["target_rate"] = np.mean(y_true)
        group_result["score_average"] = np.mean(y_prob)
        group_result["score_std"] = np.std(y_prob)
        group_result["rank_correlation"] = rank_correlation
        group_result["target_bin_correlation"] = target_bin_correlation
        group_result["number_deciles"] = len(binning_agg)
        group_result["tp_perc"] = tp * 100
        group_result["tn_perc"] = tn * 100
        group_result["fp_perc"] = fp * 100
        group_result["fn_perc"] = fn * 100
        group_result.update(metrics)
        group_result["first_decile_target_rate"] = binning_agg.iloc[0, 1] if len(binning_agg) > 0 else np.nan
        group_result["second_decile_target_rate"] = binning_agg.iloc[1, 1] if len(binning_agg) > 1 else np.nan
        group_result["third_decile_target_rate"] = binning_agg.iloc[2, 1] if len(binning_agg) > 2 else np.nan
        results.append(group_result)

    if not results:
        logger.warning("No groups found or processed. Returning None.")
        return None

    performance_df = pd.DataFrame(results)

    def create_improvement_column(df, new_column, baseline, new_value):
        # Relative lift of `new_value` over `baseline`; NaN when the baseline
        # is missing or zero, or when the value itself is missing.
        df[new_column] = np.where(
            (df[baseline].notna()) & (df[baseline] != 0) & (df[new_value].notna()),
            (df[new_value] - df[baseline]) / df[baseline],
            np.nan,
        )
        return df

    performance_df = create_improvement_column(performance_df, "pr_auc_improvement", "target_rate", "pr_auc")
    performance_df = create_improvement_column(
        performance_df, "first_decile_improvement", "target_rate", "first_decile_target_rate"
    )
    performance_df = create_improvement_column(
        performance_df, "second_decile_improvement", "target_rate", "second_decile_target_rate"
    )

    performance_df = performance_df.round(3)
    return performance_df
408
+
409
+
410
def get_metrics_surv(observed_time, event_indicator, predicted_time, prefix=None):
    """
    Calculate performance metrics for survival analysis.

    Args:
        observed_time (array-like): Observed times.
        event_indicator (array-like): Event indicators (1 if event occurred, 0 if censored).
        predicted_time (array-like): Predicted times.
        prefix (str, optional): When given, every key is prefixed with
            ``"<prefix>_"``. Defaults to None (no prefix).

    Returns:
        dict: ``mae`` (from :func:`survival_mae`) and ``c_index`` (from
            ``lifelines.utils.concordance_index``), optionally prefixed.
    """
    tag = "" if prefix is None else prefix + "_"

    mae_value = survival_mae(observed_time, event_indicator, predicted_time)
    c_index_value = concordance_index(observed_time, predicted_time, event_indicator)

    return {tag + "mae": mae_value, tag + "c_index": c_index_value}
432
+
433
+
434
def get_performance_surv(data, time_col, event_col, pred_col, grouping_cols=None, test_size_minimum_per_group=50):
    """
    Calculates survival performance metrics for each combination of grouping columns.

    For every group with more than ``test_size_minimum_per_group`` rows a
    Kaplan-Meier curve is fitted and the metrics from
    :func:`get_metrics_surv` are computed; smaller groups are skipped with a
    warning.

    Args:
        data (pd.DataFrame): DataFrame containing observed times, event indicators, predictions and grouping
            columns. It is not modified.
        time_col (str): Name of the time column.
        event_col (str): Name of the event column.
        pred_col (str): Name of the prediction column (predicted survival time).
        grouping_cols (list, optional): List of column names to group by. Names that are not columns of
            ``data`` are dropped. Defaults to None, in which case the whole frame is evaluated as one
            group identified by a constant ``fake_group`` column.
        test_size_minimum_per_group (int, optional): A group is evaluated only when it has more than this
            many observations. Defaults to 50.
    Returns:
        pd.DataFrame | None: One row per evaluated group with the grouping columns, number of observations
            and performance metrics, rounded to 3 decimals. None when no group could be evaluated.
    Raises:
        ValueError: If ``grouping_cols`` is given but none of its names is a column of ``data``.
    """  # noqa: E501

    # A single fitter instance is re-fitted for every group.
    kmf = KaplanMeierFitter()

    data = data.copy()
    if grouping_cols is None:
        # No grouping requested: evaluate everything as one artificial group.
        data["fake_group"] = 1
        grouping_cols = ["fake_group"]
    else:
        grouping_cols = [col for col in grouping_cols if col in data.columns]
        if not grouping_cols:
            # Grouping by an empty key list fails inside pandas with a
            # ValueError anyway; fail here with a message naming the cause.
            logger.error("No valid grouping columns found!")
            raise ValueError("No valid grouping columns found!")

    results = []
    for name, group in data.groupby(grouping_cols):
        # Depending on the pandas version a single key may arrive as a scalar.
        if not isinstance(name, tuple):
            name = (name,)
        n_observations = len(group)

        if n_observations == 0:
            logger.warning(f"Skipping empty group {name}.")
            continue

        if n_observations <= test_size_minimum_per_group:
            logger.warning(f"Group {name} has too few observations ({n_observations}). Skipping.")
            continue

        kmf.fit(durations=group[time_col], event_observed=group[event_col])
        metrics = get_metrics_surv(group[time_col], group[event_col], group[pred_col])

        group_result = dict(zip(grouping_cols, name, strict=False))
        group_result["n_observations"] = n_observations
        group_result["avg_months"] = np.mean(group[time_col])
        group_result["median_months"] = np.median(group[time_col])
        group_result["km_median_surv_months"] = kmf.median_survival_time_
        group_result["event_rate"] = np.mean(group[event_col])
        group_result["avg_predicted_months"] = np.mean(group[pred_col])
        group_result["median_predicted_months"] = np.median(group[pred_col])
        group_result.update(metrics)
        results.append(group_result)

    if not results:
        logger.warning("No groups found or processed. Returning None.")
        return None

    performance_df = pd.DataFrame(results)
    performance_df = performance_df.round(3)
    return performance_df
494
+
495
+
496
def time_split(data, date_column, test_ratio=0.3, max_date=None):
    """
    Splits the data into training and testing sets based on a date column.

    The date column is converted to calendar dates, the rows are sorted by
    it, and the cutoff is the ``1 - test_ratio`` quantile of the dates. Rows
    strictly before the cutoff form the training set; rows on or after the
    cutoff form the test set. Because whole dates are never split, the actual
    test share can differ from ``test_ratio``.

    The input DataFrame is never modified; in the returned frames the date
    column holds ``datetime.date`` objects.

    Parameters:
    data (pd.DataFrame): The input dataframe to be split.
    date_column (str): The name of the date column to use for splitting.
    test_ratio (float): The ratio of the data to be used for testing,
        strictly between 0 and 1. Default is 0.3.
    max_date (str): Only rows whose date is strictly before this value are
        considered for the split. If None, all data is used. Default is None.

    Returns:
    pd.DataFrame, pd.DataFrame: The training and testing dataframes, sorted by date.
    """

    if test_ratio <= 0 or test_ratio >= 1:
        log_and_raise_error(logger=logger, message="test_ratio must be between 0 and 1!")

    if max_date is not None:
        data = data[data[date_column] < max_date]

    # Always work on a copy: the column assignment below must not leak into
    # the caller's DataFrame.
    data = data.copy()
    data[date_column] = pd.to_datetime(data[date_column]).dt.date

    data = data.sort_values(by=date_column)
    cutoff_date = data[date_column].quantile(1 - test_ratio)

    train_data = data[data[date_column] < cutoff_date]
    test_data = data[data[date_column] >= cutoff_date]

    return train_data, test_data
526
+
527
+
528
def catboost_feature_selection(
    train_df: pd.DataFrame,
    feature_list: list[str],
    target_column: str | list[str],
    cat_features: list[str],
    id_col: str = "id_col",
    num_features_to_select: int | None = None,
    define_best_num_features: bool = False,
    model: CatBoostClassifier | CatBoostRegressor | None = None,
    model_task: str = "classification",
    algorithm: EFeaturesSelectionAlgorithm = None,
    shap_calc_type: EShapCalcType = EShapCalcType.Regular,
    force_to_include: list[str] | None = None,
    steps: int = 15,
    iterations: int = 100,
    random_seed: int = 42,
    logging_level: str = "Verbose",
    min_count_binary: int = 30,
    sample_ratio: float | None = None,
) -> list[str]:
    """
    Perform feature selection using CatBoost's built-in feature selection on a
    group-aware train/validation split.

    Supports classification, regression and survival tasks. The data is split
    into three group-wise folds (stratified for classification and survival);
    one fold is held out as the evaluation set and the other two are used for
    training, so no ``id_col`` value appears on both sides.

    Before selection, features with a single distinct non-null value are
    dropped, as are 0/1 features with fewer than ``min_count_binary`` ones.
    The number of highly correlated numeric pairs is logged for information.

    Parameters:
    - train_df: pd.DataFrame. The training DataFrame (not modified).
    - feature_list: list[str]. List of features to consider for selection (not modified).
    - target_column: str or list[str]. The target column name(s). For survival: ['y_lower', 'y_upper']
    - cat_features: list[str]. List of categorical feature names for CatBoost.
    - id_col: str. Column name for grouping (default: 'id_col').
    - num_features_to_select: int. Number of features to keep. If None, the best number is searched for.
    - define_best_num_features: bool. If True, search for the number of features with the lowest
      evaluation loss; this overrides num_features_to_select. A loss plot is shown during the search.
    - model: CatBoostClassifier or CatBoostRegressor instance. If None, creates appropriate model based on model_task.
    - model_task: str. Type of model task: 'classification', 'regression', or 'survival'.
    - algorithm: EFeaturesSelectionAlgorithm. If None, defaults to RecursiveByShapValues.
    - shap_calc_type: EShapCalcType. Forwarded to CatBoost's select_features.
    - force_to_include: list[str]. Features to always include in final set.
    - steps: int. Number of steps for feature selection.
    - iterations: int. Number of iterations for the CatBoost model.
    - random_seed: int. Random seed for reproducibility.
    - logging_level: str. Logging level for CatBoost.
    - min_count_binary: int. Minimum count for binary features to be included.
    - sample_ratio: float. If provided (e.g., 0.3), sample this ratio of unique IDs using stratified sampling by target for faster iteration.

    Returns:
    - list[str]. Selected features followed by the forced features.
    """  # noqa: E501
    # Validate the task first: it drives the sampling, model and split logic.
    if model_task is None:
        log_and_raise_error(logger, "model_task must be specified as 'classification', 'regression', or 'survival'!")
    if model_task not in ["classification", "regression", "survival"]:
        log_and_raise_error(logger, "model_task must be one of 'classification', 'regression', or 'survival'!")

    feature_list = feature_list.copy()
    train_df = prepare_catboost_data(train_df, cat_features=cat_features, feature_list=feature_list)

    if algorithm is None:
        algorithm = EFeaturesSelectionAlgorithm.RecursiveByShapValues

    # Optionally down-sample by unique ID (stratified) to speed up selection.
    if sample_ratio is not None:
        if sample_ratio <= 0 or sample_ratio >= 1:
            log_and_raise_error(logger, "sample_ratio must be between 0 and 1!")

        # One row per ID, carrying the value used for stratification.
        if isinstance(target_column, list):
            unique_ids_df = train_df.groupby(id_col, as_index=False)[target_column[0]].first()
            if model_task == "survival":
                unique_ids_df[target_column[1]] = train_df.groupby(id_col)[target_column[1]].first().values
                # presumably -1 in the upper bound marks a censored row — confirm against the data prep
                stratify_col = (unique_ids_df[target_column[1]] != -1).astype(int)
            else:
                stratify_col = unique_ids_df[target_column[0]]
        else:
            unique_ids_df = train_df.groupby(id_col, as_index=False)[target_column].first()
            stratify_col = unique_ids_df[target_column]

        sampled_ids, _ = train_test_split(
            unique_ids_df[id_col], train_size=sample_ratio, stratify=stratify_col, random_state=random_seed
        )

        original_size = len(train_df)
        train_df = train_df[train_df[id_col].isin(sampled_ids)].copy()
        logger.info(f"Sampled {len(sampled_ids)} unique IDs ({sample_ratio:.1%}) from {len(unique_ids_df)} total IDs")
        logger.info(
            f"Reduced dataset from {original_size:,} to {len(train_df):,} rows ({len(train_df) / original_size:.1%}) for faster feature selection iteration"  # noqa: E501
        )

    if model is None:
        if model_task == "regression":
            model = CatBoostRegressor(iterations=iterations, loss_function="RMSE", verbose=0, random_seed=random_seed)
        elif model_task == "survival":
            # Survival analysis with AFT loss
            model = CatBoostRegressor(
                iterations=iterations, loss_function="SurvivalAft:dist=Normal", verbose=0, random_seed=random_seed
            )
        else:
            model = CatBoostClassifier(
                iterations=iterations, loss_function="Logloss", verbose=0, random_seed=random_seed
            )

    # Searching for the best count takes precedence over an explicit count
    # and is also the fallback when no count was given.
    if define_best_num_features or num_features_to_select is None:
        num_features_to_select = None
        define_best_num_features = True

    # Remove features with only one distinct non-null value.
    feature_list = [col for col in feature_list if train_df[col].dropna().nunique() != 1]

    def _is_rare_binary(col):
        # True for a 0/1 feature whose value 1 occurs fewer than min_count_binary times.
        non_null = train_df[col].dropna()
        if non_null.nunique() != 2 or set(non_null.unique()) != {0, 1}:
            return False
        return train_df[col].value_counts().get(1, 0) < min_count_binary

    feature_list = [col for col in feature_list if not _is_rare_binary(col)]

    # Informational only: count numeric feature pairs with |correlation| > 0.8.
    num_features = [f for f in feature_list if f not in cat_features]
    corr_values = train_df[num_features].corr().to_numpy()
    below_diagonal = np.tril_indices_from(corr_values, k=-1)
    n_high_corr_pairs = int((np.abs(corr_values[below_diagonal]) > 0.8).sum())
    if n_high_corr_pairs:
        logger.info(f"Found {n_high_corr_pairs} highly correlated feature pairs (|correlation| > 0.8)")
    else:
        logger.info("No highly correlated feature pairs found (|correlation| > 0.8).")

    if force_to_include is None:
        force_to_include = []

    # Forced features are part of the pool but are never candidates for removal.
    selection_feature_list = [f for f in feature_list if f not in force_to_include]
    all_features_for_pool = selection_feature_list + force_to_include
    ncat_features = [c for c in cat_features if c in all_features_for_pool]

    logger.info(
        f"Feature selection: {len(selection_feature_list)} selectable, {len(force_to_include)} forced, {len(ncat_features)} categorical"  # noqa: E501
    )

    # Group-aware 3-fold split so the same ID never lands on both sides.
    groups = train_df[id_col]
    if model_task == "regression":
        splits = list(GroupKFold(n_splits=3).split(train_df, groups=groups))
    else:
        if model_task == "survival":
            # For survival, use event indicator for stratification approximation
            stratify_target = (train_df[target_column[1]] != -1).astype(int)
        else:
            stratify_target = train_df[target_column]
        splits = list(StratifiedGroupKFold(n_splits=3).split(train_df, stratify_target, groups=groups))

    # Hold out the third fold for evaluation and train on the other two.
    # Each split is a (train_indices, test_indices) pair, so taking both
    # halves of the same split keeps the two sets disjoint. Mixing the train
    # halves of other splits would pull the hold-out rows into training.
    train_idx, val_idx = splits[2]

    train_fold = train_df.iloc[train_idx]
    val_fold = train_df.iloc[val_idx]

    train_pool = make_catboost_pool(train_fold, all_features_for_pool, cat_features, label=train_fold[target_column])
    eval_pool = make_catboost_pool(val_fold, all_features_for_pool, cat_features, label=val_fold[target_column])

    # Arguments shared by every select_features call below.
    select_kwargs = {
        "eval_set": eval_pool,
        "features_for_select": selection_feature_list,
        "steps": steps,
        "algorithm": algorithm,
        "shap_calc_type": shap_calc_type,
        "train_final_model": False,
        "logging_level": logging_level,
        "plot": False,
    }

    selected_features_algo = []
    try:
        if define_best_num_features:
            # First pass: eliminate down to one feature to trace the loss
            # as a function of the number of removed features.
            fsummary = model.select_features(train_pool, num_features_to_select=1, **select_kwargs)

            loss_features = pd.DataFrame(
                {
                    "Features_Removed": fsummary["loss_graph"]["removed_features_count"],
                    "Loss": fsummary["loss_graph"]["loss_values"],
                }
            )

            best_idx = loss_features["Loss"].idxmin()
            best_removed = loss_features.loc[best_idx, "Features_Removed"]
            best_features_kept = len(selection_feature_list) - best_removed
            best_loss_value = loss_features.loc[best_idx, "Loss"]

            logger.info(
                f"Optimal features: {best_features_kept} selected + {len(force_to_include)} forced = {best_features_kept + len(force_to_include)} total (loss: {best_loss_value:.6f})"  # noqa: E501
            )

            # plot of losses vs number of features removed
            plt.figure(figsize=(8, 5))
            plt.plot(
                loss_features["Features_Removed"],
                loss_features["Loss"],
                marker="o",
                linestyle="-",
                color="b",
            )
            plt.axvline(
                x=best_removed,
                color="r",
                linestyle="--",
                label="Best number of features",
            )
            plt.title("Feature Selection Loss Graph")
            plt.xlabel("Number of Features Removed")
            plt.ylabel("Loss")
            plt.legend()
            plt.grid()
            plt.show()

            # Second pass: select exactly the best number of features.
            summary = model.select_features(train_pool, num_features_to_select=best_features_kept, **select_kwargs)
        else:
            summary = model.select_features(
                train_pool, num_features_to_select=num_features_to_select, **select_kwargs
            )

        selected_features_algo = summary["selected_features_names"]
        logger.info(f"Selected {len(selected_features_algo)} features by algorithm")

    except Exception as e:
        logger.error(f"Error during feature selection: {e}")
        raise

    # Combine selected features with forced features
    final_features = selected_features_algo + force_to_include

    logger.info(
        f"Final feature set: {len(final_features)} features ({len(selected_features_algo)} selected, {len(force_to_include)} forced)"  # noqa: E501
    )

    return final_features
790
+
791
+
792
def plot_score_bins(
    df,
    prob_col,
    target_col,
    bins=10,
    show_plot=True,
    figsize=(10, 6),
    title_placeholder=None,
    xlabel=None,
    ylabel=None,
    save_path=None,
):
    """
    Bin rows by predicted-score quantile and plot the average target per bin.

    The input DataFrame is copied, a 1-based ``bins`` column is derived from the
    quantiles of ``prob_col``, and the mean of ``target_col`` and ``prob_col`` is
    computed per bin. The per-bin target means are drawn as a bar plot together
    with a dashed horizontal line at the overall target mean.

    Parameters:
        df (pd.DataFrame): DataFrame containing the score and target columns.
        prob_col (str): Name of the column containing predicted probabilities.
        target_col (str): Name of the target column to aggregate.
        bins (int): Number of quantile bins requested. Default is 10. Duplicate
            quantile edges are dropped, so fewer bins may actually be produced.
        show_plot (bool): Whether to display the plot immediately. Default is True.
            When False the figure is closed after the optional save.
        figsize (tuple): Size of the plot. Default is (10, 6).
        title_placeholder (str): Extra text appended to the title after " - ".
            If None, nothing is appended.
        xlabel (str): Label for the x-axis. Defaults to "Score bin".
        ylabel (str): Name of the quantity on the y-axis; the axis label becomes
            "Average <ylabel>". Defaults to a title-cased version of ``target_col``.
        save_path (str): Path to save the plot. If None, the plot is not saved.

    Returns:
        pd.DataFrame: Aggregated DataFrame with one row per bin and the bin means
        of the target and score columns.
    """
    title_suffix = "" if title_placeholder is None else f" - {title_placeholder}"

    df = df.copy()
    df["bins"] = pd.qcut(df[prob_col], q=bins, labels=False, duplicates="drop") + 1
    tab = df.groupby("bins").agg({target_col: "mean", prob_col: "mean"}).reset_index()
    overall_rate = df[target_col].mean()

    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(x=tab["bins"], y=tab[target_col], ax=ax)

    ax.set_xlabel("Score bin" if xlabel is None else xlabel)
    if ylabel is not None:
        ax.set_ylabel(f"Average {ylabel}")
    else:
        # The default label already carries the "Average" prefix and is reused
        # verbatim in the title below.
        ylabel = f"Average {target_col.replace('_', ' ').title()}"
        ax.set_ylabel(ylabel)

    # Bars are drawn at categorical positions 0..n-1, which match the row
    # positions of ``tab`` after ``reset_index``; annotate each bar at its top.
    for position, value in enumerate(tab[target_col]):
        ax.text(
            position,
            value,
            f"{value:.2f}",
            ha="center",
            va="bottom",
            color="black",
            fontweight="bold",
        )

    ax.axhline(y=overall_rate, color="r", linestyle="--", label="Overall target rate")
    # Without a legend the label of the reference line is never displayed.
    ax.legend()

    # Improvement of the highest score bin versus the overall average rate.
    # A zero overall rate would make the ratio meaningless, so report NaN.
    top_bin_rate = tab[target_col].iloc[-1]
    improvement = (top_bin_rate - overall_rate) / overall_rate if overall_rate else float("nan")

    ax.set_title(
        f"{ylabel} by score bin ({bins} bins) {title_suffix}\n"
        f"Improvement (highest bin vs average) : {improvement:.2%}"
    )

    if save_path is not None:
        fig.savefig(save_path)

    if show_plot:
        plt.show()
    else:
        plt.close(fig)

    return tab
884
+
885
+
886
def shap_plot(pipeline, data, features, output_path="shap_summary.png", show_plot=True):
    """
    Create and save a SHAP summary plot using a scikit-learn pipeline.

    The data is passed through the pipeline's 'preprocessor' step, SHAP values
    are computed for the 'classifier' step on the transformed features, and the
    resulting summary plot is written to ``output_path``.

    Parameters:
        pipeline: scikit-learn Pipeline containing a 'preprocessor' and a 'classifier' step
        data: DataFrame used for computing SHAP values; it must contain every
            column listed in ``features``
        features: list of feature column names
        output_path: file path where the plot will be saved
        show_plot: whether to display the plot after it has been saved

    Returns:
        None. The plot is saved to output_path.
    """
    preprocessor = pipeline.named_steps["preprocessor"]
    transformed = preprocessor.transform(data[features])
    # Some preprocessors (e.g. one-hot encoders) may return a scipy sparse
    # matrix; densify it so the DataFrame constructor receives a regular array.
    if hasattr(transformed, "toarray"):
        transformed = transformed.toarray()
    # Names are used as emitted by the preprocessor (they may carry transformer
    # prefixes such as 'cat__' / 'num__').
    feature_names = preprocessor.get_feature_names_out()

    df = pd.DataFrame(transformed, columns=feature_names)

    model = pipeline.named_steps["classifier"]
    explainer = shap.Explainer(model)
    shap_values = explainer.shap_values(df)

    plt.figure()
    # Always draw with show=False: letting SHAP call plt.show() before the
    # figure is saved can leave an empty canvas for savefig.
    shap.summary_plot(shap_values, df, show=False)
    plt.tight_layout()
    plt.savefig(output_path)
    if show_plot:
        plt.show()
    plt.close()
916
+
917
+
918
def plot_trend(
    data,
    x_col,
    y_col,
    hue_col,
    xlabel="",
    ylabel=None,
    title=None,
    figsize=(12, 6),
    rotation=65,
    vline_date=None,
    show_grid=False,
    show_plot=True,
    save_path=None,
):
    """
    Generates a line plot for time series data, grouped by a hue column.

    The x-axis uses monthly major ticks formatted as ``YYYY-MM``.

    Args:
        data (pd.DataFrame): DataFrame containing the data to plot (copied internally).
        x_col (str): Name of the column for the x-axis. Period columns are converted
            to timestamps; any other column is passed through ``pd.to_datetime``.
        y_col (str): Name of the column for the y-axis.
        hue_col (str): Name of the column to group and color lines by.
        xlabel (str, optional): Label for the x-axis. Defaults to ''.
        ylabel (str, optional): Label for the y-axis. Defaults to y_col name.
        title (str, optional): Title for the plot. No title is set when empty or None.
        figsize (tuple, optional): Figure size. Defaults to (12, 6).
        rotation (int, optional): Rotation angle for x-axis labels. Defaults to 65.
        vline_date (str, optional): Date string (e.g., 'YYYY-MM-DD') to draw a
            vertical line. Defaults to None. An unparseable value is logged and skipped.
        show_grid (bool, optional): Whether to display the background grid. Defaults to False.
        show_plot (bool, optional): Whether to display the plot using plt.show(). Defaults to True.
        save_path (str, optional): Path to save the figure. If None, figure is not saved. Defaults to None.
    """
    if ylabel is None:
        ylabel = y_col

    data = data.copy()
    # ``.dt.to_timestamp()`` only exists for Period columns; datetime64 (or
    # string) columns would raise AttributeError, so convert those separately.
    if isinstance(data[x_col].dtype, pd.PeriodDtype):
        data[x_col] = data[x_col].dt.to_timestamp()
    else:
        data[x_col] = pd.to_datetime(data[x_col])

    plt.figure(figsize=figsize)
    ax = sns.lineplot(data=data, x=x_col, y=y_col, hue=hue_col, marker="o")

    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))

    # Add vertical line if specified; only the parsing step is guarded so that
    # genuine plotting errors are not reported as a date-parsing problem.
    if vline_date:
        try:
            vline_dt = pd.to_datetime(vline_date)
        except ValueError:
            logger.warning(f"Could not parse vline_date '{vline_date}'. Skipping vertical line.")
        else:
            plt.axvline(vline_dt, color="red", linestyle="--")

    plt.grid(show_grid)
    plt.xticks(rotation=rotation, ha="right")
    plt.legend(title=None)
    if title:
        plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)
        logger.info(f"Plot saved to {save_path}")

    if show_plot:
        plt.show()
    else:
        # Close the figure when it is not displayed to free memory.
        plt.close()