leakproof-ml 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. leakproof_ml-0.0.1/.gitignore +9 -0
  2. leakproof_ml-0.0.1/LICENSE.txt +21 -0
  3. leakproof_ml-0.0.1/PKG-INFO +176 -0
  4. leakproof_ml-0.0.1/README.md +143 -0
  5. leakproof_ml-0.0.1/pyproject.toml +64 -0
  6. leakproof_ml-0.0.1/src/leakproof_ml/__init__.py +3 -0
  7. leakproof_ml-0.0.1/src/leakproof_ml/interpretability/__init__.py +2 -0
  8. leakproof_ml-0.0.1/src/leakproof_ml/interpretability/_explainer_utils.py +92 -0
  9. leakproof_ml-0.0.1/src/leakproof_ml/interpretability/analysis.py +40 -0
  10. leakproof_ml-0.0.1/src/leakproof_ml/interpretability/explainer.py +381 -0
  11. leakproof_ml-0.0.1/src/leakproof_ml/modeling/__init__.py +1 -0
  12. leakproof_ml-0.0.1/src/leakproof_ml/modeling/_model_utils.py +366 -0
  13. leakproof_ml-0.0.1/src/leakproof_ml/modeling/training.py +290 -0
  14. leakproof_ml-0.0.1/src/leakproof_ml/plots/__init__.py +4 -0
  15. leakproof_ml-0.0.1/src/leakproof_ml/plots/_plots_utils.py +95 -0
  16. leakproof_ml-0.0.1/src/leakproof_ml/plots/explainer_plots.py +262 -0
  17. leakproof_ml-0.0.1/src/leakproof_ml/plots/metric_plots.py +278 -0
  18. leakproof_ml-0.0.1/src/leakproof_ml/preprocessing/__init__.py +3 -0
  19. leakproof_ml-0.0.1/src/leakproof_ml/preprocessing/_pipeline_utils.py +19 -0
  20. leakproof_ml-0.0.1/src/leakproof_ml/preprocessing/cleaning.py +58 -0
  21. leakproof_ml-0.0.1/src/leakproof_ml/preprocessing/pipeline.py +69 -0
  22. leakproof_ml-0.0.1/src/leakproof_ml/preprocessing/selector.py +110 -0
  23. leakproof_ml-0.0.1/src/leakproof_ml/tuning/__init__.py +1 -0
  24. leakproof_ml-0.0.1/src/leakproof_ml/tuning/_tuning_utils.py +204 -0
  25. leakproof_ml-0.0.1/src/leakproof_ml/tuning/tuner.py +356 -0
  26. leakproof_ml-0.0.1/src/leakproof_ml/utils/__init__.py +1 -0
  27. leakproof_ml-0.0.1/src/leakproof_ml/utils/io_utils.py +119 -0
  28. leakproof_ml-0.0.1/src/leakproof_ml/validation/__init__.py +1 -0
  29. leakproof_ml-0.0.1/src/leakproof_ml/validation/splitters.py +78 -0
@@ -0,0 +1,9 @@
+ # Virtual Environment
+ venv/
+ virtualMachine/
+
+ catboost_info/
+
+ __pycache__/
+
+ .pytest_cache/
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Alexei Ortiz
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,176 @@
+ Metadata-Version: 2.4
+ Name: leakproof-ml
+ Version: 0.0.1
+ Summary: ML framework to avoid most common sources of data leakage
+ Project-URL: Homepage, https://github.com/ORALEM00/leakproof-ml.git
+ Author-email: Alexei Ortiz <alex.lztb@gmail.com>
+ License: MIT
+ License-File: LICENSE.txt
+ Keywords: cross-validation,data-leakage,framework,group-kfold,machine-learning,materials-informatics,model-evaluation
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: >=3.9
+ Requires-Dist: numpy<3.0,>=1.21
+ Requires-Dist: optuna>=3.0
+ Requires-Dist: pandas>=1.5
+ Requires-Dist: scikit-learn>=1.1
+ Provides-Extra: dev
+ Requires-Dist: flake8; extra == 'dev'
+ Requires-Dist: pytest; extra == 'dev'
+ Provides-Extra: full
+ Requires-Dist: matplotlib>=3.5; extra == 'full'
+ Requires-Dist: seaborn>=0.12; extra == 'full'
+ Requires-Dist: shap>=0.43; extra == 'full'
+ Provides-Extra: plots
+ Requires-Dist: matplotlib>=3.5; extra == 'plots'
+ Requires-Dist: seaborn>=0.12; extra == 'plots'
+ Provides-Extra: shap
+ Requires-Dist: shap>=0.43; extra == 'shap'
+ Description-Content-Type: text/markdown
+
+
+ # Leakproof ML
+
+ **Leakproof ML** is an open-source, flexible, and easy-to-use Python package designed to systematically prevent data leakage across the complete modelling process. It focuses on the most common sources of data leakage, which arise from improper validation strategies and inadequate isolation between training and test data.
+
+ ## Install
+
+ Leakproof ML can be installed from [PyPI](https://pypi.org/project/leakproof-ml/):
+
+ <pre>
+ pip install leakproof_ml
+ </pre>
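+
+ Optional extras declared in `pyproject.toml` add plotting and SHAP support. The plotting helpers used in the examples below likely need one of them, since matplotlib and seaborn are not core dependencies:
+
+ <pre>
+ pip install "leakproof_ml[plots]"   # matplotlib + seaborn
+ pip install "leakproof_ml[full]"    # plots + shap
+ </pre>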
+
+ ## Data Leakage Framework
+ Leakproof ML provides a unified, leakage-aware framework across its main functionalities. It enforces a standardized implementation of ML workflows so that preprocessing, feature selection, tuning, and fitting are performed exclusively on the training sets, while promoting splitting strategies aligned with the structure of the data.
+
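+ As a minimal illustration of the pattern the framework enforces (shown here with plain scikit-learn rather than Leakproof ML's own API, and assuming `X`, `y`, and `groups` as defined in the Quick start below), every fitted preprocessing step lives inside the pipeline, so it is re-fit on each fold's training split rather than once on the full dataset:
+
+ ```python
+ # Illustrative sketch only (plain scikit-learn, not the Leakproof ML API):
+ # because the scaler sits inside the pipeline, each CV fold fits it on that
+ # fold's training split only, never on the held-out data.
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.linear_model import Ridge
+ from sklearn.model_selection import GroupKFold, cross_val_score
+
+ pipe = Pipeline([("scaler", StandardScaler()), ("model", Ridge())])
+ scores = cross_val_score(pipe, X, y, cv=GroupKFold(n_splits=5), groups=groups)
+ ```
+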
+ ## Quick start
+ The software provides three main functionalities integrated into the data leakage framework: training, tuning, and interpretability.
+
+ Each functionality can be applied both to a standard single train-test split and to a cross-validation implementation for small-data cases.
+
+ ```python
+ # Setup for the examples
+ import pandas as pd
+ from xgboost import XGBRegressor
+ from leakproof_ml.validation import ShuffledGroupKFold
+
+ df = pd.read_csv("data.csv")
+
+ X = df.drop(columns=["target", "group_id"])
+ y = df["target"]
+ groups = df["group_id"]
+
+ # Group-based splitter for this example
+ # (any compatible splitter can be used)
+ splitter = ShuffledGroupKFold(n_splits=10, random_state=42)
+ ```
+
+ ### Training
+ The simplest functionality: fit a model on the dataset while avoiding data leakage.
+ ```python
+ from leakproof_ml import cv_analysis
+ from leakproof_ml.plots import plot_predictions
+
+ # The model class is passed as a parameter
+ # Results are gathered in dictionary format
+ results = cv_analysis(X, y, XGBRegressor, splitter, groups=groups, params={"max_depth": 4})
+
+ plot_predictions(results['y_true'], results['y_predict'])
+ ```
+ <p align="center">
+ <img width="616" src="./resulting_plots/XGBRegressor/metrics/groupedCV_predictions.png" />
+ </p>
+
+ ### Tuning
+ For hyperparameter optimization, Leakproof ML employs the Tree-structured Parzen Estimator algorithm implemented in the Optuna library.
+
+ In the train-test setting, CV is applied on the training set to optimize the parameters, which are then evaluated on the held-out test set. In contrast, for the CV setting, Leakproof ML implements a nested CV scheme to avoid the optimistic bias that can arise when parameters are tuned on the entire dataset.
+ ```python
+ from leakproof_ml.tuning import nested_cv_tunning
+
+ # Nested CV requires an additional inner splitter
+ inner_splitter = ShuffledGroupKFold(n_splits=3, random_state=42)
+
+ # A function accepting an Optuna trial and returning the search space
+ def search_space(trial):
+     return {
+         "max_depth": trial.suggest_int("max_depth", 2, 5),
+         "subsample": trial.suggest_float("subsample", 0.6, 1.0),
+     }
+
+ # Also returns the optimized set of parameters
+ results = nested_cv_tunning(X, y, XGBRegressor, splitter, inner_splitter, search_space, groups=groups)
+ ```
+
+ ### Interpretability
+ To extract physical insights and underlying mechanisms from data-driven models, Leakproof ML uses two global, model-agnostic interpretability methods, permutation importance (PI) and SHAP, which allow quantification of the magnitude and direction of feature influence. By default, PI is used.
+ ```python
+ from leakproof_ml.interpretability import cv_interpretability
+ from leakproof_ml.plots import plot_interpretability_bar
+
+ results = cv_interpretability(X, y, XGBRegressor, splitter, groups=groups)
+
+ plot_interpretability_bar(results)
+ ```
+ <p align="center">
+ <img width="616" src="./resulting_plots/XGBRegressor/pi/pi_groupedCV.png" />
+ </p>
+
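+ For reference, this is roughly what the underlying permutation-importance computation looks like for a single group-aware split in plain scikit-learn (an illustrative sketch, not the package's internal implementation; `X`, `y`, and `groups` come from the Quick start above):
+
+ ```python
+ from sklearn.inspection import permutation_importance
+ from sklearn.model_selection import GroupShuffleSplit
+
+ # Hold out one group-aware test split and fit the model on the rest
+ train_idx, test_idx = next(GroupShuffleSplit(n_splits=1, random_state=42).split(X, y, groups))
+ model = XGBRegressor(max_depth=4).fit(X.iloc[train_idx], y.iloc[train_idx])
+
+ # Permute each feature on the held-out split and measure the drop in score
+ pi = permutation_importance(model, X.iloc[test_idx], y.iloc[test_idx], n_repeats=10, random_state=42)
+ print(pi.importances_mean)
+ ```
+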
+ ## Custom Pipeline
+ In addition to the default pipelines used by the functions, any custom pipeline can be supplied. To do so, pass a function that returns the pipeline as a parameter to the functions; the final step of the pipeline must always define the model as ('model', model).
+
+ ```python
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import StandardScaler, PolynomialFeatures
+ from sklearn.impute import SimpleImputer
+ from sklearn.compose import ColumnTransformer, make_column_selector
+ from leakproof_ml import cv_analysis
+
+ # Custom pipeline factory
+ def polynomial_custom_factory(model, degree=2):
+     numeric_pipe = Pipeline(steps=[
+         ('imputer', SimpleImputer(strategy='median')),
+         ('scaler', StandardScaler())
+     ])
+     preprocessor = ColumnTransformer(
+         transformers=[
+             ('num', numeric_pipe, make_column_selector(dtype_include='float64')),
+         ],
+         remainder='passthrough'
+     )
+
+     # Pipeline steps
+     pipe = Pipeline(steps=[
+         ('preprocessor', preprocessor),
+         ('poly', PolynomialFeatures(degree=degree)),
+         ('model', model)
+     ])
+     return pipe
+
+ results = cv_analysis(X, y, XGBRegressor, splitter, groups=groups, params={"max_depth": 4}, pipeline_factory=polynomial_custom_factory)
+ ```
+
+ ## Citation
+ If used in a research project, please cite the paper "Leakproof ML: Data Leakage Prevention with a Robust, Interpretable, and Reproducible Machine Learning Framework":
+
+ <details open>
+ <summary>BibTeX</summary>
+
+ ```bibtex
+ @inproceedings{,
+   title={},
+   author={},
+   booktitle={},
+   pages={},
+   year={}
+ }
+ ```
+ </details>
+
+
+ ## License
+
+ MIT License (see [LICENSE](./LICENSE.txt)).
@@ -0,0 +1,143 @@
+
+ # Leakproof ML
+
+ **Leakproof ML** is an open-source, flexible, and easy-to-use Python package designed to systematically prevent data leakage across the complete modelling process. It focuses on the most common sources of data leakage, which arise from improper validation strategies and inadequate isolation between training and test data.
+
+ ## Install
+
+ Leakproof ML can be installed from [PyPI](https://pypi.org/project/leakproof-ml/):
+
+ <pre>
+ pip install leakproof_ml
+ </pre>
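+
+ Optional extras declared in `pyproject.toml` add plotting and SHAP support. The plotting helpers used in the examples below likely need one of them, since matplotlib and seaborn are not core dependencies:
+
+ <pre>
+ pip install "leakproof_ml[plots]"   # matplotlib + seaborn
+ pip install "leakproof_ml[full]"    # plots + shap
+ </pre>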
+
+ ## Data Leakage Framework
+ Leakproof ML provides a unified, leakage-aware framework across its main functionalities. It enforces a standardized implementation of ML workflows so that preprocessing, feature selection, tuning, and fitting are performed exclusively on the training sets, while promoting splitting strategies aligned with the structure of the data.
+
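+ As a minimal illustration of the pattern the framework enforces (shown here with plain scikit-learn rather than Leakproof ML's own API, and assuming `X`, `y`, and `groups` as defined in the Quick start below), every fitted preprocessing step lives inside the pipeline, so it is re-fit on each fold's training split rather than once on the full dataset:
+
+ ```python
+ # Illustrative sketch only (plain scikit-learn, not the Leakproof ML API):
+ # because the scaler sits inside the pipeline, each CV fold fits it on that
+ # fold's training split only, never on the held-out data.
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.linear_model import Ridge
+ from sklearn.model_selection import GroupKFold, cross_val_score
+
+ pipe = Pipeline([("scaler", StandardScaler()), ("model", Ridge())])
+ scores = cross_val_score(pipe, X, y, cv=GroupKFold(n_splits=5), groups=groups)
+ ```
+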
+ ## Quick start
+ The software provides three main functionalities integrated into the data leakage framework: training, tuning, and interpretability.
+
+ Each functionality can be applied both to a standard single train-test split and to a cross-validation implementation for small-data cases.
+
+ ```python
+ # Setup for the examples
+ import pandas as pd
+ from xgboost import XGBRegressor
+ from leakproof_ml.validation import ShuffledGroupKFold
+
+ df = pd.read_csv("data.csv")
+
+ X = df.drop(columns=["target", "group_id"])
+ y = df["target"]
+ groups = df["group_id"]
+
+ # Group-based splitter for this example
+ # (any compatible splitter can be used)
+ splitter = ShuffledGroupKFold(n_splits=10, random_state=42)
+ ```
+
+ ### Training
+ The simplest functionality: fit a model on the dataset while avoiding data leakage.
+ ```python
+ from leakproof_ml import cv_analysis
+ from leakproof_ml.plots import plot_predictions
+
+ # The model class is passed as a parameter
+ # Results are gathered in dictionary format
+ results = cv_analysis(X, y, XGBRegressor, splitter, groups=groups, params={"max_depth": 4})
+
+ plot_predictions(results['y_true'], results['y_predict'])
+ ```
+ <p align="center">
+ <img width="616" src="./resulting_plots/XGBRegressor/metrics/groupedCV_predictions.png" />
+ </p>
+
+ ### Tuning
+ For hyperparameter optimization, Leakproof ML employs the Tree-structured Parzen Estimator algorithm implemented in the Optuna library.
+
+ In the train-test setting, CV is applied on the training set to optimize the parameters, which are then evaluated on the held-out test set. In contrast, for the CV setting, Leakproof ML implements a nested CV scheme to avoid the optimistic bias that can arise when parameters are tuned on the entire dataset.
+ ```python
+ from leakproof_ml.tuning import nested_cv_tunning
+
+ # Nested CV requires an additional inner splitter
+ inner_splitter = ShuffledGroupKFold(n_splits=3, random_state=42)
+
+ # A function accepting an Optuna trial and returning the search space
+ def search_space(trial):
+     return {
+         "max_depth": trial.suggest_int("max_depth", 2, 5),
+         "subsample": trial.suggest_float("subsample", 0.6, 1.0),
+     }
+
+ # Also returns the optimized set of parameters
+ results = nested_cv_tunning(X, y, XGBRegressor, splitter, inner_splitter, search_space, groups=groups)
+ ```
+
+ ### Interpretability
+ To extract physical insights and underlying mechanisms from data-driven models, Leakproof ML uses two global, model-agnostic interpretability methods, permutation importance (PI) and SHAP, which allow quantification of the magnitude and direction of feature influence. By default, PI is used.
+ ```python
+ from leakproof_ml.interpretability import cv_interpretability
+ from leakproof_ml.plots import plot_interpretability_bar
+
+ results = cv_interpretability(X, y, XGBRegressor, splitter, groups=groups)
+
+ plot_interpretability_bar(results)
+ ```
+ <p align="center">
+ <img width="616" src="./resulting_plots/XGBRegressor/pi/pi_groupedCV.png" />
+ </p>
+
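+ For reference, this is roughly what the underlying permutation-importance computation looks like for a single group-aware split in plain scikit-learn (an illustrative sketch, not the package's internal implementation; `X`, `y`, and `groups` come from the Quick start above):
+
+ ```python
+ from sklearn.inspection import permutation_importance
+ from sklearn.model_selection import GroupShuffleSplit
+
+ # Hold out one group-aware test split and fit the model on the rest
+ train_idx, test_idx = next(GroupShuffleSplit(n_splits=1, random_state=42).split(X, y, groups))
+ model = XGBRegressor(max_depth=4).fit(X.iloc[train_idx], y.iloc[train_idx])
+
+ # Permute each feature on the held-out split and measure the drop in score
+ pi = permutation_importance(model, X.iloc[test_idx], y.iloc[test_idx], n_repeats=10, random_state=42)
+ print(pi.importances_mean)
+ ```
+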
+ ## Custom Pipeline
+ In addition to the default pipelines used by the functions, any custom pipeline can be supplied. To do so, pass a function that returns the pipeline as a parameter to the functions; the final step of the pipeline must always define the model as ('model', model).
+
+ ```python
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import StandardScaler, PolynomialFeatures
+ from sklearn.impute import SimpleImputer
+ from sklearn.compose import ColumnTransformer, make_column_selector
+ from leakproof_ml import cv_analysis
+
+ # Custom pipeline factory
+ def polynomial_custom_factory(model, degree=2):
+     numeric_pipe = Pipeline(steps=[
+         ('imputer', SimpleImputer(strategy='median')),
+         ('scaler', StandardScaler())
+     ])
+     preprocessor = ColumnTransformer(
+         transformers=[
+             ('num', numeric_pipe, make_column_selector(dtype_include='float64')),
+         ],
+         remainder='passthrough'
+     )
+
+     # Pipeline steps
+     pipe = Pipeline(steps=[
+         ('preprocessor', preprocessor),
+         ('poly', PolynomialFeatures(degree=degree)),
+         ('model', model)
+     ])
+     return pipe
+
+ results = cv_analysis(X, y, XGBRegressor, splitter, groups=groups, params={"max_depth": 4}, pipeline_factory=polynomial_custom_factory)
+ ```
+
+ ## Citation
+ If used in a research project, please cite the paper "Leakproof ML: Data Leakage Prevention with a Robust, Interpretable, and Reproducible Machine Learning Framework":
+
+ <details open>
+ <summary>BibTeX</summary>
+
+ ```bibtex
+ @inproceedings{,
+   title={},
+   author={},
+   booktitle={},
+   pages={},
+   year={}
+ }
+ ```
+ </details>
+
+
+ ## License
+
+ MIT License (see [LICENSE](./LICENSE.txt)).
@@ -0,0 +1,64 @@
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "leakproof-ml"
+ version = "0.0.1"
+ authors = [
+   { name="Alexei Ortiz", email="alex.lztb@gmail.com" },
+ ]
+ description = "ML framework to avoid most common sources of data leakage"
+ readme = "README.md"
+ requires-python = ">=3.9"
+ classifiers = [
+     "Programming Language :: Python :: 3",
+     "Operating System :: OS Independent",
+     "Intended Audience :: Science/Research",
+     "License :: OSI Approved :: MIT License",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence"
+ ]
+ license = { text = "MIT" }
+
+ dependencies = [
+     "numpy>=1.21,<3.0",
+     "pandas>=1.5",
+     "scikit-learn>=1.1",
+     "optuna>=3.0"
+ ]
+
+ keywords = [
+     "machine-learning",
+     "data-leakage",
+     "framework",
+     "cross-validation",
+     "group-kfold",
+     "materials-informatics",
+     "model-evaluation"
+ ]
+
+
+ [project.optional-dependencies]
+ shap = ["shap>=0.43"]
+ plots = ["matplotlib>=3.5", "seaborn>=0.12"]
+ full = ["shap>=0.43", "matplotlib>=3.5", "seaborn>=0.12"]
+ dev = ["pytest", "flake8"]
+
+
+ [project.urls]
+ Homepage = "https://github.com/ORALEM00/leakproof-ml.git"
+
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/leakproof_ml"]
+
+ [tool.hatch.build.targets.sdist]
+ include = [
+     "src/leakproof_ml",
+     "pyproject.toml",
+     "README.md",
+     "LICENSE.txt",
+ ]
+ exclude = [
+     "virtualMachine/",
+ ]
@@ -0,0 +1,3 @@
+ __version__ = "0.0.1"
+
+ from .modeling import train_test_analysis, cv_analysis
@@ -0,0 +1,2 @@
+ from .explainer import train_test_interpretability, cv_interpretability
+ from .analysis import get_stable_features
@@ -0,0 +1,92 @@
+ import warnings
+
+
+ def _shap_analysis(pipeline, X_train, X_test, background_size, random_state):
+     """
+     Compute SHAP values for a fitted model within a preprocessing pipeline.
+
+     This internal utility function applies SHAP-based interpretability to the
+     trained model contained in a scikit-learn ``Pipeline``. The analysis is
+     performed on data that has already been transformed by the preprocessing
+     steps of the pipeline, ensuring that explanations correspond to the actual
+     feature representation seen by the model.
+
+     The function uses ``shap.Explainer`` for fast, model-specific
+     explanations (e.g., ``TreeExplainer`` or ``LinearExplainer``). If the model
+     type is not supported or automatic detection fails, it falls back to
+     ``KernelExplainer``, issuing a warning due to its higher computational cost.
+
+     Parameters
+     ----------
+     pipeline : sklearn.pipeline.Pipeline
+         A fitted scikit-learn pipeline. The final step must be a predictive
+         estimator exposing a ``predict`` method and is assumed to be named
+         ``'model'``.
+     X_train : pandas.DataFrame or numpy.ndarray of shape (n_train_samples, n_features)
+         The transformed training data used as background for SHAP value
+         estimation. This data is assumed to be the output of the pipeline
+         preprocessing steps.
+     X_test : pandas.DataFrame or numpy.ndarray of shape (n_test_samples, n_features)
+         The transformed test data for which SHAP values are computed.
+     background_size : int or None
+         Number of samples from ``X_train`` to use as background data when
+         ``KernelExplainer`` is employed. If ``None``, all training samples are
+         used. Reducing this value can significantly decrease computation time
+         at the cost of higher variance in SHAP estimates.
+     random_state : int
+         Random seed used when subsampling the background data for
+         ``KernelExplainer``.
+
+     Returns
+     -------
+     shap_values : numpy.ndarray
+         Array of SHAP values with shape
+         ``(n_test_samples, n_features)``, representing the contribution of each
+         transformed feature to the model prediction for each test sample.
+
+     Notes
+     -----
+     - SHAP values are computed in the *transformed feature space*, not on the
+       raw input features. As a result, explanations reflect the learned feature
+       representation produced by the preprocessing pipeline.
+     - For tree-based models, SHAP values are computed using path-dependent
+       expectations, which are robust to feature correlations introduced by
+       preprocessing.
+     - For model-agnostic explanations (``KernelExplainer``), SHAP relies on
+       background data sampling and may produce high-variance attributions in
+       small-data or highly correlated feature settings.
+     """
+     try:
+         import shap
+     except ImportError:
+         raise ImportError(
+             "The 'shap' library is required for SHAP analysis. "
+             "Please install it via 'pip install shap'."
+         )
+
+     # Attempt to use the fast SHAP Explainer
+     try:
+         explainer = shap.Explainer(pipeline.named_steps['model'], X_train, seed=random_state)
+         sv = explainer.shap_values(X_test)
+     # Fall back to KernelExplainer for ensembles or custom estimators
+     except Exception:
+         warnings.warn(
+             "Falling back to SHAP KernelExplainer. "
+             "This may be slower for large datasets or complex models. "
+             "Runtime can be reduced by setting `shap_background_size`.",
+             UserWarning
+         )
+
+         # Wrapper function exposing the fitted model's predict method
+         def model_predict(data):
+             return pipeline.named_steps['model'].predict(data)
+
+         # Reduce background data size if requested to optimize execution time
+         if (background_size is not None) and (background_size < X_train.shape[0]):
+             # Use a subset of the training data as background
+             X_train = shap.sample(X_train, background_size, random_state=random_state)
+
+         explainer = shap.KernelExplainer(model_predict, X_train, seed=random_state)
+         sv = explainer.shap_values(X_test)
+
+     return sv
@@ -0,0 +1,40 @@
+ import numpy as np
+
+
+
+ def get_stable_features(features, threshold=0.5):
+     """
+     Identify features that consistently appear across multiple cross-validation folds.
+
+     In nested cross-validation, feature selection (like Correlation Selection)
+     is performed inside each fold, which results in a different set of features
+     per fold. This function identifies "stable" features: those that were
+     selected frequently enough to meet the specified threshold.
+
+     Parameters
+     ----------
+     features : list of list of str
+         A nested list where each inner list contains the names of features
+         selected in a specific fold.
+         Example: [['feat_A', 'feat_B'], ['feat_A', 'feat_C'], ['feat_A']]
+     threshold : float, default=0.5
+         The minimum fraction of folds (from 0.0 to 1.0) a feature must
+         appear in to be considered stable. For example, with 5 folds and
+         a 0.6 threshold, a feature must appear in at least 3 folds.
+
+     Returns
+     -------
+     stable_features : list of str
+         An alphabetically sorted list of feature names that met or exceeded
+         the selection frequency threshold.
+
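+     Examples
+     --------
+     Illustrative call (hypothetical feature names; the expected output follows
+     from the counting rule described above):
+
+     >>> folds = [['feat_A', 'feat_B'], ['feat_A', 'feat_C'], ['feat_A', 'feat_B'], ['feat_A']]
+     >>> get_stable_features(folds, threshold=0.5)
+     ['feat_A', 'feat_B']
+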
+     """
+     selected_features = np.array([f for sublist in features for f in sublist])
+     unique, counts = np.unique(selected_features, return_counts=True)
+
+     n_folds = len(features)
+     # Minimum number of folds a feature must appear in, rounded down from the threshold
+     min_appearances = int(n_folds * threshold)
+
+     stable_features = [str(col) for col, count in zip(unique, counts) if count >= min_appearances]
+     return stable_features