lambda-guard-boosting 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lambda_guard_boosting-0.2.2/LICENSE.md +34 -0
- lambda_guard_boosting-0.2.2/PKG-INFO +229 -0
- lambda_guard_boosting-0.2.2/README.md +201 -0
- lambda_guard_boosting-0.2.2/lambda_guard_boosting.egg-info/PKG-INFO +229 -0
- lambda_guard_boosting-0.2.2/lambda_guard_boosting.egg-info/SOURCES.txt +14 -0
- lambda_guard_boosting-0.2.2/lambda_guard_boosting.egg-info/dependency_links.txt +1 -0
- lambda_guard_boosting-0.2.2/lambda_guard_boosting.egg-info/requires.txt +8 -0
- lambda_guard_boosting-0.2.2/lambda_guard_boosting.egg-info/top_level.txt +2 -0
- lambda_guard_boosting-0.2.2/lambdaguard/__init__.py +18 -0
- lambda_guard_boosting-0.2.2/lambdaguard/cusum.py +92 -0
- lambda_guard_boosting-0.2.2/lambdaguard/lambdaguard.py +58 -0
- lambda_guard_boosting-0.2.2/lambdaguard/ofi.py +157 -0
- lambda_guard_boosting-0.2.2/pyproject.toml +41 -0
- lambda_guard_boosting-0.2.2/setup.cfg +4 -0
- lambda_guard_boosting-0.2.2/tests/test_cusum.py +21 -0
- lambda_guard_boosting-0.2.2/tests/test_ofi.py +26 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+

|
|
2
|
+
© 2026 **Fabrizio Di Sciorio, PhD**
|
|
3
|
+
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to **deal in the Software without restriction**, including without limitation the rights to:
|
|
8
|
+
|
|
9
|
+
- ✅ Use
|
|
10
|
+
- ✅ Copy
|
|
11
|
+
- ✅ Modify
|
|
12
|
+
- ✅ Merge
|
|
13
|
+
- ✅ Publish
|
|
14
|
+
- ✅ Distribute
|
|
15
|
+
- ✅ Sublicense
|
|
16
|
+
- ✅ Sell copies of the Software
|
|
17
|
+
|
|
18
|
+
and to permit persons to whom the Software is furnished to do so, **subject to the following conditions**:
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
The above copyright notice and this permission notice shall be included in **all copies or substantial portions of the Software**.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
### ⚠️ Disclaimer
|
|
27
|
+
|
|
28
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF:
|
|
29
|
+
|
|
30
|
+
- MERCHANTABILITY
|
|
31
|
+
- FITNESS FOR A PARTICULAR PURPOSE
|
|
32
|
+
- NONINFRINGEMENT
|
|
33
|
+
|
|
34
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lambda-guard-boosting
|
|
3
|
+
Version: 0.2.2
|
|
4
|
+
Summary: Overfitting detection for Gradient Boosting models using λ-Guard methodology.
|
|
5
|
+
Author-email: "Fabrizio Di Sciorio, PhD" <fabriziodisciorio91@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/faberBI/lambdaguard
|
|
8
|
+
Project-URL: Documentation, https://github.com/faberBI/lambdaguard
|
|
9
|
+
Project-URL: BugTracker, https://github.com/faberBI/lambdaguard/issues
|
|
10
|
+
Keywords: machine-learning,gradient-boosting,overfitting,boosting,lambda-guard
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE.md
|
|
19
|
+
Requires-Dist: numpy<2.2,>=1.26
|
|
20
|
+
Requires-Dist: pandas<3.0,>=2.2
|
|
21
|
+
Requires-Dist: scikit-learn<2.0,>=1.3
|
|
22
|
+
Requires-Dist: matplotlib<4.0,>=3.8
|
|
23
|
+
Requires-Dist: seaborn<0.14,>=0.12
|
|
24
|
+
Requires-Dist: xgboost<4.0,>=1.7
|
|
25
|
+
Requires-Dist: lightgbm<5.0,>=4.4
|
|
26
|
+
Requires-Dist: catboost<2.0,>=1.1
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
<p align="center">
|
|
30
|
+
<img src="docs/logo.png" alt="λ-Guard" width="160"/>
|
|
31
|
+
</p>
|
|
32
|
+
|
|
33
|
+
<p align="center">
|
|
34
|
+
<strong>Overfitting detection for Gradient Boosting</strong> — <em>no validation set required</em><br>
|
|
35
|
+
<i>Detect the moment when your model stops learning signal and starts memorizing structure.</i>
|
|
36
|
+
</p>
|
|
37
|
+
|
|
38
|
+
<p align="center">
|
|
39
|
+
<a href="https://github.com/faberBI/lambdaguard/actions/workflows/tests.yml">
|
|
40
|
+
<img src="https://img.shields.io/github/actions/workflow/status/faberBI/lambdaguard/tests.yml?branch=main&logo=github" alt="Tests Status">
|
|
41
|
+
</a>
|
|
42
|
+
<a href="https://coveralls.io/github/faberBI/lambdaguard">
|
|
43
|
+
<img src="https://img.shields.io/coveralls/github/faberBI/lambdaguard/main.svg" alt="Coverage Status">
|
|
44
|
+
</a>
|
|
45
|
+
<a href="https://pypi.org/project/lambdaguard/">
|
|
46
|
+
<img src="https://img.shields.io/pypi/v/lambdaguard?logo=python" alt="PyPI Version">
|
|
47
|
+
</a>
|
|
48
|
+
<a href="https://opensource.org/licenses/MIT">
|
|
49
|
+
<img src="https://img.shields.io/badge/License-MIT-green.svg" alt="License MIT">
|
|
50
|
+
</a>
|
|
51
|
+
</p>
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## ❓ Why λ-Guard?
|
|
56
|
+
|
|
57
|
+
In Gradient Boosting, overfitting often appears **before the validation error rises**.
|
|
58
|
+
By that point, the model is already:
|
|
59
|
+
|
|
60
|
+
- ✂️ Splitting features into extremely fine regions
|
|
61
|
+
- 🍃 Fitting leaves supported by very few observations
|
|
62
|
+
- 🌪 Sensitive to tiny perturbations
|
|
63
|
+
|
|
64
|
+
It’s **no longer improving predictions**, it’s **memorizing the training dataset**.
|
|
65
|
+
**λ-Guard detects that moment automatically.**
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## 🧠 Core Intuition
|
|
70
|
+
|
|
71
|
+
A boosting model learns two things simultaneously:
|
|
72
|
+
|
|
73
|
+
| Component | Role |
|
|
74
|
+
|-----------|------|
|
|
75
|
+
| Geometry | partitions the feature space |
|
|
76
|
+
| Predictor | assigns values to each region |
|
|
77
|
+
|
|
78
|
+
Overfitting occurs when:
|
|
79
|
+
|
|
80
|
+
*"Geometry keeps growing, but predictor stops extracting real information."*
|
|
81
|
+
|
|
82
|
+
λ-Guard measures three key signals:
|
|
83
|
+
|
|
84
|
+
- 📦 **Capacity** → structural complexity
|
|
85
|
+
- 🎯 **Alignment** → extracted signal
|
|
86
|
+
- 🌊 **Stability** → fragility of predictions
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## 🧩 Representation Matrix
|
|
91
|
+
|
|
92
|
+
Every tree divides the feature space into **leaves**.
|
|
93
|
+
We record where each observation falls:
|
|
94
|
+
Z[i,j] = 1 if sample i falls in leaf j
|
|
95
|
+
Z[i,j] = 0 otherwise
|
|
96
|
+
|
|
97
|
+
- Rows → observations
|
|
98
|
+
- Columns → leaves across all trees
|
|
99
|
+
|
|
100
|
+
Think of **Z** as the **representation learned by the ensemble**.
|
|
101
|
+
|
|
102
|
+
- Linear regression → hat matrix **H**
|
|
103
|
+
- Boosting → representation **Z**
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## 📦 Capacity — Structural Complexity
|
|
108
|
+
|
|
109
|
+
- 🔹 Low C → few effective regions
|
|
110
|
+
- 🔹 High C → model fragments space
|
|
111
|
+
|
|
112
|
+
Late-stage boosting **increases C quickly**, often without improving predictions.
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## 🎯 Alignment — Useful Information
|
|
117
|
+
|
|
118
|
+
- 🔹 High A → trees add real predictive signal
|
|
119
|
+
- 🔹 Low A → trees mostly refine boundaries
|
|
120
|
+
|
|
121
|
+
*"After some trees, alignment saturates."*
|
|
122
|
+
Boosting continues **growing structure** even if prediction stops improving.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## 🌊 Stability — Sensitivity to Perturbations
|
|
127
|
+
|
|
128
|
+
- 🔹 Low S → smooth, robust model
|
|
129
|
+
- 🔹 High S → brittle, sensitive model
|
|
130
|
+
|
|
131
|
+
**Stability is the first signal to explode during overfitting.**
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## 🔥 The Overfitting Index λ
|
|
136
|
+
|
|
137
|
+
| Situation | λ |
|
|
138
|
+
|-----------|---|
|
|
139
|
+
| Compact structure + stable predictions | low |
|
|
140
|
+
| Many regions + weak signal | high |
|
|
141
|
+
| Unstable predictions | very high |
|
|
142
|
+
|
|
143
|
+
**Interpretation:** measures how much structural complexity is wasted.
|
|
144
|
+
Normalized λ ∈ [0,1] can be used to **compare models**.
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
## 🧪 Structural Overfitting Test
|
|
148
|
+
|
|
149
|
+
Detect if a few training points dominate the model using **approximate leverage**:
|
|
150
|
+
H_ii ≈ Σ_trees (learning_rate / leaf_size)
|
|
151
|
+
T1 = mean(H_ii) # global complexity
|
|
152
|
+
T2 = max(H_ii)/mean(H_ii) # local memorization
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
**Bootstrap procedure:**
|
|
156
|
+
|
|
157
|
+
1. Repeat B times: resample training data, recompute T1 & T2
|
|
158
|
+
2. Compute p-values:
|
|
159
|
+
- p1 = P(T1_boot ≥ T1_obs)
|
|
160
|
+
- p2 = P(T2_boot ≥ T2_obs)
|
|
161
|
+
|
|
162
|
+
Reject structural stability if:
|
|
163
|
+
|
|
164
|
+
p1 < α OR p2 < α
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## 📊 What λ-Guard Distinguishes
|
|
170
|
+
|
|
171
|
+
| Regime | Meaning |
|
|
172
|
+
|--------|---------|
|
|
173
|
+
| ✅ Stable | smooth generalization |
|
|
174
|
+
| 📈 Global overfitting | too many effective parameters |
|
|
175
|
+
| ⚠️ Local memorization | few points dominate |
|
|
176
|
+
| 💥 Extreme | interpolation behavior |
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## 🧭 When to Use
|
|
181
|
+
|
|
182
|
+
- Monitor boosting during training
|
|
183
|
+
- Hyperparameter tuning
|
|
184
|
+
- Small datasets (no validation split)
|
|
185
|
+
- Diagnose late-stage performance collapse
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## ⚙️ Installation
|
|
190
|
+
|
|
191
|
+
Install via GitHub:
|
|
192
|
+
|
|
193
|
+
```bash
pip install git+https://github.com/faberBI/lambdaguard.git
```

```python
from sklearn.ensemble import GradientBoostingRegressor
|
|
197
|
+
from lambdaguard.ofi import overfitting_index
|
|
198
|
+
from lambdaguard.lambda_guard import lambda_guard_test, interpret
|
|
199
|
+
from lambdaguard.cusum import detect_structural_overfitting_cusum_robust
|
|
200
|
+
import pandas as pd
|
|
201
|
+
|
|
202
|
+
# Fit a model
|
|
203
|
+
model = GradientBoostingRegressor(n_estimators=50, max_depth=3)
|
|
204
|
+
model.fit(X_train, y_train)
|
|
205
|
+
|
|
206
|
+
# Compute Overfitting Index
|
|
207
|
+
ofi_res = overfitting_index(model, X_train, y_train)
|
|
208
|
+
|
|
209
|
+
# Lambda-guard test
|
|
210
|
+
lg_res = lambda_guard_test(model, X_train)
|
|
211
|
+
print(interpret(lg_res))
|
|
212
|
+
|
|
213
|
+
# CUSUM-based detection
|
|
214
|
+
df = pd.DataFrame([
|
|
215
|
+
{"model": "GBR", "n_estimators": 50, "max_depth": 3, "A": 0.8, "OFI_norm": 0.2},
|
|
216
|
+
{"model": "GBR", "n_estimators": 100, "max_depth": 5, "A": 0.85, "OFI_norm": 0.3},
|
|
217
|
+
])
|
|
218
|
+
cusum_res = detect_structural_overfitting_cusum_robust(df, model_name="GBR")
|
|
219
|
+
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## 📜 Citation
|
|
223
|
+
|
|
224
|
+
If you use **λ-Guard** in your research or projects, please cite the following:
|
|
225
|
+
|
|
226
|
+
**Fabrizio Di Sciorio, PhD**
|
|
227
|
+
*Universidad de Almeria — Business and Economics Department*
|
|
228
|
+
> "λ-Guard: Structural Overfitting Detection for Gradient Boosting Models"
|
|
229
|
+
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="docs/logo.png" alt="λ-Guard" width="160"/>
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<p align="center">
|
|
6
|
+
<strong>Overfitting detection for Gradient Boosting</strong> — <em>no validation set required</em><br>
|
|
7
|
+
<i>Detect the moment when your model stops learning signal and starts memorizing structure.</i>
|
|
8
|
+
</p>
|
|
9
|
+
|
|
10
|
+
<p align="center">
|
|
11
|
+
<a href="https://github.com/faberBI/lambdaguard/actions/workflows/tests.yml">
|
|
12
|
+
<img src="https://img.shields.io/github/actions/workflow/status/faberBI/lambdaguard/tests.yml?branch=main&logo=github" alt="Tests Status">
|
|
13
|
+
</a>
|
|
14
|
+
<a href="https://coveralls.io/github/faberBI/lambdaguard">
|
|
15
|
+
<img src="https://img.shields.io/coveralls/github/faberBI/lambdaguard/main.svg" alt="Coverage Status">
|
|
16
|
+
</a>
|
|
17
|
+
<a href="https://pypi.org/project/lambdaguard/">
|
|
18
|
+
<img src="https://img.shields.io/pypi/v/lambdaguard?logo=python" alt="PyPI Version">
|
|
19
|
+
</a>
|
|
20
|
+
<a href="https://opensource.org/licenses/MIT">
|
|
21
|
+
<img src="https://img.shields.io/badge/License-MIT-green.svg" alt="License MIT">
|
|
22
|
+
</a>
|
|
23
|
+
</p>
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## ❓ Why λ-Guard?
|
|
28
|
+
|
|
29
|
+
In Gradient Boosting, overfitting often appears **before the validation error rises**.
|
|
30
|
+
By that point, the model is already:
|
|
31
|
+
|
|
32
|
+
- ✂️ Splitting features into extremely fine regions
|
|
33
|
+
- 🍃 Fitting leaves supported by very few observations
|
|
34
|
+
- 🌪 Sensitive to tiny perturbations
|
|
35
|
+
|
|
36
|
+
It’s **no longer improving predictions**, it’s **memorizing the training dataset**.
|
|
37
|
+
**λ-Guard detects that moment automatically.**
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## 🧠 Core Intuition
|
|
42
|
+
|
|
43
|
+
A boosting model learns two things simultaneously:
|
|
44
|
+
|
|
45
|
+
| Component | Role |
|
|
46
|
+
|-----------|------|
|
|
47
|
+
| Geometry | partitions the feature space |
|
|
48
|
+
| Predictor | assigns values to each region |
|
|
49
|
+
|
|
50
|
+
Overfitting occurs when:
|
|
51
|
+
|
|
52
|
+
*"Geometry keeps growing, but predictor stops extracting real information."*
|
|
53
|
+
|
|
54
|
+
λ-Guard measures three key signals:
|
|
55
|
+
|
|
56
|
+
- 📦 **Capacity** → structural complexity
|
|
57
|
+
- 🎯 **Alignment** → extracted signal
|
|
58
|
+
- 🌊 **Stability** → fragility of predictions
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## 🧩 Representation Matrix
|
|
63
|
+
|
|
64
|
+
Every tree divides the feature space into **leaves**.
|
|
65
|
+
We record where each observation falls:
|
|
66
|
+
Z[i,j] = 1 if sample i falls in leaf j
|
|
67
|
+
Z[i,j] = 0 otherwise
|
|
68
|
+
|
|
69
|
+
- Rows → observations
|
|
70
|
+
- Columns → leaves across all trees
|
|
71
|
+
|
|
72
|
+
Think of **Z** as the **representation learned by the ensemble**.
|
|
73
|
+
|
|
74
|
+
- Linear regression → hat matrix **H**
|
|
75
|
+
- Boosting → representation **Z**
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## 📦 Capacity — Structural Complexity
|
|
80
|
+
|
|
81
|
+
- 🔹 Low C → few effective regions
|
|
82
|
+
- 🔹 High C → model fragments space
|
|
83
|
+
|
|
84
|
+
Late-stage boosting **increases C quickly**, often without improving predictions.
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## 🎯 Alignment — Useful Information
|
|
89
|
+
|
|
90
|
+
- 🔹 High A → trees add real predictive signal
|
|
91
|
+
- 🔹 Low A → trees mostly refine boundaries
|
|
92
|
+
|
|
93
|
+
*"After some trees, alignment saturates."*
|
|
94
|
+
Boosting continues **growing structure** even if prediction stops improving.
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## 🌊 Stability — Sensitivity to Perturbations
|
|
99
|
+
|
|
100
|
+
- 🔹 Low S → smooth, robust model
|
|
101
|
+
- 🔹 High S → brittle, sensitive model
|
|
102
|
+
|
|
103
|
+
**Stability is the first signal to explode during overfitting.**
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## 🔥 The Overfitting Index λ
|
|
108
|
+
|
|
109
|
+
| Situation | λ |
|
|
110
|
+
|-----------|---|
|
|
111
|
+
| Compact structure + stable predictions | low |
|
|
112
|
+
| Many regions + weak signal | high |
|
|
113
|
+
| Unstable predictions | very high |
|
|
114
|
+
|
|
115
|
+
**Interpretation:** measures how much structural complexity is wasted.
|
|
116
|
+
Normalized λ ∈ [0,1] can be used to **compare models**.
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
## 🧪 Structural Overfitting Test
|
|
120
|
+
|
|
121
|
+
Detect if a few training points dominate the model using **approximate leverage**:
|
|
122
|
+
H_ii ≈ Σ_trees (learning_rate / leaf_size)
|
|
123
|
+
T1 = mean(H_ii) # global complexity
|
|
124
|
+
T2 = max(H_ii)/mean(H_ii) # local memorization
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
**Bootstrap procedure:**
|
|
128
|
+
|
|
129
|
+
1. Repeat B times: resample training data, recompute T1 & T2
|
|
130
|
+
2. Compute p-values:
|
|
131
|
+
- p1 = P(T1_boot ≥ T1_obs)
|
|
132
|
+
- p2 = P(T2_boot ≥ T2_obs)
|
|
133
|
+
|
|
134
|
+
Reject structural stability if:
|
|
135
|
+
|
|
136
|
+
p1 < α OR p2 < α
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## 📊 What λ-Guard Distinguishes
|
|
142
|
+
|
|
143
|
+
| Regime | Meaning |
|
|
144
|
+
|--------|---------|
|
|
145
|
+
| ✅ Stable | smooth generalization |
|
|
146
|
+
| 📈 Global overfitting | too many effective parameters |
|
|
147
|
+
| ⚠️ Local memorization | few points dominate |
|
|
148
|
+
| 💥 Extreme | interpolation behavior |
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## 🧭 When to Use
|
|
153
|
+
|
|
154
|
+
- Monitor boosting during training
|
|
155
|
+
- Hyperparameter tuning
|
|
156
|
+
- Small datasets (no validation split)
|
|
157
|
+
- Diagnose late-stage performance collapse
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## ⚙️ Installation
|
|
162
|
+
|
|
163
|
+
Install via GitHub:
|
|
164
|
+
|
|
165
|
+
```bash
pip install git+https://github.com/faberBI/lambdaguard.git
```

```python
from sklearn.ensemble import GradientBoostingRegressor
|
|
169
|
+
from lambdaguard.ofi import overfitting_index
|
|
170
|
+
from lambdaguard.lambda_guard import lambda_guard_test, interpret
|
|
171
|
+
from lambdaguard.cusum import detect_structural_overfitting_cusum_robust
|
|
172
|
+
import pandas as pd
|
|
173
|
+
|
|
174
|
+
# Fit a model
|
|
175
|
+
model = GradientBoostingRegressor(n_estimators=50, max_depth=3)
|
|
176
|
+
model.fit(X_train, y_train)
|
|
177
|
+
|
|
178
|
+
# Compute Overfitting Index
|
|
179
|
+
ofi_res = overfitting_index(model, X_train, y_train)
|
|
180
|
+
|
|
181
|
+
# Lambda-guard test
|
|
182
|
+
lg_res = lambda_guard_test(model, X_train)
|
|
183
|
+
print(interpret(lg_res))
|
|
184
|
+
|
|
185
|
+
# CUSUM-based detection
|
|
186
|
+
df = pd.DataFrame([
|
|
187
|
+
{"model": "GBR", "n_estimators": 50, "max_depth": 3, "A": 0.8, "OFI_norm": 0.2},
|
|
188
|
+
{"model": "GBR", "n_estimators": 100, "max_depth": 5, "A": 0.85, "OFI_norm": 0.3},
|
|
189
|
+
])
|
|
190
|
+
cusum_res = detect_structural_overfitting_cusum_robust(df, model_name="GBR")
|
|
191
|
+
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## 📜 Citation
|
|
195
|
+
|
|
196
|
+
If you use **λ-Guard** in your research or projects, please cite the following:
|
|
197
|
+
|
|
198
|
+
**Fabrizio Di Sciorio, PhD**
|
|
199
|
+
*Universidad de Almeria — Business and Economics Department*
|
|
200
|
+
> "λ-Guard: Structural Overfitting Detection for Gradient Boosting Models"
|
|
201
|
+
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lambda-guard-boosting
|
|
3
|
+
Version: 0.2.2
|
|
4
|
+
Summary: Overfitting detection for Gradient Boosting models using λ-Guard methodology.
|
|
5
|
+
Author-email: "Fabrizio Di Sciorio, PhD" <fabriziodisciorio91@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/faberBI/lambdaguard
|
|
8
|
+
Project-URL: Documentation, https://github.com/faberBI/lambdaguard
|
|
9
|
+
Project-URL: BugTracker, https://github.com/faberBI/lambdaguard/issues
|
|
10
|
+
Keywords: machine-learning,gradient-boosting,overfitting,boosting,lambda-guard
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE.md
|
|
19
|
+
Requires-Dist: numpy<2.2,>=1.26
|
|
20
|
+
Requires-Dist: pandas<3.0,>=2.2
|
|
21
|
+
Requires-Dist: scikit-learn<2.0,>=1.3
|
|
22
|
+
Requires-Dist: matplotlib<4.0,>=3.8
|
|
23
|
+
Requires-Dist: seaborn<0.14,>=0.12
|
|
24
|
+
Requires-Dist: xgboost<4.0,>=1.7
|
|
25
|
+
Requires-Dist: lightgbm<5.0,>=4.4
|
|
26
|
+
Requires-Dist: catboost<2.0,>=1.1
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
<p align="center">
|
|
30
|
+
<img src="docs/logo.png" alt="λ-Guard" width="160"/>
|
|
31
|
+
</p>
|
|
32
|
+
|
|
33
|
+
<p align="center">
|
|
34
|
+
<strong>Overfitting detection for Gradient Boosting</strong> — <em>no validation set required</em><br>
|
|
35
|
+
<i>Detect the moment when your model stops learning signal and starts memorizing structure.</i>
|
|
36
|
+
</p>
|
|
37
|
+
|
|
38
|
+
<p align="center">
|
|
39
|
+
<a href="https://github.com/faberBI/lambdaguard/actions/workflows/tests.yml">
|
|
40
|
+
<img src="https://img.shields.io/github/actions/workflow/status/faberBI/lambdaguard/tests.yml?branch=main&logo=github" alt="Tests Status">
|
|
41
|
+
</a>
|
|
42
|
+
<a href="https://coveralls.io/github/faberBI/lambdaguard">
|
|
43
|
+
<img src="https://img.shields.io/coveralls/github/faberBI/lambdaguard/main.svg" alt="Coverage Status">
|
|
44
|
+
</a>
|
|
45
|
+
<a href="https://pypi.org/project/lambdaguard/">
|
|
46
|
+
<img src="https://img.shields.io/pypi/v/lambdaguard?logo=python" alt="PyPI Version">
|
|
47
|
+
</a>
|
|
48
|
+
<a href="https://opensource.org/licenses/MIT">
|
|
49
|
+
<img src="https://img.shields.io/badge/License-MIT-green.svg" alt="License MIT">
|
|
50
|
+
</a>
|
|
51
|
+
</p>
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## ❓ Why λ-Guard?
|
|
56
|
+
|
|
57
|
+
In Gradient Boosting, overfitting often appears **before the validation error rises**.
|
|
58
|
+
By that point, the model is already:
|
|
59
|
+
|
|
60
|
+
- ✂️ Splitting features into extremely fine regions
|
|
61
|
+
- 🍃 Fitting leaves supported by very few observations
|
|
62
|
+
- 🌪 Sensitive to tiny perturbations
|
|
63
|
+
|
|
64
|
+
It’s **no longer improving predictions**, it’s **memorizing the training dataset**.
|
|
65
|
+
**λ-Guard detects that moment automatically.**
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## 🧠 Core Intuition
|
|
70
|
+
|
|
71
|
+
A boosting model learns two things simultaneously:
|
|
72
|
+
|
|
73
|
+
| Component | Role |
|
|
74
|
+
|-----------|------|
|
|
75
|
+
| Geometry | partitions the feature space |
|
|
76
|
+
| Predictor | assigns values to each region |
|
|
77
|
+
|
|
78
|
+
Overfitting occurs when:
|
|
79
|
+
|
|
80
|
+
*"Geometry keeps growing, but predictor stops extracting real information."*
|
|
81
|
+
|
|
82
|
+
λ-Guard measures three key signals:
|
|
83
|
+
|
|
84
|
+
- 📦 **Capacity** → structural complexity
|
|
85
|
+
- 🎯 **Alignment** → extracted signal
|
|
86
|
+
- 🌊 **Stability** → fragility of predictions
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## 🧩 Representation Matrix
|
|
91
|
+
|
|
92
|
+
Every tree divides the feature space into **leaves**.
|
|
93
|
+
We record where each observation falls:
|
|
94
|
+
Z[i,j] = 1 if sample i falls in leaf j
|
|
95
|
+
Z[i,j] = 0 otherwise
|
|
96
|
+
|
|
97
|
+
- Rows → observations
|
|
98
|
+
- Columns → leaves across all trees
|
|
99
|
+
|
|
100
|
+
Think of **Z** as the **representation learned by the ensemble**.
|
|
101
|
+
|
|
102
|
+
- Linear regression → hat matrix **H**
|
|
103
|
+
- Boosting → representation **Z**
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## 📦 Capacity — Structural Complexity
|
|
108
|
+
|
|
109
|
+
- 🔹 Low C → few effective regions
|
|
110
|
+
- 🔹 High C → model fragments space
|
|
111
|
+
|
|
112
|
+
Late-stage boosting **increases C quickly**, often without improving predictions.
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## 🎯 Alignment — Useful Information
|
|
117
|
+
|
|
118
|
+
- 🔹 High A → trees add real predictive signal
|
|
119
|
+
- 🔹 Low A → trees mostly refine boundaries
|
|
120
|
+
|
|
121
|
+
*"After some trees, alignment saturates."*
|
|
122
|
+
Boosting continues **growing structure** even if prediction stops improving.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## 🌊 Stability — Sensitivity to Perturbations
|
|
127
|
+
|
|
128
|
+
- 🔹 Low S → smooth, robust model
|
|
129
|
+
- 🔹 High S → brittle, sensitive model
|
|
130
|
+
|
|
131
|
+
**Stability is the first signal to explode during overfitting.**
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## 🔥 The Overfitting Index λ
|
|
136
|
+
|
|
137
|
+
| Situation | λ |
|
|
138
|
+
|-----------|---|
|
|
139
|
+
| Compact structure + stable predictions | low |
|
|
140
|
+
| Many regions + weak signal | high |
|
|
141
|
+
| Unstable predictions | very high |
|
|
142
|
+
|
|
143
|
+
**Interpretation:** measures how much structural complexity is wasted.
|
|
144
|
+
Normalized λ ∈ [0,1] can be used to **compare models**.
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
## 🧪 Structural Overfitting Test
|
|
148
|
+
|
|
149
|
+
Detect if a few training points dominate the model using **approximate leverage**:
|
|
150
|
+
H_ii ≈ Σ_trees (learning_rate / leaf_size)
|
|
151
|
+
T1 = mean(H_ii) # global complexity
|
|
152
|
+
T2 = max(H_ii)/mean(H_ii) # local memorization
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
**Bootstrap procedure:**
|
|
156
|
+
|
|
157
|
+
1. Repeat B times: resample training data, recompute T1 & T2
|
|
158
|
+
2. Compute p-values:
|
|
159
|
+
- p1 = P(T1_boot ≥ T1_obs)
|
|
160
|
+
- p2 = P(T2_boot ≥ T2_obs)
|
|
161
|
+
|
|
162
|
+
Reject structural stability if:
|
|
163
|
+
|
|
164
|
+
p1 < α OR p2 < α
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## 📊 What λ-Guard Distinguishes
|
|
170
|
+
|
|
171
|
+
| Regime | Meaning |
|
|
172
|
+
|--------|---------|
|
|
173
|
+
| ✅ Stable | smooth generalization |
|
|
174
|
+
| 📈 Global overfitting | too many effective parameters |
|
|
175
|
+
| ⚠️ Local memorization | few points dominate |
|
|
176
|
+
| 💥 Extreme | interpolation behavior |
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## 🧭 When to Use
|
|
181
|
+
|
|
182
|
+
- Monitor boosting during training
|
|
183
|
+
- Hyperparameter tuning
|
|
184
|
+
- Small datasets (no validation split)
|
|
185
|
+
- Diagnose late-stage performance collapse
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## ⚙️ Installation
|
|
190
|
+
|
|
191
|
+
Install via GitHub:
|
|
192
|
+
|
|
193
|
+
```bash
pip install git+https://github.com/faberBI/lambdaguard.git
```

```python
from sklearn.ensemble import GradientBoostingRegressor
|
|
197
|
+
from lambdaguard.ofi import overfitting_index
|
|
198
|
+
from lambdaguard.lambda_guard import lambda_guard_test, interpret
|
|
199
|
+
from lambdaguard.cusum import detect_structural_overfitting_cusum_robust
|
|
200
|
+
import pandas as pd
|
|
201
|
+
|
|
202
|
+
# Fit a model
|
|
203
|
+
model = GradientBoostingRegressor(n_estimators=50, max_depth=3)
|
|
204
|
+
model.fit(X_train, y_train)
|
|
205
|
+
|
|
206
|
+
# Compute Overfitting Index
|
|
207
|
+
ofi_res = overfitting_index(model, X_train, y_train)
|
|
208
|
+
|
|
209
|
+
# Lambda-guard test
|
|
210
|
+
lg_res = lambda_guard_test(model, X_train)
|
|
211
|
+
print(interpret(lg_res))
|
|
212
|
+
|
|
213
|
+
# CUSUM-based detection
|
|
214
|
+
df = pd.DataFrame([
|
|
215
|
+
{"model": "GBR", "n_estimators": 50, "max_depth": 3, "A": 0.8, "OFI_norm": 0.2},
|
|
216
|
+
{"model": "GBR", "n_estimators": 100, "max_depth": 5, "A": 0.85, "OFI_norm": 0.3},
|
|
217
|
+
])
|
|
218
|
+
cusum_res = detect_structural_overfitting_cusum_robust(df, model_name="GBR")
|
|
219
|
+
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## 📜 Citation
|
|
223
|
+
|
|
224
|
+
If you use **λ-Guard** in your research or projects, please cite the following:
|
|
225
|
+
|
|
226
|
+
**Fabrizio Di Sciorio, PhD**
|
|
227
|
+
*Universidad de Almeria — Business and Economics Department*
|
|
228
|
+
> "λ-Guard: Structural Overfitting Detection for Gradient Boosting Models"
|
|
229
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
LICENSE.md
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
lambda_guard_boosting.egg-info/PKG-INFO
|
|
5
|
+
lambda_guard_boosting.egg-info/SOURCES.txt
|
|
6
|
+
lambda_guard_boosting.egg-info/dependency_links.txt
|
|
7
|
+
lambda_guard_boosting.egg-info/requires.txt
|
|
8
|
+
lambda_guard_boosting.egg-info/top_level.txt
|
|
9
|
+
lambdaguard/__init__.py
|
|
10
|
+
lambdaguard/cusum.py
|
|
11
|
+
lambdaguard/lambdaguard.py
|
|
12
|
+
lambdaguard/ofi.py
|
|
13
|
+
tests/test_cusum.py
|
|
14
|
+
tests/test_ofi.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Package version; keep in sync with pyproject.toml / PKG-INFO.
__version__ = "0.2.2"

# Re-export the public API from the three submodules.
# NOTE(review): SOURCES.txt ships the module as lambdaguard/lambdaguard.py,
# but this import targets `.lambda_guard` — confirm the actual filename,
# otherwise this line fails at package import time.
from .ofi import generalization_index, instability_index, create_model, run_experiment_multi_model, plot_all_multi_model, regression_test
from .lambda_guard import lambda_guard_test, boosting_leverage, interpret
from .cusum import lambda_detect

# Names exported by `from lambdaguard import *`.
__all__ = [
    "generalization_index",
    "instability_index",
    "create_model",
    "run_experiment_multi_model",
    "plot_all_multi_model",
    "regression_test",
    "lambda_guard_test",
    "boosting_leverage",
    "interpret",
    "lambda_detect"
]
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import matplotlib.pyplot as plt
|
|
4
|
+
|
|
5
|
+
def lambda_detect(
    df,
    model_name,
    complexity_metric="combined",
    lambda_col="OFI_norm",
    alignment_col="A",
    smooth_window=3,
    cusum_threshold_factor=1.5,
    baseline_points=10,
    plot=True,
):
    """Detect a structural-overfitting change point for one model via CUSUM.

    The rows of ``df`` belonging to ``model_name`` are ordered by a complexity
    axis; the lambda series is z-scored against its low-complexity baseline,
    differentiated twice (smoothed), and a one-sided CUSUM of the centered
    second differences is compared against a threshold.  A change point is
    declared when the CUSUM exceeds the threshold while the alignment curve
    has flattened.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain columns ``"model"``, ``lambda_col``, ``alignment_col``
        and, when ``complexity_metric == "combined"``, ``"n_estimators"``
        and ``"max_depth"``.
    model_name : str
        Value of the ``"model"`` column selecting the rows to analyze.
    complexity_metric : str
        ``"combined"`` uses ``n_estimators * max_depth``; any other value is
        taken as the name of a column to use directly.
    lambda_col, alignment_col : str
        Column names for the overfitting index and the alignment signal.
    smooth_window : int
        Rolling-mean window applied to the first and second differences.
    cusum_threshold_factor : float
        Threshold = factor * std of the (smoothed) second differences.
    baseline_points : int
        Number of lowest-complexity rows used for the z-score baseline.
    plot : bool
        When True (default, preserving the original behavior) show a
        matplotlib figure of the normalized lambda curve.

    Returns
    -------
    dict with keys ``overfitting_detected``, ``change_index``,
    ``complexity_at_change``, ``lambda_z_at_change`` and
    ``best_model_before_overfitting``.
    """
    df_model = df[df["model"] == model_name].copy()

    # Complexity axis: either the product of the two tree hyperparameters or
    # a raw column chosen by the caller.
    if complexity_metric == "combined":
        df_model["complexity"] = df_model["n_estimators"] * df_model["max_depth"]
    else:
        df_model["complexity"] = df_model[complexity_metric]

    df_model = df_model.sort_values("complexity").reset_index(drop=True)

    lambdas = df_model[lambda_col].values
    alignment = df_model[alignment_col].values
    complexity = df_model["complexity"].values

    # --- z-score normalization against the low-complexity baseline ---
    lambda_baseline = lambdas[:baseline_points]
    mu_lambda = np.mean(lambda_baseline)
    sigma_lambda = np.std(lambda_baseline) + 1e-8  # epsilon avoids div-by-zero
    lambda_z = (lambdas - mu_lambda) / sigma_lambda

    # --- smoothed first and second differences ---
    delta_lambda = np.diff(lambda_z)
    delta_lambda = pd.Series(delta_lambda).rolling(smooth_window, min_periods=1).mean().values

    delta2_lambda = np.diff(delta_lambda)
    delta2_lambda = pd.Series(delta2_lambda).rolling(smooth_window, min_periods=1).mean().values

    # --- one-sided CUSUM on centered second differences ---
    mean_d2 = np.mean(delta2_lambda)
    std_d2 = np.std(delta2_lambda) + 1e-8
    centered_d2 = delta2_lambda - mean_d2

    cusum = np.zeros_like(centered_d2)
    for i in range(1, len(centered_d2)):
        cusum[i] = max(0, cusum[i - 1] + centered_d2[i])

    cusum_threshold = cusum_threshold_factor * std_d2

    # --- change-point search: CUSUM crosses threshold AND alignment is flat ---
    change_index = None
    delta_alignment = np.diff(alignment)
    for i, val in enumerate(cusum):
        align_flat = delta_alignment[i] < 0.01 if i < len(delta_alignment) else False
        if val > cusum_threshold and align_flat:
            # +2 maps the second-difference index back onto the original grid;
            # clamp so the complexity/lambda_z indexing below cannot overflow
            # (the original could raise IndexError near the end of the series).
            change_index = min(i + 2, len(complexity) - 1)
            break

    # --- best configuration (minimum lambda) before the change point ---
    best_model_before_overfitting = None
    if change_index is not None and change_index > 0:
        pre_overfit_df = df_model.iloc[:change_index]
        best_row = pre_overfit_df.loc[pre_overfit_df[lambda_col].idxmin()]
        # Only include the optional columns when they exist; the original
        # hard-coded 'dataset' and 'min_samples_leaf' and raised KeyError on
        # frames (like the README example) that lack them.
        parts = [str(best_row["model"])]
        if "dataset" in pre_overfit_df.columns:
            parts.append(str(best_row["dataset"]))
        if "min_samples_leaf" in pre_overfit_df.columns:
            parts.append(f"min_samples_leaf={best_row['min_samples_leaf']}")
        parts.append(f"n_estimators={best_row['n_estimators']}")
        parts.append(f"max_depth={best_row['max_depth']}")
        best_model_before_overfitting = " | ".join(parts)

    # --- optional diagnostic plot of the normalized lambda curve ---
    if plot:
        plt.figure(figsize=(10, 5))
        plt.plot(complexity, lambda_z, '-o', label='λ_z (normalized)', color='tab:blue')
        if change_index is not None:
            plt.axvline(complexity[change_index], color='red', linestyle='--', label='Change Point')
            plt.scatter(complexity[change_index], lambda_z[change_index], color='red', s=100)
        plt.xlabel("Complexity (n_estimators*max_depth)")
        plt.ylabel("λ_z (normalized)")
        plt.title(f"Structural Overfitting Detection - {model_name}")
        plt.grid(True)
        plt.legend()
        plt.show()

    return {
        "overfitting_detected": change_index is not None,
        "change_index": change_index,
        "complexity_at_change": complexity[change_index] if change_index is not None else None,
        "lambda_z_at_change": lambda_z[change_index] if change_index is not None else None,
        "best_model_before_overfitting": best_model_before_overfitting
    }
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import matplotlib.pyplot as plt
|
|
4
|
+
|
|
5
|
+
def boosting_leverage(model, X):
    """Compute a per-sample leverage (influence) score for a fitted boosting model.

    Each tree contributes ``learning_rate / leaf_size`` to every sample that
    lands in that leaf, so samples isolated in small leaves accumulate high
    influence across the ensemble.

    Parameters
    ----------
    model : fitted estimator exposing ``estimators_`` (array of trees with an
        ``apply`` method, e.g. sklearn's GradientBoostingRegressor) and
        ``learning_rate``.
    X : array-like of shape (n_samples, n_features)
        Samples to score.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Accumulated influence per sample.
    """
    n = X.shape[0]
    influence = np.zeros(n)
    lr = model.learning_rate  # constant across trees; hoisted out of the loop
    for est in model.estimators_.ravel():
        leaf_id = est.apply(X)
        # Vectorized leaf-size lookup: `inverse` maps each sample to the
        # position of its leaf in `counts`, replacing the original O(n)
        # per-sample Python loop and dict lookups.
        _, inverse, counts = np.unique(leaf_id, return_inverse=True, return_counts=True)
        influence += lr / counts[inverse]
    return influence
|
|
16
|
+
|
|
17
|
+
def lambda_guard_test(model, X, B=300, alpha=0.05, plot=True):
    """Bootstrap test for structural overfitting based on leverage scores.

    Two statistics are compared against their bootstrap null distributions:
    T1, the effective degrees-of-freedom ratio (mean leverage), and T2, the
    peak leverage relative to the mean. H0 ("stable regime") is rejected
    when either observed statistic is extreme at level *alpha*.

    Parameters
    ----------
    model : fitted boosting estimator (see ``boosting_leverage``)
    X : array-like of shape (n_samples, n_features)
    B : int, number of bootstrap resamples
    alpha : float, significance level
    plot : bool, draw the two bootstrap histograms when True

    Returns
    -------
    dict with observed statistics, critical values, p-values and the
    ``reject_H0`` decision.
    """
    n = X.shape[0]
    leverage = boosting_leverage(model, X)
    T1_obs = leverage.sum() / n
    T2_obs = leverage.max() / leverage.mean()

    T1_boot = np.zeros(B)
    T2_boot = np.zeros(B)
    for b in range(B):
        resample = np.random.choice(n, n, replace=True)
        Hb = boosting_leverage(model, X[resample])
        T1_boot[b] = Hb.sum() / n
        T2_boot[b] = Hb.max() / Hb.mean()

    q1 = np.quantile(T1_boot, 1 - alpha)
    q2 = np.quantile(T2_boot, 1 - alpha)
    p1 = np.mean(T1_boot >= T1_obs)
    p2 = np.mean(T2_boot >= T2_obs)
    reject = (p1 < alpha) or (p2 < alpha)

    if plot:
        fig, axes = plt.subplots(1, 2, figsize=(12, 4))
        panels = (
            (axes[0], T1_boot, T1_obs, q1, "T1: Effective DoF ratio"),
            (axes[1], T2_boot, T2_obs, q2, "T2: Peak leverage ratio"),
        )
        for ax, boot, obs, crit, title in panels:
            ax.hist(boot, bins=30, density=True, alpha=0.7)
            ax.axvline(obs, color="black", label="Observed")
            ax.axvline(crit, color="red", linestyle="--", label="Critical")
            ax.set_title(title)
            ax.legend()
        plt.show()

    return {
        "T1_df_ratio": T1_obs, "critical_df_ratio": q1, "p_df_ratio": p1,
        "T2_peak_ratio": T2_obs, "critical_peak_ratio": q2, "p_peak_ratio": p2,
        "reject_H0": reject
    }
|
|
50
|
+
|
|
51
|
+
def interpret(res):
    """Translate a ``lambda_guard_test`` result dict into a verdict string."""
    if not res["reject_H0"]:
        return "✔ REGIME STABILE / GENERALIZZANTE"
    df_significant = res["p_df_ratio"] < 0.05
    peak_significant = res["p_peak_ratio"] < 0.05
    if df_significant and peak_significant:
        return "✖ REGIME INTERPOLANTE (OVERFITTING FORTE)"
    if df_significant:
        return "✖ COMPLESSITÀ GLOBALE ECCESSIVA"
    return "✖ (LEVERAGE SPIKES)"
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# ============================================================
|
|
2
|
+
# FULL EXPERIMENT: LAMBDA GUARD - MULTI-MODEL (OPTIMIZED)
|
|
3
|
+
# ============================================================
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import seaborn as sns
|
|
8
|
+
import matplotlib.pyplot as plt
|
|
9
|
+
from itertools import product
|
|
10
|
+
|
|
11
|
+
from sklearn.model_selection import train_test_split
|
|
12
|
+
from sklearn.metrics import mean_squared_error
|
|
13
|
+
from sklearn.datasets import make_regression
|
|
14
|
+
|
|
15
|
+
from sklearn.ensemble import GradientBoostingRegressor
|
|
16
|
+
import xgboost as xgb
|
|
17
|
+
import lightgbm as lgb
|
|
18
|
+
from catboost import CatBoostRegressor
|
|
19
|
+
|
|
20
|
+
# -----------------------------
|
|
21
|
+
# GENERALIZATION COMPONENTS
|
|
22
|
+
# -----------------------------
|
|
23
|
+
def generalization_index(model, X, y):
    """Return ``(GI, A, C)`` — the generalization index and its components.

    A is the Pearson correlation between predictions and targets (0 when
    predictions are constant, where the correlation is undefined), C is the
    variance of the predictions, and GI = A / C (0 when C is zero).
    """
    preds = model.predict(X)
    C = np.var(preds)
    if np.std(preds) > 0:
        A = np.corrcoef(preds, y)[0, 1]
    else:
        A = 0
    GI = A / C if C > 0 else 0
    return GI, A, C
|
|
29
|
+
|
|
30
|
+
def instability_index(model, X, noise_std=1e-3, seed=42):
    """Sensitivity of predictions to small Gaussian input perturbations.

    Compares predictions on X against predictions on X plus N(0, noise_std)
    noise, normalized by the spread of the clean predictions (with a small
    epsilon to avoid division by zero). Deterministic for a fixed *seed*.
    """
    rng = np.random.default_rng(seed)
    perturbation = rng.normal(0, noise_std, X.shape)
    baseline = model.predict(X)
    perturbed = model.predict(X + perturbation)
    spread = np.std(baseline) + 1e-8
    return np.mean(np.abs(baseline - perturbed)) / spread
|
|
37
|
+
|
|
38
|
+
# -----------------------------
|
|
39
|
+
# MODEL FACTORY
|
|
40
|
+
# -----------------------------
|
|
41
|
+
def create_model(model_name, n_estimators=100, max_depth=3, learning_rate=0.05, min_samples_leaf=1):
    """Factory for the supported boosting regressors.

    Supported names: "GBR" (sklearn), "XGB" (xgboost), "LGBM" (lightgbm),
    "CAT" (catboost). Each backend is configured with the shared
    hyperparameters translated to its own argument names (e.g. CatBoost
    uses ``iterations``/``depth``). All models use a fixed random seed (42)
    for reproducibility.

    Raises
    ------
    ValueError
        If *model_name* is not one of the supported identifiers.
    """
    if model_name == "GBR":
        return GradientBoostingRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=0.8,
            min_samples_leaf=min_samples_leaf,
            random_state=42,
        )
    if model_name == "XGB":
        return xgb.XGBRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=0.8,
            random_state=42,
            verbosity=0,
        )
    if model_name == "LGBM":
        # LightGBM calls the leaf-size constraint `min_child_samples`.
        return lgb.LGBMRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=0.8,
            min_child_samples=min_samples_leaf,
            random_state=42,
            verbose=-1,
        )
    if model_name == "CAT":
        return CatBoostRegressor(
            iterations=n_estimators,
            depth=max_depth,
            learning_rate=learning_rate,
            random_seed=42,
            verbose=0,
        )
    raise ValueError(f"Unknown model {model_name}")
|
|
80
|
+
|
|
81
|
+
# -----------------------------
|
|
82
|
+
# EXPERIMENT FUNCTION
|
|
83
|
+
# -----------------------------
|
|
84
|
+
def run_experiment_multi_model(X, y, dataset_name, model_names=("GBR",),
        n_estimators_list=(50, 100, 200), max_depth_list=(3, 5, 7), min_samples_leaf_list=(2, 5)):
    """Sweep hyperparameter grids over one or more boosting backends.

    For each (model, min_samples_leaf, n_estimators, max_depth) combination
    the model is fit on a 70/30 train/test split and scored with the
    generalization (A, C, GI), instability (S) and overfitting (OFI)
    indices plus train/test RMSE and their gap.

    Parameters
    ----------
    X, y : array-likes for regression.
    dataset_name : str, label recorded in the result rows.
    model_names : sequence of str accepted by ``create_model``.
    n_estimators_list, max_depth_list, min_samples_leaf_list : sequences of
        hyperparameter values to cross (defaults are immutable tuples —
        mutable list defaults are a Python anti-pattern).

    Returns
    -------
    pandas.DataFrame with one row per configuration and a per-model
    min-max normalized ``OFI_norm`` column.
    """
    print(f"\n{'='*70}\nDATASET: {dataset_name}\n{'='*70}")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    results = []

    for model_name in model_names:
        for sl, n_est, depth in product(min_samples_leaf_list, n_estimators_list, max_depth_list):
            model = create_model(model_name, n_estimators=n_est, max_depth=depth, min_samples_leaf=sl)
            model.fit(X_train, y_train)

            GI, A, C = generalization_index(model, X_train, y_train)
            S = instability_index(model, X_train)
            OFI = (C / (A + C)) * S
            G_norm = A / (A + C)

            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)
            rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
            rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
            gap = rmse_test - rmse_train

            results.append({
                "model": model_name,
                "dataset": dataset_name,
                "min_samples_leaf": sl,
                "n_estimators": n_est,
                "max_depth": depth,
                "A": A,
                "C": C,
                "GI": GI,
                "G_norm": G_norm,
                "Instability": S,
                "OFI": OFI,
                "Train_RMSE": rmse_train,
                "Test_RMSE": rmse_test,
                "Gap": gap
            })

    df = pd.DataFrame(results)

    def _minmax(x):
        # Guard the degenerate case (one configuration, or identical OFI
        # values in a group): the original `(x - min) / (max - min)` would
        # divide by zero and fill the column with NaN.
        span = x.max() - x.min()
        return (x - x.min()) / span if span > 0 else x * 0.0

    df["OFI_norm"] = df.groupby("model")["OFI"].transform(_minmax)
    return df
|
|
126
|
+
|
|
127
|
+
# -----------------------------
|
|
128
|
+
# PLOT FUNCTION
|
|
129
|
+
# -----------------------------
|
|
130
|
+
def plot_all_multi_model(df, metric="Gap"):
    """Draw per-model regression plots of G_norm and OFI_norm against *metric*.

    One figure is produced for every (model, feature) pair; *metric*
    defaults to the train/test RMSE gap column.
    """
    for name in df["model"].unique():
        subset = df[df["model"] == name]
        for feature in ("G_norm", "OFI_norm"):
            plt.figure(figsize=(6, 5))
            sns.regplot(data=subset, x=feature, y=metric)
            plt.title(f"{name} - {feature} vs {metric}")
            plt.grid(True)
            plt.show()
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# -----------------------------
|
|
142
|
+
# REGRESSION TEST
|
|
143
|
+
# -----------------------------
|
|
144
|
+
def regression_test(df):
    """OLS regression of the generalization gap on the normalized OFI.

    Fits ``Gap ~ const + OFI_norm`` by ordinary least squares, prints the
    fitted coefficients and R², and plots the scatter with the fitted line.

    The previous implementation raised ``NameError`` on every call: it used
    ``sm`` (statsmodels was never imported anywhere in this module) and an
    undefined ``df_model`` variable. This version is self-contained using
    numpy, which is already a dependency, and reads only from *df*.

    Parameters
    ----------
    df : pandas.DataFrame with at least "OFI_norm", "Gap" and "model" columns.
    """
    x = df["OFI_norm"].to_numpy(dtype=float)
    y = df["Gap"].to_numpy(dtype=float)

    # Design matrix with an intercept column (the role sm.add_constant played).
    X_const = np.column_stack([np.ones_like(x), x])
    coef, _, _, _ = np.linalg.lstsq(X_const, y, rcond=None)
    fitted = X_const @ coef

    ss_res = float(np.sum((y - fitted) ** 2))
    ss_tot = float(np.sum((y - y.mean()) ** 2))
    r_squared = 1.0 - ss_res / ss_tot if ss_tot > 0 else float("nan")

    print(f"Intercept: {coef[0]:.6f}")
    print(f"Slope (OFI_norm): {coef[1]:.6f}")
    print(f"R-squared: {r_squared:.6f}")

    plt.figure(figsize=(8,6))
    plt.scatter(x, y, alpha=0.6)
    # Sort by x so the fitted line is drawn left-to-right.
    order = np.argsort(x)
    plt.plot(x[order], fitted[order], color='red', linewidth=2)
    plt.xlabel('Lambda')
    plt.ylabel('RMSE gap (test - train)')
    plt.title(f'Regression Gap RMSE vs Lambda - {df["model"].iloc[0]}')
    plt.grid(True)
    plt.show()
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "lambda-guard-boosting"
|
|
7
|
+
version = "0.2.2"
|
|
8
|
+
description = "Overfitting detection for Gradient Boosting models using λ-Guard methodology."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
authors = [
|
|
12
|
+
{name="Fabrizio Di Sciorio, PhD", email="fabriziodisciorio91@gmail.com"}
|
|
13
|
+
]
|
|
14
|
+
keywords = ["machine-learning", "gradient-boosting", "overfitting", "boosting", "lambda-guard"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
20
|
+
"Topic :: Software Development :: Libraries :: Python Modules"
|
|
21
|
+
]
|
|
22
|
+
requires-python = ">=3.8"
|
|
23
|
+
dependencies = [
|
|
24
|
+
"numpy>=1.26,<2.2",
|
|
25
|
+
"pandas>=2.2,<3.0",
|
|
26
|
+
"scikit-learn>=1.3,<2.0",
|
|
27
|
+
"matplotlib>=3.8,<4.0",
|
|
28
|
+
"seaborn>=0.12,<0.14",
|
|
29
|
+
"xgboost>=1.7,<4.0",
|
|
30
|
+
"lightgbm>=4.4,<5.0",
|
|
31
|
+
"catboost>=1.1,<2.0"
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/faberBI/lambdaguard"
|
|
36
|
+
Documentation = "https://github.com/faberBI/lambdaguard"
|
|
37
|
+
BugTracker = "https://github.com/faberBI/lambdaguard/issues"
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.packages.find]
|
|
40
|
+
where = ["."]
|
|
41
|
+
exclude = ["notebooks*", "tests*", "docs*"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from lambdaguard.cusum import lambda_detect
|
|
4
|
+
|
|
5
|
+
def test_lambda_detect_basic():
    """lambda_detect should return a report dict with the expected keys."""
    # Synthetic sweep: a single GBR model with monotonically increasing OFI.
    n_rows = 5
    df = pd.DataFrame({
        "model": ["GBR"] * n_rows,
        "dataset": ["test"] * n_rows,
        "min_samples_leaf": [2] * n_rows,
        "n_estimators": [10, 20, 30, 40, 50],
        "max_depth": [3] * n_rows,
        "A": np.linspace(0.8, 0.95, n_rows),
        "C": np.linspace(0.1, 0.5, n_rows),
        "OFI": np.linspace(0.1, 0.6, n_rows),
        "OFI_norm": np.linspace(0.1, 0.6, n_rows),
    })

    result = lambda_detect(df, "GBR")

    assert "overfitting_detected" in result
    assert "best_model_before_overfitting" in result
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from sklearn.datasets import make_regression
|
|
4
|
+
from sklearn.ensemble import GradientBoostingRegressor
|
|
5
|
+
|
|
6
|
+
from lambdaguard.ofi import generalization_index, instability_index, create_model
|
|
7
|
+
|
|
8
|
+
def test_generalization_index():
    """GI must be bounded and its components finite for a fitted model."""
    X, y = make_regression(n_samples=50, n_features=5, noise=0.1, random_state=42)
    model = GradientBoostingRegressor(n_estimators=10, random_state=42)
    model.fit(X, y)
    GI, A, C = generalization_index(model, X, y)
    # The original check `0 <= GI or GI <= 1e10` was a tautology: every real
    # number satisfies at least one side of the `or`, so it could never fail.
    # A chained comparison enforces both bounds.
    assert 0 <= GI <= 1e10, "GI seems off"
    assert np.isfinite(A), "Alignment not finite"
    assert np.isfinite(C), "Complexity not finite"
|
|
16
|
+
|
|
17
|
+
def test_instability_index():
    """The instability index of a fitted model is non-negative by construction."""
    X, y = make_regression(n_samples=50, n_features=5, noise=0.1, random_state=42)
    regressor = GradientBoostingRegressor(n_estimators=10, random_state=42)
    regressor.fit(X, y)
    score = instability_index(regressor, X)
    assert score >= 0, "Instability should be non-negative"
|
|
23
|
+
|
|
24
|
+
def test_create_model():
    """The factory must propagate the requested hyperparameters."""
    gbr = create_model("GBR", n_estimators=5, max_depth=2)
    assert gbr.n_estimators == 5
|