geocif 0.2.2__tar.gz → 0.2.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.2.2/geocif.egg-info → geocif-0.2.21}/PKG-INFO +1 -1
- geocif-0.2.21/geocif/ml/feature_selection.py +316 -0
- {geocif-0.2.2 → geocif-0.2.21/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.2.2 → geocif-0.2.21}/setup.py +1 -1
- geocif-0.2.2/geocif/ml/feature_selection.py +0 -350
- {geocif-0.2.2 → geocif-0.2.21}/LICENSE +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/MANIFEST.in +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/README.md +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/agmet/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/agmet/plot.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/agmet/utils.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/analysis.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/backup/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/backup/constants.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/backup/features.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/backup/geo.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/backup/geocif.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/backup/metadata.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/backup/models.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/cei/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/cei/definitions.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/cei/indices.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/experiments.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/geocif.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/geocif_runner.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/indices_runner.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/indices_runner_angola.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/indices_runner_madagascar.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/indices_runner_malawi.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/indices_runner_mozambique.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/indices_runner_south_africa.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/indices_runner_zambia.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/indices_runner_zimbabwe.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/logger.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/correlations.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/embedding.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/outliers.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/outlook.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/output.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/stages.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/stats.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/trainers.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/trend.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/ml/xai.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/mm.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/aa.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/area.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/automl.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/download_esi.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/enso.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/eval.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/gamtest.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/gee_access.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/misc.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/play_xagg.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/reg.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/sustain.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/test_catboost.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/tmp.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/tmp2.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/tmp3.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/tmp4.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/tmp5.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/playground/wolayita_maize_mask.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/risk/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/risk/impact_assessment.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/utils.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/viz/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/viz/gt.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/viz/plot.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif/viz/tmp.py +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif.egg-info/SOURCES.txt +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/requirements.txt +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/setup.cfg +0 -0
- {geocif-0.2.2 → geocif-0.2.21}/tests/test_geocif.py +0 -0
geocif-0.2.21/geocif/ml/feature_selection.py (added)
@@ -0,0 +1,316 @@
+import numpy as np
+from tqdm import tqdm
+from sklearn.ensemble import RandomForestRegressor
+from collections import Counter
+from pathlib import Path
+import matplotlib.pyplot as plt
+import pandas as pd
+
+
+def are_all_features_non_eo(features):
+    """
+    Check if all the features are non-EO features
+
+    Args:
+        features: iterable of feature names
+
+    Returns:
+        bool: True if every feature is in the non-EO list
+    """
+    non_eo_features = [
+        'Median Yield (tn per ha)',
+        'Analogous Year',
+        'Analogous Year Yield',
+        'lon',
+        'lat',
+        't -1 Yield (tn per ha)',
+        't -2 Yield (tn per ha)',
+        't -3 Yield (tn per ha)',
+        't -4 Yield (tn per ha)',
+        't -5 Yield (tn per ha)',
+    ]
+    return all(f in non_eo_features for f in features)
+
+
+def select_features(
+    X, y,
+    method="RFE",
+    min_features_to_select=3,
+    threshold_nan=0.2,
+    threshold_unique=0.6
+):
+    """
+    Feature-selection wrapper supporting many methods plus a new 'multi' option.
+
+    Parameters
+    ----------
+    X : pd.DataFrame
+    y : array-like
+    method : str
+        One of {"SHAP", "stabl", "feature_engine", "mrmr", "RFECV", "lasso",
+        "BorutaPy", "Leshy", "PowerShap", "BorutaShap", "Genetic", "RFE", "multi"}
+    min_features_to_select : int
+    threshold_nan : float
+        Drop columns with > threshold_nan proportion of NaNs
+    threshold_unique : float
+        (Reserved for future use)
+
+    Returns
+    -------
+    selector : fitted selector object or None (for multi)
+    X_filtered : pd.DataFrame of selected features
+    selected_features : list[str]
+    """
+
+    # copy original for multi-mode recursion
+    X_clean = X.copy()
+
+    # 1) drop columns with too many NaNs
+    nan_prop = X_clean.isna().mean()
+    X_clean = X_clean.loc[:, nan_prop <= threshold_nan]
+
+    # 2) fill NaNs with median
+    X_clean = X_clean.fillna(X_clean.median())
+
+    # --- multi-method ensemble -------------------------------
+    if method.lower() == "multi":
+        counter = Counter()
+        # run three selectors and count feature picks
+        for sub_m in ["BorutaPy", "mrmr", "lasso"]:
+            _, _, feats = select_features(
+                X_clean, y,
+                method=sub_m,
+                min_features_to_select=min_features_to_select,
+                threshold_nan=threshold_nan,
+                threshold_unique=threshold_unique
+            )
+            counter.update(feats)
+
+        # union of all features
+        combined = sorted(counter.keys())
+        X_out = X_clean.loc[:, combined]
+
+        # plot and save histogram
+        freq = pd.Series(counter).sort_values(ascending=False)
+        fig = freq.plot(kind="bar", width=0.9).get_figure()
+        plt.title("Feature selection frequency across methods")
+        plt.xlabel("Feature")
+        plt.ylabel("Times selected (out of 3)")
+        plt.tight_layout()
+
+        out_dir = Path("feature_selection_multi")
+        out_dir.mkdir(parents=True, exist_ok=True)
+        fig.savefig(out_dir / "feature_selection_frequency.png", dpi=300)
+        plt.close(fig)
+
+        return None, X_out, combined
+
+    # define forest for methods that need it
+    forest = RandomForestRegressor(
+        n_estimators=500,
+        n_jobs=8,
+        max_depth=5,
+        random_state=1,
+    )
+
+    # patch numpy deprecation
+    np.int = np.int32
+    np.float = np.float64
+    np.bool = np.bool_
+
+    if method == "SHAP":
+        import pandas as pd
+        from catboost import CatBoostRegressor
+        from fasttreeshap import TreeExplainer as FastTreeExplainer
+        from sklearn.model_selection import cross_val_score
+
+        model = CatBoostRegressor(n_estimators=500, verbose=0, use_best_model=False)
+        model.fit(X_clean, y)
+        explainer = FastTreeExplainer(model)
+        shap_values = explainer.shap_values(X_clean)
+        shap_importances = np.mean(np.abs(shap_values), axis=0)
+        shap_df = pd.DataFrame({
+            "feature": X_clean.columns,
+            "importance": shap_importances
+        }).sort_values("importance", ascending=False)
+
+        def eval_n(N):
+            top = shap_df["feature"].head(N)
+            sel = CatBoostRegressor(n_estimators=500, random_state=42, verbose=0)
+            scores = cross_val_score(sel, X_clean[top], y,
+                                     cv=5, scoring="neg_mean_squared_error",
+                                     n_jobs=-1)
+            return np.mean(scores)
+
+        nrange = [5,10,15,20,25,30]
+        scores = [eval_n(N) for N in tqdm(nrange)]
+        best = nrange[np.argmax(scores)]
+        selected = shap_df["feature"].head(best).tolist()
+
+    elif method == "stabl":
+        from stabl.stabl import Stabl
+        from sklearn.linear_model import Lasso
+
+        st = Stabl(
+            base_estimator=Lasso(alpha=0.001),
+            n_bootstraps=10,
+            artificial_type="knockoff",
+            artificial_proportion=0.5,
+            replace=False,
+            fdr_threshold_range=np.arange(0.1,1,0.01),
+            sample_fraction=0.5,
+            random_state=42,
+            lambda_grid="auto",
+            verbose=1
+        )
+        st.fit(X_clean, y)
+        selected = st.get_feature_names_out()
+
+    elif method == "feature_engine":
+        from feature_engine.selection import SmartCorrelatedSelection
+        sel = SmartCorrelatedSelection(
+            method="pearson",
+            threshold=0.7,
+            selection_method="model_performance",
+            estimator=forest,
+            scoring="neg_mean_squared_error",
+        )
+        X_fe = sel.fit_transform(X_clean, y)
+        selected = X_fe.columns.tolist()
+
+    elif method == "mrmr":
+        from mrmr import mrmr_regression
+        selected = mrmr_regression(X=X_clean, y=y, K=10)
+
+    elif method == "RFECV":
+        from sklearn.feature_selection import RFECV
+        from sklearn.model_selection import KFold
+
+        class RFECVProg(RFECV):
+            def _fit(self, X, y):
+                with tqdm(total=X.shape[1]) as p:
+                    orig = self.scorer_
+                    def wrap(*a, **k):
+                        p.update(1)
+                        return orig(*a, **k)
+                    self.scorer_ = wrap
+                    super()._fit(X, y)
+
+        cv = KFold(n_splits=5)
+        sel = RFECVProg(
+            estimator=forest,
+            step=1,
+            cv=cv,
+            scoring="neg_mean_squared_error",
+            n_jobs=-1,
+            verbose=0
+        )
+        sel.fit(X_clean, y)
+        mask = sel.get_support()
+        selected = X_clean.columns[mask].tolist()
+
+    elif method == "lasso":
+        from sklearn.linear_model import LassoLarsCV
+        from sklearn.feature_selection import SelectFromModel
+
+        lr = LassoLarsCV(cv=5)
+        lr.fit(X_clean, y)
+        sfm = SelectFromModel(lr, prefit=True)
+        selected = X_clean.columns[sfm.get_support()].tolist()
+
+    elif method == "BorutaPy":
+        from boruta import BorutaPy
+        sel = BorutaPy(forest, n_estimators="auto", random_state=42, verbose=0)
+        sel.fit(X_clean.values, y)
+        mask = sel.support_ | sel.support_weak_
+        selected = X_clean.columns[mask].tolist()
+
+    elif method == "Leshy":
+        import arfs.feature_selection.allrelevant as arfsgroot
+        from catboost import CatBoostRegressor
+        model = CatBoostRegressor(n_estimators=350, verbose=0, use_best_model=False)
+        sel = arfsgroot.Leshy(
+            model,
+            n_estimators="auto",
+            verbose=1,
+            max_iter=10,
+            random_state=42,
+            importance="fastshap",
+        )
+        sel.fit(X_clean, y)
+        selected = sel.get_feature_names_out()
+
+    elif method == "PowerShap":
+        from powershap import PowerShap
+        from catboost import CatBoostRegressor
+        sel = PowerShap(
+            model=CatBoostRegressor(n_estimators=500, verbose=0),
+            power_alpha=0.05,
+        )
+        sel.fit(X_clean, y)
+        selected = sel.transform(X_clean).columns.tolist()
+
+    elif method == "BorutaShap":
+        from BorutaShap import BorutaShap
+        from catboost import CatBoostRegressor
+        params = {
+            "depth": 6,
+            "learning_rate": 0.05,
+            "iterations": 500,
+            "subsample": 1.0,
+            "random_strength": 0.5,
+            "reg_lambda": 0.001,
+            "loss_function": "RMSE",
+            "early_stopping_rounds": 25,
+            "random_seed": 42,
+            "verbose": False,
+        }
+        model = CatBoostRegressor(**params)
+        sel = BorutaShap(model=model, importance_measure="shap", classification=False)
+        sel.fit(X=X_clean, y=y, n_trials=100, sample=False,
+                train_or_test="test", normalize=True, verbose=False)
+        selected = sel.Subset().columns.tolist()
+
+    elif method == "Genetic":
+        from sklearn_genetic import GAFeatureSelectionCV
+        sel = GAFeatureSelectionCV(
+            estimator=forest,
+            cv=5,
+            scoring="neg_mean_squared_error",
+            population_size=100,
+            generations=40,
+            max_features=max(len(X_clean.columns)//3, min_features_to_select),
+            crossover_probability=0.9,
+            mutation_probability=0.1,
+            keep_top_k=2,
+            elitism=True,
+            n_jobs=-1,
+            verbose=1,
+        )
+        sel.fit(X_clean, y)
+        selected = X_clean.columns[sel.support_].tolist()
+
+    elif method == "RFE":
+        from sklearn.feature_selection import RFE
+        sel = RFE(forest, n_features_to_select=min_features_to_select, step=1, verbose=1)
+        sel = sel.fit(X_clean, y)
+        selected = X_clean.columns[sel.support_].tolist()
+
+    else:
+        raise ValueError(f"Unknown method: {method}")
+
+    # post-filtering: non-EO fallback to SelectKBest
+    non_eo = are_all_features_non_eo(selected)
+    if non_eo or method == "SelectKBest":
+        from sklearn.feature_selection import SelectKBest, f_regression
+        k = 15
+        skb = SelectKBest(score_func=f_regression, k=k)
+        skb.fit(X_clean, y)
+        selected = X_clean.columns[skb.get_support()].tolist()
+
+    # return selector (if exists), filtered DataFrame, and feature list
+    try:
+        return sel, X_clean.loc[:, selected], selected
+    except NameError:
+        # for methods that didn't create `sel`
+        return None, X_clean.loc[:, selected], selected
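For orientation, here is a minimal usage sketch of the new "multi" mode. It is not taken from the package: the synthetic data, column names, and sizes are invented for illustration, and it assumes the optional selector dependencies used by "multi" (boruta, mrmr, matplotlib, scikit-learn) are installed. Only the import path geocif.ml.feature_selection and the call signature come from the diff above.

# Hypothetical example, not part of geocif: exercise select_features(method="multi"),
# which runs BorutaPy, mrmr and lasso and returns the union of their picks.
import numpy as np
import pandas as pd

from geocif.ml.feature_selection import select_features

rng = np.random.default_rng(42)
# 20 synthetic predictors; only f0 and f3 actually drive the target
X = pd.DataFrame(rng.normal(size=(200, 20)),
                 columns=[f"f{i}" for i in range(20)])
y = pd.Series(2.0 * X["f0"] - X["f3"] + rng.normal(scale=0.1, size=200))

# In "multi" mode the function returns (None, filtered DataFrame, feature list)
# and saves feature_selection_multi/feature_selection_frequency.png
selector, X_selected, features = select_features(X, y, method="multi")
print(features)

In "multi" mode the selector object is always None, so callers that previously relied on the returned selector need to use the feature list instead.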
geocif-0.2.2/geocif/ml/feature_selection.py (removed)
@@ -1,350 +0,0 @@
-import numpy as np
-from tqdm import tqdm
-from sklearn.ensemble import RandomForestRegressor
-
-
-def are_all_features_non_eo(features):
-    """
-    Check if all the features non eo features
-
-    Args:
-        feature:
-
-    Returns:
-
-    """
-    non_eo_features = ['Median Yield (tn per ha)',
-                       'Analogous Year',
-                       'Analogous Year Yield',
-                       'lon',
-                       'lat',
-                       't -1 Yield (tn per ha)',
-                       't -2 Yield (tn per ha)',
-                       't -3 Yield (tn per ha)',
-                       't -4 Yield (tn per ha)',
-                       't -5 Yield (tn per ha)']
-
-    # Check if all features are non-eo features, return True if they are
-    return all(feature in non_eo_features for feature in features)
-
-
-def select_features(X, y, method="RFE", min_features_to_select=3, threshold_nan=0.2, threshold_unique=0.6):
-    """
-
-    Args:
-        X:
-        y:
-        method:
-        min_features_to_select:
-        threshold_unique:
-
-    Returns:
-
-    """
-
-    # df = X.copy()
-    #
-    # # Initialize and apply StandardScaler
-    # scaler = StandardScaler()
-    # scaled_data = scaler.fit_transform(df)
-    #
-    # # Initialize and apply VarianceThreshold
-    # # Note: Since data is standardized, all features now have variance of 1 before applying VarianceThreshold.
-    # # You would adjust the threshold based on new criteria since variances have been normalized.
-    # selector = VarianceThreshold(threshold=scaled_data.var().mean())
-    # X = selector.fit_transform(scaled_data)
-    selector = None
-    X_original = X.copy()
-
-    # Calculate the proportion of NaN values in each column
-    nan_proportion = X.isna().mean()
-
-    # Drop columns where more than 20% of the values are NaN
-    X = X.loc[:, nan_proportion <= threshold_nan]
-
-    # Fill in columns with median of that column
-    X = X.fillna(X.median())
-
-    # Calculate the proportion of unique values in each column
-    # unique_proportion = X.nunique(axis="columns") / len(X)
-    #
-    # # Filter columns that have at least 60% unique values
-    # columns_to_keep = unique_proportion[unique_proportion >= threshold_unique].index
-    #
-    # # Drop columns that do not meet the threshold
-    # X = X[columns_to_keep]
-
-    # Define the RandomForestRegressor
-    forest = RandomForestRegressor(
-        n_estimators=500,
-        n_jobs=8,
-        max_depth=5,
-        random_state=1,
-    )
-
-    # Adjusting numpy types due to deprecation warnings or errors
-    np.int = np.int32
-    np.float = np.float64
-    np.bool = np.bool_
-
-    if method == "SHAP":
-        import pandas as pd
-        from catboost import CatBoostRegressor
-        from fasttreeshap import TreeExplainer as FastTreeExplainer
-        from sklearn.model_selection import cross_val_score
-
-        model = CatBoostRegressor(n_estimators=500, verbose=0, use_best_model=False)
-        model.fit(X, y)
-
-        explainer = FastTreeExplainer(model)
-        shap_values = explainer.shap_values(X)
-
-        # Step 5: Summarize the SHAP values for feature importance
-        shap_importances = np.mean(np.abs(shap_values), axis=0)
-        shap_importance_df = pd.DataFrame(
-            {"feature": X.columns, "importance": shap_importances}
-        ).sort_values(by="importance", ascending=False)
-
-        def evaluate_model_with_n_features(N, X_train, y_train):
-            top_features = shap_importance_df["feature"].head(N).values
-            X_train_selected = X_train[top_features]
-            selector = CatBoostRegressor(n_estimators=500, random_state=42, verbose=0)
-            scores = cross_val_score(
-                selector,
-                X_train_selected,
-                y_train,
-                cv=5,
-                scoring="neg_mean_squared_error",
-                n_jobs=-1,
-            )
-
-            return np.mean(scores)
-
-        # Evaluate model performance with different number of features
-        nrange = [5, 10, 15, 20, 25, 30]
-        cv_scores = []
-        for N in tqdm(nrange):
-            cv_scores.append(evaluate_model_with_n_features(N, X, y))
-
-        # Select the number of features that gives the best cross-validation score (lowest MSE)
-        optimal_N = nrange[np.argmax(cv_scores)]
-
-        # Use optimal N to select features
-        selected_features = (
-            shap_importance_df["feature"].head(optimal_N).values.tolist()
-        )
-    elif method == "stabl":
-        from stabl.stabl import Stabl
-        from sklearn.linear_model import Lasso
-
-        stabl = Stabl(
-            base_estimator=Lasso(alpha=0.001),
-            n_bootstraps=10,
-            artificial_type="knockoff",
-            artificial_proportion=.5,
-            replace=False,
-            fdr_threshold_range=np.arange(0.1, 1, 0.01),
-            sample_fraction=0.5,
-            random_state=42,
-            lambda_grid="auto",
-            verbose=1
-        )
-        stabl.fit(X, y)
-        selected_features = stabl.get_feature_names_out()
-    elif method == "feature_engine":
-        from feature_engine.selection import SmartCorrelatedSelection
-
-        selector = SmartCorrelatedSelection(
-            method="pearson",
-            threshold=0.7,
-            selection_method="model_performance",
-            estimator=forest,
-            scoring="neg_mean_squared_error",
-        )
-
-        X_filtered = selector.fit_transform(X, y)
-        selected_features = X_filtered.columns.tolist()
-    elif method == "mrmr":
-        from mrmr import mrmr_regression
-
-        try:
-            selected_features = mrmr_regression(X=X, y=y, K=10)
-        except:
-            breakpoint()
-        # combine X and y into a dataframe
-        # df = pd.concat([X, y], axis=1)
-
-    elif method == "RFECV":
-        from sklearn.feature_selection import RFECV
-        from sklearn.model_selection import KFold
-
-        # Initialize a k-fold cross-validation strategy
-        cv = KFold(n_splits=5)
-
-        # Patch the scoring function to add a progress bar
-        class RFECVWithProgress(RFECV):
-            def _fit(self, X, y):
-                from tqdm import tqdm
-
-                n_features = X.shape[1]
-                with tqdm(total=n_features) as pbar:
-
-                    def patched_scorer(*args, **kwargs):
-                        pbar.update(1)
-                        return self.scorer_(*args, **kwargs)
-
-                    self.scorer_ = patched_scorer
-                    super()._fit(X, y)
-
-        # Initialize RFECV with the estimator and cross-validation strategy
-        selector = RFECVWithProgress(
-            estimator=forest, step=1, n_jobs=-1, cv=cv, scoring="neg_mean_squared_error"
-        )
-        selector.fit(X, y)
-        # Get the selected feature indices
-        selected_features = selector.get_support(indices=True)
-
-        # Get the selected feature names
-        selected_features = X.columns[selected_features].tolist()
-    elif method == "lasso":
-        from sklearn.linear_model import LassoLarsCV
-        from sklearn.feature_selection import SelectFromModel
-
-        # Fit Lasso model (L1 regularization) to perform feature selection
-        lasso = LassoLarsCV(cv=5)
-        lasso.fit(X, y)
-
-        # Use SelectFromModel to remove features with zero coefficients
-        selector = SelectFromModel(lasso, prefit=True)
-
-        # Get the selected features
-        selected_features = X.columns[selector.get_support()].tolist()
-        print(selected_features)
-    elif method == "BorutaPy":
-        from boruta import BorutaPy
-
-        selector = BorutaPy(forest, n_estimators="auto", random_state=42, verbose=0)
-        selector.fit(X.values, y.values)
-        selected_features_mask = selector.support_
-        selected_features = X.columns[selected_features_mask].tolist()
-        tentative_features = X.columns[selector.support_weak_].tolist()
-
-        selected_features = selected_features + tentative_features
-    elif method == "Leshy":
-        import arfs.feature_selection.allrelevant as arfsgroot
-        from catboost import CatBoostRegressor
-
-        model = CatBoostRegressor(n_estimators=350, verbose=0, use_best_model=False)
-        selector = arfsgroot.Leshy(
-            model,
-            n_estimators="auto",
-            verbose=1,
-            max_iter=10,
-            random_state=42,
-            importance="fastshap",
-        )
-        selector.fit(X, y, sample_weight=None)
-
-        selected_features = selector.get_feature_names_out()
-        # feat_selector.plot_importance(n_feat_per_inch=5)
-    elif method == "PowerShap":
-        from powershap import PowerShap
-        from catboost import CatBoostRegressor
-
-        selector = PowerShap(
-            model=CatBoostRegressor(n_estimators=500, verbose=0, use_best_model=True),
-            power_alpha=0.05,
-        )
-
-        selector.fit(X, y)  # Fit the PowerShap feature selector
-        selector.transform(X)  # Reduce the dataset to the selected features
-    elif method == "BorutaShap":
-        from BorutaShap import BorutaShap
-        from catboost import CatBoostRegressor
-
-        hyperparams = {
-            "depth": 6,
-            "learning_rate": 0.05,
-            "iterations": 500,
-            "subsample": 1.0,
-            "random_strength": 0.5,
-            "reg_lambda": 0.001,
-            "loss_function": "RMSE",
-            "early_stopping_rounds": 25,
-            "random_seed": 42,
-            "verbose": False,
-        }
-        model = CatBoostRegressor(**hyperparams)
-
-        selector = BorutaShap(
-            model=model, importance_measure="shap", classification=False
-        )
-        selector.fit(
-            X=X,
-            y=y,
-            n_trials=100,
-            sample=False,
-            train_or_test="test",
-            normalize=True,
-            verbose=False,
-        )
-        selected_features_mask = selector.Subset().columns
-        selected_features = X[selected_features_mask].columns.tolist()
-    elif method == "Genetic":
-        from sklearn_genetic import GAFeatureSelectionCV
-
-        selector = GAFeatureSelectionCV(
-            estimator=forest,
-            cv=5,
-            verbose=1,
-            scoring="neg_mean_squared_error",
-            max_features=max(len(X.columns) // 3, min_features_to_select),
-            population_size=100,
-            generations=40,
-            crossover_probability=0.9,
-            mutation_probability=0.1,
-            keep_top_k=2,
-            elitism=True,
-            n_jobs=-1,
-        )
-        selector.fit(X, y)
-        selected_features_mask = selector.support_
-        selected_features = X.columns[selected_features_mask].tolist()
-    elif method == "RFE":
-        from sklearn.feature_selection import RFE
-
-        selector = RFE(
-            forest, n_features_to_select=min_features_to_select, step=1, verbose=1
-        )
-        selector = selector.fit(X, y)
-        selected_features_mask = selector.support_
-        selected_features = X.columns[selected_features_mask].tolist()
-    else:
-        raise ValueError("Method not recognized. Use BorutaPy, Genetic, or RFE")
-    # tentative_features = X.columns[selector.support_weak_].tolist()
-    print(selected_features)
-    breakpoint()
-    non_eo = are_all_features_non_eo(selected_features)
-    if non_eo or method == "SelectKBest":
-        from sklearn.feature_selection import SelectKBest, f_regression
-
-        k = 15  # Number of features to select
-        selector = SelectKBest(score_func=f_regression, k=k)
-
-        # Fit the selector to the data and transform the data to select the best features
-        try:
-            X_new = selector.fit_transform(X, y)
-        except:
-            breakpoint()
-
-        # Get the selected feature indices
-        selected_features = selector.get_support(indices=True)
-
-        # Get the selected feature names
-        selected_features = X.columns[selected_features].tolist()
-
-        # print(selected_features)
-        # Filter the dataset for selected features
-        X_filtered = X.loc[:, selected_features]
-
-    return selector, X_filtered, selected_features