geocif 0.2.2__tar.gz → 0.2.22__tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {geocif-0.2.2/geocif.egg-info → geocif-0.2.22}/PKG-INFO +1 -1
  2. geocif-0.2.22/geocif/ml/feature_selection.py +316 -0
  3. {geocif-0.2.2 → geocif-0.2.22/geocif.egg-info}/PKG-INFO +1 -1
  4. {geocif-0.2.2 → geocif-0.2.22}/setup.py +1 -1
  5. geocif-0.2.2/geocif/ml/feature_selection.py +0 -350
  6. {geocif-0.2.2 → geocif-0.2.22}/LICENSE +0 -0
  7. {geocif-0.2.2 → geocif-0.2.22}/MANIFEST.in +0 -0
  8. {geocif-0.2.2 → geocif-0.2.22}/README.md +0 -0
  9. {geocif-0.2.2 → geocif-0.2.22}/geocif/__init__.py +0 -0
  10. {geocif-0.2.2 → geocif-0.2.22}/geocif/agmet/__init__.py +0 -0
  11. {geocif-0.2.2 → geocif-0.2.22}/geocif/agmet/geoagmet.py +0 -0
  12. {geocif-0.2.2 → geocif-0.2.22}/geocif/agmet/plot.py +0 -0
  13. {geocif-0.2.2 → geocif-0.2.22}/geocif/agmet/utils.py +0 -0
  14. {geocif-0.2.2 → geocif-0.2.22}/geocif/analysis.py +0 -0
  15. {geocif-0.2.2 → geocif-0.2.22}/geocif/backup/__init__.py +0 -0
  16. {geocif-0.2.2 → geocif-0.2.22}/geocif/backup/constants.py +0 -0
  17. {geocif-0.2.2 → geocif-0.2.22}/geocif/backup/features.py +0 -0
  18. {geocif-0.2.2 → geocif-0.2.22}/geocif/backup/geo.py +0 -0
  19. {geocif-0.2.2 → geocif-0.2.22}/geocif/backup/geocif.py +0 -0
  20. {geocif-0.2.2 → geocif-0.2.22}/geocif/backup/metadata.py +0 -0
  21. {geocif-0.2.2 → geocif-0.2.22}/geocif/backup/models.py +0 -0
  22. {geocif-0.2.2 → geocif-0.2.22}/geocif/cei/__init__.py +0 -0
  23. {geocif-0.2.2 → geocif-0.2.22}/geocif/cei/definitions.py +0 -0
  24. {geocif-0.2.2 → geocif-0.2.22}/geocif/cei/indices.py +0 -0
  25. {geocif-0.2.2 → geocif-0.2.22}/geocif/experiments.py +0 -0
  26. {geocif-0.2.2 → geocif-0.2.22}/geocif/geocif.py +0 -0
  27. {geocif-0.2.2 → geocif-0.2.22}/geocif/geocif_runner.py +0 -0
  28. {geocif-0.2.2 → geocif-0.2.22}/geocif/indices_runner.py +0 -0
  29. {geocif-0.2.2 → geocif-0.2.22}/geocif/indices_runner_angola.py +0 -0
  30. {geocif-0.2.2 → geocif-0.2.22}/geocif/indices_runner_madagascar.py +0 -0
  31. {geocif-0.2.2 → geocif-0.2.22}/geocif/indices_runner_malawi.py +0 -0
  32. {geocif-0.2.2 → geocif-0.2.22}/geocif/indices_runner_mozambique.py +0 -0
  33. {geocif-0.2.2 → geocif-0.2.22}/geocif/indices_runner_south_africa.py +0 -0
  34. {geocif-0.2.2 → geocif-0.2.22}/geocif/indices_runner_zambia.py +0 -0
  35. {geocif-0.2.2 → geocif-0.2.22}/geocif/indices_runner_zimbabwe.py +0 -0
  36. {geocif-0.2.2 → geocif-0.2.22}/geocif/logger.py +0 -0
  37. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/__init__.py +0 -0
  38. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/correlations.py +0 -0
  39. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/embedding.py +0 -0
  40. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/feature_engineering.py +0 -0
  41. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/outliers.py +0 -0
  42. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/outlook.py +0 -0
  43. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/output.py +0 -0
  44. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/spatial_autocorrelation.py +0 -0
  45. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/stages.py +0 -0
  46. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/stats.py +0 -0
  47. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/trainers.py +0 -0
  48. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/trend.py +0 -0
  49. {geocif-0.2.2 → geocif-0.2.22}/geocif/ml/xai.py +0 -0
  50. {geocif-0.2.2 → geocif-0.2.22}/geocif/mm.py +0 -0
  51. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/__init__.py +0 -0
  52. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/aa.py +0 -0
  53. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/area.py +0 -0
  54. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/automl.py +0 -0
  55. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/download_esi.py +0 -0
  56. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/enso.py +0 -0
  57. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/eval.py +0 -0
  58. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/gamtest.py +0 -0
  59. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/gee_access.py +0 -0
  60. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/misc.py +0 -0
  61. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/play_xagg.py +0 -0
  62. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/reg.py +0 -0
  63. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/sustain.py +0 -0
  64. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/test_catboost.py +0 -0
  65. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/tmp.py +0 -0
  66. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/tmp2.py +0 -0
  67. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/tmp3.py +0 -0
  68. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/tmp4.py +0 -0
  69. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/tmp5.py +0 -0
  70. {geocif-0.2.2 → geocif-0.2.22}/geocif/playground/wolayita_maize_mask.py +0 -0
  71. {geocif-0.2.2 → geocif-0.2.22}/geocif/risk/__init__.py +0 -0
  72. {geocif-0.2.2 → geocif-0.2.22}/geocif/risk/impact_assessment.py +0 -0
  73. {geocif-0.2.2 → geocif-0.2.22}/geocif/utils.py +0 -0
  74. {geocif-0.2.2 → geocif-0.2.22}/geocif/viz/__init__.py +0 -0
  75. {geocif-0.2.2 → geocif-0.2.22}/geocif/viz/gt.py +0 -0
  76. {geocif-0.2.2 → geocif-0.2.22}/geocif/viz/plot.py +0 -0
  77. {geocif-0.2.2 → geocif-0.2.22}/geocif/viz/tmp.py +0 -0
  78. {geocif-0.2.2 → geocif-0.2.22}/geocif.egg-info/SOURCES.txt +0 -0
  79. {geocif-0.2.2 → geocif-0.2.22}/geocif.egg-info/dependency_links.txt +0 -0
  80. {geocif-0.2.2 → geocif-0.2.22}/geocif.egg-info/not-zip-safe +0 -0
  81. {geocif-0.2.2 → geocif-0.2.22}/geocif.egg-info/top_level.txt +0 -0
  82. {geocif-0.2.2 → geocif-0.2.22}/requirements.txt +0 -0
  83. {geocif-0.2.2 → geocif-0.2.22}/setup.cfg +0 -0
  84. {geocif-0.2.2 → geocif-0.2.22}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.2.2
+Version: 0.2.22
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
@@ -0,0 +1,316 @@
+import numpy as np
+from tqdm import tqdm
+from sklearn.ensemble import RandomForestRegressor
+from collections import Counter
+from pathlib import Path
+import matplotlib.pyplot as plt
+import pandas as pd
+
+
+def are_all_features_non_eo(features):
+    """
+    Check if all the features are non-EO features
+
+    Args:
+        features: iterable of feature names
+
+    Returns:
+        bool: True if every feature is in the non-EO list
+    """
+    non_eo_features = [
+        'Median Yield (tn per ha)',
+        'Analogous Year',
+        'Analogous Year Yield',
+        'lon',
+        'lat',
+        't -1 Yield (tn per ha)',
+        't -2 Yield (tn per ha)',
+        't -3 Yield (tn per ha)',
+        't -4 Yield (tn per ha)',
+        't -5 Yield (tn per ha)',
+    ]
+    return all(f in non_eo_features for f in features)
+
+
+def select_features(
+    X, y,
+    method="RFE",
+    min_features_to_select=3,
+    threshold_nan=0.2,
+    threshold_unique=0.6
+):
+    """
+    Feature-selection wrapper supporting many methods plus a new 'multi' option.
+
+    Parameters
+    ----------
+    X : pd.DataFrame
+    y : array-like
+    method : str
+        One of {"SHAP", "stabl", "feature_engine", "mrmr", "RFECV", "lasso",
+        "BorutaPy", "Leshy", "PowerShap", "BorutaShap", "Genetic", "RFE", "multi"}
+    min_features_to_select : int
+    threshold_nan : float
+        Drop columns with > threshold_nan proportion of NaNs
+    threshold_unique : float
+        (Reserved for future use)
+
+    Returns
+    -------
+    selector : fitted selector object or None (for multi)
+    X_filtered : pd.DataFrame of selected features
+    selected_features : list[str]
+    """
+
+    # copy original for multi-mode recursion
+    X_clean = X.copy()
+
+    # 1) drop columns with too many NaNs
+    nan_prop = X_clean.isna().mean()
+    X_clean = X_clean.loc[:, nan_prop <= threshold_nan]
+
+    # 2) fill NaNs with median
+    X_clean = X_clean.fillna(X_clean.median())
+
+    # --- multi-method ensemble -------------------------------
+    if method == "multi":
+        counter = Counter()
+        # run three selectors and count feature picks
+        for sub_m in ["BorutaPy", "mrmr"]:
+            _, _, feats = select_features(
+                X_clean, y,
+                method=sub_m,
+                min_features_to_select=min_features_to_select,
+                threshold_nan=threshold_nan,
+                threshold_unique=threshold_unique
+            )
+            counter.update(feats)
+
+        # union of all features
+        combined = sorted(counter.keys())
+        X_out = X_clean.loc[:, combined]
+
+        # plot and save histogram
+        freq = pd.Series(counter).sort_values(ascending=False)
+        fig = freq.plot(kind="bar", width=0.9).get_figure()
+        plt.title("Feature selection frequency across methods")
+        plt.xlabel("Feature")
+        plt.ylabel("Times selected (out of 3)")
+        plt.tight_layout()
+
+        out_dir = Path("feature_selection_multi")
+        out_dir.mkdir(parents=True, exist_ok=True)
+        fig.savefig(out_dir / "feature_selection_frequency.png", dpi=300)
+        plt.close(fig)
+
+        return None, X_out, combined
+
+    # define forest for methods that need it
+    forest = RandomForestRegressor(
+        n_estimators=500,
+        n_jobs=8,
+        max_depth=5,
+        random_state=1,
+    )
+
+    # patch numpy deprecation
+    np.int = np.int32
+    np.float = np.float64
+    np.bool = np.bool_
+
+    if method == "SHAP":
+        import pandas as pd
+        from catboost import CatBoostRegressor
+        from fasttreeshap import TreeExplainer as FastTreeExplainer
+        from sklearn.model_selection import cross_val_score
+
+        model = CatBoostRegressor(n_estimators=500, verbose=0, use_best_model=False)
+        model.fit(X_clean, y)
+        explainer = FastTreeExplainer(model)
+        shap_values = explainer.shap_values(X_clean)
+        shap_importances = np.mean(np.abs(shap_values), axis=0)
+        shap_df = pd.DataFrame({
+            "feature": X_clean.columns,
+            "importance": shap_importances
+        }).sort_values("importance", ascending=False)
+
+        def eval_n(N):
+            top = shap_df["feature"].head(N)
+            sel = CatBoostRegressor(n_estimators=500, random_state=42, verbose=0)
+            scores = cross_val_score(sel, X_clean[top], y,
+                                     cv=5, scoring="neg_mean_squared_error",
+                                     n_jobs=-1)
+            return np.mean(scores)
+
+        nrange = [5, 10, 15, 20, 25, 30]
+        scores = [eval_n(N) for N in tqdm(nrange)]
+        best = nrange[np.argmax(scores)]
+        selected = shap_df["feature"].head(best).tolist()
+
+    elif method == "stabl":
+        from stabl.stabl import Stabl
+        from sklearn.linear_model import Lasso
+
+        st = Stabl(
+            base_estimator=Lasso(alpha=0.001),
+            n_bootstraps=10,
+            artificial_type="knockoff",
+            artificial_proportion=0.5,
+            replace=False,
+            fdr_threshold_range=np.arange(0.1, 1, 0.01),
+            sample_fraction=0.5,
+            random_state=42,
+            lambda_grid="auto",
+            verbose=1
+        )
+        st.fit(X_clean, y)
+        selected = st.get_feature_names_out()
+
+    elif method == "feature_engine":
+        from feature_engine.selection import SmartCorrelatedSelection
+        sel = SmartCorrelatedSelection(
+            method="pearson",
+            threshold=0.7,
+            selection_method="model_performance",
+            estimator=forest,
+            scoring="neg_mean_squared_error",
+        )
+        X_fe = sel.fit_transform(X_clean, y)
+        selected = X_fe.columns.tolist()
+
+    elif method == "mrmr":
+        from mrmr import mrmr_regression
+        selected = mrmr_regression(X=X_clean, y=y, K=10)
+
+    elif method == "RFECV":
+        from sklearn.feature_selection import RFECV
+        from sklearn.model_selection import KFold
+
+        class RFECVProg(RFECV):
+            def _fit(self, X, y):
+                with tqdm(total=X.shape[1]) as p:
+                    orig = self.scorer_
+                    def wrap(*a, **k):
+                        p.update(1)
+                        return orig(*a, **k)
+                    self.scorer_ = wrap
+                    super()._fit(X, y)
+
+        cv = KFold(n_splits=5)
+        sel = RFECVProg(
+            estimator=forest,
+            step=1,
+            cv=cv,
+            scoring="neg_mean_squared_error",
+            n_jobs=-1,
+            verbose=0
+        )
+        sel.fit(X_clean, y)
+        mask = sel.get_support()
+        selected = X_clean.columns[mask].tolist()
+
+    elif method == "lasso":
+        from sklearn.linear_model import LassoLarsCV
+        from sklearn.feature_selection import SelectFromModel
+
+        lr = LassoLarsCV(cv=5)
+        lr.fit(X_clean, y)
+        sfm = SelectFromModel(lr, prefit=True)
+        selected = X_clean.columns[sfm.get_support()].tolist()
+
+    elif method == "BorutaPy":
+        from boruta import BorutaPy
+        sel = BorutaPy(forest, n_estimators="auto", random_state=42, verbose=0)
+        sel.fit(X_clean.values, y)
+        mask = sel.support_ | sel.support_weak_
+        selected = X_clean.columns[mask].tolist()
+
+    elif method == "Leshy":
+        import arfs.feature_selection.allrelevant as arfsgroot
+        from catboost import CatBoostRegressor
+        model = CatBoostRegressor(n_estimators=350, verbose=0, use_best_model=False)
+        sel = arfsgroot.Leshy(
+            model,
+            n_estimators="auto",
+            verbose=1,
+            max_iter=10,
+            random_state=42,
+            importance="fastshap",
+        )
+        sel.fit(X_clean, y)
+        selected = sel.get_feature_names_out()
+
+    elif method == "PowerShap":
+        from powershap import PowerShap
+        from catboost import CatBoostRegressor
+        sel = PowerShap(
+            model=CatBoostRegressor(n_estimators=500, verbose=0),
+            power_alpha=0.05,
+        )
+        sel.fit(X_clean, y)
+        selected = sel.transform(X_clean).columns.tolist()
+
+    elif method == "BorutaShap":
+        from BorutaShap import BorutaShap
+        from catboost import CatBoostRegressor
+        params = {
+            "depth": 6,
+            "learning_rate": 0.05,
+            "iterations": 500,
+            "subsample": 1.0,
+            "random_strength": 0.5,
+            "reg_lambda": 0.001,
+            "loss_function": "RMSE",
+            "early_stopping_rounds": 25,
+            "random_seed": 42,
+            "verbose": False,
+        }
+        model = CatBoostRegressor(**params)
+        sel = BorutaShap(model=model, importance_measure="shap", classification=False)
+        sel.fit(X=X_clean, y=y, n_trials=100, sample=False,
+                train_or_test="test", normalize=True, verbose=False)
+        selected = sel.Subset().columns.tolist()
+
+    elif method == "Genetic":
+        from sklearn_genetic import GAFeatureSelectionCV
+        sel = GAFeatureSelectionCV(
+            estimator=forest,
+            cv=5,
+            scoring="neg_mean_squared_error",
+            population_size=100,
+            generations=40,
+            max_features=max(len(X_clean.columns) // 3, min_features_to_select),
+            crossover_probability=0.9,
+            mutation_probability=0.1,
+            keep_top_k=2,
+            elitism=True,
+            n_jobs=-1,
+            verbose=1,
+        )
+        sel.fit(X_clean, y)
+        selected = X_clean.columns[sel.support_].tolist()
+
+    elif method == "RFE":
+        from sklearn.feature_selection import RFE
+        sel = RFE(forest, n_features_to_select=min_features_to_select, step=1, verbose=1)
+        sel = sel.fit(X_clean, y)
+        selected = X_clean.columns[sel.support_].tolist()
+
+    else:
+        raise ValueError(f"Unknown method: {method}")
+
+    # post-filtering: non-EO fallback to SelectKBest
+    non_eo = are_all_features_non_eo(selected)
+    if non_eo or method == "SelectKBest":
+        from sklearn.feature_selection import SelectKBest, f_regression
+        k = 15
+        skb = SelectKBest(score_func=f_regression, k=k)
+        skb.fit(X_clean, y)
+        selected = X_clean.columns[skb.get_support()].tolist()
+
+    # return selector (if exists), filtered DataFrame, and feature list
+    try:
+        return sel, X_clean.loc[:, selected], selected
+    except NameError:
+        # for methods that didn't create `sel`
+        return None, X_clean.loc[:, selected], selected
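
For orientation, here is a minimal usage sketch of the rewritten select_features entry point added in 0.2.22. The synthetic data and column names are hypothetical, and it assumes the optional selection back-ends used by the "multi" mode (boruta, mrmr) are installed; in that mode the function returns None in place of a fitted selector.

# Hypothetical example; synthetic data and column names are illustrative only.
import numpy as np
import pandas as pd

from geocif.ml.feature_selection import select_features

rng = np.random.default_rng(42)
n = 200

# Toy design matrix mixing EO-style predictors with a lagged-yield column
X = pd.DataFrame({
    "NDVI mean": rng.normal(0.6, 0.1, n),
    "Precipitation total": rng.normal(300.0, 40.0, n),
    "ESI mean": rng.normal(0.0, 1.0, n),
    "t -1 Yield (tn per ha)": rng.normal(2.0, 0.3, n),
})
y = 2.0 * X["NDVI mean"] + 0.005 * X["Precipitation total"] + rng.normal(0.0, 0.1, n)

# "multi" runs BorutaPy and mrmr, keeps the union of their picks, saves a
# selection-frequency plot under feature_selection_multi/, and returns
# (None, filtered DataFrame, list of selected feature names)
selector, X_selected, selected_features = select_features(X, y, method="multi")
print(selected_features)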
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.2.2
+Version: 0.2.22
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
@@ -50,6 +50,6 @@ setup(
     test_suite="tests",
     tests_require=test_requirements,
     url="https://ritviksahajpal.github.io/yield_forecasting/",
-    version="0.2.02",
+    version="0.2.22",
     zip_safe=False,
 )
@@ -1,350 +0,0 @@
-import numpy as np
-from tqdm import tqdm
-from sklearn.ensemble import RandomForestRegressor
-
-
-def are_all_features_non_eo(features):
-    """
-    Check if all the features non eo features
-
-    Args:
-        feature:
-
-    Returns:
-
-    """
-    non_eo_features = ['Median Yield (tn per ha)',
-                       'Analogous Year',
-                       'Analogous Year Yield',
-                       'lon',
-                       'lat',
-                       't -1 Yield (tn per ha)',
-                       't -2 Yield (tn per ha)',
-                       't -3 Yield (tn per ha)',
-                       't -4 Yield (tn per ha)',
-                       't -5 Yield (tn per ha)']
-
-    # Check if all features are non-eo features, return True if they are
-    return all(feature in non_eo_features for feature in features)
-
-
-def select_features(X, y, method="RFE", min_features_to_select=3, threshold_nan=0.2, threshold_unique=0.6):
-    """
-
-    Args:
-        X:
-        y:
-        method:
-        min_features_to_select:
-        threshold_unique:
-
-    Returns:
-
-    """
-
-    # df = X.copy()
-    #
-    # # Initialize and apply StandardScaler
-    # scaler = StandardScaler()
-    # scaled_data = scaler.fit_transform(df)
-    #
-    # # Initialize and apply VarianceThreshold
-    # # Note: Since data is standardized, all features now have variance of 1 before applying VarianceThreshold.
-    # # You would adjust the threshold based on new criteria since variances have been normalized.
-    # selector = VarianceThreshold(threshold=scaled_data.var().mean())
-    # X = selector.fit_transform(scaled_data)
-    selector = None
-    X_original = X.copy()
-
-    # Calculate the proportion of NaN values in each column
-    nan_proportion = X.isna().mean()
-
-    # Drop columns where more than 20% of the values are NaN
-    X = X.loc[:, nan_proportion <= threshold_nan]
-
-    # Fill in columns with median of that column
-    X = X.fillna(X.median())
-
-    # Calculate the proportion of unique values in each column
-    # unique_proportion = X.nunique(axis="columns") / len(X)
-    #
-    # # Filter columns that have at least 60% unique values
-    # columns_to_keep = unique_proportion[unique_proportion >= threshold_unique].index
-    #
-    # # Drop columns that do not meet the threshold
-    # X = X[columns_to_keep]
-
-    # Define the RandomForestRegressor
-    forest = RandomForestRegressor(
-        n_estimators=500,
-        n_jobs=8,
-        max_depth=5,
-        random_state=1,
-    )
-
-    # Adjusting numpy types due to deprecation warnings or errors
-    np.int = np.int32
-    np.float = np.float64
-    np.bool = np.bool_
-
-    if method == "SHAP":
-        import pandas as pd
-        from catboost import CatBoostRegressor
-        from fasttreeshap import TreeExplainer as FastTreeExplainer
-        from sklearn.model_selection import cross_val_score
-
-        model = CatBoostRegressor(n_estimators=500, verbose=0, use_best_model=False)
-        model.fit(X, y)
-
-        explainer = FastTreeExplainer(model)
-        shap_values = explainer.shap_values(X)
-
-        # Step 5: Summarize the SHAP values for feature importance
-        shap_importances = np.mean(np.abs(shap_values), axis=0)
-        shap_importance_df = pd.DataFrame(
-            {"feature": X.columns, "importance": shap_importances}
-        ).sort_values(by="importance", ascending=False)
-
-        def evaluate_model_with_n_features(N, X_train, y_train):
-            top_features = shap_importance_df["feature"].head(N).values
-            X_train_selected = X_train[top_features]
-            selector = CatBoostRegressor(n_estimators=500, random_state=42, verbose=0)
-            scores = cross_val_score(
-                selector,
-                X_train_selected,
-                y_train,
-                cv=5,
-                scoring="neg_mean_squared_error",
-                n_jobs=-1,
-            )
-
-            return np.mean(scores)
-
-        # Evaluate model performance with different number of features
-        nrange = [5, 10, 15, 20, 25, 30]
-        cv_scores = []
-        for N in tqdm(nrange):
-            cv_scores.append(evaluate_model_with_n_features(N, X, y))
-
-        # Select the number of features that gives the best cross-validation score (lowest MSE)
-        optimal_N = nrange[np.argmax(cv_scores)]
-
-        # Use optimal N to select features
-        selected_features = (
-            shap_importance_df["feature"].head(optimal_N).values.tolist()
-        )
-    elif method == "stabl":
-        from stabl.stabl import Stabl
-        from sklearn.linear_model import Lasso
-
-        stabl = Stabl(
-            base_estimator=Lasso(alpha=0.001),
-            n_bootstraps=10,
-            artificial_type="knockoff",
-            artificial_proportion=.5,
-            replace=False,
-            fdr_threshold_range=np.arange(0.1, 1, 0.01),
-            sample_fraction=0.5,
-            random_state=42,
-            lambda_grid="auto",
-            verbose=1
-        )
-        stabl.fit(X, y)
-        selected_features = stabl.get_feature_names_out()
-    elif method == "feature_engine":
-        from feature_engine.selection import SmartCorrelatedSelection
-
-        selector = SmartCorrelatedSelection(
-            method="pearson",
-            threshold=0.7,
-            selection_method="model_performance",
-            estimator=forest,
-            scoring="neg_mean_squared_error",
-        )
-
-        X_filtered = selector.fit_transform(X, y)
-        selected_features = X_filtered.columns.tolist()
-    elif method == "mrmr":
-        from mrmr import mrmr_regression
-
-        try:
-            selected_features = mrmr_regression(X=X, y=y, K=10)
-        except:
-            breakpoint()
-        # combine X and y into a dataframe
-        # df = pd.concat([X, y], axis=1)
-
-    elif method == "RFECV":
-        from sklearn.feature_selection import RFECV
-        from sklearn.model_selection import KFold
-
-        # Initialize a k-fold cross-validation strategy
-        cv = KFold(n_splits=5)
-
-        # Patch the scoring function to add a progress bar
-        class RFECVWithProgress(RFECV):
-            def _fit(self, X, y):
-                from tqdm import tqdm
-
-                n_features = X.shape[1]
-                with tqdm(total=n_features) as pbar:
-
-                    def patched_scorer(*args, **kwargs):
-                        pbar.update(1)
-                        return self.scorer_(*args, **kwargs)
-
-                    self.scorer_ = patched_scorer
-                    super()._fit(X, y)
-
-        # Initialize RFECV with the estimator and cross-validation strategy
-        selector = RFECVWithProgress(
-            estimator=forest, step=1, n_jobs=-1, cv=cv, scoring="neg_mean_squared_error"
-        )
-        selector.fit(X, y)
-        # Get the selected feature indices
-        selected_features = selector.get_support(indices=True)
-
-        # Get the selected feature names
-        selected_features = X.columns[selected_features].tolist()
-    elif method == "lasso":
-        from sklearn.linear_model import LassoLarsCV
-        from sklearn.feature_selection import SelectFromModel
-
-        # Fit Lasso model (L1 regularization) to perform feature selection
-        lasso = LassoLarsCV(cv=5)
-        lasso.fit(X, y)
-
-        # Use SelectFromModel to remove features with zero coefficients
-        selector = SelectFromModel(lasso, prefit=True)
-
-        # Get the selected features
-        selected_features = X.columns[selector.get_support()].tolist()
-        print(selected_features)
-    elif method == "BorutaPy":
-        from boruta import BorutaPy
-
-        selector = BorutaPy(forest, n_estimators="auto", random_state=42, verbose=0)
-        selector.fit(X.values, y.values)
-        selected_features_mask = selector.support_
-        selected_features = X.columns[selected_features_mask].tolist()
-        tentative_features = X.columns[selector.support_weak_].tolist()
-
-        selected_features = selected_features + tentative_features
-    elif method == "Leshy":
-        import arfs.feature_selection.allrelevant as arfsgroot
-        from catboost import CatBoostRegressor
-
-        model = CatBoostRegressor(n_estimators=350, verbose=0, use_best_model=False)
-        selector = arfsgroot.Leshy(
-            model,
-            n_estimators="auto",
-            verbose=1,
-            max_iter=10,
-            random_state=42,
-            importance="fastshap",
-        )
-        selector.fit(X, y, sample_weight=None)
-
-        selected_features = selector.get_feature_names_out()
-        # feat_selector.plot_importance(n_feat_per_inch=5)
-    elif method == "PowerShap":
-        from powershap import PowerShap
-        from catboost import CatBoostRegressor
-
-        selector = PowerShap(
-            model=CatBoostRegressor(n_estimators=500, verbose=0, use_best_model=True),
-            power_alpha=0.05,
-        )
-
-        selector.fit(X, y)  # Fit the PowerShap feature selector
-        selector.transform(X)  # Reduce the dataset to the selected features
-    elif method == "BorutaShap":
-        from BorutaShap import BorutaShap
-        from catboost import CatBoostRegressor
-
-        hyperparams = {
-            "depth": 6,
-            "learning_rate": 0.05,
-            "iterations": 500,
-            "subsample": 1.0,
-            "random_strength": 0.5,
-            "reg_lambda": 0.001,
-            "loss_function": "RMSE",
-            "early_stopping_rounds": 25,
-            "random_seed": 42,
-            "verbose": False,
-        }
-        model = CatBoostRegressor(**hyperparams)
-
-        selector = BorutaShap(
-            model=model, importance_measure="shap", classification=False
-        )
-        selector.fit(
-            X=X,
-            y=y,
-            n_trials=100,
-            sample=False,
-            train_or_test="test",
-            normalize=True,
-            verbose=False,
-        )
-        selected_features_mask = selector.Subset().columns
-        selected_features = X[selected_features_mask].columns.tolist()
-    elif method == "Genetic":
-        from sklearn_genetic import GAFeatureSelectionCV
-
-        selector = GAFeatureSelectionCV(
-            estimator=forest,
-            cv=5,
-            verbose=1,
-            scoring="neg_mean_squared_error",
-            max_features=max(len(X.columns) // 3, min_features_to_select),
-            population_size=100,
-            generations=40,
-            crossover_probability=0.9,
-            mutation_probability=0.1,
-            keep_top_k=2,
-            elitism=True,
-            n_jobs=-1,
-        )
-        selector.fit(X, y)
-        selected_features_mask = selector.support_
-        selected_features = X.columns[selected_features_mask].tolist()
-    elif method == "RFE":
-        from sklearn.feature_selection import RFE
-
-        selector = RFE(
-            forest, n_features_to_select=min_features_to_select, step=1, verbose=1
-        )
-        selector = selector.fit(X, y)
-        selected_features_mask = selector.support_
-        selected_features = X.columns[selected_features_mask].tolist()
-    else:
-        raise ValueError("Method not recognized. Use BorutaPy, Genetic, or RFE")
-    # tentative_features = X.columns[selector.support_weak_].tolist()
-    print(selected_features)
-    breakpoint()
-    non_eo = are_all_features_non_eo(selected_features)
-    if non_eo or method == "SelectKBest":
-        from sklearn.feature_selection import SelectKBest, f_regression
-
-        k = 15  # Number of features to select
-        selector = SelectKBest(score_func=f_regression, k=k)
-
-        # Fit the selector to the data and transform the data to select the best features
-        try:
-            X_new = selector.fit_transform(X, y)
-        except:
-            breakpoint()
-
-        # Get the selected feature indices
-        selected_features = selector.get_support(indices=True)
-
-        # Get the selected feature names
-        selected_features = X.columns[selected_features].tolist()
-
-    # print(selected_features)
-    # Filter the dataset for selected features
-    X_filtered = X.loc[:, selected_features]
-
-    return selector, X_filtered, selected_features