geocif 0.2.1.tar.gz → 0.2.21.tar.gz

This diff shows the content of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two released versions.
Files changed (84)
  1. {geocif-0.2.1/geocif.egg-info → geocif-0.2.21}/PKG-INFO +1 -1
  2. geocif-0.2.21/geocif/ml/feature_selection.py +316 -0
  3. {geocif-0.2.1 → geocif-0.2.21/geocif.egg-info}/PKG-INFO +1 -1
  4. {geocif-0.2.1 → geocif-0.2.21}/setup.py +1 -1
  5. geocif-0.2.1/geocif/ml/feature_selection.py +0 -349
  6. {geocif-0.2.1 → geocif-0.2.21}/LICENSE +0 -0
  7. {geocif-0.2.1 → geocif-0.2.21}/MANIFEST.in +0 -0
  8. {geocif-0.2.1 → geocif-0.2.21}/README.md +0 -0
  9. {geocif-0.2.1 → geocif-0.2.21}/geocif/__init__.py +0 -0
  10. {geocif-0.2.1 → geocif-0.2.21}/geocif/agmet/__init__.py +0 -0
  11. {geocif-0.2.1 → geocif-0.2.21}/geocif/agmet/geoagmet.py +0 -0
  12. {geocif-0.2.1 → geocif-0.2.21}/geocif/agmet/plot.py +0 -0
  13. {geocif-0.2.1 → geocif-0.2.21}/geocif/agmet/utils.py +0 -0
  14. {geocif-0.2.1 → geocif-0.2.21}/geocif/analysis.py +0 -0
  15. {geocif-0.2.1 → geocif-0.2.21}/geocif/backup/__init__.py +0 -0
  16. {geocif-0.2.1 → geocif-0.2.21}/geocif/backup/constants.py +0 -0
  17. {geocif-0.2.1 → geocif-0.2.21}/geocif/backup/features.py +0 -0
  18. {geocif-0.2.1 → geocif-0.2.21}/geocif/backup/geo.py +0 -0
  19. {geocif-0.2.1 → geocif-0.2.21}/geocif/backup/geocif.py +0 -0
  20. {geocif-0.2.1 → geocif-0.2.21}/geocif/backup/metadata.py +0 -0
  21. {geocif-0.2.1 → geocif-0.2.21}/geocif/backup/models.py +0 -0
  22. {geocif-0.2.1 → geocif-0.2.21}/geocif/cei/__init__.py +0 -0
  23. {geocif-0.2.1 → geocif-0.2.21}/geocif/cei/definitions.py +0 -0
  24. {geocif-0.2.1 → geocif-0.2.21}/geocif/cei/indices.py +0 -0
  25. {geocif-0.2.1 → geocif-0.2.21}/geocif/experiments.py +0 -0
  26. {geocif-0.2.1 → geocif-0.2.21}/geocif/geocif.py +0 -0
  27. {geocif-0.2.1 → geocif-0.2.21}/geocif/geocif_runner.py +0 -0
  28. {geocif-0.2.1 → geocif-0.2.21}/geocif/indices_runner.py +0 -0
  29. {geocif-0.2.1 → geocif-0.2.21}/geocif/indices_runner_angola.py +0 -0
  30. {geocif-0.2.1 → geocif-0.2.21}/geocif/indices_runner_madagascar.py +0 -0
  31. {geocif-0.2.1 → geocif-0.2.21}/geocif/indices_runner_malawi.py +0 -0
  32. {geocif-0.2.1 → geocif-0.2.21}/geocif/indices_runner_mozambique.py +0 -0
  33. {geocif-0.2.1 → geocif-0.2.21}/geocif/indices_runner_south_africa.py +0 -0
  34. {geocif-0.2.1 → geocif-0.2.21}/geocif/indices_runner_zambia.py +0 -0
  35. {geocif-0.2.1 → geocif-0.2.21}/geocif/indices_runner_zimbabwe.py +0 -0
  36. {geocif-0.2.1 → geocif-0.2.21}/geocif/logger.py +0 -0
  37. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/__init__.py +0 -0
  38. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/correlations.py +0 -0
  39. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/embedding.py +0 -0
  40. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/feature_engineering.py +0 -0
  41. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/outliers.py +0 -0
  42. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/outlook.py +0 -0
  43. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/output.py +0 -0
  44. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/spatial_autocorrelation.py +0 -0
  45. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/stages.py +0 -0
  46. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/stats.py +0 -0
  47. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/trainers.py +0 -0
  48. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/trend.py +0 -0
  49. {geocif-0.2.1 → geocif-0.2.21}/geocif/ml/xai.py +0 -0
  50. {geocif-0.2.1 → geocif-0.2.21}/geocif/mm.py +0 -0
  51. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/__init__.py +0 -0
  52. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/aa.py +0 -0
  53. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/area.py +0 -0
  54. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/automl.py +0 -0
  55. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/download_esi.py +0 -0
  56. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/enso.py +0 -0
  57. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/eval.py +0 -0
  58. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/gamtest.py +0 -0
  59. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/gee_access.py +0 -0
  60. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/misc.py +0 -0
  61. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/play_xagg.py +0 -0
  62. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/reg.py +0 -0
  63. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/sustain.py +0 -0
  64. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/test_catboost.py +0 -0
  65. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/tmp.py +0 -0
  66. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/tmp2.py +0 -0
  67. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/tmp3.py +0 -0
  68. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/tmp4.py +0 -0
  69. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/tmp5.py +0 -0
  70. {geocif-0.2.1 → geocif-0.2.21}/geocif/playground/wolayita_maize_mask.py +0 -0
  71. {geocif-0.2.1 → geocif-0.2.21}/geocif/risk/__init__.py +0 -0
  72. {geocif-0.2.1 → geocif-0.2.21}/geocif/risk/impact_assessment.py +0 -0
  73. {geocif-0.2.1 → geocif-0.2.21}/geocif/utils.py +0 -0
  74. {geocif-0.2.1 → geocif-0.2.21}/geocif/viz/__init__.py +0 -0
  75. {geocif-0.2.1 → geocif-0.2.21}/geocif/viz/gt.py +0 -0
  76. {geocif-0.2.1 → geocif-0.2.21}/geocif/viz/plot.py +0 -0
  77. {geocif-0.2.1 → geocif-0.2.21}/geocif/viz/tmp.py +0 -0
  78. {geocif-0.2.1 → geocif-0.2.21}/geocif.egg-info/SOURCES.txt +0 -0
  79. {geocif-0.2.1 → geocif-0.2.21}/geocif.egg-info/dependency_links.txt +0 -0
  80. {geocif-0.2.1 → geocif-0.2.21}/geocif.egg-info/not-zip-safe +0 -0
  81. {geocif-0.2.1 → geocif-0.2.21}/geocif.egg-info/top_level.txt +0 -0
  82. {geocif-0.2.1 → geocif-0.2.21}/requirements.txt +0 -0
  83. {geocif-0.2.1 → geocif-0.2.21}/setup.cfg +0 -0
  84. {geocif-0.2.1 → geocif-0.2.21}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: geocif
- Version: 0.2.1
+ Version: 0.2.21
  Summary: Models to visualize and forecast crop conditions and yields
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
  Author: Ritvik Sahajpal
@@ -0,0 +1,316 @@
+ import numpy as np
+ from tqdm import tqdm
+ from sklearn.ensemble import RandomForestRegressor
+ from collections import Counter
+ from pathlib import Path
+ import matplotlib.pyplot as plt
+ import pandas as pd
+
+
+ def are_all_features_non_eo(features):
+     """
+     Check if all the features are non-EO features
+
+     Args:
+         features: iterable of feature names
+
+     Returns:
+         bool: True if every feature is in the non-EO list
+     """
+     non_eo_features = [
+         'Median Yield (tn per ha)',
+         'Analogous Year',
+         'Analogous Year Yield',
+         'lon',
+         'lat',
+         't -1 Yield (tn per ha)',
+         't -2 Yield (tn per ha)',
+         't -3 Yield (tn per ha)',
+         't -4 Yield (tn per ha)',
+         't -5 Yield (tn per ha)',
+     ]
+     return all(f in non_eo_features for f in features)
+
+
+ def select_features(
+     X, y,
+     method="RFE",
+     min_features_to_select=3,
+     threshold_nan=0.2,
+     threshold_unique=0.6
+ ):
+     """
+     Feature-selection wrapper supporting many methods plus a new 'multi' option.
+
+     Parameters
+     ----------
+     X : pd.DataFrame
+     y : array-like
+     method : str
+         One of {"SHAP", "stabl", "feature_engine", "mrmr", "RFECV", "lasso",
+         "BorutaPy", "Leshy", "PowerShap", "BorutaShap", "Genetic", "RFE", "multi"}
+     min_features_to_select : int
+     threshold_nan : float
+         Drop columns with > threshold_nan proportion of NaNs
+     threshold_unique : float
+         (Reserved for future use)
+
+     Returns
+     -------
+     selector : fitted selector object or None (for multi)
+     X_filtered : pd.DataFrame of selected features
+     selected_features : list[str]
+     """
+
+     # copy original for multi-mode recursion
+     X_clean = X.copy()
+
+     # 1) drop columns with too many NaNs
+     nan_prop = X_clean.isna().mean()
+     X_clean = X_clean.loc[:, nan_prop <= threshold_nan]
+
+     # 2) fill NaNs with median
+     X_clean = X_clean.fillna(X_clean.median())
+
+     # --- multi-method ensemble -------------------------------
+     if method.lower() == "multi":
+         counter = Counter()
+         # run three selectors and count feature picks
+         for sub_m in ["BorutaPy", "mrmr", "lasso"]:
+             _, _, feats = select_features(
+                 X_clean, y,
+                 method=sub_m,
+                 min_features_to_select=min_features_to_select,
+                 threshold_nan=threshold_nan,
+                 threshold_unique=threshold_unique
+             )
+             counter.update(feats)
+
+         # union of all features
+         combined = sorted(counter.keys())
+         X_out = X_clean.loc[:, combined]
+
+         # plot and save histogram
+         freq = pd.Series(counter).sort_values(ascending=False)
+         fig = freq.plot(kind="bar", width=0.9).get_figure()
+         plt.title("Feature selection frequency across methods")
+         plt.xlabel("Feature")
+         plt.ylabel("Times selected (out of 3)")
+         plt.tight_layout()
+
+         out_dir = Path("feature_selection_multi")
+         out_dir.mkdir(parents=True, exist_ok=True)
+         fig.savefig(out_dir / "feature_selection_frequency.png", dpi=300)
+         plt.close(fig)
+
+         return None, X_out, combined
+
+     # define forest for methods that need it
+     forest = RandomForestRegressor(
+         n_estimators=500,
+         n_jobs=8,
+         max_depth=5,
+         random_state=1,
+     )
+
+     # patch numpy deprecation
+     np.int = np.int32
+     np.float = np.float64
+     np.bool = np.bool_
+
+     if method == "SHAP":
+         import pandas as pd
+         from catboost import CatBoostRegressor
+         from fasttreeshap import TreeExplainer as FastTreeExplainer
+         from sklearn.model_selection import cross_val_score
+
+         model = CatBoostRegressor(n_estimators=500, verbose=0, use_best_model=False)
+         model.fit(X_clean, y)
+         explainer = FastTreeExplainer(model)
+         shap_values = explainer.shap_values(X_clean)
+         shap_importances = np.mean(np.abs(shap_values), axis=0)
+         shap_df = pd.DataFrame({
+             "feature": X_clean.columns,
+             "importance": shap_importances
+         }).sort_values("importance", ascending=False)
+
+         def eval_n(N):
+             top = shap_df["feature"].head(N)
+             sel = CatBoostRegressor(n_estimators=500, random_state=42, verbose=0)
+             scores = cross_val_score(sel, X_clean[top], y,
+                                      cv=5, scoring="neg_mean_squared_error",
+                                      n_jobs=-1)
+             return np.mean(scores)
+
+         nrange = [5, 10, 15, 20, 25, 30]
+         scores = [eval_n(N) for N in tqdm(nrange)]
+         best = nrange[np.argmax(scores)]
+         selected = shap_df["feature"].head(best).tolist()
+
+     elif method == "stabl":
+         from stabl.stabl import Stabl
+         from sklearn.linear_model import Lasso
+
+         st = Stabl(
+             base_estimator=Lasso(alpha=0.001),
+             n_bootstraps=10,
+             artificial_type="knockoff",
+             artificial_proportion=0.5,
+             replace=False,
+             fdr_threshold_range=np.arange(0.1, 1, 0.01),
+             sample_fraction=0.5,
+             random_state=42,
+             lambda_grid="auto",
+             verbose=1
+         )
+         st.fit(X_clean, y)
+         selected = st.get_feature_names_out()
+
+     elif method == "feature_engine":
+         from feature_engine.selection import SmartCorrelatedSelection
+         sel = SmartCorrelatedSelection(
+             method="pearson",
+             threshold=0.7,
+             selection_method="model_performance",
+             estimator=forest,
+             scoring="neg_mean_squared_error",
+         )
+         X_fe = sel.fit_transform(X_clean, y)
+         selected = X_fe.columns.tolist()
+
+     elif method == "mrmr":
+         from mrmr import mrmr_regression
+         selected = mrmr_regression(X=X_clean, y=y, K=10)
+
+     elif method == "RFECV":
+         from sklearn.feature_selection import RFECV
+         from sklearn.model_selection import KFold
+
+         class RFECVProg(RFECV):
+             def _fit(self, X, y):
+                 with tqdm(total=X.shape[1]) as p:
+                     orig = self.scorer_
+                     def wrap(*a, **k):
+                         p.update(1)
+                         return orig(*a, **k)
+                     self.scorer_ = wrap
+                     super()._fit(X, y)
+
+         cv = KFold(n_splits=5)
+         sel = RFECVProg(
+             estimator=forest,
+             step=1,
+             cv=cv,
+             scoring="neg_mean_squared_error",
+             n_jobs=-1,
+             verbose=0
+         )
+         sel.fit(X_clean, y)
+         mask = sel.get_support()
+         selected = X_clean.columns[mask].tolist()
+
+     elif method == "lasso":
+         from sklearn.linear_model import LassoLarsCV
+         from sklearn.feature_selection import SelectFromModel
+
+         lr = LassoLarsCV(cv=5)
+         lr.fit(X_clean, y)
+         sfm = SelectFromModel(lr, prefit=True)
+         selected = X_clean.columns[sfm.get_support()].tolist()
+
+     elif method == "BorutaPy":
+         from boruta import BorutaPy
+         sel = BorutaPy(forest, n_estimators="auto", random_state=42, verbose=0)
+         sel.fit(X_clean.values, y)
+         mask = sel.support_ | sel.support_weak_
+         selected = X_clean.columns[mask].tolist()
+
+     elif method == "Leshy":
+         import arfs.feature_selection.allrelevant as arfsgroot
+         from catboost import CatBoostRegressor
+         model = CatBoostRegressor(n_estimators=350, verbose=0, use_best_model=False)
+         sel = arfsgroot.Leshy(
+             model,
+             n_estimators="auto",
+             verbose=1,
+             max_iter=10,
+             random_state=42,
+             importance="fastshap",
+         )
+         sel.fit(X_clean, y)
+         selected = sel.get_feature_names_out()
+
+     elif method == "PowerShap":
+         from powershap import PowerShap
+         from catboost import CatBoostRegressor
+         sel = PowerShap(
+             model=CatBoostRegressor(n_estimators=500, verbose=0),
+             power_alpha=0.05,
+         )
+         sel.fit(X_clean, y)
+         selected = sel.transform(X_clean).columns.tolist()
+
+     elif method == "BorutaShap":
+         from BorutaShap import BorutaShap
+         from catboost import CatBoostRegressor
+         params = {
+             "depth": 6,
+             "learning_rate": 0.05,
+             "iterations": 500,
+             "subsample": 1.0,
+             "random_strength": 0.5,
+             "reg_lambda": 0.001,
+             "loss_function": "RMSE",
+             "early_stopping_rounds": 25,
+             "random_seed": 42,
+             "verbose": False,
+         }
+         model = CatBoostRegressor(**params)
+         sel = BorutaShap(model=model, importance_measure="shap", classification=False)
+         sel.fit(X=X_clean, y=y, n_trials=100, sample=False,
+                 train_or_test="test", normalize=True, verbose=False)
+         selected = sel.Subset().columns.tolist()
+
+     elif method == "Genetic":
+         from sklearn_genetic import GAFeatureSelectionCV
+         sel = GAFeatureSelectionCV(
+             estimator=forest,
+             cv=5,
+             scoring="neg_mean_squared_error",
+             population_size=100,
+             generations=40,
+             max_features=max(len(X_clean.columns) // 3, min_features_to_select),
+             crossover_probability=0.9,
+             mutation_probability=0.1,
+             keep_top_k=2,
+             elitism=True,
+             n_jobs=-1,
+             verbose=1,
+         )
+         sel.fit(X_clean, y)
+         selected = X_clean.columns[sel.support_].tolist()
+
+     elif method == "RFE":
+         from sklearn.feature_selection import RFE
+         sel = RFE(forest, n_features_to_select=min_features_to_select, step=1, verbose=1)
+         sel = sel.fit(X_clean, y)
+         selected = X_clean.columns[sel.support_].tolist()
+
+     else:
+         raise ValueError(f"Unknown method: {method}")
+
+     # post-filtering: non-EO fallback to SelectKBest
+     non_eo = are_all_features_non_eo(selected)
+     if non_eo or method == "SelectKBest":
+         from sklearn.feature_selection import SelectKBest, f_regression
+         k = 15
+         skb = SelectKBest(score_func=f_regression, k=k)
+         skb.fit(X_clean, y)
+         selected = X_clean.columns[skb.get_support()].tolist()
+
+     # return selector (if exists), filtered DataFrame, and feature list
+     try:
+         return sel, X_clean.loc[:, selected], selected
+     except NameError:
+         # for methods that didn't create `sel`
+         return None, X_clean.loc[:, selected], selected
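
For orientation, here is a minimal sketch of how the rewritten selector might be driven from user code, exercising the new "multi" option added in 0.2.21. Only select_features, its keyword arguments, and the feature_selection_multi/ output folder come from the file above; the synthetic DataFrame, its column names, and the target are illustrative assumptions, and the optional dependencies used by the sub-methods (boruta, mrmr) must be installed for this to run.

# Hypothetical driver for geocif.ml.feature_selection.select_features;
# the data below is synthetic and for illustration only.
import numpy as np
import pandas as pd

from geocif.ml.feature_selection import select_features

rng = np.random.default_rng(42)
X = pd.DataFrame(rng.normal(size=(200, 12)),
                 columns=[f"x{i}" for i in range(12)])  # assumed feature names
y = 2.0 * X["x0"] - 0.5 * X["x1"] + rng.normal(scale=0.1, size=200)  # synthetic target

# method="multi" runs BorutaPy, mrmr and lasso internally, keeps the union of
# their picks, and saves a selection-frequency bar chart under
# feature_selection_multi/feature_selection_frequency.png.
selector, X_selected, selected_features = select_features(X, y, method="multi")
print(selected_features)  # selector is None for the "multi" path
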
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: geocif
- Version: 0.2.1
+ Version: 0.2.21
  Summary: Models to visualize and forecast crop conditions and yields
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
  Author: Ritvik Sahajpal
@@ -50,6 +50,6 @@ setup(
      test_suite="tests",
      tests_require=test_requirements,
      url="https://ritviksahajpal.github.io/yield_forecasting/",
-     version="0.2.01",
+     version="0.2.21",
      zip_safe=False,
  )
@@ -1,349 +0,0 @@
- import numpy as np
- from tqdm import tqdm
- from sklearn.ensemble import RandomForestRegressor
-
-
- def are_all_features_non_eo(features):
-     """
-     Check if all the features non eo features
-
-     Args:
-         feature:
-
-     Returns:
-
-     """
-     non_eo_features = ['Median Yield (tn per ha)',
-                        'Analogous Year',
-                        'Analogous Year Yield',
-                        'lon',
-                        'lat',
-                        't -1 Yield (tn per ha)',
-                        't -2 Yield (tn per ha)',
-                        't -3 Yield (tn per ha)',
-                        't -4 Yield (tn per ha)',
-                        't -5 Yield (tn per ha)']
-
-     # Check if all features are non-eo features, return True if they are
-     return all(feature in non_eo_features for feature in features)
-
-
- def select_features(X, y, method="RFE", min_features_to_select=3, threshold_nan=0.2, threshold_unique=0.6):
-     """
-
-     Args:
-         X:
-         y:
-         method:
-         min_features_to_select:
-         threshold_unique:
-
-     Returns:
-
-     """
-
-     # df = X.copy()
-     #
-     # # Initialize and apply StandardScaler
-     # scaler = StandardScaler()
-     # scaled_data = scaler.fit_transform(df)
-     #
-     # # Initialize and apply VarianceThreshold
-     # # Note: Since data is standardized, all features now have variance of 1 before applying VarianceThreshold.
-     # # You would adjust the threshold based on new criteria since variances have been normalized.
-     # selector = VarianceThreshold(threshold=scaled_data.var().mean())
-     # X = selector.fit_transform(scaled_data)
-     selector = None
-     X_original = X.copy()
-
-     # Calculate the proportion of NaN values in each column
-     nan_proportion = X.isna().mean()
-
-     # Drop columns where more than 20% of the values are NaN
-     X = X.loc[:, nan_proportion <= threshold_nan]
-
-     # Fill in columns with median of that column
-     X = X.fillna(X.median())
-
-     # Calculate the proportion of unique values in each column
-     # unique_proportion = X.nunique(axis="columns") / len(X)
-     #
-     # # Filter columns that have at least 60% unique values
-     # columns_to_keep = unique_proportion[unique_proportion >= threshold_unique].index
-     #
-     # # Drop columns that do not meet the threshold
-     # X = X[columns_to_keep]
-
-     # Define the RandomForestRegressor
-     forest = RandomForestRegressor(
-         n_estimators=500,
-         n_jobs=8,
-         max_depth=5,
-         random_state=1,
-     )
-
-     # Adjusting numpy types due to deprecation warnings or errors
-     np.int = np.int32
-     np.float = np.float64
-     np.bool = np.bool_
-
-     if method == "SHAP":
-         import pandas as pd
-         from catboost import CatBoostRegressor
-         from fasttreeshap import TreeExplainer as FastTreeExplainer
-         from sklearn.model_selection import cross_val_score
-
-         model = CatBoostRegressor(n_estimators=500, verbose=0, use_best_model=False)
-         model.fit(X, y)
-
-         explainer = FastTreeExplainer(model)
-         shap_values = explainer.shap_values(X)
-
-         # Step 5: Summarize the SHAP values for feature importance
-         shap_importances = np.mean(np.abs(shap_values), axis=0)
-         shap_importance_df = pd.DataFrame(
-             {"feature": X.columns, "importance": shap_importances}
-         ).sort_values(by="importance", ascending=False)
-
-         def evaluate_model_with_n_features(N, X_train, y_train):
-             top_features = shap_importance_df["feature"].head(N).values
-             X_train_selected = X_train[top_features]
-             selector = CatBoostRegressor(n_estimators=500, random_state=42, verbose=0)
-             scores = cross_val_score(
-                 selector,
-                 X_train_selected,
-                 y_train,
-                 cv=5,
-                 scoring="neg_mean_squared_error",
-                 n_jobs=-1,
-             )
-
-             return np.mean(scores)
-
-         # Evaluate model performance with different number of features
-         nrange = [5, 10, 15, 20, 25, 30]
-         cv_scores = []
-         for N in tqdm(nrange):
-             cv_scores.append(evaluate_model_with_n_features(N, X, y))
-
-         # Select the number of features that gives the best cross-validation score (lowest MSE)
-         optimal_N = nrange[np.argmax(cv_scores)]
-
-         # Use optimal N to select features
-         selected_features = (
-             shap_importance_df["feature"].head(optimal_N).values.tolist()
-         )
-     elif method == "stabl":
-         from stabl.stabl import Stabl
-         from sklearn.linear_model import Lasso
-
-         stabl = Stabl(
-             base_estimator=Lasso(alpha=0.001),
-             n_bootstraps=10,
-             artificial_type="knockoff",
-             artificial_proportion=.5,
-             replace=False,
-             fdr_threshold_range=np.arange(0.1, 1, 0.01),
-             sample_fraction=0.5,
-             random_state=42,
-             verbose=1
-         )
-         stabl.fit(X, y)
-         selected_features = stabl.get_feature_names_out()
-     elif method == "feature_engine":
-         from feature_engine.selection import SmartCorrelatedSelection
-
-         selector = SmartCorrelatedSelection(
-             method="pearson",
-             threshold=0.7,
-             selection_method="model_performance",
-             estimator=forest,
-             scoring="neg_mean_squared_error",
-         )
-
-         X_filtered = selector.fit_transform(X, y)
-         selected_features = X_filtered.columns.tolist()
-     elif method == "mrmr":
-         from mrmr import mrmr_regression
-
-         try:
-             selected_features = mrmr_regression(X=X, y=y, K=10)
-         except:
-             breakpoint()
-         # combine X and y into a dataframe
-         # df = pd.concat([X, y], axis=1)
-
-     elif method == "RFECV":
-         from sklearn.feature_selection import RFECV
-         from sklearn.model_selection import KFold
-
-         # Initialize a k-fold cross-validation strategy
-         cv = KFold(n_splits=5)
-
-         # Patch the scoring function to add a progress bar
-         class RFECVWithProgress(RFECV):
-             def _fit(self, X, y):
-                 from tqdm import tqdm
-
-                 n_features = X.shape[1]
-                 with tqdm(total=n_features) as pbar:
-
-                     def patched_scorer(*args, **kwargs):
-                         pbar.update(1)
-                         return self.scorer_(*args, **kwargs)
-
-                     self.scorer_ = patched_scorer
-                     super()._fit(X, y)
-
-         # Initialize RFECV with the estimator and cross-validation strategy
-         selector = RFECVWithProgress(
-             estimator=forest, step=1, n_jobs=-1, cv=cv, scoring="neg_mean_squared_error"
-         )
-         selector.fit(X, y)
-         # Get the selected feature indices
-         selected_features = selector.get_support(indices=True)
-
-         # Get the selected feature names
-         selected_features = X.columns[selected_features].tolist()
-     elif method == "lasso":
-         from sklearn.linear_model import LassoLarsCV
-         from sklearn.feature_selection import SelectFromModel
-
-         # Fit Lasso model (L1 regularization) to perform feature selection
-         lasso = LassoLarsCV(cv=5)
-         lasso.fit(X, y)
-
-         # Use SelectFromModel to remove features with zero coefficients
-         selector = SelectFromModel(lasso, prefit=True)
-
-         # Get the selected features
-         selected_features = X.columns[selector.get_support()].tolist()
-         print(selected_features)
-     elif method == "BorutaPy":
-         from boruta import BorutaPy
-
-         selector = BorutaPy(forest, n_estimators="auto", random_state=42, verbose=0)
-         selector.fit(X.values, y.values)
-         selected_features_mask = selector.support_
-         selected_features = X.columns[selected_features_mask].tolist()
-         tentative_features = X.columns[selector.support_weak_].tolist()
-
-         selected_features = selected_features + tentative_features
-     elif method == "Leshy":
-         import arfs.feature_selection.allrelevant as arfsgroot
-         from catboost import CatBoostRegressor
-
-         model = CatBoostRegressor(n_estimators=350, verbose=0, use_best_model=False)
-         selector = arfsgroot.Leshy(
-             model,
-             n_estimators="auto",
-             verbose=1,
-             max_iter=10,
-             random_state=42,
-             importance="fastshap",
-         )
-         selector.fit(X, y, sample_weight=None)
-
-         selected_features = selector.get_feature_names_out()
-         # feat_selector.plot_importance(n_feat_per_inch=5)
-     elif method == "PowerShap":
-         from powershap import PowerShap
-         from catboost import CatBoostRegressor
-
-         selector = PowerShap(
-             model=CatBoostRegressor(n_estimators=500, verbose=0, use_best_model=True),
-             power_alpha=0.05,
-         )
-
-         selector.fit(X, y)  # Fit the PowerShap feature selector
-         selector.transform(X)  # Reduce the dataset to the selected features
-     elif method == "BorutaShap":
-         from BorutaShap import BorutaShap
-         from catboost import CatBoostRegressor
-
-         hyperparams = {
-             "depth": 6,
-             "learning_rate": 0.05,
-             "iterations": 500,
-             "subsample": 1.0,
-             "random_strength": 0.5,
-             "reg_lambda": 0.001,
-             "loss_function": "RMSE",
-             "early_stopping_rounds": 25,
-             "random_seed": 42,
-             "verbose": False,
-         }
-         model = CatBoostRegressor(**hyperparams)
-
-         selector = BorutaShap(
-             model=model, importance_measure="shap", classification=False
-         )
-         selector.fit(
-             X=X,
-             y=y,
-             n_trials=100,
-             sample=False,
-             train_or_test="test",
-             normalize=True,
-             verbose=False,
-         )
-         selected_features_mask = selector.Subset().columns
-         selected_features = X[selected_features_mask].columns.tolist()
-     elif method == "Genetic":
-         from sklearn_genetic import GAFeatureSelectionCV
-
-         selector = GAFeatureSelectionCV(
-             estimator=forest,
-             cv=5,
-             verbose=1,
-             scoring="neg_mean_squared_error",
-             max_features=max(len(X.columns) // 3, min_features_to_select),
-             population_size=100,
-             generations=40,
-             crossover_probability=0.9,
-             mutation_probability=0.1,
-             keep_top_k=2,
-             elitism=True,
-             n_jobs=-1,
-         )
-         selector.fit(X, y)
-         selected_features_mask = selector.support_
-         selected_features = X.columns[selected_features_mask].tolist()
-     elif method == "RFE":
-         from sklearn.feature_selection import RFE
-
-         selector = RFE(
-             forest, n_features_to_select=min_features_to_select, step=1, verbose=1
-         )
-         selector = selector.fit(X, y)
-         selected_features_mask = selector.support_
-         selected_features = X.columns[selected_features_mask].tolist()
-     else:
-         raise ValueError("Method not recognized. Use BorutaPy, Genetic, or RFE")
-     # tentative_features = X.columns[selector.support_weak_].tolist()
-     print(selected_features)
-     breakpoint()
-     non_eo = are_all_features_non_eo(selected_features)
-     if non_eo or method == "SelectKBest":
-         from sklearn.feature_selection import SelectKBest, f_regression
-
-         k = 15  # Number of features to select
-         selector = SelectKBest(score_func=f_regression, k=k)
-
-         # Fit the selector to the data and transform the data to select the best features
-         try:
-             X_new = selector.fit_transform(X, y)
-         except:
-             breakpoint()
-
-         # Get the selected feature indices
-         selected_features = selector.get_support(indices=True)
-
-         # Get the selected feature names
-         selected_features = X.columns[selected_features].tolist()
-
-     # print(selected_features)
-     # Filter the dataset for selected features
-     X_filtered = X.loc[:, selected_features]
-
-     return selector, X_filtered, selected_features