geocif 0.2.24__tar.gz → 0.2.26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.2.24/geocif.egg-info → geocif-0.2.26}/PKG-INFO +1 -1
- {geocif-0.2.24 → geocif-0.2.26}/geocif/geocif.py +60 -62
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/feature_selection.py +8 -8
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/trainers.py +2 -2
- {geocif-0.2.24 → geocif-0.2.26/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.2.24 → geocif-0.2.26}/setup.py +1 -1
- {geocif-0.2.24 → geocif-0.2.26}/LICENSE +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/MANIFEST.in +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/README.md +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/__init__.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/agmet/__init__.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/agmet/plot.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/agmet/utils.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/analysis.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/backup/__init__.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/backup/constants.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/backup/features.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/backup/geo.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/backup/geocif.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/backup/metadata.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/backup/models.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/cei/__init__.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/cei/definitions.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/cei/indices.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/experiments.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/geocif_runner.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/indices_runner.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/indices_runner_angola.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/indices_runner_madagascar.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/indices_runner_malawi.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/indices_runner_mozambique.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/indices_runner_south_africa.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/indices_runner_zambia.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/indices_runner_zimbabwe.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/logger.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/__init__.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/correlations.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/embedding.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/outliers.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/outlook.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/output.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/stages.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/stats.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/trend.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/ml/xai.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/mm.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/__init__.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/aa.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/area.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/automl.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/download_esi.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/enso.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/eval.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/gamtest.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/gee_access.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/misc.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/play_xagg.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/reg.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/sustain.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/test_catboost.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/tmp.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/tmp2.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/tmp3.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/tmp4.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/tmp5.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/playground/wolayita_maize_mask.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/risk/__init__.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/risk/impact_assessment.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/utils.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/viz/__init__.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/viz/gt.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/viz/plot.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif/viz/tmp.py +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif.egg-info/SOURCES.txt +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/requirements.txt +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/setup.cfg +0 -0
- {geocif-0.2.24 → geocif-0.2.26}/tests/test_geocif.py +0 -0
@@ -179,6 +179,13 @@ class Geocif:
|
|
179
179
|
"Production (tn)",
|
180
180
|
]
|
181
181
|
|
182
|
+
if self.model_type == "REGRESSION":
|
183
|
+
self.target_column = (
|
184
|
+
f"Detrended {self.target}" if self.check_yield_trend else self.target
|
185
|
+
)
|
186
|
+
elif self.model_type == "CLASSIFICATION":
|
187
|
+
self.target_column = self.target_class
|
188
|
+
|
182
189
|
self.combined_dict = {
|
183
190
|
**di.dict_indices,
|
184
191
|
**di.dict_ndvi,
|
@@ -204,7 +211,30 @@ class Geocif:
|
|
204
211
|
# obj_pickle = outlook.Outlook(self.pickle_file)
|
205
212
|
# self.df_outlook = obj_pickle.read_outlook_file()
|
206
213
|
|
207
|
-
def
|
214
|
+
def apply_feature_selector(self, dir_output):
|
215
|
+
if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
216
|
+
all_features = self.X_train.columns
|
217
|
+
|
218
|
+
# Select the columns with use_ceis in it
|
219
|
+
self.selected_features = [
|
220
|
+
column
|
221
|
+
for column in all_features
|
222
|
+
if any(cei in column for cei in self.use_ceis)
|
223
|
+
]
|
224
|
+
else:
|
225
|
+
self.logger.info(f"Selecting features for {self.country} {self.crop}")
|
226
|
+
selector, _, self.selected_features = fs.select_features(
|
227
|
+
self.X_train, self.y_train, method=self.feature_selection, dir_output=dir_output
|
228
|
+
)
|
229
|
+
self.logger.info(f"Selected features: {self.selected_features}")
|
230
|
+
|
231
|
+
""" Update model to include conformal estimates """
|
232
|
+
if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
|
233
|
+
self.selected_features.append("lat")
|
234
|
+
if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
|
235
|
+
self.selected_features.append("lon")
|
236
|
+
|
237
|
+
def train_model(self, df_region, scaler=None):
|
208
238
|
"""
|
209
239
|
|
210
240
|
Args:
|
@@ -214,55 +244,9 @@ class Geocif:
|
|
214
244
|
Returns:
|
215
245
|
|
216
246
|
"""
|
217
|
-
|
218
|
-
""" Perform feature selection """
|
219
|
-
if self.model_type == "REGRESSION":
|
220
|
-
target_column = (
|
221
|
-
f"Detrended {self.target}" if self.check_yield_trend else self.target
|
222
|
-
)
|
223
|
-
elif self.model_type == "CLASSIFICATION":
|
224
|
-
target_column = self.target_class
|
225
|
-
|
226
|
-
# Drop rows where target_column is NaN
|
227
|
-
df_region = df_region.dropna(subset=[target_column])
|
228
|
-
|
229
|
-
X_train = df_region[self.feature_names]
|
230
|
-
# Drop any columns with NaNs
|
231
|
-
X_train = X_train.dropna(axis=1, how="any")
|
232
|
-
y_train = df_region[target_column]
|
233
|
-
|
234
247
|
if self.ml_model:
|
235
|
-
if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
236
|
-
all_features = X_train.columns
|
237
|
-
|
238
|
-
# Select the columns with use_ceis in it
|
239
|
-
self.selected_features = [
|
240
|
-
column
|
241
|
-
for column in all_features
|
242
|
-
if any(cei in column for cei in self.use_ceis)
|
243
|
-
]
|
244
|
-
else:
|
245
|
-
self.logger.info(f"Selecting features for {self.country} {self.crop}")
|
246
|
-
selector, _, self.selected_features = fs.select_features(
|
247
|
-
X_train, y_train, method=self.feature_selection
|
248
|
-
)
|
249
|
-
self.logger.info(f"Selected features: {self.selected_features}")
|
250
|
-
|
251
|
-
""" Update model to include conformal estimates """
|
252
|
-
if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
|
253
|
-
self.selected_features.append("lat")
|
254
|
-
if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
|
255
|
-
self.selected_features.append("lon")
|
256
248
|
X_train = df_region[self.selected_features + self.cat_features]
|
257
249
|
|
258
|
-
dir_output = (
|
259
|
-
self.dir_analysis
|
260
|
-
/ self.country
|
261
|
-
/ self.crop
|
262
|
-
/ self.model_name
|
263
|
-
/ str(self.forecast_season)
|
264
|
-
)
|
265
|
-
|
266
250
|
region_id = df_region["Region_ID"].unique()[0]
|
267
251
|
X_train.to_csv(dir_output / f"X_train_{region_id}.csv", index=False)
|
268
252
|
if scaler:
|
@@ -284,9 +268,9 @@ class Geocif:
|
|
284
268
|
"Harvest Year",
|
285
269
|
df_region[self.selected_features + self.cat_features + [self.target]],
|
286
270
|
X_train_scaled,
|
287
|
-
y_train,
|
271
|
+
self.y_train,
|
288
272
|
feature_names=self.selected_features,
|
289
|
-
target_col=target_column,
|
273
|
+
target_col=self.target_column,
|
290
274
|
optimize=self.optimize,
|
291
275
|
fraction_loocv=self.fraction_loocv,
|
292
276
|
cat_features=self.cat_features,
|
@@ -303,7 +287,7 @@ class Geocif:
|
|
303
287
|
if self.model_name == "catboost":
|
304
288
|
self.model.fit(
|
305
289
|
X_train,
|
306
|
-
y_train,
|
290
|
+
self.y_train,
|
307
291
|
cat_features=self.cat_features,
|
308
292
|
verbose=True,
|
309
293
|
)
|
@@ -313,16 +297,16 @@ class Geocif:
|
|
313
297
|
item for item in self.cat_features if item != "Harvest Year"
|
314
298
|
]
|
315
299
|
)
|
316
|
-
self.model.fit(X_train, y_train)
|
300
|
+
self.model.fit(X_train, self.y_train)
|
317
301
|
elif self.model_name == "ydf":
|
318
302
|
# Combine X_train and y_train
|
319
|
-
df_train = pd.concat([X_train, y_train], axis=1)
|
303
|
+
df_train = pd.concat([X_train, self.y_train], axis=1)
|
320
304
|
|
321
305
|
self.model = self.model.train(df_train)
|
322
306
|
elif self.model_name == "geospaNN":
|
323
307
|
self.model.fit(
|
324
308
|
X_train,
|
325
|
-
y_train,
|
309
|
+
self.y_train,
|
326
310
|
# callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
|
327
311
|
)
|
328
312
|
elif self.model_name == "merf":
|
@@ -334,15 +318,15 @@ class Geocif:
|
|
334
318
|
X_train,
|
335
319
|
Z_train,
|
336
320
|
clusters_train.astype("object"),
|
337
|
-
y_train.values,
|
321
|
+
self.y_train.values,
|
338
322
|
)
|
339
323
|
elif self.model_name == "linear":
|
340
|
-
self.model.fit(X_train_scaled, y_train)
|
324
|
+
self.model.fit(X_train_scaled, self.y_train)
|
341
325
|
elif self.model_name == "gam":
|
342
|
-
self.model.fit(X_train_scaled.values, y_train.values)
|
326
|
+
self.model.fit(X_train_scaled.values, self.y_train.values)
|
343
327
|
self.best_hyperparams = {}
|
344
328
|
elif self.model_name in ["cubist"]:
|
345
|
-
self.model.fit(X_train, y_train)
|
329
|
+
self.model.fit(X_train, self.y_train)
|
346
330
|
elif self.model_name in [
|
347
331
|
"cumulative_1",
|
348
332
|
"cumulative_2",
|
@@ -377,7 +361,7 @@ class Geocif:
|
|
377
361
|
# Combine scaled numeric features and encoded region
|
378
362
|
X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
|
379
363
|
|
380
|
-
self.model.fit(X_train_scaled, y_train)
|
364
|
+
self.model.fit(X_train_scaled, self.y_train)
|
381
365
|
except Exception as e:
|
382
366
|
self.logger.error(
|
383
367
|
f"Error fitting model for {self.country} {self.crop} {e}"
|
@@ -782,6 +766,14 @@ class Geocif:
|
|
782
766
|
Returns:
|
783
767
|
|
784
768
|
"""
|
769
|
+
dir_output = (
|
770
|
+
self.dir_analysis
|
771
|
+
/ self.country
|
772
|
+
/ self.crop
|
773
|
+
/ self.model_name
|
774
|
+
/ str(self.forecast_season)
|
775
|
+
)
|
776
|
+
|
785
777
|
from sklearn.preprocessing import StandardScaler
|
786
778
|
|
787
779
|
scaler = StandardScaler() if self.model_name in ["linear", "gam"] else None
|
@@ -807,8 +799,6 @@ class Geocif:
|
|
807
799
|
mask_train = self.df_train["Region_ID"] == region
|
808
800
|
mask_test = self.df_test["Region_ID"] == region
|
809
801
|
|
810
|
-
num_regions_in_cluster = self.df_train[mask_train]["Region"].unique()
|
811
|
-
|
812
802
|
if self.cluster_strategy == "individual":
|
813
803
|
region_name = self.df_train["Region"].unique()[idx]
|
814
804
|
pbar.set_description(f"Fit/Predict for {region_name}")
|
@@ -836,12 +826,20 @@ class Geocif:
|
|
836
826
|
if self.last_year_yield_as_feature:
|
837
827
|
common_columns += [f"Last Year {self.target}"]
|
838
828
|
|
839
|
-
""" Train """
|
829
|
+
""" Feature selection and then Train """
|
840
830
|
# Filter dataframe based on region and self.feature_names
|
841
831
|
df_region_train = self.df_train[mask_train]
|
842
832
|
df_region_train = df_region_train[self.fixed_columns + common_columns]
|
843
833
|
df_region_train.reset_index(drop=True, inplace=True)
|
844
|
-
self.
|
834
|
+
df_region_train = df_region_train.dropna(subset=[self.target_column])
|
835
|
+
|
836
|
+
self.X_train = df_region_train[self.feature_names]
|
837
|
+
# Drop any columns with NaNs
|
838
|
+
self.X_train.dropna(axis=1, how="any", inplace=True)
|
839
|
+
self.y_train = df_region_train[self.target_column]
|
840
|
+
breakpoint()
|
841
|
+
self.apply_feature_selector(dir_output)
|
842
|
+
self.train_model(df_region_train, scaler, dir_output)
|
845
843
|
|
846
844
|
""" Predict """
|
847
845
|
if self.check_yield_trend:
|
@@ -34,10 +34,11 @@ def are_all_features_non_eo(features):
|
|
34
34
|
|
35
35
|
def select_features(
|
36
36
|
X, y,
|
37
|
-
method="
|
37
|
+
method="multi",
|
38
38
|
min_features_to_select=3,
|
39
39
|
threshold_nan=0.2,
|
40
|
-
threshold_unique=0.6
|
40
|
+
threshold_unique=0.6,
|
41
|
+
dir_output="."
|
41
42
|
):
|
42
43
|
"""
|
43
44
|
Feature-selection wrapper supporting many methods plus a new 'multi' option.
|
@@ -75,8 +76,9 @@ def select_features(
|
|
75
76
|
# --- multi-method ensemble -------------------------------
|
76
77
|
if method == "multi":
|
77
78
|
counter = Counter()
|
79
|
+
models = ["BorutaPy", "mrmr"]
|
78
80
|
# run three selectors and count feature picks
|
79
|
-
for sub_m in
|
81
|
+
for sub_m in models:
|
80
82
|
_, _, feats = select_features(
|
81
83
|
X_clean, y,
|
82
84
|
method=sub_m,
|
@@ -84,7 +86,6 @@ def select_features(
|
|
84
86
|
threshold_nan=threshold_nan,
|
85
87
|
threshold_unique=threshold_unique
|
86
88
|
)
|
87
|
-
print(sub_m, feats)
|
88
89
|
counter.update(feats)
|
89
90
|
|
90
91
|
# union of all features
|
@@ -97,12 +98,11 @@ def select_features(
|
|
97
98
|
fig = freq.plot(kind="bar", width=0.9).get_figure()
|
98
99
|
plt.title("Feature selection frequency across methods")
|
99
100
|
plt.xlabel("Feature")
|
100
|
-
plt.ylabel("Times selected (out of
|
101
|
+
plt.ylabel(f"Times selected (out of {len(models)})")
|
101
102
|
plt.tight_layout()
|
102
103
|
|
103
|
-
|
104
|
-
|
105
|
-
fig.savefig(out_dir / "feature_selection_frequency.png", dpi=300)
|
104
|
+
dir_output = dir_output / Path("feature_selection")
|
105
|
+
fig.savefig(dir_output / "feature_selection_frequency.png", dpi=300)
|
106
106
|
plt.close(fig)
|
107
107
|
|
108
108
|
return None, X_out, combined
|
@@ -268,7 +268,7 @@ def auto_train(
|
|
268
268
|
loss_function = "MAPE" if model_type == "REGRESSION" else "MultiClass"
|
269
269
|
bootstrap_type = "Bernoulli" if model_type == "CLASSIFICATION" else "MVS"
|
270
270
|
hyperparams = {
|
271
|
-
"iterations":
|
271
|
+
"iterations": 1500,
|
272
272
|
"learning_rate": 0.025,
|
273
273
|
"depth": 6,
|
274
274
|
"subsample": 1.0,
|
@@ -278,7 +278,7 @@ def auto_train(
|
|
278
278
|
"loss_function": loss_function,
|
279
279
|
"early_stopping_rounds": 20,
|
280
280
|
"random_seed": seed,
|
281
|
-
"verbose":
|
281
|
+
"verbose": False,
|
282
282
|
}
|
283
283
|
|
284
284
|
if model_name == "catboost":
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|