geocif 0.2.25__tar.gz → 0.2.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.2.25/geocif.egg-info → geocif-0.2.27}/PKG-INFO +1 -1
- {geocif-0.2.25 → geocif-0.2.27}/geocif/geocif.py +64 -62
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/feature_selection.py +9 -8
- {geocif-0.2.25 → geocif-0.2.27/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.2.25 → geocif-0.2.27}/setup.py +1 -1
- {geocif-0.2.25 → geocif-0.2.27}/LICENSE +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/MANIFEST.in +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/README.md +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/__init__.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/agmet/__init__.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/agmet/plot.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/agmet/utils.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/analysis.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/backup/__init__.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/backup/constants.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/backup/features.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/backup/geo.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/backup/geocif.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/backup/metadata.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/backup/models.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/cei/__init__.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/cei/definitions.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/cei/indices.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/experiments.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/geocif_runner.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/indices_runner.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/indices_runner_angola.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/indices_runner_madagascar.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/indices_runner_malawi.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/indices_runner_mozambique.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/indices_runner_south_africa.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/indices_runner_zambia.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/indices_runner_zimbabwe.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/logger.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/__init__.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/correlations.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/embedding.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/outliers.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/outlook.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/output.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/stages.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/stats.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/trainers.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/trend.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/ml/xai.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/mm.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/__init__.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/aa.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/area.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/automl.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/download_esi.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/enso.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/eval.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/gamtest.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/gee_access.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/misc.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/play_xagg.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/reg.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/sustain.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/test_catboost.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/tmp.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/tmp2.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/tmp3.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/tmp4.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/tmp5.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/playground/wolayita_maize_mask.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/risk/__init__.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/risk/impact_assessment.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/utils.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/viz/__init__.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/viz/gt.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/viz/plot.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif/viz/tmp.py +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif.egg-info/SOURCES.txt +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/requirements.txt +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/setup.cfg +0 -0
- {geocif-0.2.25 → geocif-0.2.27}/tests/test_geocif.py +0 -0
@@ -179,6 +179,13 @@ class Geocif:
|
|
179
179
|
"Production (tn)",
|
180
180
|
]
|
181
181
|
|
182
|
+
if self.model_type == "REGRESSION":
|
183
|
+
self.target_column = (
|
184
|
+
f"Detrended {self.target}" if self.check_yield_trend else self.target
|
185
|
+
)
|
186
|
+
elif self.model_type == "CLASSIFICATION":
|
187
|
+
self.target_column = self.target_class
|
188
|
+
|
182
189
|
self.combined_dict = {
|
183
190
|
**di.dict_indices,
|
184
191
|
**di.dict_ndvi,
|
@@ -204,7 +211,34 @@ class Geocif:
|
|
204
211
|
# obj_pickle = outlook.Outlook(self.pickle_file)
|
205
212
|
# self.df_outlook = obj_pickle.read_outlook_file()
|
206
213
|
|
207
|
-
def train_model(self, df_region, scaler=None):
|
214
|
+
def apply_feature_selector(self, region, dir_output):
|
215
|
+
if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
216
|
+
all_features = self.X_train.columns
|
217
|
+
|
218
|
+
# Select the columns with use_ceis in it
|
219
|
+
self.selected_features = [
|
220
|
+
column
|
221
|
+
for column in all_features
|
222
|
+
if any(cei in column for cei in self.use_ceis)
|
223
|
+
]
|
224
|
+
else:
|
225
|
+
self.logger.info(f"Selecting features for {self.country} {self.crop}")
|
226
|
+
selector, _, self.selected_features = fs.select_features(
|
227
|
+
self.X_train,
|
228
|
+
self.y_train,
|
229
|
+
method=self.feature_selection,
|
230
|
+
dir_output=dir_output,
|
231
|
+
region=region
|
232
|
+
)
|
233
|
+
self.logger.info(f"Selected features: {self.selected_features}")
|
234
|
+
|
235
|
+
""" Update model to include conformal estimates """
|
236
|
+
if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
|
237
|
+
self.selected_features.append("lat")
|
238
|
+
if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
|
239
|
+
self.selected_features.append("lon")
|
240
|
+
|
241
|
+
def train_model(self, df_region, scaler=None):
|
208
242
|
"""
|
209
243
|
|
210
244
|
Args:
|
@@ -214,55 +248,9 @@ class Geocif:
|
|
214
248
|
Returns:
|
215
249
|
|
216
250
|
"""
|
217
|
-
|
218
|
-
""" Perform feature selection """
|
219
|
-
if self.model_type == "REGRESSION":
|
220
|
-
target_column = (
|
221
|
-
f"Detrended {self.target}" if self.check_yield_trend else self.target
|
222
|
-
)
|
223
|
-
elif self.model_type == "CLASSIFICATION":
|
224
|
-
target_column = self.target_class
|
225
|
-
|
226
|
-
# Drop rows where target_column is NaN
|
227
|
-
df_region = df_region.dropna(subset=[target_column])
|
228
|
-
|
229
|
-
X_train = df_region[self.feature_names]
|
230
|
-
# Drop any columns with NaNs
|
231
|
-
X_train = X_train.dropna(axis=1, how="any")
|
232
|
-
y_train = df_region[target_column]
|
233
|
-
|
234
251
|
if self.ml_model:
|
235
|
-
if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
236
|
-
all_features = X_train.columns
|
237
|
-
|
238
|
-
# Select the columns with use_ceis in it
|
239
|
-
self.selected_features = [
|
240
|
-
column
|
241
|
-
for column in all_features
|
242
|
-
if any(cei in column for cei in self.use_ceis)
|
243
|
-
]
|
244
|
-
else:
|
245
|
-
self.logger.info(f"Selecting features for {self.country} {self.crop}")
|
246
|
-
selector, _, self.selected_features = fs.select_features(
|
247
|
-
X_train, y_train, method=self.feature_selection
|
248
|
-
)
|
249
|
-
self.logger.info(f"Selected features: {self.selected_features}")
|
250
|
-
|
251
|
-
""" Update model to include conformal estimates """
|
252
|
-
if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
|
253
|
-
self.selected_features.append("lat")
|
254
|
-
if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
|
255
|
-
self.selected_features.append("lon")
|
256
252
|
X_train = df_region[self.selected_features + self.cat_features]
|
257
253
|
|
258
|
-
dir_output = (
|
259
|
-
self.dir_analysis
|
260
|
-
/ self.country
|
261
|
-
/ self.crop
|
262
|
-
/ self.model_name
|
263
|
-
/ str(self.forecast_season)
|
264
|
-
)
|
265
|
-
|
266
254
|
region_id = df_region["Region_ID"].unique()[0]
|
267
255
|
X_train.to_csv(dir_output / f"X_train_{region_id}.csv", index=False)
|
268
256
|
if scaler:
|
@@ -284,9 +272,9 @@ class Geocif:
|
|
284
272
|
"Harvest Year",
|
285
273
|
df_region[self.selected_features + self.cat_features + [self.target]],
|
286
274
|
X_train_scaled,
|
287
|
-
y_train,
|
275
|
+
self.y_train,
|
288
276
|
feature_names=self.selected_features,
|
289
|
-
target_col=target_column,
|
277
|
+
target_col=self.target_column,
|
290
278
|
optimize=self.optimize,
|
291
279
|
fraction_loocv=self.fraction_loocv,
|
292
280
|
cat_features=self.cat_features,
|
@@ -303,7 +291,7 @@ class Geocif:
|
|
303
291
|
if self.model_name == "catboost":
|
304
292
|
self.model.fit(
|
305
293
|
X_train,
|
306
|
-
y_train,
|
294
|
+
self.y_train,
|
307
295
|
cat_features=self.cat_features,
|
308
296
|
verbose=True,
|
309
297
|
)
|
@@ -313,16 +301,16 @@ class Geocif:
|
|
313
301
|
item for item in self.cat_features if item != "Harvest Year"
|
314
302
|
]
|
315
303
|
)
|
316
|
-
self.model.fit(X_train, y_train)
|
304
|
+
self.model.fit(X_train, self.y_train)
|
317
305
|
elif self.model_name == "ydf":
|
318
306
|
# Combine X_train and y_train
|
319
|
-
df_train = pd.concat([X_train, y_train], axis=1)
|
307
|
+
df_train = pd.concat([X_train, self.y_train], axis=1)
|
320
308
|
|
321
309
|
self.model = self.model.train(df_train)
|
322
310
|
elif self.model_name == "geospaNN":
|
323
311
|
self.model.fit(
|
324
312
|
X_train,
|
325
|
-
y_train,
|
313
|
+
self.y_train,
|
326
314
|
# callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
|
327
315
|
)
|
328
316
|
elif self.model_name == "merf":
|
@@ -334,15 +322,15 @@ class Geocif:
|
|
334
322
|
X_train,
|
335
323
|
Z_train,
|
336
324
|
clusters_train.astype("object"),
|
337
|
-
y_train.values,
|
325
|
+
self.y_train.values,
|
338
326
|
)
|
339
327
|
elif self.model_name == "linear":
|
340
|
-
self.model.fit(X_train_scaled, y_train)
|
328
|
+
self.model.fit(X_train_scaled, self.y_train)
|
341
329
|
elif self.model_name == "gam":
|
342
|
-
self.model.fit(X_train_scaled.values, y_train.values)
|
330
|
+
self.model.fit(X_train_scaled.values, self.y_train.values)
|
343
331
|
self.best_hyperparams = {}
|
344
332
|
elif self.model_name in ["cubist"]:
|
345
|
-
self.model.fit(X_train, y_train)
|
333
|
+
self.model.fit(X_train, self.y_train)
|
346
334
|
elif self.model_name in [
|
347
335
|
"cumulative_1",
|
348
336
|
"cumulative_2",
|
@@ -377,7 +365,7 @@ class Geocif:
|
|
377
365
|
# Combine scaled numeric features and encoded region
|
378
366
|
X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
|
379
367
|
|
380
|
-
self.model.fit(X_train_scaled, y_train)
|
368
|
+
self.model.fit(X_train_scaled, self.y_train)
|
381
369
|
except Exception as e:
|
382
370
|
self.logger.error(
|
383
371
|
f"Error fitting model for {self.country} {self.crop} {e}"
|
@@ -782,6 +770,14 @@ class Geocif:
|
|
782
770
|
Returns:
|
783
771
|
|
784
772
|
"""
|
773
|
+
dir_output = (
|
774
|
+
self.dir_analysis
|
775
|
+
/ self.country
|
776
|
+
/ self.crop
|
777
|
+
/ self.model_name
|
778
|
+
/ str(self.forecast_season)
|
779
|
+
)
|
780
|
+
|
785
781
|
from sklearn.preprocessing import StandardScaler
|
786
782
|
|
787
783
|
scaler = StandardScaler() if self.model_name in ["linear", "gam"] else None
|
@@ -807,8 +803,6 @@ class Geocif:
|
|
807
803
|
mask_train = self.df_train["Region_ID"] == region
|
808
804
|
mask_test = self.df_test["Region_ID"] == region
|
809
805
|
|
810
|
-
num_regions_in_cluster = self.df_train[mask_train]["Region"].unique()
|
811
|
-
|
812
806
|
if self.cluster_strategy == "individual":
|
813
807
|
region_name = self.df_train["Region"].unique()[idx]
|
814
808
|
pbar.set_description(f"Fit/Predict for {region_name}")
|
@@ -836,12 +830,20 @@ class Geocif:
|
|
836
830
|
if self.last_year_yield_as_feature:
|
837
831
|
common_columns += [f"Last Year {self.target}"]
|
838
832
|
|
839
|
-
""" Train """
|
833
|
+
""" Feature selection and then Train """
|
840
834
|
# Filter dataframe based on region and self.feature_names
|
841
835
|
df_region_train = self.df_train[mask_train]
|
842
836
|
df_region_train = df_region_train[self.fixed_columns + common_columns]
|
843
837
|
df_region_train.reset_index(drop=True, inplace=True)
|
844
|
-
self.train_model(df_region_train, scaler)
|
838
|
+
df_region_train = df_region_train.dropna(subset=[self.target_column])
|
839
|
+
|
840
|
+
self.X_train = df_region_train[self.feature_names]
|
841
|
+
# Drop any columns with NaNs
|
842
|
+
self.X_train.dropna(axis=1, how="any", inplace=True)
|
843
|
+
self.y_train = df_region_train[self.target_column]
|
844
|
+
|
845
|
+
self.apply_feature_selector(region, dir_output)
|
846
|
+
self.train_model(df_region_train, scaler, dir_output)
|
845
847
|
|
846
848
|
""" Predict """
|
847
849
|
if self.check_yield_trend:
|
@@ -34,10 +34,12 @@ def are_all_features_non_eo(features):
|
|
34
34
|
|
35
35
|
def select_features(
|
36
36
|
X, y,
|
37
|
-
method="
|
37
|
+
method="multi",
|
38
38
|
min_features_to_select=3,
|
39
39
|
threshold_nan=0.2,
|
40
|
-
threshold_unique=0.6
|
40
|
+
threshold_unique=0.6,
|
41
|
+
dir_output=".",
|
42
|
+
region=None
|
41
43
|
):
|
42
44
|
"""
|
43
45
|
Feature-selection wrapper supporting many methods plus a new 'multi' option.
|
@@ -75,8 +77,9 @@ def select_features(
|
|
75
77
|
# --- multi-method ensemble -------------------------------
|
76
78
|
if method == "multi":
|
77
79
|
counter = Counter()
|
80
|
+
models = ["BorutaPy", "mrmr"]
|
78
81
|
# run three selectors and count feature picks
|
79
|
-
for sub_m in
|
82
|
+
for sub_m in models:
|
80
83
|
_, _, feats = select_features(
|
81
84
|
X_clean, y,
|
82
85
|
method=sub_m,
|
@@ -84,7 +87,6 @@ def select_features(
|
|
84
87
|
threshold_nan=threshold_nan,
|
85
88
|
threshold_unique=threshold_unique
|
86
89
|
)
|
87
|
-
print(sub_m, feats)
|
88
90
|
counter.update(feats)
|
89
91
|
|
90
92
|
# union of all features
|
@@ -97,12 +99,11 @@ def select_features(
|
|
97
99
|
fig = freq.plot(kind="bar", width=0.9).get_figure()
|
98
100
|
plt.title("Feature selection frequency across methods")
|
99
101
|
plt.xlabel("Feature")
|
100
|
-
plt.ylabel("Times selected (out of
|
102
|
+
plt.ylabel(f"Times selected (out of {len(models)})")
|
101
103
|
plt.tight_layout()
|
102
104
|
|
103
|
-
|
104
|
-
|
105
|
-
fig.savefig(out_dir / "feature_selection_frequency.png", dpi=300)
|
105
|
+
dir_output = dir_output / Path("feature_selection")
|
106
|
+
fig.savefig(dir_output / f"feature_selection_frequency_{region}.png", dpi=300)
|
106
107
|
plt.close(fig)
|
107
108
|
|
108
109
|
return None, X_out, combined
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|