geocif 0.1.26__tar.gz → 0.1.28__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.26/geocif.egg-info → geocif-0.1.28}/PKG-INFO +1 -1
- {geocif-0.1.26 → geocif-0.1.28}/geocif/analysis.py +102 -93
- {geocif-0.1.26 → geocif-0.1.28}/geocif/geocif.py +48 -51
- {geocif-0.1.26 → geocif-0.1.28}/geocif/indices_runner.py +7 -7
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/correlations.py +6 -2
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/feature_selection.py +49 -3
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/stats.py +5 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/trainers.py +9 -12
- {geocif-0.1.26 → geocif-0.1.28}/geocif/viz/plot.py +1 -1
- {geocif-0.1.26 → geocif-0.1.28/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.26 → geocif-0.1.28}/setup.py +1 -1
- {geocif-0.1.26 → geocif-0.1.28}/LICENSE +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/MANIFEST.in +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/README.md +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/constants.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/features.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/geo.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/models.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/cei/indices.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/logger.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/output.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/stages.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/trend.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/xai.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/playground/automl.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/playground/misc.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/utils.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif.egg-info/SOURCES.txt +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/requirements.txt +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/setup.cfg +0 -0
- {geocif-0.1.26 → geocif-0.1.28}/tests/test_geocif.py +0 -0
--- geocif-0.1.26/geocif/analysis.py
+++ geocif-0.1.28/geocif/analysis.py
@@ -374,17 +374,18 @@ class Geoanalysis:
 
         # Remove df_tmp from df_model
         df_model = df_model.drop(df_tmp.index)
-
         # Plot the histogram of MAPE
         # Create bins for '% of total Area (ha)' and 'MAPE'
+        bin_edges = np.linspace(0, df_model["% of total Area (ha)"].max() + 1, 5 + 1)
+
         df_model["Area Bins"] = pd.cut(
             df_model["% of total Area (ha)"],
-            bins=
+            bins=bin_edges,
             precision=0,
         )
         df_model["MAPE Bins"] = pd.cut(
             df_model["Mean Absolute Percentage Error"],
-            bins=
+            bins=5,  # [0, 5, 10, 15, 20, 25, 30, 50, max(df_model["Mean Absolute Percentage Error"])],
             right=False,
             precision=1,
         )
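The first hunk above replaces pd.cut's bare bin count for the area column with explicit edges from np.linspace, so the bin boundaries no longer depend on each input's observed range. A minimal standalone sketch of the difference, on toy data rather than geocif's:

    import numpy as np
    import pandas as pd

    area = pd.Series([1.0, 12.5, 40.0, 75.0, 99.0])  # stand-in for '% of total Area (ha)'

    # bins=5: pandas derives the edges from this particular series' min and max
    auto_bins = pd.cut(area, bins=5, precision=0)

    # Explicit edges: the same boundaries apply to every dataframe that is binned
    bin_edges = np.linspace(0, area.max() + 1, 5 + 1)
    fixed_bins = pd.cut(area, bins=bin_edges, precision=0)

    print(auto_bins.cat.categories)
    print(fixed_bins.cat.categories)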
@@ -444,7 +445,6 @@ class Geoanalysis:
         plt.xlabel("Mean Absolute Percentage Error (%)")
         plt.ylabel("Density")
         plt.legend(title="Country", title_fontsize="13")
-
         plt.savefig(self.dir_analysis / f"mape_histogram_{model}.png", dpi=250)
         plt.close()
 
@@ -460,6 +460,9 @@ class Geoanalysis:
         countries = df_model["Country"].unique().tolist()
         # make it title case and replace _ with space
         countries = [country.title().replace("_", " ") for country in countries]
+        countries = ["Malawi"]
+        df_model = df_model[df_model["Country"].isin(countries)]
+        self.dg = self.dg[self.dg["ADM0_NAME"].isin(countries)]
         plot.plot_df_shpfile(
             self.dg,  # dataframe containing adm1 name and polygon
             df_model,  # dataframe containing information that will be mapped
@@ -468,9 +471,9 @@ class Geoanalysis:
             name_col=col,  # Which column to plot
             dir_out=self.dir_analysis,  # Output directory
             fname=fname,  # Output file name
-            label=f"
+            label=f"MAPE (%)",
             vmin=df_model[col].min(),
-            vmax=
+            vmax=df_model[col].max(),
             cmap=pal.scientific.sequential.Bamako_20_r,
             series="sequential",
             show_bg=False,
@@ -488,6 +491,10 @@ class Geoanalysis:
         for model in models:
             df_model = df_plot[df_plot["Model"] == model]
 
+            countries = ["malawi"]
+            df_model = df_model[df_model["Country"].isin(countries)]
+            self.dg = self.dg[self.dg["ADM0_NAME"].isin(["Malawi", "malawi"])]
+
             countries = df_model["Country"].unique().tolist()
             if len(countries) > 1:
                 self.dir_plot = self.dir_analysis
@@ -502,6 +509,7 @@ class Geoanalysis:
                 + df_model["Region"].str.lower().str.replace("_", " ")
             )
 
+
             # Change Harvest year to type int
             df_model["Harvest Year"] = df_model["Harvest Year"].astype(int)
             annotate_region_column = (
@@ -517,83 +525,84 @@ class Geoanalysis:
             df_time_period = df_harvest_year[
                 df_harvest_year["Stage Name"] == time_period
             ]
- … (77 removed lines, old 520-596, not expanded in the published diff)
+            #
+            # """ % of total area """
+            if idx == 0:
+                fname = f"{self.country}_{self.crop}_perc_area.png"
+                col = "% of total Area (ha)"
+                plot.plot_df_shpfile(
+                    self.dg,  # dataframe containing adm1 name and polygon
+                    df_model,  # dataframe containing information that will be mapped
+                    merge_col="Country Region",  # Column on which to merge
+                    name_country=countries,  # Plot global map
+                    name_col=col,  # Which column to plot
+                    dir_out=self.dir_plot / str(year),  # Output directory
+                    fname=fname,  # Output file name
+                    label=f"% of Total Area (ha)\n{self.crop.title()}",
+                    vmin=df_model[col].min(),
+                    vmax=df_model[col].max(),
+                    cmap=pal.scientific.sequential.Bamako_20_r,
+                    series="sequential",
+                    show_bg=False,
+                    annotate_regions=False,
+                    annotate_region_column=annotate_region_column,
+                    loc_legend="lower left",
+                )
+            #
+            # # """ Unique regions """
+            fname = f"{self.country}_{self.crop}_region_ID.png"
+            col = "Region_ID"
+            df_model[col] = df_model[col].astype(int) + 1
+            if len(df_model["Region_ID"].unique() > 1):
+                # Create a dictionary with each region assigned a unique integer identifier and name
+                dict_region = {
+                    int(key): key for key in df_time_period["Region_ID"].unique()
+                }
+                plot.plot_df_shpfile(
+                    self.dg,  # dataframe containing adm1 name and polygon
+                    df_model,  # dataframe containing information that will be mapped
+                    dict_lup=dict_region,
+                    merge_col="Country Region",  # Column on which to merge
+                    name_country=countries,  # Plot global map
+                    name_col=col,  # Which column to plot
+                    dir_out=self.dir_plot / str(year),  # Output directory
+                    fname=fname,  # Output file name
+                    label=f"Region Cluster\n{self.crop.title()}",
+                    vmin=df_model[col].min(),
+                    vmax=df_model[col].max(),
+                    cmap=pal.tableau.Tableau_20.mpl_colors,
+                    series="qualitative",
+                    show_bg=False,
+                    alpha_feature=1,
+                    use_key=True,
+                    annotate_regions=False,
+                    annotate_region_column=annotate_region_column,
+                    loc_legend="lower left",
+                )
+            # breakpoint()
+
+            # """ Anomaly """
+            # fname = (
+            #     f"{fname_prefix}_{self.crop}_{time_period}_{year}_anomaly.png"
+            # )
+            # plot.plot_df_shpfile(
+            #     self.dg,  # dataframe containing adm1 name and polygon
+            #     df_harvest_year,  # dataframe containing information that will be mapped
+            #     merge_col="Country Region",  # Column on which to merge
+            #     name_country=countries,  # Plot global map
+            #     name_col="Anomaly",  # Which column to plot
+            #     dir_out=self.dir_plot / str(year),  # Output directory
+            #     fname=fname,  # Output file name
+            #     label=f"% of {self.number_lag_years}-year Median Yield\n{self.crop.title()}, {year}",
+            #     vmin=df_harvest_year["Anomaly"].min(),
+            #     vmax=110,  # df_harvest_year["Anomaly"].max(),
+            #     cmap=pal.cartocolors.diverging.Geyser_5_r,
+            #     series="sequential",
+            #     show_bg=False,
+            #     annotate_regions=False,
+            #     annotate_region_column=annotate_region_column,
+            #     loc_legend="lower left",
+            # )
 
             """ Predicted Yield """
             fname = f"{fname_prefix}_{self.crop}_{time_period}_{year}_predicted_yield.png"
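One detail in the new block: `if len(df_model["Region_ID"].unique() > 1):` takes len() of a boolean array, so the test is truthy whenever the column has at least one unique value rather than more than one; the closing parenthesis presumably belongs after len(...). A two-line check:

    import numpy as np

    unique_ids = np.unique(np.array([7, 7, 7]))  # a single region
    print(len(unique_ids > 1))   # 1 -> truthy: length of the boolean mask
    print(len(unique_ids) > 1)   # False: the likely intended comparison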
@@ -605,7 +614,7 @@ class Geoanalysis:
                 name_col="Predicted Yield (tn per ha)",  # Which column to plot
                 dir_out=self.dir_plot / str(year),  # Output directory
                 fname=fname,  # Output file name
-                label=f"
+                label=f"Predicted Yield (Mg/ha)\n{self.crop.title()}, {year}",
                 vmin=df_harvest_year[self.predicted].min(),
                 vmax=df_harvest_year[self.predicted].max(),
                 cmap=pal.scientific.sequential.Bamako_20_r,
@@ -749,10 +758,10 @@ class Geoanalysis:
         name_shapefile = df[df["Option"] == "boundary_file"]["Value"].values[0]
 
         for crop in crops:
-            # Does a table with the name {country}
-            table = f"{country}
+            # Does a table with the name {country}_{crop} exist in the database?
+            table = f"{country}_{crop}"
             if self.table_exists(self.db_path, table):
-                self.dict_config[f"{country}
+                self.dict_config[f"{country}_{crop}"] = {
                     "method": method,
                     "crops": crop,
                     "models": models,
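table_exists itself is not part of this diff; for a SQLite file like the db_path used here, a check along these lines is typical (a sketch, not the package's actual implementation):

    import sqlite3

    def table_exists(db_path, table):
        # Look the table up in sqlite_master
        with sqlite3.connect(db_path) as conn:
            cur = conn.execute(
                "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (table,)
            )
            return cur.fetchone() is not None

Called as table_exists(db_path, f"{country}_{crop}"), it mirrors the completed table-naming scheme above.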
@@ -789,7 +798,7 @@ class Geoanalysis:
             self.dg["ADM0_NAME"] + " " + self.dg["ADM2_NAME"]
         )
         # Make it lower case
-        self.dg["Country Region"] = self.dg["Country Region"].str.lower()
+        self.dg["Country Region"] = self.dg["Country Region"].str.lower().replace("_", " ")
 
 
 def run(path_config_files=[Path("../config/geocif.txt")]):
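A subtlety in this one-liner: on a Series, .replace("_", " ") only swaps elements that are exactly "_"; substring replacement needs .str.replace. A quick check:

    import pandas as pd

    s = pd.Series(["angola_maize", "_"])
    print(s.replace("_", " ").tolist())      # ['angola_maize', ' '] - whole-value match only
    print(s.str.replace("_", " ").tolist())  # ['angola maize', ' '] - substring replacement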
@@ -800,16 +809,16 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
     """ Loop over each country, crop, model combination in dict_config """
     frames = []
     for country_crop, value in obj.dict_config.items():
-        obj.
-        obj.
+        obj.crop = value["crops"]
+        # to get country, remove obj.crops from country_crop
+        obj.country = country_crop.replace(f"_{obj.crop}", "")
 
         obj.admin_zone = value["admin_zone"]
         obj.boundary_file = value["name_shapefile"]
         obj.method = value["method"]
-        obj.number_lag_years =
+        obj.number_lag_years = 5
 
-        obj.table = f"{obj.country}
-        breakpoint()
+        obj.table = f"{obj.country}_{obj.crop}"
         models = value["models"]
         for model in models:
             obj.model = model

--- geocif-0.1.26/geocif/geocif.py
+++ geocif-0.1.28/geocif/geocif.py
@@ -93,6 +93,7 @@ class Geocif:
         self.countries = ast.literal_eval(self.parser.get("DEFAULT", "countries"))
         self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
         self.update_input_file = self.parser.getboolean("DEFAULT", "update_input_file")
+        self.correlation_plots = self.parser.getboolean("DEFAULT", "correlation_plots")
         self.national_correlation = self.parser.getboolean(
             "DEFAULT", "national_correlation"
         )
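The new correlation_plots flag is read with ConfigParser.getboolean, which accepts 1/0, yes/no, true/false, and on/off in the config file. A self-contained check:

    from configparser import ConfigParser

    parser = ConfigParser()
    parser.read_string("[DEFAULT]\ncorrelation_plots = yes\n")
    # getboolean maps 1/0, yes/no, true/false, on/off to bool
    print(parser.getboolean("DEFAULT", "correlation_plots"))  # True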
@@ -143,8 +144,8 @@ class Geocif:
         """
         # If ML model is run for individual region or cluster, then Region_ID is the same for each region
         # or cluster and therefore redundant for the ML model
-        if self.cluster_strategy in ["individual", "auto_detect"]:
-            self.cat_features.remove("Region_ID")
+        #if self.cluster_strategy in ["individual", "auto_detect"]:
+        #    self.cat_features.remove("Region_ID")
 
         self.fixed_columns: list = [
             "Country",
@@ -264,6 +265,12 @@ class Geocif:
                 verbose=False,
                 # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
             )
+        elif self.model_name == "merf":
+            Z_train = np.ones((len(X_train), 1))
+            clusters_train = df_region["Region"]
+            clusters_train.reset_index(drop=True, inplace=True)
+
+            self.model.fit(X_train, Z_train, clusters_train.astype("object"), y_train.values)
         elif self.model_name == "linear":
             self.model.fit(X_train_scaled, y_train)
         elif self.model_name == "gam":
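On the new merf branch: the merf package fits y = f(X) + Z·b_cluster + e, and a Z of all ones, as built above, gives each cluster its own random intercept. A toy end-to-end sketch with synthetic data (assumes merf >= 1.0, whose fit takes (X, Z, clusters, y); a RandomForest stands in for the CatBoost fixed-effects model geocif uses):

    import numpy as np
    import pandas as pd
    from merf import MERF
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(200, 3)), columns=["f1", "f2", "f3"])
    clusters = pd.Series(rng.choice(["north", "south"], size=200)).astype("object")
    Z = np.ones((len(X), 1))  # one column of ones -> random intercept per cluster
    y = 2 * X["f1"] + (clusters == "north") * 0.5 + rng.normal(scale=0.1, size=200)

    model = MERF(RandomForestRegressor(n_estimators=50), max_iterations=5)
    model.fit(X, Z, clusters, y.values)   # same call shape as the diff
    y_hat = model.predict(X, Z, clusters)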
@@ -273,23 +280,6 @@ class Geocif:
                 self.model.fit(X_train, y_train)
             except:
                 self.logger.error(f"Error fitting model for {self.country} {self.crop}")
-        # if self.cluster_strategy == "individual" or len(X_train) == 1:
-        #     self.model.fit(
-        #         X_train,
-        #         y_train,
-        #         cat_features=self.cat_features,
-        #         verbose=False,
-        #         # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
-        #     )
-        # elif self.cluster_strategy in ["auto_detect", "single"]:
-        #     # Use MERF
-        #     Z_train = np.ones((len(X_train), 1))
-        #     clusters_train = df_region["Region"]
-        #     clusters_train.reset_index(drop=True, inplace=True)
-        #
-        #     self.model.fit(X_train, Z_train, clusters_train.astype("object"), y_train.values)
-        #     # change clusters_train to object dtype
-        #     # clusters_train = clusters_train.astype("object")
 
     def predict(self, df_region, scaler=None):
         """
@@ -313,7 +303,6 @@ class Geocif:
                 len(X_test), df_region[f"Last Year {self.target}"].values
             )
         else:
-            best_hyperparameters = self.model.get_params().copy()
             if self.model_name in ["linear", "gam"]:
                 # Drop cat_features from X_test
                 X_test = X_test.drop(
@@ -327,12 +316,17 @@ class Geocif:
             if self.estimate_ci:
                 if self.estimate_ci_for_all or self.forecast_season == self.today_year:
                     y_pred, y_pred_ci = self.model.predict(X_test, alpha=0.1)
+                    best_hyperparameters = self.model.get_params().copy()
+            elif self.model_name == "merf":
+                Z_test = np.ones((len(X_test), 1))
+                clusters_test = df_region["Region"]
+                clusters_test.reset_index(drop=True, inplace=True)
+
+                y_pred = self.model.predict(X_test, Z_test, clusters_test.astype("object"))
+                best_hyperparameters = self.model.fe_model.get_params().copy()
             else:
                 y_pred = self.model.predict(X_test)
-
-            # clusters_test = df_region["Region"]
-            # clusters_test.reset_index(drop=True, inplace=True)
-            # y_pred = self.model.predict(X_test, Z_test, clusters_test.astype("object"))
+                best_hyperparameters = self.model.get_params().copy()
 
             if self.check_yield_trend:
                 # Get information for retrending
@@ -353,7 +347,7 @@ class Geocif:
 
             # Create a dataframe with forecast results
             shp = len(X_test)
-            experiment_id = f"{self.country}
+            experiment_id = f"{self.country}_{self.crop}"
             now = ar.utcnow().to("America/New_York").format("MMMM-DD-YYYY HH:mm:ss")
             selected_features = self.selected_features + self.cat_features
             df = pd.DataFrame(
@@ -431,6 +425,7 @@ class Geocif:
         # Create an index based on following columns
         index_columns = [
             "Model",
+            "Cluster Strategy"
             "Country",
             "Region",
             "Crop",
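Note that the inserted "Cluster Strategy" literal has no trailing comma; Python concatenates adjacent string literals, so the list gains one fused element instead of two. A two-line demonstration:

    index_columns = ["Model", "Cluster Strategy" "Country", "Region"]
    print(index_columns)  # ['Model', 'Cluster StrategyCountry', 'Region']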
@@ -464,7 +459,7 @@ class Geocif:
         2. method = "fraction" - Select a fraction (1-100) of all stages
         """
         stages_features = stages.select_stages_for_ml(
-            stages_features, method="fraction", n=
+            stages_features, method="fraction", n=60
         )
 
         for stage in stages_features:
@@ -723,6 +718,7 @@ class Geocif:
         )
 
         # cat_features should be converted to category type
+
         df[self.cat_features] = df[self.cat_features].astype("category")
 
         """ Heatmap of correlation of various features with yield at each time step"""
@@ -743,30 +739,31 @@ class Geocif:
             how="outer",
         )
 
- … (24 removed lines, old 746-769, not expanded in the published diff)
+        if self.correlation_plots:
+            dict_kwargs = {}
+            dict_kwargs["all_stages"] = self.all_stages
+            dict_kwargs["target_col"] = self.target
+            dict_kwargs["country"] = self.country
+            dict_kwargs["crop"] = self.crop
+            dict_kwargs["dir_output"] = (
+                self.dir_analysis
+                / self.country
+                / self.crop
+                / self.model_name
+                / str(self.forecast_season)
+            )
+            dict_kwargs["forecast_season"] = self.forecast_season
+            dict_kwargs["method"] = self.method
+            dict_kwargs["national_correlation"] = self.national_correlation
+            dict_kwargs["groupby"] = self.correlation_plot_groupby
+            dict_kwargs["dg_country"] = self.dg_country
+            dict_kwargs["combined_dict"] = self.combined_dict
+
+            self.logger.info(f"Correlation plot for {self.country} {self.crop}")
+            (
+                dict_selected_features,
+                dict_best_cei,
+            ) = correlations.all_correlated_feature_by_time(df, **dict_kwargs)
 
         """ Separate into train and test datasets based on forecast_season """
         mask = df["Harvest Year"] == self.forecast_season
@@ -841,7 +838,7 @@ class Geocif:
         self.all_seasons = self.df_results["Harvest Year"].unique()
 
         """ If not using a ML model then set XAI and CI to False """
-        if not self.ml_model or self.model_name in ["linear", "gam"]:
+        if not self.ml_model or self.model_name in ["linear", "gam", "merf"]:
             self.do_xai = False
             self.estimate_ci = False
             self.check_yield_trend = False

--- geocif-0.1.26/geocif/indices_runner.py
+++ geocif-0.1.28/geocif/indices_runner.py
@@ -155,21 +155,21 @@ class cei_runner(base.BaseGeo):
                 "ndvi",
                 True,
             )
-            for year in range(
+            for year in range(2023, ar.utcnow().year + 1)
             for status, path, filename, admin_zone, category in combinations
         ]
 
         # Only keep those entries in combinations where the third elemt is
         # mozambique, south_africa, angola or dem_people's_rep_of_korea
         # This is done to test the code for these countries
- … (2 removed lines, old 165-166, not expanded in the published diff)
+        combinations = [i for i in combinations if "angola_maize" in i[3] or
+                        "lesotho_maize" in i[3] or
                         # "namibia" in i[2] or
                         # "united_republic_of_tanzania" in i[2] or
- … (2 removed lines, old 169-170, not expanded in the published diff)
+                        "zambia_maize" in i[3] or
+                        "zimbabwe_maize" in i[3] or
                         # "south_africa" in i[2] or
- … (1 removed line, old 172, not expanded in the published diff)
+                        "mozambique_maize" in i[3]]
                         # "malawi" in i[2]]
 
         if self.do_parallel:
@@ -201,7 +201,7 @@ def run(path_config_files=[]):
     indices.validate_index_definitions()
 
     for method in [
-        "
+        "biweekly_r",  # "dekad_r" # "dekad_r"
     ]:  # , "full_season", "phenological_stages", "fraction_season"]:
         obj = cei_runner(path_config_files)
         obj.main(method)

--- geocif-0.1.26/geocif/ml/correlations.py
+++ geocif-0.1.28/geocif/ml/correlations.py
@@ -274,8 +274,12 @@ def all_correlated_feature_by_time(df, **kwargs):
         # For each element in dict_best_cei, add the type of the cei
         else:
             # HACK
-            dict_selected_features[region_id] = dict_selected_features[0]
-            dict_best_cei[region_id] = dict_best_cei[0]
+            df_corr = _all_correlated_feature_by_time(df, **kwargs)
+            dict_selected_features[region_id] = df_corr.columns
+            dict_best_cei[region_id] = {}
+
+            #dict_selected_features[region_id] = dict_selected_features[0]
+            #dict_best_cei[region_id] = dict_best_cei[0]
         # Combine all unique values from the existing dictionary elements
         # combined_metrics = set()
         # for key in dict_selected_features:

--- geocif-0.1.26/geocif/ml/feature_selection.py
+++ geocif-0.1.28/geocif/ml/feature_selection.py
@@ -1,4 +1,5 @@
 import numpy as np
+from tqdm import tqdm
 from sklearn.ensemble import RandomForestRegressor
 
 
@@ -27,6 +28,7 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
     # # You would adjust the threshold based on new criteria since variances have been normalized.
     # selector = VarianceThreshold(threshold=scaled_data.var().mean())
     # X = selector.fit_transform(scaled_data)
+    selector = None
 
     # Fill in columns with median of that column
     X = X.fillna(X.median())
@@ -47,17 +49,58 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
     if method == "SelectKBest":
         from sklearn.feature_selection import SelectKBest, f_regression
 
-        k =
-        selector = SelectKBest(score_func=f_regression, k=
+        k = 15  # Number of features to select
+        selector = SelectKBest(score_func=f_regression, k=k)
 
         # Fit the selector to the data and transform the data to select the best features
-        X_new = selector.fit_transform(X, y)
+        try:
+            X_new = selector.fit_transform(X, y)
+        except:
+            breakpoint()
 
         # Get the selected feature indices
         selected_features = selector.get_support(indices=True)
 
         # Get the selected feature names
         selected_features = X.columns[selected_features].tolist()
+    elif method == "SHAP":
+        import pandas as pd
+        from catboost import CatBoostRegressor
+        from fasttreeshap import TreeExplainer as FastTreeExplainer
+        from sklearn.model_selection import cross_val_score
+
+        model = CatBoostRegressor(n_estimators=500, verbose=0, use_best_model=False)
+        model.fit(X, y)
+
+        explainer = FastTreeExplainer(model)
+        shap_values = explainer.shap_values(X)
+
+        # Step 5: Summarize the SHAP values for feature importance
+        shap_importances = np.mean(np.abs(shap_values), axis=0)
+        shap_importance_df = pd.DataFrame({
+            'feature': X.columns,
+            'importance': shap_importances
+        }).sort_values(by='importance', ascending=False)
+
+        def evaluate_model_with_n_features(N, X_train, y_train):
+            top_features = shap_importance_df['feature'].head(N).values
+            X_train_selected = X_train[top_features]
+            selector = CatBoostRegressor(n_estimators=500, random_state=42, verbose=0)
+            scores = cross_val_score(selector, X_train_selected, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
+
+            return np.mean(scores)
+
+        # Evaluate model performance with different number of features
+        nrange = [5, 10, 15, 20, 25, 30]
+        cv_scores = []
+        for N in tqdm(nrange):
+            cv_scores.append(evaluate_model_with_n_features(N, X, y))
+
+        # Select the number of features that gives the best cross-validation score (lowest MSE)
+        optimal_N = nrange[np.argmax(cv_scores)]
+
+        # Use optimal N to select features
+        selected_features = shap_importance_df['feature'].head(optimal_N).values.tolist()
     elif method == "feature_engine":
         from feature_engine.selection import SmartCorrelatedSelection
 
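The new SHAP branch ranks features by mean absolute SHAP value and then cross-validates the cut-off N. A condensed, self-contained version of the ranking step (using the shap package in place of fasttreeshap and a RandomForest in place of CatBoost):

    import numpy as np
    import pandas as pd
    import shap
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor

    X_arr, y = make_regression(n_samples=200, n_features=8, random_state=0)
    X = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(8)])

    model = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
    shap_values = shap.TreeExplainer(model).shap_values(X)  # (n_samples, n_features)

    # Mean |SHAP| per feature, largest contributors first
    importance = pd.Series(np.abs(shap_values).mean(axis=0), index=X.columns)
    print(importance.sort_values(ascending=False).head(5))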
@@ -110,6 +153,9 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
         selector.fit(X.values, y.values)
         selected_features_mask = selector.support_
         selected_features = X.columns[selected_features_mask].tolist()
+        tentative_features = X.columns[selector.support_weak_].tolist()
+
+        selected_features = selected_features + tentative_features
     elif method == "Leshy":
         import arfs.feature_selection.allrelevant as arfsgroot
         from catboost import CatBoostRegressor
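support_ and support_weak_ follow BorutaPy's convention: confirmed features versus tentative ones that neither beat nor lost to their shadow copies; the change now keeps both sets. A sketch assuming the boruta package is the selector in play here:

    import pandas as pd
    from boruta import BorutaPy
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor

    X_arr, y = make_regression(n_samples=150, n_features=10, n_informative=3, random_state=0)
    X = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(10)])

    rf = RandomForestRegressor(n_estimators=100, random_state=0)
    selector = BorutaPy(rf, n_estimators="auto", random_state=0)
    selector.fit(X.values, y)

    confirmed = X.columns[selector.support_].tolist()       # passed the shadow test
    tentative = X.columns[selector.support_weak_].tolist()  # inconclusive, now kept too
    print(confirmed + tentative)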
--- geocif-0.1.26/geocif/ml/stats.py
+++ geocif-0.1.28/geocif/ml/stats.py
@@ -205,6 +205,11 @@ def add_statistics(dir_stats, df, country, crop, admin_zone, stats, method, target_col):
         "value",
     ]
 
+    # Replace any inf or 0 values by NaN
+    yield_value = yield_value.replace([0, np.inf, -np.inf], np.nan)
+    area_value = area_value.replace([0, np.inf, -np.inf], np.nan)
+    prod_value = prod_value.replace([0, np.inf, -np.inf], np.nan)
+
     if not yield_value.empty:
         group.loc[:, target_col] = yield_value.values[0]
         group.loc[:, "Area (ha)"] = area_value.values[0]
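The added guard in isolation: zeros and infinities in the source statistics become NaN, so they propagate as missing values instead of masquerading as real yields:

    import numpy as np
    import pandas as pd

    yield_value = pd.Series([0.0, 2.5, np.inf, -np.inf, 1.8])
    print(yield_value.replace([0, np.inf, -np.inf], np.nan).tolist())
    # [nan, 2.5, nan, nan, 1.8]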
--- geocif-0.1.26/geocif/ml/trainers.py
+++ geocif-0.1.28/geocif/ml/trainers.py
@@ -252,7 +252,7 @@ def auto_train(
     else:
         hyperparams = {}
 
-    if model_name
+    if model_name in ["catboost", "merf"]:
         hyperparams = {
             "depth": 6,
             "learning_rate": 0.01,
@@ -265,17 +265,14 @@ def auto_train(
             "random_seed": seed,
             "verbose": False,
         }
- … (8 removed lines, old 268-275, not expanded in the published diff)
-        # # For all features with AUC in name, set monotone_constraints to 1, rest are 0
-        # # monotone_constraints = [1 if "AUC_" in ftr else 0 for ftr in feature_names]
-        # model = CatBoostRegressor(**hyperparams, cat_features=cat_features)
+        if model_name == "catboost":
+            model = CatBoostRegressor(**hyperparams, cat_features=cat_features)
+        elif model_name == "merf":
+            from merf import MERF
+
+            hyperparams["iterations"] = 1000
+            regr = CatBoostRegressor(**hyperparams, cat_features=cat_features)
+            model = MERF(regr, max_iterations=10)
     elif model_name == "linear":
         from sklearn.linear_model import LassoCV
 

--- geocif-0.1.26/geocif/viz/plot.py
+++ geocif-0.1.28/geocif/viz/plot.py
@@ -332,7 +332,7 @@ def plot_df_shpfile(
         cb.ax.set_title(
             label, fontsize=8, fontweight="semibold", fontfamily="Arial"
         )
-        cb.ax.set_xticklabels(ticks, fontsize=
+        cb.ax.set_xticklabels(ticks, fontsize=4, fontfamily="Arial")
 
         # Use BoundaryNorm to create discrete levels
         # sm = plt.cm.ScalarMappable(cmap=cmap.mpl_colormap, norm=norm)