geocif 0.1.26__tar.gz → 0.1.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. {geocif-0.1.26/geocif.egg-info → geocif-0.1.28}/PKG-INFO +1 -1
  2. {geocif-0.1.26 → geocif-0.1.28}/geocif/analysis.py +102 -93
  3. {geocif-0.1.26 → geocif-0.1.28}/geocif/geocif.py +48 -51
  4. {geocif-0.1.26 → geocif-0.1.28}/geocif/indices_runner.py +7 -7
  5. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/correlations.py +6 -2
  6. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/feature_selection.py +49 -3
  7. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/stats.py +5 -0
  8. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/trainers.py +9 -12
  9. {geocif-0.1.26 → geocif-0.1.28}/geocif/viz/plot.py +1 -1
  10. {geocif-0.1.26 → geocif-0.1.28/geocif.egg-info}/PKG-INFO +1 -1
  11. {geocif-0.1.26 → geocif-0.1.28}/setup.py +1 -1
  12. {geocif-0.1.26 → geocif-0.1.28}/LICENSE +0 -0
  13. {geocif-0.1.26 → geocif-0.1.28}/MANIFEST.in +0 -0
  14. {geocif-0.1.26 → geocif-0.1.28}/README.md +0 -0
  15. {geocif-0.1.26 → geocif-0.1.28}/geocif/__init__.py +0 -0
  16. {geocif-0.1.26 → geocif-0.1.28}/geocif/agmet/__init__.py +0 -0
  17. {geocif-0.1.26 → geocif-0.1.28}/geocif/agmet/geoagmet.py +0 -0
  18. {geocif-0.1.26 → geocif-0.1.28}/geocif/agmet/plot.py +0 -0
  19. {geocif-0.1.26 → geocif-0.1.28}/geocif/agmet/utils.py +0 -0
  20. {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/__init__.py +0 -0
  21. {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/constants.py +0 -0
  22. {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/features.py +0 -0
  23. {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/geo.py +0 -0
  24. {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/geocif.py +0 -0
  25. {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/metadata.py +0 -0
  26. {geocif-0.1.26 → geocif-0.1.28}/geocif/backup/models.py +0 -0
  27. {geocif-0.1.26 → geocif-0.1.28}/geocif/cei/__init__.py +0 -0
  28. {geocif-0.1.26 → geocif-0.1.28}/geocif/cei/definitions.py +0 -0
  29. {geocif-0.1.26 → geocif-0.1.28}/geocif/cei/indices.py +0 -0
  30. {geocif-0.1.26 → geocif-0.1.28}/geocif/logger.py +0 -0
  31. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/__init__.py +0 -0
  32. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/embedding.py +0 -0
  33. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/feature_engineering.py +0 -0
  34. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/outliers.py +0 -0
  35. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/outlook.py +0 -0
  36. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/output.py +0 -0
  37. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/stages.py +0 -0
  38. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/trend.py +0 -0
  39. {geocif-0.1.26 → geocif-0.1.28}/geocif/ml/xai.py +0 -0
  40. {geocif-0.1.26 → geocif-0.1.28}/geocif/playground/__init__.py +0 -0
  41. {geocif-0.1.26 → geocif-0.1.28}/geocif/playground/automl.py +0 -0
  42. {geocif-0.1.26 → geocif-0.1.28}/geocif/playground/misc.py +0 -0
  43. {geocif-0.1.26 → geocif-0.1.28}/geocif/utils.py +0 -0
  44. {geocif-0.1.26 → geocif-0.1.28}/geocif/viz/__init__.py +0 -0
  45. {geocif-0.1.26 → geocif-0.1.28}/geocif.egg-info/SOURCES.txt +0 -0
  46. {geocif-0.1.26 → geocif-0.1.28}/geocif.egg-info/dependency_links.txt +0 -0
  47. {geocif-0.1.26 → geocif-0.1.28}/geocif.egg-info/not-zip-safe +0 -0
  48. {geocif-0.1.26 → geocif-0.1.28}/geocif.egg-info/top_level.txt +0 -0
  49. {geocif-0.1.26 → geocif-0.1.28}/requirements.txt +0 -0
  50. {geocif-0.1.26 → geocif-0.1.28}/setup.cfg +0 -0
  51. {geocif-0.1.26 → geocif-0.1.28}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.1.26
3
+ Version: 0.1.28
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -374,17 +374,18 @@ class Geoanalysis:
374
374
 
375
375
  # Remove df_tmp from df_model
376
376
  df_model = df_model.drop(df_tmp.index)
377
-
378
377
  # Plot the histogram of MAPE
379
378
  # Create bins for '% of total Area (ha)' and 'MAPE'
379
+ bin_edges = np.linspace(0, df_model["% of total Area (ha)"].max() + 1, 5 + 1)
380
+
380
381
  df_model["Area Bins"] = pd.cut(
381
382
  df_model["% of total Area (ha)"],
382
- bins=[0, 2, 4, 6, 8, 10, 20, max(df_model["% of total Area (ha)"])],
383
+ bins=bin_edges,
383
384
  precision=0,
384
385
  )
385
386
  df_model["MAPE Bins"] = pd.cut(
386
387
  df_model["Mean Absolute Percentage Error"],
387
- bins=6, # [0, 5, 10, 15, 20, 25, 30, 50, max(df_model["Mean Absolute Percentage Error"])],
388
+ bins=5, # [0, 5, 10, 15, 20, 25, 30, 50, max(df_model["Mean Absolute Percentage Error"])],
388
389
  right=False,
389
390
  precision=1,
390
391
  )
@@ -444,7 +445,6 @@ class Geoanalysis:
444
445
  plt.xlabel("Mean Absolute Percentage Error (%)")
445
446
  plt.ylabel("Density")
446
447
  plt.legend(title="Country", title_fontsize="13")
447
-
448
448
  plt.savefig(self.dir_analysis / f"mape_histogram_{model}.png", dpi=250)
449
449
  plt.close()
450
450
 
@@ -460,6 +460,9 @@ class Geoanalysis:
460
460
  countries = df_model["Country"].unique().tolist()
461
461
  # make it title case and replace _ with space
462
462
  countries = [country.title().replace("_", " ") for country in countries]
463
+ countries = ["Malawi"]
464
+ df_model = df_model[df_model["Country"].isin(countries)]
465
+ self.dg = self.dg[self.dg["ADM0_NAME"].isin(countries)]
463
466
  plot.plot_df_shpfile(
464
467
  self.dg, # dataframe containing adm1 name and polygon
465
468
  df_model, # dataframe containing information that will be mapped
@@ -468,9 +471,9 @@ class Geoanalysis:
468
471
  name_col=col, # Which column to plot
469
472
  dir_out=self.dir_analysis, # Output directory
470
473
  fname=fname, # Output file name
471
- label=f"Mean Absolute Percentage Error",
474
+ label=f"MAPE (%)",
472
475
  vmin=df_model[col].min(),
473
- vmax=50, # df_model[col].max(),
476
+ vmax=df_model[col].max(),
474
477
  cmap=pal.scientific.sequential.Bamako_20_r,
475
478
  series="sequential",
476
479
  show_bg=False,
@@ -488,6 +491,10 @@ class Geoanalysis:
488
491
  for model in models:
489
492
  df_model = df_plot[df_plot["Model"] == model]
490
493
 
494
+ countries = ["malawi"]
495
+ df_model = df_model[df_model["Country"].isin(countries)]
496
+ self.dg = self.dg[self.dg["ADM0_NAME"].isin(["Malawi", "malawi"])]
497
+
491
498
  countries = df_model["Country"].unique().tolist()
492
499
  if len(countries) > 1:
493
500
  self.dir_plot = self.dir_analysis
@@ -502,6 +509,7 @@ class Geoanalysis:
502
509
  + df_model["Region"].str.lower().str.replace("_", " ")
503
510
  )
504
511
 
512
+
505
513
  # Change Harvest year to type int
506
514
  df_model["Harvest Year"] = df_model["Harvest Year"].astype(int)
507
515
  annotate_region_column = (
@@ -517,83 +525,84 @@ class Geoanalysis:
517
525
  df_time_period = df_harvest_year[
518
526
  df_harvest_year["Stage Name"] == time_period
519
527
  ]
520
-
521
- """ % of total area """
522
- # if idx == 0:
523
- # fname = f"{self.country}_{self.crop}_perc_area.png"
524
- # col = "% of total Area (ha)"
525
- # plot.plot_df_shpfile(
526
- # self.dg, # dataframe containing adm1 name and polygon
527
- # df_model, # dataframe containing information that will be mapped
528
- # merge_col="Country Region", # Column on which to merge
529
- # name_country=countries, # Plot global map
530
- # name_col=col, # Which column to plot
531
- # dir_out=self.plot_dir / str(year), # Output directory
532
- # fname=fname, # Output file name
533
- # label=f"% of Total Area (ha)\n{self.crop.title()}",
534
- # vmin=df_model[col].min(),
535
- # vmax=df_model[col].max(),
536
- # cmap=pal.scientific.sequential.Bamako_20_r,
537
- # series="sequential",
538
- # show_bg=False,
539
- # annotate_regions=True,
540
- # annotate_region_column=annotate_region_column,
541
- # loc_legend="lower left",
542
- # )
543
- #
544
- # """ Unique regions """
545
- # fname = f"{self.country}_{self.crop}_region_ID.png"
546
- # col = "Region_ID"
547
- # df_model[col] = df_model[col].astype(int) + 1
548
- # if len(df_model["Region_ID"].unique() > 1):
549
- # # Create a dictionary with each region assigned a unique integer identifier and name
550
- # dict_region = {
551
- # int(key): key for key in df_time_period["Region_ID"].unique()
552
- # }
553
- # plot.plot_df_shpfile(
554
- # self.dg, # dataframe containing adm1 name and polygon
555
- # df_model, # dataframe containing information that will be mapped
556
- # dict_lup=dict_region,
557
- # merge_col="Country Region", # Column on which to merge
558
- # name_country=countries, # Plot global map
559
- # name_col=col, # Which column to plot
560
- # dir_out=self.plot_dir / str(year), # Output directory
561
- # fname=fname, # Output file name
562
- # label=f"Region Cluster\n{self.crop.title()}",
563
- # vmin=df_model[col].min(),
564
- # vmax=df_model[col].max(),
565
- # cmap=pal.tableau.Tableau_20.mpl_colors,
566
- # series="qualitative",
567
- # show_bg=False,
568
- # alpha_feature=1,
569
- # use_key=True,
570
- # annotate_regions=True,
571
- # annotate_region_column=annotate_region_column,
572
- # loc_legend="lower left",
573
- # )
574
-
575
- """ Anomaly """
576
- fname = (
577
- f"{fname_prefix}_{self.crop}_{time_period}_{year}_anomaly.png"
578
- )
579
- plot.plot_df_shpfile(
580
- self.dg, # dataframe containing adm1 name and polygon
581
- df_harvest_year, # dataframe containing information that will be mapped
582
- merge_col="Country Region", # Column on which to merge
583
- name_country=countries, # Plot global map
584
- name_col="Anomaly", # Which column to plot
585
- dir_out=self.dir_plot / str(year), # Output directory
586
- fname=fname, # Output file name
587
- label=f"% of {self.number_lag_years}-year Median Yield\n{self.crop.title()}, {year}",
588
- vmin=df_harvest_year["Anomaly"].min(),
589
- vmax=110, # df_harvest_year["Anomaly"].max(),
590
- cmap=pal.cartocolors.diverging.Geyser_5_r,
591
- series="sequential",
592
- show_bg=False,
593
- annotate_regions=False,
594
- annotate_region_column=annotate_region_column,
595
- loc_legend="lower left",
596
- )
528
+ #
529
+ # """ % of total area """
530
+ if idx == 0:
531
+ fname = f"{self.country}_{self.crop}_perc_area.png"
532
+ col = "% of total Area (ha)"
533
+ plot.plot_df_shpfile(
534
+ self.dg, # dataframe containing adm1 name and polygon
535
+ df_model, # dataframe containing information that will be mapped
536
+ merge_col="Country Region", # Column on which to merge
537
+ name_country=countries, # Plot global map
538
+ name_col=col, # Which column to plot
539
+ dir_out=self.dir_plot / str(year), # Output directory
540
+ fname=fname, # Output file name
541
+ label=f"% of Total Area (ha)\n{self.crop.title()}",
542
+ vmin=df_model[col].min(),
543
+ vmax=df_model[col].max(),
544
+ cmap=pal.scientific.sequential.Bamako_20_r,
545
+ series="sequential",
546
+ show_bg=False,
547
+ annotate_regions=False,
548
+ annotate_region_column=annotate_region_column,
549
+ loc_legend="lower left",
550
+ )
551
+ #
552
+ # # """ Unique regions """
553
+ fname = f"{self.country}_{self.crop}_region_ID.png"
554
+ col = "Region_ID"
555
+ df_model[col] = df_model[col].astype(int) + 1
556
+ if len(df_model["Region_ID"].unique() > 1):
557
+ # Create a dictionary with each region assigned a unique integer identifier and name
558
+ dict_region = {
559
+ int(key): key for key in df_time_period["Region_ID"].unique()
560
+ }
561
+ plot.plot_df_shpfile(
562
+ self.dg, # dataframe containing adm1 name and polygon
563
+ df_model, # dataframe containing information that will be mapped
564
+ dict_lup=dict_region,
565
+ merge_col="Country Region", # Column on which to merge
566
+ name_country=countries, # Plot global map
567
+ name_col=col, # Which column to plot
568
+ dir_out=self.dir_plot / str(year), # Output directory
569
+ fname=fname, # Output file name
570
+ label=f"Region Cluster\n{self.crop.title()}",
571
+ vmin=df_model[col].min(),
572
+ vmax=df_model[col].max(),
573
+ cmap=pal.tableau.Tableau_20.mpl_colors,
574
+ series="qualitative",
575
+ show_bg=False,
576
+ alpha_feature=1,
577
+ use_key=True,
578
+ annotate_regions=False,
579
+ annotate_region_column=annotate_region_column,
580
+ loc_legend="lower left",
581
+ )
582
+ # breakpoint()
583
+
584
+ # """ Anomaly """
585
+ # fname = (
586
+ # f"{fname_prefix}_{self.crop}_{time_period}_{year}_anomaly.png"
587
+ # )
588
+ # plot.plot_df_shpfile(
589
+ # self.dg, # dataframe containing adm1 name and polygon
590
+ # df_harvest_year, # dataframe containing information that will be mapped
591
+ # merge_col="Country Region", # Column on which to merge
592
+ # name_country=countries, # Plot global map
593
+ # name_col="Anomaly", # Which column to plot
594
+ # dir_out=self.dir_plot / str(year), # Output directory
595
+ # fname=fname, # Output file name
596
+ # label=f"% of {self.number_lag_years}-year Median Yield\n{self.crop.title()}, {year}",
597
+ # vmin=df_harvest_year["Anomaly"].min(),
598
+ # vmax=110, # df_harvest_year["Anomaly"].max(),
599
+ # cmap=pal.cartocolors.diverging.Geyser_5_r,
600
+ # series="sequential",
601
+ # show_bg=False,
602
+ # annotate_regions=False,
603
+ # annotate_region_column=annotate_region_column,
604
+ # loc_legend="lower left",
605
+ # )
597
606
 
598
607
  """ Predicted Yield """
599
608
  fname = f"{fname_prefix}_{self.crop}_{time_period}_{year}_predicted_yield.png"
@@ -605,7 +614,7 @@ class Geoanalysis:
605
614
  name_col="Predicted Yield (tn per ha)", # Which column to plot
606
615
  dir_out=self.dir_plot / str(year), # Output directory
607
616
  fname=fname, # Output file name
608
- label=f"{self.predicted}\n{self.crop.title()}, {year}",
617
+ label=f"Predicted Yield (Mg/ha)\n{self.crop.title()}, {year}",
609
618
  vmin=df_harvest_year[self.predicted].min(),
610
619
  vmax=df_harvest_year[self.predicted].max(),
611
620
  cmap=pal.scientific.sequential.Bamako_20_r,
@@ -749,10 +758,10 @@ class Geoanalysis:
749
758
  name_shapefile = df[df["Option"] == "boundary_file"]["Value"].values[0]
750
759
 
751
760
  for crop in crops:
752
- # Does a table with the name {country}-{crop} exist in the database?
753
- table = f"{country}-{crop}"
761
+ # Does a table with the name {country}_{crop} exist in the database?
762
+ table = f"{country}_{crop}"
754
763
  if self.table_exists(self.db_path, table):
755
- self.dict_config[f"{country}-{crop}"] = {
764
+ self.dict_config[f"{country}_{crop}"] = {
756
765
  "method": method,
757
766
  "crops": crop,
758
767
  "models": models,
@@ -789,7 +798,7 @@ class Geoanalysis:
789
798
  self.dg["ADM0_NAME"] + " " + self.dg["ADM2_NAME"]
790
799
  )
791
800
  # Make it lower case
792
- self.dg["Country Region"] = self.dg["Country Region"].str.lower()
801
+ self.dg["Country Region"] = self.dg["Country Region"].str.lower().replace("_", " ")
793
802
 
794
803
 
795
804
  def run(path_config_files=[Path("../config/geocif.txt")]):
@@ -800,16 +809,16 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
800
809
  """ Loop over each country, crop, model combination in dict_config """
801
810
  frames = []
802
811
  for country_crop, value in obj.dict_config.items():
803
- obj.country = country_crop.split("-")[0]
804
- obj.crop = country_crop.split("-")[1]
812
+ obj.crop = value["crops"]
813
+ # to get country, remove obj.crops from country_crop
814
+ obj.country = country_crop.replace(f"_{obj.crop}", "")
805
815
 
806
816
  obj.admin_zone = value["admin_zone"]
807
817
  obj.boundary_file = value["name_shapefile"]
808
818
  obj.method = value["method"]
809
- obj.number_lag_years = value["number_lag_years"]
819
+ obj.number_lag_years = 5
810
820
 
811
- obj.table = f"{obj.country}-{obj.crop}"
812
- breakpoint()
821
+ obj.table = f"{obj.country}_{obj.crop}"
813
822
  models = value["models"]
814
823
  for model in models:
815
824
  obj.model = model
@@ -93,6 +93,7 @@ class Geocif:
93
93
  self.countries = ast.literal_eval(self.parser.get("DEFAULT", "countries"))
94
94
  self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
95
95
  self.update_input_file = self.parser.getboolean("DEFAULT", "update_input_file")
96
+ self.correlation_plots = self.parser.getboolean("DEFAULT", "correlation_plots")
96
97
  self.national_correlation = self.parser.getboolean(
97
98
  "DEFAULT", "national_correlation"
98
99
  )
@@ -143,8 +144,8 @@ class Geocif:
143
144
  """
144
145
  # If ML model is run for individual region or cluster, then Region_ID is the same for each region
145
146
  # or cluster and therefore redundant for the ML model
146
- if self.cluster_strategy in ["individual", "auto_detect"]:
147
- self.cat_features.remove("Region_ID")
147
+ #if self.cluster_strategy in ["individual", "auto_detect"]:
148
+ # self.cat_features.remove("Region_ID")
148
149
 
149
150
  self.fixed_columns: list = [
150
151
  "Country",
@@ -264,6 +265,12 @@ class Geocif:
264
265
  verbose=False,
265
266
  # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
266
267
  )
268
+ elif self.model_name == "merf":
269
+ Z_train = np.ones((len(X_train), 1))
270
+ clusters_train = df_region["Region"]
271
+ clusters_train.reset_index(drop=True, inplace=True)
272
+
273
+ self.model.fit(X_train, Z_train, clusters_train.astype("object"), y_train.values)
267
274
  elif self.model_name == "linear":
268
275
  self.model.fit(X_train_scaled, y_train)
269
276
  elif self.model_name == "gam":
@@ -273,23 +280,6 @@ class Geocif:
273
280
  self.model.fit(X_train, y_train)
274
281
  except:
275
282
  self.logger.error(f"Error fitting model for {self.country} {self.crop}")
276
- # if self.cluster_strategy == "individual" or len(X_train) == 1:
277
- # self.model.fit(
278
- # X_train,
279
- # y_train,
280
- # cat_features=self.cat_features,
281
- # verbose=False,
282
- # # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
283
- # )
284
- # elif self.cluster_strategy in ["auto_detect", "single"]:
285
- # # Use MERF
286
- # Z_train = np.ones((len(X_train), 1))
287
- # clusters_train = df_region["Region"]
288
- # clusters_train.reset_index(drop=True, inplace=True)
289
- #
290
- # self.model.fit(X_train, Z_train, clusters_train.astype("object"), y_train.values)
291
- # # change clusters_train to object dtype
292
- # # clusters_train = clusters_train.astype("object")
293
283
 
294
284
  def predict(self, df_region, scaler=None):
295
285
  """
@@ -313,7 +303,6 @@ class Geocif:
313
303
  len(X_test), df_region[f"Last Year {self.target}"].values
314
304
  )
315
305
  else:
316
- best_hyperparameters = self.model.get_params().copy()
317
306
  if self.model_name in ["linear", "gam"]:
318
307
  # Drop cat_features from X_test
319
308
  X_test = X_test.drop(
@@ -327,12 +316,17 @@ class Geocif:
327
316
  if self.estimate_ci:
328
317
  if self.estimate_ci_for_all or self.forecast_season == self.today_year:
329
318
  y_pred, y_pred_ci = self.model.predict(X_test, alpha=0.1)
319
+ best_hyperparameters = self.model.get_params().copy()
320
+ elif self.model_name == "merf":
321
+ Z_test = np.ones((len(X_test), 1))
322
+ clusters_test = df_region["Region"]
323
+ clusters_test.reset_index(drop=True, inplace=True)
324
+
325
+ y_pred = self.model.predict(X_test, Z_test, clusters_test.astype("object"))
326
+ best_hyperparameters = self.model.fe_model.get_params().copy()
330
327
  else:
331
328
  y_pred = self.model.predict(X_test)
332
- # Z_test = np.ones((len(X_test), 1))
333
- # clusters_test = df_region["Region"]
334
- # clusters_test.reset_index(drop=True, inplace=True)
335
- # y_pred = self.model.predict(X_test, Z_test, clusters_test.astype("object"))
329
+ best_hyperparameters = self.model.get_params().copy()
336
330
 
337
331
  if self.check_yield_trend:
338
332
  # Get information for retrending
@@ -353,7 +347,7 @@ class Geocif:
353
347
 
354
348
  # Create a dataframe with forecast results
355
349
  shp = len(X_test)
356
- experiment_id = f"{self.country}-{self.crop}"
350
+ experiment_id = f"{self.country}_{self.crop}"
357
351
  now = ar.utcnow().to("America/New_York").format("MMMM-DD-YYYY HH:mm:ss")
358
352
  selected_features = self.selected_features + self.cat_features
359
353
  df = pd.DataFrame(
@@ -431,6 +425,7 @@ class Geocif:
431
425
  # Create an index based on following columns
432
426
  index_columns = [
433
427
  "Model",
428
+ "Cluster Strategy"
434
429
  "Country",
435
430
  "Region",
436
431
  "Crop",
@@ -464,7 +459,7 @@ class Geocif:
464
459
  2. method = "fraction" - Select a fraction (1-100) of all stages
465
460
  """
466
461
  stages_features = stages.select_stages_for_ml(
467
- stages_features, method="fraction", n=30
462
+ stages_features, method="fraction", n=60
468
463
  )
469
464
 
470
465
  for stage in stages_features:
@@ -723,6 +718,7 @@ class Geocif:
723
718
  )
724
719
 
725
720
  # cat_features should be converted to category type
721
+
726
722
  df[self.cat_features] = df[self.cat_features].astype("category")
727
723
 
728
724
  """ Heatmap of correlation of various features with yield at each time step"""
@@ -743,30 +739,31 @@ class Geocif:
743
739
  how="outer",
744
740
  )
745
741
 
746
- dict_kwargs = {}
747
- dict_kwargs["all_stages"] = self.all_stages
748
- dict_kwargs["target_col"] = self.target
749
- dict_kwargs["country"] = self.country
750
- dict_kwargs["crop"] = self.crop
751
- dict_kwargs["dir_output"] = (
752
- self.dir_analysis
753
- / self.country
754
- / self.crop
755
- / self.model_name
756
- / str(self.forecast_season)
757
- )
758
- dict_kwargs["forecast_season"] = self.forecast_season
759
- dict_kwargs["method"] = self.method
760
- dict_kwargs["national_correlation"] = self.national_correlation
761
- dict_kwargs["groupby"] = self.correlation_plot_groupby
762
- dict_kwargs["dg_country"] = self.dg_country
763
- dict_kwargs["combined_dict"] = self.combined_dict
764
-
765
- self.logger.info(f"Correlation plot for {self.country} {self.crop}")
766
- (
767
- dict_selected_features,
768
- dict_best_cei,
769
- ) = correlations.all_correlated_feature_by_time(df, **dict_kwargs)
742
+ if self.correlation_plots:
743
+ dict_kwargs = {}
744
+ dict_kwargs["all_stages"] = self.all_stages
745
+ dict_kwargs["target_col"] = self.target
746
+ dict_kwargs["country"] = self.country
747
+ dict_kwargs["crop"] = self.crop
748
+ dict_kwargs["dir_output"] = (
749
+ self.dir_analysis
750
+ / self.country
751
+ / self.crop
752
+ / self.model_name
753
+ / str(self.forecast_season)
754
+ )
755
+ dict_kwargs["forecast_season"] = self.forecast_season
756
+ dict_kwargs["method"] = self.method
757
+ dict_kwargs["national_correlation"] = self.national_correlation
758
+ dict_kwargs["groupby"] = self.correlation_plot_groupby
759
+ dict_kwargs["dg_country"] = self.dg_country
760
+ dict_kwargs["combined_dict"] = self.combined_dict
761
+
762
+ self.logger.info(f"Correlation plot for {self.country} {self.crop}")
763
+ (
764
+ dict_selected_features,
765
+ dict_best_cei,
766
+ ) = correlations.all_correlated_feature_by_time(df, **dict_kwargs)
770
767
 
771
768
  """ Separate into train and test datasets based on forecast_season """
772
769
  mask = df["Harvest Year"] == self.forecast_season
@@ -841,7 +838,7 @@ class Geocif:
841
838
  self.all_seasons = self.df_results["Harvest Year"].unique()
842
839
 
843
840
  """ If not using a ML model then set XAI and CI to False """
844
- if not self.ml_model or self.model_name in ["linear", "gam"]:
841
+ if not self.ml_model or self.model_name in ["linear", "gam", "merf"]:
845
842
  self.do_xai = False
846
843
  self.estimate_ci = False
847
844
  self.check_yield_trend = False
@@ -155,21 +155,21 @@ class cei_runner(base.BaseGeo):
155
155
  "ndvi",
156
156
  True,
157
157
  )
158
- for year in range(2001, ar.utcnow().year + 1)
158
+ for year in range(2023, ar.utcnow().year + 1)
159
159
  for status, path, filename, admin_zone, category in combinations
160
160
  ]
161
161
 
162
162
  # Only keep those entries in combinations where the third element is
163
163
  # mozambique, south_africa, angola or dem_people's_rep_of_korea
164
164
  # This is done to test the code for these countries
165
- # combinations = [i for i in combinations if "angola" in i[2] or
166
- # "lesotho" in i[2] or
165
+ combinations = [i for i in combinations if "angola_maize" in i[3] or
166
+ "lesotho_maize" in i[3] or
167
167
  # "namibia" in i[2] or
168
168
  # "united_republic_of_tanzania" in i[2] or
169
- # "zambia" in i[2] or
170
- # "zimbabwe" in i[2] or
169
+ "zambia_maize" in i[3] or
170
+ "zimbabwe_maize" in i[3] or
171
171
  # "south_africa" in i[2] or
172
- # "mozambique" in i[2] or
172
+ "mozambique_maize" in i[3]]
173
173
  # "malawi" in i[2]]
174
174
 
175
175
  if self.do_parallel:
@@ -201,7 +201,7 @@ def run(path_config_files=[]):
201
201
  indices.validate_index_definitions()
202
202
 
203
203
  for method in [
204
- "monthly_r", # "dekad_r" # "dekad_r"
204
+ "biweekly_r", # "dekad_r" # "dekad_r"
205
205
  ]: # , "full_season", "phenological_stages", "fraction_season"]:
206
206
  obj = cei_runner(path_config_files)
207
207
  obj.main(method)
@@ -274,8 +274,12 @@ def all_correlated_feature_by_time(df, **kwargs):
274
274
  # For each element in dict_best_cei, add the type of the cei
275
275
  else:
276
276
  # HACK
277
- dict_selected_features[region_id] = dict_selected_features[0]
278
- dict_best_cei[region_id] = dict_best_cei[0]
277
+ df_corr = _all_correlated_feature_by_time(df, **kwargs)
278
+ dict_selected_features[region_id] = df_corr.columns
279
+ dict_best_cei[region_id] = {}
280
+
281
+ #dict_selected_features[region_id] = dict_selected_features[0]
282
+ #dict_best_cei[region_id] = dict_best_cei[0]
279
283
  # Combine all unique values from the existing dictionary elements
280
284
  # combined_metrics = set()
281
285
  # for key in dict_selected_features:
@@ -1,4 +1,5 @@
1
1
  import numpy as np
2
+ from tqdm import tqdm
2
3
  from sklearn.ensemble import RandomForestRegressor
3
4
 
4
5
 
@@ -27,6 +28,7 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
27
28
  # # You would adjust the threshold based on new criteria since variances have been normalized.
28
29
  # selector = VarianceThreshold(threshold=scaled_data.var().mean())
29
30
  # X = selector.fit_transform(scaled_data)
31
+ selector = None
30
32
 
31
33
  # Fill in columns with median of that column
32
34
  X = X.fillna(X.median())
@@ -47,17 +49,58 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
47
49
  if method == "SelectKBest":
48
50
  from sklearn.feature_selection import SelectKBest, f_regression
49
51
 
50
- k = 10 # Number of features to select
51
- selector = SelectKBest(score_func=f_regression, k=10)
52
+ k = 15 # Number of features to select
53
+ selector = SelectKBest(score_func=f_regression, k=k)
52
54
 
53
55
  # Fit the selector to the data and transform the data to select the best features
54
- X_new = selector.fit_transform(X, y)
56
+ try:
57
+ X_new = selector.fit_transform(X, y)
58
+ except:
59
+ breakpoint()
55
60
 
56
61
  # Get the selected feature indices
57
62
  selected_features = selector.get_support(indices=True)
58
63
 
59
64
  # Get the selected feature names
60
65
  selected_features = X.columns[selected_features].tolist()
66
+ elif method == "SHAP":
67
+ import pandas as pd
68
+ from catboost import CatBoostRegressor
69
+ from fasttreeshap import TreeExplainer as FastTreeExplainer
70
+ from sklearn.model_selection import cross_val_score
71
+
72
+ model = CatBoostRegressor(n_estimators=500, verbose=0, use_best_model=False)
73
+ model.fit(X, y)
74
+
75
+ explainer = FastTreeExplainer(model)
76
+ shap_values = explainer.shap_values(X)
77
+
78
+ # Step 5: Summarize the SHAP values for feature importance
79
+ shap_importances = np.mean(np.abs(shap_values), axis=0)
80
+ shap_importance_df = pd.DataFrame({
81
+ 'feature': X.columns,
82
+ 'importance': shap_importances
83
+ }).sort_values(by='importance', ascending=False)
84
+
85
+ def evaluate_model_with_n_features(N, X_train, y_train):
86
+ top_features = shap_importance_df['feature'].head(N).values
87
+ X_train_selected = X_train[top_features]
88
+ selector = CatBoostRegressor(n_estimators=500, random_state=42, verbose=0)
89
+ scores = cross_val_score(selector, X_train_selected, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
90
+
91
+ return np.mean(scores)
92
+
93
+ # Evaluate model performance with different number of features
94
+ nrange = [5, 10, 15, 20, 25, 30]
95
+ cv_scores = []
96
+ for N in tqdm(nrange):
97
+ cv_scores.append(evaluate_model_with_n_features(N, X, y))
98
+
99
+ # Select the number of features that gives the best cross-validation score (lowest MSE)
100
+ optimal_N = nrange[np.argmax(cv_scores)]
101
+
102
+ # Use optimal N to select features
103
+ selected_features = shap_importance_df['feature'].head(optimal_N).values.tolist()
61
104
  elif method == "feature_engine":
62
105
  from feature_engine.selection import SmartCorrelatedSelection
63
106
 
@@ -110,6 +153,9 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
110
153
  selector.fit(X.values, y.values)
111
154
  selected_features_mask = selector.support_
112
155
  selected_features = X.columns[selected_features_mask].tolist()
156
+ tentative_features = X.columns[selector.support_weak_].tolist()
157
+
158
+ selected_features = selected_features + tentative_features
113
159
  elif method == "Leshy":
114
160
  import arfs.feature_selection.allrelevant as arfsgroot
115
161
  from catboost import CatBoostRegressor
@@ -205,6 +205,11 @@ def add_statistics(dir_stats, df, country, crop, admin_zone, stats, method, targ
205
205
  "value",
206
206
  ]
207
207
 
208
+ # Replace any inf or 0 values by NaN
209
+ yield_value = yield_value.replace([0, np.inf, -np.inf], np.nan)
210
+ area_value = area_value.replace([0, np.inf, -np.inf], np.nan)
211
+ prod_value = prod_value.replace([0, np.inf, -np.inf], np.nan)
212
+
208
213
  if not yield_value.empty:
209
214
  group.loc[:, target_col] = yield_value.values[0]
210
215
  group.loc[:, "Area (ha)"] = area_value.values[0]
@@ -252,7 +252,7 @@ def auto_train(
252
252
  else:
253
253
  hyperparams = {}
254
254
 
255
- if model_name == "catboost":
255
+ if model_name in ["catboost", "merf"]:
256
256
  hyperparams = {
257
257
  "depth": 6,
258
258
  "learning_rate": 0.01,
@@ -265,17 +265,14 @@ def auto_train(
265
265
  "random_seed": seed,
266
266
  "verbose": False,
267
267
  }
268
- model = CatBoostRegressor(**hyperparams, cat_features=cat_features)
269
- # if cluster_strategy in ["auto_detect", "single"]:
270
- # from merf import MERF
271
- #
272
- # regr = CatBoostRegressor(**hyperparams, cat_features=cat_features)
273
- #
274
- # model = MERF(regr, max_iterations=5)
275
- # elif cluster_strategy == "individual":
276
- # # For all features with AUC in name, set monotone_constraints to 1, rest are 0
277
- # # monotone_constraints = [1 if "AUC_" in ftr else 0 for ftr in feature_names]
278
- # model = CatBoostRegressor(**hyperparams, cat_features=cat_features)
268
+ if model_name == "catboost":
269
+ model = CatBoostRegressor(**hyperparams, cat_features=cat_features)
270
+ elif model_name == "merf":
271
+ from merf import MERF
272
+
273
+ hyperparams["iterations"] = 1000
274
+ regr = CatBoostRegressor(**hyperparams, cat_features=cat_features)
275
+ model = MERF(regr, max_iterations=10)
279
276
  elif model_name == "linear":
280
277
  from sklearn.linear_model import LassoCV
281
278
 
@@ -332,7 +332,7 @@ def plot_df_shpfile(
332
332
  cb.ax.set_title(
333
333
  label, fontsize=8, fontweight="semibold", fontfamily="Arial"
334
334
  )
335
- cb.ax.set_xticklabels(ticks, fontsize=6, fontfamily="Arial")
335
+ cb.ax.set_xticklabels(ticks, fontsize=4, fontfamily="Arial")
336
336
 
337
337
  # Use BoundaryNorm to create discrete levels
338
338
  # sm = plt.cm.ScalarMappable(cmap=cmap.mpl_colormap, norm=norm)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.1.26
3
+ Version: 0.1.28
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -50,6 +50,6 @@ setup(
50
50
  test_suite="tests",
51
51
  tests_require=test_requirements,
52
52
  url="https://ritviksahajpal.github.io/yield_forecasting/",
53
- version="0.1.26",
53
+ version="0.1.28",
54
54
  zip_safe=False,
55
55
  )
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes