geocif 0.1.45__tar.gz → 0.1.47__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {geocif-0.1.45/geocif.egg-info → geocif-0.1.47}/PKG-INFO +1 -1
  2. {geocif-0.1.45 → geocif-0.1.47}/geocif/analysis.py +7 -5
  3. {geocif-0.1.45 → geocif-0.1.47}/geocif/experiments.py +3 -9
  4. {geocif-0.1.45 → geocif-0.1.47}/geocif/geocif.py +211 -40
  5. {geocif-0.1.45 → geocif-0.1.47}/geocif/indices_runner.py +2 -2
  6. {geocif-0.1.45 → geocif-0.1.47}/geocif/indices_runner_v2.py +2 -2
  7. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/correlations.py +49 -40
  8. geocif-0.1.47/geocif/ml/misc.py +33 -0
  9. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/output.py +0 -2
  10. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/stages.py +18 -9
  11. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/trainers.py +22 -0
  12. {geocif-0.1.45 → geocif-0.1.47/geocif.egg-info}/PKG-INFO +1 -1
  13. {geocif-0.1.45 → geocif-0.1.47}/geocif.egg-info/SOURCES.txt +1 -1
  14. {geocif-0.1.45 → geocif-0.1.47}/setup.py +1 -1
  15. geocif-0.1.45/geocif/ml/correlations_backup.py +0 -412
  16. {geocif-0.1.45 → geocif-0.1.47}/LICENSE +0 -0
  17. {geocif-0.1.45 → geocif-0.1.47}/MANIFEST.in +0 -0
  18. {geocif-0.1.45 → geocif-0.1.47}/README.md +0 -0
  19. {geocif-0.1.45 → geocif-0.1.47}/geocif/__init__.py +0 -0
  20. {geocif-0.1.45 → geocif-0.1.47}/geocif/agmet/__init__.py +0 -0
  21. {geocif-0.1.45 → geocif-0.1.47}/geocif/agmet/geoagmet.py +0 -0
  22. {geocif-0.1.45 → geocif-0.1.47}/geocif/agmet/plot.py +0 -0
  23. {geocif-0.1.45 → geocif-0.1.47}/geocif/agmet/utils.py +0 -0
  24. {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/__init__.py +0 -0
  25. {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/constants.py +0 -0
  26. {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/features.py +0 -0
  27. {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/geo.py +0 -0
  28. {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/geocif.py +0 -0
  29. {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/metadata.py +0 -0
  30. {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/models.py +0 -0
  31. {geocif-0.1.45 → geocif-0.1.47}/geocif/cei/__init__.py +0 -0
  32. {geocif-0.1.45 → geocif-0.1.47}/geocif/cei/definitions.py +0 -0
  33. {geocif-0.1.45 → geocif-0.1.47}/geocif/cei/indices.py +0 -0
  34. {geocif-0.1.45 → geocif-0.1.47}/geocif/logger.py +0 -0
  35. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/__init__.py +0 -0
  36. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/embedding.py +0 -0
  37. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/feature_engineering.py +0 -0
  38. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/feature_selection.py +0 -0
  39. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/outliers.py +0 -0
  40. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/outlook.py +0 -0
  41. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/spatial_autocorrelation.py +0 -0
  42. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/stats.py +0 -0
  43. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/trend.py +0 -0
  44. {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/xai.py +0 -0
  45. {geocif-0.1.45 → geocif-0.1.47}/geocif/playground/__init__.py +0 -0
  46. {geocif-0.1.45 → geocif-0.1.47}/geocif/playground/automl.py +0 -0
  47. {geocif-0.1.45 → geocif-0.1.47}/geocif/playground/misc.py +0 -0
  48. {geocif-0.1.45 → geocif-0.1.47}/geocif/utils.py +0 -0
  49. {geocif-0.1.45 → geocif-0.1.47}/geocif/viz/__init__.py +0 -0
  50. {geocif-0.1.45 → geocif-0.1.47}/geocif/viz/plot.py +0 -0
  51. {geocif-0.1.45 → geocif-0.1.47}/geocif.egg-info/dependency_links.txt +0 -0
  52. {geocif-0.1.45 → geocif-0.1.47}/geocif.egg-info/not-zip-safe +0 -0
  53. {geocif-0.1.45 → geocif-0.1.47}/geocif.egg-info/top_level.txt +0 -0
  54. {geocif-0.1.45 → geocif-0.1.47}/requirements.txt +0 -0
  55. {geocif-0.1.45 → geocif-0.1.47}/setup.cfg +0 -0
  56. {geocif-0.1.45 → geocif-0.1.47}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.1.45
3
+ Version: 0.1.47
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -162,8 +162,8 @@ class Geoanalysis:
162
162
  return pd.DataFrame(), pd.DataFrame()
163
163
 
164
164
  df_metrics = self._compute_metrics(df)
165
- # df_metrics = self._process_metrics(df_metrics)
166
- # self._plot_metrics(df_metrics)
165
+ df_metrics = self._process_metrics(df_metrics)
166
+ self._plot_metrics(df_metrics)
167
167
 
168
168
  df_regional_metrics_by_year = self._compute_regional_metrics(
169
169
  df, by="Harvest Year"
@@ -172,8 +172,10 @@ class Geoanalysis:
172
172
  df_regional_metrics_by_year
173
173
  )
174
174
  df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
175
- breakpoint()
176
- self._store_results(None, df_regional_metrics, df_regional_metrics_by_year)
175
+
176
+ self._store_results(
177
+ df_metrics, df_regional_metrics, df_regional_metrics_by_year
178
+ )
177
179
 
178
180
  df_national_yield = self._compute_national_yield(df)
179
181
  self._plot_national_yield(df_national_yield)
@@ -193,7 +195,7 @@ class Geoanalysis:
193
195
  .apply(self.annual_metrics)
194
196
  .reset_index()
195
197
  )
196
- breakpoint()
198
+
197
199
  return df_metrics.pivot_table(
198
200
  index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
199
201
  columns="level_5",
@@ -85,9 +85,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
85
85
 
86
86
  # Experiment: lag_years
87
87
  logger.info("Experiment 3: lag_years")
88
- parser = main(
89
- inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5]
90
- )
88
+ parser = main(inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5])
91
89
 
92
90
  # Experiment: lag_yield_as_feature
93
91
  logger.info("Experiment 4: lag_yield_as_feature")
@@ -103,9 +101,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
103
101
 
104
102
  # Experiment: median_years
105
103
  logger.info("Experiment 5: median_years")
106
- parser = main(
107
- inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5]
108
- )
104
+ parser = main(inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5])
109
105
 
110
106
  # Experiment: median_yield_as_feature
111
107
  logger.info("Experiment 6: median_yield_as_feature")
@@ -133,9 +129,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
133
129
 
134
130
  # Experiment: optimize
135
131
  logger.info("Experiment 8: optimize")
136
- parser = main(
137
- inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False]
138
- )
132
+ parser = main(inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False])
139
133
 
140
134
 
141
135
  if __name__ == "__main__":
@@ -108,13 +108,18 @@ class Geocif:
108
108
  Config file: ML
109
109
  ====================================================================
110
110
  """
111
- self.use_ceis = ast.literal_eval(self.parser.get("ML", "use_ceis"))
112
111
  self.model_type = self.parser.get("ML", "model_type")
113
112
  self.fraction_simulate = self.parser.getint("ML", "fraction_simulate")
114
113
  self.analogous_year_yield_as_feature = self.parser.getboolean(
115
114
  "ML", "analogous_year_yield_as_feature"
116
115
  )
117
- self.include_lat_lon = self.parser.getboolean("ML", "include_lat_lon")
116
+ self.plot_map_for_correlation_plot = self.parser.getboolean(
117
+ "ML", "plot_map_for_correlation_plot"
118
+ )
119
+ self.correlation_threshold = self.parser.getfloat("ML", "correlation_threshold")
120
+ self.include_lat_lon_as_feature = self.parser.getboolean(
121
+ "ML", "include_lat_lon_as_feature"
122
+ )
118
123
  self.spatial_autocorrelation = self.parser.getboolean(
119
124
  "ML", "spatial_autocorrelation"
120
125
  )
@@ -147,6 +152,9 @@ class Geocif:
147
152
  self.parser.get("ML", "cat_features")
148
153
  )
149
154
 
155
+ self.use_cumulative_features = self.parser.getboolean(
156
+ "DEFAULT", "use_cumulative_features"
157
+ )
150
158
  """
151
159
  ====================================================================
152
160
  Variables, Paths
@@ -192,6 +200,9 @@ class Geocif:
192
200
 
193
201
  self.db_path = self.dir_db / self.db_forecasts
194
202
 
203
+ # Store config file in database
204
+ output.config_to_db(self.db_path, self.parser, self.today)
205
+
195
206
  # self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
196
207
  # obj_pickle = outlook.Outlook(self.pickle_file)
197
208
  # self.df_outlook = obj_pickle.read_outlook_file()
@@ -218,18 +229,29 @@ class Geocif:
218
229
  y_train = df_region[target_col]
219
230
 
220
231
  if self.ml_model:
221
- self.logger.info(f"Selecting features for {self.country} {self.crop}")
222
- selector, _, self.selected_features = fs.select_features(
223
- X_train, y_train, method=self.feature_selection
224
- )
225
- self.logger.info(f"Selected features: {self.selected_features}")
232
+ if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
233
+ all_features = X_train.columns
234
+
235
+ # Select the columns with use_ceis in it
236
+ self.selected_features = [
237
+ column
238
+ for column in all_features
239
+ if any(cei in column for cei in self.use_ceis)
240
+ ]
241
+ else:
242
+ self.logger.info(f"Selecting features for {self.country} {self.crop}")
243
+ selector, _, self.selected_features = fs.select_features(
244
+ X_train, y_train, method=self.feature_selection
245
+ )
246
+ self.logger.info(f"Selected features: {self.selected_features}")
226
247
 
227
248
  """ Update model to include conformal estimates """
228
- if "lat" not in self.selected_features and self.include_lat_lon:
249
+ if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
229
250
  self.selected_features.append("lat")
230
- if "lon" not in self.selected_features and self.include_lat_lon:
251
+ if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
231
252
  self.selected_features.append("lon")
232
253
  X_train = df_region[self.selected_features + self.cat_features]
254
+
233
255
  dir_output = (
234
256
  self.dir_analysis
235
257
  / self.country
@@ -306,8 +328,38 @@ class Geocif:
306
328
  self.best_hyperparams = {}
307
329
  elif self.model_name in ["cubist"]:
308
330
  self.model.fit(X_train, y_train)
309
- except:
310
- self.logger.error(f"Error fitting model for {self.country} {self.crop}")
331
+ elif self.model_name in [
332
+ "cumulative_1",
333
+ "cumulative_2",
334
+ "cumulative_3",
335
+ ]:
336
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
337
+
338
+ # Standardize the numeric features
339
+ scaler = StandardScaler()
340
+ X_numeric = X_train.iloc[:, :3]
341
+ X_scaled_numeric = pd.DataFrame(
342
+ scaler.fit_transform(X_numeric),
343
+ columns=X_numeric.columns,
344
+ index=X_train.index,
345
+ )
346
+
347
+ # Encode the Region as categorical
348
+ le = LabelEncoder()
349
+ X_region = pd.Series(
350
+ le.fit_transform(X_train["Region"]),
351
+ name="Region",
352
+ index=X_train.index,
353
+ )
354
+
355
+ # Combine scaled numeric features and encoded region
356
+ X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
357
+
358
+ self.model.fit(X_train_scaled, y_train)
359
+ except Exception as e:
360
+ self.logger.error(
361
+ f"Error fitting model for {self.country} {self.crop} {e}"
362
+ )
311
363
 
312
364
  def predict(self, df_region, scaler=None):
313
365
  """
@@ -354,6 +406,33 @@ class Geocif:
354
406
  X_test, Z_test, clusters_test.astype("object")
355
407
  )
356
408
  best_hyperparameters = self.model.fe_model.get_params().copy()
409
+ elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
410
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
411
+
412
+ # Standardize the numeric features
413
+ scaler = StandardScaler()
414
+ X_numeric = X_test.iloc[:, :3]
415
+ try:
416
+ X_scaled_numeric = pd.DataFrame(
417
+ scaler.fit_transform(X_numeric),
418
+ columns=X_numeric.columns,
419
+ index=X_test.index,
420
+ )
421
+ except:
422
+ breakpoint()
423
+
424
+ # Encode the Region as categorical
425
+ le = LabelEncoder()
426
+ X_region = pd.Series(
427
+ le.fit_transform(X_test["Region"]),
428
+ name="Region",
429
+ index=X_test.index,
430
+ )
431
+
432
+ # Combine scaled numeric features and encoded region
433
+ X_test_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
434
+ y_pred = self.model.predict(X_test_scaled)
435
+ best_hyperparameters = {} # self.model.get_params().copy()
357
436
  elif self.model_name == "geospaNN":
358
437
  import torch
359
438
  import geospaNN
@@ -495,7 +574,9 @@ class Geocif:
495
574
  "Crop",
496
575
  "Harvest Year",
497
576
  "Stage Name",
577
+ "Time",
498
578
  ]
579
+
499
580
  df.index = df.apply(
500
581
  lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
501
582
  )
@@ -507,28 +588,37 @@ class Geocif:
507
588
 
508
589
  def create_feature_names(self, stages_features, selected_features):
509
590
  """
591
+ Create feature names for machine learning stages.
510
592
 
511
593
  Args:
512
- stages_features:
513
- selected_features:
594
+ stages_features (list): List of features for different stages.
595
+ selected_features (dict): Dictionary of selected features.
514
596
 
515
597
  Returns:
516
-
598
+ None
517
599
  """
600
+ # Assert stages_features is a list
601
+ assert isinstance(stages_features, list), "stages_features should be a list"
602
+
518
603
  # Clear out feature names
519
604
  self.feature_names = []
520
605
 
521
- """ Select stages that will be used for ML
606
+ """
607
+ Select stages that will be used for ML
522
608
  1. method = "latest" - Select the latest stage
523
609
  2. method = "fraction" - Select a fraction (1-100) of all stages
524
610
  """
611
+ method = "fraction"
612
+ if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
613
+ method = "latest"
614
+
525
615
  stages_features = stages.select_stages_for_ml(
526
- stages_features, method="fraction", n=60
616
+ stages_features, method=method, n=60
527
617
  )
528
618
 
529
619
  for stage in stages_features:
530
620
  # Convert each element of stage to str and join with _
531
- _stage = "_".join([str(x) for x in stage])
621
+ _stage = "_".join(map(str, stage))
532
622
 
533
623
  # Create a list appending _stage to each element of combined_keys
534
624
  _tmp = [f"{col}_{_stage}" for col in self.combined_keys]
@@ -537,17 +627,33 @@ class Geocif:
537
627
  parts = _t.split("_")
538
628
  cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
539
629
 
540
- # Check if any element of dict_selected_features is in _t
541
- for x in selected_features["CEI"].values:
542
- if x not in cei:
543
- continue
544
-
545
- dict_fn = stages.get_stage_information_dict(_t, self.method)
546
- tmp_col = dict_fn["CEI"] + " " + dict_fn["Stage Name"]
547
-
548
- if tmp_col in self.df_train.columns:
549
- self.feature_names.append(tmp_col)
550
-
630
+ try:
631
+ if self.model_name in [
632
+ "cumulative_1",
633
+ "cumulative_2",
634
+ "cumulative_3",
635
+ ]:
636
+ dict_fn = stages.get_stage_information_dict(_t, self.method)
637
+ tmp_col = f"{dict_fn['CEI']}"
638
+
639
+ if tmp_col in self.df_train.columns:
640
+ self.feature_names.append(tmp_col)
641
+ else:
642
+ # Check if any element of dict_selected_features is in _t
643
+ if selected_features["CEI"].any():
644
+ for x in selected_features["CEI"].values:
645
+ if x not in cei:
646
+ continue
647
+
648
+ dict_fn = stages.get_stage_information_dict(
649
+ _t, self.method
650
+ )
651
+ tmp_col = f"{dict_fn['CEI']} {dict_fn['Stage Name']}"
652
+
653
+ if tmp_col in self.df_train.columns:
654
+ self.feature_names.append(tmp_col)
655
+ except:
656
+ breakpoint()
551
657
  self.feature_names = list(set(self.feature_names))
552
658
 
553
659
  if self.median_yield_as_feature:
@@ -559,16 +665,14 @@ class Geocif:
559
665
  self.feature_names.append(f"t -{i} {self.target}")
560
666
 
561
667
  if self.analogous_year_yield_as_feature:
562
- self.feature_names.append("Analogous Year")
563
- self.feature_names.append("Analogous Year Yield")
668
+ self.feature_names.extend(["Analogous Year", "Analogous Year Yield"])
564
669
 
565
670
  if self.use_outlook_as_feature:
566
671
  self.feature_names.append("FCST")
567
672
 
568
673
  # Add lat and lon to feature names
569
- if self.include_lat_lon:
570
- self.feature_names.append("lat")
571
- self.feature_names.append("lon")
674
+ if self.include_lat_lon_as_feature:
675
+ self.feature_names.extend(["lat", "lon"])
572
676
 
573
677
  self.selected_features = []
574
678
 
@@ -592,6 +696,8 @@ class Geocif:
592
696
  for idx, region in enumerate(pbar):
593
697
  if self.model_name in ["linear", "gam"]:
594
698
  self.create_feature_names(stages, dict_best_cei[region][0:3].tolist())
699
+ elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
700
+ self.create_feature_names(stages, {})
595
701
  elif self.ml_model:
596
702
  self.create_feature_names(stages, dict_selected_features[region])
597
703
  elif self.model_name in ["median"]:
@@ -721,11 +827,52 @@ class Geocif:
721
827
  parts = all_cei_columns[-1].split("_")
722
828
  cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
723
829
 
724
- # HACK: Get feature name with GD4 in it to extract first and last stage id and name
725
- cei_column = df[df.columns[df.columns.str.contains(cei)]].columns
726
- # Select the longest string in cei_column
727
- cei_col = max(cei_column, key=len)
728
- self.stage_info = stages.get_stage_information_dict(cei_col, self.method)
830
+ # For each region, find the column with the longest string in cei_column
831
+ group_by = ["Region"]
832
+ groups = df.groupby(group_by)
833
+ if self.use_cumulative_features:
834
+ frames = []
835
+ for name, group in groups:
836
+ # Drop columns with all NaNs
837
+ group.dropna(axis=1, how="all", inplace=True)
838
+
839
+ cei_column = group[
840
+ group.columns[group.columns.str.contains(cei)]
841
+ ].columns
842
+ max_cei_col = max(cei_column, key=len)
843
+ self.stage_info = stages.get_stage_information_dict(
844
+ max_cei_col, self.method
845
+ )
846
+
847
+ # Subset dataframes to columns that contain self.stage_info["Stage_ID"]
848
+ all_columns = group.columns[
849
+ group.columns.str.contains(self.stage_info["Stage_ID"])
850
+ ].tolist()
851
+
852
+ group = group[
853
+ self.fixed_columns
854
+ + [self.target]
855
+ + self.statistics_columns
856
+ + all_columns
857
+ ]
858
+ # rename all_columns to self.stage_info["CEI"]
859
+ group.rename(
860
+ columns={
861
+ col: stages.get_stage_information_dict(col, self.method)["CEI"]
862
+ for col in all_columns
863
+ },
864
+ inplace=True,
865
+ )
866
+
867
+ frames.append(group)
868
+
869
+ df = pd.concat(frames)
870
+ else:
871
+ # HACK: Get feature name with GD4 in it to extract first and last stage id and name
872
+ cei_column = df[df.columns[df.columns.str.contains(cei)]].columns
873
+ # Select the longest string in cei_column
874
+ cei_col = max(cei_column, key=len)
875
+ self.stage_info = stages.get_stage_information_dict(cei_col, self.method)
729
876
 
730
877
  # Change column name
731
878
  # e.g. 'vDTR_7_6_5_4_3_2_1_37_36_35_34_33_32_31' to 'vDTR Mar 1-Oct 27'
@@ -789,12 +936,14 @@ class Geocif:
789
936
 
790
937
  mask = self.df_results["Stage_ID"].isin(_stages)
791
938
  df = self.df_results[mask]
792
-
793
939
  """ Select which CEI categories to use for ML """
794
940
  if "all" in self.use_ceis:
795
941
  pass
796
942
  else:
797
- df = df[df["Type"].isin(self.use_ceis)]
943
+ if self.select_cei_by == "Type":
944
+ df = df[df["Type"].isin(self.use_ceis)]
945
+ elif self.select_cei_by == "Index":
946
+ df = df[df["Index"].isin(self.use_ceis)]
798
947
 
799
948
  """ Convert this dataframe into an ML ready format and save to disk """
800
949
  df = self.create_ml_dataframe(df)
@@ -859,12 +1008,17 @@ class Geocif:
859
1008
  dict_kwargs["method"] = self.method
860
1009
  dict_kwargs["national_correlation"] = self.national_correlation
861
1010
  dict_kwargs["groupby"] = self.correlation_plot_groupby
1011
+ dict_kwargs["cluster_strategy"] = self.cluster_strategy
862
1012
  dict_kwargs["dg_country"] = self.dg_country
863
1013
  dict_kwargs["combined_dict"] = self.combined_dict
1014
+ dict_kwargs["plot_map"] = self.plot_map_for_correlation_plot
1015
+ dict_kwargs["correlation_threshold"] = self.correlation_threshold
864
1016
 
865
1017
  if self.spatial_autocorrelation:
866
1018
  sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)
867
1019
 
1020
+ dict_selected_features = {}
1021
+ dict_best_cei = {}
868
1022
  if self.correlation_plots:
869
1023
  self.logger.info(f"Correlation plot for {self.country} {self.crop}")
870
1024
  (
@@ -940,6 +1094,8 @@ class Geocif:
940
1094
  self.model_name = model
941
1095
  self.experiment_name = self.parser.get("ML", "experiment_name")
942
1096
  self.ml_model = self.parser.getboolean(self.model_name, "ML_model")
1097
+ self.select_cei_by = self.parser.get(self.model_name, "select_cei_by")
1098
+ self.use_ceis = ast.literal_eval(self.parser.get(self.model_name, "use_ceis"))
943
1099
  self.model_names = ast.literal_eval(self.parser.get(self.country, "models"))
944
1100
  self.optimize = self.parser.getboolean(self.country, "optimize")
945
1101
  self.fraction_loocv = self.parser.getfloat(self.country, "fraction_loocv")
@@ -951,6 +1107,21 @@ class Geocif:
951
1107
  self.estimate_ci = False
952
1108
  self.check_yield_trend = False
953
1109
  self.estimate_ci_for_all = False
1110
+ elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
1111
+ self.correlation_plots = False
1112
+ self.lag_yield_as_feature = False
1113
+ self.median_yield_as_feature = False
1114
+ self.median_area_as_feature = False
1115
+ self.analogous_year_yield_as_feature = False
1116
+ self.last_year_yield_as_feature = False
1117
+ self.include_lat_lon_as_feature = False
1118
+ self.do_xai = False
1119
+ self.estimate_ci = False
1120
+ self.estimate_ci_for_all = False
1121
+ self.check_yield_trend = False
1122
+ self.cluster_strategy = "single"
1123
+ self.select_cei_by = "Index"
1124
+ self.use_cumulative_features = True
954
1125
  else:
955
1126
  self.do_xai = self.parser.getboolean("ML", "do_xai")
956
1127
  self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
@@ -173,8 +173,8 @@ class cei_runner(base.BaseGeo):
173
173
  or "south_africa_maize" in i[3]
174
174
  or "mozambique_maize" in i[3]
175
175
  or "united_states_of_america" in i[3]
176
- or "russian_federation" in i[3]
177
- or "ukraine" in i[3]
176
+ or "russian_federation" in i[3]
177
+ or "ukraine" in i[3]
178
178
  ]
179
179
  # "malawi" in i[2]]
180
180
 
@@ -47,7 +47,7 @@ class cei_runner(base.BaseGeo):
47
47
 
48
48
  self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
49
49
  self.base_dir = Path(
50
- r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\afghanistan"
50
+ r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\illinois"
51
51
  ) # Path(self.parser.get("PATHS", "dir_crop_inputs"))
52
52
  self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
53
53
 
@@ -164,7 +164,7 @@ class cei_runner(base.BaseGeo):
164
164
  # Only keep those entries in combinations where the third elemt is
165
165
  # mozambique, south_africa, angola or dem_people's_rep_of_korea
166
166
  # This is done to test the code for these countries
167
- combinations = [i for i in combinations if "afghanistan_maize_s1" in i[3]]
167
+ combinations = [i for i in combinations if "illinois_maize_s1" in i[3]]
168
168
 
169
169
  if True:
170
170
  num_cpu = int(cpu_count() * 0.5)
@@ -74,18 +74,24 @@ def plot_feature_corr_by_time(df, **kwargs):
74
74
  forecast_season = kwargs.get("forecast_season")
75
75
  national_correlation = kwargs.get("national_correlation")
76
76
  group_by = kwargs.get("groupby")
77
+ plot_map = kwargs.get("plot_map")
78
+ region_name = kwargs.get("region_name")
77
79
 
78
80
  # Setup the figure and gridspec
79
81
  fig = plt.figure(figsize=(10, 5))
80
- gs = fig.add_gridspec(
81
- 3, 2, height_ratios=[6, 5, 1], width_ratios=[5, 1.5], hspace=0.6, wspace=0.0
82
- )
82
+ if plot_map:
83
+ gs = fig.add_gridspec(
84
+ 3, 2, height_ratios=[6, 5, 1], width_ratios=[5, 1.5], hspace=0.6, wspace=0.0
85
+ )
86
+ else:
87
+ gs = fig.add_gridspec(3, 1, height_ratios=[6, 5, 1], hspace=0.6, wspace=0.0)
83
88
 
84
89
  # Assign subplots
85
90
  ax_heatmap = fig.add_subplot(gs[0:2, 0])
86
- ax_map = fig.add_subplot(gs[0, 1])
87
91
  cbar_ax = fig.add_subplot(gs[2, 0])
88
- ax4 = fig.add_subplot(gs[2, 1])
92
+ if plot_map:
93
+ ax_map = fig.add_subplot(gs[0, 1])
94
+ ax4 = fig.add_subplot(gs[2, 1])
89
95
 
90
96
  # Transpose and reverse the columns of the dataframe
91
97
  df_transpose = df.T
@@ -107,43 +113,43 @@ def plot_feature_corr_by_time(df, **kwargs):
107
113
  )
108
114
  ax_heatmap.tick_params(left=False, bottom=False)
109
115
 
110
- # Plot the map using GeoPandas
111
- dg_country = kwargs.get("dg_country")
112
-
113
- ax_map = dg_country.plot(
114
- ax=ax_map,
115
- color="white",
116
- edgecolor="black",
117
- linewidth=1.0,
118
- facecolor=None,
119
- legend=False,
120
- )
121
-
122
- if not national_correlation:
123
- id = kwargs["region_id"]
124
- dg_region = dg_country[dg_country[group_by] == id]
125
- ax_map = dg_region.plot(
126
- ax=ax_map, color="blue", edgecolor="blue", linewidth=1.0, legend=False
116
+ if plot_map:
117
+ # Plot the map using GeoPandas
118
+ dg_country = kwargs.get("dg_country")
119
+
120
+ ax_map = dg_country.plot(
121
+ ax=ax_map,
122
+ color="white",
123
+ edgecolor="black",
124
+ linewidth=1.0,
125
+ facecolor=None,
126
+ legend=False,
127
127
  )
128
- # Set title with color blue
129
- ax_map.set_title(f"Region: {id}", color="blue")
130
-
131
- # No colorbar for the map
132
- ax_map.axis("off")
133
- # Remove borders
134
- ax_map.spines["top"].set_visible(False)
135
- ax_map.spines["right"].set_visible(False)
136
- ax_map.spines["bottom"].set_visible(False)
137
- ax_map.spines["left"].set_visible(False)
138
- # ax4 should not be visible
139
- ax4.axis("off")
128
+
129
+ id = kwargs["region_id"]
130
+ if plot_map:
131
+ if not national_correlation:
132
+ dg_region = dg_country[dg_country[group_by] == id]
133
+ ax_map = dg_region.plot(
134
+ ax=ax_map, color="blue", edgecolor="blue", linewidth=1.0, legend=False
135
+ )
136
+ # Set title with color blue
137
+ ax_map.set_title(f"Region: {id}", color="blue")
138
+
139
+ # No colorbar for the map
140
+ ax_map.axis("off")
141
+ # Remove borders
142
+ ax_map.spines["top"].set_visible(False)
143
+ ax_map.spines["right"].set_visible(False)
144
+ ax_map.spines["bottom"].set_visible(False)
145
+ ax_map.spines["left"].set_visible(False)
146
+ # ax4 should not be visible
147
+ ax4.axis("off")
140
148
 
141
149
  # Add colorbar label
142
150
  # cbar_ax.set_xlabel("Correlation Coefficient", labelpad=3, size="small")
143
151
  cbar_ax.set_title("Correlation Coefficient", loc="left", size="small")
144
- ax_heatmap.set_xticklabels(
145
- ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=5
146
- )
152
+ ax_heatmap.set_xticklabels(ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=5)
147
153
  ax_heatmap.set_yticklabels(ax_heatmap.get_yticklabels(), size="x-small", fontsize=5)
148
154
  ax_heatmap.set_xlabel("")
149
155
  ax_heatmap.set_ylabel(" ")
@@ -151,12 +157,13 @@ def plot_feature_corr_by_time(df, **kwargs):
151
157
  cbar_ax.tick_params(axis="both", which="major", labelsize=5)
152
158
 
153
159
  _country = country.title().replace("_", " ")
160
+ _region_name = region_name if not national_correlation else ""
154
161
  _crop = crop.title().replace("_", " ")
155
162
  if not national_correlation:
156
163
  fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
157
164
  else:
158
165
  fname = f"{country}_{crop}_corr_feature_by_time.png"
159
- ax_heatmap.set_title(f"{_country}\n{_crop}")
166
+ ax_heatmap.set_title(f"{_country}, {_crop}\n{_region_name}")
160
167
 
161
168
  # plt.tight_layout()
162
169
  os.makedirs(dir_output, exist_ok=True)
@@ -246,14 +253,14 @@ def all_correlated_feature_by_time(df, **kwargs):
246
253
  Returns:
247
254
 
248
255
  """
249
- THRESHOLD = 0.1
250
256
  national_correlation = kwargs.get("national_correlation")
251
257
  group_by = kwargs.get("groupby")
252
258
  combined_dict = kwargs.get("combined_dict")
259
+ THRESHOLD = kwargs.get("correlation_threshold")
253
260
 
254
261
  dict_selected_features = {}
255
262
  dict_best_cei = {}
256
- breakpoint()
263
+
257
264
  if not national_correlation:
258
265
  groups = df.groupby(group_by)
259
266
  for region_id, group in tqdm(
@@ -297,6 +304,8 @@ def all_correlated_feature_by_time(df, **kwargs):
297
304
  )
298
305
 
299
306
  kwargs["region_id"] = region_id
307
+ _region_names = ", ".join([str(x) for x in group['Region'].unique()])
308
+ kwargs["region_name"] = _region_names
300
309
  plot_feature_corr_by_time(df_tmp, **kwargs)
301
310
  # For each element in dict_best_cei, add the type of the cei
302
311
  else: