geocif 0.1.46__tar.gz → 0.1.48__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. {geocif-0.1.46/geocif.egg-info → geocif-0.1.48}/PKG-INFO +1 -1
  2. {geocif-0.1.46 → geocif-0.1.48}/geocif/analysis.py +7 -5
  3. {geocif-0.1.46 → geocif-0.1.48}/geocif/experiments.py +3 -9
  4. {geocif-0.1.46 → geocif-0.1.48}/geocif/geocif.py +206 -44
  5. {geocif-0.1.46 → geocif-0.1.48}/geocif/indices_runner.py +2 -2
  6. {geocif-0.1.46 → geocif-0.1.48}/geocif/indices_runner_v2.py +2 -2
  7. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/correlations.py +3 -3
  8. geocif-0.1.48/geocif/ml/misc.py +33 -0
  9. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/output.py +0 -2
  10. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/stages.py +18 -9
  11. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/trainers.py +39 -2
  12. {geocif-0.1.46 → geocif-0.1.48/geocif.egg-info}/PKG-INFO +1 -1
  13. {geocif-0.1.46 → geocif-0.1.48}/geocif.egg-info/SOURCES.txt +1 -1
  14. {geocif-0.1.46 → geocif-0.1.48}/setup.py +1 -1
  15. geocif-0.1.46/geocif/ml/correlations_backup.py +0 -412
  16. {geocif-0.1.46 → geocif-0.1.48}/LICENSE +0 -0
  17. {geocif-0.1.46 → geocif-0.1.48}/MANIFEST.in +0 -0
  18. {geocif-0.1.46 → geocif-0.1.48}/README.md +0 -0
  19. {geocif-0.1.46 → geocif-0.1.48}/geocif/__init__.py +0 -0
  20. {geocif-0.1.46 → geocif-0.1.48}/geocif/agmet/__init__.py +0 -0
  21. {geocif-0.1.46 → geocif-0.1.48}/geocif/agmet/geoagmet.py +0 -0
  22. {geocif-0.1.46 → geocif-0.1.48}/geocif/agmet/plot.py +0 -0
  23. {geocif-0.1.46 → geocif-0.1.48}/geocif/agmet/utils.py +0 -0
  24. {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/__init__.py +0 -0
  25. {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/constants.py +0 -0
  26. {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/features.py +0 -0
  27. {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/geo.py +0 -0
  28. {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/geocif.py +0 -0
  29. {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/metadata.py +0 -0
  30. {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/models.py +0 -0
  31. {geocif-0.1.46 → geocif-0.1.48}/geocif/cei/__init__.py +0 -0
  32. {geocif-0.1.46 → geocif-0.1.48}/geocif/cei/definitions.py +0 -0
  33. {geocif-0.1.46 → geocif-0.1.48}/geocif/cei/indices.py +0 -0
  34. {geocif-0.1.46 → geocif-0.1.48}/geocif/logger.py +0 -0
  35. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/__init__.py +0 -0
  36. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/embedding.py +0 -0
  37. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/feature_engineering.py +0 -0
  38. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/feature_selection.py +0 -0
  39. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/outliers.py +0 -0
  40. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/outlook.py +0 -0
  41. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/spatial_autocorrelation.py +0 -0
  42. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/stats.py +0 -0
  43. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/trend.py +0 -0
  44. {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/xai.py +0 -0
  45. {geocif-0.1.46 → geocif-0.1.48}/geocif/playground/__init__.py +0 -0
  46. {geocif-0.1.46 → geocif-0.1.48}/geocif/playground/automl.py +0 -0
  47. {geocif-0.1.46 → geocif-0.1.48}/geocif/playground/misc.py +0 -0
  48. {geocif-0.1.46 → geocif-0.1.48}/geocif/utils.py +0 -0
  49. {geocif-0.1.46 → geocif-0.1.48}/geocif/viz/__init__.py +0 -0
  50. {geocif-0.1.46 → geocif-0.1.48}/geocif/viz/plot.py +0 -0
  51. {geocif-0.1.46 → geocif-0.1.48}/geocif.egg-info/dependency_links.txt +0 -0
  52. {geocif-0.1.46 → geocif-0.1.48}/geocif.egg-info/not-zip-safe +0 -0
  53. {geocif-0.1.46 → geocif-0.1.48}/geocif.egg-info/top_level.txt +0 -0
  54. {geocif-0.1.46 → geocif-0.1.48}/requirements.txt +0 -0
  55. {geocif-0.1.46 → geocif-0.1.48}/setup.cfg +0 -0
  56. {geocif-0.1.46 → geocif-0.1.48}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.1.46
3
+ Version: 0.1.48
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -162,8 +162,8 @@ class Geoanalysis:
162
162
  return pd.DataFrame(), pd.DataFrame()
163
163
 
164
164
  df_metrics = self._compute_metrics(df)
165
- # df_metrics = self._process_metrics(df_metrics)
166
- # self._plot_metrics(df_metrics)
165
+ df_metrics = self._process_metrics(df_metrics)
166
+ self._plot_metrics(df_metrics)
167
167
 
168
168
  df_regional_metrics_by_year = self._compute_regional_metrics(
169
169
  df, by="Harvest Year"
@@ -172,8 +172,10 @@ class Geoanalysis:
172
172
  df_regional_metrics_by_year
173
173
  )
174
174
  df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
175
- breakpoint()
176
- self._store_results(None, df_regional_metrics, df_regional_metrics_by_year)
175
+
176
+ self._store_results(
177
+ df_metrics, df_regional_metrics, df_regional_metrics_by_year
178
+ )
177
179
 
178
180
  df_national_yield = self._compute_national_yield(df)
179
181
  self._plot_national_yield(df_national_yield)
@@ -193,7 +195,7 @@ class Geoanalysis:
193
195
  .apply(self.annual_metrics)
194
196
  .reset_index()
195
197
  )
196
- breakpoint()
198
+
197
199
  return df_metrics.pivot_table(
198
200
  index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
199
201
  columns="level_5",
@@ -85,9 +85,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
85
85
 
86
86
  # Experiment: lag_years
87
87
  logger.info("Experiment 3: lag_years")
88
- parser = main(
89
- inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5]
90
- )
88
+ parser = main(inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5])
91
89
 
92
90
  # Experiment: lag_yield_as_feature
93
91
  logger.info("Experiment 4: lag_yield_as_feature")
@@ -103,9 +101,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
103
101
 
104
102
  # Experiment: median_years
105
103
  logger.info("Experiment 5: median_years")
106
- parser = main(
107
- inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5]
108
- )
104
+ parser = main(inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5])
109
105
 
110
106
  # Experiment: median_yield_as_feature
111
107
  logger.info("Experiment 6: median_yield_as_feature")
@@ -133,9 +129,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
133
129
 
134
130
  # Experiment: optimize
135
131
  logger.info("Experiment 8: optimize")
136
- parser = main(
137
- inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False]
138
- )
132
+ parser = main(inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False])
139
133
 
140
134
 
141
135
  if __name__ == "__main__":
@@ -11,7 +11,6 @@ import geopandas as gp
11
11
  import matplotlib.pyplot as plt
12
12
  import numpy as np
13
13
  import pandas as pd
14
- import sklearn
15
14
  from tqdm import tqdm
16
15
 
17
16
  from geocif import logger as log
@@ -28,7 +27,6 @@ from .ml import trend
28
27
  from .ml import xai
29
28
 
30
29
  plt.style.use("default")
31
- sklearn.set_config(transform_output="pandas")
32
30
 
33
31
  import warnings
34
32
 
@@ -108,7 +106,6 @@ class Geocif:
108
106
  Config file: ML
109
107
  ====================================================================
110
108
  """
111
- self.use_ceis = ast.literal_eval(self.parser.get("ML", "use_ceis"))
112
109
  self.model_type = self.parser.get("ML", "model_type")
113
110
  self.fraction_simulate = self.parser.getint("ML", "fraction_simulate")
114
111
  self.analogous_year_yield_as_feature = self.parser.getboolean(
@@ -117,10 +114,10 @@ class Geocif:
117
114
  self.plot_map_for_correlation_plot = self.parser.getboolean(
118
115
  "ML", "plot_map_for_correlation_plot"
119
116
  )
120
- self.correlation_threshold = self.parser.getfloat(
121
- "ML", "correlation_threshold"
117
+ self.correlation_threshold = self.parser.getfloat("ML", "correlation_threshold")
118
+ self.include_lat_lon_as_feature = self.parser.getboolean(
119
+ "ML", "include_lat_lon_as_feature"
122
120
  )
123
- self.include_lat_lon = self.parser.getboolean("ML", "include_lat_lon")
124
121
  self.spatial_autocorrelation = self.parser.getboolean(
125
122
  "ML", "spatial_autocorrelation"
126
123
  )
@@ -153,6 +150,9 @@ class Geocif:
153
150
  self.parser.get("ML", "cat_features")
154
151
  )
155
152
 
153
+ self.use_cumulative_features = self.parser.getboolean(
154
+ "DEFAULT", "use_cumulative_features"
155
+ )
156
156
  """
157
157
  ====================================================================
158
158
  Variables, Paths
@@ -198,6 +198,9 @@ class Geocif:
198
198
 
199
199
  self.db_path = self.dir_db / self.db_forecasts
200
200
 
201
+ # Store config file in database
202
+ output.config_to_db(self.db_path, self.parser, self.today)
203
+
201
204
  # self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
202
205
  # obj_pickle = outlook.Outlook(self.pickle_file)
203
206
  # self.df_outlook = obj_pickle.read_outlook_file()
@@ -224,18 +227,29 @@ class Geocif:
224
227
  y_train = df_region[target_col]
225
228
 
226
229
  if self.ml_model:
227
- self.logger.info(f"Selecting features for {self.country} {self.crop}")
228
- selector, _, self.selected_features = fs.select_features(
229
- X_train, y_train, method=self.feature_selection
230
- )
231
- self.logger.info(f"Selected features: {self.selected_features}")
230
+ if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
231
+ all_features = X_train.columns
232
+
233
+ # Select the columns with use_ceis in it
234
+ self.selected_features = [
235
+ column
236
+ for column in all_features
237
+ if any(cei in column for cei in self.use_ceis)
238
+ ]
239
+ else:
240
+ self.logger.info(f"Selecting features for {self.country} {self.crop}")
241
+ selector, _, self.selected_features = fs.select_features(
242
+ X_train, y_train, method=self.feature_selection
243
+ )
244
+ self.logger.info(f"Selected features: {self.selected_features}")
232
245
 
233
246
  """ Update model to include conformal estimates """
234
- if "lat" not in self.selected_features and self.include_lat_lon:
247
+ if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
235
248
  self.selected_features.append("lat")
236
- if "lon" not in self.selected_features and self.include_lat_lon:
249
+ if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
237
250
  self.selected_features.append("lon")
238
251
  X_train = df_region[self.selected_features + self.cat_features]
252
+
239
253
  dir_output = (
240
254
  self.dir_analysis
241
255
  / self.country
@@ -288,6 +302,8 @@ class Geocif:
288
302
  verbose=False,
289
303
  # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
290
304
  )
305
+ elif self.model_name == "oblique":
306
+ self.model.fit(X_train, y_train)
291
307
  elif self.model_name == "geospaNN":
292
308
  self.model.fit(
293
309
  X_train,
@@ -312,8 +328,38 @@ class Geocif:
312
328
  self.best_hyperparams = {}
313
329
  elif self.model_name in ["cubist"]:
314
330
  self.model.fit(X_train, y_train)
315
- except:
316
- self.logger.error(f"Error fitting model for {self.country} {self.crop}")
331
+ elif self.model_name in [
332
+ "cumulative_1",
333
+ "cumulative_2",
334
+ "cumulative_3",
335
+ ]:
336
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
337
+
338
+ # Standardize the numeric features
339
+ scaler = StandardScaler()
340
+ X_numeric = X_train.iloc[:, :3]
341
+ X_scaled_numeric = pd.DataFrame(
342
+ scaler.fit_transform(X_numeric),
343
+ columns=X_numeric.columns,
344
+ index=X_train.index,
345
+ )
346
+
347
+ # Encode the Region as categorical
348
+ le = LabelEncoder()
349
+ X_region = pd.Series(
350
+ le.fit_transform(X_train["Region"]),
351
+ name="Region",
352
+ index=X_train.index,
353
+ )
354
+
355
+ # Combine scaled numeric features and encoded region
356
+ X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
357
+
358
+ self.model.fit(X_train_scaled, y_train)
359
+ except Exception as e:
360
+ self.logger.error(
361
+ f"Error fitting model for {self.country} {self.crop} {e}"
362
+ )
317
363
 
318
364
  def predict(self, df_region, scaler=None):
319
365
  """
@@ -360,6 +406,33 @@ class Geocif:
360
406
  X_test, Z_test, clusters_test.astype("object")
361
407
  )
362
408
  best_hyperparameters = self.model.fe_model.get_params().copy()
409
+ elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
410
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
411
+
412
+ # Standardize the numeric features
413
+ scaler = StandardScaler()
414
+ X_numeric = X_test.iloc[:, :3]
415
+ try:
416
+ X_scaled_numeric = pd.DataFrame(
417
+ scaler.fit_transform(X_numeric),
418
+ columns=X_numeric.columns,
419
+ index=X_test.index,
420
+ )
421
+ except:
422
+ breakpoint()
423
+
424
+ # Encode the Region as categorical
425
+ le = LabelEncoder()
426
+ X_region = pd.Series(
427
+ le.fit_transform(X_test["Region"]),
428
+ name="Region",
429
+ index=X_test.index,
430
+ )
431
+
432
+ # Combine scaled numeric features and encoded region
433
+ X_test_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
434
+ y_pred = self.model.predict(X_test_scaled)
435
+ best_hyperparameters = {} # self.model.get_params().copy()
363
436
  elif self.model_name == "geospaNN":
364
437
  import torch
365
438
  import geospaNN
@@ -501,7 +574,9 @@ class Geocif:
501
574
  "Crop",
502
575
  "Harvest Year",
503
576
  "Stage Name",
577
+ "Time",
504
578
  ]
579
+
505
580
  df.index = df.apply(
506
581
  lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
507
582
  )
@@ -513,28 +588,37 @@ class Geocif:
513
588
 
514
589
  def create_feature_names(self, stages_features, selected_features):
515
590
  """
591
+ Create feature names for machine learning stages.
516
592
 
517
593
  Args:
518
- stages_features:
519
- selected_features:
594
+ stages_features (list): List of features for different stages.
595
+ selected_features (dict): Dictionary of selected features.
520
596
 
521
597
  Returns:
522
-
598
+ None
523
599
  """
600
+ # Assert stages_features is a list
601
+ assert isinstance(stages_features, list), "stages_features should be a list"
602
+
524
603
  # Clear out feature names
525
604
  self.feature_names = []
526
605
 
527
- """ Select stages that will be used for ML
606
+ """
607
+ Select stages that will be used for ML
528
608
  1. method = "latest" - Select the latest stage
529
609
  2. method = "fraction" - Select a fraction (1-100) of all stages
530
610
  """
611
+ method = "fraction"
612
+ if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
613
+ method = "latest"
614
+
531
615
  stages_features = stages.select_stages_for_ml(
532
- stages_features, method="fraction", n=60
616
+ stages_features, method=method, n=60
533
617
  )
534
618
 
535
619
  for stage in stages_features:
536
620
  # Convert each element of stage to str and join with _
537
- _stage = "_".join([str(x) for x in stage])
621
+ _stage = "_".join(map(str, stage))
538
622
 
539
623
  # Create a list appending _stage to each element of combined_keys
540
624
  _tmp = [f"{col}_{_stage}" for col in self.combined_keys]
@@ -543,17 +627,33 @@ class Geocif:
543
627
  parts = _t.split("_")
544
628
  cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
545
629
 
546
- # Check if any element of dict_selected_features is in _t
547
- for x in selected_features["CEI"].values:
548
- if x not in cei:
549
- continue
550
-
551
- dict_fn = stages.get_stage_information_dict(_t, self.method)
552
- tmp_col = dict_fn["CEI"] + " " + dict_fn["Stage Name"]
553
-
554
- if tmp_col in self.df_train.columns:
555
- self.feature_names.append(tmp_col)
556
-
630
+ try:
631
+ if self.model_name in [
632
+ "cumulative_1",
633
+ "cumulative_2",
634
+ "cumulative_3",
635
+ ]:
636
+ dict_fn = stages.get_stage_information_dict(_t, self.method)
637
+ tmp_col = f"{dict_fn['CEI']}"
638
+
639
+ if tmp_col in self.df_train.columns:
640
+ self.feature_names.append(tmp_col)
641
+ else:
642
+ # Check if any element of dict_selected_features is in _t
643
+ if selected_features["CEI"].any():
644
+ for x in selected_features["CEI"].values:
645
+ if x not in cei:
646
+ continue
647
+
648
+ dict_fn = stages.get_stage_information_dict(
649
+ _t, self.method
650
+ )
651
+ tmp_col = f"{dict_fn['CEI']} {dict_fn['Stage Name']}"
652
+
653
+ if tmp_col in self.df_train.columns:
654
+ self.feature_names.append(tmp_col)
655
+ except:
656
+ breakpoint()
557
657
  self.feature_names = list(set(self.feature_names))
558
658
 
559
659
  if self.median_yield_as_feature:
@@ -565,16 +665,14 @@ class Geocif:
565
665
  self.feature_names.append(f"t -{i} {self.target}")
566
666
 
567
667
  if self.analogous_year_yield_as_feature:
568
- self.feature_names.append("Analogous Year")
569
- self.feature_names.append("Analogous Year Yield")
668
+ self.feature_names.extend(["Analogous Year", "Analogous Year Yield"])
570
669
 
571
670
  if self.use_outlook_as_feature:
572
671
  self.feature_names.append("FCST")
573
672
 
574
673
  # Add lat and lon to feature names
575
- if self.include_lat_lon:
576
- self.feature_names.append("lat")
577
- self.feature_names.append("lon")
674
+ if self.include_lat_lon_as_feature:
675
+ self.feature_names.extend(["lat", "lon"])
578
676
 
579
677
  self.selected_features = []
580
678
 
@@ -598,6 +696,8 @@ class Geocif:
598
696
  for idx, region in enumerate(pbar):
599
697
  if self.model_name in ["linear", "gam"]:
600
698
  self.create_feature_names(stages, dict_best_cei[region][0:3].tolist())
699
+ elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
700
+ self.create_feature_names(stages, {})
601
701
  elif self.ml_model:
602
702
  self.create_feature_names(stages, dict_selected_features[region])
603
703
  elif self.model_name in ["median"]:
@@ -727,11 +827,52 @@ class Geocif:
727
827
  parts = all_cei_columns[-1].split("_")
728
828
  cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
729
829
 
730
- # HACK: Get feature name with GD4 in it to extract first and last stage id and name
731
- cei_column = df[df.columns[df.columns.str.contains(cei)]].columns
732
- # Select the longest string in cei_column
733
- cei_col = max(cei_column, key=len)
734
- self.stage_info = stages.get_stage_information_dict(cei_col, self.method)
830
+ # For each region, find the column with the longest string in cei_column
831
+ group_by = ["Region"]
832
+ groups = df.groupby(group_by)
833
+ if self.use_cumulative_features:
834
+ frames = []
835
+ for name, group in groups:
836
+ # Drop columns with all NaNs
837
+ group.dropna(axis=1, how="all", inplace=True)
838
+
839
+ cei_column = group[
840
+ group.columns[group.columns.str.contains(cei)]
841
+ ].columns
842
+ max_cei_col = max(cei_column, key=len)
843
+ self.stage_info = stages.get_stage_information_dict(
844
+ max_cei_col, self.method
845
+ )
846
+
847
+ # Subset dataframes to columns that contain self.stage_info["Stage_ID"]
848
+ all_columns = group.columns[
849
+ group.columns.str.contains(self.stage_info["Stage_ID"])
850
+ ].tolist()
851
+
852
+ group = group[
853
+ self.fixed_columns
854
+ + [self.target]
855
+ + self.statistics_columns
856
+ + all_columns
857
+ ]
858
+ # rename all_columns to self.stage_info["CEI"]
859
+ group.rename(
860
+ columns={
861
+ col: stages.get_stage_information_dict(col, self.method)["CEI"]
862
+ for col in all_columns
863
+ },
864
+ inplace=True,
865
+ )
866
+
867
+ frames.append(group)
868
+
869
+ df = pd.concat(frames)
870
+ else:
871
+ # HACK: Get feature name with GD4 in it to extract first and last stage id and name
872
+ cei_column = df[df.columns[df.columns.str.contains(cei)]].columns
873
+ # Select the longest string in cei_column
874
+ cei_col = max(cei_column, key=len)
875
+ self.stage_info = stages.get_stage_information_dict(cei_col, self.method)
735
876
 
736
877
  # Change column name
737
878
  # e.g. 'vDTR_7_6_5_4_3_2_1_37_36_35_34_33_32_31' to 'vDTR Mar 1-Oct 27'
@@ -795,12 +936,14 @@ class Geocif:
795
936
 
796
937
  mask = self.df_results["Stage_ID"].isin(_stages)
797
938
  df = self.df_results[mask]
798
-
799
939
  """ Select which CEI categories to use for ML """
800
940
  if "all" in self.use_ceis:
801
941
  pass
802
942
  else:
803
- df = df[df["Type"].isin(self.use_ceis)]
943
+ if self.select_cei_by == "Type":
944
+ df = df[df["Type"].isin(self.use_ceis)]
945
+ elif self.select_cei_by == "Index":
946
+ df = df[df["Index"].isin(self.use_ceis)]
804
947
 
805
948
  """ Convert this dataframe into an ML ready format and save to disk """
806
949
  df = self.create_ml_dataframe(df)
@@ -874,6 +1017,8 @@ class Geocif:
874
1017
  if self.spatial_autocorrelation:
875
1018
  sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)
876
1019
 
1020
+ dict_selected_features = {}
1021
+ dict_best_cei = {}
877
1022
  if self.correlation_plots:
878
1023
  self.logger.info(f"Correlation plot for {self.country} {self.crop}")
879
1024
  (
@@ -949,6 +1094,8 @@ class Geocif:
949
1094
  self.model_name = model
950
1095
  self.experiment_name = self.parser.get("ML", "experiment_name")
951
1096
  self.ml_model = self.parser.getboolean(self.model_name, "ML_model")
1097
+ self.select_cei_by = self.parser.get(self.model_name, "select_cei_by")
1098
+ self.use_ceis = ast.literal_eval(self.parser.get(self.model_name, "use_ceis"))
952
1099
  self.model_names = ast.literal_eval(self.parser.get(self.country, "models"))
953
1100
  self.optimize = self.parser.getboolean(self.country, "optimize")
954
1101
  self.fraction_loocv = self.parser.getfloat(self.country, "fraction_loocv")
@@ -960,6 +1107,21 @@ class Geocif:
960
1107
  self.estimate_ci = False
961
1108
  self.check_yield_trend = False
962
1109
  self.estimate_ci_for_all = False
1110
+ elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
1111
+ self.correlation_plots = False
1112
+ self.lag_yield_as_feature = False
1113
+ self.median_yield_as_feature = False
1114
+ self.median_area_as_feature = False
1115
+ self.analogous_year_yield_as_feature = False
1116
+ self.last_year_yield_as_feature = False
1117
+ self.include_lat_lon_as_feature = False
1118
+ self.do_xai = False
1119
+ self.estimate_ci = False
1120
+ self.estimate_ci_for_all = False
1121
+ self.check_yield_trend = False
1122
+ self.cluster_strategy = "single"
1123
+ self.select_cei_by = "Index"
1124
+ self.use_cumulative_features = True
963
1125
  else:
964
1126
  self.do_xai = self.parser.getboolean("ML", "do_xai")
965
1127
  self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
@@ -173,8 +173,8 @@ class cei_runner(base.BaseGeo):
173
173
  or "south_africa_maize" in i[3]
174
174
  or "mozambique_maize" in i[3]
175
175
  or "united_states_of_america" in i[3]
176
- or "russian_federation" in i[3]
177
- or "ukraine" in i[3]
176
+ or "russian_federation" in i[3]
177
+ or "ukraine" in i[3]
178
178
  ]
179
179
  # "malawi" in i[2]]
180
180
 
@@ -47,7 +47,7 @@ class cei_runner(base.BaseGeo):
47
47
 
48
48
  self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
49
49
  self.base_dir = Path(
50
- r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\afghanistan"
50
+ r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\illinois"
51
51
  ) # Path(self.parser.get("PATHS", "dir_crop_inputs"))
52
52
  self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
53
53
 
@@ -164,7 +164,7 @@ class cei_runner(base.BaseGeo):
164
164
  # Only keep those entries in combinations where the third elemt is
165
165
  # mozambique, south_africa, angola or dem_people's_rep_of_korea
166
166
  # This is done to test the code for these countries
167
- combinations = [i for i in combinations if "afghanistan_maize_s1" in i[3]]
167
+ combinations = [i for i in combinations if "illinois_maize_s1" in i[3]]
168
168
 
169
169
  if True:
170
170
  num_cpu = int(cpu_count() * 0.5)
@@ -157,8 +157,8 @@ def plot_feature_corr_by_time(df, **kwargs):
157
157
  cbar_ax.tick_params(axis="both", which="major", labelsize=5)
158
158
 
159
159
  _country = country.title().replace("_", " ")
160
- _region_name = region_name.replace("_", " ") if not national_correlation else ""
161
- _crop = "Poppy" # crop.title().replace("_", " ")
160
+ _region_name = region_name if not national_correlation else ""
161
+ _crop = crop.title().replace("_", " ")
162
162
  if not national_correlation:
163
163
  fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
164
164
  else:
@@ -304,7 +304,7 @@ def all_correlated_feature_by_time(df, **kwargs):
304
304
  )
305
305
 
306
306
  kwargs["region_id"] = region_id
307
- _region_names = "_".join([str(x) for x in group['Region'].unique()])
307
+ _region_names = ", ".join([str(x) for x in group['Region'].unique()])
308
308
  kwargs["region_name"] = _region_names
309
309
  plot_feature_corr_by_time(df_tmp, **kwargs)
310
310
  # For each element in dict_best_cei, add the type of the cei
@@ -0,0 +1,33 @@
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import seaborn as sns
4
+ from taipy.gui import Gui
5
+
6
+ # Load the dataset
7
+ file_path = r'D:\Users\ritvik\projects\GEOGLAM\Output\ml\analysis\July_05_2024\russian_federation\maize\cumulative_1\2010\X_train_1.csv' # Update with the correct file path
8
+ df = pd.read_csv(file_path)
9
+ print(df.head())
10
+ # Define a function to create the plot
11
+ def plot_auc_ndvi(data):
12
+ fig, ax = plt.subplots(figsize=(14, 8))
13
+ sns.lineplot(data=data, x="Harvest Year", y="AUC_NDVI Oct 7-Mar 25", hue="Region", marker="o", ax=ax)
14
+ ax.set_title("Trends of AUC_NDVI by Region (Oct 7 - Mar 25)")
15
+ ax.set_xlabel("Harvest Year")
16
+ ax.set_ylabel("AUC_NDVI Oct 7 - Mar 25")
17
+ ax.legend(title="Region", bbox_to_anchor=(1.05, 1), loc='upper left')
18
+ plt.show()
19
+ return fig
20
+
21
+ # Create the plot and save it
22
+ plot_fig = plot_auc_ndvi(df)
23
+
24
+ # Define the Taipy page with the plot
25
+ page = """
26
+ # Trends of AUC_NDVI by Region
27
+
28
+ <|{plot_fig}|chart|>
29
+ """
30
+
31
+ # Create and run the GUI
32
+ gui = Gui(page)
33
+ gui.run()
@@ -107,7 +107,6 @@ def store(db_path, experiment_id, df, model, model_name):
107
107
  try:
108
108
  utils.to_db(db_path, experiment_id, df)
109
109
  except Exception as e:
110
- breakpoint()
111
110
  print(f"Error: {e}")
112
111
 
113
112
  index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
@@ -128,7 +127,6 @@ def store(db_path, experiment_id, df, model, model_name):
128
127
  df_model.index.set_names(["Index"], inplace=True)
129
128
  utils.to_db(db_path, "models", df_model)
130
129
  except Exception as e:
131
- breakpoint()
132
130
  print(f"Error: {e}")
133
131
 
134
132
  con.commit()
@@ -1,4 +1,5 @@
1
1
  import numpy as np
2
+ from typing import Union
2
3
 
3
4
  from geocif import utils
4
5
 
@@ -277,23 +278,31 @@ def update_feature_names(df, method):
277
278
  return df
278
279
 
279
280
 
280
- def convert_stage_string(stage_info, to_array=True):
281
+ def convert_stage_string(stage_info: Union[str, np.ndarray], to_array: bool = True) -> Union[np.ndarray, str]:
281
282
  """
282
- e.g. input: '13_12_11'
283
- output: array([13, 12, 11])
284
- or vice versa if to_array = False
283
+ Converts a string of stage information to a numpy array or vice versa.
285
284
 
286
285
  Args:
287
- stage_info:
288
- to_array:
286
+ stage_info: A string of stages separated by underscores or a numpy array of stages e.g. '13_12_11'
287
+ to_array: A boolean indicating the direction of conversion. If True, converts string to numpy array e.g. array([13, 12, 11])
288
+ If False, converts numpy array to string.
289
289
 
290
290
  Returns:
291
+ A numpy array of stages if to_array is True, or a string of stages if to_array is False.
291
292
 
293
+ Raises:
294
+ ValueError: If the input format is incorrect.
292
295
  """
293
296
  if to_array:
294
- stages = stage_info.split("_")
295
- stages = np.array([int(stage) for stage in stages])
297
+ if not isinstance(stage_info, str):
298
+ raise ValueError("Expected a string for stage_info when to_array is True.")
299
+ try:
300
+ stages = np.array([int(stage) for stage in stage_info.split("_")])
301
+ except ValueError:
302
+ raise ValueError("Stage info string should contain integers separated by underscores.")
296
303
  else:
297
- stages = "_".join(stage_info.astype(str))
304
+ if not isinstance(stage_info, np.ndarray):
305
+ raise ValueError("Expected a numpy array for stage_info when to_array is False.")
306
+ stages = "_".join(map(str, stage_info))
298
307
 
299
308
  return stages