geocif 0.1.46__tar.gz → 0.1.47__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {geocif-0.1.46/geocif.egg-info → geocif-0.1.47}/PKG-INFO +1 -1
  2. {geocif-0.1.46 → geocif-0.1.47}/geocif/analysis.py +7 -5
  3. {geocif-0.1.46 → geocif-0.1.47}/geocif/experiments.py +3 -9
  4. {geocif-0.1.46 → geocif-0.1.47}/geocif/geocif.py +204 -42
  5. {geocif-0.1.46 → geocif-0.1.47}/geocif/indices_runner.py +2 -2
  6. {geocif-0.1.46 → geocif-0.1.47}/geocif/indices_runner_v2.py +2 -2
  7. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/correlations.py +3 -3
  8. geocif-0.1.47/geocif/ml/misc.py +33 -0
  9. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/output.py +0 -2
  10. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/stages.py +18 -9
  11. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/trainers.py +22 -0
  12. {geocif-0.1.46 → geocif-0.1.47/geocif.egg-info}/PKG-INFO +1 -1
  13. {geocif-0.1.46 → geocif-0.1.47}/geocif.egg-info/SOURCES.txt +1 -1
  14. {geocif-0.1.46 → geocif-0.1.47}/setup.py +1 -1
  15. geocif-0.1.46/geocif/ml/correlations_backup.py +0 -412
  16. {geocif-0.1.46 → geocif-0.1.47}/LICENSE +0 -0
  17. {geocif-0.1.46 → geocif-0.1.47}/MANIFEST.in +0 -0
  18. {geocif-0.1.46 → geocif-0.1.47}/README.md +0 -0
  19. {geocif-0.1.46 → geocif-0.1.47}/geocif/__init__.py +0 -0
  20. {geocif-0.1.46 → geocif-0.1.47}/geocif/agmet/__init__.py +0 -0
  21. {geocif-0.1.46 → geocif-0.1.47}/geocif/agmet/geoagmet.py +0 -0
  22. {geocif-0.1.46 → geocif-0.1.47}/geocif/agmet/plot.py +0 -0
  23. {geocif-0.1.46 → geocif-0.1.47}/geocif/agmet/utils.py +0 -0
  24. {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/__init__.py +0 -0
  25. {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/constants.py +0 -0
  26. {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/features.py +0 -0
  27. {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/geo.py +0 -0
  28. {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/geocif.py +0 -0
  29. {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/metadata.py +0 -0
  30. {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/models.py +0 -0
  31. {geocif-0.1.46 → geocif-0.1.47}/geocif/cei/__init__.py +0 -0
  32. {geocif-0.1.46 → geocif-0.1.47}/geocif/cei/definitions.py +0 -0
  33. {geocif-0.1.46 → geocif-0.1.47}/geocif/cei/indices.py +0 -0
  34. {geocif-0.1.46 → geocif-0.1.47}/geocif/logger.py +0 -0
  35. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/__init__.py +0 -0
  36. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/embedding.py +0 -0
  37. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/feature_engineering.py +0 -0
  38. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/feature_selection.py +0 -0
  39. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/outliers.py +0 -0
  40. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/outlook.py +0 -0
  41. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/spatial_autocorrelation.py +0 -0
  42. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/stats.py +0 -0
  43. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/trend.py +0 -0
  44. {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/xai.py +0 -0
  45. {geocif-0.1.46 → geocif-0.1.47}/geocif/playground/__init__.py +0 -0
  46. {geocif-0.1.46 → geocif-0.1.47}/geocif/playground/automl.py +0 -0
  47. {geocif-0.1.46 → geocif-0.1.47}/geocif/playground/misc.py +0 -0
  48. {geocif-0.1.46 → geocif-0.1.47}/geocif/utils.py +0 -0
  49. {geocif-0.1.46 → geocif-0.1.47}/geocif/viz/__init__.py +0 -0
  50. {geocif-0.1.46 → geocif-0.1.47}/geocif/viz/plot.py +0 -0
  51. {geocif-0.1.46 → geocif-0.1.47}/geocif.egg-info/dependency_links.txt +0 -0
  52. {geocif-0.1.46 → geocif-0.1.47}/geocif.egg-info/not-zip-safe +0 -0
  53. {geocif-0.1.46 → geocif-0.1.47}/geocif.egg-info/top_level.txt +0 -0
  54. {geocif-0.1.46 → geocif-0.1.47}/requirements.txt +0 -0
  55. {geocif-0.1.46 → geocif-0.1.47}/setup.cfg +0 -0
  56. {geocif-0.1.46 → geocif-0.1.47}/tests/test_geocif.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.46
+Version: 0.1.47
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
geocif/analysis.py
@@ -162,8 +162,8 @@ class Geoanalysis:
             return pd.DataFrame(), pd.DataFrame()

         df_metrics = self._compute_metrics(df)
-        # df_metrics = self._process_metrics(df_metrics)
-        # self._plot_metrics(df_metrics)
+        df_metrics = self._process_metrics(df_metrics)
+        self._plot_metrics(df_metrics)

         df_regional_metrics_by_year = self._compute_regional_metrics(
             df, by="Harvest Year"
@@ -172,8 +172,10 @@ class Geoanalysis:
             df_regional_metrics_by_year
         )
         df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
-        breakpoint()
-        self._store_results(None, df_regional_metrics, df_regional_metrics_by_year)
+
+        self._store_results(
+            df_metrics, df_regional_metrics, df_regional_metrics_by_year
+        )

         df_national_yield = self._compute_national_yield(df)
         self._plot_national_yield(df_national_yield)
@@ -193,7 +195,7 @@ class Geoanalysis:
             .apply(self.annual_metrics)
             .reset_index()
         )
-        breakpoint()
+
         return df_metrics.pivot_table(
             index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
             columns="level_5",
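The pivot at the end of this method reshapes long-format metrics (one row per metric name in the auto-generated `level_5` column) into one column per metric. A toy illustration of the same pattern; the column values and the `values=0` argument are illustrative, not taken from the package:

```python
import pandas as pd

# Long format: one row per (keys, metric name) pair; "level_5" is the kind of
# column groupby().apply().reset_index() produces. Values are made up.
df_metrics = pd.DataFrame({
    "Country": ["kenya", "kenya"],
    "Model": ["catboost", "catboost"],
    "Harvest Year": [2020, 2020],
    "Stage Name": ["s1", "s1"],
    "Stage Range": ["1-10", "1-10"],
    "level_5": ["MAPE", "RMSE"],
    0: [12.5, 0.8],
})

wide = df_metrics.pivot_table(
    index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
    columns="level_5",
    values=0,
)
print(wide)  # one column per metric: MAPE, RMSE
```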
geocif/experiments.py
@@ -85,9 +85,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):

     # Experiment: lag_years
     logger.info("Experiment 3: lag_years")
-    parser = main(
-        inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5]
-    )
+    parser = main(inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5])

     # Experiment: lag_yield_as_feature
     logger.info("Experiment 4: lag_yield_as_feature")
@@ -103,9 +101,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):

     # Experiment: median_years
     logger.info("Experiment 5: median_years")
-    parser = main(
-        inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5]
-    )
+    parser = main(inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5])

     # Experiment: median_yield_as_feature
     logger.info("Experiment 6: median_yield_as_feature")
@@ -133,9 +129,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):

     # Experiment: optimize
     logger.info("Experiment 8: optimize")
-    parser = main(
-        inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False]
-    )
+    parser = main(inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False])


 if __name__ == "__main__":
geocif/geocif.py
@@ -108,7 +108,6 @@ class Geocif:
         Config file: ML
         ====================================================================
         """
-        self.use_ceis = ast.literal_eval(self.parser.get("ML", "use_ceis"))
         self.model_type = self.parser.get("ML", "model_type")
         self.fraction_simulate = self.parser.getint("ML", "fraction_simulate")
         self.analogous_year_yield_as_feature = self.parser.getboolean(
@@ -117,10 +116,10 @@ class Geocif:
         self.plot_map_for_correlation_plot = self.parser.getboolean(
             "ML", "plot_map_for_correlation_plot"
         )
-        self.correlation_threshold = self.parser.getfloat(
-            "ML", "correlation_threshold"
+        self.correlation_threshold = self.parser.getfloat("ML", "correlation_threshold")
+        self.include_lat_lon_as_feature = self.parser.getboolean(
+            "ML", "include_lat_lon_as_feature"
         )
-        self.include_lat_lon = self.parser.getboolean("ML", "include_lat_lon")
         self.spatial_autocorrelation = self.parser.getboolean(
             "ML", "spatial_autocorrelation"
         )
@@ -153,6 +152,9 @@ class Geocif:
             self.parser.get("ML", "cat_features")
         )

+        self.use_cumulative_features = self.parser.getboolean(
+            "DEFAULT", "use_cumulative_features"
+        )
         """
         ====================================================================
         Variables, Paths
@@ -198,6 +200,9 @@ class Geocif:

         self.db_path = self.dir_db / self.db_forecasts

+        # Store config file in database
+        output.config_to_db(self.db_path, self.parser, self.today)
+
         # self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
         # obj_pickle = outlook.Outlook(self.pickle_file)
         # self.df_outlook = obj_pickle.read_outlook_file()
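The new `output.config_to_db` call persists the run's configuration alongside the forecasts. Its implementation lives in `geocif/ml/output.py` and is not part of this diff; a minimal sketch of what such a helper could look like, assuming a SQLite database and a `configparser.ConfigParser` (table name and schema here are hypothetical):

```python
import sqlite3
from configparser import ConfigParser

def config_to_db(db_path, parser: ConfigParser, today: str) -> None:
    # Flatten every (section, key, value) triple of the parser into one row,
    # stamped with the run date, and append it to a hypothetical "config" table.
    rows = [
        (today, section, key, value)
        for section in parser.sections()
        for key, value in parser.items(section)
    ]
    with sqlite3.connect(db_path) as con:
        con.execute(
            "CREATE TABLE IF NOT EXISTS config "
            "(today TEXT, section TEXT, key TEXT, value TEXT)"
        )
        con.executemany("INSERT INTO config VALUES (?, ?, ?, ?)", rows)
```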
@@ -224,18 +229,29 @@ class Geocif:
         y_train = df_region[target_col]

         if self.ml_model:
-            self.logger.info(f"Selecting features for {self.country} {self.crop}")
-            selector, _, self.selected_features = fs.select_features(
-                X_train, y_train, method=self.feature_selection
-            )
-            self.logger.info(f"Selected features: {self.selected_features}")
+            if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+                all_features = X_train.columns
+
+                # Select the columns with use_ceis in it
+                self.selected_features = [
+                    column
+                    for column in all_features
+                    if any(cei in column for cei in self.use_ceis)
+                ]
+            else:
+                self.logger.info(f"Selecting features for {self.country} {self.crop}")
+                selector, _, self.selected_features = fs.select_features(
+                    X_train, y_train, method=self.feature_selection
+                )
+                self.logger.info(f"Selected features: {self.selected_features}")

         """ Update model to include conformal estimates """
-        if "lat" not in self.selected_features and self.include_lat_lon:
+        if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
             self.selected_features.append("lat")
-        if "lon" not in self.selected_features and self.include_lat_lon:
+        if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
             self.selected_features.append("lon")
         X_train = df_region[self.selected_features + self.cat_features]
+
         dir_output = (
             self.dir_analysis
             / self.country
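For the `cumulative_*` models the usual feature-selection step is skipped: a column is kept whenever any entry of `use_ceis` appears as a substring of its name. A self-contained sketch with illustrative column names (note that plain substring matching also catches near-misses, e.g. `"TG"` matches `"TG10p"`):

```python
import pandas as pd

# Illustrative training columns and CEI fragments; not taken from the package.
X_train = pd.DataFrame(columns=["AUC_NDVI Oct 7-Mar 25", "TG Dec", "TG10p Dec", "R99p Feb"])
use_ceis = ["AUC_NDVI", "TG"]

selected_features = [
    column
    for column in X_train.columns
    if any(cei in column for cei in use_ceis)
]
print(selected_features)  # ['AUC_NDVI Oct 7-Mar 25', 'TG Dec', 'TG10p Dec']
```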
@@ -312,8 +328,38 @@ class Geocif:
                 self.best_hyperparams = {}
             elif self.model_name in ["cubist"]:
                 self.model.fit(X_train, y_train)
-        except:
-            self.logger.error(f"Error fitting model for {self.country} {self.crop}")
+            elif self.model_name in [
+                "cumulative_1",
+                "cumulative_2",
+                "cumulative_3",
+            ]:
+                from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+                # Standardize the numeric features
+                scaler = StandardScaler()
+                X_numeric = X_train.iloc[:, :3]
+                X_scaled_numeric = pd.DataFrame(
+                    scaler.fit_transform(X_numeric),
+                    columns=X_numeric.columns,
+                    index=X_train.index,
+                )
+
+                # Encode the Region as categorical
+                le = LabelEncoder()
+                X_region = pd.Series(
+                    le.fit_transform(X_train["Region"]),
+                    name="Region",
+                    index=X_train.index,
+                )
+
+                # Combine scaled numeric features and encoded region
+                X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
+
+                self.model.fit(X_train_scaled, y_train)
+        except Exception as e:
+            self.logger.error(
+                f"Error fitting model for {self.country} {self.crop} {e}"
+            )

     def predict(self, df_region, scaler=None):
         """
@@ -360,6 +406,33 @@ class Geocif:
                 X_test, Z_test, clusters_test.astype("object")
             )
             best_hyperparameters = self.model.fe_model.get_params().copy()
+        elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+            from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+            # Standardize the numeric features
+            scaler = StandardScaler()
+            X_numeric = X_test.iloc[:, :3]
+            try:
+                X_scaled_numeric = pd.DataFrame(
+                    scaler.fit_transform(X_numeric),
+                    columns=X_numeric.columns,
+                    index=X_test.index,
+                )
+            except:
+                breakpoint()
+
+            # Encode the Region as categorical
+            le = LabelEncoder()
+            X_region = pd.Series(
+                le.fit_transform(X_test["Region"]),
+                name="Region",
+                index=X_test.index,
+            )
+
+            # Combine scaled numeric features and encoded region
+            X_test_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
+            y_pred = self.model.predict(X_test_scaled)
+            best_hyperparameters = {}  # self.model.get_params().copy()
         elif self.model_name == "geospaNN":
             import torch
             import geospaNN
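Note that `predict` re-fits a fresh `StandardScaler` and `LabelEncoder` on `X_test`, so test data is scaled with its own statistics rather than the training statistics. An alternative sketch that fits the preprocessing once on training data and reuses it at prediction time (`OrdinalEncoder` is used because `LabelEncoder` is intended for targets; column names are illustrative):

```python
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

numeric_cols = ["cei_a", "cei_b", "cei_c"]  # illustrative names
preprocess = ColumnTransformer([
    ("num", StandardScaler(), numeric_cols),
    ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["Region"]),
])

# X_train_scaled = preprocess.fit_transform(X_train)  # fit once, on training data
# X_test_scaled = preprocess.transform(X_test)        # reuse at prediction time
```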
@@ -501,7 +574,9 @@ class Geocif:
             "Crop",
             "Harvest Year",
             "Stage Name",
+            "Time",
         ]
+
         df.index = df.apply(
             lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
         )
@@ -513,28 +588,37 @@ class Geocif:

     def create_feature_names(self, stages_features, selected_features):
         """
+        Create feature names for machine learning stages.

         Args:
-            stages_features:
-            selected_features:
+            stages_features (list): List of features for different stages.
+            selected_features (dict): Dictionary of selected features.

         Returns:
-
+            None
         """
+        # Assert stages_features is a list
+        assert isinstance(stages_features, list), "stages_features should be a list"
+
         # Clear out feature names
         self.feature_names = []

-        """ Select stages that will be used for ML
+        """
+        Select stages that will be used for ML
         1. method = "latest" - Select the latest stage
         2. method = "fraction" - Select a fraction (1-100) of all stages
         """
+        method = "fraction"
+        if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+            method = "latest"
+
         stages_features = stages.select_stages_for_ml(
-            stages_features, method="fraction", n=60
+            stages_features, method=method, n=60
         )

         for stage in stages_features:
             # Convert each element of stage to str and join with _
-            _stage = "_".join([str(x) for x in stage])
+            _stage = "_".join(map(str, stage))

             # Create a list appending _stage to each element of combined_keys
             _tmp = [f"{col}_{_stage}" for col in self.combined_keys]
@@ -543,17 +627,33 @@
                 parts = _t.split("_")
                 cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])

-                # Check if any element of dict_selected_features is in _t
-                for x in selected_features["CEI"].values:
-                    if x not in cei:
-                        continue
-
-                    dict_fn = stages.get_stage_information_dict(_t, self.method)
-                    tmp_col = dict_fn["CEI"] + " " + dict_fn["Stage Name"]
-
-                    if tmp_col in self.df_train.columns:
-                        self.feature_names.append(tmp_col)
-
+                try:
+                    if self.model_name in [
+                        "cumulative_1",
+                        "cumulative_2",
+                        "cumulative_3",
+                    ]:
+                        dict_fn = stages.get_stage_information_dict(_t, self.method)
+                        tmp_col = f"{dict_fn['CEI']}"
+
+                        if tmp_col in self.df_train.columns:
+                            self.feature_names.append(tmp_col)
+                    else:
+                        # Check if any element of dict_selected_features is in _t
+                        if selected_features["CEI"].any():
+                            for x in selected_features["CEI"].values:
+                                if x not in cei:
+                                    continue
+
+                                dict_fn = stages.get_stage_information_dict(
+                                    _t, self.method
+                                )
+                                tmp_col = f"{dict_fn['CEI']} {dict_fn['Stage Name']}"
+
+                                if tmp_col in self.df_train.columns:
+                                    self.feature_names.append(tmp_col)
+                except:
+                    breakpoint()
         self.feature_names = list(set(self.feature_names))

         if self.median_yield_as_feature:
@@ -565,16 +665,14 @@
                 self.feature_names.append(f"t -{i} {self.target}")

         if self.analogous_year_yield_as_feature:
-            self.feature_names.append("Analogous Year")
-            self.feature_names.append("Analogous Year Yield")
+            self.feature_names.extend(["Analogous Year", "Analogous Year Yield"])

         if self.use_outlook_as_feature:
             self.feature_names.append("FCST")

         # Add lat and lon to feature names
-        if self.include_lat_lon:
-            self.feature_names.append("lat")
-            self.feature_names.append("lon")
+        if self.include_lat_lon_as_feature:
+            self.feature_names.extend(["lat", "lon"])

         self.selected_features = []

@@ -598,6 +696,8 @@ class Geocif:
         for idx, region in enumerate(pbar):
             if self.model_name in ["linear", "gam"]:
                 self.create_feature_names(stages, dict_best_cei[region][0:3].tolist())
+            elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+                self.create_feature_names(stages, {})
             elif self.ml_model:
                 self.create_feature_names(stages, dict_selected_features[region])
             elif self.model_name in ["median"]:
@@ -727,11 +827,52 @@ class Geocif:
         parts = all_cei_columns[-1].split("_")
         cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])

-        # HACK: Get feature name with GD4 in it to extract first and last stage id and name
-        cei_column = df[df.columns[df.columns.str.contains(cei)]].columns
-        # Select the longest string in cei_column
-        cei_col = max(cei_column, key=len)
-        self.stage_info = stages.get_stage_information_dict(cei_col, self.method)
+        # For each region, find the column with the longest string in cei_column
+        group_by = ["Region"]
+        groups = df.groupby(group_by)
+        if self.use_cumulative_features:
+            frames = []
+            for name, group in groups:
+                # Drop columns with all NaNs
+                group.dropna(axis=1, how="all", inplace=True)
+
+                cei_column = group[
+                    group.columns[group.columns.str.contains(cei)]
+                ].columns
+                max_cei_col = max(cei_column, key=len)
+                self.stage_info = stages.get_stage_information_dict(
+                    max_cei_col, self.method
+                )
+
+                # Subset dataframes to columns that contain self.stage_info["Stage_ID"]
+                all_columns = group.columns[
+                    group.columns.str.contains(self.stage_info["Stage_ID"])
+                ].tolist()
+
+                group = group[
+                    self.fixed_columns
+                    + [self.target]
+                    + self.statistics_columns
+                    + all_columns
+                ]
+                # rename all_columns to self.stage_info["CEI"]
+                group.rename(
+                    columns={
+                        col: stages.get_stage_information_dict(col, self.method)["CEI"]
+                        for col in all_columns
+                    },
+                    inplace=True,
+                )
+
+                frames.append(group)
+
+            df = pd.concat(frames)
+        else:
+            # HACK: Get feature name with GD4 in it to extract first and last stage id and name
+            cei_column = df[df.columns[df.columns.str.contains(cei)]].columns
+            # Select the longest string in cei_column
+            cei_col = max(cei_column, key=len)
+            self.stage_info = stages.get_stage_information_dict(cei_col, self.method)

         # Change column name
         # e.g. 'vDTR_7_6_5_4_3_2_1_37_36_35_34_33_32_31' to 'vDTR Mar 1-Oct 27'
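The per-region branch above relies on the naming convention shown in the comment: longer column names carry more accumulated stages (e.g. `vDTR_7_6_5_4_3_2_1` covers more stages than `vDTR_2_1`), so `max(..., key=len)` picks the most complete accumulation. A minimal sketch of that selection, with illustrative columns:

```python
import pandas as pd

# Illustrative columns following the CEI_stage-suffix naming convention.
df = pd.DataFrame(columns=["vDTR_2_1", "vDTR_3_2_1", "vDTR_7_6_5_4_3_2_1", "Region"])
cei = "vDTR"

cei_columns = df.columns[df.columns.str.contains(cei)]
max_cei_col = max(cei_columns, key=len)
print(max_cei_col)  # vDTR_7_6_5_4_3_2_1
```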
@@ -795,12 +936,14 @@ class Geocif:

         mask = self.df_results["Stage_ID"].isin(_stages)
         df = self.df_results[mask]
-
         """ Select which CEI categories to use for ML """
         if "all" in self.use_ceis:
             pass
         else:
-            df = df[df["Type"].isin(self.use_ceis)]
+            if self.select_cei_by == "Type":
+                df = df[df["Type"].isin(self.use_ceis)]
+            elif self.select_cei_by == "Index":
+                df = df[df["Index"].isin(self.use_ceis)]

         """ Convert this dataframe into an ML ready format and save to disk """
         df = self.create_ml_dataframe(df)
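The new `select_cei_by` switch filters the results table either by CEI category (`Type`) or by the index name itself (`Index`). A runnable demo with made-up values:

```python
import pandas as pd

df = pd.DataFrame({
    "Type": ["Temperature", "Moisture", "Vegetation"],
    "Index": ["TG", "AUC_SM", "AUC_NDVI"],
})
use_ceis, select_cei_by = ["AUC_NDVI"], "Index"

if select_cei_by == "Type":
    df = df[df["Type"].isin(use_ceis)]
elif select_cei_by == "Index":
    df = df[df["Index"].isin(use_ceis)]
print(df)  # keeps only the AUC_NDVI row
```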
@@ -874,6 +1017,8 @@ class Geocif:
         if self.spatial_autocorrelation:
             sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)

+        dict_selected_features = {}
+        dict_best_cei = {}
         if self.correlation_plots:
             self.logger.info(f"Correlation plot for {self.country} {self.crop}")
             (
@@ -949,6 +1094,8 @@ class Geocif:
         self.model_name = model
         self.experiment_name = self.parser.get("ML", "experiment_name")
         self.ml_model = self.parser.getboolean(self.model_name, "ML_model")
+        self.select_cei_by = self.parser.get(self.model_name, "select_cei_by")
+        self.use_ceis = ast.literal_eval(self.parser.get(self.model_name, "use_ceis"))
         self.model_names = ast.literal_eval(self.parser.get(self.country, "models"))
         self.optimize = self.parser.getboolean(self.country, "optimize")
         self.fraction_loocv = self.parser.getfloat(self.country, "fraction_loocv")
@@ -960,6 +1107,21 @@ class Geocif:
             self.estimate_ci = False
             self.check_yield_trend = False
             self.estimate_ci_for_all = False
+        elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+            self.correlation_plots = False
+            self.lag_yield_as_feature = False
+            self.median_yield_as_feature = False
+            self.median_area_as_feature = False
+            self.analogous_year_yield_as_feature = False
+            self.last_year_yield_as_feature = False
+            self.include_lat_lon_as_feature = False
+            self.do_xai = False
+            self.estimate_ci = False
+            self.estimate_ci_for_all = False
+            self.check_yield_trend = False
+            self.cluster_strategy = "single"
+            self.select_cei_by = "Index"
+            self.use_cumulative_features = True
         else:
             self.do_xai = self.parser.getboolean("ML", "do_xai")
             self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
geocif/indices_runner.py
@@ -173,8 +173,8 @@ class cei_runner(base.BaseGeo):
             or "south_africa_maize" in i[3]
             or "mozambique_maize" in i[3]
             or "united_states_of_america" in i[3]
-            or "russian_federation" in i[3]
-            or "ukraine" in i[3]
+            or "russian_federation" in i[3]
+            or "ukraine" in i[3]
         ]
         # "malawi" in i[2]]

geocif/indices_runner_v2.py
@@ -47,7 +47,7 @@ class cei_runner(base.BaseGeo):

         self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
         self.base_dir = Path(
-            r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\afghanistan"
+            r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\illinois"
         )  # Path(self.parser.get("PATHS", "dir_crop_inputs"))
         self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")

@@ -164,7 +164,7 @@ class cei_runner(base.BaseGeo):
         # Only keep those entries in combinations where the third elemt is
         # mozambique, south_africa, angola or dem_people's_rep_of_korea
         # This is done to test the code for these countries
-        combinations = [i for i in combinations if "afghanistan_maize_s1" in i[3]]
+        combinations = [i for i in combinations if "illinois_maize_s1" in i[3]]

         if True:
             num_cpu = int(cpu_count() * 0.5)
geocif/ml/correlations.py
@@ -157,8 +157,8 @@ def plot_feature_corr_by_time(df, **kwargs):
     cbar_ax.tick_params(axis="both", which="major", labelsize=5)

     _country = country.title().replace("_", " ")
-    _region_name = region_name.replace("_", " ") if not national_correlation else ""
-    _crop = "Poppy"  # crop.title().replace("_", " ")
+    _region_name = region_name if not national_correlation else ""
+    _crop = crop.title().replace("_", " ")
     if not national_correlation:
         fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
     else:
@@ -304,7 +304,7 @@ def all_correlated_feature_by_time(df, **kwargs):
             )

             kwargs["region_id"] = region_id
-            _region_names = "_".join([str(x) for x in group['Region'].unique()])
+            _region_names = ", ".join([str(x) for x in group['Region'].unique()])
             kwargs["region_name"] = _region_names
             plot_feature_corr_by_time(df_tmp, **kwargs)
             # For each element in dict_best_cei, add the type of the cei
geocif/ml/misc.py (new file)
@@ -0,0 +1,33 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from taipy.gui import Gui
+
+# Load the dataset
+file_path = r'D:\Users\ritvik\projects\GEOGLAM\Output\ml\analysis\July_05_2024\russian_federation\maize\cumulative_1\2010\X_train_1.csv'  # Update with the correct file path
+df = pd.read_csv(file_path)
+print(df.head())
+# Define a function to create the plot
+def plot_auc_ndvi(data):
+    fig, ax = plt.subplots(figsize=(14, 8))
+    sns.lineplot(data=data, x="Harvest Year", y="AUC_NDVI Oct 7-Mar 25", hue="Region", marker="o", ax=ax)
+    ax.set_title("Trends of AUC_NDVI by Region (Oct 7 - Mar 25)")
+    ax.set_xlabel("Harvest Year")
+    ax.set_ylabel("AUC_NDVI Oct 7 - Mar 25")
+    ax.legend(title="Region", bbox_to_anchor=(1.05, 1), loc='upper left')
+    plt.show()
+    return fig
+
+# Create the plot and save it
+plot_fig = plot_auc_ndvi(df)
+
+# Define the Taipy page with the plot
+page = """
+# Trends of AUC_NDVI by Region
+
+<|{plot_fig}|chart|>
+"""
+
+# Create and run the GUI
+gui = Gui(page)
+gui.run()
geocif/ml/output.py
@@ -107,7 +107,6 @@ def store(db_path, experiment_id, df, model, model_name):
     try:
         utils.to_db(db_path, experiment_id, df)
     except Exception as e:
-        breakpoint()
         print(f"Error: {e}")

     index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
@@ -128,7 +127,6 @@ def store(db_path, experiment_id, df, model, model_name):
         df_model.index.set_names(["Index"], inplace=True)
         utils.to_db(db_path, "models", df_model)
     except Exception as e:
-        breakpoint()
         print(f"Error: {e}")

     con.commit()
geocif/ml/stages.py
@@ -1,4 +1,5 @@
 import numpy as np
+from typing import Union

 from geocif import utils

@@ -277,23 +278,31 @@ def update_feature_names(df, method):
     return df


-def convert_stage_string(stage_info, to_array=True):
+def convert_stage_string(stage_info: Union[str, np.ndarray], to_array: bool = True) -> Union[np.ndarray, str]:
     """
-    e.g. input: '13_12_11'
-    output: array([13, 12, 11])
-    or vice versa if to_array = False
+    Converts a string of stage information to a numpy array or vice versa.

     Args:
-        stage_info:
-        to_array:
+        stage_info: A string of stages separated by underscores or a numpy array of stages e.g. '13_12_11'
+        to_array: A boolean indicating the direction of conversion. If True, converts string to numpy array e.g. array([13, 12, 11])
+            If False, converts numpy array to string.

     Returns:
+        A numpy array of stages if to_array is True, or a string of stages if to_array is False.

+    Raises:
+        ValueError: If the input format is incorrect.
     """
     if to_array:
-        stages = stage_info.split("_")
-        stages = np.array([int(stage) for stage in stages])
+        if not isinstance(stage_info, str):
+            raise ValueError("Expected a string for stage_info when to_array is True.")
+        try:
+            stages = np.array([int(stage) for stage in stage_info.split("_")])
+        except ValueError:
+            raise ValueError("Stage info string should contain integers separated by underscores.")
     else:
-        stages = "_".join(stage_info.astype(str))
+        if not isinstance(stage_info, np.ndarray):
+            raise ValueError("Expected a numpy array for stage_info when to_array is False.")
+        stages = "_".join(map(str, stage_info))

     return stages
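A round-trip usage example for the rewritten function, following its new contract:

```python
import numpy as np
from geocif.ml.stages import convert_stage_string

arr = convert_stage_string("13_12_11")            # -> array([13, 12, 11])
text = convert_stage_string(arr, to_array=False)  # -> '13_12_11'

try:
    convert_stage_string(arr)  # wrong input type for to_array=True
except ValueError as e:
    print(e)  # Expected a string for stage_info when to_array is True.
```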
geocif/ml/trainers.py
@@ -2,6 +2,7 @@ import multiprocessing as mp

 import numpy as np
 import optuna
+import pandas as pd
 from catboost import CatBoostRegressor
 from sklearn.metrics import root_mean_squared_error
 from sklearn.model_selection import train_test_split
@@ -289,6 +290,27 @@ def auto_train(
         model = LinearGAM(n_splines=25, spline_order=3).gridsearch(
             X_train.values, y_train.values, lam=np.logspace(-3, 3, 11)
         )
+    elif model_name == "cumulative_1":
+        from pygam import GAM, s, f, te
+
+        # compute index of column Region
+        region_idx = X_train.columns.get_loc("Region")
+
+        model = GAM(s(0) + f(region_idx))
+    elif model_name == "cumulative_2":
+        from pygam import GAM, s, f, te
+
+        # compute index of column Region
+        region_idx = X_train.columns.get_loc("Region")
+
+        model = GAM(s(0) + s(1) + te(0, 1) + f(region_idx))
+    elif model_name == "cumulative_3":
+        from pygam import GAM, s, f, te
+
+        # compute index of column Region
+        region_idx = X_train.columns.get_loc("Region")
+
+        model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(region_idx))
     elif model_name == "geospaNN":
         import torch
         import geospaNN
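A minimal, self-contained pygam sketch mirroring the `cumulative_2` construction: smooth terms on the first two (scaled) numeric columns, a tensor-product interaction, and a factor term for the label-encoded `Region`. The data here is synthetic:

```python
import numpy as np
import pandas as pd
from pygam import GAM, s, f, te

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "cei_a": rng.normal(size=200),
    "cei_b": rng.normal(size=200),
    "Region": rng.integers(0, 3, size=200),  # already label-encoded
})
y = 2 * X["cei_a"] + X["cei_b"] + X["Region"] + rng.normal(scale=0.1, size=200)

# Smooths on columns 0 and 1, their interaction, and a factor term for Region
region_idx = X.columns.get_loc("Region")
model = GAM(s(0) + s(1) + te(0, 1) + f(region_idx))
model.fit(X.values, y.values)
print(model.predict(X.values)[:5])
```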
geocif.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.46
+Version: 0.1.47
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
geocif.egg-info/SOURCES.txt
@@ -33,10 +33,10 @@ geocif/cei/definitions.py
 geocif/cei/indices.py
 geocif/ml/__init__.py
 geocif/ml/correlations.py
-geocif/ml/correlations_backup.py
 geocif/ml/embedding.py
 geocif/ml/feature_engineering.py
 geocif/ml/feature_selection.py
+geocif/ml/misc.py
 geocif/ml/outliers.py
 geocif/ml/outlook.py
 geocif/ml/output.py
setup.py
@@ -50,6 +50,6 @@ setup(
     test_suite="tests",
     tests_require=test_requirements,
     url="https://ritviksahajpal.github.io/yield_forecasting/",
-    version="0.1.46",
+    version="0.1.47",
     zip_safe=False,
 )
geocif/ml/correlations_backup.py (deleted)
@@ -1,412 +0,0 @@
-import os
-
-import matplotlib.pyplot as plt
-import palettable as pal
-import pandas as pd
-import seaborn as sns
-from tqdm import tqdm
-
-from geocif import utils
-from geocif.ml import embedding
-from geocif.ml import stages
-
-
-def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
-    """
-
-    Args:
-        df_train:
-        simulation_stages:
-        target_col:
-
-    Returns:
-
-    """
-    frames = []
-
-    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
-
-    # Only select columns that have been observed till the current stage
-    for stage in tqdm(stages, leave=False, desc="Compute most correlated feature"):
-        current_feature_set = [
-            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
-        ]
-
-        # Get the most correlated feature for each region
-        top_feature_by_region, counter = embedding.get_top_correlated_features(
-            df_train[current_feature_set + ["Region"]],
-            df_train[target_col],
-        )
-
-        # Create a dataframe with the most common top feature and number of occurrences over timestep
-        _feature = counter.most_common(1)[0][0]
-        # Loop through top_feature_by_region and find the average score for _feature
-        # Calculate the average score for 'DTR_36'
-        _feature_scores = [
-            value[1][0]
-            for key, value in top_feature_by_region.items()
-            if _feature in value[0]
-        ]
-        average_score = sum(_feature_scores) / len(_feature_scores)
-        _feature = utils.remove_last_part(_feature)
-
-        df = pd.DataFrame(
-            {
-                "Stage": [stage[-1]],
-                "Date": [utils.dict_growth_stages[stage[-1]]],
-                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
-                "Feature Category": [_feature],
-                "Score": [average_score],
-                # "Type": [ci.dict_indices[_feature][0]],
-                "Number of Occurrences": [counter.most_common(1)[0][1]],
-                # "Current Feature Set": [current_feature_set],
-            }
-        )
-        frames.append(df)
-
-    df_most_corr_feature_by_time = pd.concat(frames)
-
-
-def plot_feature_corr_by_time(df, **kwargs):
-    country = kwargs.get("country")
-    crop = kwargs.get("crop")
-    dir_output = kwargs.get("dir_output")
-    forecast_season = kwargs.get("forecast_season")
-    national_correlation = kwargs.get("national_correlation")
-    group_by = kwargs.get("groupby")
-
-    # Setup the figure and gridspec
-    fig = plt.figure(figsize=(10, 5))
-    gs = fig.add_gridspec(
-        3, 2, height_ratios=[6, 5, 1], width_ratios=[5, 1.5], hspace=0.6, wspace=0.0
-    )
-
-    # Assign subplots
-    ax_heatmap = fig.add_subplot(gs[0:2, 0])
-    ax_map = fig.add_subplot(gs[0, 1])
-    cbar_ax = fig.add_subplot(gs[2, 0])
-    ax4 = fig.add_subplot(gs[2, 1])
-
-    # Transpose and reverse the columns of the dataframe
-    #breakpoint()
-    ## Only select foll. columns:
-
-    df = df[
-        [
-            "TG",
-            "TG10p",
-            "DTR",
-            "vDTR",
-            "R99p",
-            "RX5day",
-            "MEAN_ESI4WK",
-        ]
-    ]
-    df_transpose = df.T
-    df = df_transpose[df_transpose.columns[::-1]]
-
-    # Split column names and only use value before space
-    df.columns = df.columns.str.split(" ").str[0]
-    # In row names, replace ESI4WK by ES
-    df.index = df.index.str.replace("MEAN_ESI4WK", "ZScore_ES")
-    df.index = df.index.str.replace("R99p", "MEAN_SM")
-    df.index = df.index.str.replace("RX5day", "AUC_SM")
-    # Remove the last row
-    # Select the first, third and fifth column
-    df = df[["Dec", "Feb", "Apr"]]
-    # Rename Dec to Planting - Early Vegetative
-    # Rename Feb to Early Vegetative - Senescence
-    # Rename Apr to Senescence - Harvest
-    df.columns = ["Planting - Early Vegetative", "Early Vegetative - Senescence", "Senescence - Harvest"]
-    ax_heatmap = sns.heatmap(
-        df,
-        ax=ax_heatmap,
-        annot=True,
-        cmap=pal.cartocolors.diverging.Earth_5.get_mpl_colormap(),
-        fmt=".2f",
-        square=False,
-        linewidths=0.5,
-        linecolor="white",
-        cbar_ax=cbar_ax,
-        cbar_kws={"orientation": "horizontal"},  # , "shrink": 0.5},
-        annot_kws={"size": 6},
-        xticklabels=True,
-        yticklabels=True,
-    )
-    ax_heatmap.tick_params(left=False, bottom=False)
-
-    # Plot the map using GeoPandas
-    dg_country = kwargs.get("dg_country")
-
-    ax_map = dg_country.plot(
-        ax=ax_map,
-        color="white",
-        edgecolor="black",
-        linewidth=1.0,
-        facecolor=None,
-        legend=False,
-    )
-
-    if not national_correlation:
-        id = kwargs["region_id"]
-        dg_region = dg_country[dg_country[group_by] == id]
-        ax_map = dg_region.plot(
-            ax=ax_map, color="blue", edgecolor="blue", linewidth=1.0, legend=False
-        )
-        # Set title with color blue
-        ax_map.set_title(f"Region: {id}", color="blue")
-
-    # No colorbar for the map
-    ax_map.axis("off")
-    # Remove borders
-    ax_map.spines["top"].set_visible(False)
-    ax_map.spines["right"].set_visible(False)
-    ax_map.spines["bottom"].set_visible(False)
-    ax_map.spines["left"].set_visible(False)
-    # ax4 should not be visible
-    ax4.axis("off")
-
-    # Add colorbar label
-    # cbar_ax.set_xlabel("Correlation Coefficient", labelpad=3, size="small")
-    cbar_ax.set_title("Correlation Coefficient", loc="left", size="small")
-    ax_heatmap.set_xticklabels(
-        ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=7
-    )
-    ax_heatmap.set_yticklabels(ax_heatmap.get_yticklabels(), size="x-small", fontsize=7)
-    ax_heatmap.set_xlabel("")
-    ax_heatmap.set_ylabel(" ")
-    # Reduce font size of ticks of colorbar
-    cbar_ax.tick_params(axis="both", which="major", labelsize=6)
-
-    _country = country.title().replace("_", " ")
-    _crop = crop.title().replace("_", " ")
-    if not national_correlation:
-        fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
-    else:
-        fname = f"{country}_{crop}_corr_feature_by_time.png"
-    ax_heatmap.set_title(f"{_country}\n{_crop}")
-
-    # plt.tight_layout()
-    os.makedirs(dir_output, exist_ok=True)
-    plt.savefig(dir_output / fname, dpi=250)
-    plt.close()
-
-
-def _all_correlated_feature_by_time(df, **kwargs):
-    """
-
-    Args:
-        df:
-        **kwargs:
-
-    Returns:
-
-    """
-    frames = []
-    all_stages = kwargs.get("all_stages")
-    target_col = kwargs.get("target_col")
-    method = kwargs.get("method")
-
-    longest_stage = max(all_stages, key=len)
-
-    # Split the original string into a list of its parts
-    longest_stage = longest_stage.split("_")
-
-    # Generate the list of strings as described by the user, removing one element from the start each time
-    stages_features = ["_".join(longest_stage[i:]) for i in range(len(longest_stage))]
-
-    # Drop columns with no yield information
-    df = df.dropna(subset=[target_col])
-
-    # Only select columns that have been observed till the current stage
-    pbar = tqdm(stages_features, total=len(stages_features), leave=False)
-    for stage in pbar:
-        pbar.set_description(f"Calculating correlations")
-        pbar.update()
-
-        stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)[
-            "Stage Name"
-        ]
-        # starting_stage = stage_name.split("-")[0]
-        current_feature_set = [col for col in df.columns if stage_name in col]
-
-        # Get the most correlated feature for each region
-        df_tmp = embedding.get_all_features_correlation(
-            df[current_feature_set + ["Region"]], df[target_col], method
-        )
-
-        frames.append(df_tmp)
-
-    df_results = pd.concat(frames)
-    if not df_results.empty:
-        # Exclude Region column
-        df_results = df_results.drop(columns="Region")
-        # Groupby Dekad and compute mean of all columns apart from Region
-        df_results = df_results.groupby(method).mean()
-
-        all_stage_names = []
-        for stage in stages_features:
-            _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)[
-                "Stage Name"
-            ]
-            all_stage_names.append(_tmp)
-
-        df_results = df_results.reindex(all_stage_names)
-
-        # Drop rows with all NaN values
-        df_results = df_results.dropna(how="all")
-
-        # Split the index based on - and only keep the first element
-        df_results.index = df_results.index.str.split("-").str[0]
-
-        return df_results
-    else:
-        return pd.DataFrame()
-
-
-def all_correlated_feature_by_time(df, **kwargs):
-    """
-
-    Args:
-        df:
-        **kwargs:
-
-    Returns:
-
-    """
-    THRESHOLD = 0.1
-    national_correlation = kwargs.get("national_correlation")
-    group_by = kwargs.get("groupby")
-    combined_dict = kwargs.get("combined_dict")
-
-    dict_selected_features = {}
-    dict_best_cei = {}
-
-    if not national_correlation:
-        groups = df.groupby(group_by)
-        for region_id, group in tqdm(
-            groups, desc=f"Compute all correlated feature by {group_by}", leave=False
-        ):
-            df_corr = _all_correlated_feature_by_time(group, **kwargs)
-
-            # Remove columns with more than 50% NaN values
-            df_corr = df_corr.dropna(thresh=len(df_corr) / 2, axis=1)
-
-            if not df_corr.empty:
-                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
-                # Add the columns to dict_selected_features along with the absolute mean value
-                absolute_medians = df_tmp.abs().median()
-
-                # Create a DataFrame to display the column names and their absolute median values
-                absolute_median_df = absolute_medians.reset_index()
-                absolute_median_df.columns = ['CEI', 'Median']
-
-                # Add the CEI and Median value to dict_selected_features
-                dict_selected_features[region_id] = absolute_median_df
-
-                df_tmp2 = (
-                    df_tmp.median(axis=0)
-                    .abs()
-                    .sort_values(ascending=False)
-                    .reset_index()
-                )
-                df_tmp2.columns = ["Metric", "Value"]
-                # Add another column based on Type of Metric
-                for idx, row in df_tmp2.iterrows():
-                    df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]
-
-                # Compute median of each CEI and sort the dataframe based on the absolute value of the median
-                dict_best_cei[region_id] = (
-                    df_tmp2.groupby("Type")
-                    .max()
-                    .reset_index()
-                    .sort_values("Value", ascending=False)["Metric"]
-                    .values
-                )
-
-                kwargs["region_id"] = region_id
-                plot_feature_corr_by_time(df_tmp, **kwargs)
-                # For each element in dict_best_cei, add the type of the cei
-            else:
-                # HACK
-                df_corr = _all_correlated_feature_by_time(df, **kwargs)
-
-                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
-                # Add the columns to dict_selected_features along with the absolute mean value
-                absolute_medians = df_tmp.abs().median()
-
-                # Create a DataFrame to display the column names and their absolute median values
-                absolute_median_df = absolute_medians.reset_index()
-                absolute_median_df.columns = ['CEI', 'Median']
-
-                # Add the CEI and Median value to dict_selected_features
-                dict_selected_features[region_id] = absolute_median_df
-                dict_best_cei[region_id] = {}
-    else:
-        df_corr = _all_correlated_feature_by_time(df, **kwargs)
-        df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
-        # Add the columns to dict_selected_features along with the absolute mean value
-        absolute_medians = df_tmp.abs().median()
-
-        # Create a DataFrame to display the column names and their absolute median values
-        absolute_median_df = absolute_medians.reset_index()
-        absolute_median_df.columns = ['CEI', 'Median']
-
-        # Add the CEI and Median value to dict_selected_features
-        dict_selected_features[0] = absolute_median_df
-
-        plot_feature_corr_by_time(df_corr, **kwargs)
-
-    return dict_selected_features, dict_best_cei
-
-
-def feature_correlation_by_time(**kwargs):
-    raise NotImplementedError()
-
-    frames = []
-    simulation_stages = kwargs.get("simulation_stages")
-    df_train = kwargs.get("df_train")
-    target_col = kwargs.get("target_col")
-
-    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
-
-    # Only select columns that have been observed till the current stage
-    for stage in tqdm(stages, leave=False, desc="Compute feature correlation by time"):
-        current_feature_set = [
-            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
-        ]
-
-        # Get the most correlated feature for each region
-        top_feature_by_region, counter = embedding.compute_feature_correlations(
-            df_train[current_feature_set + ["Region"]],
-            df_train[target_col],
-            "all",
-        )
-
-        # Create a dataframe with the most common top feature and number of occurrences over timestep
-        _feature = counter.most_common(1)[0][0]
-        # Loop through top_feature_by_region and find the average score for _feature
-        # Calculate the average score for 'DTR_36'
-        _feature_scores = [
-            value[1][0]
-            for key, value in top_feature_by_region.items()
-            if _feature in value[0]
-        ]
-        average_score = sum(_feature_scores) / len(_feature_scores)
-        _feature = utils.remove_last_part(_feature)
-
-        df = pd.DataFrame(
-            {
-                "Stage": [stage[-1]],
-                "Date": [utils.dict_growth_stages[stage[-1]]],
-                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
-                "Feature Category": [_feature],
-                "Score": [average_score],
-                # "Type": [ci.dict_indices[_feature][0]],
-                "Number of Occurrences": [counter.most_common(1)[0][1]],
-                # "Current Feature Set": [current_feature_set],
-            }
-        )
-        frames.append(df)
-
-    df_corr_feature_by_time = pd.concat(frames)