geocif 0.1.48__tar.gz → 0.1.50__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {geocif-0.1.48/geocif.egg-info → geocif-0.1.50}/PKG-INFO +1 -1
  2. {geocif-0.1.48 → geocif-0.1.50}/geocif/analysis.py +12 -5
  3. {geocif-0.1.48 → geocif-0.1.50}/geocif/geocif.py +84 -28
  4. {geocif-0.1.48 → geocif-0.1.50}/geocif/logger.py +24 -1
  5. geocif-0.1.50/geocif/ml/aa.py +28 -0
  6. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/feature_selection.py +10 -0
  7. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/output.py +13 -13
  8. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/stats.py +7 -1
  9. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/trainers.py +27 -22
  10. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/trend.py +32 -11
  11. geocif-0.1.50/geocif/viz/misc.py +55 -0
  12. {geocif-0.1.48 → geocif-0.1.50/geocif.egg-info}/PKG-INFO +1 -1
  13. {geocif-0.1.48 → geocif-0.1.50}/geocif.egg-info/SOURCES.txt +2 -0
  14. {geocif-0.1.48 → geocif-0.1.50}/setup.py +1 -1
  15. {geocif-0.1.48 → geocif-0.1.50}/LICENSE +0 -0
  16. {geocif-0.1.48 → geocif-0.1.50}/MANIFEST.in +0 -0
  17. {geocif-0.1.48 → geocif-0.1.50}/README.md +0 -0
  18. {geocif-0.1.48 → geocif-0.1.50}/geocif/__init__.py +0 -0
  19. {geocif-0.1.48 → geocif-0.1.50}/geocif/agmet/__init__.py +0 -0
  20. {geocif-0.1.48 → geocif-0.1.50}/geocif/agmet/geoagmet.py +0 -0
  21. {geocif-0.1.48 → geocif-0.1.50}/geocif/agmet/plot.py +0 -0
  22. {geocif-0.1.48 → geocif-0.1.50}/geocif/agmet/utils.py +0 -0
  23. {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/__init__.py +0 -0
  24. {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/constants.py +0 -0
  25. {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/features.py +0 -0
  26. {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/geo.py +0 -0
  27. {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/geocif.py +0 -0
  28. {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/metadata.py +0 -0
  29. {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/models.py +0 -0
  30. {geocif-0.1.48 → geocif-0.1.50}/geocif/cei/__init__.py +0 -0
  31. {geocif-0.1.48 → geocif-0.1.50}/geocif/cei/definitions.py +0 -0
  32. {geocif-0.1.48 → geocif-0.1.50}/geocif/cei/indices.py +0 -0
  33. {geocif-0.1.48 → geocif-0.1.50}/geocif/experiments.py +0 -0
  34. {geocif-0.1.48 → geocif-0.1.50}/geocif/indices_runner.py +0 -0
  35. {geocif-0.1.48 → geocif-0.1.50}/geocif/indices_runner_v2.py +0 -0
  36. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/__init__.py +0 -0
  37. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/correlations.py +0 -0
  38. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/embedding.py +0 -0
  39. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/feature_engineering.py +0 -0
  40. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/misc.py +0 -0
  41. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/outliers.py +0 -0
  42. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/outlook.py +0 -0
  43. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/spatial_autocorrelation.py +0 -0
  44. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/stages.py +0 -0
  45. {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/xai.py +0 -0
  46. {geocif-0.1.48 → geocif-0.1.50}/geocif/playground/__init__.py +0 -0
  47. {geocif-0.1.48 → geocif-0.1.50}/geocif/playground/automl.py +0 -0
  48. {geocif-0.1.48 → geocif-0.1.50}/geocif/playground/misc.py +0 -0
  49. {geocif-0.1.48 → geocif-0.1.50}/geocif/utils.py +0 -0
  50. {geocif-0.1.48 → geocif-0.1.50}/geocif/viz/__init__.py +0 -0
  51. {geocif-0.1.48 → geocif-0.1.50}/geocif/viz/plot.py +0 -0
  52. {geocif-0.1.48 → geocif-0.1.50}/geocif.egg-info/dependency_links.txt +0 -0
  53. {geocif-0.1.48 → geocif-0.1.50}/geocif.egg-info/not-zip-safe +0 -0
  54. {geocif-0.1.48 → geocif-0.1.50}/geocif.egg-info/top_level.txt +0 -0
  55. {geocif-0.1.48 → geocif-0.1.50}/requirements.txt +0 -0
  56. {geocif-0.1.48 → geocif-0.1.50}/setup.cfg +0 -0
  57. {geocif-0.1.48 → geocif-0.1.50}/tests/test_geocif.py +0 -0
{geocif-0.1.48/geocif.egg-info → geocif-0.1.50}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.48
+Version: 0.1.50
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
{geocif-0.1.48 → geocif-0.1.50}/geocif/analysis.py
@@ -163,6 +163,7 @@ class Geoanalysis:
 
         df_metrics = self._compute_metrics(df)
         df_metrics = self._process_metrics(df_metrics)
+
         self._plot_metrics(df_metrics)
 
         df_regional_metrics_by_year = self._compute_regional_metrics(
@@ -183,6 +184,11 @@ class Geoanalysis:
         return df_metrics, df_regional_metrics, df_national_yield
 
     def _clean_data(self):
+        # Hack exclude 2012 if country == "illinois"
+        if self.country == "illinois":
+            self.df_analysis = self.df_analysis[
+                self.df_analysis["Harvest Year"] != 2012
+            ]
         # Remove rows with missing values in Observed Yield (tn per ha)
         return self.df_analysis.dropna(subset=["Observed Yield (tn per ha)"])
 
@@ -196,11 +202,12 @@ class Geoanalysis:
             .reset_index()
         )
 
-        return df_metrics.pivot_table(
-            index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
-            columns="level_5",
-            values=0,
-        ).reset_index()
+        # return df_metrics.pivot_table(
+        #     index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
+        #     columns="level_5",
+        #     values=0,
+        # ).reset_index()
+        return df_metrics
 
     def _process_metrics(self, df_metrics):
         # Assign each unique Stage Name a unique integer identifier
{geocif-0.1.48 → geocif-0.1.50}/geocif/geocif.py
@@ -82,6 +82,13 @@ class Geocif:
         self.today_full = self._date.format("MMMM_DD_YYYY_HH_mm")
 
         self.df_forecast = pd.DataFrame()
+        """
+        ====================================================================
+        Config file: Logging
+        ====================================================================
+        """
+        self.log_level = self.parser.get("LOGGING", "log_level")
+
         """
         ====================================================================
         Config file: Default
@@ -198,9 +205,6 @@ class Geocif:
 
         self.db_path = self.dir_db / self.db_forecasts
 
-        # Store config file in database
-        output.config_to_db(self.db_path, self.parser, self.today)
-
         # self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
         # obj_pickle = outlook.Outlook(self.pickle_file)
         # self.df_outlook = obj_pickle.read_outlook_file()
@@ -221,6 +225,9 @@ class Geocif:
             f"Detrended {self.target}" if self.check_yield_trend else self.target
         )
 
+        # Drop rows where target_col is NaN
+        df_region = df_region.dropna(subset=[target_col])
+
         X_train = df_region[self.feature_names]
         # Drop any columns with NaNs
         X_train = X_train.dropna(axis=1, how="any")
@@ -280,7 +287,7 @@ class Geocif:
             X_train_scaled,
             y_train,
             feature_names=self.selected_features,
-            target_col=self.target,
+            target_col=target_col,
             optimize=self.optimize,
             fraction_loocv=self.fraction_loocv,
             cat_features=self.cat_features,
@@ -302,8 +309,13 @@ class Geocif:
                 verbose=False,
                 # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
             )
-        elif self.model_name == "oblique":
+        elif self.model_name in ["oblique"]:
            self.model.fit(X_train, y_train)
+        elif self.model_name == "ydf":
+            # Combine X_train and y_train
+            df_train = pd.concat([X_train, y_train], axis=1)
+
+            self.model = self.model.train(df_train)
         elif self.model_name == "geospaNN":
             self.model.fit(
                 X_train,
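Note on the new "ydf" training branch above: Yggdrasil Decision Forests learns from a single DataFrame containing both the features and the label column, which is why X_train and y_train are concatenated before train() is called, and train() returns the fitted model rather than mutating the learner in place, hence the reassignment to self.model. A minimal sketch of the pattern (column names are illustrative, not geocif's):

import pandas as pd
import ydf

# Features and target live in one frame; ydf picks the label out by column name
df_train = pd.DataFrame(
    {"ndvi": [0.4, 0.5, 0.6], "rain": [80, 120, 95], "yield": [2.1, 2.8, 2.5]}
)

learner = ydf.GradientBoostedTreesLearner(label="yield", task=ydf.Task.REGRESSION)
model = learner.train(df_train)  # train() returns the model; the learner is unchanged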
@@ -335,9 +347,16 @@ class Geocif:
         ]:
             from sklearn.preprocessing import StandardScaler, LabelEncoder
 
+            if self.model_name == "cumulative_1":
+                num_columns = 1
+            elif self.model_name == "cumulative_2":
+                num_columns = 2
+            elif self.model_name == "cumulative_3":
+                num_columns = 3
+
             # Standardize the numeric features
             scaler = StandardScaler()
-            X_numeric = X_train.iloc[:, :3]
+            X_numeric = X_train.iloc[:, :num_columns]
             X_scaled_numeric = pd.DataFrame(
                 scaler.fit_transform(X_numeric),
                 columns=X_numeric.columns,
@@ -409,9 +428,16 @@ class Geocif:
         elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
             from sklearn.preprocessing import StandardScaler, LabelEncoder
 
+            if self.model_name == "cumulative_1":
+                num_columns = 1
+            elif self.model_name == "cumulative_2":
+                num_columns = 2
+            elif self.model_name == "cumulative_3":
+                num_columns = 3
+
             # Standardize the numeric features
             scaler = StandardScaler()
-            X_numeric = X_test.iloc[:, :3]
+            X_numeric = X_test.iloc[:, :num_columns]
             try:
                 X_scaled_numeric = pd.DataFrame(
                     scaler.fit_transform(X_numeric),
@@ -455,7 +481,9 @@ class Geocif:
                 self.selected_features + self.cat_features + [self.target]
             ]
             w_train = data_train.y - self.estimate(data_train.x)
-
+        elif self.model_name == "ydf":
+            y_pred = self.model.evaluate(X_test)
+            best_hyperparameters = {}
         else:
             y_pred = self.model.predict(X_test)
             best_hyperparameters = self.model.get_params().copy()
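A note on the "ydf" prediction branch above: in Yggdrasil Decision Forests, predict() returns per-row predictions while evaluate() returns an evaluation report of aggregate metrics, as the new geocif/ml/aa.py file later in this diff also illustrates. A small sketch of the distinction on toy data:

import pandas as pd
import ydf

df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0], "y": [1.1, 1.9, 3.2, 3.9]})
model = ydf.GradientBoostedTreesLearner(label="y", task=ydf.Task.REGRESSION).train(df)

preds = model.predict(df)    # array of per-row predictions
report = model.evaluate(df)  # aggregate metrics (e.g. RMSE), not predictions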
@@ -468,8 +496,8 @@ class Geocif:
 
                 obj_trend = trend.DetrendedData(
                     df_tmp[f"Detrended {self.target}"],
-                    df_tmp["Detrend Model"],
-                    df_tmp["Detrend Model Type"],
+                    df_tmp["Detrended Model"],
+                    df_tmp["Detrended Model Type"],
                 )
 
                 # Retrend the predicted yield
@@ -477,6 +505,8 @@ class Geocif:
                     obj_trend, df_region.iloc[idx][["Harvest Year"]]
                 )[0]
 
+                df_region.loc[idx, "Detrended Model Type"] = obj_trend.model_type.unique()[0]
+
         # Create a dataframe with forecast results
         shp = len(X_test)
         experiment_id = f"{self.country}_{self.crop}"
@@ -530,7 +560,6 @@ class Geocif:
 
         if self.check_yield_trend:
             df.loc[:, "Detrended Model Type"] = df_region["Detrended Model Type"].values
-            df.loc[:, "Detrended Model"] = df_region["Detrended Model"].values
 
         if self.last_year_yield_as_feature:
             # Add last year yield to dataframe
@@ -729,7 +758,7 @@ class Geocif:
             + ["Region_ID"]
         )
         if self.check_yield_trend:
-            common_columns += ["Detrended Model Type", "Detrended Model"]
+            common_columns += [f"Detrended {self.target}", "Detrended Model Type", "Detrended Model"]
 
         if self.last_year_yield_as_feature:
             common_columns += [f"Last Year {self.target}"]
@@ -738,11 +767,15 @@ class Geocif:
         # Filter dataframe based on region and self.feature_names
         df_region_train = self.df_train[mask_train]
         df_region_train = df_region_train[self.fixed_columns + common_columns]
+        df_region_train.reset_index(drop=True, inplace=True)
         self.train(df_region_train, scaler)
 
         """ Predict """
+        if self.check_yield_trend:
+            common_columns = common_columns[:-3]
         df_region_test = self.df_test[mask_test]
         df_region_test = df_region_test[self.fixed_columns + common_columns]
+        df_region_test.reset_index(drop=True, inplace=True)
         experiment_id, df = self.predict(df_region_test, scaler)
         # df.reset_index(inplace=True)
 
@@ -849,12 +882,15 @@ class Geocif:
                 group.columns.str.contains(self.stage_info["Stage_ID"])
             ].tolist()
 
-            group = group[
-                self.fixed_columns
-                + [self.target]
-                + self.statistics_columns
-                + all_columns
-            ]
+            try:
+                group = group[
+                    self.fixed_columns
+                    + [self.target]
+                    + self.statistics_columns
+                    + all_columns
+                ]
+            except:
+                continue
             # rename all_columns to self.stage_info["CEI"]
             group.rename(
                 columns={
@@ -897,12 +933,12 @@ class Geocif:
 
         if self.lag_yield_as_feature:
             df = fe.compute_lag_yield(
-                df, self.all_seasons_with_yield, self.number_lag_years
+                df, self.all_seasons_with_yield, self.number_lag_years, self.target
             )
 
         if self.analogous_year_yield_as_feature:
             df = fe.compute_analogous_yield(
-                df, self.all_seasons_with_yield, self.number_median_years
+                df, self.all_seasons_with_yield, self.number_median_years, self.target
             )
 
         # Create Region_ID column based on Region column category code
@@ -912,7 +948,7 @@ class Geocif:
         elif self.cluster_strategy == "individual":
             df["Region_ID"] = df["Region"].cat.codes
         elif self.cluster_strategy == "auto_detect":
-            clusters_assigned = fe.detect_clusters(df)
+            clusters_assigned = fe.detect_clusters(df, self.target)
             # Merge the cluster labels with the original DataFrame
             df = df.merge(clusters_assigned, on="Region")
 
@@ -1036,8 +1072,8 @@ class Geocif:
 
         """ Groupby Region column and compute detrended yield """
         self.df_train[f"Detrended {self.target}"] = np.NaN
-        self.df_train["Detrend Model"] = np.NaN
-        self.df_train["Detrend Model Type"] = np.NaN
+        self.df_train["Detrended Model"] = np.NaN
+        self.df_train["Detrended Model Type"] = np.NaN
         if self.check_yield_trend:
             group_by = ["Region"]
             groups = self.df_train.groupby(group_by)
@@ -1050,10 +1086,10 @@ class Geocif:
                     group.index, f"Detrended {self.target}"
                 ] = detrended_data.detrended_series
                 self.df_train.loc[
-                    group.index, "Detrend Model"
+                    group.index, "Detrended Model"
                 ] = detrended_data.trend_model
                 self.df_train.loc[
-                    group.index, "Detrend Model Type"
+                    group.index, "Detrended Model Type"
                 ] = detrended_data.model_type
 
         # 6. Exclude years without yields from df_train
@@ -1118,10 +1154,19 @@ class Geocif:
             self.do_xai = False
             self.estimate_ci = False
             self.estimate_ci_for_all = False
-            self.check_yield_trend = False
+            self.check_yield_trend = True
             self.cluster_strategy = "single"
             self.select_cei_by = "Index"
             self.use_cumulative_features = True
+        elif self.model_name in ["oblique", "ydf"]:
+            self.do_xai = False
+            self.estimate_ci = False
+            # Remove Region from cat_features as it is object type
+            self.cat_features = [col for col in self.cat_features if col != "Region"]
+            # if self.model_name == "ydf":
+            #     # HACK, for ydf model, target_col is Yield
+            #     self.df_results.rename(columns={self.target: "Yield"}, inplace=True)
+            #     self.target = "Yield"
         else:
             self.do_xai = self.parser.getboolean("ML", "do_xai")
             self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
@@ -1188,6 +1233,9 @@ class Geocif:
             self.dg["Country Region"] = (
                 self.dg["ADM0_NAME"] + " " + self.dg["ADM1_NAME"]
             )
+        elif self.country == "illinois":
+            self.dg["ADM0_NAME"] = "illinois"
+            self.dg["Country Region"] = self.dg["ADM0_NAME"] + " " + self.dg["NAME"]
         else:
             self.dg["Country Region"] = (
                 self.dg["ADM0_NAME"] + " " + self.dg["ADM2_NAME"]
@@ -1240,6 +1288,9 @@ class Geocif:
         # TODO ignore file with _2000 in its name
         all_files = [f for f in all_files if "_2000" not in f.name]
 
+        # Assert that all_files is not empty
+        assert all_files, f"No files found in {_dir_country} with {file_name}"
+
         self.df_results = pd.concat(
             (pd.read_csv(f) for f in all_files), ignore_index=True
         )
@@ -1284,7 +1335,7 @@ def loop_execute(inputs):
         )
 
         with PyCallGraph(output=graphviz, config=config):
-            country, crop, season, model, logger, parser = inputs
+            country, crop, season, model, logger, parser, index = inputs
 
             logger.info("=====================================================")
             logger.info(f"\tStarting GEOCIF: {country} {crop} {season} {model}")
@@ -1293,6 +1344,11 @@ def loop_execute(inputs):
             obj = Geocif(logger=logger, parser=parser)
             obj.read_data(country, crop, season)
 
+            # Store config file in database, only execute this for
+            # the first iteration of the loop
+            if index == 0:
+                output.config_to_db(obj.db_path, obj.parser, obj.today)
+
             # Setup metadata and run ML code
             obj.setup(season, model)
             if obj.simulation_stages:
@@ -1336,7 +1392,7 @@ def execute_models(inputs, logger, parser):
     do_parallel = parser.getboolean("DEFAULT", "do_parallel")
 
     # Add logger and parser to each element in inputs
-    inputs = [item + [logger, parser] for item in inputs]
+    inputs = [item + [logger, parser, idx] for idx, item in enumerate(inputs)]
 
     if do_parallel:
         cpu_count = int(mp.cpu_count() * 0.3)
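Taken together, the three hunks above thread a per-task index through loop_execute so that config_to_db runs exactly once even when tasks are dispatched in parallel. A stripped-down sketch of the pattern (names simplified):

import multiprocessing as mp

def worker(task):
    country, crop, index = task
    if index == 0:
        # stands in for output.config_to_db(...): runs in exactly one task
        print("storing config once")
    print(f"running {country} {crop}")

if __name__ == "__main__":
    tasks = [["usa", "maize"], ["ukraine", "wheat"]]
    # same enumerate trick as in execute_models above
    tasks = [t + [idx] for idx, t in enumerate(tasks)]
    with mp.Pool(2) as pool:
        pool.map(worker, tasks)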
{geocif-0.1.48 → geocif-0.1.50}/geocif/logger.py
@@ -71,8 +71,29 @@ class Logger:
         self.logger.error(msg)
 
 
+def get_logging_level(level):
+    """
+
+    Args:
+        level:
+
+    Returns:
+
+    """
+    if level == "DEBUG":
+        return logging.DEBUG
+    elif level == "INFO":
+        return logging.INFO
+    elif level == "WARNING":
+        return logging.WARNING
+    elif level == "ERROR":
+        return logging.ERROR
+    else:
+        return logging.INFO
+
+
 def setup_logger_parser(
-    path_config_file, name_project="geocif", name_file="ml", level=logging.DEBUG
+    path_config_file, name_project="geocif", name_file="ml"
 ):
     """
 
@@ -87,6 +108,8 @@ def setup_logger_parser(
     """
     parser = read_config(path_config_file)
     dir_log = parser.get("PATHS", "dir_log")
+    level = parser.get("LOGGING", "log_level")
+    level = get_logging_level(level)
 
     logger = Logger(
         dir_log=dir_log,
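With this change the log level comes from a [LOGGING] section of the config file instead of a hard-coded logging.DEBUG. A minimal sketch of the expected configuration and lookup (the section and key names come from the diff; the inline config text is illustrative). The stdlib getattr(logging, name, default) is a compact equivalent of get_logging_level:

import configparser
import logging

config_text = """
[LOGGING]
log_level = INFO
"""

parser = configparser.ConfigParser()
parser.read_string(config_text)

level_name = parser.get("LOGGING", "log_level")
level = getattr(logging, level_name.upper(), logging.INFO)  # compact equivalent
logging.basicConfig(level=level)
logging.getLogger("geocif").info("logger configured at %s", level_name)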
geocif-0.1.50/geocif/ml/aa.py (new file)
@@ -0,0 +1,28 @@
+import ydf
+import pandas as pd
+
+# Load dataset with Pandas
+ds_path = "https://raw.githubusercontent.com/google/yggdrasil-decision-forests/main/yggdrasil_decision_forests/test_data/dataset/"
+train_ds = pd.read_csv(ds_path + "adult_train.csv")
+test_ds = pd.read_csv(ds_path + "adult_test.csv")
+
+# Train a Gradient Boosted Trees model
+model = ydf.GradientBoostedTreesLearner(label="income").train(train_ds)
+
+# Look at a model (input features, training logs, structure, etc.)
+model.describe()
+
+# Evaluate a model (e.g. roc, accuracy, confusion matrix, confidence intervals)
+model.evaluate(test_ds)
+
+# Generate predictions
+model.predict(test_ds)
+
+# Analyse a model (e.g. partial dependence plot, variable importance)
+model.analyze(test_ds)
+
+# Benchmark the inference speed of a model
+model.benchmark(test_ds)
+
+# Save the model
+model.save("/tmp/my_model")
{geocif-0.1.48 → geocif-0.1.50}/geocif/ml/feature_selection.py
@@ -131,6 +131,16 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
 
         X_filtered = selector.fit_transform(X, y)
         selected_features = X_filtered.columns.tolist()
+    elif method == "mrmr":
+        from mrmr import mrmr_regression
+
+        try:
+            selected_features = mrmr_regression(X=X, y=y, K=10)
+        except:
+            breakpoint()
+        # combine X and y into a dataframe
+        # df = pd.concat([X, y], axis=1)
+
     elif method == "RFECV":
         from sklearn.feature_selection import RFECV
         from sklearn.model_selection import KFold
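The new "mrmr" branch relies on the mrmr_selection package, whose mrmr_regression ranks features by minimum-redundancy-maximum-relevance and returns the names of the top K columns. A self-contained sketch on synthetic data (assumes the package is installed):

import numpy as np
import pandas as pd
from mrmr import mrmr_regression  # pip install mrmr_selection

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 8)), columns=[f"f{i}" for i in range(8)])
y = pd.Series(2 * X["f0"] - X["f3"] + rng.normal(scale=0.1, size=200))

# Returns a list of K column names, ordered by mRMR relevance
selected = mrmr_regression(X=X, y=y, K=3)
print(selected)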
{geocif-0.1.48 → geocif-0.1.50}/geocif/ml/output.py
@@ -109,21 +109,21 @@ def store(db_path, experiment_id, df, model, model_name):
     except Exception as e:
         print(f"Error: {e}")
 
-    index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
-    # Output model pickle as a blob to database
-    df_model = pd.DataFrame(
-        {
-            "Experiment_ID": [experiment_id],
-            "Model": [model_name],
-            "Model_Blob": [pickle.dumps(model)],
-        }
-    )
-    # df_model.index = df_model.apply(
-    #     lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
-    # )
-
     # name the index level
     try:
+        index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
+        # Output model pickle as a blob to database
+        df_model = pd.DataFrame(
+            {
+                "Experiment_ID": [experiment_id],
+                "Model": [model_name],
+                "Model_Blob": [pickle.dumps(model)],
+            }
+        )
+        # df_model.index = df_model.apply(
+        #     lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
+        # )
+
         df_model.index.set_names(["Index"], inplace=True)
         utils.to_db(db_path, "models", df_model)
     except Exception as e:
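For context on the block above: the trained model is serialized with pickle.dumps and stored as a binary blob column, now inside the try so that a failure in building or writing the frame is caught in one place. A standalone sketch of the same round-trip using only the standard library and pandas (table and column names mirror the diff, but the in-memory database is illustrative):

import pickle
import sqlite3
import pandas as pd

model = {"weights": [0.2, 0.8]}  # stand-in for a trained model object

df_model = pd.DataFrame(
    {"Experiment_ID": ["demo"], "Model": ["toy"], "Model_Blob": [pickle.dumps(model)]}
)

with sqlite3.connect(":memory:") as con:
    df_model.to_sql("models", con, index=False)
    blob = con.execute("SELECT Model_Blob FROM models").fetchone()[0]
    print(pickle.loads(blob))  # round-trips to the original object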
{geocif-0.1.48 → geocif-0.1.50}/geocif/ml/stats.py
@@ -191,7 +191,12 @@ def add_statistics(
 
     """
     # First check if country and crop are in the admin_crop_production.csv file
-    fn = "afghanistan.csv" if country == "Afghanistan" else "adm_crop_production.csv"
+    if country == "Afghanistan":
+        fn = "afghanistan.csv"
+    elif country == "Illinois":
+        fn = "illinois.csv"
+    else:
+        fn = "adm_crop_production.csv"
     df_fewsnet = pd.read_csv(dir_stats / fn, low_memory=False)
 
     # HACK
@@ -206,6 +211,7 @@ def add_statistics(
         df = add_GEOGLAM_statistics(dir_stats, df, stats, method, admin_zone)
     else:
         group_by = ["Region", "Harvest Year"]
+
         groups = df.groupby(group_by)
 
     # Define processing for each group
{geocif-0.1.48 → geocif-0.1.50}/geocif/ml/trainers.py
@@ -264,8 +264,7 @@ def auto_train(
     if model_name in ["catboost", "merf"]:
         hyperparams = {
             "depth": 6,
-            "learning_rate": 0.01,
-            "iterations": 5000,
+
             "subsample": 1.0,
             "random_strength": 0.5,
             "reg_lambda": 0.001,
@@ -283,18 +282,33 @@ def auto_train(
         regr = CatBoostRegressor(**hyperparams, cat_features=cat_features)
         model = MERF(regr, max_iterations=10)
     elif model_name == "oblique":
-        breakpoint()
-        from sktree.ensemble import ObliqueRandomForestRegressor
+        from treeple import ExtraObliqueRandomForestRegressor
+
+        # https://docs.neurodata.io/treeple/dev/modules/supervised_tree.html#oblique-trees
+        n_features = X_train.shape[1]
 
-        print("Training ObliqueRandomForestRegressor")
-        model = ObliqueRandomForestRegressor(
-            n_estimators=500,
-            max_depth=7,
+        model = ExtraObliqueRandomForestRegressor(
+            n_estimators=1500,
+            max_depth=20,
+            max_features=n_features**2,
+            feature_combinations=n_features,
             n_jobs=-1,
-            verbose=2,
             random_state=42,
         )
-        print("Finished training ObliqueRandomForestRegressor")
+    elif model_name == "ydf":
+        import ydf
+        templates = ydf.GradientBoostedTreesLearner.hyperparameter_templates()
+
+        model = ydf.GradientBoostedTreesLearner(
+            label=target_col,
+            task=ydf.Task.REGRESSION,
+            growing_strategy='BEST_FIRST_GLOBAL',
+            categorical_algorithm='RANDOM',
+            split_axis='SPARSE_OBLIQUE',
+            sparse_oblique_normalization='MIN_MAX',
+            sparse_oblique_num_projections_exponent=2.0)
+
+        hyperparams = templates["benchmark_rank1v1"]
     elif model_name == "linear":
         from sklearn.linear_model import LassoCV
 
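For context on the "oblique" branch: the code moves from sktree to its successor treeple, whose oblique forests split on sparse linear combinations of features, so max_features may legitimately exceed the raw feature count (it bounds the candidate projections per split, while feature_combinations controls how many features are mixed into each projection). A minimal fit/predict sketch on synthetic data, assuming treeple is installed:

import numpy as np
from treeple import ExtraObliqueRandomForestRegressor

rng = np.random.default_rng(42)
X = rng.normal(size=(300, 5))
y = X @ np.array([1.0, -2.0, 0.5, 0.0, 0.0]) + rng.normal(scale=0.1, size=300)

n_features = X.shape[1]
model = ExtraObliqueRandomForestRegressor(
    n_estimators=200,
    max_features=n_features**2,       # candidate oblique projections per split
    feature_combinations=n_features,  # features mixed into each projection
    n_jobs=-1,
    random_state=42,
)
model.fit(X, y)
print(model.predict(X[:3]))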
@@ -308,24 +322,15 @@ def auto_train(
     elif model_name == "cumulative_1":
         from pygam import GAM, s, f, te
 
-        # compute index of column Region
-        region_idx = X_train.columns.get_loc("Region")
-
-        model = GAM(s(0) + f(region_idx))
+        model = GAM(s(0) + f(1))
     elif model_name == "cumulative_2":
         from pygam import GAM, s, f, te
 
-        # compute index of column Region
-        region_idx = X_train.columns.get_loc("Region")
-
-        model = GAM(s(0) + s(1) + te(0, 1) + f(region_idx))
+        model = GAM(s(0) + s(1) + te(0, 1) + f(2))
     elif model_name == "cumulative_3":
         from pygam import GAM, s, f, te
 
-        # compute index of column Region
-        region_idx = X_train.columns.get_loc("Region")
-
-        model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(region_idx))
+        model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(3))
     elif model_name == "geospaNN":
         import torch
         import geospaNN
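In pygam, s(i) is a spline term on column i, f(i) a categorical factor term, and te(i, j) a tensor-product interaction. The hard-coded f(1), f(2), f(3) works because, after the num_columns change earlier in this diff, the encoded Region column always sits immediately after the one to three cumulative numeric columns. A small sketch of the cumulative_2 shape on synthetic data (regions label-encoded as integers):

import numpy as np
from pygam import GAM, s, f, te

rng = np.random.default_rng(0)
n = 300
X = np.column_stack([
    rng.normal(size=n),          # column 0: first cumulative feature (spline)
    rng.normal(size=n),          # column 1: second cumulative feature (spline)
    rng.integers(0, 4, size=n),  # column 2: label-encoded Region (factor)
])
y = np.sin(X[:, 0]) + 0.5 * X[:, 1] + 0.3 * X[:, 2] + rng.normal(scale=0.1, size=n)

gam = GAM(s(0) + s(1) + te(0, 1) + f(2)).fit(X, y)
print(gam.predict(X[:3]))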
{geocif-0.1.48 → geocif-0.1.50}/geocif/ml/trend.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 from statsmodels.regression.linear_model import OLS
 from statsmodels.tools.tools import add_constant
 
@@ -6,7 +7,7 @@ from statsmodels.tools.tools import add_constant
 class DetrendedData:
     """
     A class to store the detrended series, the model used for detrending,
-    and the type of model ('mean', 'linear', 'quadratic').
+    and the type of model ('mean', 'linear', 'quadratic', 'difference').
     """
 
     def __init__(self, detrended_series, trend_model, model_type):
@@ -15,14 +16,16 @@ class DetrendedData:
         self.model_type = model_type
 
 
-def detrend_dataframe(df, column_name="y"):
+def detrend_dataframe(df, column_name="y", model_type="best"):
     """
-    Removes the trend from the specified column of a DataFrame using the method
-    (mean, linear, quadratic) that results in the lowest AIC value.
+    Removes the trend from the specified column of a DataFrame using the specified method
+    (mean, linear, quadratic, difference) or the method that results in the lowest AIC value.
 
     Parameters:
     - df: pandas DataFrame containing the time series data.
     - column_name: string name of the column to detrend.
+    - model_type: string specifying which model to use for detrending ('mean', 'linear',
+      'quadratic', 'difference', or 'best' for automatic selection based on AIC).
 
     Returns:
     - DetrendedData object containing the detrended series, the statistical model,
@@ -41,16 +44,32 @@ def detrend_dataframe(df, column_name="y"):
     X_quad = add_constant(np.column_stack((df["t"], df["t"] ** 2)))
     quad_model = OLS(df[column_name], X_quad).fit()
 
-    models = {"mean": mean_model, "linear": linear_model, "quadratic": quad_model}
-    best_model_type = min(models, key=lambda x: models[x].aic)
+    # Differencing method
+    diff_series = df[column_name].diff().dropna()
+    diff_model = OLS(diff_series, np.ones(len(diff_series))).fit()
+
+    models = {
+        "mean": mean_model,
+        "linear": linear_model,
+        "quadratic": quad_model,
+        "difference": diff_model
+    }
+
+    if model_type == "best":
+        best_model_type = min(models, key=lambda x: models[x].aic)
+    else:
+        best_model_type = model_type
+
     best_model = models[best_model_type]
 
     if best_model_type == "mean":
         detrended = df[column_name] - mean_model.predict(np.ones(len(df)))
     elif best_model_type == "linear":
         detrended = df[column_name] - linear_model.predict(X_linear)
-    else:  # quadratic
+    elif best_model_type == "quadratic":
         detrended = df[column_name] - quad_model.predict(X_quad)
+    else:  # difference
+        detrended = df[column_name].diff().dropna()
 
     return DetrendedData(detrended, best_model, best_model_type)
 
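To see what the new 'difference' option and the AIC-based selection do in practice, here is a usage sketch of detrend_dataframe on a toy yield series. It assumes, as the function body implies, that the frame carries a numeric time column t alongside the value column:

import numpy as np
import pandas as pd
from geocif.ml.trend import detrend_dataframe

rng = np.random.default_rng(1)
years = np.arange(2000, 2020)
df = pd.DataFrame({
    "t": years - years[0],  # numeric time index used by the OLS fits
    "y": 2.0 + 0.05 * (years - years[0]) + rng.normal(scale=0.05, size=years.size),
})

best = detrend_dataframe(df, column_name="y")  # lowest-AIC model wins
diffed = detrend_dataframe(df, column_name="y", model_type="difference")
print(best.model_type)                 # likely "linear" for this series
print(diffed.detrended_series.head())  # year-over-year differences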
@@ -67,11 +86,10 @@ def compute_trend(detrended_data, future_time_points=None):
     Returns:
     - The retrended series as a pandas Series.
     """
-    # if future_time_points is not of type pandas dataframe then convert it to one
     future_time_points = np.array(future_time_points)
 
-    model_type = detrended_data.model_type[0]
-    model = detrended_data.trend_model[0]
+    model_type = detrended_data.model_type.unique()[0]
+    model = detrended_data.trend_model.unique()[0]
 
     if model_type == "mean":
         trend_component = model.predict(
@@ -80,11 +98,14 @@ def compute_trend(detrended_data, future_time_points=None):
     elif model_type == "linear":
         X_linear = add_constant(future_time_points, has_constant="add")
         trend_component = model.predict(X_linear)
-    else:  # quadratic
+    elif model_type == "quadratic":
         X_quad = add_constant(
             np.column_stack((future_time_points, future_time_points**2)),
             has_constant="add",
         )
         trend_component = model.predict(X_quad)
+    else:  # difference
+        trend_component = pd.Series(np.nan, index=future_time_points)
+        trend_component.iloc[0] = model.params[0]  # Add mean of differenced series
 
     return trend_component
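For orientation: compute_trend returns only the trend component evaluated at the requested time points; the caller (the "Retrend the predicted yield" hunk in geocif.py above) combines it with the detrended prediction to recover a value in original units, conventionally by addition. Schematically, with illustrative variable names:

# trend at the forecast year, as called from geocif.py's predict path
trend_at_year = trend.compute_trend(obj_trend, df_region.iloc[idx][["Harvest Year"]])[0]
retrended_yield = detrended_prediction + trend_at_year  # illustrative combination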
geocif-0.1.50/geocif/viz/misc.py (new file)
@@ -0,0 +1,55 @@
+import pandas as pd
+import hvplot.pandas
+import panel as pn
+
+# Load the CSV file
+file_path = r'D:\Users\ritvik\projects\GEOGLAM\Output\fao\regional_cei_slope.csv'
+data = pd.read_csv(file_path)
+
+# Extract unique values for dropdowns
+countries = data['Country'].unique().tolist()
+
+# Create dropdown widgets
+country_dropdown = pn.widgets.Select(name='Country', options=countries)
+region_dropdown = pn.widgets.Select(name='Region', options=[])
+crop_dropdown = pn.widgets.Select(name='Crop', options=[])
+season_dropdown = pn.widgets.Select(name='Season', options=data['Season'].unique().tolist())
+
+
+# Function to update region and crop options based on selected country
+@pn.depends(country_dropdown.param.value, watch=True)
+def update_region_and_crop_options(country):
+    filtered_data = data[data['Country'] == country]
+    regions = filtered_data['Region'].unique().tolist()
+    crops = filtered_data['Crop'].unique().tolist()
+
+    region_dropdown.options = regions
+    crop_dropdown.options = crops
+
+
+# Function to filter data based on dropdown selections
+@pn.depends(country_dropdown.param.value, region_dropdown.param.value, crop_dropdown.param.value,
+            season_dropdown.param.value)
+def update_plot(country, region, crop, season):
+    filtered_data = data[(data['Country'] == country) &
+                         (data['Region'] == region) &
+                         (data['Crop'] == crop) &
+                         (data['Season'] == season)]
+
+    if not filtered_data.empty:
+        plot = filtered_data.hvplot.scatter(x='Slope', y='Intercept',
+                                            hover_cols=['Growth Stage', 'p-value', 'Index', 'Description'])
+        return plot
+    else:
+        return pn.pane.Markdown("No data available for the selected combination.")
+
+
+# Create the dashboard
+dashboard = pn.Column(
+    pn.Row(country_dropdown, region_dropdown, crop_dropdown, season_dropdown),
+    update_plot
+)
+
+# Save as html page
+dashboard.save('dashboard.html', embed=True)
+
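Note that dashboard.save('dashboard.html', embed=True) exports a static snapshot with sampled widget states baked into the HTML. For a live version backed by a running Python process, Panel's standard options apply, e.g.:

dashboard.show()       # launch a local server from a script or REPL
dashboard.servable()   # or mark it servable and run `panel serve misc.py` from a shell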
{geocif-0.1.48 → geocif-0.1.50/geocif.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.48
+Version: 0.1.50
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
{geocif-0.1.48 → geocif-0.1.50}/geocif.egg-info/SOURCES.txt
@@ -32,6 +32,7 @@ geocif/cei/__init__.py
 geocif/cei/definitions.py
 geocif/cei/indices.py
 geocif/ml/__init__.py
+geocif/ml/aa.py
 geocif/ml/correlations.py
 geocif/ml/embedding.py
 geocif/ml/feature_engineering.py
@@ -50,5 +51,6 @@ geocif/playground/__init__.py
 geocif/playground/automl.py
 geocif/playground/misc.py
 geocif/viz/__init__.py
+geocif/viz/misc.py
 geocif/viz/plot.py
 tests/test_geocif.py
{geocif-0.1.48 → geocif-0.1.50}/setup.py
@@ -50,6 +50,6 @@ setup(
     test_suite="tests",
     tests_require=test_requirements,
     url="https://ritviksahajpal.github.io/yield_forecasting/",
-    version="0.1.48",
+    version="0.1.50",
     zip_safe=False,
 )