geocif 0.1.47__tar.gz → 0.1.49__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. {geocif-0.1.47/geocif.egg-info → geocif-0.1.49}/PKG-INFO +1 -1
  2. {geocif-0.1.47 → geocif-0.1.49}/geocif/geocif.py +64 -20
  3. geocif-0.1.49/geocif/ml/aa.py +28 -0
  4. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/feature_selection.py +10 -0
  5. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/output.py +13 -13
  6. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/stats.py +7 -1
  7. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/trainers.py +37 -17
  8. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/trend.py +32 -11
  9. geocif-0.1.49/geocif/viz/misc.py +55 -0
  10. {geocif-0.1.47 → geocif-0.1.49/geocif.egg-info}/PKG-INFO +1 -1
  11. {geocif-0.1.47 → geocif-0.1.49}/geocif.egg-info/SOURCES.txt +2 -0
  12. {geocif-0.1.47 → geocif-0.1.49}/setup.py +1 -1
  13. {geocif-0.1.47 → geocif-0.1.49}/LICENSE +0 -0
  14. {geocif-0.1.47 → geocif-0.1.49}/MANIFEST.in +0 -0
  15. {geocif-0.1.47 → geocif-0.1.49}/README.md +0 -0
  16. {geocif-0.1.47 → geocif-0.1.49}/geocif/__init__.py +0 -0
  17. {geocif-0.1.47 → geocif-0.1.49}/geocif/agmet/__init__.py +0 -0
  18. {geocif-0.1.47 → geocif-0.1.49}/geocif/agmet/geoagmet.py +0 -0
  19. {geocif-0.1.47 → geocif-0.1.49}/geocif/agmet/plot.py +0 -0
  20. {geocif-0.1.47 → geocif-0.1.49}/geocif/agmet/utils.py +0 -0
  21. {geocif-0.1.47 → geocif-0.1.49}/geocif/analysis.py +0 -0
  22. {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/__init__.py +0 -0
  23. {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/constants.py +0 -0
  24. {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/features.py +0 -0
  25. {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/geo.py +0 -0
  26. {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/geocif.py +0 -0
  27. {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/metadata.py +0 -0
  28. {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/models.py +0 -0
  29. {geocif-0.1.47 → geocif-0.1.49}/geocif/cei/__init__.py +0 -0
  30. {geocif-0.1.47 → geocif-0.1.49}/geocif/cei/definitions.py +0 -0
  31. {geocif-0.1.47 → geocif-0.1.49}/geocif/cei/indices.py +0 -0
  32. {geocif-0.1.47 → geocif-0.1.49}/geocif/experiments.py +0 -0
  33. {geocif-0.1.47 → geocif-0.1.49}/geocif/indices_runner.py +0 -0
  34. {geocif-0.1.47 → geocif-0.1.49}/geocif/indices_runner_v2.py +0 -0
  35. {geocif-0.1.47 → geocif-0.1.49}/geocif/logger.py +0 -0
  36. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/__init__.py +0 -0
  37. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/correlations.py +0 -0
  38. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/embedding.py +0 -0
  39. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/feature_engineering.py +0 -0
  40. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/misc.py +0 -0
  41. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/outliers.py +0 -0
  42. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/outlook.py +0 -0
  43. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/spatial_autocorrelation.py +0 -0
  44. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/stages.py +0 -0
  45. {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/xai.py +0 -0
  46. {geocif-0.1.47 → geocif-0.1.49}/geocif/playground/__init__.py +0 -0
  47. {geocif-0.1.47 → geocif-0.1.49}/geocif/playground/automl.py +0 -0
  48. {geocif-0.1.47 → geocif-0.1.49}/geocif/playground/misc.py +0 -0
  49. {geocif-0.1.47 → geocif-0.1.49}/geocif/utils.py +0 -0
  50. {geocif-0.1.47 → geocif-0.1.49}/geocif/viz/__init__.py +0 -0
  51. {geocif-0.1.47 → geocif-0.1.49}/geocif/viz/plot.py +0 -0
  52. {geocif-0.1.47 → geocif-0.1.49}/geocif.egg-info/dependency_links.txt +0 -0
  53. {geocif-0.1.47 → geocif-0.1.49}/geocif.egg-info/not-zip-safe +0 -0
  54. {geocif-0.1.47 → geocif-0.1.49}/geocif.egg-info/top_level.txt +0 -0
  55. {geocif-0.1.47 → geocif-0.1.49}/requirements.txt +0 -0
  56. {geocif-0.1.47 → geocif-0.1.49}/setup.cfg +0 -0
  57. {geocif-0.1.47 → geocif-0.1.49}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.1.47
3
+ Version: 0.1.49
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -11,7 +11,6 @@ import geopandas as gp
11
11
  import matplotlib.pyplot as plt
12
12
  import numpy as np
13
13
  import pandas as pd
14
- import sklearn
15
14
  from tqdm import tqdm
16
15
 
17
16
  from geocif import logger as log
@@ -28,7 +27,6 @@ from .ml import trend
28
27
  from .ml import xai
29
28
 
30
29
  plt.style.use("default")
31
- sklearn.set_config(transform_output="pandas")
32
30
 
33
31
  import warnings
34
32
 
@@ -200,9 +198,6 @@ class Geocif:
200
198
 
201
199
  self.db_path = self.dir_db / self.db_forecasts
202
200
 
203
- # Store config file in database
204
- output.config_to_db(self.db_path, self.parser, self.today)
205
-
206
201
  # self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
207
202
  # obj_pickle = outlook.Outlook(self.pickle_file)
208
203
  # self.df_outlook = obj_pickle.read_outlook_file()
@@ -223,6 +218,9 @@ class Geocif:
223
218
  f"Detrended {self.target}" if self.check_yield_trend else self.target
224
219
  )
225
220
 
221
+ # Drop rows where target_col is NaN
222
+ df_region = df_region.dropna(subset=[target_col])
223
+
226
224
  X_train = df_region[self.feature_names]
227
225
  # Drop any columns with NaNs
228
226
  X_train = X_train.dropna(axis=1, how="any")
@@ -282,7 +280,7 @@ class Geocif:
282
280
  X_train_scaled,
283
281
  y_train,
284
282
  feature_names=self.selected_features,
285
- target_col=self.target,
283
+ target_col=target_col,
286
284
  optimize=self.optimize,
287
285
  fraction_loocv=self.fraction_loocv,
288
286
  cat_features=self.cat_features,
@@ -304,6 +302,13 @@ class Geocif:
304
302
  verbose=False,
305
303
  # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
306
304
  )
305
+ elif self.model_name in ["oblique"]:
306
+ self.model.fit(X_train, y_train)
307
+ elif self.model_name == "ydf":
308
+ # Combine X_train and y_train
309
+ df_train = pd.concat([X_train, y_train], axis=1)
310
+
311
+ self.model = self.model.train(df_train)
307
312
  elif self.model_name == "geospaNN":
308
313
  self.model.fit(
309
314
  X_train,
@@ -335,9 +340,16 @@ class Geocif:
335
340
  ]:
336
341
  from sklearn.preprocessing import StandardScaler, LabelEncoder
337
342
 
343
+ if self.model_name == "cumulative_1":
344
+ num_columns = 1
345
+ elif self.model_name == "cumulative_2":
346
+ num_columns = 2
347
+ elif self.model_name == "cumulative_3":
348
+ num_columns = 3
349
+
338
350
  # Standardize the numeric features
339
351
  scaler = StandardScaler()
340
- X_numeric = X_train.iloc[:, :3]
352
+ X_numeric = X_train.iloc[:, :num_columns]
341
353
  X_scaled_numeric = pd.DataFrame(
342
354
  scaler.fit_transform(X_numeric),
343
355
  columns=X_numeric.columns,
@@ -409,9 +421,16 @@ class Geocif:
409
421
  elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
410
422
  from sklearn.preprocessing import StandardScaler, LabelEncoder
411
423
 
424
+ if self.model_name == "cumulative_1":
425
+ num_columns = 1
426
+ elif self.model_name == "cumulative_2":
427
+ num_columns = 2
428
+ elif self.model_name == "cumulative_3":
429
+ num_columns = 3
430
+
412
431
  # Standardize the numeric features
413
432
  scaler = StandardScaler()
414
- X_numeric = X_test.iloc[:, :3]
433
+ X_numeric = X_test.iloc[:, :num_columns]
415
434
  try:
416
435
  X_scaled_numeric = pd.DataFrame(
417
436
  scaler.fit_transform(X_numeric),
@@ -455,7 +474,9 @@ class Geocif:
455
474
  self.selected_features + self.cat_features + [self.target]
456
475
  ]
457
476
  w_train = data_train.y - self.estimate(data_train.x)
458
-
477
+ elif self.model_name == "ydf":
478
+ y_pred = self.model.evaluate(X_test)
479
+ best_hyperparameters = {}
459
480
  else:
460
481
  y_pred = self.model.predict(X_test)
461
482
  best_hyperparameters = self.model.get_params().copy()
@@ -468,8 +489,8 @@ class Geocif:
468
489
 
469
490
  obj_trend = trend.DetrendedData(
470
491
  df_tmp[f"Detrended {self.target}"],
471
- df_tmp["Detrend Model"],
472
- df_tmp["Detrend Model Type"],
492
+ df_tmp["Detrended Model"],
493
+ df_tmp["Detrended Model Type"],
473
494
  )
474
495
 
475
496
  # Retrend the predicted yield
@@ -477,6 +498,8 @@ class Geocif:
477
498
  obj_trend, df_region.iloc[idx][["Harvest Year"]]
478
499
  )[0]
479
500
 
501
+ df_region.loc[idx, "Detrended Model Type"] = obj_trend.model_type.unique()[0]
502
+
480
503
  # Create a dataframe with forecast results
481
504
  shp = len(X_test)
482
505
  experiment_id = f"{self.country}_{self.crop}"
@@ -530,7 +553,6 @@ class Geocif:
530
553
 
531
554
  if self.check_yield_trend:
532
555
  df.loc[:, "Detrended Model Type"] = df_region["Detrended Model Type"].values
533
- df.loc[:, "Detrended Model"] = df_region["Detrended Model"].values
534
556
 
535
557
  if self.last_year_yield_as_feature:
536
558
  # Add last year yield to dataframe
@@ -729,7 +751,7 @@ class Geocif:
729
751
  + ["Region_ID"]
730
752
  )
731
753
  if self.check_yield_trend:
732
- common_columns += ["Detrended Model Type", "Detrended Model"]
754
+ common_columns += [f"Detrended {self.target}", "Detrended Model Type", "Detrended Model"]
733
755
 
734
756
  if self.last_year_yield_as_feature:
735
757
  common_columns += [f"Last Year {self.target}"]
@@ -738,11 +760,15 @@ class Geocif:
738
760
  # Filter dataframe based on region and self.feature_names
739
761
  df_region_train = self.df_train[mask_train]
740
762
  df_region_train = df_region_train[self.fixed_columns + common_columns]
763
+ df_region_train.reset_index(drop=True, inplace=True)
741
764
  self.train(df_region_train, scaler)
742
765
 
743
766
  """ Predict """
767
+ if self.check_yield_trend:
768
+ common_columns = common_columns[:-3]
744
769
  df_region_test = self.df_test[mask_test]
745
770
  df_region_test = df_region_test[self.fixed_columns + common_columns]
771
+ df_region_test.reset_index(drop=True, inplace=True)
746
772
  experiment_id, df = self.predict(df_region_test, scaler)
747
773
  # df.reset_index(inplace=True)
748
774
 
@@ -897,12 +923,12 @@ class Geocif:
897
923
 
898
924
  if self.lag_yield_as_feature:
899
925
  df = fe.compute_lag_yield(
900
- df, self.all_seasons_with_yield, self.number_lag_years
926
+ df, self.all_seasons_with_yield, self.number_lag_years, self.target
901
927
  )
902
928
 
903
929
  if self.analogous_year_yield_as_feature:
904
930
  df = fe.compute_analogous_yield(
905
- df, self.all_seasons_with_yield, self.number_median_years
931
+ df, self.all_seasons_with_yield, self.number_median_years, self.target
906
932
  )
907
933
 
908
934
  # Create Region_ID column based on Region column category code
@@ -912,7 +938,7 @@ class Geocif:
912
938
  elif self.cluster_strategy == "individual":
913
939
  df["Region_ID"] = df["Region"].cat.codes
914
940
  elif self.cluster_strategy == "auto_detect":
915
- clusters_assigned = fe.detect_clusters(df)
941
+ clusters_assigned = fe.detect_clusters(df, self.target)
916
942
  # Merge the cluster labels with the original DataFrame
917
943
  df = df.merge(clusters_assigned, on="Region")
918
944
 
@@ -1036,8 +1062,8 @@ class Geocif:
1036
1062
 
1037
1063
  """ Groupby Region column and compute detrended yield """
1038
1064
  self.df_train[f"Detrended {self.target}"] = np.NaN
1039
- self.df_train["Detrend Model"] = np.NaN
1040
- self.df_train["Detrend Model Type"] = np.NaN
1065
+ self.df_train["Detrended Model"] = np.NaN
1066
+ self.df_train["Detrended Model Type"] = np.NaN
1041
1067
  if self.check_yield_trend:
1042
1068
  group_by = ["Region"]
1043
1069
  groups = self.df_train.groupby(group_by)
@@ -1050,10 +1076,10 @@ class Geocif:
1050
1076
  group.index, f"Detrended {self.target}"
1051
1077
  ] = detrended_data.detrended_series
1052
1078
  self.df_train.loc[
1053
- group.index, "Detrend Model"
1079
+ group.index, "Detrended Model"
1054
1080
  ] = detrended_data.trend_model
1055
1081
  self.df_train.loc[
1056
- group.index, "Detrend Model Type"
1082
+ group.index, "Detrended Model Type"
1057
1083
  ] = detrended_data.model_type
1058
1084
 
1059
1085
  # 6. Exclude years without yields from df_train
@@ -1122,6 +1148,15 @@ class Geocif:
1122
1148
  self.cluster_strategy = "single"
1123
1149
  self.select_cei_by = "Index"
1124
1150
  self.use_cumulative_features = True
1151
+ elif self.model_name in ["oblique", "ydf"]:
1152
+ self.do_xai = False
1153
+ self.estimate_ci = False
1154
+ # Remove Region from cat_features as it is object type
1155
+ self.cat_features = [col for col in self.cat_features if col != "Region"]
1156
+ # if self.model_name == "ydf":
1157
+ # # HACK, for ydf model, target_col is Yield
1158
+ # self.df_results.rename(columns={self.target: "Yield"}, inplace=True)
1159
+ # self.target = "Yield"
1125
1160
  else:
1126
1161
  self.do_xai = self.parser.getboolean("ML", "do_xai")
1127
1162
  self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
@@ -1188,6 +1223,9 @@ class Geocif:
1188
1223
  self.dg["Country Region"] = (
1189
1224
  self.dg["ADM0_NAME"] + " " + self.dg["ADM1_NAME"]
1190
1225
  )
1226
+ elif self.country == "illinois":
1227
+ self.dg["ADM0_NAME"] = "illinois"
1228
+ self.dg["Country Region"] = self.dg["ADM0_NAME"] + " " + self.dg["NAME"]
1191
1229
  else:
1192
1230
  self.dg["Country Region"] = (
1193
1231
  self.dg["ADM0_NAME"] + " " + self.dg["ADM2_NAME"]
@@ -1240,6 +1278,9 @@ class Geocif:
1240
1278
  # TODO ignore file with _2000 in its name
1241
1279
  all_files = [f for f in all_files if "_2000" not in f.name]
1242
1280
 
1281
+ # Assert that all_files is not empty
1282
+ assert all_files, f"No files found in {_dir_country} with {file_name}"
1283
+
1243
1284
  self.df_results = pd.concat(
1244
1285
  (pd.read_csv(f) for f in all_files), ignore_index=True
1245
1286
  )
@@ -1293,6 +1334,9 @@ def loop_execute(inputs):
1293
1334
  obj = Geocif(logger=logger, parser=parser)
1294
1335
  obj.read_data(country, crop, season)
1295
1336
 
1337
+ # Store config file in database
1338
+ output.config_to_db(obj.db_path, obj.parser, obj.today)
1339
+
1296
1340
  # Setup metadata and run ML code
1297
1341
  obj.setup(season, model)
1298
1342
  if obj.simulation_stages:
@@ -0,0 +1,28 @@
1
+ import ydf
2
+ import pandas as pd
3
+
4
+ # Load dataset with Pandas
5
+ ds_path = "https://raw.githubusercontent.com/google/yggdrasil-decision-forests/main/yggdrasil_decision_forests/test_data/dataset/"
6
+ train_ds = pd.read_csv(ds_path + "adult_train.csv")
7
+ test_ds = pd.read_csv(ds_path + "adult_test.csv")
8
+
9
+ # Train a Gradient Boosted Trees model
10
+ model = ydf.GradientBoostedTreesLearner(label="income").train(train_ds)
11
+
12
+ # Look at a model (input features, training logs, structure, etc.)
13
+ model.describe()
14
+
15
+ # Evaluate a model (e.g. roc, accuracy, confusion matrix, confidence intervals)
16
+ model.evaluate(test_ds)
17
+
18
+ # Generate predictions
19
+ model.predict(test_ds)
20
+
21
+ # Analyse a model (e.g. partial dependence plot, variable importance)
22
+ model.analyze(test_ds)
23
+
24
+ # Benchmark the inference speed of a model
25
+ model.benchmark(test_ds)
26
+
27
+ # Save the model
28
+ model.save("/tmp/my_model")
@@ -131,6 +131,16 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
131
131
 
132
132
  X_filtered = selector.fit_transform(X, y)
133
133
  selected_features = X_filtered.columns.tolist()
134
+ elif method == "mrmr":
135
+ from mrmr import mrmr_regression
136
+
137
+ try:
138
+ selected_features = mrmr_regression(X=X, y=y, K=10)
139
+ except:
140
+ breakpoint()
141
+ # combine X and y into a dataframe
142
+ # df = pd.concat([X, y], axis=1)
143
+
134
144
  elif method == "RFECV":
135
145
  from sklearn.feature_selection import RFECV
136
146
  from sklearn.model_selection import KFold
@@ -109,21 +109,21 @@ def store(db_path, experiment_id, df, model, model_name):
109
109
  except Exception as e:
110
110
  print(f"Error: {e}")
111
111
 
112
- index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
113
- # Output model pickle as a blob to database
114
- df_model = pd.DataFrame(
115
- {
116
- "Experiment_ID": [experiment_id],
117
- "Model": [model_name],
118
- "Model_Blob": [pickle.dumps(model)],
119
- }
120
- )
121
- # df_model.index = df_model.apply(
122
- # lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
123
- # )
124
-
125
112
  # name the index level
126
113
  try:
114
+ index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
115
+ # Output model pickle as a blob to database
116
+ df_model = pd.DataFrame(
117
+ {
118
+ "Experiment_ID": [experiment_id],
119
+ "Model": [model_name],
120
+ "Model_Blob": [pickle.dumps(model)],
121
+ }
122
+ )
123
+ # df_model.index = df_model.apply(
124
+ # lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
125
+ # )
126
+
127
127
  df_model.index.set_names(["Index"], inplace=True)
128
128
  utils.to_db(db_path, "models", df_model)
129
129
  except Exception as e:
@@ -191,7 +191,12 @@ def add_statistics(
191
191
 
192
192
  """
193
193
  # First check if country and crop are in the admin_crop_production.csv file
194
- fn = "afghanistan.csv" if country == "Afghanistan" else "adm_crop_production.csv"
194
+ if country == "Afghanistan":
195
+ fn = "afghanistan.csv"
196
+ elif country == "Illinois":
197
+ fn = "illinois.csv"
198
+ else:
199
+ fn = "adm_crop_production.csv"
195
200
  df_fewsnet = pd.read_csv(dir_stats / fn, low_memory=False)
196
201
 
197
202
  # HACK
@@ -206,6 +211,7 @@ def add_statistics(
206
211
  df = add_GEOGLAM_statistics(dir_stats, df, stats, method, admin_zone)
207
212
  else:
208
213
  group_by = ["Region", "Harvest Year"]
214
+
209
215
  groups = df.groupby(group_by)
210
216
 
211
217
  # Define processing for each group
@@ -2,10 +2,7 @@ import multiprocessing as mp
2
2
 
3
3
  import numpy as np
4
4
  import optuna
5
- import pandas as pd
6
5
  from catboost import CatBoostRegressor
7
- from sklearn.metrics import root_mean_squared_error
8
- from sklearn.model_selection import train_test_split
9
6
  from tqdm import tqdm
10
7
 
11
8
 
@@ -30,6 +27,8 @@ def loocv(
30
27
  :param cat_features: list, list of categorical feature names
31
28
  :return: float, average RMSE
32
29
  """
30
+ from sklearn.metrics import root_mean_squared_error
31
+
33
32
  rmse_values = []
34
33
 
35
34
  X = df[feature_names + cat_features]
@@ -81,6 +80,9 @@ def optuna_objective(model, df, feature_names, target_col, cat_features=[]):
81
80
  Returns:
82
81
 
83
82
  """
83
+ from sklearn.metrics import root_mean_squared_error
84
+ from sklearn.model_selection import train_test_split
85
+
84
86
  X = df[feature_names + cat_features]
85
87
  y = df[target_col]
86
88
 
@@ -262,8 +264,7 @@ def auto_train(
262
264
  if model_name in ["catboost", "merf"]:
263
265
  hyperparams = {
264
266
  "depth": 6,
265
- "learning_rate": 0.01,
266
- "iterations": 5000,
267
+
267
268
  "subsample": 1.0,
268
269
  "random_strength": 0.5,
269
270
  "reg_lambda": 0.001,
@@ -280,6 +281,34 @@ def auto_train(
280
281
  hyperparams["iterations"] = 1000
281
282
  regr = CatBoostRegressor(**hyperparams, cat_features=cat_features)
282
283
  model = MERF(regr, max_iterations=10)
284
+ elif model_name == "oblique":
285
+ from treeple import ExtraObliqueRandomForestRegressor
286
+
287
+ # https://docs.neurodata.io/treeple/dev/modules/supervised_tree.html#oblique-trees
288
+ n_features = X_train.shape[1]
289
+
290
+ model = ExtraObliqueRandomForestRegressor(
291
+ n_estimators=1500,
292
+ max_depth=20,
293
+ max_features=n_features**2,
294
+ feature_combinations=n_features,
295
+ n_jobs=-1,
296
+ random_state=42,
297
+ )
298
+ elif model_name == "ydf":
299
+ import ydf
300
+ templates = ydf.GradientBoostedTreesLearner.hyperparameter_templates()
301
+
302
+ model = ydf.GradientBoostedTreesLearner(
303
+ label=target_col,
304
+ task=ydf.Task.REGRESSION,
305
+ growing_strategy='BEST_FIRST_GLOBAL',
306
+ categorical_algorithm='RANDOM',
307
+ split_axis='SPARSE_OBLIQUE',
308
+ sparse_oblique_normalization='MIN_MAX',
309
+ sparse_oblique_num_projections_exponent=2.0)
310
+
311
+ hyperparams = templates["benchmark_rank1v1"]
283
312
  elif model_name == "linear":
284
313
  from sklearn.linear_model import LassoCV
285
314
 
@@ -293,24 +322,15 @@ def auto_train(
293
322
  elif model_name == "cumulative_1":
294
323
  from pygam import GAM, s, f, te
295
324
 
296
- # compute index of column Region
297
- region_idx = X_train.columns.get_loc("Region")
298
-
299
- model = GAM(s(0) + f(region_idx))
325
+ model = GAM(s(0) + f(1))
300
326
  elif model_name == "cumulative_2":
301
327
  from pygam import GAM, s, f, te
302
328
 
303
- # compute index of column Region
304
- region_idx = X_train.columns.get_loc("Region")
305
-
306
- model = GAM(s(0) + s(1) + te(0, 1) + f(region_idx))
329
+ model = GAM(s(0) + s(1) + te(0, 1) + f(2))
307
330
  elif model_name == "cumulative_3":
308
331
  from pygam import GAM, s, f, te
309
332
 
310
- # compute index of column Region
311
- region_idx = X_train.columns.get_loc("Region")
312
-
313
- model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(region_idx))
333
+ model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(3))
314
334
  elif model_name == "geospaNN":
315
335
  import torch
316
336
  import geospaNN
@@ -1,4 +1,5 @@
1
1
  import numpy as np
2
+ import pandas as pd
2
3
  from statsmodels.regression.linear_model import OLS
3
4
  from statsmodels.tools.tools import add_constant
4
5
 
@@ -6,7 +7,7 @@ from statsmodels.tools.tools import add_constant
6
7
  class DetrendedData:
7
8
  """
8
9
  A class to store the detrended series, the model used for detrending,
9
- and the type of model ('mean', 'linear', 'quadratic').
10
+ and the type of model ('mean', 'linear', 'quadratic', 'difference').
10
11
  """
11
12
 
12
13
  def __init__(self, detrended_series, trend_model, model_type):
@@ -15,14 +16,16 @@ class DetrendedData:
15
16
  self.model_type = model_type
16
17
 
17
18
 
18
- def detrend_dataframe(df, column_name="y"):
19
+ def detrend_dataframe(df, column_name="y", model_type="best"):
19
20
  """
20
- Removes the trend from the specified column of a DataFrame using the method
21
- (mean, linear, quadratic) that results in the lowest AIC value.
21
+ Removes the trend from the specified column of a DataFrame using the specified method
22
+ (mean, linear, quadratic, difference) or the method that results in the lowest AIC value.
22
23
 
23
24
  Parameters:
24
25
  - df: pandas DataFrame containing the time series data.
25
26
  - column_name: string name of the column to detrend.
27
+ - model_type: string specifying which model to use for detrending ('mean', 'linear',
28
+ 'quadratic', 'difference', or 'best' for automatic selection based on AIC).
26
29
 
27
30
  Returns:
28
31
  - DetrendedData object containing the detrended series, the statistical model,
@@ -41,16 +44,32 @@ def detrend_dataframe(df, column_name="y"):
41
44
  X_quad = add_constant(np.column_stack((df["t"], df["t"] ** 2)))
42
45
  quad_model = OLS(df[column_name], X_quad).fit()
43
46
 
44
- models = {"mean": mean_model, "linear": linear_model, "quadratic": quad_model}
45
- best_model_type = min(models, key=lambda x: models[x].aic)
47
+ # Differencing method
48
+ diff_series = df[column_name].diff().dropna()
49
+ diff_model = OLS(diff_series, np.ones(len(diff_series))).fit()
50
+
51
+ models = {
52
+ "mean": mean_model,
53
+ "linear": linear_model,
54
+ "quadratic": quad_model,
55
+ "difference": diff_model
56
+ }
57
+
58
+ if model_type == "best":
59
+ best_model_type = min(models, key=lambda x: models[x].aic)
60
+ else:
61
+ best_model_type = model_type
62
+
46
63
  best_model = models[best_model_type]
47
64
 
48
65
  if best_model_type == "mean":
49
66
  detrended = df[column_name] - mean_model.predict(np.ones(len(df)))
50
67
  elif best_model_type == "linear":
51
68
  detrended = df[column_name] - linear_model.predict(X_linear)
52
- else: # quadratic
69
+ elif best_model_type == "quadratic":
53
70
  detrended = df[column_name] - quad_model.predict(X_quad)
71
+ else: # difference
72
+ detrended = df[column_name].diff().dropna()
54
73
 
55
74
  return DetrendedData(detrended, best_model, best_model_type)
56
75
 
@@ -67,11 +86,10 @@ def compute_trend(detrended_data, future_time_points=None):
67
86
  Returns:
68
87
  - The retrended series as a pandas Series.
69
88
  """
70
- # if future_time_points is not of type pandas dataframe then convert it to one
71
89
  future_time_points = np.array(future_time_points)
72
90
 
73
- model_type = detrended_data.model_type[0]
74
- model = detrended_data.trend_model[0]
91
+ model_type = detrended_data.model_type.unique()[0]
92
+ model = detrended_data.trend_model.unique()[0]
75
93
 
76
94
  if model_type == "mean":
77
95
  trend_component = model.predict(
@@ -80,11 +98,14 @@ def compute_trend(detrended_data, future_time_points=None):
80
98
  elif model_type == "linear":
81
99
  X_linear = add_constant(future_time_points, has_constant="add")
82
100
  trend_component = model.predict(X_linear)
83
- else: # quadratic
101
+ elif model_type == "quadratic":
84
102
  X_quad = add_constant(
85
103
  np.column_stack((future_time_points, future_time_points**2)),
86
104
  has_constant="add",
87
105
  )
88
106
  trend_component = model.predict(X_quad)
107
+ else: # difference
108
+ trend_component = pd.Series(np.nan, index=future_time_points)
109
+ trend_component.iloc[0] = model.params[0] # Add mean of differenced series
89
110
 
90
111
  return trend_component
@@ -0,0 +1,55 @@
1
+ import pandas as pd
2
+ import hvplot.pandas
3
+ import panel as pn
4
+
5
+ # Load the CSV file
6
+ file_path = r'D:\Users\ritvik\projects\GEOGLAM\Output\fao\regional_cei_slope.csv'
7
+ data = pd.read_csv(file_path)
8
+
9
+ # Extract unique values for dropdowns
10
+ countries = data['Country'].unique().tolist()
11
+
12
+ # Create dropdown widgets
13
+ country_dropdown = pn.widgets.Select(name='Country', options=countries)
14
+ region_dropdown = pn.widgets.Select(name='Region', options=[])
15
+ crop_dropdown = pn.widgets.Select(name='Crop', options=[])
16
+ season_dropdown = pn.widgets.Select(name='Season', options=data['Season'].unique().tolist())
17
+
18
+
19
+ # Function to update region and crop options based on selected country
20
+ @pn.depends(country_dropdown.param.value, watch=True)
21
+ def update_region_and_crop_options(country):
22
+ filtered_data = data[data['Country'] == country]
23
+ regions = filtered_data['Region'].unique().tolist()
24
+ crops = filtered_data['Crop'].unique().tolist()
25
+
26
+ region_dropdown.options = regions
27
+ crop_dropdown.options = crops
28
+
29
+
30
+ # Function to filter data based on dropdown selections
31
+ @pn.depends(country_dropdown.param.value, region_dropdown.param.value, crop_dropdown.param.value,
32
+ season_dropdown.param.value)
33
+ def update_plot(country, region, crop, season):
34
+ filtered_data = data[(data['Country'] == country) &
35
+ (data['Region'] == region) &
36
+ (data['Crop'] == crop) &
37
+ (data['Season'] == season)]
38
+
39
+ if not filtered_data.empty:
40
+ plot = filtered_data.hvplot.scatter(x='Slope', y='Intercept',
41
+ hover_cols=['Growth Stage', 'p-value', 'Index', 'Description'])
42
+ return plot
43
+ else:
44
+ return pn.pane.Markdown("No data available for the selected combination.")
45
+
46
+
47
+ # Create the dashboard
48
+ dashboard = pn.Column(
49
+ pn.Row(country_dropdown, region_dropdown, crop_dropdown, season_dropdown),
50
+ update_plot
51
+ )
52
+
53
+ # Save as html page
54
+ dashboard.save('dashboard.html', embed=True)
55
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.1.47
3
+ Version: 0.1.49
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -32,6 +32,7 @@ geocif/cei/__init__.py
32
32
  geocif/cei/definitions.py
33
33
  geocif/cei/indices.py
34
34
  geocif/ml/__init__.py
35
+ geocif/ml/aa.py
35
36
  geocif/ml/correlations.py
36
37
  geocif/ml/embedding.py
37
38
  geocif/ml/feature_engineering.py
@@ -50,5 +51,6 @@ geocif/playground/__init__.py
50
51
  geocif/playground/automl.py
51
52
  geocif/playground/misc.py
52
53
  geocif/viz/__init__.py
54
+ geocif/viz/misc.py
53
55
  geocif/viz/plot.py
54
56
  tests/test_geocif.py
@@ -50,6 +50,6 @@ setup(
50
50
  test_suite="tests",
51
51
  tests_require=test_requirements,
52
52
  url="https://ritviksahajpal.github.io/yield_forecasting/",
53
- version="0.1.47",
53
+ version="0.1.49",
54
54
  zip_safe=False,
55
55
  )
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes