geocif 0.1.48__tar.gz → 0.1.49__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (57)
  1. {geocif-0.1.48/geocif.egg-info → geocif-0.1.49}/PKG-INFO +1 -1
  2. {geocif-0.1.48 → geocif-0.1.49}/geocif/geocif.py +63 -19
  3. geocif-0.1.49/geocif/ml/aa.py +28 -0
  4. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/feature_selection.py +10 -0
  5. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/output.py +13 -13
  6. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/stats.py +7 -1
  7. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/trainers.py +27 -22
  8. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/trend.py +32 -11
  9. geocif-0.1.49/geocif/viz/misc.py +55 -0
  10. {geocif-0.1.48 → geocif-0.1.49/geocif.egg-info}/PKG-INFO +1 -1
  11. {geocif-0.1.48 → geocif-0.1.49}/geocif.egg-info/SOURCES.txt +2 -0
  12. {geocif-0.1.48 → geocif-0.1.49}/setup.py +1 -1
  13. {geocif-0.1.48 → geocif-0.1.49}/LICENSE +0 -0
  14. {geocif-0.1.48 → geocif-0.1.49}/MANIFEST.in +0 -0
  15. {geocif-0.1.48 → geocif-0.1.49}/README.md +0 -0
  16. {geocif-0.1.48 → geocif-0.1.49}/geocif/__init__.py +0 -0
  17. {geocif-0.1.48 → geocif-0.1.49}/geocif/agmet/__init__.py +0 -0
  18. {geocif-0.1.48 → geocif-0.1.49}/geocif/agmet/geoagmet.py +0 -0
  19. {geocif-0.1.48 → geocif-0.1.49}/geocif/agmet/plot.py +0 -0
  20. {geocif-0.1.48 → geocif-0.1.49}/geocif/agmet/utils.py +0 -0
  21. {geocif-0.1.48 → geocif-0.1.49}/geocif/analysis.py +0 -0
  22. {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/__init__.py +0 -0
  23. {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/constants.py +0 -0
  24. {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/features.py +0 -0
  25. {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/geo.py +0 -0
  26. {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/geocif.py +0 -0
  27. {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/metadata.py +0 -0
  28. {geocif-0.1.48 → geocif-0.1.49}/geocif/backup/models.py +0 -0
  29. {geocif-0.1.48 → geocif-0.1.49}/geocif/cei/__init__.py +0 -0
  30. {geocif-0.1.48 → geocif-0.1.49}/geocif/cei/definitions.py +0 -0
  31. {geocif-0.1.48 → geocif-0.1.49}/geocif/cei/indices.py +0 -0
  32. {geocif-0.1.48 → geocif-0.1.49}/geocif/experiments.py +0 -0
  33. {geocif-0.1.48 → geocif-0.1.49}/geocif/indices_runner.py +0 -0
  34. {geocif-0.1.48 → geocif-0.1.49}/geocif/indices_runner_v2.py +0 -0
  35. {geocif-0.1.48 → geocif-0.1.49}/geocif/logger.py +0 -0
  36. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/__init__.py +0 -0
  37. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/correlations.py +0 -0
  38. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/embedding.py +0 -0
  39. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/feature_engineering.py +0 -0
  40. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/misc.py +0 -0
  41. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/outliers.py +0 -0
  42. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/outlook.py +0 -0
  43. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/spatial_autocorrelation.py +0 -0
  44. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/stages.py +0 -0
  45. {geocif-0.1.48 → geocif-0.1.49}/geocif/ml/xai.py +0 -0
  46. {geocif-0.1.48 → geocif-0.1.49}/geocif/playground/__init__.py +0 -0
  47. {geocif-0.1.48 → geocif-0.1.49}/geocif/playground/automl.py +0 -0
  48. {geocif-0.1.48 → geocif-0.1.49}/geocif/playground/misc.py +0 -0
  49. {geocif-0.1.48 → geocif-0.1.49}/geocif/utils.py +0 -0
  50. {geocif-0.1.48 → geocif-0.1.49}/geocif/viz/__init__.py +0 -0
  51. {geocif-0.1.48 → geocif-0.1.49}/geocif/viz/plot.py +0 -0
  52. {geocif-0.1.48 → geocif-0.1.49}/geocif.egg-info/dependency_links.txt +0 -0
  53. {geocif-0.1.48 → geocif-0.1.49}/geocif.egg-info/not-zip-safe +0 -0
  54. {geocif-0.1.48 → geocif-0.1.49}/geocif.egg-info/top_level.txt +0 -0
  55. {geocif-0.1.48 → geocif-0.1.49}/requirements.txt +0 -0
  56. {geocif-0.1.48 → geocif-0.1.49}/setup.cfg +0 -0
  57. {geocif-0.1.48 → geocif-0.1.49}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.1.48
3
+ Version: 0.1.49
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -198,9 +198,6 @@ class Geocif:
198
198
 
199
199
  self.db_path = self.dir_db / self.db_forecasts
200
200
 
201
- # Store config file in database
202
- output.config_to_db(self.db_path, self.parser, self.today)
203
-
204
201
  # self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
205
202
  # obj_pickle = outlook.Outlook(self.pickle_file)
206
203
  # self.df_outlook = obj_pickle.read_outlook_file()
@@ -221,6 +218,9 @@ class Geocif:
221
218
  f"Detrended {self.target}" if self.check_yield_trend else self.target
222
219
  )
223
220
 
221
+ # Drop rows where target_col is NaN
222
+ df_region = df_region.dropna(subset=[target_col])
223
+
224
224
  X_train = df_region[self.feature_names]
225
225
  # Drop any columns with NaNs
226
226
  X_train = X_train.dropna(axis=1, how="any")
@@ -280,7 +280,7 @@ class Geocif:
280
280
  X_train_scaled,
281
281
  y_train,
282
282
  feature_names=self.selected_features,
283
- target_col=self.target,
283
+ target_col=target_col,
284
284
  optimize=self.optimize,
285
285
  fraction_loocv=self.fraction_loocv,
286
286
  cat_features=self.cat_features,
@@ -302,8 +302,13 @@ class Geocif:
302
302
  verbose=False,
303
303
  # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
304
304
  )
305
- elif self.model_name == "oblique":
305
+ elif self.model_name in ["oblique"]:
306
306
  self.model.fit(X_train, y_train)
307
+ elif self.model_name == "ydf":
308
+ # Combine X_train and y_train
309
+ df_train = pd.concat([X_train, y_train], axis=1)
310
+
311
+ self.model = self.model.train(df_train)
307
312
  elif self.model_name == "geospaNN":
308
313
  self.model.fit(
309
314
  X_train,
@@ -335,9 +340,16 @@ class Geocif:
335
340
  ]:
336
341
  from sklearn.preprocessing import StandardScaler, LabelEncoder
337
342
 
343
+ if self.model_name == "cumulative_1":
344
+ num_columns = 1
345
+ elif self.model_name == "cumulative_2":
346
+ num_columns = 2
347
+ elif self.model_name == "cumulative_3":
348
+ num_columns = 3
349
+
338
350
  # Standardize the numeric features
339
351
  scaler = StandardScaler()
340
- X_numeric = X_train.iloc[:, :3]
352
+ X_numeric = X_train.iloc[:, :num_columns]
341
353
  X_scaled_numeric = pd.DataFrame(
342
354
  scaler.fit_transform(X_numeric),
343
355
  columns=X_numeric.columns,
@@ -409,9 +421,16 @@ class Geocif:
409
421
  elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
410
422
  from sklearn.preprocessing import StandardScaler, LabelEncoder
411
423
 
424
+ if self.model_name == "cumulative_1":
425
+ num_columns = 1
426
+ elif self.model_name == "cumulative_2":
427
+ num_columns = 2
428
+ elif self.model_name == "cumulative_3":
429
+ num_columns = 3
430
+
412
431
  # Standardize the numeric features
413
432
  scaler = StandardScaler()
414
- X_numeric = X_test.iloc[:, :3]
433
+ X_numeric = X_test.iloc[:, :num_columns]
415
434
  try:
416
435
  X_scaled_numeric = pd.DataFrame(
417
436
  scaler.fit_transform(X_numeric),
@@ -455,7 +474,9 @@ class Geocif:
455
474
  self.selected_features + self.cat_features + [self.target]
456
475
  ]
457
476
  w_train = data_train.y - self.estimate(data_train.x)
458
-
477
+ elif self.model_name == "ydf":
478
+ y_pred = self.model.evaluate(X_test)
479
+ best_hyperparameters = {}
459
480
  else:
460
481
  y_pred = self.model.predict(X_test)
461
482
  best_hyperparameters = self.model.get_params().copy()
@@ -468,8 +489,8 @@ class Geocif:
468
489
 
469
490
  obj_trend = trend.DetrendedData(
470
491
  df_tmp[f"Detrended {self.target}"],
471
- df_tmp["Detrend Model"],
472
- df_tmp["Detrend Model Type"],
492
+ df_tmp["Detrended Model"],
493
+ df_tmp["Detrended Model Type"],
473
494
  )
474
495
 
475
496
  # Retrend the predicted yield
@@ -477,6 +498,8 @@ class Geocif:
477
498
  obj_trend, df_region.iloc[idx][["Harvest Year"]]
478
499
  )[0]
479
500
 
501
+ df_region.loc[idx, "Detrended Model Type"] = obj_trend.model_type.unique()[0]
502
+
480
503
  # Create a dataframe with forecast results
481
504
  shp = len(X_test)
482
505
  experiment_id = f"{self.country}_{self.crop}"
@@ -530,7 +553,6 @@ class Geocif:
530
553
 
531
554
  if self.check_yield_trend:
532
555
  df.loc[:, "Detrended Model Type"] = df_region["Detrended Model Type"].values
533
- df.loc[:, "Detrended Model"] = df_region["Detrended Model"].values
534
556
 
535
557
  if self.last_year_yield_as_feature:
536
558
  # Add last year yield to dataframe
@@ -729,7 +751,7 @@ class Geocif:
729
751
  + ["Region_ID"]
730
752
  )
731
753
  if self.check_yield_trend:
732
- common_columns += ["Detrended Model Type", "Detrended Model"]
754
+ common_columns += [f"Detrended {self.target}", "Detrended Model Type", "Detrended Model"]
733
755
 
734
756
  if self.last_year_yield_as_feature:
735
757
  common_columns += [f"Last Year {self.target}"]
@@ -738,11 +760,15 @@ class Geocif:
738
760
  # Filter dataframe based on region and self.feature_names
739
761
  df_region_train = self.df_train[mask_train]
740
762
  df_region_train = df_region_train[self.fixed_columns + common_columns]
763
+ df_region_train.reset_index(drop=True, inplace=True)
741
764
  self.train(df_region_train, scaler)
742
765
 
743
766
  """ Predict """
767
+ if self.check_yield_trend:
768
+ common_columns = common_columns[:-3]
744
769
  df_region_test = self.df_test[mask_test]
745
770
  df_region_test = df_region_test[self.fixed_columns + common_columns]
771
+ df_region_test.reset_index(drop=True, inplace=True)
746
772
  experiment_id, df = self.predict(df_region_test, scaler)
747
773
  # df.reset_index(inplace=True)
748
774
 
@@ -897,12 +923,12 @@ class Geocif:
897
923
 
898
924
  if self.lag_yield_as_feature:
899
925
  df = fe.compute_lag_yield(
900
- df, self.all_seasons_with_yield, self.number_lag_years
926
+ df, self.all_seasons_with_yield, self.number_lag_years, self.target
901
927
  )
902
928
 
903
929
  if self.analogous_year_yield_as_feature:
904
930
  df = fe.compute_analogous_yield(
905
- df, self.all_seasons_with_yield, self.number_median_years
931
+ df, self.all_seasons_with_yield, self.number_median_years, self.target
906
932
  )
907
933
 
908
934
  # Create Region_ID column based on Region column category code
@@ -912,7 +938,7 @@ class Geocif:
912
938
  elif self.cluster_strategy == "individual":
913
939
  df["Region_ID"] = df["Region"].cat.codes
914
940
  elif self.cluster_strategy == "auto_detect":
915
- clusters_assigned = fe.detect_clusters(df)
941
+ clusters_assigned = fe.detect_clusters(df, self.target)
916
942
  # Merge the cluster labels with the original DataFrame
917
943
  df = df.merge(clusters_assigned, on="Region")
918
944
 
@@ -1036,8 +1062,8 @@ class Geocif:
1036
1062
 
1037
1063
  """ Groupby Region column and compute detrended yield """
1038
1064
  self.df_train[f"Detrended {self.target}"] = np.NaN
1039
- self.df_train["Detrend Model"] = np.NaN
1040
- self.df_train["Detrend Model Type"] = np.NaN
1065
+ self.df_train["Detrended Model"] = np.NaN
1066
+ self.df_train["Detrended Model Type"] = np.NaN
1041
1067
  if self.check_yield_trend:
1042
1068
  group_by = ["Region"]
1043
1069
  groups = self.df_train.groupby(group_by)
@@ -1050,10 +1076,10 @@ class Geocif:
1050
1076
  group.index, f"Detrended {self.target}"
1051
1077
  ] = detrended_data.detrended_series
1052
1078
  self.df_train.loc[
1053
- group.index, "Detrend Model"
1079
+ group.index, "Detrended Model"
1054
1080
  ] = detrended_data.trend_model
1055
1081
  self.df_train.loc[
1056
- group.index, "Detrend Model Type"
1082
+ group.index, "Detrended Model Type"
1057
1083
  ] = detrended_data.model_type
1058
1084
 
1059
1085
  # 6. Exclude years without yields from df_train
@@ -1122,6 +1148,15 @@ class Geocif:
1122
1148
  self.cluster_strategy = "single"
1123
1149
  self.select_cei_by = "Index"
1124
1150
  self.use_cumulative_features = True
1151
+ elif self.model_name in ["oblique", "ydf"]:
1152
+ self.do_xai = False
1153
+ self.estimate_ci = False
1154
+ # Remove Region from cat_features as it is object type
1155
+ self.cat_features = [col for col in self.cat_features if col != "Region"]
1156
+ # if self.model_name == "ydf":
1157
+ # # HACK, for ydf model, target_col is Yield
1158
+ # self.df_results.rename(columns={self.target: "Yield"}, inplace=True)
1159
+ # self.target = "Yield"
1125
1160
  else:
1126
1161
  self.do_xai = self.parser.getboolean("ML", "do_xai")
1127
1162
  self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
@@ -1188,6 +1223,9 @@ class Geocif:
1188
1223
  self.dg["Country Region"] = (
1189
1224
  self.dg["ADM0_NAME"] + " " + self.dg["ADM1_NAME"]
1190
1225
  )
1226
+ elif self.country == "illinois":
1227
+ self.dg["ADM0_NAME"] = "illinois"
1228
+ self.dg["Country Region"] = self.dg["ADM0_NAME"] + " " + self.dg["NAME"]
1191
1229
  else:
1192
1230
  self.dg["Country Region"] = (
1193
1231
  self.dg["ADM0_NAME"] + " " + self.dg["ADM2_NAME"]
@@ -1240,6 +1278,9 @@ class Geocif:
1240
1278
  # TODO ignore file with _2000 in its name
1241
1279
  all_files = [f for f in all_files if "_2000" not in f.name]
1242
1280
 
1281
+ # Assert that all_files is not empty
1282
+ assert all_files, f"No files found in {_dir_country} with {file_name}"
1283
+
1243
1284
  self.df_results = pd.concat(
1244
1285
  (pd.read_csv(f) for f in all_files), ignore_index=True
1245
1286
  )
@@ -1293,6 +1334,9 @@ def loop_execute(inputs):
1293
1334
  obj = Geocif(logger=logger, parser=parser)
1294
1335
  obj.read_data(country, crop, season)
1295
1336
 
1337
+ # Store config file in database
1338
+ output.config_to_db(obj.db_path, obj.parser, obj.today)
1339
+
1296
1340
  # Setup metadata and run ML code
1297
1341
  obj.setup(season, model)
1298
1342
  if obj.simulation_stages:
@@ -0,0 +1,28 @@
1
+ import ydf
2
+ import pandas as pd
3
+
4
+ # Load dataset with Pandas
5
+ ds_path = "https://raw.githubusercontent.com/google/yggdrasil-decision-forests/main/yggdrasil_decision_forests/test_data/dataset/"
6
+ train_ds = pd.read_csv(ds_path + "adult_train.csv")
7
+ test_ds = pd.read_csv(ds_path + "adult_test.csv")
8
+
9
+ # Train a Gradient Boosted Trees model
10
+ model = ydf.GradientBoostedTreesLearner(label="income").train(train_ds)
11
+
12
+ # Look at a model (input features, training logs, structure, etc.)
13
+ model.describe()
14
+
15
+ # Evaluate a model (e.g. roc, accuracy, confusion matrix, confidence intervals)
16
+ model.evaluate(test_ds)
17
+
18
+ # Generate predictions
19
+ model.predict(test_ds)
20
+
21
+ # Analyse a model (e.g. partial dependence plot, variable importance)
22
+ model.analyze(test_ds)
23
+
24
+ # Benchmark the inference speed of a model
25
+ model.benchmark(test_ds)
26
+
27
+ # Save the model
28
+ model.save("/tmp/my_model")
@@ -131,6 +131,16 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
131
131
 
132
132
  X_filtered = selector.fit_transform(X, y)
133
133
  selected_features = X_filtered.columns.tolist()
134
+ elif method == "mrmr":
135
+ from mrmr import mrmr_regression
136
+
137
+ try:
138
+ selected_features = mrmr_regression(X=X, y=y, K=10)
139
+ except:
140
+ breakpoint()
141
+ # combine X and y into a dataframe
142
+ # df = pd.concat([X, y], axis=1)
143
+
134
144
  elif method == "RFECV":
135
145
  from sklearn.feature_selection import RFECV
136
146
  from sklearn.model_selection import KFold
@@ -109,21 +109,21 @@ def store(db_path, experiment_id, df, model, model_name):
109
109
  except Exception as e:
110
110
  print(f"Error: {e}")
111
111
 
112
- index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
113
- # Output model pickle as a blob to database
114
- df_model = pd.DataFrame(
115
- {
116
- "Experiment_ID": [experiment_id],
117
- "Model": [model_name],
118
- "Model_Blob": [pickle.dumps(model)],
119
- }
120
- )
121
- # df_model.index = df_model.apply(
122
- # lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
123
- # )
124
-
125
112
  # name the index level
126
113
  try:
114
+ index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
115
+ # Output model pickle as a blob to database
116
+ df_model = pd.DataFrame(
117
+ {
118
+ "Experiment_ID": [experiment_id],
119
+ "Model": [model_name],
120
+ "Model_Blob": [pickle.dumps(model)],
121
+ }
122
+ )
123
+ # df_model.index = df_model.apply(
124
+ # lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
125
+ # )
126
+
127
127
  df_model.index.set_names(["Index"], inplace=True)
128
128
  utils.to_db(db_path, "models", df_model)
129
129
  except Exception as e:
@@ -191,7 +191,12 @@ def add_statistics(
191
191
 
192
192
  """
193
193
  # First check if country and crop are in the admin_crop_production.csv file
194
- fn = "afghanistan.csv" if country == "Afghanistan" else "adm_crop_production.csv"
194
+ if country == "Afghanistan":
195
+ fn = "afghanistan.csv"
196
+ elif country == "Illinois":
197
+ fn = "illinois.csv"
198
+ else:
199
+ fn = "adm_crop_production.csv"
195
200
  df_fewsnet = pd.read_csv(dir_stats / fn, low_memory=False)
196
201
 
197
202
  # HACK
@@ -206,6 +211,7 @@ def add_statistics(
206
211
  df = add_GEOGLAM_statistics(dir_stats, df, stats, method, admin_zone)
207
212
  else:
208
213
  group_by = ["Region", "Harvest Year"]
214
+
209
215
  groups = df.groupby(group_by)
210
216
 
211
217
  # Define processing for each group
@@ -264,8 +264,7 @@ def auto_train(
264
264
  if model_name in ["catboost", "merf"]:
265
265
  hyperparams = {
266
266
  "depth": 6,
267
- "learning_rate": 0.01,
268
- "iterations": 5000,
267
+
269
268
  "subsample": 1.0,
270
269
  "random_strength": 0.5,
271
270
  "reg_lambda": 0.001,
@@ -283,18 +282,33 @@ def auto_train(
283
282
  regr = CatBoostRegressor(**hyperparams, cat_features=cat_features)
284
283
  model = MERF(regr, max_iterations=10)
285
284
  elif model_name == "oblique":
286
- breakpoint()
287
- from sktree.ensemble import ObliqueRandomForestRegressor
285
+ from treeple import ExtraObliqueRandomForestRegressor
286
+
287
+ # https://docs.neurodata.io/treeple/dev/modules/supervised_tree.html#oblique-trees
288
+ n_features = X_train.shape[1]
288
289
 
289
- print("Training ObliqueRandomForestRegressor")
290
- model = ObliqueRandomForestRegressor(
291
- n_estimators=500,
292
- max_depth=7,
290
+ model = ExtraObliqueRandomForestRegressor(
291
+ n_estimators=1500,
292
+ max_depth=20,
293
+ max_features=n_features**2,
294
+ feature_combinations=n_features,
293
295
  n_jobs=-1,
294
- verbose=2,
295
296
  random_state=42,
296
297
  )
297
- print("Finished training ObliqueRandomForestRegressor")
298
+ elif model_name == "ydf":
299
+ import ydf
300
+ templates = ydf.GradientBoostedTreesLearner.hyperparameter_templates()
301
+
302
+ model = ydf.GradientBoostedTreesLearner(
303
+ label=target_col,
304
+ task=ydf.Task.REGRESSION,
305
+ growing_strategy='BEST_FIRST_GLOBAL',
306
+ categorical_algorithm='RANDOM',
307
+ split_axis='SPARSE_OBLIQUE',
308
+ sparse_oblique_normalization='MIN_MAX',
309
+ sparse_oblique_num_projections_exponent=2.0)
310
+
311
+ hyperparams = templates["benchmark_rank1v1"]
298
312
  elif model_name == "linear":
299
313
  from sklearn.linear_model import LassoCV
300
314
 
@@ -308,24 +322,15 @@ def auto_train(
308
322
  elif model_name == "cumulative_1":
309
323
  from pygam import GAM, s, f, te
310
324
 
311
- # compute index of column Region
312
- region_idx = X_train.columns.get_loc("Region")
313
-
314
- model = GAM(s(0) + f(region_idx))
325
+ model = GAM(s(0) + f(1))
315
326
  elif model_name == "cumulative_2":
316
327
  from pygam import GAM, s, f, te
317
328
 
318
- # compute index of column Region
319
- region_idx = X_train.columns.get_loc("Region")
320
-
321
- model = GAM(s(0) + s(1) + te(0, 1) + f(region_idx))
329
+ model = GAM(s(0) + s(1) + te(0, 1) + f(2))
322
330
  elif model_name == "cumulative_3":
323
331
  from pygam import GAM, s, f, te
324
332
 
325
- # compute index of column Region
326
- region_idx = X_train.columns.get_loc("Region")
327
-
328
- model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(region_idx))
333
+ model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(3))
329
334
  elif model_name == "geospaNN":
330
335
  import torch
331
336
  import geospaNN
@@ -1,4 +1,5 @@
1
1
  import numpy as np
2
+ import pandas as pd
2
3
  from statsmodels.regression.linear_model import OLS
3
4
  from statsmodels.tools.tools import add_constant
4
5
 
@@ -6,7 +7,7 @@ from statsmodels.tools.tools import add_constant
6
7
  class DetrendedData:
7
8
  """
8
9
  A class to store the detrended series, the model used for detrending,
9
- and the type of model ('mean', 'linear', 'quadratic').
10
+ and the type of model ('mean', 'linear', 'quadratic', 'difference').
10
11
  """
11
12
 
12
13
  def __init__(self, detrended_series, trend_model, model_type):
@@ -15,14 +16,16 @@ class DetrendedData:
15
16
  self.model_type = model_type
16
17
 
17
18
 
18
- def detrend_dataframe(df, column_name="y"):
19
+ def detrend_dataframe(df, column_name="y", model_type="best"):
19
20
  """
20
- Removes the trend from the specified column of a DataFrame using the method
21
- (mean, linear, quadratic) that results in the lowest AIC value.
21
+ Removes the trend from the specified column of a DataFrame using the specified method
22
+ (mean, linear, quadratic, difference) or the method that results in the lowest AIC value.
22
23
 
23
24
  Parameters:
24
25
  - df: pandas DataFrame containing the time series data.
25
26
  - column_name: string name of the column to detrend.
27
+ - model_type: string specifying which model to use for detrending ('mean', 'linear',
28
+ 'quadratic', 'difference', or 'best' for automatic selection based on AIC).
26
29
 
27
30
  Returns:
28
31
  - DetrendedData object containing the detrended series, the statistical model,
@@ -41,16 +44,32 @@ def detrend_dataframe(df, column_name="y"):
41
44
  X_quad = add_constant(np.column_stack((df["t"], df["t"] ** 2)))
42
45
  quad_model = OLS(df[column_name], X_quad).fit()
43
46
 
44
- models = {"mean": mean_model, "linear": linear_model, "quadratic": quad_model}
45
- best_model_type = min(models, key=lambda x: models[x].aic)
47
+ # Differencing method
48
+ diff_series = df[column_name].diff().dropna()
49
+ diff_model = OLS(diff_series, np.ones(len(diff_series))).fit()
50
+
51
+ models = {
52
+ "mean": mean_model,
53
+ "linear": linear_model,
54
+ "quadratic": quad_model,
55
+ "difference": diff_model
56
+ }
57
+
58
+ if model_type == "best":
59
+ best_model_type = min(models, key=lambda x: models[x].aic)
60
+ else:
61
+ best_model_type = model_type
62
+
46
63
  best_model = models[best_model_type]
47
64
 
48
65
  if best_model_type == "mean":
49
66
  detrended = df[column_name] - mean_model.predict(np.ones(len(df)))
50
67
  elif best_model_type == "linear":
51
68
  detrended = df[column_name] - linear_model.predict(X_linear)
52
- else: # quadratic
69
+ elif best_model_type == "quadratic":
53
70
  detrended = df[column_name] - quad_model.predict(X_quad)
71
+ else: # difference
72
+ detrended = df[column_name].diff().dropna()
54
73
 
55
74
  return DetrendedData(detrended, best_model, best_model_type)
56
75
 
@@ -67,11 +86,10 @@ def compute_trend(detrended_data, future_time_points=None):
67
86
  Returns:
68
87
  - The retrended series as a pandas Series.
69
88
  """
70
- # if future_time_points is not of type pandas dataframe then convert it to one
71
89
  future_time_points = np.array(future_time_points)
72
90
 
73
- model_type = detrended_data.model_type[0]
74
- model = detrended_data.trend_model[0]
91
+ model_type = detrended_data.model_type.unique()[0]
92
+ model = detrended_data.trend_model.unique()[0]
75
93
 
76
94
  if model_type == "mean":
77
95
  trend_component = model.predict(
@@ -80,11 +98,14 @@ def compute_trend(detrended_data, future_time_points=None):
80
98
  elif model_type == "linear":
81
99
  X_linear = add_constant(future_time_points, has_constant="add")
82
100
  trend_component = model.predict(X_linear)
83
- else: # quadratic
101
+ elif model_type == "quadratic":
84
102
  X_quad = add_constant(
85
103
  np.column_stack((future_time_points, future_time_points**2)),
86
104
  has_constant="add",
87
105
  )
88
106
  trend_component = model.predict(X_quad)
107
+ else: # difference
108
+ trend_component = pd.Series(np.nan, index=future_time_points)
109
+ trend_component.iloc[0] = model.params[0] # Add mean of differenced series
89
110
 
90
111
  return trend_component
@@ -0,0 +1,55 @@
1
+ import pandas as pd
2
+ import hvplot.pandas
3
+ import panel as pn
4
+
5
+ # Load the CSV file
6
+ file_path = r'D:\Users\ritvik\projects\GEOGLAM\Output\fao\regional_cei_slope.csv'
7
+ data = pd.read_csv(file_path)
8
+
9
+ # Extract unique values for dropdowns
10
+ countries = data['Country'].unique().tolist()
11
+
12
+ # Create dropdown widgets
13
+ country_dropdown = pn.widgets.Select(name='Country', options=countries)
14
+ region_dropdown = pn.widgets.Select(name='Region', options=[])
15
+ crop_dropdown = pn.widgets.Select(name='Crop', options=[])
16
+ season_dropdown = pn.widgets.Select(name='Season', options=data['Season'].unique().tolist())
17
+
18
+
19
+ # Function to update region and crop options based on selected country
20
+ @pn.depends(country_dropdown.param.value, watch=True)
21
+ def update_region_and_crop_options(country):
22
+ filtered_data = data[data['Country'] == country]
23
+ regions = filtered_data['Region'].unique().tolist()
24
+ crops = filtered_data['Crop'].unique().tolist()
25
+
26
+ region_dropdown.options = regions
27
+ crop_dropdown.options = crops
28
+
29
+
30
+ # Function to filter data based on dropdown selections
31
+ @pn.depends(country_dropdown.param.value, region_dropdown.param.value, crop_dropdown.param.value,
32
+ season_dropdown.param.value)
33
+ def update_plot(country, region, crop, season):
34
+ filtered_data = data[(data['Country'] == country) &
35
+ (data['Region'] == region) &
36
+ (data['Crop'] == crop) &
37
+ (data['Season'] == season)]
38
+
39
+ if not filtered_data.empty:
40
+ plot = filtered_data.hvplot.scatter(x='Slope', y='Intercept',
41
+ hover_cols=['Growth Stage', 'p-value', 'Index', 'Description'])
42
+ return plot
43
+ else:
44
+ return pn.pane.Markdown("No data available for the selected combination.")
45
+
46
+
47
+ # Create the dashboard
48
+ dashboard = pn.Column(
49
+ pn.Row(country_dropdown, region_dropdown, crop_dropdown, season_dropdown),
50
+ update_plot
51
+ )
52
+
53
+ # Save as html page
54
+ dashboard.save('dashboard.html', embed=True)
55
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.1.48
3
+ Version: 0.1.49
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -32,6 +32,7 @@ geocif/cei/__init__.py
32
32
  geocif/cei/definitions.py
33
33
  geocif/cei/indices.py
34
34
  geocif/ml/__init__.py
35
+ geocif/ml/aa.py
35
36
  geocif/ml/correlations.py
36
37
  geocif/ml/embedding.py
37
38
  geocif/ml/feature_engineering.py
@@ -50,5 +51,6 @@ geocif/playground/__init__.py
50
51
  geocif/playground/automl.py
51
52
  geocif/playground/misc.py
52
53
  geocif/viz/__init__.py
54
+ geocif/viz/misc.py
53
55
  geocif/viz/plot.py
54
56
  tests/test_geocif.py
@@ -50,6 +50,6 @@ setup(
50
50
  test_suite="tests",
51
51
  tests_require=test_requirements,
52
52
  url="https://ritviksahajpal.github.io/yield_forecasting/",
53
- version="0.1.48",
53
+ version="0.1.49",
54
54
  zip_safe=False,
55
55
  )
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes