geocif 0.1.32__tar.gz → 0.1.34__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {geocif-0.1.32/geocif.egg-info → geocif-0.1.34}/PKG-INFO +1 -1
  2. {geocif-0.1.32 → geocif-0.1.34}/geocif/analysis.py +5 -5
  3. {geocif-0.1.32 → geocif-0.1.34}/geocif/cei/indices.py +11 -3
  4. {geocif-0.1.32 → geocif-0.1.34}/geocif/geocif.py +44 -3
  5. {geocif-0.1.32 → geocif-0.1.34}/geocif/indices_runner.py +5 -4
  6. geocif-0.1.34/geocif/indices_runner_v2.py +208 -0
  7. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/correlations.py +3 -0
  8. geocif-0.1.34/geocif/ml/correlations_backup.py +412 -0
  9. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/spatial_autocorrelation.py +6 -7
  10. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/stages.py +6 -3
  11. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/trainers.py +34 -0
  12. {geocif-0.1.32 → geocif-0.1.34}/geocif/playground/misc.py +72 -2
  13. {geocif-0.1.32 → geocif-0.1.34/geocif.egg-info}/PKG-INFO +1 -1
  14. {geocif-0.1.32 → geocif-0.1.34}/geocif.egg-info/SOURCES.txt +2 -0
  15. {geocif-0.1.32 → geocif-0.1.34}/setup.py +1 -1
  16. {geocif-0.1.32 → geocif-0.1.34}/LICENSE +0 -0
  17. {geocif-0.1.32 → geocif-0.1.34}/MANIFEST.in +0 -0
  18. {geocif-0.1.32 → geocif-0.1.34}/README.md +0 -0
  19. {geocif-0.1.32 → geocif-0.1.34}/geocif/__init__.py +0 -0
  20. {geocif-0.1.32 → geocif-0.1.34}/geocif/agmet/__init__.py +0 -0
  21. {geocif-0.1.32 → geocif-0.1.34}/geocif/agmet/geoagmet.py +0 -0
  22. {geocif-0.1.32 → geocif-0.1.34}/geocif/agmet/plot.py +0 -0
  23. {geocif-0.1.32 → geocif-0.1.34}/geocif/agmet/utils.py +0 -0
  24. {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/__init__.py +0 -0
  25. {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/constants.py +0 -0
  26. {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/features.py +0 -0
  27. {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/geo.py +0 -0
  28. {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/geocif.py +0 -0
  29. {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/metadata.py +0 -0
  30. {geocif-0.1.32 → geocif-0.1.34}/geocif/backup/models.py +0 -0
  31. {geocif-0.1.32 → geocif-0.1.34}/geocif/cei/__init__.py +0 -0
  32. {geocif-0.1.32 → geocif-0.1.34}/geocif/cei/definitions.py +0 -0
  33. {geocif-0.1.32 → geocif-0.1.34}/geocif/logger.py +0 -0
  34. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/__init__.py +0 -0
  35. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/embedding.py +0 -0
  36. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/feature_engineering.py +0 -0
  37. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/feature_selection.py +0 -0
  38. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/outliers.py +0 -0
  39. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/outlook.py +0 -0
  40. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/output.py +0 -0
  41. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/stats.py +0 -0
  42. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/trend.py +0 -0
  43. {geocif-0.1.32 → geocif-0.1.34}/geocif/ml/xai.py +0 -0
  44. {geocif-0.1.32 → geocif-0.1.34}/geocif/playground/__init__.py +0 -0
  45. {geocif-0.1.32 → geocif-0.1.34}/geocif/playground/automl.py +0 -0
  46. {geocif-0.1.32 → geocif-0.1.34}/geocif/utils.py +0 -0
  47. {geocif-0.1.32 → geocif-0.1.34}/geocif/viz/__init__.py +0 -0
  48. {geocif-0.1.32 → geocif-0.1.34}/geocif/viz/plot.py +0 -0
  49. {geocif-0.1.32 → geocif-0.1.34}/geocif.egg-info/dependency_links.txt +0 -0
  50. {geocif-0.1.32 → geocif-0.1.34}/geocif.egg-info/not-zip-safe +0 -0
  51. {geocif-0.1.32 → geocif-0.1.34}/geocif.egg-info/top_level.txt +0 -0
  52. {geocif-0.1.32 → geocif-0.1.34}/requirements.txt +0 -0
  53. {geocif-0.1.32 → geocif-0.1.34}/setup.cfg +0 -0
  54. {geocif-0.1.32 → geocif-0.1.34}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.32
+Version: 0.1.34
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
@@ -162,8 +162,8 @@ class Geoanalysis:
             return pd.DataFrame(), pd.DataFrame()
 
         df_metrics = self._compute_metrics(df)
-        df_metrics = self._process_metrics(df_metrics)
-        self._plot_metrics(df_metrics)
+        #df_metrics = self._process_metrics(df_metrics)
+        #self._plot_metrics(df_metrics)
 
         df_regional_metrics_by_year = self._compute_regional_metrics(
             df, by="Harvest Year"
@@ -172,9 +172,9 @@ class Geoanalysis:
             df_regional_metrics_by_year
         )
         df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
-
+        breakpoint()
         self._store_results(
-            df_metrics, df_regional_metrics, df_regional_metrics_by_year
+            None, df_regional_metrics, df_regional_metrics_by_year
         )
 
         df_national_yield = self._compute_national_yield(df)
@@ -195,7 +195,7 @@ class Geoanalysis:
             .apply(self.annual_metrics)
             .reset_index()
         )
-
+        breakpoint()
         return df_metrics.pivot_table(
             index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
             columns="level_5",
@@ -465,7 +465,7 @@ class CEIs:
 
         extended_stages_list = []
         if self.method in ["phenological_stages", "fraction_season", "full_season"]:
-            extended_stages_list = [stages]
+            extended_stages_list = stages
         elif self.method in ["dekad_r", "biweekly_r", "monthly_r"]:
             # reverse stages
             stages = stages[::-1]
@@ -566,10 +566,10 @@ class CEIs:
 
         """
         if self.method in ["phenological_stages", "fraction_season"]:
-            mask = df_harvest_year_region[col].isin(stages)
+            mask = df_harvest_year_region[col].isin([stages])
            df_time_period = df_harvest_year_region[mask]
 
-            mask = df_all_years[col].isin(stages)
+            mask = df_all_years[col].isin([stages])
             df_base_period = df_all_years[mask]
         elif self.method in [
             "dekad",
@@ -605,6 +605,10 @@ class CEIs:
         Returns:
 
         """
+        # If stage is not a list then convert it to a list
+        if not isinstance(stage, list):
+            stage = [stage]
+
         columns = [
             "Description",
             "CEI",
@@ -721,6 +725,10 @@ class CEIs:
         :param index_details:
         :return:
         """
+        # If stage is not a list then convert it to a list
+        if not isinstance(stage, list):
+            stage = [stage]
+
         df = df[df["bounds"] == 1]
         # Exclude lat, lon, time, bounds and time_bounds columns
         df = df.drop(columns=["lat", "lon", "time", "bounds", "time_bounds"])
@@ -222,6 +222,10 @@ class Geocif:
         self.logger.info(f"Selected features: {self.selected_features}")
 
         """ Update model to include conformal estimates """
+        if "lat" not in self.selected_features:
+            self.selected_features.append("lat")
+        if "lon" not in self.selected_features:
+            self.selected_features.append("lon")
         X_train = df_region[self.selected_features + self.cat_features]
         dir_output = (
             self.dir_analysis
@@ -275,6 +279,12 @@ class Geocif:
                 verbose=False,
                 # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
             )
+        elif self.model_name == "geospaNN":
+            self.model.fit(
+                X_train,
+                y_train,
+                # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
+            )
         elif self.model_name == "merf":
             Z_train = np.ones((len(X_train), 1))
             clusters_train = df_region["Region"]
@@ -341,6 +351,25 @@ class Geocif:
                 X_test, Z_test, clusters_test.astype("object")
             )
             best_hyperparameters = self.model.fe_model.get_params().copy()
+        elif self.model_name == "geospaNN":
+            import torch
+            import geospaNN
+
+            # Remove any categorical features
+            X_test = X_test.drop(columns=self.cat_features)
+            X = torch.from_numpy(X_test.to_numpy()).float()
+            coord = torch.from_numpy(self.df_test[['lon', 'lat']].to_numpy()).float()
+
+            p = X.shape[1]
+            n = X.shape[0]
+            nn = 5
+
+            data = geospaNN.make_graph(X, Y, coord, nn)
+
+            # remove categorical features from df_train
+            data_train = df_region[self.selected_features + self.cat_features + [self.target]]
+            w_train = data_train.y - self.estimate(data_train.x)
+
         else:
             y_pred = self.model.predict(X_test)
             best_hyperparameters = self.model.get_params().copy()
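Note: as released, the geospaNN prediction branch above is incomplete — it references an undefined `Y`, assigns a plain DataFrame to `data_train`, and never produces `y_pred`. A minimal sketch of how the branch might be completed, assuming `self.model` is the `geospaNN.nngls` instance trained in `trainers.auto_train` and `data_train` is the training graph built there with `geospaNN.make_graph`; the placeholder `Y_test` tensor is illustrative, since predictions do not use test targets:

    # Sketch only: build a test graph with placeholder targets, then apply
    # geospaNN's kriging-style predict() (documented in its examples).
    Y_test = torch.zeros(n)
    data_test = geospaNN.make_graph(X, Y_test, coord, nn)

    # MLP mean estimate plus a spatial correction interpolated from the
    # training residuals; conversion via .detach().numpy() may be needed.
    y_pred = self.model.predict(data_train, data_test)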
@@ -458,9 +487,10 @@ class Geocif:
             "Harvest Year",
             "Stage Name",
         ]
-        df.index = df.apply(
-            lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
-        )
+        try:
+            df.index = df.apply(lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1)
+        except Exception as e:
+            breakpoint()
 
         # name the index level
         df.index.set_names(["Index"], inplace=True)
@@ -527,6 +557,10 @@ class Geocif:
         if self.use_outlook_as_feature:
             self.feature_names.append("FCST")
 
+        # Add lat and lon to feature names
+        self.feature_names.append("lat")
+        self.feature_names.append("lon")
+
         self.selected_features = []
 
     def loop_ml(self, stages, dict_selected_features, dict_best_cei):
@@ -782,6 +816,13 @@ class Geocif:
             how="outer",
         )
 
+        # Add a lat and lon column to self.dg_country
+        self.dg_country["lat"] = self.dg_country.centroid.y
+        self.dg_country["lon"] = self.dg_country.centroid.x
+
+        # Add lat and lon columns to df by merging on Country Region column
+        df = df.merge(self.dg_country[["Country Region", "lat", "lon"]].drop_duplicates(), on="Country Region", how="left")
+
         dict_kwargs = {}
         dict_kwargs["all_stages"] = self.all_stages
         dict_kwargs["target_col"] = self.target
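Note: `GeoDataFrame.centroid` on an unprojected lat/lon layer, as used above, is only approximately correct and triggers a GeoPandas warning. A hedged sketch of the more robust pattern, projecting to an equal-area CRS first; the function name and the choice of EPSG:6933 are illustrative, not from the package:

    import geopandas as gpd

    def add_centroid_lat_lon(dg: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        # Project to an equal-area CRS, take centroids, then convert back
        # to geographic coordinates before extracting lat/lon.
        centroids = dg.geometry.to_crs(epsg=6933).centroid.to_crs(epsg=4326)
        dg["lat"] = centroids.y
        dg["lon"] = centroids.x
        return dg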
@@ -165,11 +165,12 @@ class cei_runner(base.BaseGeo):
         combinations = [
             i
             for i in combinations
-            if "angola_maize" in i[3] or "lesotho_maize" in i[3] or
-            # "namibia" in i[2] or
-            # "united_republic_of_tanzania" in i[2] or
+            if "angola_maize" in i[3] or
+            "lesotho_maize" in i[3] or
+            # "namibia_" in i[2] or
+            "united_republic_of_tanzania_maize" in i[3] or
             "zambia_maize" in i[3] or "zimbabwe_maize" in i[3] or
-            # "south_africa" in i[2] or
+            "south_africa_maize" in i[3] or
             "mozambique_maize" in i[3]
         ]
         # "malawi" in i[2]]
@@ -0,0 +1,208 @@
+import itertools
+import warnings
+from multiprocessing import Pool, cpu_count
+from pathlib import Path
+
+import arrow as ar
+import pandas as pd
+from tqdm import tqdm
+
+warnings.filterwarnings("ignore")
+
+from .cei import indices
+from geoprepare import base
+
+
+def remove_duplicates(lst):
+    """
+
+    :param lst:
+    :return:
+    """
+    return list(set([i for i in lst]))
+
+
+def get_admin_zone(country, dg_shp):
+    admin_zone = "admin_1"
+    country = country.title().replace(" ", "_")
+
+    # Read in shapefile
+    dg_country = dg_shp[dg_shp["ADMIN0"] == country]
+
+    # Is the ADMIN2 column all None? If so, return admin_1 else return admin_2
+    if dg_country.empty:
+        admin_zone = "admin_1"
+    elif not dg_country["ADMIN2"].isna().all():
+        admin_zone = "admin_2"
+
+    return admin_zone
+
+
+class cei_runner(base.BaseGeo):
+    def __init__(self, path_config_file):
+        super().__init__(path_config_file)
+
+        # Parse configuration files
+        self.parse_config()
+
+        self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
+        self.base_dir = Path(self.parser.get("PATHS", "dir_crop_inputs"))
+        self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
+
+    def collect_files(self):
+        """
+        1. Collect all the files which contain EO information
+        2. Exclude files from the `processed` directory if it is already in
+           the processed_include_fall directory
+        3. Create a dataframe that contains the following columns:
+           - directory: name of directory where file is located
+           - path: full path to file
+           - filename: name of file
+        :return: Return the dataframe created above
+        """
+        import geopandas as gp
+
+        dg_shp = gp.read_file(
+            self.dir_input
+            / "Global_Datasets"
+            / "Regions"
+            / "Shps"
+            / "adm_shapefile.shp",
+            engine="pyogrio",
+        )
+
+        # Collect all the files which contain EO information
+        df_files = pd.DataFrame(columns=["directory", "path", "filename", "admin_zone"])
+        for filepath in self.base_dir.rglob("*.csv"):
+            country = filepath.parents[0].name
+
+            admin_zone = get_admin_zone(country, dg_shp)
+
+            # If country is not in cc.COUNTRIES then skip
+            # HACK: Skip korea for now, as it is giving errors
+            if country == "republic_of_korea":
+                continue
+
+            # Get name of directory one level up
+            process_type = filepath.parents[1].name
+
+            # Get name of file
+            filename = filepath.name
+
+            # Add to dataframe
+            df_files.loc[len(df_files)] = [process_type, filepath, filename, admin_zone]
+
+        # Exclude those rows where directory is processed and file is already in
+        # processed_include_fall directory
+        no_fall = df_files["directory"] == "processed"
+        include_fall = df_files[df_files["directory"] == "processed_include_fall"][
+            "filename"
+        ]
+
+        df_files = df_files[~(no_fall & (df_files["filename"].isin(include_fall)))]
+
+        return df_files
+
+    def process_combinations(self, df, method):
+        """
+        Create a list of tuples of the following:
+        - directory: name of directory where file is located
+        - path: full path to file
+        - filename: name of file
+        - method: whether to compute indices for phenological stages or not
+        This tuple will be used as input to the `process` function
+        :param df:
+        :param method:
+        :return:
+        """
+        combinations = []
+
+        for index, row in tqdm(df.iterrows()):
+            combinations.extend(
+                list(
+                    itertools.product([row[0]], [row[1]], [row[2]], [row[3]], [method])
+                )
+            )
+
+        combinations = remove_duplicates(combinations)
+
+        return combinations
+
+    def main(self, method):
+        """
+
+        :param method:
+        :return:
+        """
+        # Create a dataframe of the files to be analyzed
+        df_files = self.collect_files()
+
+        combinations = self.process_combinations(df_files, method)
+
+        # Add an element to the tuple to indicate the season
+        # Last element is redo flag which is True if the analysis is to be redone
+        # and False otherwise. Analysis is always redone for the current year
+        # and last year whether file exists or not
+        combinations = [
+            (
+                self.parser,
+                status,
+                path,
+                filename,
+                admin_zone,
+                category,
+                year,
+                "ndvi",
+                False,  # redo
+            )
+            for year in range(2001, ar.utcnow().year + 1)
+            for status, path, filename, admin_zone, category in combinations
+        ]
+
+        # Only keep those entries in combinations where the third element is
+        # mozambique, south_africa, angola or dem_people's_rep_of_korea
+        # This is done to test the code for these countries
+        #combinations = [
+        #    i
+        #    for i in combinations
+        #    if "ethiopia_maize_s1" in i[3]
+        #]
+        # "malawi" in i[2]]
+
+        if True:
+            num_cpu = int(cpu_count() * 0.8)
+            with Pool(num_cpu) as p:
+                for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
+                    pass
+        else:
+            # Use the code below if you want to test without parallelization or
+            # if you want to debug by using pdb
+            pbar = tqdm(combinations)
+            for i, val in enumerate(pbar):
+                pbar.set_description(
+                    f"Main loop {combinations[i][2]} {combinations[i][5]}"
+                )
+                indices.process(val)
+
+
+def run(path_config_files=[]):
+    """
+
+    Args:
+        path_config_files:
+
+    Returns:
+
+    """
+    """ Check dictionary keys to have no spaces"""
+    indices.validate_index_definitions()
+
+    for method in [
+        "phenological_stages",  # "dekad_r"
+    ]:  # , "full_season", "phenological_stages", "fraction_season"]:
+        obj = cei_runner(path_config_files)
+        obj.main(method)
+
+
+if __name__ == "__main__":
+    run()
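Note: the new module is driven the same way as the existing indices_runner. A minimal invocation sketch; the config file path is illustrative, not taken from the package:

    from geocif import indices_runner_v2

    # run() validates the index definitions, then processes every
    # (file, year) combination for the configured method(s).
    indices_runner_v2.run(["path/to/geocif_config.ini"])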
@@ -260,6 +260,9 @@ def all_correlated_feature_by_time(df, **kwargs):
         ):
             df_corr = _all_correlated_feature_by_time(group, **kwargs)
 
+            # Remove columns with more than 50% NaN values
+            df_corr = df_corr.dropna(thresh=len(df_corr) / 2, axis=1)
+
             if not df_corr.empty:
                 df_tmp = df_corr[df_corr.columns[(df_corr.mean() > 0.1)]]
                 dict_selected_features[region_id] = df_tmp.columns
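Note: in `DataFrame.dropna`, `thresh` is the minimum count of non-NA values a column must have to be kept, and `len(df_corr) / 2` is a float. A small self-contained example of the intended "drop columns that are more than 50% NaN" behavior, using an explicit integer threshold:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, np.nan, np.nan, np.nan],
                       "b": [1.0, 2.0, 3.0, np.nan]})

    # Keep columns with at least half of their values non-NaN;
    # "a" (1 of 4 non-NaN) is dropped, "b" (3 of 4) is kept.
    df = df.dropna(thresh=int(np.ceil(len(df) / 2)), axis=1)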
@@ -0,0 +1,412 @@
+import os
+
+import matplotlib.pyplot as plt
+import palettable as pal
+import pandas as pd
+import seaborn as sns
+from tqdm import tqdm
+
+from geocif import utils
+from geocif.ml import embedding
+from geocif.ml import stages
+
+
+def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
+    """
+
+    Args:
+        df_train:
+        simulation_stages:
+        target_col:
+
+    Returns:
+
+    """
+    frames = []
+
+    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
+
+    # Only select columns that have been observed till the current stage
+    for stage in tqdm(stages, leave=False, desc="Compute most correlated feature"):
+        current_feature_set = [
+            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+        ]
+
+        # Get the most correlated feature for each region
+        top_feature_by_region, counter = embedding.get_top_correlated_features(
+            df_train[current_feature_set + ["Region"]],
+            df_train[target_col],
+        )
+
+        # Create a dataframe with the most common top feature and number of occurrences over timestep
+        _feature = counter.most_common(1)[0][0]
+        # Loop through top_feature_by_region and find the average score for _feature
+        # Calculate the average score for 'DTR_36'
+        _feature_scores = [
+            value[1][0]
+            for key, value in top_feature_by_region.items()
+            if _feature in value[0]
+        ]
+        average_score = sum(_feature_scores) / len(_feature_scores)
+        _feature = utils.remove_last_part(_feature)
+
+        df = pd.DataFrame(
+            {
+                "Stage": [stage[-1]],
+                "Date": [utils.dict_growth_stages[stage[-1]]],
+                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
+                "Feature Category": [_feature],
+                "Score": [average_score],
+                # "Type": [ci.dict_indices[_feature][0]],
+                "Number of Occurrences": [counter.most_common(1)[0][1]],
+                # "Current Feature Set": [current_feature_set],
+            }
+        )
+        frames.append(df)
+
+    df_most_corr_feature_by_time = pd.concat(frames)
+
+
+def plot_feature_corr_by_time(df, **kwargs):
+    country = kwargs.get("country")
+    crop = kwargs.get("crop")
+    dir_output = kwargs.get("dir_output")
+    forecast_season = kwargs.get("forecast_season")
+    national_correlation = kwargs.get("national_correlation")
+    group_by = kwargs.get("groupby")
+
+    # Setup the figure and gridspec
+    fig = plt.figure(figsize=(10, 5))
+    gs = fig.add_gridspec(
+        3, 2, height_ratios=[6, 5, 1], width_ratios=[5, 1.5], hspace=0.6, wspace=0.0
+    )
+
+    # Assign subplots
+    ax_heatmap = fig.add_subplot(gs[0:2, 0])
+    ax_map = fig.add_subplot(gs[0, 1])
+    cbar_ax = fig.add_subplot(gs[2, 0])
+    ax4 = fig.add_subplot(gs[2, 1])
+
+    # Transpose and reverse the columns of the dataframe
+    #breakpoint()
+    ## Only select foll. columns:
+
+    df = df[
+        [
+            "TG",
+            "TG10p",
+            "DTR",
+            "vDTR",
+            "R99p",
+            "RX5day",
+            "MEAN_ESI4WK",
+        ]
+    ]
+    df_transpose = df.T
+    df = df_transpose[df_transpose.columns[::-1]]
+
+    # Split column names and only use value before space
+    df.columns = df.columns.str.split(" ").str[0]
+    # In row names, replace ESI4WK by ES
+    df.index = df.index.str.replace("MEAN_ESI4WK", "ZScore_ES")
+    df.index = df.index.str.replace("R99p", "MEAN_SM")
+    df.index = df.index.str.replace("RX5day", "AUC_SM")
+    # Remove the last row
+    # Select the first, third and fifth column
+    df = df[["Dec", "Feb", "Apr"]]
+    # Rename Dec to Planting - Early Vegetative
+    # Rename Feb to Early Vegetative - Senescence
+    # Rename Apr to Senescence - Harvest
+    df.columns = ["Planting - Early Vegetative", "Early Vegetative - Senescence", "Senescence - Harvest"]
+    ax_heatmap = sns.heatmap(
+        df,
+        ax=ax_heatmap,
+        annot=True,
+        cmap=pal.cartocolors.diverging.Earth_5.get_mpl_colormap(),
+        fmt=".2f",
+        square=False,
+        linewidths=0.5,
+        linecolor="white",
+        cbar_ax=cbar_ax,
+        cbar_kws={"orientation": "horizontal"},  # , "shrink": 0.5},
+        annot_kws={"size": 6},
+        xticklabels=True,
+        yticklabels=True,
+    )
+    ax_heatmap.tick_params(left=False, bottom=False)
+
+    # Plot the map using GeoPandas
+    dg_country = kwargs.get("dg_country")
+
+    ax_map = dg_country.plot(
+        ax=ax_map,
+        color="white",
+        edgecolor="black",
+        linewidth=1.0,
+        facecolor=None,
+        legend=False,
+    )
+
+    if not national_correlation:
+        id = kwargs["region_id"]
+        dg_region = dg_country[dg_country[group_by] == id]
+        ax_map = dg_region.plot(
+            ax=ax_map, color="blue", edgecolor="blue", linewidth=1.0, legend=False
+        )
+        # Set title with color blue
+        ax_map.set_title(f"Region: {id}", color="blue")
+
+    # No colorbar for the map
+    ax_map.axis("off")
+    # Remove borders
+    ax_map.spines["top"].set_visible(False)
+    ax_map.spines["right"].set_visible(False)
+    ax_map.spines["bottom"].set_visible(False)
+    ax_map.spines["left"].set_visible(False)
+    # ax4 should not be visible
+    ax4.axis("off")
+
+    # Add colorbar label
+    # cbar_ax.set_xlabel("Correlation Coefficient", labelpad=3, size="small")
+    cbar_ax.set_title("Correlation Coefficient", loc="left", size="small")
+    ax_heatmap.set_xticklabels(
+        ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=7
+    )
+    ax_heatmap.set_yticklabels(ax_heatmap.get_yticklabels(), size="x-small", fontsize=7)
+    ax_heatmap.set_xlabel("")
+    ax_heatmap.set_ylabel(" ")
+    # Reduce font size of ticks of colorbar
+    cbar_ax.tick_params(axis="both", which="major", labelsize=6)
+
+    _country = country.title().replace("_", " ")
+    _crop = crop.title().replace("_", " ")
+    if not national_correlation:
+        fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
+    else:
+        fname = f"{country}_{crop}_corr_feature_by_time.png"
+    ax_heatmap.set_title(f"{_country}\n{_crop}")
+
+    # plt.tight_layout()
+    os.makedirs(dir_output, exist_ok=True)
+    plt.savefig(dir_output / fname, dpi=250)
+    plt.close()
+
+
+def _all_correlated_feature_by_time(df, **kwargs):
+    """
+
+    Args:
+        df:
+        **kwargs:
+
+    Returns:
+
+    """
+    frames = []
+    all_stages = kwargs.get("all_stages")
+    target_col = kwargs.get("target_col")
+    method = kwargs.get("method")
+
+    longest_stage = max(all_stages, key=len)
+
+    # Split the original string into a list of its parts
+    longest_stage = longest_stage.split("_")
+
+    # Generate the list of strings as described by the user, removing one element from the start each time
+    stages_features = ["_".join(longest_stage[i:]) for i in range(len(longest_stage))]
+
+    # Drop columns with no yield information
+    df = df.dropna(subset=[target_col])
+
+    # Only select columns that have been observed till the current stage
+    pbar = tqdm(stages_features, total=len(stages_features), leave=False)
+    for stage in pbar:
+        pbar.set_description(f"Calculating correlations")
+        pbar.update()
+
+        stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+            "Stage Name"
+        ]
+        # starting_stage = stage_name.split("-")[0]
+        current_feature_set = [col for col in df.columns if stage_name in col]
+
+        # Get the most correlated feature for each region
+        df_tmp = embedding.get_all_features_correlation(
+            df[current_feature_set + ["Region"]], df[target_col], method
+        )
+
+        frames.append(df_tmp)
+
+    df_results = pd.concat(frames)
+    if not df_results.empty:
+        # Exclude Region column
+        df_results = df_results.drop(columns="Region")
+        # Groupby Dekad and compute mean of all columns apart from Region
+        df_results = df_results.groupby(method).mean()
+
+        all_stage_names = []
+        for stage in stages_features:
+            _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+                "Stage Name"
+            ]
+            all_stage_names.append(_tmp)
+
+        df_results = df_results.reindex(all_stage_names)
+
+        # Drop rows with all NaN values
+        df_results = df_results.dropna(how="all")
+
+        # Split the index based on - and only keep the first element
+        df_results.index = df_results.index.str.split("-").str[0]
+
+        return df_results
+    else:
+        return pd.DataFrame()
+
+
+def all_correlated_feature_by_time(df, **kwargs):
+    """
+
+    Args:
+        df:
+        **kwargs:
+
+    Returns:
+
+    """
+    THRESHOLD = 0.1
+    national_correlation = kwargs.get("national_correlation")
+    group_by = kwargs.get("groupby")
+    combined_dict = kwargs.get("combined_dict")
+
+    dict_selected_features = {}
+    dict_best_cei = {}
+
+    if not national_correlation:
+        groups = df.groupby(group_by)
+        for region_id, group in tqdm(
+            groups, desc=f"Compute all correlated feature by {group_by}", leave=False
+        ):
+            df_corr = _all_correlated_feature_by_time(group, **kwargs)
+
+            # Remove columns with more than 50% NaN values
+            df_corr = df_corr.dropna(thresh=len(df_corr) / 2, axis=1)
+
+            if not df_corr.empty:
+                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+                # Add the columns to dict_selected_features along with the absolute mean value
+                absolute_medians = df_tmp.abs().median()
+
+                # Create a DataFrame to display the column names and their absolute median values
+                absolute_median_df = absolute_medians.reset_index()
+                absolute_median_df.columns = ['CEI', 'Median']
+
+                # Add the CEI and Median value to dict_selected_features
+                dict_selected_features[region_id] = absolute_median_df
+
+                df_tmp2 = (
+                    df_tmp.median(axis=0)
+                    .abs()
+                    .sort_values(ascending=False)
+                    .reset_index()
+                )
+                df_tmp2.columns = ["Metric", "Value"]
+                # Add another column based on Type of Metric
+                for idx, row in df_tmp2.iterrows():
+                    df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]
+
+                # Compute median of each CEI and sort the dataframe based on the absolute value of the median
+                dict_best_cei[region_id] = (
+                    df_tmp2.groupby("Type")
+                    .max()
+                    .reset_index()
+                    .sort_values("Value", ascending=False)["Metric"]
+                    .values
+                )
+
+                kwargs["region_id"] = region_id
+                plot_feature_corr_by_time(df_tmp, **kwargs)
+                # For each element in dict_best_cei, add the type of the cei
+            else:
+                # HACK
+                df_corr = _all_correlated_feature_by_time(df, **kwargs)
+
+                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+                # Add the columns to dict_selected_features along with the absolute mean value
+                absolute_medians = df_tmp.abs().median()
+
+                # Create a DataFrame to display the column names and their absolute median values
+                absolute_median_df = absolute_medians.reset_index()
+                absolute_median_df.columns = ['CEI', 'Median']
+
+                # Add the CEI and Median value to dict_selected_features
+                dict_selected_features[region_id] = absolute_median_df
+                dict_best_cei[region_id] = {}
+    else:
+        df_corr = _all_correlated_feature_by_time(df, **kwargs)
+        df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+        # Add the columns to dict_selected_features along with the absolute mean value
+        absolute_medians = df_tmp.abs().median()
+
+        # Create a DataFrame to display the column names and their absolute median values
+        absolute_median_df = absolute_medians.reset_index()
+        absolute_median_df.columns = ['CEI', 'Median']
+
+        # Add the CEI and Median value to dict_selected_features
+        dict_selected_features[0] = absolute_median_df
+
+        plot_feature_corr_by_time(df_corr, **kwargs)
+
+    return dict_selected_features, dict_best_cei
+
+
+def feature_correlation_by_time(**kwargs):
+    raise NotImplementedError()
+
+    frames = []
+    simulation_stages = kwargs.get("simulation_stages")
+    df_train = kwargs.get("df_train")
+    target_col = kwargs.get("target_col")
+
+    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
+
+    # Only select columns that have been observed till the current stage
+    for stage in tqdm(stages, leave=False, desc="Compute feature correlation by time"):
+        current_feature_set = [
+            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+        ]
+
+        # Get the most correlated feature for each region
+        top_feature_by_region, counter = embedding.compute_feature_correlations(
+            df_train[current_feature_set + ["Region"]],
+            df_train[target_col],
+            "all",
+        )
+
+        # Create a dataframe with the most common top feature and number of occurrences over timestep
+        _feature = counter.most_common(1)[0][0]
+        # Loop through top_feature_by_region and find the average score for _feature
+        # Calculate the average score for 'DTR_36'
+        _feature_scores = [
+            value[1][0]
+            for key, value in top_feature_by_region.items()
+            if _feature in value[0]
+        ]
+        average_score = sum(_feature_scores) / len(_feature_scores)
+        _feature = utils.remove_last_part(_feature)
+
+        df = pd.DataFrame(
+            {
+                "Stage": [stage[-1]],
+                "Date": [utils.dict_growth_stages[stage[-1]]],
+                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
+                "Feature Category": [_feature],
+                "Score": [average_score],
+                # "Type": [ci.dict_indices[_feature][0]],
+                "Number of Occurrences": [counter.most_common(1)[0][1]],
+                # "Current Feature Set": [current_feature_set],
+            }
+        )
+        frames.append(df)
+
+    df_corr_feature_by_time = pd.concat(frames)
@@ -77,7 +77,7 @@ def create_base_weights(merged_df):
     return w_base, dg
 
 
-def create_weights_for_year(dg_country, regions_with_data):
+def create_weights_for_year(dg_country, regions_with_data, year):
     """
 
     Args:
@@ -97,10 +97,8 @@ def create_weights_for_year(dg_country, regions_with_data):
     ]
     if no_neighbors:
         dg = dg.drop(index=no_neighbors[0]).reset_index(drop=True)
-    try:
-        wt = weights.Queen.from_dataframe(dg[["Country Region", "geometry"]])
-    except:
-        breakpoint()
+    wt = weights.Queen.from_dataframe(dg[["Country Region", "geometry"]])
+
     return wt, dg
 
 
@@ -125,6 +123,8 @@ def compute_morans_i(merged_df):
     for year in tqdm(years, desc="Compute Moran's I"):
         year_data = merged_df[merged_df["Harvest Year"] == year]
         regions_with_data = year_data["Country Region"].unique()
+        if len(regions_with_data) < 3:
+            continue
         year_data = year_data[year_data["Country Region"].isin(regions_with_data)]
 
         y = year_data[
@@ -132,9 +132,8 @@ def compute_morans_i(merged_df):
         ].drop_duplicates()
         dg_country = year_data[["Country Region", "geometry"]].drop_duplicates()
 
-        w, x = create_weights_for_year(dg_country, regions_with_data)
+        w, x = create_weights_for_year(dg_country, regions_with_data, year)
         y = y[y["Country Region"].isin(x["Country Region"])]
-
         if len(y) > 1:
             try:
                 mi = esda.Moran(y["Yield (tn per ha)"].values, w, permutations=999)
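Note: the new `len(regions_with_data) < 3` guard skips years where contiguity weights would be degenerate. For reference, a self-contained sketch of the Queen-weights / Moran's I pattern this module uses, on three toy polygons (the geometry and yield values are illustrative):

    import geopandas as gpd
    import numpy as np
    from shapely.geometry import box
    from libpysal import weights
    from esda.moran import Moran

    # Three touching unit squares: the smallest layout where every region
    # has a Queen-contiguity neighbor, so Moran's I is well defined.
    gdf = gpd.GeoDataFrame(
        {"Country Region": ["A", "B", "C"]},
        geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1), box(2, 0, 3, 1)],
    )
    y = np.array([1.0, 2.0, 3.0])

    w = weights.Queen.from_dataframe(gdf)  # same call as create_weights_for_year
    mi = Moran(y, w, permutations=999)     # same call as compute_morans_i
    print(mi.I, mi.p_sim)                  # statistic and pseudo p-value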
@@ -144,10 +144,13 @@ def select_stages_for_ml(stages_features, method="latest", n=100):
 
     selected_stages = []
     if method == "latest":
+        # Find the longest array in the list of arrays
+        selected_stages = [max(stages_features, key=len)]
+
         # Only select those arrays in the list of arrays that are starting with latest_stage
-        for stage in stages_features:
-            if stage[0] == latest_stage[0]:
-                selected_stages.append(stage)
+        # for stage in stages_features:
+        #     if stage[0] == latest_stage[0]:
+        #         selected_stages.append(stage)
     elif method == "fraction":
         # Filter arrays with exactly 2 elements
         two_element_arrays = []
@@ -289,6 +289,40 @@ def auto_train(
         model = LinearGAM(n_splines=25, spline_order=3).gridsearch(
             X_train.values, y_train.values, lam=np.logspace(-3, 3, 11)
         )
+    elif model_name == "geospaNN":
+        import torch
+        import geospaNN
+
+        # Remove any categorical features
+        X_train = X_train.drop(columns=cat_features)
+        X = torch.from_numpy(X_train.to_numpy()).float()
+        Y = torch.from_numpy(y_train.to_numpy().reshape(-1)).float()
+
+        coord = torch.from_numpy(df_train[['lon', 'lat']].to_numpy()).float()
+
+        p = X.shape[1]
+        n = X.shape[0]
+        nn = 5
+
+        data = geospaNN.make_graph(X, Y, coord, nn)
+
+        mlp = torch.nn.Sequential(
+            torch.nn.Linear(p, 50),
+            torch.nn.ReLU(),
+            torch.nn.Linear(50, 20),
+            torch.nn.ReLU(),
+            torch.nn.Linear(20, 10),
+            torch.nn.ReLU(),
+            torch.nn.Linear(10, 1),
+        )
+
+        # Split data
+        data_train, data_val, data_test = geospaNN.split_data(X, Y, coord, neighbor_size=nn, test_proportion=0.1)
+        theta0 = geospaNN.theta_update(torch.tensor([1, 1.5, 0.01]), mlp(data_train.x).squeeze() - data_train.y, data_train.pos, neighbor_size=5)
+        model = geospaNN.nngls(p=p, neighbor_size=nn, coord_dimensions=2, mlp=mlp, theta=torch.tensor(theta0))
+        nngls_model = geospaNN.nngls_train(model, lr=0.01, min_delta=0.001)
+        # Log training process
+        training_log = nngls_model.train(data_train, data_val, data_test, Update_init=10, Update_step=10)
     elif model_name == "xgboost":
         raise NotImplementedError
     else:
@@ -1,6 +1,76 @@
-import pandas as pd
+import geopandas as gpd
+import pygmt
 import matplotlib.pyplot as plt
-import matplotlib.patches as patches
+from matplotlib.lines import Line2D
+import matplotlib.patches as mpatches
+import os
+filtered_shapefile_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Regions\Shps\filtered_shapefile5.shp"
+
+if not os.path.isfile(filtered_shapefile_path):
+
+    # Load the shapefile using GeoPandas
+    shapefile_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Regions\Shps\adm_shapefile.shp"
+    gdf = gpd.read_file(shapefile_path, engine="pyogrio")
+
+    # Only keep one row per ADMIN0
+    gdf = gdf.drop_duplicates(subset="ADMIN0")
+
+    sh2_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Regions\Shps\Level_1.shp"
+    gdf2 = gpd.read_file(sh2_path, engine="pyogrio")
+
+    # Subset gdf2 to USA, Pakistan and Afghanistan
+    gdf2 = gdf2[gdf2["ADM0_NAME"].isin(["United States of America"])]
+
+    # Exclude Alaska and Hawaii from the USA
+    gdf2 = gdf2[~gdf2["ADM1_NAME"].isin(["Alaska", "Hawaii"])]
+
+    # Now combine all the states into one polygon
+    gdf2 = gdf2.dissolve(by="ADM0_NAME")
+    gdf2 = gdf2.reset_index()
+
+    # Rename ADM0_NAME to ADMIN0 for consistency
+    gdf2.rename(columns={"ADM0_NAME": "ADMIN0"}, inplace=True)
+
+    # Only keep ADMIN0 and geometry columns in gdf and gdf2
+    gdf = gdf[["ADMIN0", "geometry"]]
+    gdf2 = gdf2[["ADMIN0", "geometry"]]
+
+    # Merge gdf and gdf2
+    import pandas as pd
+    gdf = pd.concat([gdf, gdf2], ignore_index=True)
+
+    # Save the filtered shapefile as a temporary file
+
+    gdf.to_file(filtered_shapefile_path)
+else:
+    gdf = gpd.read_file(filtered_shapefile_path, engine="pyogrio")
+
+# Create the global map with highlighted countries
+fig = pygmt.Figure()
+
+# Define the region of interest and projection
+# fig.basemap(region="g", projection="R12c/20", frame=True)
+fig.basemap(region=[-135, 60, -35, 53], projection="Q12c", frame=True)
+
+# Use the coast function to draw land and water
+fig.coast(land="lightgray", water="lightcyan")
+
+# Highlight the countries using the filtered shapefile
+fig.plot(data=filtered_shapefile_path, pen="0.35p,black")
+
+# Add hatches to Pakistan and Afghanistan
+gdf_filled = gdf[gdf["ADMIN0"].isin(["Pakistan", "Afghanistan"])]
+for _, row in gdf_filled.iterrows():
+    fill_gdf = gpd.GeoDataFrame([row], columns=gdf.columns)
+    with pygmt.helpers.GMTTempFile() as tmpfile:
+        fill_gdf.to_file(tmpfile.name, driver="GeoJSON")
+        fig.plot(data=tmpfile.name, pen="0.35p,black", fill="black@50+h")
+
+# Save the figure
+fig.savefig("global_choropleth_highlighted_v1.png", dpi=1000)
+
+# Show the figure
+fig.show()
 
 import matplotlib.pyplot as plt
 import cartopy.crs as ccrs
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.32
+Version: 0.1.34
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
@@ -8,6 +8,7 @@ geocif/__init__.py
 geocif/analysis.py
 geocif/geocif.py
 geocif/indices_runner.py
+geocif/indices_runner_v2.py
 geocif/logger.py
 geocif/utils.py
 geocif.egg-info/PKG-INFO
@@ -31,6 +32,7 @@ geocif/cei/definitions.py
 geocif/cei/indices.py
 geocif/ml/__init__.py
 geocif/ml/correlations.py
+geocif/ml/correlations_backup.py
 geocif/ml/embedding.py
 geocif/ml/feature_engineering.py
 geocif/ml/feature_selection.py
@@ -50,6 +50,6 @@ setup(
     test_suite="tests",
     tests_require=test_requirements,
     url="https://ritviksahajpal.github.io/yield_forecasting/",
-    version="0.1.32",
+    version="0.1.34",
     zip_safe=False,
 )