geocif 0.1.33__tar.gz → 0.1.35__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {geocif-0.1.33/geocif.egg-info → geocif-0.1.35}/PKG-INFO +1 -1
  2. {geocif-0.1.33 → geocif-0.1.35}/geocif/analysis.py +5 -5
  3. {geocif-0.1.33 → geocif-0.1.35}/geocif/cei/indices.py +12 -3
  4. {geocif-0.1.33 → geocif-0.1.35}/geocif/indices_runner.py +5 -4
  5. geocif-0.1.35/geocif/indices_runner_v2.py +207 -0
  6. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/correlations.py +35 -16
  7. geocif-0.1.35/geocif/ml/correlations_backup.py +412 -0
  8. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/stages.py +6 -3
  9. {geocif-0.1.33 → geocif-0.1.35}/geocif/playground/misc.py +72 -2
  10. {geocif-0.1.33 → geocif-0.1.35/geocif.egg-info}/PKG-INFO +1 -1
  11. {geocif-0.1.33 → geocif-0.1.35}/geocif.egg-info/SOURCES.txt +2 -0
  12. {geocif-0.1.33 → geocif-0.1.35}/setup.py +1 -1
  13. {geocif-0.1.33 → geocif-0.1.35}/LICENSE +0 -0
  14. {geocif-0.1.33 → geocif-0.1.35}/MANIFEST.in +0 -0
  15. {geocif-0.1.33 → geocif-0.1.35}/README.md +0 -0
  16. {geocif-0.1.33 → geocif-0.1.35}/geocif/__init__.py +0 -0
  17. {geocif-0.1.33 → geocif-0.1.35}/geocif/agmet/__init__.py +0 -0
  18. {geocif-0.1.33 → geocif-0.1.35}/geocif/agmet/geoagmet.py +0 -0
  19. {geocif-0.1.33 → geocif-0.1.35}/geocif/agmet/plot.py +0 -0
  20. {geocif-0.1.33 → geocif-0.1.35}/geocif/agmet/utils.py +0 -0
  21. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/__init__.py +0 -0
  22. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/constants.py +0 -0
  23. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/features.py +0 -0
  24. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/geo.py +0 -0
  25. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/geocif.py +0 -0
  26. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/metadata.py +0 -0
  27. {geocif-0.1.33 → geocif-0.1.35}/geocif/backup/models.py +0 -0
  28. {geocif-0.1.33 → geocif-0.1.35}/geocif/cei/__init__.py +0 -0
  29. {geocif-0.1.33 → geocif-0.1.35}/geocif/cei/definitions.py +0 -0
  30. {geocif-0.1.33 → geocif-0.1.35}/geocif/geocif.py +0 -0
  31. {geocif-0.1.33 → geocif-0.1.35}/geocif/logger.py +0 -0
  32. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/__init__.py +0 -0
  33. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/embedding.py +0 -0
  34. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/feature_engineering.py +0 -0
  35. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/feature_selection.py +0 -0
  36. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/outliers.py +0 -0
  37. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/outlook.py +0 -0
  38. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/output.py +0 -0
  39. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/spatial_autocorrelation.py +0 -0
  40. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/stats.py +0 -0
  41. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/trainers.py +0 -0
  42. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/trend.py +0 -0
  43. {geocif-0.1.33 → geocif-0.1.35}/geocif/ml/xai.py +0 -0
  44. {geocif-0.1.33 → geocif-0.1.35}/geocif/playground/__init__.py +0 -0
  45. {geocif-0.1.33 → geocif-0.1.35}/geocif/playground/automl.py +0 -0
  46. {geocif-0.1.33 → geocif-0.1.35}/geocif/utils.py +0 -0
  47. {geocif-0.1.33 → geocif-0.1.35}/geocif/viz/__init__.py +0 -0
  48. {geocif-0.1.33 → geocif-0.1.35}/geocif/viz/plot.py +0 -0
  49. {geocif-0.1.33 → geocif-0.1.35}/geocif.egg-info/dependency_links.txt +0 -0
  50. {geocif-0.1.33 → geocif-0.1.35}/geocif.egg-info/not-zip-safe +0 -0
  51. {geocif-0.1.33 → geocif-0.1.35}/geocif.egg-info/top_level.txt +0 -0
  52. {geocif-0.1.33 → geocif-0.1.35}/requirements.txt +0 -0
  53. {geocif-0.1.33 → geocif-0.1.35}/setup.cfg +0 -0
  54. {geocif-0.1.33 → geocif-0.1.35}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.33
+Version: 0.1.35
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
@@ -162,8 +162,8 @@ class Geoanalysis:
             return pd.DataFrame(), pd.DataFrame()

         df_metrics = self._compute_metrics(df)
-        df_metrics = self._process_metrics(df_metrics)
-        self._plot_metrics(df_metrics)
+        # df_metrics = self._process_metrics(df_metrics)
+        # self._plot_metrics(df_metrics)

         df_regional_metrics_by_year = self._compute_regional_metrics(
             df, by="Harvest Year"
@@ -172,9 +172,9 @@ class Geoanalysis:
             df_regional_metrics_by_year
         )
         df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
-
+        breakpoint()
         self._store_results(
-            df_metrics, df_regional_metrics, df_regional_metrics_by_year
+            None, df_regional_metrics, df_regional_metrics_by_year
        )

         df_national_yield = self._compute_national_yield(df)
@@ -195,7 +195,7 @@ class Geoanalysis:
             .apply(self.annual_metrics)
             .reset_index()
         )
-
+        breakpoint()
         return df_metrics.pivot_table(
             index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
             columns="level_5",
@@ -393,6 +393,7 @@ class CEIs:
             / self.admin_zone
             / self.country
         )
+
         os.makedirs(self.dir_output, exist_ok=True)
         os.makedirs(self.dir_intermediate, exist_ok=True)

@@ -465,7 +466,7 @@ class CEIs:

         extended_stages_list = []
         if self.method in ["phenological_stages", "fraction_season", "full_season"]:
-            extended_stages_list = [stages]
+            extended_stages_list = stages
         elif self.method in ["dekad_r", "biweekly_r", "monthly_r"]:
             # reverse stages
             stages = stages[::-1]
@@ -566,10 +567,10 @@ class CEIs:

         """
         if self.method in ["phenological_stages", "fraction_season"]:
-            mask = df_harvest_year_region[col].isin(stages)
+            mask = df_harvest_year_region[col].isin([stages])
             df_time_period = df_harvest_year_region[mask]

-            mask = df_all_years[col].isin(stages)
+            mask = df_all_years[col].isin([stages])
             df_base_period = df_all_years[mask]
         elif self.method in [
             "dekad",
@@ -605,6 +606,10 @@ class CEIs:
         Returns:

         """
+        # If stage is not a list then convert it to a list
+        if not isinstance(stage, list):
+            stage = [stage]
+
         columns = [
             "Description",
             "CEI",
@@ -721,6 +726,10 @@ class CEIs:
         :param index_details:
         :return:
         """
+        # If stage is not a list then convert it to a list
+        if not isinstance(stage, list):
+            stage = [stage]
+
         df = df[df["bounds"] == 1]
         # Exclude lat, lon, time, bounds and time_bounds columns
         df = df.drop(columns=["lat", "lon", "time", "bounds", "time_bounds"])
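The two new `isinstance` guards and the switch to `isin([stages])` address the same pitfall: `pandas.Series.isin` only accepts list-like arguments, so passing a bare string stage label raises `TypeError`. A minimal sketch of the pattern, with hypothetical stage labels:

```python
import pandas as pd

def coerce_to_list(stage):
    # Mirror the diff's guard: wrap a scalar stage in a list
    return stage if isinstance(stage, list) else [stage]

s = pd.Series(["sowing", "flowering", "harvest"])

# s.isin("flowering") would raise TypeError: isin() needs a list-like
print(s.isin(coerce_to_list("flowering")).tolist())           # [False, True, False]
print(s.isin(coerce_to_list(["sowing", "harvest"])).tolist())  # [True, False, True]
```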
@@ -165,11 +165,12 @@ class cei_runner(base.BaseGeo):
         combinations = [
             i
             for i in combinations
-            if "angola_maize" in i[3] or "lesotho_maize" in i[3] or
-            # "namibia" in i[2] or
-            # "united_republic_of_tanzania" in i[2] or
+            if "angola_maize" in i[3] or
+            "lesotho_maize" in i[3] or
+            # "namibia_" in i[2] or
+            "united_republic_of_tanzania_maize" in i[3] or
             "zambia_maize" in i[3] or "zimbabwe_maize" in i[3] or
-            # "south_africa" in i[2] or
+            "south_africa_maize" in i[3] or
             "mozambique_maize" in i[3]
         ]
         # "malawi" in i[2]]
@@ -0,0 +1,207 @@
+import itertools
+import warnings
+from multiprocessing import Pool, cpu_count
+from pathlib import Path
+
+import arrow as ar
+import pandas as pd
+from tqdm import tqdm
+
+warnings.filterwarnings("ignore")
+
+from .cei import indices
+from geoprepare import base
+
+
+def remove_duplicates(lst):
+    """
+
+    :param lst:
+    :return:
+    """
+    return list(set([i for i in lst]))
+
+
+def get_admin_zone(country, dg_shp):
+    admin_zone = "admin_1"
+    country = country.title().replace(" ", "_")
+
+    # Read in shapefile
+    dg_country = dg_shp[dg_shp["ADMIN0"] == country]
+
+    # Is the ADMIN2 column all None? If so, return admin_1 else return admin_2
+    if dg_country.empty:
+        admin_zone = "admin_1"
+    elif not dg_country["ADMIN2"].isna().all():
+        admin_zone = "admin_2"
+
+    return admin_zone
+
+
+class cei_runner(base.BaseGeo):
+    def __init__(self, path_config_file):
+        super().__init__(path_config_file)
+
+        # Parse configuration files
+        self.parse_config()
+
+        self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
+        self.base_dir = Path(self.parser.get("PATHS", "dir_crop_inputs"))
+        self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
+
+    def collect_files(self):
+        """
+        1. Collect all the files which contain EO information
+        2. Exclude files from the `processed` directory if a file with the
+           same name is already in the processed_include_fall directory
+        3. Create a dataframe that contains the following columns:
+           - directory: name of directory where file is located
+           - path: full path to file
+           - filename: name of file
+        :return: Return the dataframe created above
+        """
+        import geopandas as gp
+
+        dg_shp = gp.read_file(
+            self.dir_input
+            / "Global_Datasets"
+            / "Regions"
+            / "Shps"
+            / "adm_shapefile.shp",
+            engine="pyogrio",
+        )
+
+        # Collect all the files which contain EO information
+        df_files = pd.DataFrame(columns=["directory", "path", "filename", "admin_zone"])
+        for filepath in self.base_dir.rglob("*.csv"):
+            country = filepath.parents[0].name
+
+            admin_zone = get_admin_zone(country, dg_shp)
+
+            # If country is not in cc.COUNTRIES then skip
+            # HACK: Skip korea for now, as it is giving errors
+            if country == "republic_of_korea":
+                continue
+
+            # Get name of directory one level up
+            process_type = filepath.parents[1].name
+
+            # Get name of file
+            filename = filepath.name
+
+            # Add to dataframe
+            df_files.loc[len(df_files)] = [process_type, filepath, filename, admin_zone]
+
+        # Exclude those rows where directory is processed and file is already in
+        # processed_include_fall directory
+        no_fall = df_files["directory"] == "processed"
+        include_fall = df_files[df_files["directory"] == "processed_include_fall"][
+            "filename"
+        ]
+
+        df_files = df_files[~(no_fall & (df_files["filename"].isin(include_fall)))]
+
+        return df_files
+
+    def process_combinations(self, df, method):
+        """
+        Create a list of tuples of the following:
+        - directory: name of directory where file is located
+        - path: full path to file
+        - filename: name of file
+        - method: whether to compute indices for phenological stages or not
+        This tuple will be used as input to the `process` function
+        :param df:
+        :param method:
+        :return:
+        """
+        combinations = []
+
+        for index, row in tqdm(df.iterrows()):
+            combinations.extend(
+                list(
+                    itertools.product([row[0]], [row[1]], [row[2]], [row[3]], [method])
+                )
+            )
+
+        combinations = remove_duplicates(combinations)
+
+        return combinations
+
+    def main(self, method):
+        """
+
+        :param method:
+        :return:
+        """
+        # Create a dataframe of the files to be analyzed
+        df_files = self.collect_files()
+
+        combinations = self.process_combinations(df_files, method)
+
+        # Add an element to the tuple to indicate the season
+        # Last element is the redo flag which is True if the analysis is to be
+        # redone and False otherwise. Analysis is always redone for the current
+        # year and last year whether the file exists or not
+        combinations = [
+            (
+                self.parser,
+                status,
+                path,
+                filename,
+                admin_zone,
+                category,
+                year,
+                "ndvi",
+                False,  # redo
+            )
+            for year in range(2024, ar.utcnow().year + 1)
+            for status, path, filename, admin_zone, category in combinations
+        ]
+
+        # Only keep those entries in combinations whose filename (fourth
+        # element) matches malawi_maize_s1
+        # This is done to test the code for this country
+        combinations = [
+            i
+            for i in combinations
+            if "malawi_maize_s1" in i[3]
+        ]
+
+        if False:
+            num_cpu = int(cpu_count() * 0.3)
+            with Pool(num_cpu) as p:
+                for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
+                    pass
+        else:
+            # Use the code below if you want to test without parallelization or
+            # if you want to debug by using pdb
+            pbar = tqdm(combinations)
+            for i, val in enumerate(pbar):
+                pbar.set_description(
+                    f"Main loop {combinations[i][2]} {combinations[i][5]}"
+                )
+                indices.process(val)
+
+
+def run(path_config_files=[]):
+    """
+
+    Args:
+        path_config_files:
+
+    Returns:
+
+    """
+    # Check dictionary keys to have no spaces
+    indices.validate_index_definitions()
+
+    for method in [
+        "biweekly_r",  # "dekad_r"
+    ]:  # , "full_season", "phenological_stages", "fraction_season"]:
+        obj = cei_runner(path_config_files)
+        obj.main(method)
+
+
+if __name__ == "__main__":
+    run()
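The core of `main` is an expand-then-filter step: every collected file row is crossed with every year to build one task tuple per (file, year), then tuples are kept only if the filename matches the country under test. A standalone sketch of that idiom, with made-up rows and with the leading config-parser element omitted (so the filename sits at index 2 here rather than index 3):

```python
# Hypothetical (status, path, filename, admin_zone, category) rows
rows = [
    ("processed", "/data/malawi/malawi_maize_s1.csv", "malawi_maize_s1.csv", "admin_1", "maize"),
    ("processed", "/data/kenya/kenya_maize.csv", "kenya_maize.csv", "admin_1", "maize"),
]
years = range(2024, 2026)

# One task per (row, year) pair
tasks = [
    (status, path, filename, admin_zone, category, year)
    for year in years
    for status, path, filename, admin_zone, category in rows
]

# Keep only the file under test
tasks = [t for t in tasks if "malawi_maize_s1" in t[2]]
print(len(tasks))  # 2: one task per year for the Malawi file
```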
@@ -246,6 +246,7 @@ def all_correlated_feature_by_time(df, **kwargs):
     Returns:

     """
+    THRESHOLD = 0.1
     national_correlation = kwargs.get("national_correlation")
     group_by = kwargs.get("groupby")
     combined_dict = kwargs.get("combined_dict")
@@ -260,9 +261,20 @@ def all_correlated_feature_by_time(df, **kwargs):
         ):
             df_corr = _all_correlated_feature_by_time(group, **kwargs)

+            # Remove columns with more than 50% NaN values
+            df_corr = df_corr.dropna(thresh=len(df_corr) / 2, axis=1)
+
             if not df_corr.empty:
-                df_tmp = df_corr[df_corr.columns[(df_corr.mean() > 0.1)]]
-                dict_selected_features[region_id] = df_tmp.columns
+                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+                # Record the surviving columns along with their absolute median value
+                absolute_medians = df_tmp.abs().median()
+
+                # Create a DataFrame with the column names and their absolute median values
+                absolute_median_df = absolute_medians.reset_index()
+                absolute_median_df.columns = ["CEI", "Median"]
+
+                # Add the CEI and Median value to dict_selected_features
+                dict_selected_features[region_id] = absolute_median_df

                 df_tmp2 = (
                     df_tmp.median(axis=0)
@@ -290,24 +302,31 @@ def all_correlated_feature_by_time(df, **kwargs):
         else:
             # HACK
             df_corr = _all_correlated_feature_by_time(df, **kwargs)
-            dict_selected_features[region_id] = df_corr.columns
-            dict_best_cei[region_id] = {}

-            # dict_selected_features[region_id] = dict_selected_features[0]
-            # dict_best_cei[region_id] = dict_best_cei[0]
-            # Combine all unique values from the existing dictionary elements
-            # combined_metrics = set()
-            # for key in dict_selected_features:
-            #     breakpoint()
-            #     combined_metrics.update(dict_selected_features[key])
-            #
-            # # Add the combined set as a new element with key 3
-            # dict_selected_features[region_id] = sorted(list(combined_metrics))
+            df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+            # Record the surviving columns along with their absolute median value
+            absolute_medians = df_tmp.abs().median()
+
+            # Create a DataFrame with the column names and their absolute median values
+            absolute_median_df = absolute_medians.reset_index()
+            absolute_median_df.columns = ["CEI", "Median"]
+
+            # Add the CEI and Median value to dict_selected_features
+            dict_selected_features[region_id] = absolute_median_df
+            dict_best_cei[region_id] = {}
     else:
         df_corr = _all_correlated_feature_by_time(df, **kwargs)
-        dict_selected_features[0] = df_corr.columns
+        df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+        # Record the surviving columns along with their absolute median value
+        absolute_medians = df_tmp.abs().median()
+
+        # Create a DataFrame with the column names and their absolute median values
+        absolute_median_df = absolute_medians.reset_index()
+        absolute_median_df.columns = ["CEI", "Median"]
+
+        # Add the CEI and Median value to dict_selected_features
+        dict_selected_features[0] = absolute_median_df

-    df_corr = df_corr[df_corr.columns[(df_corr.mean() > 0.1)]]
     plot_feature_corr_by_time(df_corr, **kwargs)

     return dict_selected_features, dict_best_cei
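The reworked selection keeps any CEI column whose mean correlation exceeds the threshold in absolute value (so strongly negative correlates are no longer discarded, as they were by the old `mean() > 0.1` test) and records each survivor's absolute median. A small self-contained sketch of that filter with made-up correlation values:

```python
import pandas as pd

THRESHOLD = 0.1
# Rows: stages; columns: hypothetical CEIs with per-stage correlations
df_corr = pd.DataFrame(
    {"TG": [0.4, 0.5, 0.3], "DTR": [-0.6, -0.4, -0.5], "R99p": [0.05, -0.02, 0.01]}
)

# Keep columns with |mean correlation| > THRESHOLD; DTR survives despite being negative
df_tmp = df_corr[df_corr.columns[abs(df_corr.mean()) > THRESHOLD]]

# Record each surviving CEI with its absolute median correlation
absolute_median_df = df_tmp.abs().median().reset_index()
absolute_median_df.columns = ["CEI", "Median"]
print(absolute_median_df)
#    CEI  Median
# 0   TG     0.4
# 1  DTR     0.5
```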
@@ -0,0 +1,412 @@
+import os
+
+import matplotlib.pyplot as plt
+import palettable as pal
+import pandas as pd
+import seaborn as sns
+from tqdm import tqdm
+
+from geocif import utils
+from geocif.ml import embedding
+from geocif.ml import stages
+
+
+def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
+    """
+
+    Args:
+        df_train:
+        simulation_stages:
+        target_col:
+
+    Returns:
+
+    """
+    frames = []
+
+    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
+
+    # Only select columns that have been observed till the current stage
+    for stage in tqdm(stages, leave=False, desc="Compute most correlated feature"):
+        current_feature_set = [
+            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+        ]
+
+        # Get the most correlated feature for each region
+        top_feature_by_region, counter = embedding.get_top_correlated_features(
+            df_train[current_feature_set + ["Region"]],
+            df_train[target_col],
+        )
+
+        # Create a dataframe with the most common top feature and number of occurrences over timestep
+        _feature = counter.most_common(1)[0][0]
+        # Loop through top_feature_by_region and find the average score for
+        # _feature (e.g. 'DTR_36')
+        _feature_scores = [
+            value[1][0]
+            for key, value in top_feature_by_region.items()
+            if _feature in value[0]
+        ]
+        average_score = sum(_feature_scores) / len(_feature_scores)
+        _feature = utils.remove_last_part(_feature)
+
+        df = pd.DataFrame(
+            {
+                "Stage": [stage[-1]],
+                "Date": [utils.dict_growth_stages[stage[-1]]],
+                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
+                "Feature Category": [_feature],
+                "Score": [average_score],
+                # "Type": [ci.dict_indices[_feature][0]],
+                "Number of Occurrences": [counter.most_common(1)[0][1]],
+                # "Current Feature Set": [current_feature_set],
+            }
+        )
+        frames.append(df)
+
+    df_most_corr_feature_by_time = pd.concat(frames)
+
+
+def plot_feature_corr_by_time(df, **kwargs):
+    country = kwargs.get("country")
+    crop = kwargs.get("crop")
+    dir_output = kwargs.get("dir_output")
+    forecast_season = kwargs.get("forecast_season")
+    national_correlation = kwargs.get("national_correlation")
+    group_by = kwargs.get("groupby")
+
+    # Setup the figure and gridspec
+    fig = plt.figure(figsize=(10, 5))
+    gs = fig.add_gridspec(
+        3, 2, height_ratios=[6, 5, 1], width_ratios=[5, 1.5], hspace=0.6, wspace=0.0
+    )
+
+    # Assign subplots
+    ax_heatmap = fig.add_subplot(gs[0:2, 0])
+    ax_map = fig.add_subplot(gs[0, 1])
+    cbar_ax = fig.add_subplot(gs[2, 0])
+    ax4 = fig.add_subplot(gs[2, 1])
+
+    # Transpose and reverse the columns of the dataframe
+    # breakpoint()
+    # Only select the following columns:
+    df = df[
+        [
+            "TG",
+            "TG10p",
+            "DTR",
+            "vDTR",
+            "R99p",
+            "RX5day",
+            "MEAN_ESI4WK",
+        ]
+    ]
+    df_transpose = df.T
+    df = df_transpose[df_transpose.columns[::-1]]
+
+    # Split column names and only use value before space
+    df.columns = df.columns.str.split(" ").str[0]
+    # In row names, replace ESI4WK by ES
+    df.index = df.index.str.replace("MEAN_ESI4WK", "ZScore_ES")
+    df.index = df.index.str.replace("R99p", "MEAN_SM")
+    df.index = df.index.str.replace("RX5day", "AUC_SM")
+    # Select the first, third and fifth column
+    df = df[["Dec", "Feb", "Apr"]]
+    # Rename the columns to the growth periods they span
+    df.columns = [
+        "Planting - Early Vegetative",
+        "Early Vegetative - Senescence",
+        "Senescence - Harvest",
+    ]
+    ax_heatmap = sns.heatmap(
+        df,
+        ax=ax_heatmap,
+        annot=True,
+        cmap=pal.cartocolors.diverging.Earth_5.get_mpl_colormap(),
+        fmt=".2f",
+        square=False,
+        linewidths=0.5,
+        linecolor="white",
+        cbar_ax=cbar_ax,
+        cbar_kws={"orientation": "horizontal"},  # , "shrink": 0.5},
+        annot_kws={"size": 6},
+        xticklabels=True,
+        yticklabels=True,
+    )
+    ax_heatmap.tick_params(left=False, bottom=False)
+
+    # Plot the map using GeoPandas
+    dg_country = kwargs.get("dg_country")
+
+    ax_map = dg_country.plot(
+        ax=ax_map,
+        color="white",
+        edgecolor="black",
+        linewidth=1.0,
+        facecolor=None,
+        legend=False,
+    )
+
+    if not national_correlation:
+        id = kwargs["region_id"]
+        dg_region = dg_country[dg_country[group_by] == id]
+        ax_map = dg_region.plot(
+            ax=ax_map, color="blue", edgecolor="blue", linewidth=1.0, legend=False
+        )
+        # Set title with color blue
+        ax_map.set_title(f"Region: {id}", color="blue")
+
+    # No colorbar for the map
+    ax_map.axis("off")
+    # Remove borders
+    ax_map.spines["top"].set_visible(False)
+    ax_map.spines["right"].set_visible(False)
+    ax_map.spines["bottom"].set_visible(False)
+    ax_map.spines["left"].set_visible(False)
+    # ax4 should not be visible
+    ax4.axis("off")
+
+    # Add colorbar label
+    # cbar_ax.set_xlabel("Correlation Coefficient", labelpad=3, size="small")
+    cbar_ax.set_title("Correlation Coefficient", loc="left", size="small")
+    ax_heatmap.set_xticklabels(
+        ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=7
+    )
+    ax_heatmap.set_yticklabels(ax_heatmap.get_yticklabels(), size="x-small", fontsize=7)
+    ax_heatmap.set_xlabel("")
+    ax_heatmap.set_ylabel(" ")
+    # Reduce font size of ticks of colorbar
+    cbar_ax.tick_params(axis="both", which="major", labelsize=6)
+
+    _country = country.title().replace("_", " ")
+    _crop = crop.title().replace("_", " ")
+    if not national_correlation:
+        fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
+    else:
+        fname = f"{country}_{crop}_corr_feature_by_time.png"
+    ax_heatmap.set_title(f"{_country}\n{_crop}")
+
+    # plt.tight_layout()
+    os.makedirs(dir_output, exist_ok=True)
+    plt.savefig(dir_output / fname, dpi=250)
+    plt.close()
+
+
+def _all_correlated_feature_by_time(df, **kwargs):
+    """
+
+    Args:
+        df:
+        **kwargs:
+
+    Returns:
+
+    """
+    frames = []
+    all_stages = kwargs.get("all_stages")
+    target_col = kwargs.get("target_col")
+    method = kwargs.get("method")
+
+    longest_stage = max(all_stages, key=len)
+
+    # Split the original string into a list of its parts
+    longest_stage = longest_stage.split("_")
+
+    # Generate the list of strings, removing one element from the start each time
+    stages_features = ["_".join(longest_stage[i:]) for i in range(len(longest_stage))]
+
+    # Drop columns with no yield information
+    df = df.dropna(subset=[target_col])
+
+    # Only select columns that have been observed till the current stage
+    pbar = tqdm(stages_features, total=len(stages_features), leave=False)
+    for stage in pbar:
+        pbar.set_description("Calculating correlations")
+        pbar.update()
+
+        stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+            "Stage Name"
+        ]
+        # starting_stage = stage_name.split("-")[0]
+        current_feature_set = [col for col in df.columns if stage_name in col]
+
+        # Get the most correlated feature for each region
+        df_tmp = embedding.get_all_features_correlation(
+            df[current_feature_set + ["Region"]], df[target_col], method
+        )
+
+        frames.append(df_tmp)
+
+    df_results = pd.concat(frames)
+    if not df_results.empty:
+        # Exclude Region column
+        df_results = df_results.drop(columns="Region")
+        # Groupby Dekad and compute mean of all columns apart from Region
+        df_results = df_results.groupby(method).mean()
+
+        all_stage_names = []
+        for stage in stages_features:
+            _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)[
+                "Stage Name"
+            ]
+            all_stage_names.append(_tmp)
+
+        df_results = df_results.reindex(all_stage_names)
+
+        # Drop rows with all NaN values
+        df_results = df_results.dropna(how="all")
+
+        # Split the index based on - and only keep the first element
+        df_results.index = df_results.index.str.split("-").str[0]
+
+        return df_results
+    else:
+        return pd.DataFrame()
+
+
+def all_correlated_feature_by_time(df, **kwargs):
+    """
+
+    Args:
+        df:
+        **kwargs:
+
+    Returns:
+
+    """
+    THRESHOLD = 0.1
+    national_correlation = kwargs.get("national_correlation")
+    group_by = kwargs.get("groupby")
+    combined_dict = kwargs.get("combined_dict")
+
+    dict_selected_features = {}
+    dict_best_cei = {}
+
+    if not national_correlation:
+        groups = df.groupby(group_by)
+        for region_id, group in tqdm(
+            groups, desc=f"Compute all correlated feature by {group_by}", leave=False
+        ):
+            df_corr = _all_correlated_feature_by_time(group, **kwargs)
+
+            # Remove columns with more than 50% NaN values
+            df_corr = df_corr.dropna(thresh=len(df_corr) / 2, axis=1)
+
+            if not df_corr.empty:
+                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+                # Record the surviving columns along with their absolute median value
+                absolute_medians = df_tmp.abs().median()
+
+                # Create a DataFrame with the column names and their absolute median values
+                absolute_median_df = absolute_medians.reset_index()
+                absolute_median_df.columns = ["CEI", "Median"]
+
+                # Add the CEI and Median value to dict_selected_features
+                dict_selected_features[region_id] = absolute_median_df
+
+                df_tmp2 = (
+                    df_tmp.median(axis=0)
+                    .abs()
+                    .sort_values(ascending=False)
+                    .reset_index()
+                )
+                df_tmp2.columns = ["Metric", "Value"]
+                # Add another column based on Type of Metric
+                for idx, row in df_tmp2.iterrows():
+                    df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]
+
+                # Compute median of each CEI and sort the dataframe based on the absolute value of the median
+                dict_best_cei[region_id] = (
+                    df_tmp2.groupby("Type")
+                    .max()
+                    .reset_index()
+                    .sort_values("Value", ascending=False)["Metric"]
+                    .values
+                )
+
+                kwargs["region_id"] = region_id
+                plot_feature_corr_by_time(df_tmp, **kwargs)
+                # For each element in dict_best_cei, add the type of the cei
+            else:
+                # HACK
+                df_corr = _all_correlated_feature_by_time(df, **kwargs)
+
+                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+                # Record the surviving columns along with their absolute median value
+                absolute_medians = df_tmp.abs().median()
+
+                # Create a DataFrame with the column names and their absolute median values
+                absolute_median_df = absolute_medians.reset_index()
+                absolute_median_df.columns = ["CEI", "Median"]
+
+                # Add the CEI and Median value to dict_selected_features
+                dict_selected_features[region_id] = absolute_median_df
+                dict_best_cei[region_id] = {}
+    else:
+        df_corr = _all_correlated_feature_by_time(df, **kwargs)
+        df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
+        # Record the surviving columns along with their absolute median value
+        absolute_medians = df_tmp.abs().median()
+
+        # Create a DataFrame with the column names and their absolute median values
+        absolute_median_df = absolute_medians.reset_index()
+        absolute_median_df.columns = ["CEI", "Median"]
+
+        # Add the CEI and Median value to dict_selected_features
+        dict_selected_features[0] = absolute_median_df
+
+        plot_feature_corr_by_time(df_corr, **kwargs)
+
+    return dict_selected_features, dict_best_cei
+
+
+def feature_correlation_by_time(**kwargs):
+    raise NotImplementedError()
+
+    frames = []
+    simulation_stages = kwargs.get("simulation_stages")
+    df_train = kwargs.get("df_train")
+    target_col = kwargs.get("target_col")
+
+    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
+
+    # Only select columns that have been observed till the current stage
+    for stage in tqdm(stages, leave=False, desc="Compute feature correlation by time"):
+        current_feature_set = [
+            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
+        ]
+
+        # Get the most correlated feature for each region
+        top_feature_by_region, counter = embedding.compute_feature_correlations(
+            df_train[current_feature_set + ["Region"]],
+            df_train[target_col],
+            "all",
+        )
+
+        # Create a dataframe with the most common top feature and number of occurrences over timestep
+        _feature = counter.most_common(1)[0][0]
+        # Loop through top_feature_by_region and find the average score for
+        # _feature (e.g. 'DTR_36')
+        _feature_scores = [
+            value[1][0]
+            for key, value in top_feature_by_region.items()
+            if _feature in value[0]
+        ]
+        average_score = sum(_feature_scores) / len(_feature_scores)
+        _feature = utils.remove_last_part(_feature)
+
+        df = pd.DataFrame(
+            {
+                "Stage": [stage[-1]],
+                "Date": [utils.dict_growth_stages[stage[-1]]],
+                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
+                "Feature Category": [_feature],
+                "Score": [average_score],
+                # "Type": [ci.dict_indices[_feature][0]],
+                "Number of Occurrences": [counter.most_common(1)[0][1]],
+                # "Current Feature Set": [current_feature_set],
+            }
+        )
+        frames.append(df)
+
+    df_corr_feature_by_time = pd.concat(frames)
@@ -144,10 +144,13 @@ def select_stages_for_ml(stages_features, method="latest", n=100):
144
144
 
145
145
  selected_stages = []
146
146
  if method == "latest":
147
+ # Find the longest array in the list of arrays
148
+ selected_stages = [max(stages_features, key=len)]
149
+
147
150
  # Only select those arrays in the list of arrays that are starting with latest_stage
148
- for stage in stages_features:
149
- if stage[0] == latest_stage[0]:
150
- selected_stages.append(stage)
151
+ # for stage in stages_features:
152
+ # if stage[0] == latest_stage[0]:
153
+ # selected_stages.append(stage)
151
154
  elif method == "fraction":
152
155
  # Filter arrays with exactly 2 elements
153
156
  two_element_arrays = []
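With this change, `method="latest"` reduces to picking the single longest stage sequence rather than matching sequences against `latest_stage`. The core of that is just the built-in `max(..., key=len)`; a quick illustration with hypothetical cumulative stage arrays:

```python
# Hypothetical cumulative stage sequences (newest stage first)
stages_features = [
    [37],
    [37, 36],
    [37, 36, 35, 34],
    [37, 36, 35],
]

# "latest" now keeps only the longest sequence, i.e. the full season to date
selected_stages = [max(stages_features, key=len)]
print(selected_stages)  # [[37, 36, 35, 34]]
```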
@@ -1,6 +1,76 @@
-import pandas as pd
+import geopandas as gpd
+import pygmt
 import matplotlib.pyplot as plt
-import matplotlib.patches as patches
+from matplotlib.lines import Line2D
+import matplotlib.patches as mpatches
+import os
+
+filtered_shapefile_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Regions\Shps\filtered_shapefile5.shp"
+
+if not os.path.isfile(filtered_shapefile_path):
+
+    # Load the shapefile using GeoPandas
+    shapefile_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Regions\Shps\adm_shapefile.shp"
+    gdf = gpd.read_file(shapefile_path, engine="pyogrio")
+
+    # Only keep one row per ADMIN0
+    gdf = gdf.drop_duplicates(subset="ADMIN0")
+
+    sh2_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Regions\Shps\Level_1.shp"
+    gdf2 = gpd.read_file(sh2_path, engine="pyogrio")
+
+    # Subset gdf2 to the USA
+    gdf2 = gdf2[gdf2["ADM0_NAME"].isin(["United States of America"])]
+
+    # Exclude Alaska and Hawaii from the USA
+    gdf2 = gdf2[~gdf2["ADM1_NAME"].isin(["Alaska", "Hawaii"])]
+
+    # Now combine all the states into one polygon
+    gdf2 = gdf2.dissolve(by="ADM0_NAME")
+    gdf2 = gdf2.reset_index()
+
+    # Rename ADM0_NAME to ADMIN0 for consistency
+    gdf2.rename(columns={"ADM0_NAME": "ADMIN0"}, inplace=True)
+
+    # Only keep ADMIN0 and geometry columns in gdf and gdf2
+    gdf = gdf[["ADMIN0", "geometry"]]
+    gdf2 = gdf2[["ADMIN0", "geometry"]]
+
+    # Merge gdf and gdf2
+    import pandas as pd
+    gdf = pd.concat([gdf, gdf2], ignore_index=True)
+
+    # Save the filtered shapefile as a temporary file
+    gdf.to_file(filtered_shapefile_path)
+else:
+    gdf = gpd.read_file(filtered_shapefile_path, engine="pyogrio")
+
+# Create the global map with highlighted countries
+fig = pygmt.Figure()
+
+# Define the region of interest and projection
+# fig.basemap(region="g", projection="R12c/20", frame=True)
+fig.basemap(region=[-135, 60, -35, 53], projection="Q12c", frame=True)
+
+# Use the coast function to draw land and water
+fig.coast(land="lightgray", water="lightcyan")
+
+# Highlight the countries using the filtered shapefile
+fig.plot(data=filtered_shapefile_path, pen="0.35p,black")
+
+# Add hatches to Pakistan and Afghanistan
+gdf_filled = gdf[gdf["ADMIN0"].isin(["Pakistan", "Afghanistan"])]
+for _, row in gdf_filled.iterrows():
+    fill_gdf = gpd.GeoDataFrame([row], columns=gdf.columns)
+    with pygmt.helpers.GMTTempFile() as tmpfile:
+        fill_gdf.to_file(tmpfile.name, driver="GeoJSON")
+        fig.plot(data=tmpfile.name, pen="0.35p,black", fill="black@50+h")
+
+# Save the figure
+fig.savefig("global_choropleth_highlighted_v1.png", dpi=1000)
+
+# Show the figure
+fig.show()

 import matplotlib.pyplot as plt
 import cartopy.crs as ccrs
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.33
+Version: 0.1.35
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
@@ -8,6 +8,7 @@ geocif/__init__.py
 geocif/analysis.py
 geocif/geocif.py
 geocif/indices_runner.py
+geocif/indices_runner_v2.py
 geocif/logger.py
 geocif/utils.py
 geocif.egg-info/PKG-INFO
@@ -31,6 +32,7 @@ geocif/cei/definitions.py
 geocif/cei/indices.py
 geocif/ml/__init__.py
 geocif/ml/correlations.py
+geocif/ml/correlations_backup.py
 geocif/ml/embedding.py
 geocif/ml/feature_engineering.py
 geocif/ml/feature_selection.py
@@ -50,6 +50,6 @@ setup(
     test_suite="tests",
     tests_require=test_requirements,
     url="https://ritviksahajpal.github.io/yield_forecasting/",
-    version="0.1.33",
+    version="0.1.35",
     zip_safe=False,
 )