geocif 0.1.68__tar.gz → 0.1.69__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {geocif-0.1.68/geocif.egg-info → geocif-0.1.69}/PKG-INFO +1 -1
  2. {geocif-0.1.68 → geocif-0.1.69}/geocif/geocif.py +21 -21
  3. {geocif-0.1.68 → geocif-0.1.69}/geocif/geocif_runner.py +34 -34
  4. geocif-0.1.69/geocif/indices_runner_angola.py +212 -0
  5. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/correlations.py +10 -7
  6. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/embedding.py +11 -8
  7. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/feature_engineering.py +6 -5
  8. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/trainers.py +1 -1
  9. geocif-0.1.69/geocif/playground/wolayita_maize_mask.py +156 -0
  10. geocif-0.1.69/geocif/viz/gt.py +69 -0
  11. {geocif-0.1.68 → geocif-0.1.69}/geocif/viz/tmp.py +1 -1
  12. {geocif-0.1.68 → geocif-0.1.69/geocif.egg-info}/PKG-INFO +1 -1
  13. {geocif-0.1.68 → geocif-0.1.69}/geocif.egg-info/SOURCES.txt +2 -0
  14. {geocif-0.1.68 → geocif-0.1.69}/setup.py +1 -1
  15. geocif-0.1.68/geocif/indices_runner_angola.py +0 -212
  16. {geocif-0.1.68 → geocif-0.1.69}/LICENSE +0 -0
  17. {geocif-0.1.68 → geocif-0.1.69}/MANIFEST.in +0 -0
  18. {geocif-0.1.68 → geocif-0.1.69}/README.md +0 -0
  19. {geocif-0.1.68 → geocif-0.1.69}/geocif/__init__.py +0 -0
  20. {geocif-0.1.68 → geocif-0.1.69}/geocif/agmet/__init__.py +0 -0
  21. {geocif-0.1.68 → geocif-0.1.69}/geocif/agmet/geoagmet.py +0 -0
  22. {geocif-0.1.68 → geocif-0.1.69}/geocif/agmet/plot.py +0 -0
  23. {geocif-0.1.68 → geocif-0.1.69}/geocif/agmet/utils.py +0 -0
  24. {geocif-0.1.68 → geocif-0.1.69}/geocif/analysis.py +0 -0
  25. {geocif-0.1.68 → geocif-0.1.69}/geocif/backup/__init__.py +0 -0
  26. {geocif-0.1.68 → geocif-0.1.69}/geocif/backup/constants.py +0 -0
  27. {geocif-0.1.68 → geocif-0.1.69}/geocif/backup/features.py +0 -0
  28. {geocif-0.1.68 → geocif-0.1.69}/geocif/backup/geo.py +0 -0
  29. {geocif-0.1.68 → geocif-0.1.69}/geocif/backup/geocif.py +0 -0
  30. {geocif-0.1.68 → geocif-0.1.69}/geocif/backup/metadata.py +0 -0
  31. {geocif-0.1.68 → geocif-0.1.69}/geocif/backup/models.py +0 -0
  32. {geocif-0.1.68 → geocif-0.1.69}/geocif/cei/__init__.py +0 -0
  33. {geocif-0.1.68 → geocif-0.1.69}/geocif/cei/definitions.py +0 -0
  34. {geocif-0.1.68 → geocif-0.1.69}/geocif/cei/indices.py +0 -0
  35. {geocif-0.1.68 → geocif-0.1.69}/geocif/experiments.py +0 -0
  36. {geocif-0.1.68 → geocif-0.1.69}/geocif/indices_runner.py +0 -0
  37. {geocif-0.1.68 → geocif-0.1.69}/geocif/indices_runner_madagascar.py +0 -0
  38. {geocif-0.1.68 → geocif-0.1.69}/geocif/indices_runner_malawi.py +0 -0
  39. {geocif-0.1.68 → geocif-0.1.69}/geocif/indices_runner_mozambique.py +0 -0
  40. {geocif-0.1.68 → geocif-0.1.69}/geocif/indices_runner_south_africa.py +0 -0
  41. {geocif-0.1.68 → geocif-0.1.69}/geocif/indices_runner_zambia.py +0 -0
  42. {geocif-0.1.68 → geocif-0.1.69}/geocif/indices_runner_zimbabwe.py +0 -0
  43. {geocif-0.1.68 → geocif-0.1.69}/geocif/logger.py +0 -0
  44. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/__init__.py +0 -0
  45. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/feature_selection.py +0 -0
  46. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/outliers.py +0 -0
  47. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/outlook.py +0 -0
  48. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/output.py +0 -0
  49. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/spatial_autocorrelation.py +0 -0
  50. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/stages.py +0 -0
  51. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/stats.py +0 -0
  52. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/trend.py +0 -0
  53. {geocif-0.1.68 → geocif-0.1.69}/geocif/ml/xai.py +0 -0
  54. {geocif-0.1.68 → geocif-0.1.69}/geocif/mm.py +0 -0
  55. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/__init__.py +0 -0
  56. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/aa.py +0 -0
  57. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/area.py +0 -0
  58. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/automl.py +0 -0
  59. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/download_esi.py +0 -0
  60. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/enso.py +0 -0
  61. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/eval.py +0 -0
  62. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/gamtest.py +0 -0
  63. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/gee_access.py +0 -0
  64. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/misc.py +0 -0
  65. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/play_xagg.py +0 -0
  66. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/reg.py +0 -0
  67. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/sustain.py +0 -0
  68. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/test_catboost.py +0 -0
  69. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/tmp.py +0 -0
  70. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/tmp2.py +0 -0
  71. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/tmp3.py +0 -0
  72. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/tmp4.py +0 -0
  73. {geocif-0.1.68 → geocif-0.1.69}/geocif/playground/tmp5.py +0 -0
  74. {geocif-0.1.68 → geocif-0.1.69}/geocif/risk/__init__.py +0 -0
  75. {geocif-0.1.68 → geocif-0.1.69}/geocif/risk/impact_assessment.py +0 -0
  76. {geocif-0.1.68 → geocif-0.1.69}/geocif/utils.py +0 -0
  77. {geocif-0.1.68 → geocif-0.1.69}/geocif/viz/__init__.py +0 -0
  78. {geocif-0.1.68 → geocif-0.1.69}/geocif/viz/plot.py +0 -0
  79. {geocif-0.1.68 → geocif-0.1.69}/geocif.egg-info/dependency_links.txt +0 -0
  80. {geocif-0.1.68 → geocif-0.1.69}/geocif.egg-info/not-zip-safe +0 -0
  81. {geocif-0.1.68 → geocif-0.1.69}/geocif.egg-info/top_level.txt +0 -0
  82. {geocif-0.1.68 → geocif-0.1.69}/requirements.txt +0 -0
  83. {geocif-0.1.68 → geocif-0.1.69}/setup.cfg +0 -0
  84. {geocif-0.1.68 → geocif-0.1.69}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: geocif
- Version: 0.1.68
+ Version: 0.1.69
  Summary: Models to visualize and forecast crop conditions and yields
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
  Author: Ritvik Sahajpal
@@ -243,11 +243,11 @@ class Geocif:
  if any(cei in column for cei in self.use_ceis)
  ]
  else:
- # self.logger.info(f"Selecting features for {self.country} {self.crop}")
+ self.logger.info(f"Selecting features for {self.country} {self.crop}")
  selector, _, self.selected_features = fs.select_features(
  X_train, y_train, method=self.feature_selection
  )
- # self.logger.info(f"Selected features: {self.selected_features}")
+ self.logger.info(f"Selected features: {self.selected_features}")

  """ Update model to include conformal estimates """
  if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
@@ -306,7 +306,7 @@ class Geocif:
  X_train,
  y_train,
  cat_features=self.cat_features,
- verbose=False,
+ verbose=True,
  )
  elif self.model_name in ["ngboost", "oblique", "tabpfn"]:
  X_train = X_train.drop(
@@ -598,15 +598,15 @@ class Geocif:
  df_region[f"Median {self.target}"].values, 3
  )

- # if f"Median {self.target} (2014-2018)" in df_region.columns:
- # df.loc[:, f"Median {self.target} (2014-2018)"] = np.around(
- # df_region[f"Median {self.target} (2014-2018)"].values, 3
- # )
- #
- # if f"Median {self.target} (2013-2017)" in df_region.columns:
- # df.loc[:, f"Median {self.target} (2013-2017)"] = np.around(
- # df_region[f"Median {self.target} (2013-2017)"].values, 3
- # )
+ if f"Median {self.target} (2018-2022)" in df_region.columns:
+ df.loc[:, f"Median {self.target} (2018-2022)"] = np.around(
+ df_region[f"Median {self.target} (2018-2022)"].values, 3
+ )
+
+ if f"Median {self.target} (2013-2017)" in df_region.columns:
+ df.loc[:, f"Median {self.target} (2013-2017)"] = np.around(
+ df_region[f"Median {self.target} (2013-2017)"].values, 3
+ )

  if self.estimate_ci:
  if self.estimate_ci_for_all or self.forecast_season == self.today_year:
@@ -820,8 +820,8 @@ class Geocif:
  + self.statistics_columns
  + self.feature_names
  + [f"Median {self.target}"]
- #+ [f"Median {self.target} (2014-2018)"]
- #+ [f"Median {self.target} (2013-2017)"]
+ + [f"Median {self.target} (2018-2022)"]
+ + [f"Median {self.target} (2013-2017)"]
  + ["Region_ID"]
  )
  if self.check_yield_trend:
@@ -1011,13 +1011,13 @@ class Geocif:
  df, self.all_seasons_with_yield, self.number_median_years, self.target
  )

- # df = fe.compute_user_median_statistics(
- # df, [2014, 2015, 2016, 2017, 2018]
- # )
- #
- # df = fe.compute_user_median_statistics(
- # df, [2013, 2014, 2015, 2016, 2017]
- # )
+ df = fe.compute_user_median_statistics(
+ df, range(2018, 2023)
+ )
+
+ df = fe.compute_user_median_statistics(
+ df, range(2013, 2018)
+ )

  if self.median_area_as_feature:
  df = fe.compute_median_statistics(
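Note on the hunk above: the re-enabled calls pass range objects where the old commented-out code used explicit year lists, and the later window shifts from 2014-2018 to 2018-2022, matching the renamed Median ... (2018-2022) columns earlier in this diff. Since Python ranges exclude their upper bound, the two calls cover exactly the intended five-year spans:

# range(start, stop) excludes stop, so these cover 2018-2022 and 2013-2017
assert list(range(2018, 2023)) == [2018, 2019, 2020, 2021, 2022]
assert list(range(2013, 2018)) == [2013, 2014, 2015, 2016, 2017]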
@@ -26,40 +26,40 @@ def loop_execute(inputs):
  Returns:

  """
- from pycallgraph2 import Config, PyCallGraph, GlobbingFilter
- from pycallgraph2.output import GraphvizOutput
-
- graphviz = GraphvizOutput()
- graphviz.output_file = "geocif_visualization.png"
- plt.rcParams["figure.dpi"] = 600
- config = Config(max_depth=5)
- config.trace_filter = GlobbingFilter(
- exclude=[
- "pycallgraph.*",
- ]
- )
-
- with PyCallGraph(output=graphviz, config=config):
- project_name, country, crop, season, model, logger, parser, index = inputs
-
- logger.info("=====================================================")
- logger.info(f"\tStarting GEOCIF: {country} {crop} {season} {model}")
- logger.info("=====================================================")
-
- obj = geocif.Geocif(logger=logger,
- parser=parser,
- project_name=project_name)
- obj.read_data(country, crop, season)
-
- # Store config file in database, only execute this for
- # the first iteration of the loop
- if index == 0:
- output.config_to_db(obj.db_path, obj.parser, obj.today)
-
- # Setup metadata and run ML code
- obj.setup(season, model)
- if obj.simulation_stages:
- obj.execute()
+ # from pycallgraph2 import Config, PyCallGraph, GlobbingFilter
+ # from pycallgraph2.output import GraphvizOutput
+ #
+ # graphviz = GraphvizOutput()
+ # graphviz.output_file = "geocif_visualization.png"
+ # plt.rcParams["figure.dpi"] = 600
+ # config = Config(max_depth=5)
+ # config.trace_filter = GlobbingFilter(
+ # exclude=[
+ # "pycallgraph.*",
+ # ]
+ # )
+ #
+ # with PyCallGraph(output=graphviz, config=config):
+ project_name, country, crop, season, model, logger, parser, index = inputs
+
+ logger.info("=====================================================")
+ logger.info(f"\tStarting GEOCIF: {country} {crop} {season} {model}")
+ logger.info("=====================================================")
+
+ obj = geocif.Geocif(logger=logger,
+ parser=parser,
+ project_name=project_name)
+ obj.read_data(country, crop, season)
+
+ # Store config file in database, only execute this for
+ # the first iteration of the loop
+ if index == 0:
+ output.config_to_db(obj.db_path, obj.parser, obj.today)
+
+ # Setup metadata and run ML code
+ obj.setup(season, model)
+ if obj.simulation_stages:
+ obj.execute()


  def gather_inputs(parser):
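The hunk above turns off pycallgraph2 call-graph tracing by commenting it out and dedenting the loop body out of the with PyCallGraph(...) block. If tracing is wanted again later, a lower-churn alternative (a sketch only, not part of this release; the enable_profiling flag is hypothetical) is to wrap the tracer in an optional context manager so the body never needs re-indenting:

import contextlib

def maybe_profile(enable_profiling):
    # Return a PyCallGraph context when profiling, else a no-op context
    if enable_profiling:
        from pycallgraph2 import Config, PyCallGraph
        from pycallgraph2.output import GraphvizOutput
        graphviz = GraphvizOutput()
        graphviz.output_file = "geocif_visualization.png"
        return PyCallGraph(output=graphviz, config=Config(max_depth=5))
    return contextlib.nullcontext()

# Usage inside loop_execute:
# with maybe_profile(enable_profiling=False):
#     ... run the GEOCIF pipeline ...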
@@ -0,0 +1,212 @@
+ import itertools
+ import warnings
+ from multiprocessing import Pool, cpu_count
+ from pathlib import Path
+
+ import arrow as ar
+ import pandas as pd
+ from tqdm import tqdm
+
+ warnings.filterwarnings("ignore")
+
+ from .cei import indices
+ from geoprepare import base
+
+ country = "angola"
+
+ def remove_duplicates(lst):
+ """
+
+ :param lst:
+ :return:
+ """
+ return list(set([i for i in lst]))
+
+
+ def get_admin_zone(country, dg_shp):
+ admin_zone = "admin_1"
+ country = country.title().replace(" ", "_")
+
+ # Read in shapefile
+ dg_country = dg_shp[dg_shp["ADMIN0"] == country]
+
+ # Is the ADMIN2 column all None? If so, return admin_1 else return admin_2
+ if dg_country.empty:
+ admin_zone = "admin_1"
+ elif not dg_country["ADMIN2"].isna().all():
+ admin_zone = "admin_2"
+
+ return admin_zone
+
+
+ class cei_runner(base.BaseGeo):
+ def __init__(self, path_config_file):
+ super().__init__(path_config_file)
+
+ # Parse configuration files
+ self.parse_config()
+
+ self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
+ import platform
+ if platform.system() == "Linux":
+ self.base_dir = Path(
+ rf"/gpfs/data1/cmongp1/GEOGLAM/Output/countries/{country}"
+ )
+ else:
+ self.base_dir = Path(
+ rf"D:\Users\ritvik\projects\GEOGLAM\Output\countries\{country}"
+ ) # Path(self.parser.get("PATHS", "dir_crop_inputs"))
+ self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
+
+ def collect_files(self):
+ """
+ 1. Collect all the files which contain EO information
+ 2. Exclude files from the `processed` directory if it is already in
+ processed_include_fall directory
+ 3. Create a dataframe that contains the following columns:
+ - directory: name of directory where file is located
+ - path: full path to file
+ - filename: name of file
+ :return: Return the dataframe created above
+ """
+ import geopandas as gp
+
+ dg_shp = gp.read_file(
+ self.dir_input
+ / "Global_Datasets"
+ / "Regions"
+ / "Shps"
+ / "adm_shapefile.shp",
+ engine="pyogrio",
+ )
+
+ # Collect all the files which contain EO information
+ df_files = pd.DataFrame(columns=["directory", "path", "filename", "admin_zone"])
+ for filepath in self.base_dir.rglob("*.csv"):
+ country = filepath.parents[0].name
+
+ admin_zone = get_admin_zone(country, dg_shp)
+
+ # If country is not in cc.COUNTRIES then skip
+ # HACK: Skip korea for now, as it is giving errors
+ if country == "republic_of_korea":
+ continue
+
+ # Get name of directory one level up
+ process_type = filepath.parents[1].name
+
+ # Get name of file
+ filename = filepath.name
+
+ # Add to dataframe
+ df_files.loc[len(df_files)] = [process_type, filepath, filename, admin_zone]
+
+ # Exclude those rows where directory is processed and file is already in
+ # processed_include_fall directory
+ no_fall = df_files["directory"] == "processed"
+ include_fall = df_files[df_files["directory"] == "processed_include_fall"][
+ "filename"
+ ]
+
+ df_files = df_files[~(no_fall & (df_files["filename"].isin(include_fall)))]
+
+ return df_files
+
+ def process_combinations(self, df, method):
+ """
+ Create a list of tuples of the following:
+ - directory: name of directory where file is located
+ - path: full path to file
+ - filename: name of file
+ - method: whether to compute indices for phenological stages or not
+ This tuple will be used as input to the `process` function
+ :param df:
+ :param method:
+ :return:
+ """
+ combinations = []
+
+ for index, row in tqdm(df.iterrows()):
+ combinations.extend(
+ list(
+ itertools.product([row[0]], [row[1]], [row[2]], [row[3]], [method])
+ )
+ )
+
+ combinations = remove_duplicates(combinations)
+
+ return combinations
+
+ def main(self, method):
+ """
+
+ :param method:
+ :return:
+ """
+ # Create a dataframe of the files to be analyzed
+ df_files = self.collect_files()
+
+ combinations = self.process_combinations(df_files, method)
+
+ # Add an element to the tuple to indicate the season
+ # Last element is redo flag which is True if the analysis is to be redone
+ # and False otherwise. Analysis is always redone for the current year
+ # and last year whether file exists or not
+ combinations = [
+ (
+ self.parser,
+ status,
+ path,
+ filename,
+ admin_zone,
+ category,
+ year,
+ "ndvi",
+ False, # redo
+ )
+ for year in range(2001, ar.utcnow().year + 1)
+ for status, path, filename, admin_zone, category in combinations
+ ]
+
+ # Only keep those entries in combinations where the third element is
+ # mozambique, south_africa, angola or dem_people's_rep_of_korea
+ # This is done to test the code for these countries
+ combinations = [i for i in combinations if f"{country}_maize_s1" in i[3]]
+
+ if True:
+ num_cpu = int(cpu_count() * 0.9)
+ with Pool(num_cpu) as p:
+ for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
+ pass
+ else:
+ # Use the code below if you want to test without parallelization or
+ # if you want to debug by using pdb
+ pbar = tqdm(combinations)
+ for i, val in enumerate(pbar):
+ pbar.set_description(
+ f"Main loop {combinations[i][2]} {combinations[i][5]}"
+ )
+ indices.process(val)
+
+
+ def run(path_config_files=[]):
+ """
+
+ Args:
+ path_config_files:
+
+ Returns:
+
+ """
+ """ Check dictionary keys to have no spaces"""
+ indices.validate_index_definitions()
+
+ for method in [
+ "monthly_r", # "dekad_r" # "dekad_r"
+ ]: # , "full_season", "phenological_stages", "fraction_season"]:
+ obj = cei_runner(path_config_files)
+ obj.main(method)
+
+
+ if __name__ == "__main__":
+ run()
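Despite the identical filename, the 0.1.68 indices_runner_angola.py removed at the end of this diff was actually configured for Ethiopia winter wheat; this 0.1.69 replacement is the genuinely Angola-specific runner. The substantive difference comes down to two lines:

# 0.1.68 (removed file, despite its name):
country = "ethiopia"
combinations = [i for i in combinations if f"{country}_winter_wheat_s1" in i[3]]

# 0.1.69 (this file):
country = "angola"
combinations = [i for i in combinations if f"{country}_maize_s1" in i[3]]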
@@ -295,13 +295,16 @@ def all_correlated_feature_by_time(df, **kwargs):
  df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]

  # Compute median of each CEI and sort the dataframe based on the absolute value of the median
- dict_best_cei[region_id] = (
- df_tmp2.groupby("Type")
- .max()
- .reset_index()
- .sort_values("Value", ascending=False)["Metric"]
- .values
- )
+ try:
+ dict_best_cei[region_id] = (
+ df_tmp2.groupby("Type")
+ .max()
+ .reset_index()
+ .sort_values("Value", ascending=False)["Metric"]
+ .values
+ )
+ except:
+ breakpoint()

  kwargs["region_id"] = region_id
  _region_names = ", ".join([str(x) for x in group['Region'].unique()])
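A caution on the change above: a bare except: breakpoint() drops into the interactive debugger on any error, which will stall non-interactive or multiprocessing runs. A more defensive variant (an alternative sketch, not what the package ships) logs the failing region and moves on:

import logging

logger = logging.getLogger(__name__)

try:
    dict_best_cei[region_id] = (
        df_tmp2.groupby("Type")
        .max()
        .reset_index()
        .sort_values("Value", ascending=False)["Metric"]
        .values
    )
except (KeyError, ValueError) as e:
    # Skip regions whose CEI table cannot be ranked instead of halting the run
    logger.warning("Could not rank CEIs for region %s: %s", region_id, e)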
@@ -25,29 +25,32 @@ def _compute_correlations(X, y):
  feature_correlations = {}

  for feature in X.columns:
- # Ignore columns that are object or categorical type
+ # Ignore object or categorical type columns
  if X[feature].dtypes.name in ["object", "category"]:
  continue

  f_series = X[feature]

- # Ignore NaN values in either y and f_series
- mask = ~ (np.isnan(y) | np.isnan(f_series))
- y = y[mask]
- f_series = f_series[mask]
+ # Ignore NaN values in either y or f_series
+ mask = ~(np.isnan(y) | np.isnan(f_series))
+ y_filtered = y[mask]
+ f_series_filtered = f_series[mask]

- if np.std(f_series) == 0 or np.std(y) == 0:
+ # Handle cases where std is zero
+ if np.std(f_series_filtered) == 0 or np.std(y_filtered) == 0:
  feature_correlations[feature] = np.nan
  else:
  try:
- r = pearsonr(y, f_series)[0]
+ r = pearsonr(y_filtered, f_series_filtered)[0]
  feature_correlations[feature] = round(r, 3)
- except:
+ except Exception as e:
+ print(f"Error computing correlation for {feature}: {e}")
  feature_correlations[feature] = np.nan

  return feature_correlations


+
  def find_most_common_top_feature(top_feature_by_region):
  """
  Find the most common top feature and number of occurrences
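The y_filtered/f_series_filtered rename above is more than cosmetic: in 0.1.68 the loop reassigned y = y[mask], so each feature's NaN mask permanently shrank the target vector for every feature processed after it. A minimal reproduction of the old failure mode (illustrative values only):

import numpy as np

y = np.array([1.0, 2.0, np.nan, 4.0])
f1 = np.array([1.0, np.nan, 3.0, 4.0])
f2 = np.array([1.0, 2.0, 3.0, 4.0])

# Old pattern: filtering overwrote y, so the next feature no longer aligns
mask = ~(np.isnan(y) | np.isnan(f1))
y = y[mask]             # y now has 2 elements
print(len(y), len(f2))  # 2 vs 4 -> misaligned for f2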
@@ -39,10 +39,10 @@ def compute_last_year_yield(df, target_col="Yield (tn per ha)"):

  return df

-
  def compute_closest_years(all_years, harvest_year, number_lag_years):
  """
- Finds the years closest to a given harvest year, excluding the harvest year itself.
+ Finds the historical years closest to a given harvest year,
+ excluding any future year (harvest_year itself and beyond).

  Args:
  all_years (array-like): List or array of all years to consider.
@@ -50,7 +50,8 @@ def compute_closest_years(all_years, harvest_year, number_lag_years):
  number_lag_years (int): Number of closest years to return.

  Returns:
- list: Years closest to the given harvest year.
+ list: The historical years closest to the given harvest year.
+ Returns an empty list if no historical years exist.
  """
  # Exclude the harvest year before computation to simplify logic
  filtered_years = [year for year in all_years if year != harvest_year]
@@ -96,7 +97,7 @@ def compute_median_statistics(
  mask = (group["Harvest Year"].isin(closest_years)) & (
  group["Region"] == region
  )
- median_yield = group.loc[mask, target_col].median()
+ median_yield = group.loc[mask, target_col].mean()
  df.loc[
  (df["Region"] == region) & (df["Harvest Year"] == harvest_year),
  f"Median {target_col}",
@@ -186,7 +187,7 @@ def compute_lag_yield(
  else:
  # Add median yield
  mask_group_median = group["Harvest Year"].isin(closest_years)
- median_yield = group.loc[mask_group_median, target_col].median()
+ median_yield = group.loc[mask_group_median, target_col].mean()

  df.loc[mask_region, col] = median_yield

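Both feature_engineering.py hunks above swap .median() for .mean(), yet the result is still stored under columns named Median {target_col}, so the label no longer matches the statistic. The two diverge whenever the yield history is skewed; a quick illustration:

import pandas as pd

yields = pd.Series([1.0, 1.1, 1.2, 4.0])  # one outlier season
print(yields.median())  # 1.15  -> robust to the outlier
print(yields.mean())    # 1.825 -> pulled toward the outlier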
@@ -278,7 +278,7 @@ def auto_train(
  "loss_function": loss_function,
  "early_stopping_rounds": 20,
  "random_seed": seed,
- "verbose": False,
+ "verbose": True,
  }

  if model_name == "catboost":
@@ -0,0 +1,156 @@
+ import rasterio
+ from rasterio.warp import calculate_default_transform, reproject, Resampling
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import math
+
+ # Input / Output paths
+ input_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Masks\wolayita_maize.tif"
+ output_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Masks\wolayita_maize_5km_percentage.tif"
+
+ import rasterio
+ from rasterio.warp import calculate_default_transform, reproject, Resampling
+ from math import ceil
+ import numpy as np
+
+ input_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Masks\wolayita_maize.tif"
+ output_path = r"D:\Users\ritvik\projects\GEOGLAM\Input\Global_Datasets\Masks\wolayita_maize_5km_percentage.tif"
+
+ with rasterio.open(input_path) as src:
+ # 1) If needed, assign correct CRS
+ # Example: if you know it's actually EPSG:32637 but isn't set
+ # src_crs = rasterio.crs.CRS.from_epsg(32637)
+ # else if it's already correct, do:
+ src_crs = src.crs
+
+ # 2) Decide your pixel size.
+ # If src_crs is lat/lon (EPSG:4326), use ~0.045 deg for ~5 km.
+ # If src_crs is UTM in meters, use 5000 for 5 km.
+ pixel_size = 0.045 # or 5000 if in meters
+
+ transform, width, height = calculate_default_transform(
+ src_crs, # source crs
+ src_crs, # target crs (same if you just want coarser in place)
+ src.width,
+ src.height,
+ *src.bounds,
+ resolution=pixel_size
+ )
+
+ # Prepare output fraction array
+ fraction_array = np.full((height, width), -9999, dtype=np.float32)
+
+ # Reproject with average -> fraction
+ reproject(
+ source=rasterio.band(src, 1),
+ destination=fraction_array,
+ src_transform=src.transform,
+ src_crs=src_crs,
+ dst_transform=transform,
+ dst_crs=src_crs,
+ resampling=Resampling.average,
+ dst_nodata=-9999
+ )
+
+ # Now fraction_array should have values in [0..1], with -9999 for nodata.
+ valid_mask = (fraction_array != -9999)
+
+ if not np.any(valid_mask):
+ print("No valid cells at all (everything is nodata). This indicates a bounding box or CRS mismatch.")
+ else:
+ frac_min = fraction_array[valid_mask].min()
+ frac_max = fraction_array[valid_mask].max()
+ print("Fraction min:", frac_min)
+ print("Fraction max:", frac_max)
+
+ # If both min and max are 0.0, it means there's truly no coverage or it's extremely small.
+ # Otherwise you might see something like 0.0, 0.01, 0.5, etc.
+
+ # Then let's see if maybe they're all below 0.005:
+ below_005 = (fraction_array[valid_mask] < 0.005).all()
+ print("All fractions < 0.5%?", below_005)
+
+ breakpoint()
+ with rasterio.open(input_path) as src:
+ # If src.crs is None but you KNOW it's EPSG:4326, assign it:
+ # src_crs = rasterio.crs.CRS.from_epsg(4326)
+ # Otherwise, just use what's in the file:
+ src_crs = src.crs
+
+ # Let's assume the file is already lat/lon (EPSG:4326).
+ # We'll define ~0.045° as "5 km" at the equator.
+ new_res = 0.045
+
+ # Calculate a new transform and new shape
+ # for coarser resolution in the SAME EPSG:4326.
+ transform, width, height = calculate_default_transform(
+ src_crs, # src CRS
+ src_crs, # dst CRS (same if you want to stay in lat/lon)
+ src.width,
+ src.height,
+ *src.bounds,
+ resolution=new_res # sets pixel size to 0.045 degrees
+ )
+
+ # Read full data for histogram plotting
+ data_in = src.read(1, masked=True)
+ in_profile = src.profile.copy()
+
+ # Plot input histogram (0 or 1)
+ arr_in = data_in.compressed()
+ plt.figure()
+ plt.hist(arr_in, bins=[-0.5, 0.5, 1.5], edgecolor='black')
+ plt.title("Input (0/1)")
+ plt.xlabel("Value")
+ plt.ylabel("Frequency")
+ plt.show()
+
+ # Prepare output array, float32 with sentinel -9999
+ out_array = np.full((height, width), -9999, dtype=np.float32)
+
+ with rasterio.open(input_path) as src:
+ reproject(
+ source=rasterio.band(src, 1),
+ destination=out_array,
+ src_transform=src.transform,
+ src_crs=src.crs,
+ dst_transform=transform,
+ dst_crs=src_crs, # same
+ resampling=Resampling.average,
+ dst_nodata=-9999
+ )
+
+ # Now out_array has fraction in [0..1]. Convert to % (0..100).
+ breakpoint()
+ mask_valid = (out_array != -9999)
+ out_array[mask_valid] *= 100.0
+ out_array[mask_valid] = np.rint(out_array[mask_valid]) # round
+ out_array = out_array.astype(np.int32)
+
+ # Update profile
+ out_profile = in_profile.copy()
+ out_profile.update({
+ 'driver': 'GTiff',
+ 'width': width,
+ 'height': height,
+ 'transform': transform,
+ 'crs': src_crs,
+ 'dtype': 'int32',
+ 'nodata': -9999
+ })
+
+ # Write out
+ with rasterio.open(output_path, 'w', **out_profile) as dst:
+ dst.write(out_array, 1)
+
+ print("Wrote:", output_path)
+
+ # Plot histogram of output (ignore -9999)
+ out_data = np.where(out_array == -9999, np.nan, out_array)
+ valid_data = out_data[~np.isnan(out_data)]
+ plt.figure()
+ plt.hist(valid_data, bins=50, edgecolor="black")
+ plt.title("5km Percentage (0-100)")
+ plt.xlabel("Percent cropped")
+ plt.ylabel("Frequency")
+ plt.show()
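The core trick in the new script above is Resampling.average: averaging a 0/1 crop mask while coarsening to ~5 km leaves each output cell holding the fraction of fine pixels that were cropped, which the script then scales to a 0-100 percentage. (Note the file ships with two breakpoint() calls and duplicated import/path blocks, so it pauses for inspection when run.) The essential pattern, reduced to a sketch with a hypothetical path argument:

import numpy as np
import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling

def binary_mask_to_fraction(src_path, pixel_size=0.045):
    # Coarsen a 0/1 mask; each output cell = fraction of cropped fine pixels
    with rasterio.open(src_path) as src:
        transform, width, height = calculate_default_transform(
            src.crs, src.crs, src.width, src.height, *src.bounds,
            resolution=pixel_size,
        )
        fraction = np.full((height, width), -9999, dtype=np.float32)
        reproject(
            source=rasterio.band(src, 1),
            destination=fraction,
            src_transform=src.transform,
            src_crs=src.crs,
            dst_transform=transform,
            dst_crs=src.crs,
            resampling=Resampling.average,  # mean of 0/1 = coverage fraction
            dst_nodata=-9999,
        )
    return fraction, transform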
@@ -0,0 +1,69 @@
+ from great_tables import GT, md, system_fonts
+ import pandas as pd
+
+ # Your data as a pandas DataFrame
+ data = [
+ [2, "10<sup>th</sup>", "<=14; >14", 89.2, "2010 - 2021"],
+ [2, "25<sup>th</sup>", "<=18.7; >18.7", 82.2, "2010 - 2021"],
+ [2, "50<sup>th</sup>", "<=24.6; >24.6", 83.7, "2010 - 2021"],
+ [2, "75<sup>th</sup>", "<=31; >31", 88.3, "2010 - 2021"],
+ [2, "90<sup>th</sup>", "<=38.9; >38.9", 96.9, "2010 - 2021"],
+ [3, "33<sup>rd</sup>, 67<sup>th</sup>", "<=20.3; 20.3 - 29.6; >29.6", 60.5, "2010 - 2021"],
+ [4, "25<sup>th</sup>, 50<sup>th</sup>, 75<sup>th</sup>",
+ "<=18.7; 18.7-24.6; 24.6-31; >31", 64.4, "2010 - 2021"]
+ ]
+ cols = ["Number of classes", "Percentile(s)", "Yield categories", "Accuracy (%)", "Years"]
+
+ df = pd.DataFrame(data, columns=cols)
+
+ # Create a Great Tables object
+ gt_tbl = GT(data=df)
+
+ # Example formatting, coloring, and styling
+ gt_tbl = (gt_tbl
+ # Format the "Accuracy (%)" column to show one decimal place
+ .fmt_number(
+ columns=["Accuracy (%)"],
+ decimals=1
+ )
+ # Color-scale the "Accuracy (%)" column (optional)
+ #.data_color(
+ # columns=["Accuracy (%)"],
+ # palette=["tomato", "gold", "palegreen"],
+ # domain=[50, 100] # Range from the lowest to highest accuracy
+ #)
+ # Set column widths
+ .cols_width({
+ "Number classes": "60px",
+ "Percentile(s)": "140px",
+ "Yield categories": "220px",
+ "Accuracy (%)": "100px",
+ "Years": "90px"
+ })
+ # Add a table header/title
+ .tab_header(
+ title=md("**Accuracy of Model for Different Yield Categories**")
+ )
+ # Add a source note (optional)
+ # .tab_source_note(
+ # md(
+ # "**Source**: Internal records<br>"
+ # "**Note**: Data from 2010-2021"
+ # )
+ # )
+ # Customize general table options
+ .tab_options(
+ heading_background_color='antiquewhite',
+ column_labels_background_color='antiquewhite',
+ source_notes_background_color='antiquewhite',
+ table_background_color='snow',
+ table_font_names=system_fonts("humanist"),
+ data_row_padding='2px'
+ )
+ # Align all columns center except "Yield categories", which might be longer text
+ .cols_align(align="center")
+ .cols_align(align="left", columns=["Yield categories"])
+ )
+
+ # Display the table
+ GT.save(gt_tbl, file="aa.png")
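One thing to watch in the new gt.py: .cols_width() is keyed on "Number classes", but the DataFrame column is named "Number of classes", so that width entry matches no column (and, depending on the great_tables version, may raise). If the intent was to size that column, the key needs to match exactly (shown as a suggested fix, not what the file does):

# Hypothetical correction: the key must match the DataFrame column name
gt_tbl = gt_tbl.cols_width({"Number of classes": "60px"})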
@@ -7,7 +7,7 @@ import glob
  import os

  # 1. Specify the directory containing your .dta files:
- data_dir = r"."
+ data_dir = r"C:\Users\ritvik\Downloads\maize_yield (2)\maize_yield"

  # 2. Use glob to find all .dta files in that directory:
  dta_files = glob.glob(os.path.join(data_dir, "*.dta"))
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: geocif
- Version: 0.1.68
+ Version: 0.1.69
  Summary: Models to visualize and forecast crop conditions and yields
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
  Author: Ritvik Sahajpal
@@ -72,9 +72,11 @@ geocif/playground/tmp2.py
  geocif/playground/tmp3.py
  geocif/playground/tmp4.py
  geocif/playground/tmp5.py
+ geocif/playground/wolayita_maize_mask.py
  geocif/risk/__init__.py
  geocif/risk/impact_assessment.py
  geocif/viz/__init__.py
+ geocif/viz/gt.py
  geocif/viz/plot.py
  geocif/viz/tmp.py
  tests/test_geocif.py
@@ -50,6 +50,6 @@ setup(
  test_suite="tests",
  tests_require=test_requirements,
  url="https://ritviksahajpal.github.io/yield_forecasting/",
- version="0.1.68",
+ version="0.1.69",
  zip_safe=False,
  )
@@ -1,212 +0,0 @@
- import itertools
- import warnings
- from multiprocessing import Pool, cpu_count
- from pathlib import Path
-
- import arrow as ar
- import pandas as pd
- from tqdm import tqdm
-
- warnings.filterwarnings("ignore")
-
- from .cei import indices
- from geoprepare import base
-
- country = "ethiopia"
-
- def remove_duplicates(lst):
- """
-
- :param lst:
- :return:
- """
- return list(set([i for i in lst]))
-
-
- def get_admin_zone(country, dg_shp):
- admin_zone = "admin_1"
- country = country.title().replace(" ", "_")
-
- # Read in shapefile
- dg_country = dg_shp[dg_shp["ADMIN0"] == country]
-
- # Is the ADMIN2 column all None? If so, return admin_1 else return admin_2
- if dg_country.empty:
- admin_zone = "admin_1"
- elif not dg_country["ADMIN2"].isna().all():
- admin_zone = "admin_2"
-
- return admin_zone
-
-
- class cei_runner(base.BaseGeo):
- def __init__(self, path_config_file):
- super().__init__(path_config_file)
-
- # Parse configuration files
- self.parse_config()
-
- self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
- import platform
- if platform.system() == "Linux":
- self.base_dir = Path(
- rf"/gpfs/data1/cmongp1/GEOGLAM/Output/countries/{country}"
- )
- else:
- self.base_dir = Path(
- rf"D:\Users\ritvik\projects\GEOGLAM\Output\countries\{country}"
- ) # Path(self.parser.get("PATHS", "dir_crop_inputs"))
- self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
-
- def collect_files(self):
- """
- 1. Collect all the files which contain EO information
- 2. Exclude files from the `processed` directory if it is already in
- processed_include_fall directory
- 3. Create a dataframe that contains the following columns:
- - directory: name of directory where file is located
- - path: full path to file
- - filename: name of file
- :return: Return the dataframe created above
- """
- import geopandas as gp
-
- dg_shp = gp.read_file(
- self.dir_input
- / "Global_Datasets"
- / "Regions"
- / "Shps"
- / "adm_shapefile.shp",
- engine="pyogrio",
- )
-
- # Collect all the files which contain EO information
- df_files = pd.DataFrame(columns=["directory", "path", "filename", "admin_zone"])
- for filepath in self.base_dir.rglob("*.csv"):
- country = filepath.parents[0].name
-
- admin_zone = get_admin_zone(country, dg_shp)
-
- # If country is not in cc.COUNTRIES then skip
- # HACK: Skip korea for now, as it is giving errors
- if country == "republic_of_korea":
- continue
-
- # Get name of directory one level up
- process_type = filepath.parents[1].name
-
- # Get name of file
- filename = filepath.name
-
- # Add to dataframe
- df_files.loc[len(df_files)] = [process_type, filepath, filename, admin_zone]
-
- # Exclude those rows where directory is processed and file is already in
- # processed_include_fall directory
- no_fall = df_files["directory"] == "processed"
- include_fall = df_files[df_files["directory"] == "processed_include_fall"][
- "filename"
- ]
-
- df_files = df_files[~(no_fall & (df_files["filename"].isin(include_fall)))]
-
- return df_files
-
- def process_combinations(self, df, method):
- """
- Create a list of tuples of the following:
- - directory: name of directory where file is located
- - path: full path to file
- - filename: name of file
- - method: whether to compute indices for phenological stages or not
- This tuple will be used as input to the `process` function
- :param df:
- :param method:
- :return:
- """
- combinations = []
-
- for index, row in tqdm(df.iterrows()):
- combinations.extend(
- list(
- itertools.product([row[0]], [row[1]], [row[2]], [row[3]], [method])
- )
- )
-
- combinations = remove_duplicates(combinations)
-
- return combinations
-
- def main(self, method):
- """
-
- :param method:
- :return:
- """
- # Create a dataframe of the files to be analyzed
- df_files = self.collect_files()
-
- combinations = self.process_combinations(df_files, method)
-
- # Add an element to the tuple to indicate the season
- # Last element is redo flag which is True if the analysis is to be redone
- # and False otherwise. Analysis is always redone for the current year
- # and last year whether file exists or not
- combinations = [
- (
- self.parser,
- status,
- path,
- filename,
- admin_zone,
- category,
- year,
- "ndvi",
- False, # redo
- )
- for year in range(2001, ar.utcnow().year + 1)
- for status, path, filename, admin_zone, category in combinations
- ]
-
- # Only keep those entries in combinations where the third element is
- # mozambique, south_africa, angola or dem_people's_rep_of_korea
- # This is done to test the code for these countries
- combinations = [i for i in combinations if f"{country}_winter_wheat_s1" in i[3]]
-
- if True:
- num_cpu = int(cpu_count() * 0.9)
- with Pool(num_cpu) as p:
- for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
- pass
- else:
- # Use the code below if you want to test without parallelization or
- # if you want to debug by using pdb
- pbar = tqdm(combinations)
- for i, val in enumerate(pbar):
- pbar.set_description(
- f"Main loop {combinations[i][2]} {combinations[i][5]}"
- )
- indices.process(val)
-
-
- def run(path_config_files=[]):
- """
-
- Args:
- path_config_files:
-
- Returns:
-
- """
- """ Check dictionary keys to have no spaces"""
- indices.validate_index_definitions()
-
- for method in [
- "monthly_r", # "dekad_r" # "dekad_r"
- ]: # , "full_season", "phenological_stages", "fraction_season"]:
- obj = cei_runner(path_config_files)
- obj.main(method)
-
-
- if __name__ == "__main__":
- run()