geocif 0.1.68__tar.gz → 0.1.70__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {geocif-0.1.68/geocif.egg-info → geocif-0.1.70}/PKG-INFO +1 -1
  2. {geocif-0.1.68 → geocif-0.1.70}/geocif/cei/indices.py +1 -1
  3. {geocif-0.1.68 → geocif-0.1.70}/geocif/geocif.py +45 -22
  4. {geocif-0.1.68 → geocif-0.1.70}/geocif/geocif_runner.py +34 -34
  5. geocif-0.1.70/geocif/indices_runner_angola.py +212 -0
  6. {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner_south_africa.py +2 -2
  7. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/correlations.py +10 -7
  8. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/embedding.py +11 -8
  9. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/feature_engineering.py +6 -5
  10. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/trainers.py +1 -1
  11. geocif-0.1.70/geocif/playground/wolayita_maize_mask.py +156 -0
  12. geocif-0.1.70/geocif/viz/gt.py +69 -0
  13. {geocif-0.1.68 → geocif-0.1.70}/geocif/viz/tmp.py +1 -1
  14. {geocif-0.1.68 → geocif-0.1.70/geocif.egg-info}/PKG-INFO +1 -1
  15. {geocif-0.1.68 → geocif-0.1.70}/geocif.egg-info/SOURCES.txt +2 -0
  16. {geocif-0.1.68 → geocif-0.1.70}/setup.py +1 -1
  17. geocif-0.1.68/geocif/indices_runner_angola.py +0 -212
  18. {geocif-0.1.68 → geocif-0.1.70}/LICENSE +0 -0
  19. {geocif-0.1.68 → geocif-0.1.70}/MANIFEST.in +0 -0
  20. {geocif-0.1.68 → geocif-0.1.70}/README.md +0 -0
  21. {geocif-0.1.68 → geocif-0.1.70}/geocif/__init__.py +0 -0
  22. {geocif-0.1.68 → geocif-0.1.70}/geocif/agmet/__init__.py +0 -0
  23. {geocif-0.1.68 → geocif-0.1.70}/geocif/agmet/geoagmet.py +0 -0
  24. {geocif-0.1.68 → geocif-0.1.70}/geocif/agmet/plot.py +0 -0
  25. {geocif-0.1.68 → geocif-0.1.70}/geocif/agmet/utils.py +0 -0
  26. {geocif-0.1.68 → geocif-0.1.70}/geocif/analysis.py +0 -0
  27. {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/__init__.py +0 -0
  28. {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/constants.py +0 -0
  29. {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/features.py +0 -0
  30. {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/geo.py +0 -0
  31. {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/geocif.py +0 -0
  32. {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/metadata.py +0 -0
  33. {geocif-0.1.68 → geocif-0.1.70}/geocif/backup/models.py +0 -0
  34. {geocif-0.1.68 → geocif-0.1.70}/geocif/cei/__init__.py +0 -0
  35. {geocif-0.1.68 → geocif-0.1.70}/geocif/cei/definitions.py +0 -0
  36. {geocif-0.1.68 → geocif-0.1.70}/geocif/experiments.py +0 -0
  37. {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner.py +0 -0
  38. {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner_madagascar.py +0 -0
  39. {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner_malawi.py +0 -0
  40. {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner_mozambique.py +0 -0
  41. {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner_zambia.py +0 -0
  42. {geocif-0.1.68 → geocif-0.1.70}/geocif/indices_runner_zimbabwe.py +0 -0
  43. {geocif-0.1.68 → geocif-0.1.70}/geocif/logger.py +0 -0
  44. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/__init__.py +0 -0
  45. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/feature_selection.py +0 -0
  46. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/outliers.py +0 -0
  47. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/outlook.py +0 -0
  48. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/output.py +0 -0
  49. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/spatial_autocorrelation.py +0 -0
  50. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/stages.py +0 -0
  51. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/stats.py +0 -0
  52. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/trend.py +0 -0
  53. {geocif-0.1.68 → geocif-0.1.70}/geocif/ml/xai.py +0 -0
  54. {geocif-0.1.68 → geocif-0.1.70}/geocif/mm.py +0 -0
  55. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/__init__.py +0 -0
  56. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/aa.py +0 -0
  57. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/area.py +0 -0
  58. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/automl.py +0 -0
  59. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/download_esi.py +0 -0
  60. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/enso.py +0 -0
  61. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/eval.py +0 -0
  62. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/gamtest.py +0 -0
  63. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/gee_access.py +0 -0
  64. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/misc.py +0 -0
  65. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/play_xagg.py +0 -0
  66. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/reg.py +0 -0
  67. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/sustain.py +0 -0
  68. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/test_catboost.py +0 -0
  69. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/tmp.py +0 -0
  70. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/tmp2.py +0 -0
  71. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/tmp3.py +0 -0
  72. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/tmp4.py +0 -0
  73. {geocif-0.1.68 → geocif-0.1.70}/geocif/playground/tmp5.py +0 -0
  74. {geocif-0.1.68 → geocif-0.1.70}/geocif/risk/__init__.py +0 -0
  75. {geocif-0.1.68 → geocif-0.1.70}/geocif/risk/impact_assessment.py +0 -0
  76. {geocif-0.1.68 → geocif-0.1.70}/geocif/utils.py +0 -0
  77. {geocif-0.1.68 → geocif-0.1.70}/geocif/viz/__init__.py +0 -0
  78. {geocif-0.1.68 → geocif-0.1.70}/geocif/viz/plot.py +0 -0
  79. {geocif-0.1.68 → geocif-0.1.70}/geocif.egg-info/dependency_links.txt +0 -0
  80. {geocif-0.1.68 → geocif-0.1.70}/geocif.egg-info/not-zip-safe +0 -0
  81. {geocif-0.1.68 → geocif-0.1.70}/geocif.egg-info/top_level.txt +0 -0
  82. {geocif-0.1.68 → geocif-0.1.70}/requirements.txt +0 -0
  83. {geocif-0.1.68 → geocif-0.1.70}/setup.cfg +0 -0
  84. {geocif-0.1.68 → geocif-0.1.70}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: geocif
-Version: 0.1.68
+Version: 0.1.70
 Summary: Models to visualize and forecast crop conditions and yields
 Home-page: https://ritviksahajpal.github.io/yield_forecasting/
 Author: Ritvik Sahajpal
@@ -675,7 +675,7 @@ class CEIs:
             elif "STD" in iname:
                 val = np.nanstd(eo_vals)
             elif "AUC" in iname:
-                val = np.trapz(eo_vals, dx=len(eo_vals))
+                val = np.trapz(eo_vals)
             elif "H-INDEX" in iname:
                 # Multiply by 10 for h-index to work
                 val = utils.compute_h_index(eo_vals * 10)
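Note on the AUC change above: the old call set the trapezoidal sample spacing to the length of the series, so the reported area was scaled by len(eo_vals); the new call uses the default unit spacing. A minimal sketch with a hypothetical EO series (also worth knowing: np.trapz is deprecated in NumPy 2.0 in favor of np.trapezoid):

    import numpy as np

    eo_vals = np.array([0.2, 0.4, 0.6, 0.4])  # hypothetical EO series

    old_auc = np.trapz(eo_vals, dx=len(eo_vals))  # spacing = 4: area scaled by series length
    new_auc = np.trapz(eo_vals)                   # spacing = 1: plain trapezoidal area

    print(old_auc, new_auc)  # 5.2 1.3 -- old value is exactly len(eo_vals) times larger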
@@ -243,11 +243,11 @@ class Geocif:
                 if any(cei in column for cei in self.use_ceis)
             ]
         else:
-            # self.logger.info(f"Selecting features for {self.country} {self.crop}")
+            self.logger.info(f"Selecting features for {self.country} {self.crop}")
             selector, _, self.selected_features = fs.select_features(
                 X_train, y_train, method=self.feature_selection
             )
-            # self.logger.info(f"Selected features: {self.selected_features}")
+            self.logger.info(f"Selected features: {self.selected_features}")

         """ Update model to include conformal estimates """
         if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
@@ -306,7 +306,7 @@ class Geocif:
                 X_train,
                 y_train,
                 cat_features=self.cat_features,
-                verbose=False,
+                verbose=True,
             )
         elif self.model_name in ["ngboost", "oblique", "tabpfn"]:
             X_train = X_train.drop(
@@ -598,15 +598,15 @@ class Geocif:
             df_region[f"Median {self.target}"].values, 3
         )

-        # if f"Median {self.target} (2014-2018)" in df_region.columns:
-        #     df.loc[:, f"Median {self.target} (2014-2018)"] = np.around(
-        #         df_region[f"Median {self.target} (2014-2018)"].values, 3
-        #     )
-        #
-        # if f"Median {self.target} (2013-2017)" in df_region.columns:
-        #     df.loc[:, f"Median {self.target} (2013-2017)"] = np.around(
-        #         df_region[f"Median {self.target} (2013-2017)"].values, 3
-        #     )
+        if f"Median {self.target} (2018-2022)" in df_region.columns:
+            df.loc[:, f"Median {self.target} (2018-2022)"] = np.around(
+                df_region[f"Median {self.target} (2018-2022)"].values, 3
+            )
+
+        if f"Median {self.target} (2013-2017)" in df_region.columns:
+            df.loc[:, f"Median {self.target} (2013-2017)"] = np.around(
+                df_region[f"Median {self.target} (2013-2017)"].values, 3
+            )

         if self.estimate_ci:
             if self.estimate_ci_for_all or self.forecast_season == self.today_year:
@@ -820,8 +820,8 @@ class Geocif:
             + self.statistics_columns
             + self.feature_names
             + [f"Median {self.target}"]
-            # + [f"Median {self.target} (2014-2018)"]
-            # + [f"Median {self.target} (2013-2017)"]
+            + [f"Median {self.target} (2018-2022)"]
+            + [f"Median {self.target} (2013-2017)"]
             + ["Region_ID"]
         )
         if self.check_yield_trend:
@@ -997,6 +997,25 @@ class Geocif:
         if self.use_single_time_period_as_feature:
             df = stages.select_single_time_period_features(df)

+        # If forecasting for current season, then exclude the latest month data as it will be partial
+        # and will confuse the model
+        if self.forecast_season == self.today_year:
+            current_month = ar.utcnow().month
+
+            # Identify columns where the second chunk equals the current month index
+            cols_to_drop = []
+            for col in df.columns:
+                if "_" in col:
+                    mon = stages.get_stage_information_dict(col, self.method)['Starting Stage']
+
+                    if mon == current_month:
+                        cols_to_drop.append(col)
+
+            # Drop those columns
+
+            df = df.drop(columns=cols_to_drop)
+
+        # Hack: If
         # Change column name
         # e.g. 'vDTR_7_6_5_4_3_2_1_37_36_35_34_33_32_31' to 'vDTR Mar 1-Oct 27'
         df = stages.update_feature_names(df, self.method)
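The added block relies on the stage encoding visible in the trailing comment ('vDTR_7_6_5_...'), where the chunk after the CEI name is the month the window starts in. A rough standalone sketch of the same idea, with the stages.get_stage_information_dict lookup inlined as a split on "_" (an assumption for illustration; the real helper also handles non-monthly methods):

    import arrow as ar
    import pandas as pd

    # Hypothetical feature frame: the chunk after the CEI name is the starting month
    df = pd.DataFrame(columns=["Region", "vDTR_7_6_5", "vDTR_6_5_4"])

    current_month = ar.utcnow().month  # e.g. 7 in July

    cols_to_drop = [
        col for col in df.columns
        if "_" in col and int(col.split("_")[1]) == current_month
    ]
    df = df.drop(columns=cols_to_drop)  # in July, drops the partial "vDTR_7_6_5"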
@@ -1011,13 +1030,13 @@ class Geocif:
             df, self.all_seasons_with_yield, self.number_median_years, self.target
         )

-        # df = fe.compute_user_median_statistics(
-        #     df, [2014, 2015, 2016, 2017, 2018]
-        # )
-        #
-        # df = fe.compute_user_median_statistics(
-        #     df, [2013, 2014, 2015, 2016, 2017]
-        # )
+        df = fe.compute_user_median_statistics(
+            df, range(2018, 2023)
+        )
+
+        df = fe.compute_user_median_statistics(
+            df, range(2013, 2018)
+        )

         if self.median_area_as_feature:
             df = fe.compute_median_statistics(
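For reference, Python ranges are half-open, so the two calls reproduce exactly the five-year windows named by the new "Median ... (2018-2022)" and "Median ... (2013-2017)" columns:

    print(list(range(2018, 2023)))  # [2018, 2019, 2020, 2021, 2022]
    print(list(range(2013, 2018)))  # [2013, 2014, 2015, 2016, 2017]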
@@ -1343,7 +1362,11 @@ class Geocif:
         # e.g. _stages = ['13_12_11', '13_12_11_10', '13_12_11_10_9']
         # then self.simulation_stages = [array([13, 12, 11]), array([13, 12, 11, 10]), array([13, 12, 11, 10, 9])]
         # Drop stages in self.all_stages that do not have _ in them
-        self.all_stages = [element for element in self.all_stages if "_" in element]
+        # self.all_stages = [element for element in self.all_stages if "_" in element]
+
+        if self.forecast_season == self.today_year:
+            current_month = ar.utcnow().month
+            self.all_stages = [elem for elem in self.all_stages if not elem.startswith(str(current_month))]

         self.simulation_stages = [
             np.array([int(stage) for stage in s.split("_")]) for s in self.all_stages
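One caveat worth flagging in the new filter: startswith compares string prefixes, so when current_month is 1 it also discards stages beginning with 10, 11, or 12. A sketch of the behavior, with a stricter numeric comparison shown as an alternative (not the package's code):

    current_month = 1  # January
    all_stages = ["1_37_36", "10_9_8", "12_11_10"]

    # String-prefix filter from the diff: in January it removes all three stages
    print([s for s in all_stages if not s.startswith(str(current_month))])  # []

    # Stricter alternative: compare the leading chunk as an integer
    print([s for s in all_stages if int(s.split("_")[0]) != current_month])
    # ['10_9_8', '12_11_10']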
@@ -26,40 +26,40 @@ def loop_execute(inputs)
     Returns:

     """
-    from pycallgraph2 import Config, PyCallGraph, GlobbingFilter
-    from pycallgraph2.output import GraphvizOutput
-
-    graphviz = GraphvizOutput()
-    graphviz.output_file = "geocif_visualization.png"
-    plt.rcParams["figure.dpi"] = 600
-    config = Config(max_depth=5)
-    config.trace_filter = GlobbingFilter(
-        exclude=[
-            "pycallgraph.*",
-        ]
-    )
-
-    with PyCallGraph(output=graphviz, config=config):
-        project_name, country, crop, season, model, logger, parser, index = inputs
-
-        logger.info("=====================================================")
-        logger.info(f"\tStarting GEOCIF: {country} {crop} {season} {model}")
-        logger.info("=====================================================")
-
-        obj = geocif.Geocif(logger=logger,
-                            parser=parser,
-                            project_name=project_name)
-        obj.read_data(country, crop, season)
-
-        # Store config file in database, only execute this for
-        # the first iteration of the loop
-        if index == 0:
-            output.config_to_db(obj.db_path, obj.parser, obj.today)
-
-        # Setup metadata and run ML code
-        obj.setup(season, model)
-        if obj.simulation_stages:
-            obj.execute()
+    # from pycallgraph2 import Config, PyCallGraph, GlobbingFilter
+    # from pycallgraph2.output import GraphvizOutput
+    #
+    # graphviz = GraphvizOutput()
+    # graphviz.output_file = "geocif_visualization.png"
+    # plt.rcParams["figure.dpi"] = 600
+    # config = Config(max_depth=5)
+    # config.trace_filter = GlobbingFilter(
+    #     exclude=[
+    #         "pycallgraph.*",
+    #     ]
+    # )
+    #
+    # with PyCallGraph(output=graphviz, config=config):
+    project_name, country, crop, season, model, logger, parser, index = inputs
+
+    logger.info("=====================================================")
+    logger.info(f"\tStarting GEOCIF: {country} {crop} {season} {model}")
+    logger.info("=====================================================")
+
+    obj = geocif.Geocif(logger=logger,
+                        parser=parser,
+                        project_name=project_name)
+    obj.read_data(country, crop, season)
+
+    # Store config file in database, only execute this for
+    # the first iteration of the loop
+    if index == 0:
+        output.config_to_db(obj.db_path, obj.parser, obj.today)
+
+    # Setup metadata and run ML code
+    obj.setup(season, model)
+    if obj.simulation_stages:
+        obj.execute()


 def gather_inputs(parser):
@@ -0,0 +1,212 @@
+import itertools
+import warnings
+from multiprocessing import Pool, cpu_count
+from pathlib import Path
+
+import arrow as ar
+import pandas as pd
+from tqdm import tqdm
+
+warnings.filterwarnings("ignore")
+
+from .cei import indices
+from geoprepare import base
+
+country = "angola"
+
+def remove_duplicates(lst):
+    """
+
+    :param lst:
+    :return:
+    """
+    return list(set([i for i in lst]))
+
+
+def get_admin_zone(country, dg_shp):
+    admin_zone = "admin_1"
+    country = country.title().replace(" ", "_")
+
+    # Read in shapefile
+    dg_country = dg_shp[dg_shp["ADMIN0"] == country]
+
+    # Is the ADMIN2 column all None? If so, return admin_1 else return admin_2
+    if dg_country.empty:
+        admin_zone = "admin_1"
+    elif not dg_country["ADMIN2"].isna().all():
+        admin_zone = "admin_2"
+
+    return admin_zone
+
+
+class cei_runner(base.BaseGeo):
+    def __init__(self, path_config_file):
+        super().__init__(path_config_file)
+
+        # Parse configuration files
+        self.parse_config()
+
+        self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
+        import platform
+        if platform.system() == "Linux":
+            self.base_dir = Path(
+                rf"/gpfs/data1/cmongp1/GEOGLAM/Output/countries/{country}"
+            )
+        else:
+            self.base_dir = Path(
+                rf"D:\Users\ritvik\projects\GEOGLAM\Output\countries\{country}"
+            )  # Path(self.parser.get("PATHS", "dir_crop_inputs"))
+        self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
+
+    def collect_files(self):
+        """
+        1. Collect all the files which contain EO information
+        2. Exclude files from the `processed` directory if it is already in
+        processed_include_fall directory
+        3. Create a dataframe that contains the following columns:
+        - directory: name of directory where file is located
+        - path: full path to file
+        - filename: name of file
+        :return: Return the dataframe created above
+        """
+        import geopandas as gp
+
+        dg_shp = gp.read_file(
+            self.dir_input
+            / "Global_Datasets"
+            / "Regions"
+            / "Shps"
+            / "adm_shapefile.shp",
+            engine="pyogrio",
+        )
+
+        # Collect all the files which contain EO information
+        df_files = pd.DataFrame(columns=["directory", "path", "filename", "admin_zone"])
+        for filepath in self.base_dir.rglob("*.csv"):
+            country = filepath.parents[0].name
+
+            admin_zone = get_admin_zone(country, dg_shp)
+
+            # If country is not in cc.COUNTRIES then skip
+            # HACK: Skip korea for now, as it is giving errors
+            if country == "republic_of_korea":
+                continue
+
+            # Get name of directory one level up
+            process_type = filepath.parents[1].name
+
+            # Get name of file
+            filename = filepath.name
+
+            # Add to dataframe
+            df_files.loc[len(df_files)] = [process_type, filepath, filename, admin_zone]
+
+        # Exclude those rows where directory is processed and file is already in
+        # processed_include_fall directory
+        no_fall = df_files["directory"] == "processed"
+        include_fall = df_files[df_files["directory"] == "processed_include_fall"][
+            "filename"
+        ]
+
+        df_files = df_files[~(no_fall & (df_files["filename"].isin(include_fall)))]
+
+        return df_files
+
+    def process_combinations(self, df, method):
+        """
+        Create a list of tuples of the following:
+        - directory: name of directory where file is located
+        - path: full path to file
+        - filename: name of file
+        - method: whether to compute indices for phenological stages or not
+        This tuple will be used as input to the `process` function
+        :param df:
+        :param method:
+        :return:
+        """
+        combinations = []
+
+        for index, row in tqdm(df.iterrows()):
+            combinations.extend(
+                list(
+                    itertools.product([row[0]], [row[1]], [row[2]], [row[3]], [method])
+                )
+            )
+
+        combinations = remove_duplicates(combinations)
+
+        return combinations
+
+    def main(self, method):
+        """
+
+        :param method:
+        :return:
+        """
+        # Create a dataframe of the files to be analyzed
+        df_files = self.collect_files()
+
+        combinations = self.process_combinations(df_files, method)
+
+        # Add an element to the tuple to indicate the season
+        # Last element is redo flag which is True if the analysis is to be redone
+        # and False otherwise. Analysis is always redone for the current year
+        # and last year whether file exists or not
+        combinations = [
+            (
+                self.parser,
+                status,
+                path,
+                filename,
+                admin_zone,
+                category,
+                year,
+                "ndvi",
+                False,  # redo
+            )
+            for year in range(2001, ar.utcnow().year + 1)
+            for status, path, filename, admin_zone, category in combinations
+        ]
+
+        # Only keep those entries in combinations where the third elemt is
+        # mozambique, south_africa, angola or dem_people's_rep_of_korea
+        # This is done to test the code for these countries
+        combinations = [i for i in combinations if f"{country}_maize_s1" in i[3]]
+
+        if True:
+            num_cpu = int(cpu_count() * 0.9)
+            with Pool(num_cpu) as p:
+                for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
+                    pass
+        else:
+            # Use the code below if you want to test without parallelization or
+            # if you want to debug by using pdb
+            pbar = tqdm(combinations)
+            for i, val in enumerate(pbar):
+                pbar.set_description(
+                    f"Main loop {combinations[i][2]} {combinations[i][5]}"
+                )
+                indices.process(val)
+
+
+def run(path_config_files=[]):
+    """
+
+    Args:
+        path_config_files:
+
+    Returns:
+
+    """
+    """ Check dictionary keys to have no spaces"""
+    indices.validate_index_definitions()
+
+    for method in [
+        "monthly_r",  # "dekad_r"
+    ]:  # , "full_season", "phenological_stages", "fraction_season"]:
+        obj = cei_runner(path_config_files)
+        obj.main(method)
+
+
+if __name__ == "__main__":
+    run()
@@ -162,7 +162,7 @@ class cei_runner(base.BaseGeo):
                 category,
                 year,
                 "ndvi",
-                False,  # redo
+                True,  # redo
             )
             for year in range(2001, ar.utcnow().year + 1)
             for status, path, filename, admin_zone, category in combinations
@@ -174,7 +174,7 @@ class cei_runner(base.BaseGeo):
         combinations = [i for i in combinations if f"{country}_maize_s1" in i[3]]

         if True:
-            num_cpu = int(cpu_count() * 0.1)
+            num_cpu = int(cpu_count() * 0.8)
             with Pool(num_cpu) as p:
                 for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
                     pass
@@ -295,13 +295,16 @@ def all_correlated_feature_by_time(df, **kwargs):
             df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]

         # Compute median of each CEI and sort the dataframe based on the absolute value of the median
-        dict_best_cei[region_id] = (
-            df_tmp2.groupby("Type")
-            .max()
-            .reset_index()
-            .sort_values("Value", ascending=False)["Metric"]
-            .values
-        )
+        try:
+            dict_best_cei[region_id] = (
+                df_tmp2.groupby("Type")
+                .max()
+                .reset_index()
+                .sort_values("Value", ascending=False)["Metric"]
+                .values
+            )
+        except:
+            breakpoint()

         kwargs["region_id"] = region_id
         _region_names = ", ".join([str(x) for x in group['Region'].unique()])
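A side note on the new except branch: a bare except with breakpoint() drops into pdb, which will stall unattended or multiprocessing runs. A sketch of a non-interactive fallback, written against the same loop variables (df_tmp2, region_id, dict_best_cei) and shown only as an alternative, not the package's code:

    try:
        dict_best_cei[region_id] = (
            df_tmp2.groupby("Type")
            .max()
            .reset_index()
            .sort_values("Value", ascending=False)["Metric"]
            .values
        )
    except (KeyError, ValueError) as e:
        # Log the failing region and continue with an empty ranking
        print(f"Could not rank CEIs for region {region_id}: {e}")
        dict_best_cei[region_id] = np.array([])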
@@ -25,29 +25,32 @@ def _compute_correlations(X, y)
     feature_correlations = {}

     for feature in X.columns:
-        # Ignore columns that are object or categorical type
+        # Ignore object or categorical type columns
         if X[feature].dtypes.name in ["object", "category"]:
             continue

         f_series = X[feature]

-        # Ignore NaN values in either y and f_series
-        mask = ~ (np.isnan(y) | np.isnan(f_series))
-        y = y[mask]
-        f_series = f_series[mask]
+        # Ignore NaN values in either y or f_series
+        mask = ~(np.isnan(y) | np.isnan(f_series))
+        y_filtered = y[mask]
+        f_series_filtered = f_series[mask]

-        if np.std(f_series) == 0 or np.std(y) == 0:
+        # Handle cases where std is zero
+        if np.std(f_series_filtered) == 0 or np.std(y_filtered) == 0:
             feature_correlations[feature] = np.nan
         else:
             try:
-                r = pearsonr(y, f_series)[0]
+                r = pearsonr(y_filtered, f_series_filtered)[0]
                 feature_correlations[feature] = round(r, 3)
-            except:
+            except Exception as e:
+                print(f"Error computing correlation for {feature}: {e}")
                 feature_correlations[feature] = np.nan

     return feature_correlations


+
 def find_most_common_top_feature(top_feature_by_region):
     """
     Find the most common top feature and number of occurences
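The renaming here is more than cosmetic: the old loop reassigned y = y[mask] on every feature, so after the first feature with NaNs the shrunken y was reused for all later features. A minimal reproduction with hypothetical data:

    import numpy as np

    y = np.array([1.0, 2.0, 3.0, 4.0])
    features = {
        "a": np.array([1.0, np.nan, 3.0, 4.0]),  # one NaN
        "b": np.array([2.0, 4.0, 6.0, 8.0]),     # no NaNs
    }

    for name, f_series in features.items():
        mask = ~(np.isnan(y) | np.isnan(f_series))  # on "b": shapes (3,) vs (4,)
        y = y[mask]                # old behavior: y shrinks to length 3 after "a"
        f_series = f_series[mask]
    # Second iteration raises "operands could not be broadcast together";
    # masking into y_filtered, as the new code does, leaves y intact.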
@@ -39,10 +39,10 @@ def compute_last_year_yield(df, target_col="Yield (tn per ha)"):

     return df

-
 def compute_closest_years(all_years, harvest_year, number_lag_years):
     """
-    Finds the years closest to a given harvest year, excluding the harvest year itself.
+    Finds the historical years closest to a given harvest year,
+    excluding any future year (harvest_year itself and beyond).

     Args:
         all_years (array-like): List or array of all years to consider.
@@ -50,7 +50,8 @@ def compute_closest_years(all_years, harvest_year, number_lag_years):
         number_lag_years (int): Number of closest years to return.

     Returns:
-        list: Years closest to the given harvest year.
+        list: The historical years closest to the given harvest year.
+            Returns an empty list if no historical years exist.
     """
     # Exclude the harvest year before computation to simplify logic
     filtered_years = [year for year in all_years if year != harvest_year]
@@ -96,7 +97,7 @@ def compute_median_statistics(
         mask = (group["Harvest Year"].isin(closest_years)) & (
             group["Region"] == region
         )
-        median_yield = group.loc[mask, target_col].median()
+        median_yield = group.loc[mask, target_col].mean()
         df.loc[
             (df["Region"] == region) & (df["Harvest Year"] == harvest_year),
             f"Median {target_col}",
@@ -186,7 +187,7 @@ def compute_lag_yield(
         else:
             # Add median yield
             mask_group_median = group["Harvest Year"].isin(closest_years)
-            median_yield = group.loc[mask_group_median, target_col].median()
+            median_yield = group.loc[mask_group_median, target_col].mean()

             df.loc[mask_region, col] = median_yield
@@ -278,7 +278,7 @@ def auto_train(
         "loss_function": loss_function,
         "early_stopping_rounds": 20,
         "random_seed": seed,
-        "verbose": False,
+        "verbose": True,
     }

     if model_name == "catboost":
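For context on the verbose flips in trainers.py and geocif.py: CatBoost's verbose parameter also accepts an integer period, which keeps some training progress visible without logging every boosting iteration. A sketch assuming a CatBoost regressor, as the surrounding model_name check suggests:

    from catboost import CatBoostRegressor

    model = CatBoostRegressor(
        loss_function="RMSE",
        early_stopping_rounds=20,
        random_seed=0,
        verbose=100,  # log every 100th iteration instead of all (True) or none (False)
    )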