geocif 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. {geocif-0.2.2/geocif.egg-info → geocif-0.2.4}/PKG-INFO +1 -1
  2. {geocif-0.2.2 → geocif-0.2.4}/geocif/cei/indices.py +2 -2
  3. {geocif-0.2.2 → geocif-0.2.4}/geocif/geocif.py +109 -76
  4. {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner.py +3 -3
  5. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/correlations.py +9 -11
  6. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/embedding.py +36 -48
  7. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/feature_engineering.py +14 -5
  8. geocif-0.2.4/geocif/ml/feature_selection.py +346 -0
  9. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/output.py +17 -0
  10. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/trainers.py +4 -5
  11. {geocif-0.2.2 → geocif-0.2.4}/geocif/mm.py +16 -0
  12. geocif-0.2.4/geocif/playground/wolayita.py +103 -0
  13. geocif-0.2.4/geocif/playground/wolayita_v2.py +80 -0
  14. geocif-0.2.4/geocif/playground/wolayita_v3.py +219 -0
  15. {geocif-0.2.2 → geocif-0.2.4}/geocif/viz/plot.py +10 -17
  16. geocif-0.2.4/geocif/viz/viz_ml.py +95 -0
  17. {geocif-0.2.2 → geocif-0.2.4/geocif.egg-info}/PKG-INFO +1 -1
  18. {geocif-0.2.2 → geocif-0.2.4}/geocif.egg-info/SOURCES.txt +4 -0
  19. {geocif-0.2.2 → geocif-0.2.4}/setup.py +1 -1
  20. geocif-0.2.2/geocif/ml/feature_selection.py +0 -350
  21. {geocif-0.2.2 → geocif-0.2.4}/LICENSE +0 -0
  22. {geocif-0.2.2 → geocif-0.2.4}/MANIFEST.in +0 -0
  23. {geocif-0.2.2 → geocif-0.2.4}/README.md +0 -0
  24. {geocif-0.2.2 → geocif-0.2.4}/geocif/__init__.py +0 -0
  25. {geocif-0.2.2 → geocif-0.2.4}/geocif/agmet/__init__.py +0 -0
  26. {geocif-0.2.2 → geocif-0.2.4}/geocif/agmet/geoagmet.py +0 -0
  27. {geocif-0.2.2 → geocif-0.2.4}/geocif/agmet/plot.py +0 -0
  28. {geocif-0.2.2 → geocif-0.2.4}/geocif/agmet/utils.py +0 -0
  29. {geocif-0.2.2 → geocif-0.2.4}/geocif/analysis.py +0 -0
  30. {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/__init__.py +0 -0
  31. {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/constants.py +0 -0
  32. {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/features.py +0 -0
  33. {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/geo.py +0 -0
  34. {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/geocif.py +0 -0
  35. {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/metadata.py +0 -0
  36. {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/models.py +0 -0
  37. {geocif-0.2.2 → geocif-0.2.4}/geocif/cei/__init__.py +0 -0
  38. {geocif-0.2.2 → geocif-0.2.4}/geocif/cei/definitions.py +0 -0
  39. {geocif-0.2.2 → geocif-0.2.4}/geocif/experiments.py +0 -0
  40. {geocif-0.2.2 → geocif-0.2.4}/geocif/geocif_runner.py +0 -0
  41. {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_angola.py +0 -0
  42. {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_madagascar.py +0 -0
  43. {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_malawi.py +0 -0
  44. {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_mozambique.py +0 -0
  45. {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_south_africa.py +0 -0
  46. {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_zambia.py +0 -0
  47. {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_zimbabwe.py +0 -0
  48. {geocif-0.2.2 → geocif-0.2.4}/geocif/logger.py +0 -0
  49. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/__init__.py +0 -0
  50. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/outliers.py +0 -0
  51. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/outlook.py +0 -0
  52. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/spatial_autocorrelation.py +0 -0
  53. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/stages.py +0 -0
  54. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/stats.py +0 -0
  55. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/trend.py +0 -0
  56. {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/xai.py +0 -0
  57. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/__init__.py +0 -0
  58. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/aa.py +0 -0
  59. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/area.py +0 -0
  60. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/automl.py +0 -0
  61. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/download_esi.py +0 -0
  62. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/enso.py +0 -0
  63. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/eval.py +0 -0
  64. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/gamtest.py +0 -0
  65. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/gee_access.py +0 -0
  66. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/misc.py +0 -0
  67. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/play_xagg.py +0 -0
  68. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/reg.py +0 -0
  69. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/sustain.py +0 -0
  70. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/test_catboost.py +0 -0
  71. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/tmp.py +0 -0
  72. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/tmp2.py +0 -0
  73. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/tmp3.py +0 -0
  74. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/tmp4.py +0 -0
  75. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/tmp5.py +0 -0
  76. {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/wolayita_maize_mask.py +0 -0
  77. {geocif-0.2.2 → geocif-0.2.4}/geocif/risk/__init__.py +0 -0
  78. {geocif-0.2.2 → geocif-0.2.4}/geocif/risk/impact_assessment.py +0 -0
  79. {geocif-0.2.2 → geocif-0.2.4}/geocif/utils.py +0 -0
  80. {geocif-0.2.2 → geocif-0.2.4}/geocif/viz/__init__.py +0 -0
  81. {geocif-0.2.2 → geocif-0.2.4}/geocif/viz/gt.py +0 -0
  82. {geocif-0.2.2 → geocif-0.2.4}/geocif/viz/tmp.py +0 -0
  83. {geocif-0.2.2 → geocif-0.2.4}/geocif.egg-info/dependency_links.txt +0 -0
  84. {geocif-0.2.2 → geocif-0.2.4}/geocif.egg-info/not-zip-safe +0 -0
  85. {geocif-0.2.2 → geocif-0.2.4}/geocif.egg-info/top_level.txt +0 -0
  86. {geocif-0.2.2 → geocif-0.2.4}/requirements.txt +0 -0
  87. {geocif-0.2.2 → geocif-0.2.4}/setup.cfg +0 -0
  88. {geocif-0.2.2 → geocif-0.2.4}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -94,7 +94,7 @@ def standardize_dataframe(df: pd.DataFrame, vi_var: str) -> pd.DataFrame:
94
94
  if df[vi_var].max() > 1:
95
95
  df[vi_var] = (df[vi_var] - 50) / 200
96
96
 
97
- # Exclude seasons before 2001 if that’s your logic
97
+ # HACK Exclude seasons before 2001
98
98
  df = df[df["Season"] >= 2001]
99
99
 
100
100
  return df
@@ -507,7 +507,7 @@ class CEIs:
507
507
  if not self.redo:
508
508
  # If harvest_year is older than last year and file exists, skip
509
509
  if (self.harvest_year < (current_year - 1)) and cei_file.is_file():
510
- logger.info("CEI file exists and year is old. Skipping: %s", cei_file)
510
+ logger.info(f"CEI file exists, skipping: {cei_file}")
511
511
  return None
512
512
 
513
513
  return intermediate_file
@@ -179,6 +179,13 @@ class Geocif:
179
179
  "Production (tn)",
180
180
  ]
181
181
 
182
+ if self.model_type == "REGRESSION":
183
+ self.target_column = (
184
+ f"Detrended {self.target}" if self.check_yield_trend else self.target
185
+ )
186
+ elif self.model_type == "CLASSIFICATION":
187
+ self.target_column = self.target_class
188
+
182
189
  self.combined_dict = {
183
190
  **di.dict_indices,
184
191
  **di.dict_ndvi,
@@ -204,65 +211,47 @@ class Geocif:
204
211
  # obj_pickle = outlook.Outlook(self.pickle_file)
205
212
  # self.df_outlook = obj_pickle.read_outlook_file()
206
213
 
207
- def train(self, df_region, scaler=None):
214
+ def apply_feature_selector(self, region, dir_output):
215
+ if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
216
+ all_features = self.X_train.columns
217
+
218
+ # Select the columns with use_ceis in it
219
+ self.selected_features = [
220
+ column
221
+ for column in all_features
222
+ if any(cei in column for cei in self.use_ceis)
223
+ ]
224
+ else:
225
+ self.logger.info(f"Selecting features for {self.country} {self.crop}")
226
+ selector, _, self.selected_features = fs.select_features(
227
+ self.X_train,
228
+ self.y_train,
229
+ method=self.feature_selection,
230
+ dir_output=dir_output,
231
+ region=region
232
+ )
233
+ self.logger.info(f"Selected features: {self.selected_features}")
234
+
235
+ """ Update model to include conformal estimates """
236
+ if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
237
+ self.selected_features.append("lat")
238
+ if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
239
+ self.selected_features.append("lon")
240
+
241
+ def train_model(self, df_region, dir_output, scaler=None):
208
242
  """
209
243
 
210
244
  Args:
211
245
  df_region:
246
+ dir_output:
212
247
  scaler:
213
248
 
214
249
  Returns:
215
250
 
216
251
  """
217
-
218
- """ Perform feature selection """
219
- if self.model_type == "REGRESSION":
220
- target_column = (
221
- f"Detrended {self.target}" if self.check_yield_trend else self.target
222
- )
223
- elif self.model_type == "CLASSIFICATION":
224
- target_column = self.target_class
225
-
226
- # Drop rows where target_column is NaN
227
- df_region = df_region.dropna(subset=[target_column])
228
-
229
- X_train = df_region[self.feature_names]
230
- # Drop any columns with NaNs
231
- X_train = X_train.dropna(axis=1, how="any")
232
- y_train = df_region[target_column]
233
-
234
252
  if self.ml_model:
235
- if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
236
- all_features = X_train.columns
237
-
238
- # Select the columns with use_ceis in it
239
- self.selected_features = [
240
- column
241
- for column in all_features
242
- if any(cei in column for cei in self.use_ceis)
243
- ]
244
- else:
245
- self.logger.info(f"Selecting features for {self.country} {self.crop}")
246
- selector, _, self.selected_features = fs.select_features(
247
- X_train, y_train, method=self.feature_selection
248
- )
249
- self.logger.info(f"Selected features: {self.selected_features}")
250
-
251
- """ Update model to include conformal estimates """
252
- if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
253
- self.selected_features.append("lat")
254
- if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
255
- self.selected_features.append("lon")
256
253
  X_train = df_region[self.selected_features + self.cat_features]
257
254
 
258
- dir_output = (
259
- self.dir_analysis
260
- / self.country
261
- / self.crop
262
- / self.model_name
263
- / str(self.forecast_season)
264
- )
265
-
266
255
  region_id = df_region["Region_ID"].unique()[0]
267
256
  X_train.to_csv(dir_output / f"X_train_{region_id}.csv", index=False)
268
257
  if scaler:
@@ -284,9 +273,9 @@ class Geocif:
284
273
  "Harvest Year",
285
274
  df_region[self.selected_features + self.cat_features + [self.target]],
286
275
  X_train_scaled,
287
- y_train,
276
+ self.y_train,
288
277
  feature_names=self.selected_features,
289
- target_col=target_column,
278
+ target_col=self.target_column,
290
279
  optimize=self.optimize,
291
280
  fraction_loocv=self.fraction_loocv,
292
281
  cat_features=self.cat_features,
@@ -303,9 +292,9 @@ class Geocif:
303
292
  if self.model_name == "catboost":
304
293
  self.model.fit(
305
294
  X_train,
306
- y_train,
295
+ self.y_train,
307
296
  cat_features=self.cat_features,
308
- verbose=True,
297
+ verbose=False,
309
298
  )
310
299
  elif self.model_name in ["ngboost", "oblique", "tabpfn"]:
311
300
  X_train = X_train.drop(
@@ -313,16 +302,16 @@ class Geocif:
313
302
  item for item in self.cat_features if item != "Harvest Year"
314
303
  ]
315
304
  )
316
- self.model.fit(X_train, y_train)
305
+ self.model.fit(X_train, self.y_train)
317
306
  elif self.model_name == "ydf":
318
307
  # Combine X_train and y_train
319
- df_train = pd.concat([X_train, y_train], axis=1)
308
+ df_train = pd.concat([X_train, self.y_train], axis=1)
320
309
 
321
310
  self.model = self.model.train(df_train)
322
311
  elif self.model_name == "geospaNN":
323
312
  self.model.fit(
324
313
  X_train,
325
- y_train,
314
+ self.y_train,
326
315
  # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
327
316
  )
328
317
  elif self.model_name == "merf":
@@ -334,15 +323,15 @@ class Geocif:
334
323
  X_train,
335
324
  Z_train,
336
325
  clusters_train.astype("object"),
337
- y_train.values,
326
+ self.y_train.values,
338
327
  )
339
328
  elif self.model_name == "linear":
340
- self.model.fit(X_train_scaled, y_train)
329
+ self.model.fit(X_train_scaled, self.y_train)
341
330
  elif self.model_name == "gam":
342
- self.model.fit(X_train_scaled.values, y_train.values)
331
+ self.model.fit(X_train_scaled, self.y_train.values)
343
332
  self.best_hyperparams = {}
344
333
  elif self.model_name in ["cubist"]:
345
- self.model.fit(X_train, y_train)
334
+ self.model.fit(X_train, self.y_train)
346
335
  elif self.model_name in [
347
336
  "cumulative_1",
348
337
  "cumulative_2",
@@ -377,7 +366,7 @@ class Geocif:
377
366
  # Combine scaled numeric features and encoded region
378
367
  X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
379
368
 
380
- self.model.fit(X_train_scaled, y_train)
369
+ self.model.fit(X_train_scaled, self.y_train)
381
370
  except Exception as e:
382
371
  self.logger.error(
383
372
  f"Error fitting model for {self.country} {self.crop} {e}"
@@ -782,6 +771,14 @@ class Geocif:
782
771
  Returns:
783
772
 
784
773
  """
774
+ dir_output = (
775
+ self.dir_analysis
776
+ / self.country
777
+ / self.crop
778
+ / self.model_name
779
+ / str(self.forecast_season)
780
+ )
781
+
785
782
  from sklearn.preprocessing import StandardScaler
786
783
 
787
784
  scaler = StandardScaler() if self.model_name in ["linear", "gam"] else None
@@ -789,7 +786,7 @@ class Geocif:
789
786
  """ Train, Predict, Explain and Store results for each region """
790
787
  pbar = tqdm(self.df_train["Region_ID"].unique(), leave=False)
791
788
  for idx, region in enumerate(pbar):
792
- if self.model_name in ["linear", "gam"]:
789
+ if self.model_name in ["linear"]:
793
790
  self.create_feature_names(stages, dict_best_cei[region][0:3].tolist())
794
791
  elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
795
792
  self.create_feature_names(stages, {})
@@ -807,16 +804,6 @@ class Geocif:
807
804
  mask_train = self.df_train["Region_ID"] == region
808
805
  mask_test = self.df_test["Region_ID"] == region
809
806
 
810
- num_regions_in_cluster = self.df_train[mask_train]["Region"].unique()
811
-
812
- if self.cluster_strategy == "individual":
813
- region_name = self.df_train["Region"].unique()[idx]
814
- pbar.set_description(f"Fit/Predict for {region_name}")
815
- pbar.update()
816
- elif self.cluster_strategy in ["auto_detect", "single"]:
817
- pbar.set_description(f"Fit/Predict for group {idx + 1}")
818
- pbar.update()
819
-
820
807
  common_columns = (
821
808
  [self.target, self.target_class]
822
809
  + self.statistics_columns
@@ -836,12 +823,43 @@ class Geocif:
836
823
  if self.last_year_yield_as_feature:
837
824
  common_columns += [f"Last Year {self.target}"]
838
825
 
839
- """ Train """
826
+ """ Feature selection and then Train """
840
827
  # Filter dataframe based on region and self.feature_names
841
828
  df_region_train = self.df_train[mask_train]
842
829
  df_region_train = df_region_train[self.fixed_columns + common_columns]
843
830
  df_region_train.reset_index(drop=True, inplace=True)
844
- self.train(df_region_train, scaler)
831
+ df_region_train = df_region_train.dropna(subset=[self.target_column])
832
+
833
+ self.X_train = df_region_train[self.feature_names]
834
+
835
+ # Drop any columns with NaNs except the lag yield columns
836
+ lag_prefix = "t -"
837
+ lag_cols = [c for c in self.X_train.columns if c.startswith(lag_prefix)]
838
+ self.X_train = (
839
+ self.X_train
840
+ .drop(columns=lag_cols) # temporarily remove the lag-yield cols
841
+ .dropna(axis=1, how="any") # drop cols with any NA left
842
+ .join(self.X_train[lag_cols]) # add lag-yield cols back untouched
843
+ )
844
+ # Some models cannot handle any NaN values, so gapfill them
845
+ if self.model_name in ["gam", "linear"]:
846
+ for col in self.X_train.columns:
847
+ if self.X_train[col].isnull().any():
848
+ median = self.X_train[col].median()
849
+ self.X_train[col].fillna(median, inplace=True)
850
+
851
+ self.y_train = df_region_train[self.target_column]
852
+
853
+ self.apply_feature_selector(region, dir_output)
854
+
855
+ if self.cluster_strategy == "individual":
856
+ region_name = self.df_train["Region"].unique()[idx]
857
+ pbar.set_description(f"Fit/Predict for {region_name}")
858
+ pbar.update()
859
+ elif self.cluster_strategy in ["auto_detect", "single"]:
860
+ pbar.set_description(f"Fit/Predict for group {idx + 1}")
861
+ pbar.update()
862
+ self.train_model(df_region_train, dir_output, scaler)
845
863
 
846
864
  """ Predict """
847
865
  if self.check_yield_trend:
@@ -1040,17 +1058,27 @@ class Geocif:
1040
1058
 
1041
1059
  if self.median_area_as_feature:
1042
1060
  df = fe.compute_median_statistics(
1043
- df, self.all_seasons_with_yield, self.number_median_years, "Area (ha)"
1061
+ df,
1062
+ self.all_seasons_with_yield,
1063
+ self.number_median_years,
1064
+ "Area (ha)"
1044
1065
  )
1045
1066
 
1046
1067
  if self.lag_yield_as_feature:
1047
1068
  df = fe.compute_lag_yield(
1048
- df, self.all_seasons_with_yield, self.number_lag_years, self.target
1069
+ df,
1070
+ self.all_seasons_with_yield,
1071
+ self.forecast_season,
1072
+ self.number_lag_years,
1073
+ self.target
1049
1074
  )
1050
1075
 
1051
1076
  if self.analogous_year_yield_as_feature:
1052
1077
  df = fe.compute_analogous_yield(
1053
- df, self.all_seasons_with_yield, self.number_median_years, self.target
1078
+ df,
1079
+ self.all_seasons_with_yield,
1080
+ self.number_median_years,
1081
+ self.target
1054
1082
  )
1055
1083
 
1056
1084
  # Create Region_ID column based on Region column category code
@@ -1066,6 +1094,8 @@ class Geocif:
1066
1094
 
1067
1095
  # Region_ID should be type category
1068
1096
  df["Region_ID"] = df["Region_ID"].astype("category")
1097
+ else:
1098
+ raise ValueError(f"Unsupported cluster strategy {self.cluster_strategy}")
1069
1099
 
1070
1100
  return df
1071
1101
 
@@ -1247,7 +1277,10 @@ class Geocif:
1247
1277
  )
1248
1278
  pbar.update()
1249
1279
 
1250
- self.loop_ml(stage, dict_selected_features, dict_best_cei)
1280
+ try:
1281
+ self.loop_ml(stage, dict_selected_features, dict_best_cei)
1282
+ except Exception as e:
1283
+ self.logger.error(e)
1251
1284
  wandb.finish()
1252
1285
 
1253
1286
  def setup(self, forecast_season, model):
@@ -165,7 +165,7 @@ class cei_runner(base.BaseGeo):
165
165
  combinations = [
166
166
  i
167
167
  for i in combinations
168
- if "ukraine" in i[3]
168
+ if "ethiopia" in i[3]
169
169
  # or "lesotho_maize" in i[3] or
170
170
  # # "namibia_" in i[2] or
171
171
  # "united_republic_of_tanzania_maize" in i[3]
@@ -174,13 +174,13 @@ class cei_runner(base.BaseGeo):
174
174
  # or "south_africa_maize" in i[3]
175
175
  # or "mozambique_maize" in i[3]
176
176
  # or "united_states_of_america" in i[3]
177
- or "russian_federation" in i[3]
177
+ #or "russian_federation" in i[3]
178
178
  # or "ukraine" in i[3]
179
179
  ]
180
180
  # "malawi" in i[2]]
181
181
 
182
182
  if self.do_parallel:
183
- num_cpu = int(cpu_count() * 0.6)
183
+ num_cpu = int(cpu_count() * 0.75)
184
184
  with Pool(num_cpu) as p:
185
185
  for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
186
186
  pass
@@ -3,7 +3,6 @@ import os
3
3
  import matplotlib.pyplot as plt
4
4
  import palettable as pal
5
5
  import pandas as pd
6
- import seaborn as sns
7
6
  from tqdm import tqdm
8
7
 
9
8
  from geocif import utils
@@ -68,6 +67,8 @@ def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
68
67
 
69
68
 
70
69
  def plot_feature_corr_by_time(df, **kwargs):
70
+ import seaborn as sns
71
+
71
72
  country = kwargs.get("country")
72
73
  crop = kwargs.get("crop")
73
74
  dir_output = kwargs.get("dir_output")
@@ -295,16 +296,13 @@ def all_correlated_feature_by_time(df, **kwargs):
295
296
  df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]
296
297
 
297
298
  # Compute median of each CEI and sort the dataframe based on the absolute value of the median
298
- try:
299
- dict_best_cei[region_id] = (
300
- df_tmp2.groupby("Type")
301
- .max()
302
- .reset_index()
303
- .sort_values("Value", ascending=False)["Metric"]
304
- .values
305
- )
306
- except:
307
- breakpoint()
299
+ dict_best_cei[region_id] = (
300
+ df_tmp2.groupby("Type")
301
+ .max()
302
+ .reset_index()
303
+ .sort_values("Value", ascending=False)["Metric"]
304
+ .values
305
+ )
308
306
 
309
307
  kwargs["region_id"] = region_id
310
308
  _region_names = ", ".join([str(x) for x in group['Region'].unique()])
@@ -3,6 +3,7 @@ from collections import Counter
3
3
  import numpy as np
4
4
  import pandas as pd
5
5
  from scipy.stats import pearsonr as pearsonr
6
+ from tqdm import tqdm
6
7
 
7
8
 
8
9
  def extract_regions(X, y, regions=[]):
@@ -32,10 +33,7 @@ def _compute_correlations(X, y):
32
33
  f_series = X[feature]
33
34
 
34
35
  # Ignore NaN values in either y or f_series
35
- try:
36
- mask = ~(np.isnan(y) | np.isnan(f_series))
37
- except:
38
- breakpoint()
36
+ mask = ~(np.isnan(y) | np.isnan(f_series))
39
37
  y_filtered = y[mask]
40
38
  f_series_filtered = f_series[mask]
41
39
 
@@ -107,57 +105,47 @@ def get_top_correlated_features(inputs, targets):
107
105
  return feature_by_region, counter
108
106
 
109
107
 
110
- def get_all_features_correlation(inputs, targets, method):
108
+ def get_all_features_correlation(inputs: pd.DataFrame,
109
+ targets: pd.Series,
110
+ method: str) -> pd.DataFrame:
111
111
  """
112
- Get the top correlated features for each region
113
- :param inputs: pd.DataFrame, input data
114
- :param targets: pd.Series, target data
115
- :param method: str, method to use to find the top correlated features
112
+ Fast version identical output, no length-mismatch on regions whose
113
+ feature names contain no spaces.
116
114
  """
117
- frames = []
118
- for region_id in inputs["Region"].unique():
119
- X, y = extract_regions(inputs, targets, regions=[region_id])
115
+ numeric_cols = inputs.select_dtypes(include=[np.number]).columns.tolist()
120
116
 
121
- feature_correlations = _compute_correlations(X, y)
117
+ df_all = inputs[numeric_cols + ["Region"]].copy()
118
+ df_all["__target__"] = targets.values
122
119
 
123
- # Exclude any nan values
124
- feature_correlations = {
125
- k: v for k, v in feature_correlations.items() if not np.isnan(v)
126
- }
120
+ frames: list[pd.DataFrame] = []
127
121
 
128
- if not feature_correlations:
122
+ for region_id, g in tqdm(df_all.groupby("Region", sort=False), leave=False):
123
+ corr = g[numeric_cols].corrwith(g["__target__"]).round(3).dropna()
124
+ if corr.empty:
129
125
  continue
130
126
 
131
- split_keys = []
132
- for key in feature_correlations.keys():
133
- parts = key.split(" ")
134
- cei = parts[0]
135
- time_period = " ".join(parts[1:])
136
-
137
- split_keys.append([cei, time_period])
138
-
139
- # split_keys = [key.rsplit("_", 1) for key in feature_correlations.keys()]
140
- values = list(feature_correlations.values())
141
-
142
- # Creating a DataFrame
143
- df = pd.DataFrame(split_keys, columns=["Metric", method])
144
- df["Value"] = values
145
-
146
- # Pivot the DataFrame so each metric becomes a column name and include the year as a separate column
147
- df_pivoted = df.pivot_table(
148
- index=method, columns="Metric", values="Value", aggfunc="first"
149
- ).reset_index()
150
- df_pivoted["Region"] = region_id
151
- # Move the 'Region' column to the front
152
- cols = df_pivoted.columns.tolist()
153
- cols = cols[-1:] + cols[:-1]
154
- df_pivoted = df_pivoted[cols]
127
+ # ---- safe split: always two columns --------------------------------
128
+ split = (
129
+ pd.Series(corr.index) # guarantees a Series
130
+ .str.split(" ", n=1, expand=True)
131
+ )
132
+ if split.shape[1] == 1: # no spaces in any feature name
133
+ split[1] = "" # match legacy behaviour
134
+ split.columns = [0, 1] # make column labels predictable
135
+
136
+ df_region = (
137
+ pd.DataFrame({
138
+ "Metric": split[0].values,
139
+ method: split[1].values,
140
+ "Value": corr.values # same length as above
141
+ })
142
+ .pivot_table(index=method, columns="Metric",
143
+ values="Value", aggfunc="first")
144
+ .reset_index()
145
+ )
146
+ df_region.insert(0, "Region", region_id)
147
+ frames.append(df_region)
155
148
 
156
- frames.append(df_pivoted)
149
+ return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
157
150
 
158
- if len(frames):
159
- feature_by_region = pd.concat(frames)
160
- else:
161
- feature_by_region = pd.DataFrame()
162
151
 
163
- return feature_by_region
@@ -39,23 +39,32 @@ def compute_last_year_yield(df, target_col="Yield (tn per ha)"):
39
39
 
40
40
  return df
41
41
 
42
- def compute_closest_years(all_years, harvest_year, number_lag_years):
42
+ def compute_closest_years(all_years, harvest_year, number_lag_years, only_historic=False):
43
43
  """
44
44
  Finds the historical years closest to a given harvest year,
45
- excluding any future year (harvest_year itself and beyond).
45
+ excluding any future year (harvest_year itself and beyond) based on the only_historic flag.
46
46
 
47
47
  Args:
48
48
  all_years (array-like): List or array of all years to consider.
49
49
  harvest_year (int): The year from which to compute distance.
50
50
  number_lag_years (int): Number of closest years to return.
51
+ only_historic (bool): If True, only consider years before the harvest year.
51
52
 
52
53
  Returns:
53
54
  list: The historical years closest to the given harvest year.
54
55
  Returns an empty list if no historical years exist.
55
56
  """
56
57
  # Exclude the harvest year before computation to simplify logic
57
- filtered_years = [year for year in all_years if year != harvest_year]
58
+ if only_historic:
59
+ filtered_years = [year for year in all_years if year < harvest_year]
60
+ else:
61
+ filtered_years = [year for year in all_years if year != harvest_year]
62
+
63
+ # If no historical years exist, return an empty list
64
+ if not filtered_years:
65
+ return []
58
66
 
67
+ # Sort the years based on their absolute difference from the harvest year
59
68
  closest_years = np.array(filtered_years)[
60
69
  np.argsort(np.abs(np.array(filtered_years) - harvest_year))[:number_lag_years]
61
70
  ]
@@ -150,7 +159,7 @@ def compute_user_median_statistics(df, user_years, target_col="Yield (tn per ha)
150
159
 
151
160
 
152
161
  def compute_lag_yield(
153
- df, all_seasons_with_yield, number_lag_years, target_col="Yield (tn per ha)"
162
+ df, all_seasons_with_yield, forecast_season, number_lag_years, target_col="Yield (tn per ha)"
154
163
  ):
155
164
  # For the number of years specified in self.number_lag_years, add the yield of that number of years
156
165
  # ago to the dataframe
@@ -169,7 +178,7 @@ def compute_lag_yield(
169
178
 
170
179
  for harvest_year in unique_years:
171
180
  closest_years = compute_closest_years(
172
- all_seasons_with_yield, harvest_year, number_lag_years
181
+ all_seasons_with_yield, harvest_year, number_lag_years, only_historic=True
173
182
  )
174
183
 
175
184
  # For each year in the closest years, add the yield to the dataframe as a new column