geocif 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.2.2/geocif.egg-info → geocif-0.2.4}/PKG-INFO +1 -1
- {geocif-0.2.2 → geocif-0.2.4}/geocif/cei/indices.py +2 -2
- {geocif-0.2.2 → geocif-0.2.4}/geocif/geocif.py +109 -76
- {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner.py +3 -3
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/correlations.py +9 -11
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/embedding.py +36 -48
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/feature_engineering.py +14 -5
- geocif-0.2.4/geocif/ml/feature_selection.py +346 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/output.py +17 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/trainers.py +4 -5
- {geocif-0.2.2 → geocif-0.2.4}/geocif/mm.py +16 -0
- geocif-0.2.4/geocif/playground/wolayita.py +103 -0
- geocif-0.2.4/geocif/playground/wolayita_v2.py +80 -0
- geocif-0.2.4/geocif/playground/wolayita_v3.py +219 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/viz/plot.py +10 -17
- geocif-0.2.4/geocif/viz/viz_ml.py +95 -0
- {geocif-0.2.2 → geocif-0.2.4/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.2.2 → geocif-0.2.4}/geocif.egg-info/SOURCES.txt +4 -0
- {geocif-0.2.2 → geocif-0.2.4}/setup.py +1 -1
- geocif-0.2.2/geocif/ml/feature_selection.py +0 -350
- {geocif-0.2.2 → geocif-0.2.4}/LICENSE +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/MANIFEST.in +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/README.md +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/agmet/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/agmet/plot.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/agmet/utils.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/analysis.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/constants.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/features.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/geo.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/geocif.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/metadata.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/backup/models.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/cei/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/cei/definitions.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/experiments.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/geocif_runner.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_angola.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_madagascar.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_malawi.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_mozambique.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_south_africa.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_zambia.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/indices_runner_zimbabwe.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/logger.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/outliers.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/outlook.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/stages.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/stats.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/trend.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/ml/xai.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/aa.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/area.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/automl.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/download_esi.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/enso.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/eval.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/gamtest.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/gee_access.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/misc.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/play_xagg.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/reg.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/sustain.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/test_catboost.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/tmp.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/tmp2.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/tmp3.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/tmp4.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/tmp5.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/playground/wolayita_maize_mask.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/risk/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/risk/impact_assessment.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/utils.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/viz/__init__.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/viz/gt.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif/viz/tmp.py +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/requirements.txt +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/setup.cfg +0 -0
- {geocif-0.2.2 → geocif-0.2.4}/tests/test_geocif.py +0 -0
@@ -94,7 +94,7 @@ def standardize_dataframe(df: pd.DataFrame, vi_var: str) -> pd.DataFrame:
|
|
94
94
|
if df[vi_var].max() > 1:
|
95
95
|
df[vi_var] = (df[vi_var] - 50) / 200
|
96
96
|
|
97
|
-
# Exclude seasons before 2001
|
97
|
+
# HACK Exclude seasons before 2001
|
98
98
|
df = df[df["Season"] >= 2001]
|
99
99
|
|
100
100
|
return df
|
@@ -507,7 +507,7 @@ class CEIs:
|
|
507
507
|
if not self.redo:
|
508
508
|
# If harvest_year is older than last year and file exists, skip
|
509
509
|
if (self.harvest_year < (current_year - 1)) and cei_file.is_file():
|
510
|
-
logger.info("CEI file exists
|
510
|
+
logger.info(f"CEI file exists, skipping: {cei_file}")
|
511
511
|
return None
|
512
512
|
|
513
513
|
return intermediate_file
|
@@ -179,6 +179,13 @@ class Geocif:
|
|
179
179
|
"Production (tn)",
|
180
180
|
]
|
181
181
|
|
182
|
+
if self.model_type == "REGRESSION":
|
183
|
+
self.target_column = (
|
184
|
+
f"Detrended {self.target}" if self.check_yield_trend else self.target
|
185
|
+
)
|
186
|
+
elif self.model_type == "CLASSIFICATION":
|
187
|
+
self.target_column = self.target_class
|
188
|
+
|
182
189
|
self.combined_dict = {
|
183
190
|
**di.dict_indices,
|
184
191
|
**di.dict_ndvi,
|
@@ -204,65 +211,47 @@ class Geocif:
|
|
204
211
|
# obj_pickle = outlook.Outlook(self.pickle_file)
|
205
212
|
# self.df_outlook = obj_pickle.read_outlook_file()
|
206
213
|
|
207
|
-
def
|
214
|
+
def apply_feature_selector(self, region, dir_output):
|
215
|
+
if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
216
|
+
all_features = self.X_train.columns
|
217
|
+
|
218
|
+
# Select the columns with use_ceis in it
|
219
|
+
self.selected_features = [
|
220
|
+
column
|
221
|
+
for column in all_features
|
222
|
+
if any(cei in column for cei in self.use_ceis)
|
223
|
+
]
|
224
|
+
else:
|
225
|
+
self.logger.info(f"Selecting features for {self.country} {self.crop}")
|
226
|
+
selector, _, self.selected_features = fs.select_features(
|
227
|
+
self.X_train,
|
228
|
+
self.y_train,
|
229
|
+
method=self.feature_selection,
|
230
|
+
dir_output=dir_output,
|
231
|
+
region=region
|
232
|
+
)
|
233
|
+
self.logger.info(f"Selected features: {self.selected_features}")
|
234
|
+
|
235
|
+
""" Update model to include conformal estimates """
|
236
|
+
if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
|
237
|
+
self.selected_features.append("lat")
|
238
|
+
if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
|
239
|
+
self.selected_features.append("lon")
|
240
|
+
|
241
|
+
def train_model(self, df_region, dir_output, scaler=None):
|
208
242
|
"""
|
209
243
|
|
210
244
|
Args:
|
211
245
|
df_region:
|
246
|
+
dir_output:
|
212
247
|
scaler:
|
213
248
|
|
214
249
|
Returns:
|
215
250
|
|
216
251
|
"""
|
217
|
-
|
218
|
-
""" Perform feature selection """
|
219
|
-
if self.model_type == "REGRESSION":
|
220
|
-
target_column = (
|
221
|
-
f"Detrended {self.target}" if self.check_yield_trend else self.target
|
222
|
-
)
|
223
|
-
elif self.model_type == "CLASSIFICATION":
|
224
|
-
target_column = self.target_class
|
225
|
-
|
226
|
-
# Drop rows where target_column is NaN
|
227
|
-
df_region = df_region.dropna(subset=[target_column])
|
228
|
-
|
229
|
-
X_train = df_region[self.feature_names]
|
230
|
-
# Drop any columns with NaNs
|
231
|
-
X_train = X_train.dropna(axis=1, how="any")
|
232
|
-
y_train = df_region[target_column]
|
233
|
-
|
234
252
|
if self.ml_model:
|
235
|
-
if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
236
|
-
all_features = X_train.columns
|
237
|
-
|
238
|
-
# Select the columns with use_ceis in it
|
239
|
-
self.selected_features = [
|
240
|
-
column
|
241
|
-
for column in all_features
|
242
|
-
if any(cei in column for cei in self.use_ceis)
|
243
|
-
]
|
244
|
-
else:
|
245
|
-
self.logger.info(f"Selecting features for {self.country} {self.crop}")
|
246
|
-
selector, _, self.selected_features = fs.select_features(
|
247
|
-
X_train, y_train, method=self.feature_selection
|
248
|
-
)
|
249
|
-
self.logger.info(f"Selected features: {self.selected_features}")
|
250
|
-
|
251
|
-
""" Update model to include conformal estimates """
|
252
|
-
if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
|
253
|
-
self.selected_features.append("lat")
|
254
|
-
if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
|
255
|
-
self.selected_features.append("lon")
|
256
253
|
X_train = df_region[self.selected_features + self.cat_features]
|
257
254
|
|
258
|
-
dir_output = (
|
259
|
-
self.dir_analysis
|
260
|
-
/ self.country
|
261
|
-
/ self.crop
|
262
|
-
/ self.model_name
|
263
|
-
/ str(self.forecast_season)
|
264
|
-
)
|
265
|
-
|
266
255
|
region_id = df_region["Region_ID"].unique()[0]
|
267
256
|
X_train.to_csv(dir_output / f"X_train_{region_id}.csv", index=False)
|
268
257
|
if scaler:
|
@@ -284,9 +273,9 @@ class Geocif:
|
|
284
273
|
"Harvest Year",
|
285
274
|
df_region[self.selected_features + self.cat_features + [self.target]],
|
286
275
|
X_train_scaled,
|
287
|
-
y_train,
|
276
|
+
self.y_train,
|
288
277
|
feature_names=self.selected_features,
|
289
|
-
target_col=target_column,
|
278
|
+
target_col=self.target_column,
|
290
279
|
optimize=self.optimize,
|
291
280
|
fraction_loocv=self.fraction_loocv,
|
292
281
|
cat_features=self.cat_features,
|
@@ -303,9 +292,9 @@ class Geocif:
|
|
303
292
|
if self.model_name == "catboost":
|
304
293
|
self.model.fit(
|
305
294
|
X_train,
|
306
|
-
y_train,
|
295
|
+
self.y_train,
|
307
296
|
cat_features=self.cat_features,
|
308
|
-
verbose=
|
297
|
+
verbose=False,
|
309
298
|
)
|
310
299
|
elif self.model_name in ["ngboost", "oblique", "tabpfn"]:
|
311
300
|
X_train = X_train.drop(
|
@@ -313,16 +302,16 @@ class Geocif:
|
|
313
302
|
item for item in self.cat_features if item != "Harvest Year"
|
314
303
|
]
|
315
304
|
)
|
316
|
-
self.model.fit(X_train, y_train)
|
305
|
+
self.model.fit(X_train, self.y_train)
|
317
306
|
elif self.model_name == "ydf":
|
318
307
|
# Combine X_train and y_train
|
319
|
-
df_train = pd.concat([X_train, y_train], axis=1)
|
308
|
+
df_train = pd.concat([X_train, self.y_train], axis=1)
|
320
309
|
|
321
310
|
self.model = self.model.train(df_train)
|
322
311
|
elif self.model_name == "geospaNN":
|
323
312
|
self.model.fit(
|
324
313
|
X_train,
|
325
|
-
y_train,
|
314
|
+
self.y_train,
|
326
315
|
# callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
|
327
316
|
)
|
328
317
|
elif self.model_name == "merf":
|
@@ -334,15 +323,15 @@ class Geocif:
|
|
334
323
|
X_train,
|
335
324
|
Z_train,
|
336
325
|
clusters_train.astype("object"),
|
337
|
-
y_train.values,
|
326
|
+
self.y_train.values,
|
338
327
|
)
|
339
328
|
elif self.model_name == "linear":
|
340
|
-
self.model.fit(X_train_scaled, y_train)
|
329
|
+
self.model.fit(X_train_scaled, self.y_train)
|
341
330
|
elif self.model_name == "gam":
|
342
|
-
self.model.fit(X_train_scaled
|
331
|
+
self.model.fit(X_train_scaled, self.y_train.values)
|
343
332
|
self.best_hyperparams = {}
|
344
333
|
elif self.model_name in ["cubist"]:
|
345
|
-
self.model.fit(X_train, y_train)
|
334
|
+
self.model.fit(X_train, self.y_train)
|
346
335
|
elif self.model_name in [
|
347
336
|
"cumulative_1",
|
348
337
|
"cumulative_2",
|
@@ -377,7 +366,7 @@ class Geocif:
|
|
377
366
|
# Combine scaled numeric features and encoded region
|
378
367
|
X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
|
379
368
|
|
380
|
-
self.model.fit(X_train_scaled, y_train)
|
369
|
+
self.model.fit(X_train_scaled, self.y_train)
|
381
370
|
except Exception as e:
|
382
371
|
self.logger.error(
|
383
372
|
f"Error fitting model for {self.country} {self.crop} {e}"
|
@@ -782,6 +771,14 @@ class Geocif:
|
|
782
771
|
Returns:
|
783
772
|
|
784
773
|
"""
|
774
|
+
dir_output = (
|
775
|
+
self.dir_analysis
|
776
|
+
/ self.country
|
777
|
+
/ self.crop
|
778
|
+
/ self.model_name
|
779
|
+
/ str(self.forecast_season)
|
780
|
+
)
|
781
|
+
|
785
782
|
from sklearn.preprocessing import StandardScaler
|
786
783
|
|
787
784
|
scaler = StandardScaler() if self.model_name in ["linear", "gam"] else None
|
@@ -789,7 +786,7 @@ class Geocif:
|
|
789
786
|
""" Train, Predict, Explain and Store results for each region """
|
790
787
|
pbar = tqdm(self.df_train["Region_ID"].unique(), leave=False)
|
791
788
|
for idx, region in enumerate(pbar):
|
792
|
-
if self.model_name in ["linear"
|
789
|
+
if self.model_name in ["linear"]:
|
793
790
|
self.create_feature_names(stages, dict_best_cei[region][0:3].tolist())
|
794
791
|
elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
795
792
|
self.create_feature_names(stages, {})
|
@@ -807,16 +804,6 @@ class Geocif:
|
|
807
804
|
mask_train = self.df_train["Region_ID"] == region
|
808
805
|
mask_test = self.df_test["Region_ID"] == region
|
809
806
|
|
810
|
-
num_regions_in_cluster = self.df_train[mask_train]["Region"].unique()
|
811
|
-
|
812
|
-
if self.cluster_strategy == "individual":
|
813
|
-
region_name = self.df_train["Region"].unique()[idx]
|
814
|
-
pbar.set_description(f"Fit/Predict for {region_name}")
|
815
|
-
pbar.update()
|
816
|
-
elif self.cluster_strategy in ["auto_detect", "single"]:
|
817
|
-
pbar.set_description(f"Fit/Predict for group {idx + 1}")
|
818
|
-
pbar.update()
|
819
|
-
|
820
807
|
common_columns = (
|
821
808
|
[self.target, self.target_class]
|
822
809
|
+ self.statistics_columns
|
@@ -836,12 +823,43 @@ class Geocif:
|
|
836
823
|
if self.last_year_yield_as_feature:
|
837
824
|
common_columns += [f"Last Year {self.target}"]
|
838
825
|
|
839
|
-
""" Train """
|
826
|
+
""" Feature selection and then Train """
|
840
827
|
# Filter dataframe based on region and self.feature_names
|
841
828
|
df_region_train = self.df_train[mask_train]
|
842
829
|
df_region_train = df_region_train[self.fixed_columns + common_columns]
|
843
830
|
df_region_train.reset_index(drop=True, inplace=True)
|
844
|
-
self.
|
831
|
+
df_region_train = df_region_train.dropna(subset=[self.target_column])
|
832
|
+
|
833
|
+
self.X_train = df_region_train[self.feature_names]
|
834
|
+
|
835
|
+
# Drop any columns with NaNs except the lag yield columns
|
836
|
+
lag_prefix = "t -"
|
837
|
+
lag_cols = [c for c in self.X_train.columns if c.startswith(lag_prefix)]
|
838
|
+
self.X_train = (
|
839
|
+
self.X_train
|
840
|
+
.drop(columns=lag_cols) # temporarily remove the lag-yield cols
|
841
|
+
.dropna(axis=1, how="any") # drop cols with any NA left
|
842
|
+
.join(self.X_train[lag_cols]) # add lag-yield cols back untouched
|
843
|
+
)
|
844
|
+
# Some models cannot handle any NaN values, so gapfill them
|
845
|
+
if self.model_name in ["gam", "linear"]:
|
846
|
+
for col in self.X_train.columns:
|
847
|
+
if self.X_train[col].isnull().any():
|
848
|
+
median = self.X_train[col].median()
|
849
|
+
self.X_train[col].fillna(median, inplace=True)
|
850
|
+
|
851
|
+
self.y_train = df_region_train[self.target_column]
|
852
|
+
|
853
|
+
self.apply_feature_selector(region, dir_output)
|
854
|
+
|
855
|
+
if self.cluster_strategy == "individual":
|
856
|
+
region_name = self.df_train["Region"].unique()[idx]
|
857
|
+
pbar.set_description(f"Fit/Predict for {region_name}")
|
858
|
+
pbar.update()
|
859
|
+
elif self.cluster_strategy in ["auto_detect", "single"]:
|
860
|
+
pbar.set_description(f"Fit/Predict for group {idx + 1}")
|
861
|
+
pbar.update()
|
862
|
+
self.train_model(df_region_train, dir_output, scaler)
|
845
863
|
|
846
864
|
""" Predict """
|
847
865
|
if self.check_yield_trend:
|
@@ -1040,17 +1058,27 @@ class Geocif:
|
|
1040
1058
|
|
1041
1059
|
if self.median_area_as_feature:
|
1042
1060
|
df = fe.compute_median_statistics(
|
1043
|
-
df,
|
1061
|
+
df,
|
1062
|
+
self.all_seasons_with_yield,
|
1063
|
+
self.number_median_years,
|
1064
|
+
"Area (ha)"
|
1044
1065
|
)
|
1045
1066
|
|
1046
1067
|
if self.lag_yield_as_feature:
|
1047
1068
|
df = fe.compute_lag_yield(
|
1048
|
-
df,
|
1069
|
+
df,
|
1070
|
+
self.all_seasons_with_yield,
|
1071
|
+
self.forecast_season,
|
1072
|
+
self.number_lag_years,
|
1073
|
+
self.target
|
1049
1074
|
)
|
1050
1075
|
|
1051
1076
|
if self.analogous_year_yield_as_feature:
|
1052
1077
|
df = fe.compute_analogous_yield(
|
1053
|
-
df,
|
1078
|
+
df,
|
1079
|
+
self.all_seasons_with_yield,
|
1080
|
+
self.number_median_years,
|
1081
|
+
self.target
|
1054
1082
|
)
|
1055
1083
|
|
1056
1084
|
# Create Region_ID column based on Region column category code
|
@@ -1066,6 +1094,8 @@ class Geocif:
|
|
1066
1094
|
|
1067
1095
|
# Region_ID should be type category
|
1068
1096
|
df["Region_ID"] = df["Region_ID"].astype("category")
|
1097
|
+
else:
|
1098
|
+
raise ValueError(f"Unsupported cluster strategy {self.cluster_strategy}")
|
1069
1099
|
|
1070
1100
|
return df
|
1071
1101
|
|
@@ -1247,7 +1277,10 @@ class Geocif:
|
|
1247
1277
|
)
|
1248
1278
|
pbar.update()
|
1249
1279
|
|
1250
|
-
|
1280
|
+
try:
|
1281
|
+
self.loop_ml(stage, dict_selected_features, dict_best_cei)
|
1282
|
+
except Exception as e:
|
1283
|
+
self.logger.error(e)
|
1251
1284
|
wandb.finish()
|
1252
1285
|
|
1253
1286
|
def setup(self, forecast_season, model):
|
@@ -165,7 +165,7 @@ class cei_runner(base.BaseGeo):
|
|
165
165
|
combinations = [
|
166
166
|
i
|
167
167
|
for i in combinations
|
168
|
-
if "
|
168
|
+
if "ethiopia" in i[3]
|
169
169
|
# or "lesotho_maize" in i[3] or
|
170
170
|
# # "namibia_" in i[2] or
|
171
171
|
# "united_republic_of_tanzania_maize" in i[3]
|
@@ -174,13 +174,13 @@ class cei_runner(base.BaseGeo):
|
|
174
174
|
# or "south_africa_maize" in i[3]
|
175
175
|
# or "mozambique_maize" in i[3]
|
176
176
|
# or "united_states_of_america" in i[3]
|
177
|
-
or "russian_federation" in i[3]
|
177
|
+
#or "russian_federation" in i[3]
|
178
178
|
# or "ukraine" in i[3]
|
179
179
|
]
|
180
180
|
# "malawi" in i[2]]
|
181
181
|
|
182
182
|
if self.do_parallel:
|
183
|
-
num_cpu = int(cpu_count() * 0.
|
183
|
+
num_cpu = int(cpu_count() * 0.75)
|
184
184
|
with Pool(num_cpu) as p:
|
185
185
|
for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
|
186
186
|
pass
|
@@ -3,7 +3,6 @@ import os
|
|
3
3
|
import matplotlib.pyplot as plt
|
4
4
|
import palettable as pal
|
5
5
|
import pandas as pd
|
6
|
-
import seaborn as sns
|
7
6
|
from tqdm import tqdm
|
8
7
|
|
9
8
|
from geocif import utils
|
@@ -68,6 +67,8 @@ def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
|
|
68
67
|
|
69
68
|
|
70
69
|
def plot_feature_corr_by_time(df, **kwargs):
|
70
|
+
import seaborn as sns
|
71
|
+
|
71
72
|
country = kwargs.get("country")
|
72
73
|
crop = kwargs.get("crop")
|
73
74
|
dir_output = kwargs.get("dir_output")
|
@@ -295,16 +296,13 @@ def all_correlated_feature_by_time(df, **kwargs):
|
|
295
296
|
df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]
|
296
297
|
|
297
298
|
# Compute median of each CEI and sort the dataframe based on the absolute value of the median
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
)
|
306
|
-
except:
|
307
|
-
breakpoint()
|
299
|
+
dict_best_cei[region_id] = (
|
300
|
+
df_tmp2.groupby("Type")
|
301
|
+
.max()
|
302
|
+
.reset_index()
|
303
|
+
.sort_values("Value", ascending=False)["Metric"]
|
304
|
+
.values
|
305
|
+
)
|
308
306
|
|
309
307
|
kwargs["region_id"] = region_id
|
310
308
|
_region_names = ", ".join([str(x) for x in group['Region'].unique()])
|
@@ -3,6 +3,7 @@ from collections import Counter
|
|
3
3
|
import numpy as np
|
4
4
|
import pandas as pd
|
5
5
|
from scipy.stats import pearsonr as pearsonr
|
6
|
+
from tqdm import tqdm
|
6
7
|
|
7
8
|
|
8
9
|
def extract_regions(X, y, regions=[]):
|
@@ -32,10 +33,7 @@ def _compute_correlations(X, y):
|
|
32
33
|
f_series = X[feature]
|
33
34
|
|
34
35
|
# Ignore NaN values in either y or f_series
|
35
|
-
|
36
|
-
mask = ~(np.isnan(y) | np.isnan(f_series))
|
37
|
-
except:
|
38
|
-
breakpoint()
|
36
|
+
mask = ~(np.isnan(y) | np.isnan(f_series))
|
39
37
|
y_filtered = y[mask]
|
40
38
|
f_series_filtered = f_series[mask]
|
41
39
|
|
@@ -107,57 +105,47 @@ def get_top_correlated_features(inputs, targets):
|
|
107
105
|
return feature_by_region, counter
|
108
106
|
|
109
107
|
|
110
|
-
def get_all_features_correlation(inputs
|
108
|
+
def get_all_features_correlation(inputs: pd.DataFrame,
|
109
|
+
targets: pd.Series,
|
110
|
+
method: str) -> pd.DataFrame:
|
111
111
|
"""
|
112
|
-
|
113
|
-
|
114
|
-
:param targets: pd.Series, target data
|
115
|
-
:param method: str, method to use to find the top correlated features
|
112
|
+
Fast version – identical output, no length-mismatch on regions whose
|
113
|
+
feature names contain no spaces.
|
116
114
|
"""
|
117
|
-
|
118
|
-
for region_id in inputs["Region"].unique():
|
119
|
-
X, y = extract_regions(inputs, targets, regions=[region_id])
|
115
|
+
numeric_cols = inputs.select_dtypes(include=[np.number]).columns.tolist()
|
120
116
|
|
121
|
-
|
117
|
+
df_all = inputs[numeric_cols + ["Region"]].copy()
|
118
|
+
df_all["__target__"] = targets.values
|
122
119
|
|
123
|
-
|
124
|
-
feature_correlations = {
|
125
|
-
k: v for k, v in feature_correlations.items() if not np.isnan(v)
|
126
|
-
}
|
120
|
+
frames: list[pd.DataFrame] = []
|
127
121
|
|
128
|
-
|
122
|
+
for region_id, g in tqdm(df_all.groupby("Region", sort=False), leave=False):
|
123
|
+
corr = g[numeric_cols].corrwith(g["__target__"]).round(3).dropna()
|
124
|
+
if corr.empty:
|
129
125
|
continue
|
130
126
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
)
|
150
|
-
|
151
|
-
|
152
|
-
cols = df_pivoted.columns.tolist()
|
153
|
-
cols = cols[-1:] + cols[:-1]
|
154
|
-
df_pivoted = df_pivoted[cols]
|
127
|
+
# ---- safe split: always two columns --------------------------------
|
128
|
+
split = (
|
129
|
+
pd.Series(corr.index) # guarantees a Series
|
130
|
+
.str.split(" ", n=1, expand=True)
|
131
|
+
)
|
132
|
+
if split.shape[1] == 1: # no spaces in any feature name
|
133
|
+
split[1] = "" # match legacy behaviour
|
134
|
+
split.columns = [0, 1] # make column labels predictable
|
135
|
+
|
136
|
+
df_region = (
|
137
|
+
pd.DataFrame({
|
138
|
+
"Metric": split[0].values,
|
139
|
+
method: split[1].values,
|
140
|
+
"Value": corr.values # same length as above
|
141
|
+
})
|
142
|
+
.pivot_table(index=method, columns="Metric",
|
143
|
+
values="Value", aggfunc="first")
|
144
|
+
.reset_index()
|
145
|
+
)
|
146
|
+
df_region.insert(0, "Region", region_id)
|
147
|
+
frames.append(df_region)
|
155
148
|
|
156
|
-
|
149
|
+
return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
|
157
150
|
|
158
|
-
if len(frames):
|
159
|
-
feature_by_region = pd.concat(frames)
|
160
|
-
else:
|
161
|
-
feature_by_region = pd.DataFrame()
|
162
151
|
|
163
|
-
return feature_by_region
|
@@ -39,23 +39,32 @@ def compute_last_year_yield(df, target_col="Yield (tn per ha)"):
|
|
39
39
|
|
40
40
|
return df
|
41
41
|
|
42
|
-
def compute_closest_years(all_years, harvest_year, number_lag_years):
|
42
|
+
def compute_closest_years(all_years, harvest_year, number_lag_years, only_historic=False):
|
43
43
|
"""
|
44
44
|
Finds the historical years closest to a given harvest year,
|
45
|
-
excluding any future year (harvest_year itself and beyond).
|
45
|
+
excluding any future year (harvest_year itself and beyond) based on the only_historic flag.
|
46
46
|
|
47
47
|
Args:
|
48
48
|
all_years (array-like): List or array of all years to consider.
|
49
49
|
harvest_year (int): The year from which to compute distance.
|
50
50
|
number_lag_years (int): Number of closest years to return.
|
51
|
+
only_historic (bool): If True, only consider years before the harvest year.
|
51
52
|
|
52
53
|
Returns:
|
53
54
|
list: The historical years closest to the given harvest year.
|
54
55
|
Returns an empty list if no historical years exist.
|
55
56
|
"""
|
56
57
|
# Exclude the harvest year before computation to simplify logic
|
57
|
-
|
58
|
+
if only_historic:
|
59
|
+
filtered_years = [year for year in all_years if year < harvest_year]
|
60
|
+
else:
|
61
|
+
filtered_years = [year for year in all_years if year != harvest_year]
|
62
|
+
|
63
|
+
# If no historical years exist, return an empty list
|
64
|
+
if not filtered_years:
|
65
|
+
return []
|
58
66
|
|
67
|
+
# Sort the years based on their absolute difference from the harvest year
|
59
68
|
closest_years = np.array(filtered_years)[
|
60
69
|
np.argsort(np.abs(np.array(filtered_years) - harvest_year))[:number_lag_years]
|
61
70
|
]
|
@@ -150,7 +159,7 @@ def compute_user_median_statistics(df, user_years, target_col="Yield (tn per ha)
|
|
150
159
|
|
151
160
|
|
152
161
|
def compute_lag_yield(
|
153
|
-
df, all_seasons_with_yield, number_lag_years, target_col="Yield (tn per ha)"
|
162
|
+
df, all_seasons_with_yield, forecast_season, number_lag_years, target_col="Yield (tn per ha)"
|
154
163
|
):
|
155
164
|
# For the number of years specified in self.number_lag_years, add the yield of that number of years
|
156
165
|
# ago to the dataframe
|
@@ -169,7 +178,7 @@ def compute_lag_yield(
|
|
169
178
|
|
170
179
|
for harvest_year in unique_years:
|
171
180
|
closest_years = compute_closest_years(
|
172
|
-
all_seasons_with_yield, harvest_year, number_lag_years
|
181
|
+
all_seasons_with_yield, harvest_year, number_lag_years, only_historic=True
|
173
182
|
)
|
174
183
|
|
175
184
|
# For each year in the closest years, add the yield to the dataframe as a new column
|