geocif 0.1.45__tar.gz → 0.1.47__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.45/geocif.egg-info → geocif-0.1.47}/PKG-INFO +1 -1
- {geocif-0.1.45 → geocif-0.1.47}/geocif/analysis.py +7 -5
- {geocif-0.1.45 → geocif-0.1.47}/geocif/experiments.py +3 -9
- {geocif-0.1.45 → geocif-0.1.47}/geocif/geocif.py +211 -40
- {geocif-0.1.45 → geocif-0.1.47}/geocif/indices_runner.py +2 -2
- {geocif-0.1.45 → geocif-0.1.47}/geocif/indices_runner_v2.py +2 -2
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/correlations.py +49 -40
- geocif-0.1.47/geocif/ml/misc.py +33 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/output.py +0 -2
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/stages.py +18 -9
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/trainers.py +22 -0
- {geocif-0.1.45 → geocif-0.1.47/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.45 → geocif-0.1.47}/geocif.egg-info/SOURCES.txt +1 -1
- {geocif-0.1.45 → geocif-0.1.47}/setup.py +1 -1
- geocif-0.1.45/geocif/ml/correlations_backup.py +0 -412
- {geocif-0.1.45 → geocif-0.1.47}/LICENSE +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/MANIFEST.in +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/README.md +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/__init__.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/constants.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/features.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/geo.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/backup/models.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/cei/indices.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/logger.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/stats.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/trend.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/ml/xai.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/playground/automl.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/playground/misc.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/utils.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif/viz/plot.py +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/requirements.txt +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/setup.cfg +0 -0
- {geocif-0.1.45 → geocif-0.1.47}/tests/test_geocif.py +0 -0
@@ -162,8 +162,8 @@ class Geoanalysis:
|
|
162
162
|
return pd.DataFrame(), pd.DataFrame()
|
163
163
|
|
164
164
|
df_metrics = self._compute_metrics(df)
|
165
|
-
|
166
|
-
|
165
|
+
df_metrics = self._process_metrics(df_metrics)
|
166
|
+
self._plot_metrics(df_metrics)
|
167
167
|
|
168
168
|
df_regional_metrics_by_year = self._compute_regional_metrics(
|
169
169
|
df, by="Harvest Year"
|
@@ -172,8 +172,10 @@ class Geoanalysis:
|
|
172
172
|
df_regional_metrics_by_year
|
173
173
|
)
|
174
174
|
df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
|
175
|
-
|
176
|
-
self._store_results(
|
175
|
+
|
176
|
+
self._store_results(
|
177
|
+
df_metrics, df_regional_metrics, df_regional_metrics_by_year
|
178
|
+
)
|
177
179
|
|
178
180
|
df_national_yield = self._compute_national_yield(df)
|
179
181
|
self._plot_national_yield(df_national_yield)
|
@@ -193,7 +195,7 @@ class Geoanalysis:
|
|
193
195
|
.apply(self.annual_metrics)
|
194
196
|
.reset_index()
|
195
197
|
)
|
196
|
-
|
198
|
+
|
197
199
|
return df_metrics.pivot_table(
|
198
200
|
index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
|
199
201
|
columns="level_5",
|
@@ -85,9 +85,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
|
|
85
85
|
|
86
86
|
# Experiment: lag_years
|
87
87
|
logger.info("Experiment 3: lag_years")
|
88
|
-
parser = main(
|
89
|
-
inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5]
|
90
|
-
)
|
88
|
+
parser = main(inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5])
|
91
89
|
|
92
90
|
# Experiment: lag_yield_as_feature
|
93
91
|
logger.info("Experiment 4: lag_yield_as_feature")
|
@@ -103,9 +101,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
|
|
103
101
|
|
104
102
|
# Experiment: median_years
|
105
103
|
logger.info("Experiment 5: median_years")
|
106
|
-
parser = main(
|
107
|
-
inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5]
|
108
|
-
)
|
104
|
+
parser = main(inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5])
|
109
105
|
|
110
106
|
# Experiment: median_yield_as_feature
|
111
107
|
logger.info("Experiment 6: median_yield_as_feature")
|
@@ -133,9 +129,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
|
|
133
129
|
|
134
130
|
# Experiment: optimize
|
135
131
|
logger.info("Experiment 8: optimize")
|
136
|
-
parser = main(
|
137
|
-
inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False]
|
138
|
-
)
|
132
|
+
parser = main(inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False])
|
139
133
|
|
140
134
|
|
141
135
|
if __name__ == "__main__":
|
@@ -108,13 +108,18 @@ class Geocif:
|
|
108
108
|
Config file: ML
|
109
109
|
====================================================================
|
110
110
|
"""
|
111
|
-
self.use_ceis = ast.literal_eval(self.parser.get("ML", "use_ceis"))
|
112
111
|
self.model_type = self.parser.get("ML", "model_type")
|
113
112
|
self.fraction_simulate = self.parser.getint("ML", "fraction_simulate")
|
114
113
|
self.analogous_year_yield_as_feature = self.parser.getboolean(
|
115
114
|
"ML", "analogous_year_yield_as_feature"
|
116
115
|
)
|
117
|
-
self.
|
116
|
+
self.plot_map_for_correlation_plot = self.parser.getboolean(
|
117
|
+
"ML", "plot_map_for_correlation_plot"
|
118
|
+
)
|
119
|
+
self.correlation_threshold = self.parser.getfloat("ML", "correlation_threshold")
|
120
|
+
self.include_lat_lon_as_feature = self.parser.getboolean(
|
121
|
+
"ML", "include_lat_lon_as_feature"
|
122
|
+
)
|
118
123
|
self.spatial_autocorrelation = self.parser.getboolean(
|
119
124
|
"ML", "spatial_autocorrelation"
|
120
125
|
)
|
@@ -147,6 +152,9 @@ class Geocif:
|
|
147
152
|
self.parser.get("ML", "cat_features")
|
148
153
|
)
|
149
154
|
|
155
|
+
self.use_cumulative_features = self.parser.getboolean(
|
156
|
+
"DEFAULT", "use_cumulative_features"
|
157
|
+
)
|
150
158
|
"""
|
151
159
|
====================================================================
|
152
160
|
Variables, Paths
|
@@ -192,6 +200,9 @@ class Geocif:
|
|
192
200
|
|
193
201
|
self.db_path = self.dir_db / self.db_forecasts
|
194
202
|
|
203
|
+
# Store config file in database
|
204
|
+
output.config_to_db(self.db_path, self.parser, self.today)
|
205
|
+
|
195
206
|
# self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
|
196
207
|
# obj_pickle = outlook.Outlook(self.pickle_file)
|
197
208
|
# self.df_outlook = obj_pickle.read_outlook_file()
|
@@ -218,18 +229,29 @@ class Geocif:
|
|
218
229
|
y_train = df_region[target_col]
|
219
230
|
|
220
231
|
if self.ml_model:
|
221
|
-
self.
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
232
|
+
if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
233
|
+
all_features = X_train.columns
|
234
|
+
|
235
|
+
# Select the columns with use_ceis in it
|
236
|
+
self.selected_features = [
|
237
|
+
column
|
238
|
+
for column in all_features
|
239
|
+
if any(cei in column for cei in self.use_ceis)
|
240
|
+
]
|
241
|
+
else:
|
242
|
+
self.logger.info(f"Selecting features for {self.country} {self.crop}")
|
243
|
+
selector, _, self.selected_features = fs.select_features(
|
244
|
+
X_train, y_train, method=self.feature_selection
|
245
|
+
)
|
246
|
+
self.logger.info(f"Selected features: {self.selected_features}")
|
226
247
|
|
227
248
|
""" Update model to include conformal estimates """
|
228
|
-
if "lat" not in self.selected_features and self.
|
249
|
+
if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
|
229
250
|
self.selected_features.append("lat")
|
230
|
-
if "lon" not in self.selected_features and self.
|
251
|
+
if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
|
231
252
|
self.selected_features.append("lon")
|
232
253
|
X_train = df_region[self.selected_features + self.cat_features]
|
254
|
+
|
233
255
|
dir_output = (
|
234
256
|
self.dir_analysis
|
235
257
|
/ self.country
|
@@ -306,8 +328,38 @@ class Geocif:
|
|
306
328
|
self.best_hyperparams = {}
|
307
329
|
elif self.model_name in ["cubist"]:
|
308
330
|
self.model.fit(X_train, y_train)
|
309
|
-
|
310
|
-
|
331
|
+
elif self.model_name in [
|
332
|
+
"cumulative_1",
|
333
|
+
"cumulative_2",
|
334
|
+
"cumulative_3",
|
335
|
+
]:
|
336
|
+
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
337
|
+
|
338
|
+
# Standardize the numeric features
|
339
|
+
scaler = StandardScaler()
|
340
|
+
X_numeric = X_train.iloc[:, :3]
|
341
|
+
X_scaled_numeric = pd.DataFrame(
|
342
|
+
scaler.fit_transform(X_numeric),
|
343
|
+
columns=X_numeric.columns,
|
344
|
+
index=X_train.index,
|
345
|
+
)
|
346
|
+
|
347
|
+
# Encode the Region as categorical
|
348
|
+
le = LabelEncoder()
|
349
|
+
X_region = pd.Series(
|
350
|
+
le.fit_transform(X_train["Region"]),
|
351
|
+
name="Region",
|
352
|
+
index=X_train.index,
|
353
|
+
)
|
354
|
+
|
355
|
+
# Combine scaled numeric features and encoded region
|
356
|
+
X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
|
357
|
+
|
358
|
+
self.model.fit(X_train_scaled, y_train)
|
359
|
+
except Exception as e:
|
360
|
+
self.logger.error(
|
361
|
+
f"Error fitting model for {self.country} {self.crop} {e}"
|
362
|
+
)
|
311
363
|
|
312
364
|
def predict(self, df_region, scaler=None):
|
313
365
|
"""
|
@@ -354,6 +406,33 @@ class Geocif:
|
|
354
406
|
X_test, Z_test, clusters_test.astype("object")
|
355
407
|
)
|
356
408
|
best_hyperparameters = self.model.fe_model.get_params().copy()
|
409
|
+
elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
410
|
+
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
411
|
+
|
412
|
+
# Standardize the numeric features
|
413
|
+
scaler = StandardScaler()
|
414
|
+
X_numeric = X_test.iloc[:, :3]
|
415
|
+
try:
|
416
|
+
X_scaled_numeric = pd.DataFrame(
|
417
|
+
scaler.fit_transform(X_numeric),
|
418
|
+
columns=X_numeric.columns,
|
419
|
+
index=X_test.index,
|
420
|
+
)
|
421
|
+
except:
|
422
|
+
breakpoint()
|
423
|
+
|
424
|
+
# Encode the Region as categorical
|
425
|
+
le = LabelEncoder()
|
426
|
+
X_region = pd.Series(
|
427
|
+
le.fit_transform(X_test["Region"]),
|
428
|
+
name="Region",
|
429
|
+
index=X_test.index,
|
430
|
+
)
|
431
|
+
|
432
|
+
# Combine scaled numeric features and encoded region
|
433
|
+
X_test_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
|
434
|
+
y_pred = self.model.predict(X_test_scaled)
|
435
|
+
best_hyperparameters = {} # self.model.get_params().copy()
|
357
436
|
elif self.model_name == "geospaNN":
|
358
437
|
import torch
|
359
438
|
import geospaNN
|
@@ -495,7 +574,9 @@ class Geocif:
|
|
495
574
|
"Crop",
|
496
575
|
"Harvest Year",
|
497
576
|
"Stage Name",
|
577
|
+
"Time",
|
498
578
|
]
|
579
|
+
|
499
580
|
df.index = df.apply(
|
500
581
|
lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
|
501
582
|
)
|
@@ -507,28 +588,37 @@ class Geocif:
|
|
507
588
|
|
508
589
|
def create_feature_names(self, stages_features, selected_features):
|
509
590
|
"""
|
591
|
+
Create feature names for machine learning stages.
|
510
592
|
|
511
593
|
Args:
|
512
|
-
stages_features:
|
513
|
-
selected_features:
|
594
|
+
stages_features (list): List of features for different stages.
|
595
|
+
selected_features (dict): Dictionary of selected features.
|
514
596
|
|
515
597
|
Returns:
|
516
|
-
|
598
|
+
None
|
517
599
|
"""
|
600
|
+
# Assert stages_features is a list
|
601
|
+
assert isinstance(stages_features, list), "stages_features should be a list"
|
602
|
+
|
518
603
|
# Clear out feature names
|
519
604
|
self.feature_names = []
|
520
605
|
|
521
|
-
"""
|
606
|
+
"""
|
607
|
+
Select stages that will be used for ML
|
522
608
|
1. method = "latest" - Select the latest stage
|
523
609
|
2. method = "fraction" - Select a fraction (1-100) of all stages
|
524
610
|
"""
|
611
|
+
method = "fraction"
|
612
|
+
if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
613
|
+
method = "latest"
|
614
|
+
|
525
615
|
stages_features = stages.select_stages_for_ml(
|
526
|
-
stages_features, method=
|
616
|
+
stages_features, method=method, n=60
|
527
617
|
)
|
528
618
|
|
529
619
|
for stage in stages_features:
|
530
620
|
# Convert each element of stage to str and join with _
|
531
|
-
_stage = "_".join(
|
621
|
+
_stage = "_".join(map(str, stage))
|
532
622
|
|
533
623
|
# Create a list appending _stage to each element of combined_keys
|
534
624
|
_tmp = [f"{col}_{_stage}" for col in self.combined_keys]
|
@@ -537,17 +627,33 @@ class Geocif:
|
|
537
627
|
parts = _t.split("_")
|
538
628
|
cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
|
539
629
|
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
self.
|
550
|
-
|
630
|
+
try:
|
631
|
+
if self.model_name in [
|
632
|
+
"cumulative_1",
|
633
|
+
"cumulative_2",
|
634
|
+
"cumulative_3",
|
635
|
+
]:
|
636
|
+
dict_fn = stages.get_stage_information_dict(_t, self.method)
|
637
|
+
tmp_col = f"{dict_fn['CEI']}"
|
638
|
+
|
639
|
+
if tmp_col in self.df_train.columns:
|
640
|
+
self.feature_names.append(tmp_col)
|
641
|
+
else:
|
642
|
+
# Check if any element of dict_selected_features is in _t
|
643
|
+
if selected_features["CEI"].any():
|
644
|
+
for x in selected_features["CEI"].values:
|
645
|
+
if x not in cei:
|
646
|
+
continue
|
647
|
+
|
648
|
+
dict_fn = stages.get_stage_information_dict(
|
649
|
+
_t, self.method
|
650
|
+
)
|
651
|
+
tmp_col = f"{dict_fn['CEI']} {dict_fn['Stage Name']}"
|
652
|
+
|
653
|
+
if tmp_col in self.df_train.columns:
|
654
|
+
self.feature_names.append(tmp_col)
|
655
|
+
except:
|
656
|
+
breakpoint()
|
551
657
|
self.feature_names = list(set(self.feature_names))
|
552
658
|
|
553
659
|
if self.median_yield_as_feature:
|
@@ -559,16 +665,14 @@ class Geocif:
|
|
559
665
|
self.feature_names.append(f"t -{i} {self.target}")
|
560
666
|
|
561
667
|
if self.analogous_year_yield_as_feature:
|
562
|
-
self.feature_names.
|
563
|
-
self.feature_names.append("Analogous Year Yield")
|
668
|
+
self.feature_names.extend(["Analogous Year", "Analogous Year Yield"])
|
564
669
|
|
565
670
|
if self.use_outlook_as_feature:
|
566
671
|
self.feature_names.append("FCST")
|
567
672
|
|
568
673
|
# Add lat and lon to feature names
|
569
|
-
if self.
|
570
|
-
self.feature_names.
|
571
|
-
self.feature_names.append("lon")
|
674
|
+
if self.include_lat_lon_as_feature:
|
675
|
+
self.feature_names.extend(["lat", "lon"])
|
572
676
|
|
573
677
|
self.selected_features = []
|
574
678
|
|
@@ -592,6 +696,8 @@ class Geocif:
|
|
592
696
|
for idx, region in enumerate(pbar):
|
593
697
|
if self.model_name in ["linear", "gam"]:
|
594
698
|
self.create_feature_names(stages, dict_best_cei[region][0:3].tolist())
|
699
|
+
elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
700
|
+
self.create_feature_names(stages, {})
|
595
701
|
elif self.ml_model:
|
596
702
|
self.create_feature_names(stages, dict_selected_features[region])
|
597
703
|
elif self.model_name in ["median"]:
|
@@ -721,11 +827,52 @@ class Geocif:
|
|
721
827
|
parts = all_cei_columns[-1].split("_")
|
722
828
|
cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
|
723
829
|
|
724
|
-
#
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
830
|
+
# For each region, find the column with the longest string in cei_column
|
831
|
+
group_by = ["Region"]
|
832
|
+
groups = df.groupby(group_by)
|
833
|
+
if self.use_cumulative_features:
|
834
|
+
frames = []
|
835
|
+
for name, group in groups:
|
836
|
+
# Drop columns with all NaNs
|
837
|
+
group.dropna(axis=1, how="all", inplace=True)
|
838
|
+
|
839
|
+
cei_column = group[
|
840
|
+
group.columns[group.columns.str.contains(cei)]
|
841
|
+
].columns
|
842
|
+
max_cei_col = max(cei_column, key=len)
|
843
|
+
self.stage_info = stages.get_stage_information_dict(
|
844
|
+
max_cei_col, self.method
|
845
|
+
)
|
846
|
+
|
847
|
+
# Subset dataframes to columns that contain self.stage_info["Stage_ID"]
|
848
|
+
all_columns = group.columns[
|
849
|
+
group.columns.str.contains(self.stage_info["Stage_ID"])
|
850
|
+
].tolist()
|
851
|
+
|
852
|
+
group = group[
|
853
|
+
self.fixed_columns
|
854
|
+
+ [self.target]
|
855
|
+
+ self.statistics_columns
|
856
|
+
+ all_columns
|
857
|
+
]
|
858
|
+
# rename all_columns to self.stage_info["CEI"]
|
859
|
+
group.rename(
|
860
|
+
columns={
|
861
|
+
col: stages.get_stage_information_dict(col, self.method)["CEI"]
|
862
|
+
for col in all_columns
|
863
|
+
},
|
864
|
+
inplace=True,
|
865
|
+
)
|
866
|
+
|
867
|
+
frames.append(group)
|
868
|
+
|
869
|
+
df = pd.concat(frames)
|
870
|
+
else:
|
871
|
+
# HACK: Get feature name with GD4 in it to extract first and last stage id and name
|
872
|
+
cei_column = df[df.columns[df.columns.str.contains(cei)]].columns
|
873
|
+
# Select the longest string in cei_column
|
874
|
+
cei_col = max(cei_column, key=len)
|
875
|
+
self.stage_info = stages.get_stage_information_dict(cei_col, self.method)
|
729
876
|
|
730
877
|
# Change column name
|
731
878
|
# e.g. 'vDTR_7_6_5_4_3_2_1_37_36_35_34_33_32_31' to 'vDTR Mar 1-Oct 27'
|
@@ -789,12 +936,14 @@ class Geocif:
|
|
789
936
|
|
790
937
|
mask = self.df_results["Stage_ID"].isin(_stages)
|
791
938
|
df = self.df_results[mask]
|
792
|
-
|
793
939
|
""" Select which CEI categories to use for ML """
|
794
940
|
if "all" in self.use_ceis:
|
795
941
|
pass
|
796
942
|
else:
|
797
|
-
|
943
|
+
if self.select_cei_by == "Type":
|
944
|
+
df = df[df["Type"].isin(self.use_ceis)]
|
945
|
+
elif self.select_cei_by == "Index":
|
946
|
+
df = df[df["Index"].isin(self.use_ceis)]
|
798
947
|
|
799
948
|
""" Convert this dataframe into an ML ready format and save to disk """
|
800
949
|
df = self.create_ml_dataframe(df)
|
@@ -859,12 +1008,17 @@ class Geocif:
|
|
859
1008
|
dict_kwargs["method"] = self.method
|
860
1009
|
dict_kwargs["national_correlation"] = self.national_correlation
|
861
1010
|
dict_kwargs["groupby"] = self.correlation_plot_groupby
|
1011
|
+
dict_kwargs["cluster_strategy"] = self.cluster_strategy
|
862
1012
|
dict_kwargs["dg_country"] = self.dg_country
|
863
1013
|
dict_kwargs["combined_dict"] = self.combined_dict
|
1014
|
+
dict_kwargs["plot_map"] = self.plot_map_for_correlation_plot
|
1015
|
+
dict_kwargs["correlation_threshold"] = self.correlation_threshold
|
864
1016
|
|
865
1017
|
if self.spatial_autocorrelation:
|
866
1018
|
sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)
|
867
1019
|
|
1020
|
+
dict_selected_features = {}
|
1021
|
+
dict_best_cei = {}
|
868
1022
|
if self.correlation_plots:
|
869
1023
|
self.logger.info(f"Correlation plot for {self.country} {self.crop}")
|
870
1024
|
(
|
@@ -940,6 +1094,8 @@ class Geocif:
|
|
940
1094
|
self.model_name = model
|
941
1095
|
self.experiment_name = self.parser.get("ML", "experiment_name")
|
942
1096
|
self.ml_model = self.parser.getboolean(self.model_name, "ML_model")
|
1097
|
+
self.select_cei_by = self.parser.get(self.model_name, "select_cei_by")
|
1098
|
+
self.use_ceis = ast.literal_eval(self.parser.get(self.model_name, "use_ceis"))
|
943
1099
|
self.model_names = ast.literal_eval(self.parser.get(self.country, "models"))
|
944
1100
|
self.optimize = self.parser.getboolean(self.country, "optimize")
|
945
1101
|
self.fraction_loocv = self.parser.getfloat(self.country, "fraction_loocv")
|
@@ -951,6 +1107,21 @@ class Geocif:
|
|
951
1107
|
self.estimate_ci = False
|
952
1108
|
self.check_yield_trend = False
|
953
1109
|
self.estimate_ci_for_all = False
|
1110
|
+
elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
1111
|
+
self.correlation_plots = False
|
1112
|
+
self.lag_yield_as_feature = False
|
1113
|
+
self.median_yield_as_feature = False
|
1114
|
+
self.median_area_as_feature = False
|
1115
|
+
self.analogous_year_yield_as_feature = False
|
1116
|
+
self.last_year_yield_as_feature = False
|
1117
|
+
self.include_lat_lon_as_feature = False
|
1118
|
+
self.do_xai = False
|
1119
|
+
self.estimate_ci = False
|
1120
|
+
self.estimate_ci_for_all = False
|
1121
|
+
self.check_yield_trend = False
|
1122
|
+
self.cluster_strategy = "single"
|
1123
|
+
self.select_cei_by = "Index"
|
1124
|
+
self.use_cumulative_features = True
|
954
1125
|
else:
|
955
1126
|
self.do_xai = self.parser.getboolean("ML", "do_xai")
|
956
1127
|
self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
|
@@ -173,8 +173,8 @@ class cei_runner(base.BaseGeo):
|
|
173
173
|
or "south_africa_maize" in i[3]
|
174
174
|
or "mozambique_maize" in i[3]
|
175
175
|
or "united_states_of_america" in i[3]
|
176
|
-
|
177
|
-
|
176
|
+
or "russian_federation" in i[3]
|
177
|
+
or "ukraine" in i[3]
|
178
178
|
]
|
179
179
|
# "malawi" in i[2]]
|
180
180
|
|
@@ -47,7 +47,7 @@ class cei_runner(base.BaseGeo):
|
|
47
47
|
|
48
48
|
self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
|
49
49
|
self.base_dir = Path(
|
50
|
-
r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\
|
50
|
+
r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\illinois"
|
51
51
|
) # Path(self.parser.get("PATHS", "dir_crop_inputs"))
|
52
52
|
self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
|
53
53
|
|
@@ -164,7 +164,7 @@ class cei_runner(base.BaseGeo):
|
|
164
164
|
# Only keep those entries in combinations where the third elemt is
|
165
165
|
# mozambique, south_africa, angola or dem_people's_rep_of_korea
|
166
166
|
# This is done to test the code for these countries
|
167
|
-
combinations = [i for i in combinations if "
|
167
|
+
combinations = [i for i in combinations if "illinois_maize_s1" in i[3]]
|
168
168
|
|
169
169
|
if True:
|
170
170
|
num_cpu = int(cpu_count() * 0.5)
|
@@ -74,18 +74,24 @@ def plot_feature_corr_by_time(df, **kwargs):
|
|
74
74
|
forecast_season = kwargs.get("forecast_season")
|
75
75
|
national_correlation = kwargs.get("national_correlation")
|
76
76
|
group_by = kwargs.get("groupby")
|
77
|
+
plot_map = kwargs.get("plot_map")
|
78
|
+
region_name = kwargs.get("region_name")
|
77
79
|
|
78
80
|
# Setup the figure and gridspec
|
79
81
|
fig = plt.figure(figsize=(10, 5))
|
80
|
-
|
81
|
-
|
82
|
-
|
82
|
+
if plot_map:
|
83
|
+
gs = fig.add_gridspec(
|
84
|
+
3, 2, height_ratios=[6, 5, 1], width_ratios=[5, 1.5], hspace=0.6, wspace=0.0
|
85
|
+
)
|
86
|
+
else:
|
87
|
+
gs = fig.add_gridspec(3, 1, height_ratios=[6, 5, 1], hspace=0.6, wspace=0.0)
|
83
88
|
|
84
89
|
# Assign subplots
|
85
90
|
ax_heatmap = fig.add_subplot(gs[0:2, 0])
|
86
|
-
ax_map = fig.add_subplot(gs[0, 1])
|
87
91
|
cbar_ax = fig.add_subplot(gs[2, 0])
|
88
|
-
|
92
|
+
if plot_map:
|
93
|
+
ax_map = fig.add_subplot(gs[0, 1])
|
94
|
+
ax4 = fig.add_subplot(gs[2, 1])
|
89
95
|
|
90
96
|
# Transpose and reverse the columns of the dataframe
|
91
97
|
df_transpose = df.T
|
@@ -107,43 +113,43 @@ def plot_feature_corr_by_time(df, **kwargs):
|
|
107
113
|
)
|
108
114
|
ax_heatmap.tick_params(left=False, bottom=False)
|
109
115
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
if not national_correlation:
|
123
|
-
id = kwargs["region_id"]
|
124
|
-
dg_region = dg_country[dg_country[group_by] == id]
|
125
|
-
ax_map = dg_region.plot(
|
126
|
-
ax=ax_map, color="blue", edgecolor="blue", linewidth=1.0, legend=False
|
116
|
+
if plot_map:
|
117
|
+
# Plot the map using GeoPandas
|
118
|
+
dg_country = kwargs.get("dg_country")
|
119
|
+
|
120
|
+
ax_map = dg_country.plot(
|
121
|
+
ax=ax_map,
|
122
|
+
color="white",
|
123
|
+
edgecolor="black",
|
124
|
+
linewidth=1.0,
|
125
|
+
facecolor=None,
|
126
|
+
legend=False,
|
127
127
|
)
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
128
|
+
|
129
|
+
id = kwargs["region_id"]
|
130
|
+
if plot_map:
|
131
|
+
if not national_correlation:
|
132
|
+
dg_region = dg_country[dg_country[group_by] == id]
|
133
|
+
ax_map = dg_region.plot(
|
134
|
+
ax=ax_map, color="blue", edgecolor="blue", linewidth=1.0, legend=False
|
135
|
+
)
|
136
|
+
# Set title with color blue
|
137
|
+
ax_map.set_title(f"Region: {id}", color="blue")
|
138
|
+
|
139
|
+
# No colorbar for the map
|
140
|
+
ax_map.axis("off")
|
141
|
+
# Remove borders
|
142
|
+
ax_map.spines["top"].set_visible(False)
|
143
|
+
ax_map.spines["right"].set_visible(False)
|
144
|
+
ax_map.spines["bottom"].set_visible(False)
|
145
|
+
ax_map.spines["left"].set_visible(False)
|
146
|
+
# ax4 should not be visible
|
147
|
+
ax4.axis("off")
|
140
148
|
|
141
149
|
# Add colorbar label
|
142
150
|
# cbar_ax.set_xlabel("Correlation Coefficient", labelpad=3, size="small")
|
143
151
|
cbar_ax.set_title("Correlation Coefficient", loc="left", size="small")
|
144
|
-
ax_heatmap.set_xticklabels(
|
145
|
-
ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=5
|
146
|
-
)
|
152
|
+
ax_heatmap.set_xticklabels(ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=5)
|
147
153
|
ax_heatmap.set_yticklabels(ax_heatmap.get_yticklabels(), size="x-small", fontsize=5)
|
148
154
|
ax_heatmap.set_xlabel("")
|
149
155
|
ax_heatmap.set_ylabel(" ")
|
@@ -151,12 +157,13 @@ def plot_feature_corr_by_time(df, **kwargs):
|
|
151
157
|
cbar_ax.tick_params(axis="both", which="major", labelsize=5)
|
152
158
|
|
153
159
|
_country = country.title().replace("_", " ")
|
160
|
+
_region_name = region_name if not national_correlation else ""
|
154
161
|
_crop = crop.title().replace("_", " ")
|
155
162
|
if not national_correlation:
|
156
163
|
fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
|
157
164
|
else:
|
158
165
|
fname = f"{country}_{crop}_corr_feature_by_time.png"
|
159
|
-
ax_heatmap.set_title(f"{_country}\n{
|
166
|
+
ax_heatmap.set_title(f"{_country}, {_crop}\n{_region_name}")
|
160
167
|
|
161
168
|
# plt.tight_layout()
|
162
169
|
os.makedirs(dir_output, exist_ok=True)
|
@@ -246,14 +253,14 @@ def all_correlated_feature_by_time(df, **kwargs):
|
|
246
253
|
Returns:
|
247
254
|
|
248
255
|
"""
|
249
|
-
THRESHOLD = 0.1
|
250
256
|
national_correlation = kwargs.get("national_correlation")
|
251
257
|
group_by = kwargs.get("groupby")
|
252
258
|
combined_dict = kwargs.get("combined_dict")
|
259
|
+
THRESHOLD = kwargs.get("correlation_threshold")
|
253
260
|
|
254
261
|
dict_selected_features = {}
|
255
262
|
dict_best_cei = {}
|
256
|
-
|
263
|
+
|
257
264
|
if not national_correlation:
|
258
265
|
groups = df.groupby(group_by)
|
259
266
|
for region_id, group in tqdm(
|
@@ -297,6 +304,8 @@ def all_correlated_feature_by_time(df, **kwargs):
|
|
297
304
|
)
|
298
305
|
|
299
306
|
kwargs["region_id"] = region_id
|
307
|
+
_region_names = ", ".join([str(x) for x in group['Region'].unique()])
|
308
|
+
kwargs["region_name"] = _region_names
|
300
309
|
plot_feature_corr_by_time(df_tmp, **kwargs)
|
301
310
|
# For each element in dict_best_cei, add the type of the cei
|
302
311
|
else:
|