geocif 0.1.46__tar.gz → 0.1.47__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.46/geocif.egg-info → geocif-0.1.47}/PKG-INFO +1 -1
- {geocif-0.1.46 → geocif-0.1.47}/geocif/analysis.py +7 -5
- {geocif-0.1.46 → geocif-0.1.47}/geocif/experiments.py +3 -9
- {geocif-0.1.46 → geocif-0.1.47}/geocif/geocif.py +204 -42
- {geocif-0.1.46 → geocif-0.1.47}/geocif/indices_runner.py +2 -2
- {geocif-0.1.46 → geocif-0.1.47}/geocif/indices_runner_v2.py +2 -2
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/correlations.py +3 -3
- geocif-0.1.47/geocif/ml/misc.py +33 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/output.py +0 -2
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/stages.py +18 -9
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/trainers.py +22 -0
- {geocif-0.1.46 → geocif-0.1.47/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.46 → geocif-0.1.47}/geocif.egg-info/SOURCES.txt +1 -1
- {geocif-0.1.46 → geocif-0.1.47}/setup.py +1 -1
- geocif-0.1.46/geocif/ml/correlations_backup.py +0 -412
- {geocif-0.1.46 → geocif-0.1.47}/LICENSE +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/MANIFEST.in +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/README.md +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/constants.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/features.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/geo.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/backup/models.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/cei/indices.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/logger.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/stats.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/trend.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/ml/xai.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/playground/automl.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/playground/misc.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/utils.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif/viz/plot.py +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/requirements.txt +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/setup.cfg +0 -0
- {geocif-0.1.46 → geocif-0.1.47}/tests/test_geocif.py +0 -0
--- geocif-0.1.46/geocif/analysis.py
+++ geocif-0.1.47/geocif/analysis.py
@@ -162,8 +162,8 @@ class Geoanalysis:
             return pd.DataFrame(), pd.DataFrame()
 
         df_metrics = self._compute_metrics(df)
-
-
+        df_metrics = self._process_metrics(df_metrics)
+        self._plot_metrics(df_metrics)
 
         df_regional_metrics_by_year = self._compute_regional_metrics(
             df, by="Harvest Year"
@@ -172,8 +172,10 @@ class Geoanalysis:
             df_regional_metrics_by_year
         )
         df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
-
-        self._store_results(
+
+        self._store_results(
+            df_metrics, df_regional_metrics, df_regional_metrics_by_year
+        )
 
         df_national_yield = self._compute_national_yield(df)
         self._plot_national_yield(df_national_yield)
@@ -193,7 +195,7 @@ class Geoanalysis:
             .apply(self.annual_metrics)
             .reset_index()
         )
-
+
         return df_metrics.pivot_table(
             index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
             columns="level_5",
--- geocif-0.1.46/geocif/experiments.py
+++ geocif-0.1.47/geocif/experiments.py
@@ -85,9 +85,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
 
     # Experiment: lag_years
     logger.info("Experiment 3: lag_years")
-    parser = main(
-        inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5]
-    )
+    parser = main(inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5])
 
     # Experiment: lag_yield_as_feature
     logger.info("Experiment 4: lag_yield_as_feature")
@@ -103,9 +101,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
 
     # Experiment: median_years
     logger.info("Experiment 5: median_years")
-    parser = main(
-        inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5]
-    )
+    parser = main(inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5])
 
     # Experiment: median_yield_as_feature
     logger.info("Experiment 6: median_yield_as_feature")
@@ -133,9 +129,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
 
     # Experiment: optimize
     logger.info("Experiment 8: optimize")
-    parser = main(
-        inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False]
-    )
+    parser = main(inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False])
 
 
 if __name__ == "__main__":
--- geocif-0.1.46/geocif/geocif.py
+++ geocif-0.1.47/geocif/geocif.py
@@ -108,7 +108,6 @@ class Geocif:
         Config file: ML
         ====================================================================
         """
-        self.use_ceis = ast.literal_eval(self.parser.get("ML", "use_ceis"))
         self.model_type = self.parser.get("ML", "model_type")
         self.fraction_simulate = self.parser.getint("ML", "fraction_simulate")
         self.analogous_year_yield_as_feature = self.parser.getboolean(
@@ -117,10 +116,10 @@ class Geocif:
         self.plot_map_for_correlation_plot = self.parser.getboolean(
             "ML", "plot_map_for_correlation_plot"
         )
-        self.correlation_threshold = self.parser.getfloat(
-            "ML", "correlation_threshold"
+        self.correlation_threshold = self.parser.getfloat("ML", "correlation_threshold")
+        self.include_lat_lon_as_feature = self.parser.getboolean(
+            "ML", "include_lat_lon_as_feature"
         )
-        self.include_lat_lon = self.parser.getboolean("ML", "include_lat_lon")
         self.spatial_autocorrelation = self.parser.getboolean(
             "ML", "spatial_autocorrelation"
         )
@@ -153,6 +152,9 @@ class Geocif:
             self.parser.get("ML", "cat_features")
         )
 
+        self.use_cumulative_features = self.parser.getboolean(
+            "DEFAULT", "use_cumulative_features"
+        )
         """
         ====================================================================
         Variables, Paths
@@ -198,6 +200,9 @@ class Geocif:
 
         self.db_path = self.dir_db / self.db_forecasts
 
+        # Store config file in database
+        output.config_to_db(self.db_path, self.parser, self.today)
+
         # self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
         # obj_pickle = outlook.Outlook(self.pickle_file)
         # self.df_outlook = obj_pickle.read_outlook_file()
@@ -224,18 +229,29 @@ class Geocif:
         y_train = df_region[target_col]
 
         if self.ml_model:
-            self.
-
-
-
-
+            if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+                all_features = X_train.columns
+
+                # Select the columns with use_ceis in it
+                self.selected_features = [
+                    column
+                    for column in all_features
+                    if any(cei in column for cei in self.use_ceis)
+                ]
+            else:
+                self.logger.info(f"Selecting features for {self.country} {self.crop}")
+                selector, _, self.selected_features = fs.select_features(
+                    X_train, y_train, method=self.feature_selection
+                )
+                self.logger.info(f"Selected features: {self.selected_features}")
 
         """ Update model to include conformal estimates """
-        if "lat" not in self.selected_features and self.include_lat_lon:
+        if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
             self.selected_features.append("lat")
-        if "lon" not in self.selected_features and self.include_lat_lon:
+        if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
             self.selected_features.append("lon")
         X_train = df_region[self.selected_features + self.cat_features]
+
         dir_output = (
             self.dir_analysis
             / self.country
@@ -312,8 +328,38 @@ class Geocif:
                 self.best_hyperparams = {}
             elif self.model_name in ["cubist"]:
                 self.model.fit(X_train, y_train)
-
-
+            elif self.model_name in [
+                "cumulative_1",
+                "cumulative_2",
+                "cumulative_3",
+            ]:
+                from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+                # Standardize the numeric features
+                scaler = StandardScaler()
+                X_numeric = X_train.iloc[:, :3]
+                X_scaled_numeric = pd.DataFrame(
+                    scaler.fit_transform(X_numeric),
+                    columns=X_numeric.columns,
+                    index=X_train.index,
+                )
+
+                # Encode the Region as categorical
+                le = LabelEncoder()
+                X_region = pd.Series(
+                    le.fit_transform(X_train["Region"]),
+                    name="Region",
+                    index=X_train.index,
+                )
+
+                # Combine scaled numeric features and encoded region
+                X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
+
+                self.model.fit(X_train_scaled, y_train)
+        except Exception as e:
+            self.logger.error(
+                f"Error fitting model for {self.country} {self.crop} {e}"
+            )
 
     def predict(self, df_region, scaler=None):
         """
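Note: the new `cumulative_*` branch in `fit` standardizes the first three columns and label-encodes `Region` before fitting. Below is a minimal standalone sketch of that preprocessing; the feature names `feat_a`/`feat_b`/`feat_c` are illustrative only, not taken from the package.

```python
# Sketch of the preprocessing in the new cumulative_* fit branch:
# scale the numeric columns, label-encode Region, then recombine.
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

X_train = pd.DataFrame(
    {
        "feat_a": [1.0, 2.0, 3.0, 4.0],   # illustrative numeric feature
        "feat_b": [10.0, 20.0, 30.0, 40.0],
        "feat_c": [0.1, 0.2, 0.3, 0.4],
        "Region": ["north", "south", "north", "east"],
    }
)

scaler = StandardScaler()
X_numeric = X_train.iloc[:, :3]  # first three columns are the numeric features
X_scaled_numeric = pd.DataFrame(
    scaler.fit_transform(X_numeric),
    columns=X_numeric.columns,
    index=X_train.index,
)

le = LabelEncoder()
X_region = pd.Series(
    le.fit_transform(X_train["Region"]), name="Region", index=X_train.index
)

X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
print(X_train_scaled)
```

Worth noting: the matching `predict` branch below fits a fresh `StandardScaler` and `LabelEncoder` on the test rows rather than reusing the ones fit during training, so train and test features end up standardized against different statistics.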
@@ -360,6 +406,33 @@ class Geocif:
                 X_test, Z_test, clusters_test.astype("object")
             )
             best_hyperparameters = self.model.fe_model.get_params().copy()
+        elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+            from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+            # Standardize the numeric features
+            scaler = StandardScaler()
+            X_numeric = X_test.iloc[:, :3]
+            try:
+                X_scaled_numeric = pd.DataFrame(
+                    scaler.fit_transform(X_numeric),
+                    columns=X_numeric.columns,
+                    index=X_test.index,
+                )
+            except:
+                breakpoint()
+
+            # Encode the Region as categorical
+            le = LabelEncoder()
+            X_region = pd.Series(
+                le.fit_transform(X_test["Region"]),
+                name="Region",
+                index=X_test.index,
+            )
+
+            # Combine scaled numeric features and encoded region
+            X_test_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
+            y_pred = self.model.predict(X_test_scaled)
+            best_hyperparameters = {}  # self.model.get_params().copy()
         elif self.model_name == "geospaNN":
             import torch
             import geospaNN
@@ -501,7 +574,9 @@ class Geocif:
             "Crop",
             "Harvest Year",
             "Stage Name",
+            "Time",
         ]
+
         df.index = df.apply(
             lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
         )
@@ -513,28 +588,37 @@ class Geocif:
 
     def create_feature_names(self, stages_features, selected_features):
         """
+        Create feature names for machine learning stages.
 
         Args:
-            stages_features:
-            selected_features:
+            stages_features (list): List of features for different stages.
+            selected_features (dict): Dictionary of selected features.
 
         Returns:
-
+            None
         """
+        # Assert stages_features is a list
+        assert isinstance(stages_features, list), "stages_features should be a list"
+
         # Clear out feature names
         self.feature_names = []
 
-        """
+        """
+        Select stages that will be used for ML
         1. method = "latest" - Select the latest stage
         2. method = "fraction" - Select a fraction (1-100) of all stages
         """
+        method = "fraction"
+        if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+            method = "latest"
+
         stages_features = stages.select_stages_for_ml(
-            stages_features, method=
+            stages_features, method=method, n=60
         )
 
         for stage in stages_features:
             # Convert each element of stage to str and join with _
-            _stage = "_".join(
+            _stage = "_".join(map(str, stage))
 
             # Create a list appending _stage to each element of combined_keys
             _tmp = [f"{col}_{_stage}" for col in self.combined_keys]
@@ -543,17 +627,33 @@ class Geocif:
             parts = _t.split("_")
             cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
 
-
-
-
-
-
-
-
-
-
-            self.
-
+            try:
+                if self.model_name in [
+                    "cumulative_1",
+                    "cumulative_2",
+                    "cumulative_3",
+                ]:
+                    dict_fn = stages.get_stage_information_dict(_t, self.method)
+                    tmp_col = f"{dict_fn['CEI']}"
+
+                    if tmp_col in self.df_train.columns:
+                        self.feature_names.append(tmp_col)
+                else:
+                    # Check if any element of dict_selected_features is in _t
+                    if selected_features["CEI"].any():
+                        for x in selected_features["CEI"].values:
+                            if x not in cei:
+                                continue
+
+                            dict_fn = stages.get_stage_information_dict(
+                                _t, self.method
+                            )
+                            tmp_col = f"{dict_fn['CEI']} {dict_fn['Stage Name']}"
+
+                            if tmp_col in self.df_train.columns:
+                                self.feature_names.append(tmp_col)
+            except:
+                breakpoint()
         self.feature_names = list(set(self.feature_names))
 
         if self.median_yield_as_feature:
@@ -565,16 +665,14 @@ class Geocif:
             self.feature_names.append(f"t -{i} {self.target}")
 
         if self.analogous_year_yield_as_feature:
-            self.feature_names.append("Analogous Year")
-            self.feature_names.append("Analogous Year Yield")
+            self.feature_names.extend(["Analogous Year", "Analogous Year Yield"])
 
         if self.use_outlook_as_feature:
             self.feature_names.append("FCST")
 
         # Add lat and lon to feature names
-        if self.include_lat_lon:
-            self.feature_names.append("lat")
-            self.feature_names.append("lon")
+        if self.include_lat_lon_as_feature:
+            self.feature_names.extend(["lat", "lon"])
 
         self.selected_features = []
 
@@ -598,6 +696,8 @@ class Geocif:
         for idx, region in enumerate(pbar):
             if self.model_name in ["linear", "gam"]:
                 self.create_feature_names(stages, dict_best_cei[region][0:3].tolist())
+            elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+                self.create_feature_names(stages, {})
             elif self.ml_model:
                 self.create_feature_names(stages, dict_selected_features[region])
             elif self.model_name in ["median"]:
@@ -727,11 +827,52 @@ class Geocif:
         parts = all_cei_columns[-1].split("_")
         cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
 
-        #
-
-
-
-
+        # For each region, find the column with the longest string in cei_column
+        group_by = ["Region"]
+        groups = df.groupby(group_by)
+        if self.use_cumulative_features:
+            frames = []
+            for name, group in groups:
+                # Drop columns with all NaNs
+                group.dropna(axis=1, how="all", inplace=True)
+
+                cei_column = group[
+                    group.columns[group.columns.str.contains(cei)]
+                ].columns
+                max_cei_col = max(cei_column, key=len)
+                self.stage_info = stages.get_stage_information_dict(
+                    max_cei_col, self.method
+                )
+
+                # Subset dataframes to columns that contain self.stage_info["Stage_ID"]
+                all_columns = group.columns[
+                    group.columns.str.contains(self.stage_info["Stage_ID"])
+                ].tolist()
+
+                group = group[
+                    self.fixed_columns
+                    + [self.target]
+                    + self.statistics_columns
+                    + all_columns
+                ]
+                # rename all_columns to self.stage_info["CEI"]
+                group.rename(
+                    columns={
+                        col: stages.get_stage_information_dict(col, self.method)["CEI"]
+                        for col in all_columns
+                    },
+                    inplace=True,
+                )
+
+                frames.append(group)
+
+            df = pd.concat(frames)
+        else:
+            # HACK: Get feature name with GD4 in it to extract first and last stage id and name
+            cei_column = df[df.columns[df.columns.str.contains(cei)]].columns
+            # Select the longest string in cei_column
+            cei_col = max(cei_column, key=len)
+            self.stage_info = stages.get_stage_information_dict(cei_col, self.method)
 
         # Change column name
         # e.g. 'vDTR_7_6_5_4_3_2_1_37_36_35_34_33_32_31' to 'vDTR Mar 1-Oct 27'
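The per-region loop above collapses each region's stage-suffixed CEI columns down to plain CEI names, keeping only the longest observed stage window per region. A small self-contained sketch of that collapse, with invented column names (only the general shape follows the diff):

```python
# Illustrative collapse of stage-suffixed columns to bare CEI names,
# keeping each region's longest stage window. All names are made up.
import pandas as pd

df = pd.DataFrame(
    {
        "Region": ["north", "north", "south", "south"],
        "Yield": [1.2, 1.4, 2.0, 2.1],
        "AUC_NDVI_13_12_11": [0.5, 0.6, None, None],
        "AUC_NDVI_13_12_11_10": [None, None, 0.7, 0.8],
    }
)

frames = []
for name, group in df.groupby("Region"):
    group = group.dropna(axis=1, how="all")  # drop windows unused by this region
    cei_cols = [c for c in group.columns if c.startswith("AUC_NDVI")]
    longest = max(cei_cols, key=len)  # the longest stage window for this region
    group = group.rename(columns={longest: "AUC_NDVI"})
    frames.append(group[["Region", "Yield", "AUC_NDVI"]])

print(pd.concat(frames))
```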
@@ -795,12 +936,14 @@ class Geocif:
 
         mask = self.df_results["Stage_ID"].isin(_stages)
         df = self.df_results[mask]
-
         """ Select which CEI categories to use for ML """
         if "all" in self.use_ceis:
             pass
         else:
-
+            if self.select_cei_by == "Type":
+                df = df[df["Type"].isin(self.use_ceis)]
+            elif self.select_cei_by == "Index":
+                df = df[df["Index"].isin(self.use_ceis)]
 
         """ Convert this dataframe into an ML ready format and save to disk """
         df = self.create_ml_dataframe(df)
@@ -874,6 +1017,8 @@ class Geocif:
         if self.spatial_autocorrelation:
             sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)
 
+        dict_selected_features = {}
+        dict_best_cei = {}
         if self.correlation_plots:
             self.logger.info(f"Correlation plot for {self.country} {self.crop}")
             (
@@ -949,6 +1094,8 @@ class Geocif:
         self.model_name = model
         self.experiment_name = self.parser.get("ML", "experiment_name")
         self.ml_model = self.parser.getboolean(self.model_name, "ML_model")
+        self.select_cei_by = self.parser.get(self.model_name, "select_cei_by")
+        self.use_ceis = ast.literal_eval(self.parser.get(self.model_name, "use_ceis"))
         self.model_names = ast.literal_eval(self.parser.get(self.country, "models"))
         self.optimize = self.parser.getboolean(self.country, "optimize")
         self.fraction_loocv = self.parser.getfloat(self.country, "fraction_loocv")
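The two new reads above mean each model section of the config now carries `select_cei_by` and `use_ceis`. A hypothetical stanza exercising those `parser.get` calls; only the option names come from the diff, the values are invented:

```python
# Hypothetical config matching the new per-model option reads.
import ast
from configparser import ConfigParser

parser = ConfigParser()
parser.read_string(
    """
[cumulative_1]
ML_model = False
select_cei_by = Index
use_ceis = ['AUC_NDVI']
"""
)

print(parser.getboolean("cumulative_1", "ML_model"))        # False
print(parser.get("cumulative_1", "select_cei_by"))          # Index
print(ast.literal_eval(parser.get("cumulative_1", "use_ceis")))  # ['AUC_NDVI']
```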
@@ -960,6 +1107,21 @@ class Geocif:
             self.estimate_ci = False
             self.check_yield_trend = False
             self.estimate_ci_for_all = False
+        elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
+            self.correlation_plots = False
+            self.lag_yield_as_feature = False
+            self.median_yield_as_feature = False
+            self.median_area_as_feature = False
+            self.analogous_year_yield_as_feature = False
+            self.last_year_yield_as_feature = False
+            self.include_lat_lon_as_feature = False
+            self.do_xai = False
+            self.estimate_ci = False
+            self.estimate_ci_for_all = False
+            self.check_yield_trend = False
+            self.cluster_strategy = "single"
+            self.select_cei_by = "Index"
+            self.use_cumulative_features = True
         else:
             self.do_xai = self.parser.getboolean("ML", "do_xai")
             self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
--- geocif-0.1.46/geocif/indices_runner.py
+++ geocif-0.1.47/geocif/indices_runner.py
@@ -173,8 +173,8 @@ class cei_runner(base.BaseGeo):
             or "south_africa_maize" in i[3]
             or "mozambique_maize" in i[3]
             or "united_states_of_america" in i[3]
-
-
+            or "russian_federation" in i[3]
+            or "ukraine" in i[3]
         ]
         # "malawi" in i[2]]
 
--- geocif-0.1.46/geocif/indices_runner_v2.py
+++ geocif-0.1.47/geocif/indices_runner_v2.py
@@ -47,7 +47,7 @@ class cei_runner(base.BaseGeo):
 
         self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
         self.base_dir = Path(
-            r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\
+            r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\illinois"
         )  # Path(self.parser.get("PATHS", "dir_crop_inputs"))
         self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
 
@@ -164,7 +164,7 @@ class cei_runner(base.BaseGeo):
         # Only keep those entries in combinations where the third elemt is
         # mozambique, south_africa, angola or dem_people's_rep_of_korea
         # This is done to test the code for these countries
-        combinations = [i for i in combinations if "
+        combinations = [i for i in combinations if "illinois_maize_s1" in i[3]]
 
         if True:
             num_cpu = int(cpu_count() * 0.5)
--- geocif-0.1.46/geocif/ml/correlations.py
+++ geocif-0.1.47/geocif/ml/correlations.py
@@ -157,8 +157,8 @@ def plot_feature_corr_by_time(df, **kwargs):
     cbar_ax.tick_params(axis="both", which="major", labelsize=5)
 
     _country = country.title().replace("_", " ")
-    _region_name = region_name
-    _crop =
+    _region_name = region_name if not national_correlation else ""
+    _crop = crop.title().replace("_", " ")
     if not national_correlation:
         fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
     else:
@@ -304,7 +304,7 @@ def all_correlated_feature_by_time(df, **kwargs):
             )
 
             kwargs["region_id"] = region_id
-            _region_names = "
+            _region_names = ", ".join([str(x) for x in group['Region'].unique()])
             kwargs["region_name"] = _region_names
             plot_feature_corr_by_time(df_tmp, **kwargs)
             # For each element in dict_best_cei, add the type of the cei
--- /dev/null
+++ geocif-0.1.47/geocif/ml/misc.py
@@ -0,0 +1,33 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from taipy.gui import Gui
+
+# Load the dataset
+file_path = r'D:\Users\ritvik\projects\GEOGLAM\Output\ml\analysis\July_05_2024\russian_federation\maize\cumulative_1\2010\X_train_1.csv'  # Update with the correct file path
+df = pd.read_csv(file_path)
+print(df.head())
+# Define a function to create the plot
+def plot_auc_ndvi(data):
+    fig, ax = plt.subplots(figsize=(14, 8))
+    sns.lineplot(data=data, x="Harvest Year", y="AUC_NDVI Oct 7-Mar 25", hue="Region", marker="o", ax=ax)
+    ax.set_title("Trends of AUC_NDVI by Region (Oct 7 - Mar 25)")
+    ax.set_xlabel("Harvest Year")
+    ax.set_ylabel("AUC_NDVI Oct 7 - Mar 25")
+    ax.legend(title="Region", bbox_to_anchor=(1.05, 1), loc='upper left')
+    plt.show()
+    return fig
+
+# Create the plot and save it
+plot_fig = plot_auc_ndvi(df)
+
+# Define the Taipy page with the plot
+page = """
+# Trends of AUC_NDVI by Region
+
+<|{plot_fig}|chart|>
+"""
+
+# Create and run the GUI
+gui = Gui(page)
+gui.run()
--- geocif-0.1.46/geocif/ml/output.py
+++ geocif-0.1.47/geocif/ml/output.py
@@ -107,7 +107,6 @@ def store(db_path, experiment_id, df, model, model_name):
     try:
         utils.to_db(db_path, experiment_id, df)
     except Exception as e:
-        breakpoint()
         print(f"Error: {e}")
 
     index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
@@ -128,7 +127,6 @@ def store(db_path, experiment_id, df, model, model_name):
         df_model.index.set_names(["Index"], inplace=True)
         utils.to_db(db_path, "models", df_model)
     except Exception as e:
-        breakpoint()
         print(f"Error: {e}")
 
     con.commit()
--- geocif-0.1.46/geocif/ml/stages.py
+++ geocif-0.1.47/geocif/ml/stages.py
@@ -1,4 +1,5 @@
 import numpy as np
+from typing import Union
 
 from geocif import utils
 
@@ -277,23 +278,31 @@ def update_feature_names(df, method):
     return df
 
 
-def convert_stage_string(stage_info, to_array=True):
+def convert_stage_string(stage_info: Union[str, np.ndarray], to_array: bool = True) -> Union[np.ndarray, str]:
     """
-
-    output: array([13, 12, 11])
-    or vice versa if to_array = False
+    Converts a string of stage information to a numpy array or vice versa.
 
     Args:
-        stage_info:
-        to_array:
+        stage_info: A string of stages separated by underscores or a numpy array of stages e.g. '13_12_11'
+        to_array: A boolean indicating the direction of conversion. If True, converts string to numpy array e.g. array([13, 12, 11])
+            If False, converts numpy array to string.
 
     Returns:
+        A numpy array of stages if to_array is True, or a string of stages if to_array is False.
 
+    Raises:
+        ValueError: If the input format is incorrect.
     """
     if to_array:
-
-
+        if not isinstance(stage_info, str):
+            raise ValueError("Expected a string for stage_info when to_array is True.")
+        try:
+            stages = np.array([int(stage) for stage in stage_info.split("_")])
+        except ValueError:
+            raise ValueError("Stage info string should contain integers separated by underscores.")
     else:
-
+        if not isinstance(stage_info, np.ndarray):
+            raise ValueError("Expected a numpy array for stage_info when to_array is False.")
+        stages = "_".join(map(str, stage_info))
 
     return stages
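With the tightened type handling above, `convert_stage_string` round-trips between the two representations described in its docstring. A short usage sketch, assuming the geocif package is importable:

```python
# Round-trip through convert_stage_string, mirroring the docstring examples.
import numpy as np
from geocif.ml import stages

arr = stages.convert_stage_string("13_12_11", to_array=True)
print(arr)  # -> array([13, 12, 11])

s = stages.convert_stage_string(np.array([13, 12, 11]), to_array=False)
print(s)  # -> '13_12_11'
```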
--- geocif-0.1.46/geocif/ml/trainers.py
+++ geocif-0.1.47/geocif/ml/trainers.py
@@ -2,6 +2,7 @@ import multiprocessing as mp
 
 import numpy as np
 import optuna
+import pandas as pd
 from catboost import CatBoostRegressor
 from sklearn.metrics import root_mean_squared_error
 from sklearn.model_selection import train_test_split
@@ -289,6 +290,27 @@ def auto_train(
         model = LinearGAM(n_splines=25, spline_order=3).gridsearch(
             X_train.values, y_train.values, lam=np.logspace(-3, 3, 11)
         )
+    elif model_name == "cumulative_1":
+        from pygam import GAM, s, f, te
+
+        # compute index of column Region
+        region_idx = X_train.columns.get_loc("Region")
+
+        model = GAM(s(0) + f(region_idx))
+    elif model_name == "cumulative_2":
+        from pygam import GAM, s, f, te
+
+        # compute index of column Region
+        region_idx = X_train.columns.get_loc("Region")
+
+        model = GAM(s(0) + s(1) + te(0, 1) + f(region_idx))
+    elif model_name == "cumulative_3":
+        from pygam import GAM, s, f, te
+
+        # compute index of column Region
+        region_idx = X_train.columns.get_loc("Region")
+
+        model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(region_idx))
     elif model_name == "geospaNN":
         import torch
         import geospaNN
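The three `cumulative_*` models differ only in their pygam term structure: smooths on the numeric features, tensor interactions between them (for `cumulative_2`/`cumulative_3`), and a factor term on the label-encoded `Region` column. A minimal sketch fitting a `cumulative_2`-style model on synthetic data; the feature meanings here are illustrative only:

```python
# cumulative_2-style pygam model on synthetic data: splines on two numeric
# features, their tensor interaction, and a factor term for Region.
import numpy as np
from pygam import GAM, s, f, te

rng = np.random.default_rng(0)
n = 200
X = np.column_stack(
    [
        rng.normal(size=n),          # column 0: numeric feature
        rng.normal(size=n),          # column 1: numeric feature
        rng.integers(0, 4, size=n),  # column 2: label-encoded Region
    ]
)
y = 2.0 * X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.1, size=n)

model = GAM(s(0) + s(1) + te(0, 1) + f(2))
model.fit(X, y)
print(model.predict(X[:5]))
```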
--- geocif-0.1.46/geocif.egg-info/SOURCES.txt
+++ geocif-0.1.47/geocif.egg-info/SOURCES.txt
@@ -33,10 +33,10 @@ geocif/cei/definitions.py
 geocif/cei/indices.py
 geocif/ml/__init__.py
 geocif/ml/correlations.py
-geocif/ml/correlations_backup.py
 geocif/ml/embedding.py
 geocif/ml/feature_engineering.py
 geocif/ml/feature_selection.py
+geocif/ml/misc.py
 geocif/ml/outliers.py
 geocif/ml/outlook.py
 geocif/ml/output.py
--- geocif-0.1.46/geocif/ml/correlations_backup.py
+++ /dev/null
@@ -1,412 +0,0 @@
-import os
-
-import matplotlib.pyplot as plt
-import palettable as pal
-import pandas as pd
-import seaborn as sns
-from tqdm import tqdm
-
-from geocif import utils
-from geocif.ml import embedding
-from geocif.ml import stages
-
-
-def most_correlated_feature_by_time(df_train, simulation_stages, target_col):
-    """
-
-    Args:
-        df_train:
-        simulation_stages:
-        target_col:
-
-    Returns:
-
-    """
-    frames = []
-
-    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
-
-    # Only select columns that have been observed till the current stage
-    for stage in tqdm(stages, leave=False, desc="Compute most correlated feature"):
-        current_feature_set = [
-            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
-        ]
-
-        # Get the most correlated feature for each region
-        top_feature_by_region, counter = embedding.get_top_correlated_features(
-            df_train[current_feature_set + ["Region"]],
-            df_train[target_col],
-        )
-
-        # Create a dataframe with the most common top feature and number of occurrences over timestep
-        _feature = counter.most_common(1)[0][0]
-        # Loop through top_feature_by_region and find the average score for _feature
-        # Calculate the average score for 'DTR_36'
-        _feature_scores = [
-            value[1][0]
-            for key, value in top_feature_by_region.items()
-            if _feature in value[0]
-        ]
-        average_score = sum(_feature_scores) / len(_feature_scores)
-        _feature = utils.remove_last_part(_feature)
-
-        df = pd.DataFrame(
-            {
-                "Stage": [stage[-1]],
-                "Date": [utils.dict_growth_stages[stage[-1]]],
-                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
-                "Feature Category": [_feature],
-                "Score": [average_score],
-                # "Type": [ci.dict_indices[_feature][0]],
-                "Number of Occurrences": [counter.most_common(1)[0][1]],
-                # "Current Feature Set": [current_feature_set],
-            }
-        )
-        frames.append(df)
-
-    df_most_corr_feature_by_time = pd.concat(frames)
-
-
-def plot_feature_corr_by_time(df, **kwargs):
-    country = kwargs.get("country")
-    crop = kwargs.get("crop")
-    dir_output = kwargs.get("dir_output")
-    forecast_season = kwargs.get("forecast_season")
-    national_correlation = kwargs.get("national_correlation")
-    group_by = kwargs.get("groupby")
-
-    # Setup the figure and gridspec
-    fig = plt.figure(figsize=(10, 5))
-    gs = fig.add_gridspec(
-        3, 2, height_ratios=[6, 5, 1], width_ratios=[5, 1.5], hspace=0.6, wspace=0.0
-    )
-
-    # Assign subplots
-    ax_heatmap = fig.add_subplot(gs[0:2, 0])
-    ax_map = fig.add_subplot(gs[0, 1])
-    cbar_ax = fig.add_subplot(gs[2, 0])
-    ax4 = fig.add_subplot(gs[2, 1])
-
-    # Transpose and reverse the columns of the dataframe
-    #breakpoint()
-    ## Only select foll. columns:
-
-    df = df[
-        [
-            "TG",
-            "TG10p",
-            "DTR",
-            "vDTR",
-            "R99p",
-            "RX5day",
-            "MEAN_ESI4WK",
-        ]
-    ]
-    df_transpose = df.T
-    df = df_transpose[df_transpose.columns[::-1]]
-
-    # Split column names and only use value before space
-    df.columns = df.columns.str.split(" ").str[0]
-    # In row names, replace ESI4WK by ES
-    df.index = df.index.str.replace("MEAN_ESI4WK", "ZScore_ES")
-    df.index = df.index.str.replace("R99p", "MEAN_SM")
-    df.index = df.index.str.replace("RX5day", "AUC_SM")
-    # Remove the last row
-    # Select the first, third and fifth column
-    df = df[["Dec", "Feb", "Apr"]]
-    # Rename Dec to Planting - Early Vegetative
-    # Rename Feb to Early Vegetative - Senescence
-    # Rename Apr to Senescence - Harvest
-    df.columns = ["Planting - Early Vegetative", "Early Vegetative - Senescence", "Senescence - Harvest"]
-    ax_heatmap = sns.heatmap(
-        df,
-        ax=ax_heatmap,
-        annot=True,
-        cmap=pal.cartocolors.diverging.Earth_5.get_mpl_colormap(),
-        fmt=".2f",
-        square=False,
-        linewidths=0.5,
-        linecolor="white",
-        cbar_ax=cbar_ax,
-        cbar_kws={"orientation": "horizontal"},  # , "shrink": 0.5},
-        annot_kws={"size": 6},
-        xticklabels=True,
-        yticklabels=True,
-    )
-    ax_heatmap.tick_params(left=False, bottom=False)
-
-    # Plot the map using GeoPandas
-    dg_country = kwargs.get("dg_country")
-
-    ax_map = dg_country.plot(
-        ax=ax_map,
-        color="white",
-        edgecolor="black",
-        linewidth=1.0,
-        facecolor=None,
-        legend=False,
-    )
-
-    if not national_correlation:
-        id = kwargs["region_id"]
-        dg_region = dg_country[dg_country[group_by] == id]
-        ax_map = dg_region.plot(
-            ax=ax_map, color="blue", edgecolor="blue", linewidth=1.0, legend=False
-        )
-        # Set title with color blue
-        ax_map.set_title(f"Region: {id}", color="blue")
-
-    # No colorbar for the map
-    ax_map.axis("off")
-    # Remove borders
-    ax_map.spines["top"].set_visible(False)
-    ax_map.spines["right"].set_visible(False)
-    ax_map.spines["bottom"].set_visible(False)
-    ax_map.spines["left"].set_visible(False)
-    # ax4 should not be visible
-    ax4.axis("off")
-
-    # Add colorbar label
-    # cbar_ax.set_xlabel("Correlation Coefficient", labelpad=3, size="small")
-    cbar_ax.set_title("Correlation Coefficient", loc="left", size="small")
-    ax_heatmap.set_xticklabels(
-        ax_heatmap.get_xticklabels(), size="x-small", rotation=0, fontsize=7
-    )
-    ax_heatmap.set_yticklabels(ax_heatmap.get_yticklabels(), size="x-small", fontsize=7)
-    ax_heatmap.set_xlabel("")
-    ax_heatmap.set_ylabel(" ")
-    # Reduce font size of ticks of colorbar
-    cbar_ax.tick_params(axis="both", which="major", labelsize=6)
-
-    _country = country.title().replace("_", " ")
-    _crop = crop.title().replace("_", " ")
-    if not national_correlation:
-        fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
-    else:
-        fname = f"{country}_{crop}_corr_feature_by_time.png"
-    ax_heatmap.set_title(f"{_country}\n{_crop}")
-
-    # plt.tight_layout()
-    os.makedirs(dir_output, exist_ok=True)
-    plt.savefig(dir_output / fname, dpi=250)
-    plt.close()
-
-
-def _all_correlated_feature_by_time(df, **kwargs):
-    """
-
-    Args:
-        df:
-        **kwargs:
-
-    Returns:
-
-    """
-    frames = []
-    all_stages = kwargs.get("all_stages")
-    target_col = kwargs.get("target_col")
-    method = kwargs.get("method")
-
-    longest_stage = max(all_stages, key=len)
-
-    # Split the original string into a list of its parts
-    longest_stage = longest_stage.split("_")
-
-    # Generate the list of strings as described by the user, removing one element from the start each time
-    stages_features = ["_".join(longest_stage[i:]) for i in range(len(longest_stage))]
-
-    # Drop columns with no yield information
-    df = df.dropna(subset=[target_col])
-
-    # Only select columns that have been observed till the current stage
-    pbar = tqdm(stages_features, total=len(stages_features), leave=False)
-    for stage in pbar:
-        pbar.set_description(f"Calculating correlations")
-        pbar.update()
-
-        stage_name = stages.get_stage_information_dict(f"GD4_{stage}", method)[
-            "Stage Name"
-        ]
-        # starting_stage = stage_name.split("-")[0]
-        current_feature_set = [col for col in df.columns if stage_name in col]
-
-        # Get the most correlated feature for each region
-        df_tmp = embedding.get_all_features_correlation(
-            df[current_feature_set + ["Region"]], df[target_col], method
-        )
-
-        frames.append(df_tmp)
-
-    df_results = pd.concat(frames)
-    if not df_results.empty:
-        # Exclude Region column
-        df_results = df_results.drop(columns="Region")
-        # Groupby Dekad and compute mean of all columns apart from Region
-        df_results = df_results.groupby(method).mean()
-
-        all_stage_names = []
-        for stage in stages_features:
-            _tmp = stages.get_stage_information_dict(f"GD4_{stage}", method)[
-                "Stage Name"
-            ]
-            all_stage_names.append(_tmp)
-
-        df_results = df_results.reindex(all_stage_names)
-
-        # Drop rows with all NaN values
-        df_results = df_results.dropna(how="all")
-
-        # Split the index based on - and only keep the first element
-        df_results.index = df_results.index.str.split("-").str[0]
-
-        return df_results
-    else:
-        return pd.DataFrame()
-
-
-def all_correlated_feature_by_time(df, **kwargs):
-    """
-
-    Args:
-        df:
-        **kwargs:
-
-    Returns:
-
-    """
-    THRESHOLD = 0.1
-    national_correlation = kwargs.get("national_correlation")
-    group_by = kwargs.get("groupby")
-    combined_dict = kwargs.get("combined_dict")
-
-    dict_selected_features = {}
-    dict_best_cei = {}
-
-    if not national_correlation:
-        groups = df.groupby(group_by)
-        for region_id, group in tqdm(
-            groups, desc=f"Compute all correlated feature by {group_by}", leave=False
-        ):
-            df_corr = _all_correlated_feature_by_time(group, **kwargs)
-
-            # Remove columns with more than 50% NaN values
-            df_corr = df_corr.dropna(thresh=len(df_corr) / 2, axis=1)
-
-            if not df_corr.empty:
-                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
-                # Add the columns to dict_selected_features along with the absolute mean value
-                absolute_medians = df_tmp.abs().median()
-
-                # Create a DataFrame to display the column names and their absolute median values
-                absolute_median_df = absolute_medians.reset_index()
-                absolute_median_df.columns = ['CEI', 'Median']
-
-                # Add the CEI and Median value to dict_selected_features
-                dict_selected_features[region_id] = absolute_median_df
-
-                df_tmp2 = (
-                    df_tmp.median(axis=0)
-                    .abs()
-                    .sort_values(ascending=False)
-                    .reset_index()
-                )
-                df_tmp2.columns = ["Metric", "Value"]
-                # Add another column based on Type of Metric
-                for idx, row in df_tmp2.iterrows():
-                    df_tmp2.loc[idx, "Type"] = combined_dict[row[0]][0]
-
-                # Compute median of each CEI and sort the dataframe based on the absolute value of the median
-                dict_best_cei[region_id] = (
-                    df_tmp2.groupby("Type")
-                    .max()
-                    .reset_index()
-                    .sort_values("Value", ascending=False)["Metric"]
-                    .values
-                )
-
-                kwargs["region_id"] = region_id
-                plot_feature_corr_by_time(df_tmp, **kwargs)
-                # For each element in dict_best_cei, add the type of the cei
-            else:
-                # HACK
-                df_corr = _all_correlated_feature_by_time(df, **kwargs)
-
-                df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
-                # Add the columns to dict_selected_features along with the absolute mean value
-                absolute_medians = df_tmp.abs().median()
-
-                # Create a DataFrame to display the column names and their absolute median values
-                absolute_median_df = absolute_medians.reset_index()
-                absolute_median_df.columns = ['CEI', 'Median']
-
-                # Add the CEI and Median value to dict_selected_features
-                dict_selected_features[region_id] = absolute_median_df
-                dict_best_cei[region_id] = {}
-    else:
-        df_corr = _all_correlated_feature_by_time(df, **kwargs)
-        df_tmp = df_corr[df_corr.columns[(abs(df_corr.mean()) > THRESHOLD)]]
-        # Add the columns to dict_selected_features along with the absolute mean value
-        absolute_medians = df_tmp.abs().median()
-
-        # Create a DataFrame to display the column names and their absolute median values
-        absolute_median_df = absolute_medians.reset_index()
-        absolute_median_df.columns = ['CEI', 'Median']
-
-        # Add the CEI and Median value to dict_selected_features
-        dict_selected_features[0] = absolute_median_df
-
-        plot_feature_corr_by_time(df_corr, **kwargs)
-
-    return dict_selected_features, dict_best_cei
-
-
-def feature_correlation_by_time(**kwargs):
-    raise NotImplementedError()
-
-    frames = []
-    simulation_stages = kwargs.get("simulation_stages")
-    df_train = kwargs.get("df_train")
-    target_col = kwargs.get("target_col")
-
-    stages = [simulation_stages[: idx + 1] for idx in range(len(simulation_stages))]
-
-    # Only select columns that have been observed till the current stage
-    for stage in tqdm(stages, leave=False, desc="Compute feature correlation by time"):
-        current_feature_set = [
-            col for col in df_train.columns if col.endswith(f"_{stage[-1]}")
-        ]
-
-        # Get the most correlated feature for each region
-        top_feature_by_region, counter = embedding.compute_feature_correlations(
-            df_train[current_feature_set + ["Region"]],
-            df_train[target_col],
-            "all",
-        )
-
-        # Create a dataframe with the most common top feature and number of occurrences over timestep
-        _feature = counter.most_common(1)[0][0]
-        # Loop through top_feature_by_region and find the average score for _feature
-        # Calculate the average score for 'DTR_36'
-        _feature_scores = [
-            value[1][0]
-            for key, value in top_feature_by_region.items()
-            if _feature in value[0]
-        ]
-        average_score = sum(_feature_scores) / len(_feature_scores)
-        _feature = utils.remove_last_part(_feature)
-
-        df = pd.DataFrame(
-            {
-                "Stage": [stage[-1]],
-                "Date": [utils.dict_growth_stages[stage[-1]]],
-                "Feature with Highest Correlation": [counter.most_common(1)[0][0]],
-                "Feature Category": [_feature],
-                "Score": [average_score],
-                # "Type": [ci.dict_indices[_feature][0]],
-                "Number of Occurrences": [counter.most_common(1)[0][1]],
-                # "Current Feature Set": [current_feature_set],
-            }
-        )
-        frames.append(df)
-
-    df_corr_feature_by_time = pd.concat(frames)