geocif 0.1.46__tar.gz → 0.1.48__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.46/geocif.egg-info → geocif-0.1.48}/PKG-INFO +1 -1
- {geocif-0.1.46 → geocif-0.1.48}/geocif/analysis.py +7 -5
- {geocif-0.1.46 → geocif-0.1.48}/geocif/experiments.py +3 -9
- {geocif-0.1.46 → geocif-0.1.48}/geocif/geocif.py +206 -44
- {geocif-0.1.46 → geocif-0.1.48}/geocif/indices_runner.py +2 -2
- {geocif-0.1.46 → geocif-0.1.48}/geocif/indices_runner_v2.py +2 -2
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/correlations.py +3 -3
- geocif-0.1.48/geocif/ml/misc.py +33 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/output.py +0 -2
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/stages.py +18 -9
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/trainers.py +39 -2
- {geocif-0.1.46 → geocif-0.1.48/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.46 → geocif-0.1.48}/geocif.egg-info/SOURCES.txt +1 -1
- {geocif-0.1.46 → geocif-0.1.48}/setup.py +1 -1
- geocif-0.1.46/geocif/ml/correlations_backup.py +0 -412
- {geocif-0.1.46 → geocif-0.1.48}/LICENSE +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/MANIFEST.in +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/README.md +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/constants.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/features.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/geo.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/backup/models.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/cei/indices.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/logger.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/stats.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/trend.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/ml/xai.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/playground/automl.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/playground/misc.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/utils.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif/viz/plot.py +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/requirements.txt +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/setup.cfg +0 -0
- {geocif-0.1.46 → geocif-0.1.48}/tests/test_geocif.py +0 -0
@@ -162,8 +162,8 @@ class Geoanalysis:
|
|
162
162
|
return pd.DataFrame(), pd.DataFrame()
|
163
163
|
|
164
164
|
df_metrics = self._compute_metrics(df)
|
165
|
-
|
166
|
-
|
165
|
+
df_metrics = self._process_metrics(df_metrics)
|
166
|
+
self._plot_metrics(df_metrics)
|
167
167
|
|
168
168
|
df_regional_metrics_by_year = self._compute_regional_metrics(
|
169
169
|
df, by="Harvest Year"
|
@@ -172,8 +172,10 @@ class Geoanalysis:
|
|
172
172
|
df_regional_metrics_by_year
|
173
173
|
)
|
174
174
|
df_regional_metrics = self._average_mape(df_regional_metrics_by_year)
|
175
|
-
|
176
|
-
self._store_results(
|
175
|
+
|
176
|
+
self._store_results(
|
177
|
+
df_metrics, df_regional_metrics, df_regional_metrics_by_year
|
178
|
+
)
|
177
179
|
|
178
180
|
df_national_yield = self._compute_national_yield(df)
|
179
181
|
self._plot_national_yield(df_national_yield)
|
@@ -193,7 +195,7 @@ class Geoanalysis:
|
|
193
195
|
.apply(self.annual_metrics)
|
194
196
|
.reset_index()
|
195
197
|
)
|
196
|
-
|
198
|
+
|
197
199
|
return df_metrics.pivot_table(
|
198
200
|
index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
|
199
201
|
columns="level_5",
|
@@ -85,9 +85,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
|
|
85
85
|
|
86
86
|
# Experiment: lag_years
|
87
87
|
logger.info("Experiment 3: lag_years")
|
88
|
-
parser = main(
|
89
|
-
inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5]
|
90
|
-
)
|
88
|
+
parser = main(inputs, logger, parser, "ML", "lag_years", "int", [1, 2, 3, 4, 5])
|
91
89
|
|
92
90
|
# Experiment: lag_yield_as_feature
|
93
91
|
logger.info("Experiment 4: lag_yield_as_feature")
|
@@ -103,9 +101,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
|
|
103
101
|
|
104
102
|
# Experiment: median_years
|
105
103
|
logger.info("Experiment 5: median_years")
|
106
|
-
parser = main(
|
107
|
-
inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5]
|
108
|
-
)
|
104
|
+
parser = main(inputs, logger, parser, "ML", "median_years", "int", [2, 3, 4, 5])
|
109
105
|
|
110
106
|
# Experiment: median_yield_as_feature
|
111
107
|
logger.info("Experiment 6: median_yield_as_feature")
|
@@ -133,9 +129,7 @@ def run(path_config_files=[Path("../config/geocif.txt")]):
|
|
133
129
|
|
134
130
|
# Experiment: optimize
|
135
131
|
logger.info("Experiment 8: optimize")
|
136
|
-
parser = main(
|
137
|
-
inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False]
|
138
|
-
)
|
132
|
+
parser = main(inputs, logger, parser, "DEFAULT", "optimize", "bool", [True, False])
|
139
133
|
|
140
134
|
|
141
135
|
if __name__ == "__main__":
|
@@ -11,7 +11,6 @@ import geopandas as gp
|
|
11
11
|
import matplotlib.pyplot as plt
|
12
12
|
import numpy as np
|
13
13
|
import pandas as pd
|
14
|
-
import sklearn
|
15
14
|
from tqdm import tqdm
|
16
15
|
|
17
16
|
from geocif import logger as log
|
@@ -28,7 +27,6 @@ from .ml import trend
|
|
28
27
|
from .ml import xai
|
29
28
|
|
30
29
|
plt.style.use("default")
|
31
|
-
sklearn.set_config(transform_output="pandas")
|
32
30
|
|
33
31
|
import warnings
|
34
32
|
|
@@ -108,7 +106,6 @@ class Geocif:
|
|
108
106
|
Config file: ML
|
109
107
|
====================================================================
|
110
108
|
"""
|
111
|
-
self.use_ceis = ast.literal_eval(self.parser.get("ML", "use_ceis"))
|
112
109
|
self.model_type = self.parser.get("ML", "model_type")
|
113
110
|
self.fraction_simulate = self.parser.getint("ML", "fraction_simulate")
|
114
111
|
self.analogous_year_yield_as_feature = self.parser.getboolean(
|
@@ -117,10 +114,10 @@ class Geocif:
|
|
117
114
|
self.plot_map_for_correlation_plot = self.parser.getboolean(
|
118
115
|
"ML", "plot_map_for_correlation_plot"
|
119
116
|
)
|
120
|
-
self.correlation_threshold = self.parser.getfloat(
|
121
|
-
|
117
|
+
self.correlation_threshold = self.parser.getfloat("ML", "correlation_threshold")
|
118
|
+
self.include_lat_lon_as_feature = self.parser.getboolean(
|
119
|
+
"ML", "include_lat_lon_as_feature"
|
122
120
|
)
|
123
|
-
self.include_lat_lon = self.parser.getboolean("ML", "include_lat_lon")
|
124
121
|
self.spatial_autocorrelation = self.parser.getboolean(
|
125
122
|
"ML", "spatial_autocorrelation"
|
126
123
|
)
|
@@ -153,6 +150,9 @@ class Geocif:
|
|
153
150
|
self.parser.get("ML", "cat_features")
|
154
151
|
)
|
155
152
|
|
153
|
+
self.use_cumulative_features = self.parser.getboolean(
|
154
|
+
"DEFAULT", "use_cumulative_features"
|
155
|
+
)
|
156
156
|
"""
|
157
157
|
====================================================================
|
158
158
|
Variables, Paths
|
@@ -198,6 +198,9 @@ class Geocif:
|
|
198
198
|
|
199
199
|
self.db_path = self.dir_db / self.db_forecasts
|
200
200
|
|
201
|
+
# Store config file in database
|
202
|
+
output.config_to_db(self.db_path, self.parser, self.today)
|
203
|
+
|
201
204
|
# self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
|
202
205
|
# obj_pickle = outlook.Outlook(self.pickle_file)
|
203
206
|
# self.df_outlook = obj_pickle.read_outlook_file()
|
@@ -224,18 +227,29 @@ class Geocif:
|
|
224
227
|
y_train = df_region[target_col]
|
225
228
|
|
226
229
|
if self.ml_model:
|
227
|
-
self.
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
230
|
+
if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
231
|
+
all_features = X_train.columns
|
232
|
+
|
233
|
+
# Select the columns with use_ceis in it
|
234
|
+
self.selected_features = [
|
235
|
+
column
|
236
|
+
for column in all_features
|
237
|
+
if any(cei in column for cei in self.use_ceis)
|
238
|
+
]
|
239
|
+
else:
|
240
|
+
self.logger.info(f"Selecting features for {self.country} {self.crop}")
|
241
|
+
selector, _, self.selected_features = fs.select_features(
|
242
|
+
X_train, y_train, method=self.feature_selection
|
243
|
+
)
|
244
|
+
self.logger.info(f"Selected features: {self.selected_features}")
|
232
245
|
|
233
246
|
""" Update model to include conformal estimates """
|
234
|
-
if "lat" not in self.selected_features and self.
|
247
|
+
if "lat" not in self.selected_features and self.include_lat_lon_as_feature:
|
235
248
|
self.selected_features.append("lat")
|
236
|
-
if "lon" not in self.selected_features and self.
|
249
|
+
if "lon" not in self.selected_features and self.include_lat_lon_as_feature:
|
237
250
|
self.selected_features.append("lon")
|
238
251
|
X_train = df_region[self.selected_features + self.cat_features]
|
252
|
+
|
239
253
|
dir_output = (
|
240
254
|
self.dir_analysis
|
241
255
|
/ self.country
|
@@ -288,6 +302,8 @@ class Geocif:
|
|
288
302
|
verbose=False,
|
289
303
|
# callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
|
290
304
|
)
|
305
|
+
elif self.model_name == "oblique":
|
306
|
+
self.model.fit(X_train, y_train)
|
291
307
|
elif self.model_name == "geospaNN":
|
292
308
|
self.model.fit(
|
293
309
|
X_train,
|
@@ -312,8 +328,38 @@ class Geocif:
|
|
312
328
|
self.best_hyperparams = {}
|
313
329
|
elif self.model_name in ["cubist"]:
|
314
330
|
self.model.fit(X_train, y_train)
|
315
|
-
|
316
|
-
|
331
|
+
elif self.model_name in [
|
332
|
+
"cumulative_1",
|
333
|
+
"cumulative_2",
|
334
|
+
"cumulative_3",
|
335
|
+
]:
|
336
|
+
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
337
|
+
|
338
|
+
# Standardize the numeric features
|
339
|
+
scaler = StandardScaler()
|
340
|
+
X_numeric = X_train.iloc[:, :3]
|
341
|
+
X_scaled_numeric = pd.DataFrame(
|
342
|
+
scaler.fit_transform(X_numeric),
|
343
|
+
columns=X_numeric.columns,
|
344
|
+
index=X_train.index,
|
345
|
+
)
|
346
|
+
|
347
|
+
# Encode the Region as categorical
|
348
|
+
le = LabelEncoder()
|
349
|
+
X_region = pd.Series(
|
350
|
+
le.fit_transform(X_train["Region"]),
|
351
|
+
name="Region",
|
352
|
+
index=X_train.index,
|
353
|
+
)
|
354
|
+
|
355
|
+
# Combine scaled numeric features and encoded region
|
356
|
+
X_train_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
|
357
|
+
|
358
|
+
self.model.fit(X_train_scaled, y_train)
|
359
|
+
except Exception as e:
|
360
|
+
self.logger.error(
|
361
|
+
f"Error fitting model for {self.country} {self.crop} {e}"
|
362
|
+
)
|
317
363
|
|
318
364
|
def predict(self, df_region, scaler=None):
|
319
365
|
"""
|
@@ -360,6 +406,33 @@ class Geocif:
|
|
360
406
|
X_test, Z_test, clusters_test.astype("object")
|
361
407
|
)
|
362
408
|
best_hyperparameters = self.model.fe_model.get_params().copy()
|
409
|
+
elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
410
|
+
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
411
|
+
|
412
|
+
# Standardize the numeric features
|
413
|
+
scaler = StandardScaler()
|
414
|
+
X_numeric = X_test.iloc[:, :3]
|
415
|
+
try:
|
416
|
+
X_scaled_numeric = pd.DataFrame(
|
417
|
+
scaler.fit_transform(X_numeric),
|
418
|
+
columns=X_numeric.columns,
|
419
|
+
index=X_test.index,
|
420
|
+
)
|
421
|
+
except:
|
422
|
+
breakpoint()
|
423
|
+
|
424
|
+
# Encode the Region as categorical
|
425
|
+
le = LabelEncoder()
|
426
|
+
X_region = pd.Series(
|
427
|
+
le.fit_transform(X_test["Region"]),
|
428
|
+
name="Region",
|
429
|
+
index=X_test.index,
|
430
|
+
)
|
431
|
+
|
432
|
+
# Combine scaled numeric features and encoded region
|
433
|
+
X_test_scaled = pd.concat([X_scaled_numeric, X_region], axis=1)
|
434
|
+
y_pred = self.model.predict(X_test_scaled)
|
435
|
+
best_hyperparameters = {} # self.model.get_params().copy()
|
363
436
|
elif self.model_name == "geospaNN":
|
364
437
|
import torch
|
365
438
|
import geospaNN
|
@@ -501,7 +574,9 @@ class Geocif:
|
|
501
574
|
"Crop",
|
502
575
|
"Harvest Year",
|
503
576
|
"Stage Name",
|
577
|
+
"Time",
|
504
578
|
]
|
579
|
+
|
505
580
|
df.index = df.apply(
|
506
581
|
lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
|
507
582
|
)
|
@@ -513,28 +588,37 @@ class Geocif:
|
|
513
588
|
|
514
589
|
def create_feature_names(self, stages_features, selected_features):
|
515
590
|
"""
|
591
|
+
Create feature names for machine learning stages.
|
516
592
|
|
517
593
|
Args:
|
518
|
-
stages_features:
|
519
|
-
selected_features:
|
594
|
+
stages_features (list): List of features for different stages.
|
595
|
+
selected_features (dict): Dictionary of selected features.
|
520
596
|
|
521
597
|
Returns:
|
522
|
-
|
598
|
+
None
|
523
599
|
"""
|
600
|
+
# Assert stages_features is a list
|
601
|
+
assert isinstance(stages_features, list), "stages_features should be a list"
|
602
|
+
|
524
603
|
# Clear out feature names
|
525
604
|
self.feature_names = []
|
526
605
|
|
527
|
-
"""
|
606
|
+
"""
|
607
|
+
Select stages that will be used for ML
|
528
608
|
1. method = "latest" - Select the latest stage
|
529
609
|
2. method = "fraction" - Select a fraction (1-100) of all stages
|
530
610
|
"""
|
611
|
+
method = "fraction"
|
612
|
+
if self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
613
|
+
method = "latest"
|
614
|
+
|
531
615
|
stages_features = stages.select_stages_for_ml(
|
532
|
-
stages_features, method=
|
616
|
+
stages_features, method=method, n=60
|
533
617
|
)
|
534
618
|
|
535
619
|
for stage in stages_features:
|
536
620
|
# Convert each element of stage to str and join with _
|
537
|
-
_stage = "_".join(
|
621
|
+
_stage = "_".join(map(str, stage))
|
538
622
|
|
539
623
|
# Create a list appending _stage to each element of combined_keys
|
540
624
|
_tmp = [f"{col}_{_stage}" for col in self.combined_keys]
|
@@ -543,17 +627,33 @@ class Geocif:
|
|
543
627
|
parts = _t.split("_")
|
544
628
|
cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
|
545
629
|
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
self.
|
556
|
-
|
630
|
+
try:
|
631
|
+
if self.model_name in [
|
632
|
+
"cumulative_1",
|
633
|
+
"cumulative_2",
|
634
|
+
"cumulative_3",
|
635
|
+
]:
|
636
|
+
dict_fn = stages.get_stage_information_dict(_t, self.method)
|
637
|
+
tmp_col = f"{dict_fn['CEI']}"
|
638
|
+
|
639
|
+
if tmp_col in self.df_train.columns:
|
640
|
+
self.feature_names.append(tmp_col)
|
641
|
+
else:
|
642
|
+
# Check if any element of dict_selected_features is in _t
|
643
|
+
if selected_features["CEI"].any():
|
644
|
+
for x in selected_features["CEI"].values:
|
645
|
+
if x not in cei:
|
646
|
+
continue
|
647
|
+
|
648
|
+
dict_fn = stages.get_stage_information_dict(
|
649
|
+
_t, self.method
|
650
|
+
)
|
651
|
+
tmp_col = f"{dict_fn['CEI']} {dict_fn['Stage Name']}"
|
652
|
+
|
653
|
+
if tmp_col in self.df_train.columns:
|
654
|
+
self.feature_names.append(tmp_col)
|
655
|
+
except:
|
656
|
+
breakpoint()
|
557
657
|
self.feature_names = list(set(self.feature_names))
|
558
658
|
|
559
659
|
if self.median_yield_as_feature:
|
@@ -565,16 +665,14 @@ class Geocif:
|
|
565
665
|
self.feature_names.append(f"t -{i} {self.target}")
|
566
666
|
|
567
667
|
if self.analogous_year_yield_as_feature:
|
568
|
-
self.feature_names.
|
569
|
-
self.feature_names.append("Analogous Year Yield")
|
668
|
+
self.feature_names.extend(["Analogous Year", "Analogous Year Yield"])
|
570
669
|
|
571
670
|
if self.use_outlook_as_feature:
|
572
671
|
self.feature_names.append("FCST")
|
573
672
|
|
574
673
|
# Add lat and lon to feature names
|
575
|
-
if self.
|
576
|
-
self.feature_names.
|
577
|
-
self.feature_names.append("lon")
|
674
|
+
if self.include_lat_lon_as_feature:
|
675
|
+
self.feature_names.extend(["lat", "lon"])
|
578
676
|
|
579
677
|
self.selected_features = []
|
580
678
|
|
@@ -598,6 +696,8 @@ class Geocif:
|
|
598
696
|
for idx, region in enumerate(pbar):
|
599
697
|
if self.model_name in ["linear", "gam"]:
|
600
698
|
self.create_feature_names(stages, dict_best_cei[region][0:3].tolist())
|
699
|
+
elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
700
|
+
self.create_feature_names(stages, {})
|
601
701
|
elif self.ml_model:
|
602
702
|
self.create_feature_names(stages, dict_selected_features[region])
|
603
703
|
elif self.model_name in ["median"]:
|
@@ -727,11 +827,52 @@ class Geocif:
|
|
727
827
|
parts = all_cei_columns[-1].split("_")
|
728
828
|
cei = parts[0] if parts[1].isdigit() else "_".join(parts[:2])
|
729
829
|
|
730
|
-
#
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
830
|
+
# For each region, find the column with the longest string in cei_column
|
831
|
+
group_by = ["Region"]
|
832
|
+
groups = df.groupby(group_by)
|
833
|
+
if self.use_cumulative_features:
|
834
|
+
frames = []
|
835
|
+
for name, group in groups:
|
836
|
+
# Drop columns with all NaNs
|
837
|
+
group.dropna(axis=1, how="all", inplace=True)
|
838
|
+
|
839
|
+
cei_column = group[
|
840
|
+
group.columns[group.columns.str.contains(cei)]
|
841
|
+
].columns
|
842
|
+
max_cei_col = max(cei_column, key=len)
|
843
|
+
self.stage_info = stages.get_stage_information_dict(
|
844
|
+
max_cei_col, self.method
|
845
|
+
)
|
846
|
+
|
847
|
+
# Subset dataframes to columns that contain self.stage_info["Stage_ID"]
|
848
|
+
all_columns = group.columns[
|
849
|
+
group.columns.str.contains(self.stage_info["Stage_ID"])
|
850
|
+
].tolist()
|
851
|
+
|
852
|
+
group = group[
|
853
|
+
self.fixed_columns
|
854
|
+
+ [self.target]
|
855
|
+
+ self.statistics_columns
|
856
|
+
+ all_columns
|
857
|
+
]
|
858
|
+
# rename all_columns to self.stage_info["CEI"]
|
859
|
+
group.rename(
|
860
|
+
columns={
|
861
|
+
col: stages.get_stage_information_dict(col, self.method)["CEI"]
|
862
|
+
for col in all_columns
|
863
|
+
},
|
864
|
+
inplace=True,
|
865
|
+
)
|
866
|
+
|
867
|
+
frames.append(group)
|
868
|
+
|
869
|
+
df = pd.concat(frames)
|
870
|
+
else:
|
871
|
+
# HACK: Get feature name with GD4 in it to extract first and last stage id and name
|
872
|
+
cei_column = df[df.columns[df.columns.str.contains(cei)]].columns
|
873
|
+
# Select the longest string in cei_column
|
874
|
+
cei_col = max(cei_column, key=len)
|
875
|
+
self.stage_info = stages.get_stage_information_dict(cei_col, self.method)
|
735
876
|
|
736
877
|
# Change column name
|
737
878
|
# e.g. 'vDTR_7_6_5_4_3_2_1_37_36_35_34_33_32_31' to 'vDTR Mar 1-Oct 27'
|
@@ -795,12 +936,14 @@ class Geocif:
|
|
795
936
|
|
796
937
|
mask = self.df_results["Stage_ID"].isin(_stages)
|
797
938
|
df = self.df_results[mask]
|
798
|
-
|
799
939
|
""" Select which CEI categories to use for ML """
|
800
940
|
if "all" in self.use_ceis:
|
801
941
|
pass
|
802
942
|
else:
|
803
|
-
|
943
|
+
if self.select_cei_by == "Type":
|
944
|
+
df = df[df["Type"].isin(self.use_ceis)]
|
945
|
+
elif self.select_cei_by == "Index":
|
946
|
+
df = df[df["Index"].isin(self.use_ceis)]
|
804
947
|
|
805
948
|
""" Convert this dataframe into an ML ready format and save to disk """
|
806
949
|
df = self.create_ml_dataframe(df)
|
@@ -874,6 +1017,8 @@ class Geocif:
|
|
874
1017
|
if self.spatial_autocorrelation:
|
875
1018
|
sa.compute_spatial_autocorrelation(self.df_results, **dict_kwargs)
|
876
1019
|
|
1020
|
+
dict_selected_features = {}
|
1021
|
+
dict_best_cei = {}
|
877
1022
|
if self.correlation_plots:
|
878
1023
|
self.logger.info(f"Correlation plot for {self.country} {self.crop}")
|
879
1024
|
(
|
@@ -949,6 +1094,8 @@ class Geocif:
|
|
949
1094
|
self.model_name = model
|
950
1095
|
self.experiment_name = self.parser.get("ML", "experiment_name")
|
951
1096
|
self.ml_model = self.parser.getboolean(self.model_name, "ML_model")
|
1097
|
+
self.select_cei_by = self.parser.get(self.model_name, "select_cei_by")
|
1098
|
+
self.use_ceis = ast.literal_eval(self.parser.get(self.model_name, "use_ceis"))
|
952
1099
|
self.model_names = ast.literal_eval(self.parser.get(self.country, "models"))
|
953
1100
|
self.optimize = self.parser.getboolean(self.country, "optimize")
|
954
1101
|
self.fraction_loocv = self.parser.getfloat(self.country, "fraction_loocv")
|
@@ -960,6 +1107,21 @@ class Geocif:
|
|
960
1107
|
self.estimate_ci = False
|
961
1108
|
self.check_yield_trend = False
|
962
1109
|
self.estimate_ci_for_all = False
|
1110
|
+
elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
1111
|
+
self.correlation_plots = False
|
1112
|
+
self.lag_yield_as_feature = False
|
1113
|
+
self.median_yield_as_feature = False
|
1114
|
+
self.median_area_as_feature = False
|
1115
|
+
self.analogous_year_yield_as_feature = False
|
1116
|
+
self.last_year_yield_as_feature = False
|
1117
|
+
self.include_lat_lon_as_feature = False
|
1118
|
+
self.do_xai = False
|
1119
|
+
self.estimate_ci = False
|
1120
|
+
self.estimate_ci_for_all = False
|
1121
|
+
self.check_yield_trend = False
|
1122
|
+
self.cluster_strategy = "single"
|
1123
|
+
self.select_cei_by = "Index"
|
1124
|
+
self.use_cumulative_features = True
|
963
1125
|
else:
|
964
1126
|
self.do_xai = self.parser.getboolean("ML", "do_xai")
|
965
1127
|
self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
|
@@ -173,8 +173,8 @@ class cei_runner(base.BaseGeo):
|
|
173
173
|
or "south_africa_maize" in i[3]
|
174
174
|
or "mozambique_maize" in i[3]
|
175
175
|
or "united_states_of_america" in i[3]
|
176
|
-
|
177
|
-
|
176
|
+
or "russian_federation" in i[3]
|
177
|
+
or "ukraine" in i[3]
|
178
178
|
]
|
179
179
|
# "malawi" in i[2]]
|
180
180
|
|
@@ -47,7 +47,7 @@ class cei_runner(base.BaseGeo):
|
|
47
47
|
|
48
48
|
self.dir_input = Path(self.parser.get("PATHS", "dir_input"))
|
49
49
|
self.base_dir = Path(
|
50
|
-
r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\
|
50
|
+
r"D:\Users\ritvik\projects\GEOGLAM\Output\countries\illinois"
|
51
51
|
) # Path(self.parser.get("PATHS", "dir_crop_inputs"))
|
52
52
|
self.do_parallel = self.parser.getboolean("DEFAULT", "do_parallel")
|
53
53
|
|
@@ -164,7 +164,7 @@ class cei_runner(base.BaseGeo):
|
|
164
164
|
# Only keep those entries in combinations where the third elemt is
|
165
165
|
# mozambique, south_africa, angola or dem_people's_rep_of_korea
|
166
166
|
# This is done to test the code for these countries
|
167
|
-
combinations = [i for i in combinations if "
|
167
|
+
combinations = [i for i in combinations if "illinois_maize_s1" in i[3]]
|
168
168
|
|
169
169
|
if True:
|
170
170
|
num_cpu = int(cpu_count() * 0.5)
|
@@ -157,8 +157,8 @@ def plot_feature_corr_by_time(df, **kwargs):
|
|
157
157
|
cbar_ax.tick_params(axis="both", which="major", labelsize=5)
|
158
158
|
|
159
159
|
_country = country.title().replace("_", " ")
|
160
|
-
_region_name = region_name
|
161
|
-
_crop =
|
160
|
+
_region_name = region_name if not national_correlation else ""
|
161
|
+
_crop = crop.title().replace("_", " ")
|
162
162
|
if not national_correlation:
|
163
163
|
fname = f"{country}_{crop}_{id}_corr_feature_by_time.png"
|
164
164
|
else:
|
@@ -304,7 +304,7 @@ def all_correlated_feature_by_time(df, **kwargs):
|
|
304
304
|
)
|
305
305
|
|
306
306
|
kwargs["region_id"] = region_id
|
307
|
-
_region_names = "
|
307
|
+
_region_names = ", ".join([str(x) for x in group['Region'].unique()])
|
308
308
|
kwargs["region_name"] = _region_names
|
309
309
|
plot_feature_corr_by_time(df_tmp, **kwargs)
|
310
310
|
# For each element in dict_best_cei, add the type of the cei
|
@@ -0,0 +1,33 @@
|
|
1
|
+
import pandas as pd
|
2
|
+
import matplotlib.pyplot as plt
|
3
|
+
import seaborn as sns
|
4
|
+
from taipy.gui import Gui
|
5
|
+
|
6
|
+
# Load the dataset
|
7
|
+
file_path = r'D:\Users\ritvik\projects\GEOGLAM\Output\ml\analysis\July_05_2024\russian_federation\maize\cumulative_1\2010\X_train_1.csv' # Update with the correct file path
|
8
|
+
df = pd.read_csv(file_path)
|
9
|
+
print(df.head())
|
10
|
+
# Define a function to create the plot
|
11
|
+
def plot_auc_ndvi(data):
|
12
|
+
fig, ax = plt.subplots(figsize=(14, 8))
|
13
|
+
sns.lineplot(data=data, x="Harvest Year", y="AUC_NDVI Oct 7-Mar 25", hue="Region", marker="o", ax=ax)
|
14
|
+
ax.set_title("Trends of AUC_NDVI by Region (Oct 7 - Mar 25)")
|
15
|
+
ax.set_xlabel("Harvest Year")
|
16
|
+
ax.set_ylabel("AUC_NDVI Oct 7 - Mar 25")
|
17
|
+
ax.legend(title="Region", bbox_to_anchor=(1.05, 1), loc='upper left')
|
18
|
+
plt.show()
|
19
|
+
return fig
|
20
|
+
|
21
|
+
# Create the plot and save it
|
22
|
+
plot_fig = plot_auc_ndvi(df)
|
23
|
+
|
24
|
+
# Define the Taipy page with the plot
|
25
|
+
page = """
|
26
|
+
# Trends of AUC_NDVI by Region
|
27
|
+
|
28
|
+
<|{plot_fig}|chart|>
|
29
|
+
"""
|
30
|
+
|
31
|
+
# Create and run the GUI
|
32
|
+
gui = Gui(page)
|
33
|
+
gui.run()
|
@@ -107,7 +107,6 @@ def store(db_path, experiment_id, df, model, model_name):
|
|
107
107
|
try:
|
108
108
|
utils.to_db(db_path, experiment_id, df)
|
109
109
|
except Exception as e:
|
110
|
-
breakpoint()
|
111
110
|
print(f"Error: {e}")
|
112
111
|
|
113
112
|
index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
|
@@ -128,7 +127,6 @@ def store(db_path, experiment_id, df, model, model_name):
|
|
128
127
|
df_model.index.set_names(["Index"], inplace=True)
|
129
128
|
utils.to_db(db_path, "models", df_model)
|
130
129
|
except Exception as e:
|
131
|
-
breakpoint()
|
132
130
|
print(f"Error: {e}")
|
133
131
|
|
134
132
|
con.commit()
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import numpy as np
|
2
|
+
from typing import Union
|
2
3
|
|
3
4
|
from geocif import utils
|
4
5
|
|
@@ -277,23 +278,31 @@ def update_feature_names(df, method):
|
|
277
278
|
return df
|
278
279
|
|
279
280
|
|
280
|
-
def convert_stage_string(stage_info, to_array=True):
|
281
|
+
def convert_stage_string(stage_info: Union[str, np.ndarray], to_array: bool = True) -> Union[np.ndarray, str]:
|
281
282
|
"""
|
282
|
-
|
283
|
-
output: array([13, 12, 11])
|
284
|
-
or vice versa if to_array = False
|
283
|
+
Converts a string of stage information to a numpy array or vice versa.
|
285
284
|
|
286
285
|
Args:
|
287
|
-
stage_info:
|
288
|
-
to_array:
|
286
|
+
stage_info: A string of stages separated by underscores or a numpy array of stages e.g. '13_12_11'
|
287
|
+
to_array: A boolean indicating the direction of conversion. If True, converts string to numpy array e.g. array([13, 12, 11])
|
288
|
+
If False, converts numpy array to string.
|
289
289
|
|
290
290
|
Returns:
|
291
|
+
A numpy array of stages if to_array is True, or a string of stages if to_array is False.
|
291
292
|
|
293
|
+
Raises:
|
294
|
+
ValueError: If the input format is incorrect.
|
292
295
|
"""
|
293
296
|
if to_array:
|
294
|
-
|
295
|
-
|
297
|
+
if not isinstance(stage_info, str):
|
298
|
+
raise ValueError("Expected a string for stage_info when to_array is True.")
|
299
|
+
try:
|
300
|
+
stages = np.array([int(stage) for stage in stage_info.split("_")])
|
301
|
+
except ValueError:
|
302
|
+
raise ValueError("Stage info string should contain integers separated by underscores.")
|
296
303
|
else:
|
297
|
-
|
304
|
+
if not isinstance(stage_info, np.ndarray):
|
305
|
+
raise ValueError("Expected a numpy array for stage_info when to_array is False.")
|
306
|
+
stages = "_".join(map(str, stage_info))
|
298
307
|
|
299
308
|
return stages
|