geocif 0.1.48__tar.gz → 0.1.50__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.48/geocif.egg-info → geocif-0.1.50}/PKG-INFO +1 -1
- {geocif-0.1.48 → geocif-0.1.50}/geocif/analysis.py +12 -5
- {geocif-0.1.48 → geocif-0.1.50}/geocif/geocif.py +84 -28
- {geocif-0.1.48 → geocif-0.1.50}/geocif/logger.py +24 -1
- geocif-0.1.50/geocif/ml/aa.py +28 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/feature_selection.py +10 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/output.py +13 -13
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/stats.py +7 -1
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/trainers.py +27 -22
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/trend.py +32 -11
- geocif-0.1.50/geocif/viz/misc.py +55 -0
- {geocif-0.1.48 → geocif-0.1.50/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.48 → geocif-0.1.50}/geocif.egg-info/SOURCES.txt +2 -0
- {geocif-0.1.48 → geocif-0.1.50}/setup.py +1 -1
- {geocif-0.1.48 → geocif-0.1.50}/LICENSE +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/MANIFEST.in +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/README.md +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/constants.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/features.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/geo.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/backup/models.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/cei/indices.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/experiments.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/indices_runner.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/indices_runner_v2.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/correlations.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/misc.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/stages.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/ml/xai.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/playground/automl.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/playground/misc.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/utils.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif/viz/plot.py +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/requirements.txt +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/setup.cfg +0 -0
- {geocif-0.1.48 → geocif-0.1.50}/tests/test_geocif.py +0 -0
geocif/analysis.py
@@ -163,6 +163,7 @@ class Geoanalysis:
 
         df_metrics = self._compute_metrics(df)
         df_metrics = self._process_metrics(df_metrics)
+
         self._plot_metrics(df_metrics)
 
         df_regional_metrics_by_year = self._compute_regional_metrics(
@@ -183,6 +184,11 @@ class Geoanalysis:
         return df_metrics, df_regional_metrics, df_national_yield
 
     def _clean_data(self):
+        # Hack exclude 2012 if country == "illinois"
+        if self.country == "illinois":
+            self.df_analysis = self.df_analysis[
+                self.df_analysis["Harvest Year"] != 2012
+            ]
         # Remove rows with missing values in Observed Yield (tn per ha)
         return self.df_analysis.dropna(subset=["Observed Yield (tn per ha)"])
 
@@ -196,11 +202,12 @@ class Geoanalysis:
             .reset_index()
         )
 
-        return df_metrics.pivot_table(
-            index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
-            columns="level_5",
-            values=0,
-        ).reset_index()
+        # return df_metrics.pivot_table(
+        #     index=["Country", "Model", "Harvest Year", "Stage Name", "Stage Range"],
+        #     columns="level_5",
+        #     values=0,
+        # ).reset_index()
+        return df_metrics
 
     def _process_metrics(self, df_metrics):
         # Assign each unique Stage Name a unique integer identifier
geocif/geocif.py
@@ -82,6 +82,13 @@ class Geocif:
         self.today_full = self._date.format("MMMM_DD_YYYY_HH_mm")
 
         self.df_forecast = pd.DataFrame()
+        """
+        ====================================================================
+        Config file: Logging
+        ====================================================================
+        """
+        self.log_level = self.parser.get("LOGGING", "log_level")
+
         """
         ====================================================================
         Config file: Default
@@ -198,9 +205,6 @@ class Geocif:
 
         self.db_path = self.dir_db / self.db_forecasts
 
-        # Store config file in database
-        output.config_to_db(self.db_path, self.parser, self.today)
-
         # self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
         # obj_pickle = outlook.Outlook(self.pickle_file)
         # self.df_outlook = obj_pickle.read_outlook_file()
@@ -221,6 +225,9 @@ class Geocif:
             f"Detrended {self.target}" if self.check_yield_trend else self.target
         )
 
+        # Drop rows where target_col is NaN
+        df_region = df_region.dropna(subset=[target_col])
+
         X_train = df_region[self.feature_names]
         # Drop any columns with NaNs
         X_train = X_train.dropna(axis=1, how="any")
@@ -280,7 +287,7 @@ class Geocif:
             X_train_scaled,
             y_train,
             feature_names=self.selected_features,
-            target_col=
+            target_col=target_col,
             optimize=self.optimize,
             fraction_loocv=self.fraction_loocv,
             cat_features=self.cat_features,
@@ -302,8 +309,13 @@ class Geocif:
                 verbose=False,
                 # callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
             )
-        elif self.model_name
+        elif self.model_name in ["oblique"]:
             self.model.fit(X_train, y_train)
+        elif self.model_name == "ydf":
+            # Combine X_train and y_train
+            df_train = pd.concat([X_train, y_train], axis=1)
+
+            self.model = self.model.train(df_train)
        elif self.model_name == "geospaNN":
             self.model.fit(
                 X_train,
@@ -335,9 +347,16 @@ class Geocif:
         ]:
             from sklearn.preprocessing import StandardScaler, LabelEncoder
 
+            if self.model_name == "cumulative_1":
+                num_columns = 1
+            elif self.model_name == "cumulative_2":
+                num_columns = 2
+            elif self.model_name == "cumulative_3":
+                num_columns = 3
+
             # Standardize the numeric features
             scaler = StandardScaler()
-            X_numeric = X_train.iloc[:, :
+            X_numeric = X_train.iloc[:, :num_columns]
             X_scaled_numeric = pd.DataFrame(
                 scaler.fit_transform(X_numeric),
                 columns=X_numeric.columns,
@@ -409,9 +428,16 @@ class Geocif:
         elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
             from sklearn.preprocessing import StandardScaler, LabelEncoder
 
+            if self.model_name == "cumulative_1":
+                num_columns = 1
+            elif self.model_name == "cumulative_2":
+                num_columns = 2
+            elif self.model_name == "cumulative_3":
+                num_columns = 3
+
             # Standardize the numeric features
             scaler = StandardScaler()
-            X_numeric = X_test.iloc[:, :
+            X_numeric = X_test.iloc[:, :num_columns]
             try:
                 X_scaled_numeric = pd.DataFrame(
                     scaler.fit_transform(X_numeric),
@@ -455,7 +481,9 @@ class Geocif:
                 self.selected_features + self.cat_features + [self.target]
             ]
             w_train = data_train.y - self.estimate(data_train.x)
-
+        elif self.model_name == "ydf":
+            y_pred = self.model.evaluate(X_test)
+            best_hyperparameters = {}
         else:
             y_pred = self.model.predict(X_test)
             best_hyperparameters = self.model.get_params().copy()
@@ -468,8 +496,8 @@ class Geocif:
 
             obj_trend = trend.DetrendedData(
                 df_tmp[f"Detrended {self.target}"],
-                df_tmp["
-                df_tmp["
+                df_tmp["Detrended Model"],
+                df_tmp["Detrended Model Type"],
             )
 
             # Retrend the predicted yield
@@ -477,6 +505,8 @@ class Geocif:
                 obj_trend, df_region.iloc[idx][["Harvest Year"]]
             )[0]
 
+            df_region.loc[idx, "Detrended Model Type"] = obj_trend.model_type.unique()[0]
+
         # Create a dataframe with forecast results
         shp = len(X_test)
         experiment_id = f"{self.country}_{self.crop}"
@@ -530,7 +560,6 @@ class Geocif:
 
         if self.check_yield_trend:
             df.loc[:, "Detrended Model Type"] = df_region["Detrended Model Type"].values
-            df.loc[:, "Detrended Model"] = df_region["Detrended Model"].values
 
         if self.last_year_yield_as_feature:
             # Add last year yield to dataframe
@@ -729,7 +758,7 @@ class Geocif:
             + ["Region_ID"]
         )
         if self.check_yield_trend:
-            common_columns += ["Detrended Model Type", "Detrended Model"]
+            common_columns += [f"Detrended {self.target}", "Detrended Model Type", "Detrended Model"]
 
         if self.last_year_yield_as_feature:
             common_columns += [f"Last Year {self.target}"]
@@ -738,11 +767,15 @@ class Geocif:
         # Filter dataframe based on region and self.feature_names
         df_region_train = self.df_train[mask_train]
         df_region_train = df_region_train[self.fixed_columns + common_columns]
+        df_region_train.reset_index(drop=True, inplace=True)
         self.train(df_region_train, scaler)
 
         """ Predict """
+        if self.check_yield_trend:
+            common_columns = common_columns[:-3]
         df_region_test = self.df_test[mask_test]
         df_region_test = df_region_test[self.fixed_columns + common_columns]
+        df_region_test.reset_index(drop=True, inplace=True)
         experiment_id, df = self.predict(df_region_test, scaler)
         # df.reset_index(inplace=True)
 
@@ -849,12 +882,15 @@ class Geocif:
                 group.columns.str.contains(self.stage_info["Stage_ID"])
             ].tolist()
 
-            group = group[
-                self.fixed_columns
-                + [self.target]
-                + self.statistics_columns
-                + all_columns
-            ]
+            try:
+                group = group[
+                    self.fixed_columns
+                    + [self.target]
+                    + self.statistics_columns
+                    + all_columns
+                ]
+            except:
+                continue
             # rename all_columns to self.stage_info["CEI"]
             group.rename(
                 columns={
@@ -897,12 +933,12 @@ class Geocif:
 
         if self.lag_yield_as_feature:
             df = fe.compute_lag_yield(
-                df, self.all_seasons_with_yield, self.number_lag_years
+                df, self.all_seasons_with_yield, self.number_lag_years, self.target
             )
 
         if self.analogous_year_yield_as_feature:
             df = fe.compute_analogous_yield(
-                df, self.all_seasons_with_yield, self.number_median_years
+                df, self.all_seasons_with_yield, self.number_median_years, self.target
             )
 
         # Create Region_ID column based on Region column category code
@@ -912,7 +948,7 @@ class Geocif:
         elif self.cluster_strategy == "individual":
             df["Region_ID"] = df["Region"].cat.codes
         elif self.cluster_strategy == "auto_detect":
-            clusters_assigned = fe.detect_clusters(df)
+            clusters_assigned = fe.detect_clusters(df, self.target)
             # Merge the cluster labels with the original DataFrame
             df = df.merge(clusters_assigned, on="Region")
 
@@ -1036,8 +1072,8 @@ class Geocif:
 
         """ Groupby Region column and compute detrended yield """
         self.df_train[f"Detrended {self.target}"] = np.NaN
-        self.df_train["
-        self.df_train["
+        self.df_train["Detrended Model"] = np.NaN
+        self.df_train["Detrended Model Type"] = np.NaN
         if self.check_yield_trend:
             group_by = ["Region"]
             groups = self.df_train.groupby(group_by)
@@ -1050,10 +1086,10 @@ class Geocif:
                     group.index, f"Detrended {self.target}"
                 ] = detrended_data.detrended_series
                 self.df_train.loc[
-                    group.index, "
+                    group.index, "Detrended Model"
                 ] = detrended_data.trend_model
                 self.df_train.loc[
-                    group.index, "
+                    group.index, "Detrended Model Type"
                 ] = detrended_data.model_type
 
         # 6. Exclude years without yields from df_train
@@ -1118,10 +1154,19 @@ class Geocif:
             self.do_xai = False
             self.estimate_ci = False
             self.estimate_ci_for_all = False
-            self.check_yield_trend =
+            self.check_yield_trend = True
             self.cluster_strategy = "single"
             self.select_cei_by = "Index"
             self.use_cumulative_features = True
+        elif self.model_name in ["oblique", "ydf"]:
+            self.do_xai = False
+            self.estimate_ci = False
+            # Remove Region from cat_features as it is object type
+            self.cat_features = [col for col in self.cat_features if col != "Region"]
+            # if self.model_name == "ydf":
+            #     # HACK, for ydf model, target_col is Yield
+            #     self.df_results.rename(columns={self.target: "Yield"}, inplace=True)
+            #     self.target = "Yield"
         else:
             self.do_xai = self.parser.getboolean("ML", "do_xai")
             self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
@@ -1188,6 +1233,9 @@ class Geocif:
             self.dg["Country Region"] = (
                 self.dg["ADM0_NAME"] + " " + self.dg["ADM1_NAME"]
             )
+        elif self.country == "illinois":
+            self.dg["ADM0_NAME"] = "illinois"
+            self.dg["Country Region"] = self.dg["ADM0_NAME"] + " " + self.dg["NAME"]
         else:
             self.dg["Country Region"] = (
                 self.dg["ADM0_NAME"] + " " + self.dg["ADM2_NAME"]
@@ -1240,6 +1288,9 @@ class Geocif:
         # TODO ignore file with _2000 in its name
         all_files = [f for f in all_files if "_2000" not in f.name]
 
+        # Assert that all_files is not empty
+        assert all_files, f"No files found in {_dir_country} with {file_name}"
+
         self.df_results = pd.concat(
             (pd.read_csv(f) for f in all_files), ignore_index=True
         )
@@ -1284,7 +1335,7 @@ def loop_execute(inputs):
     )
 
     with PyCallGraph(output=graphviz, config=config):
-        country, crop, season, model, logger, parser = inputs
+        country, crop, season, model, logger, parser, index = inputs
 
         logger.info("=====================================================")
        logger.info(f"\tStarting GEOCIF: {country} {crop} {season} {model}")
@@ -1293,6 +1344,11 @@ def loop_execute(inputs):
         obj = Geocif(logger=logger, parser=parser)
         obj.read_data(country, crop, season)
 
+        # Store config file in database, only execute this for
+        # the first iteration of the loop
+        if index == 0:
+            output.config_to_db(obj.db_path, obj.parser, obj.today)
+
         # Setup metadata and run ML code
         obj.setup(season, model)
         if obj.simulation_stages:
@@ -1336,7 +1392,7 @@ def execute_models(inputs, logger, parser):
     do_parallel = parser.getboolean("DEFAULT", "do_parallel")
 
     # Add logger and parser to each element in inputs
-    inputs = [item + [logger, parser] for item in inputs]
+    inputs = [item + [logger, parser, idx] for idx, item in enumerate(inputs)]
 
     if do_parallel:
         cpu_count = int(mp.cpu_count() * 0.3)
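Note on the new "ydf" branch above: ydf learners train on a single DataFrame that bundles the features and the label column, which is why the diff concatenates X_train and y_train before calling train(). A minimal sketch of that calling convention, assuming the ydf package and hypothetical toy data:

import pandas as pd
import ydf

# Hypothetical toy frame; ydf expects features and the label column together
df_train = pd.DataFrame({"cei_1": [0.2, 0.5, 0.9, 1.3], "yield": [2.1, 2.9, 4.2, 5.1]})

learner = ydf.GradientBoostedTreesLearner(label="yield", task=ydf.Task.REGRESSION)
model = learner.train(df_train)

preds = model.predict(df_train)    # array of per-row predictions
report = model.evaluate(df_train)  # an evaluation report object, not raw predictions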
geocif/logger.py
@@ -71,8 +71,29 @@ class Logger:
         self.logger.error(msg)
 
 
+def get_logging_level(level):
+    """
+
+    Args:
+        level:
+
+    Returns:
+
+    """
+    if level == "DEBUG":
+        return logging.DEBUG
+    elif level == "INFO":
+        return logging.INFO
+    elif level == "WARNING":
+        return logging.WARNING
+    elif level == "ERROR":
+        return logging.ERROR
+    else:
+        return logging.INFO
+
+
 def setup_logger_parser(
-    path_config_file, name_project="geocif", name_file="ml"
+        path_config_file, name_project="geocif", name_file="ml"
 ):
     """
 
@@ -87,6 +108,8 @@ def setup_logger_parser(
     """
     parser = read_config(path_config_file)
    dir_log = parser.get("PATHS", "dir_log")
+    level = parser.get("LOGGING", "log_level")
+    level = get_logging_level(level)
 
     logger = Logger(
         dir_log=dir_log,
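Aside: the new get_logging_level helper maps config strings to logging constants with an if/elif chain; the stdlib can do the same lookup directly. A one-line alternative (hypothetical sketch, not part of the release):

import logging

def get_logging_level(level):
    # Unknown strings fall back to INFO, matching the helper's else branch
    return getattr(logging, level.upper(), logging.INFO)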
geocif/ml/aa.py (new file)
@@ -0,0 +1,28 @@
+import ydf
+import pandas as pd
+
+# Load dataset with Pandas
+ds_path = "https://raw.githubusercontent.com/google/yggdrasil-decision-forests/main/yggdrasil_decision_forests/test_data/dataset/"
+train_ds = pd.read_csv(ds_path + "adult_train.csv")
+test_ds = pd.read_csv(ds_path + "adult_test.csv")
+
+# Train a Gradient Boosted Trees model
+model = ydf.GradientBoostedTreesLearner(label="income").train(train_ds)
+
+# Look at a model (input features, training logs, structure, etc.)
+model.describe()
+
+# Evaluate a model (e.g. roc, accuracy, confusion matrix, confidence intervals)
+model.evaluate(test_ds)
+
+# Generate predictions
+model.predict(test_ds)
+
+# Analyse a model (e.g. partial dependence plot, variable importance)
+model.analyze(test_ds)
+
+# Benchmark the inference speed of a model
+model.benchmark(test_ds)
+
+# Save the model
+model.save("/tmp/my_model")
geocif/ml/feature_selection.py
@@ -131,6 +131,16 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
 
         X_filtered = selector.fit_transform(X, y)
         selected_features = X_filtered.columns.tolist()
+    elif method == "mrmr":
+        from mrmr import mrmr_regression
+
+        try:
+            selected_features = mrmr_regression(X=X, y=y, K=10)
+        except:
+            breakpoint()
+        # combine X and y into a dataframe
+        # df = pd.concat([X, y], axis=1)
+
     elif method == "RFECV":
         from sklearn.feature_selection import RFECV
         from sklearn.model_selection import KFold
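For context, the new "mrmr" branch ranks features by minimum-redundancy maximum-relevance. A minimal usage sketch of mrmr_regression, assuming the mrmr_selection package and synthetic data:

import numpy as np
import pandas as pd
from mrmr import mrmr_regression

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 20)), columns=[f"f{i}" for i in range(20)])
y = pd.Series(2 * X["f0"] - X["f3"] + rng.normal(size=100))

# Returns a list of K column names, best-ranked first
selected_features = mrmr_regression(X=X, y=y, K=10)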
geocif/ml/output.py
@@ -109,21 +109,21 @@ def store(db_path, experiment_id, df, model, model_name):
     except Exception as e:
         print(f"Error: {e}")
 
-    index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
-    # Output model pickle as a blob to database
-    df_model = pd.DataFrame(
-        {
-            "Experiment_ID": [experiment_id],
-            "Model": [model_name],
-            "Model_Blob": [pickle.dumps(model)],
-        }
-    )
-    # df_model.index = df_model.apply(
-    #     lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
-    # )
-
     # name the index level
     try:
+        index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
+        # Output model pickle as a blob to database
+        df_model = pd.DataFrame(
+            {
+                "Experiment_ID": [experiment_id],
+                "Model": [model_name],
+                "Model_Blob": [pickle.dumps(model)],
+            }
+        )
+        # df_model.index = df_model.apply(
+        #     lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
+        # )
+
         df_model.index.set_names(["Index"], inplace=True)
         utils.to_db(db_path, "models", df_model)
     except Exception as e:
geocif/ml/stats.py
@@ -191,7 +191,12 @@ def add_statistics(
 
     """
     # First check if country and crop are in the admin_crop_production.csv file
-    fn = "adm_crop_production.csv"
+    if country == "Afghanistan":
+        fn = "afghanistan.csv"
+    elif country == "Illinois":
+        fn = "illinois.csv"
+    else:
+        fn = "adm_crop_production.csv"
     df_fewsnet = pd.read_csv(dir_stats / fn, low_memory=False)
 
     # HACK
@@ -206,6 +211,7 @@ def add_statistics(
         df = add_GEOGLAM_statistics(dir_stats, df, stats, method, admin_zone)
     else:
         group_by = ["Region", "Harvest Year"]
+
         groups = df.groupby(group_by)
 
         # Define processing for each group
geocif/ml/trainers.py
@@ -264,8 +264,7 @@ def auto_train(
     if model_name in ["catboost", "merf"]:
         hyperparams = {
             "depth": 6,
-
-            "iterations": 5000,
+
             "subsample": 1.0,
             "random_strength": 0.5,
             "reg_lambda": 0.001,
@@ -283,18 +282,33 @@ def auto_train(
         regr = CatBoostRegressor(**hyperparams, cat_features=cat_features)
         model = MERF(regr, max_iterations=10)
     elif model_name == "oblique":
-
-
+        from treeple import ExtraObliqueRandomForestRegressor
+
+        # https://docs.neurodata.io/treeple/dev/modules/supervised_tree.html#oblique-trees
+        n_features = X_train.shape[1]
 
-
-
-
-
+        model = ExtraObliqueRandomForestRegressor(
+            n_estimators=1500,
+            max_depth=20,
+            max_features=n_features**2,
+            feature_combinations=n_features,
             n_jobs=-1,
-            verbose=2,
             random_state=42,
         )
-
+    elif model_name == "ydf":
+        import ydf
+        templates = ydf.GradientBoostedTreesLearner.hyperparameter_templates()
+
+        model = ydf.GradientBoostedTreesLearner(
+            label=target_col,
+            task=ydf.Task.REGRESSION,
+            growing_strategy='BEST_FIRST_GLOBAL',
+            categorical_algorithm='RANDOM',
+            split_axis='SPARSE_OBLIQUE',
+            sparse_oblique_normalization='MIN_MAX',
+            sparse_oblique_num_projections_exponent=2.0)
+
+        hyperparams = templates["benchmark_rank1v1"]
     elif model_name == "linear":
         from sklearn.linear_model import LassoCV
 
@@ -308,24 +322,15 @@ def auto_train(
     elif model_name == "cumulative_1":
         from pygam import GAM, s, f, te
 
-
-        region_idx = X_train.columns.get_loc("Region")
-
-        model = GAM(s(0) + f(region_idx))
+        model = GAM(s(0) + f(1))
     elif model_name == "cumulative_2":
         from pygam import GAM, s, f, te
 
-
-        region_idx = X_train.columns.get_loc("Region")
-
-        model = GAM(s(0) + s(1) + te(0, 1) + f(region_idx))
+        model = GAM(s(0) + s(1) + te(0, 1) + f(2))
     elif model_name == "cumulative_3":
         from pygam import GAM, s, f, te
 
-
-        region_idx = X_train.columns.get_loc("Region")
-
-        model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(region_idx))
+        model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(3))
     elif model_name == "geospaNN":
         import torch
         import geospaNN
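The cumulative GAM changes above replace the get_loc("Region") lookup with fixed column indices, which assumes the design matrix is laid out as cumulative features first, then the encoded Region factor. A minimal sketch of the cumulative_2 term structure under that assumption, with hypothetical toy arrays:

import numpy as np
from pygam import GAM, s, f, te

rng = np.random.default_rng(42)
X = np.column_stack([
    rng.normal(size=200),          # column 0: first cumulative feature
    rng.normal(size=200),          # column 1: second cumulative feature
    rng.integers(0, 5, size=200),  # column 2: label-encoded Region factor
])
y = X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.1, size=200)

# s() = spline term, te() = tensor-product interaction, f() = categorical factor
gam = GAM(s(0) + s(1) + te(0, 1) + f(2)).fit(X, y)
preds = gam.predict(X)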
geocif/ml/trend.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 from statsmodels.regression.linear_model import OLS
 from statsmodels.tools.tools import add_constant
 
@@ -6,7 +7,7 @@ from statsmodels.tools.tools import add_constant
 class DetrendedData:
     """
     A class to store the detrended series, the model used for detrending,
-    and the type of model ('mean', 'linear', 'quadratic').
+    and the type of model ('mean', 'linear', 'quadratic', 'difference').
     """
 
     def __init__(self, detrended_series, trend_model, model_type):
@@ -15,14 +16,16 @@ class DetrendedData:
         self.model_type = model_type
 
 
-def detrend_dataframe(df, column_name="y"):
+def detrend_dataframe(df, column_name="y", model_type="best"):
     """
-    Removes the trend from the specified column of a DataFrame using the method
-    (mean, linear, quadratic) that results in the lowest AIC value.
+    Removes the trend from the specified column of a DataFrame using the specified method
+    (mean, linear, quadratic, difference) or the method that results in the lowest AIC value.
 
     Parameters:
     - df: pandas DataFrame containing the time series data.
     - column_name: string name of the column to detrend.
+    - model_type: string specifying which model to use for detrending ('mean', 'linear',
+      'quadratic', 'difference', or 'best' for automatic selection based on AIC).
 
     Returns:
     - DetrendedData object containing the detrended series, the statistical model,
@@ -41,16 +44,32 @@ def detrend_dataframe(df, column_name="y"):
     X_quad = add_constant(np.column_stack((df["t"], df["t"] ** 2)))
     quad_model = OLS(df[column_name], X_quad).fit()
 
-
-
+    # Differencing method
+    diff_series = df[column_name].diff().dropna()
+    diff_model = OLS(diff_series, np.ones(len(diff_series))).fit()
+
+    models = {
+        "mean": mean_model,
+        "linear": linear_model,
+        "quadratic": quad_model,
+        "difference": diff_model
+    }
+
+    if model_type == "best":
+        best_model_type = min(models, key=lambda x: models[x].aic)
+    else:
+        best_model_type = model_type
+
     best_model = models[best_model_type]
 
     if best_model_type == "mean":
         detrended = df[column_name] - mean_model.predict(np.ones(len(df)))
     elif best_model_type == "linear":
         detrended = df[column_name] - linear_model.predict(X_linear)
-
+    elif best_model_type == "quadratic":
         detrended = df[column_name] - quad_model.predict(X_quad)
+    else:  # difference
+        detrended = df[column_name].diff().dropna()
 
     return DetrendedData(detrended, best_model, best_model_type)
 
@@ -67,11 +86,10 @@ def compute_trend(detrended_data, future_time_points=None):
     Returns:
     - The retrended series as a pandas Series.
     """
-    # if future_time_points is not of type pandas dataframe then convert it to one
     future_time_points = np.array(future_time_points)
 
-    model_type = detrended_data.model_type[0]
-    model = detrended_data.trend_model[0]
+    model_type = detrended_data.model_type.unique()[0]
+    model = detrended_data.trend_model.unique()[0]
 
     if model_type == "mean":
         trend_component = model.predict(
@@ -80,11 +98,14 @@ def compute_trend(detrended_data, future_time_points=None):
     elif model_type == "linear":
         X_linear = add_constant(future_time_points, has_constant="add")
         trend_component = model.predict(X_linear)
-
+    elif model_type == "quadratic":
         X_quad = add_constant(
             np.column_stack((future_time_points, future_time_points**2)),
             has_constant="add",
         )
         trend_component = model.predict(X_quad)
+    else:  # difference
+        trend_component = pd.Series(np.nan, index=future_time_points)
+        trend_component.iloc[0] = model.params[0]  # Add mean of differenced series
 
     return trend_component
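The "best" path in detrend_dataframe keeps whichever candidate trend model has the lowest AIC. A worked sketch of just that selection step, using statsmodels on hypothetical data:

import numpy as np
import pandas as pd
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

rng = np.random.default_rng(1)
t = np.arange(20)
y = pd.Series(2.0 + 0.3 * t + rng.normal(scale=0.2, size=20))

mean_model = OLS(y, np.ones(len(y))).fit()
linear_model = OLS(y, add_constant(t)).fit()
quad_model = OLS(y, add_constant(np.column_stack((t, t ** 2)))).fit()

models = {"mean": mean_model, "linear": linear_model, "quadratic": quad_model}
best_model_type = min(models, key=lambda k: models[k].aic)  # lowest AIC wins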
geocif/viz/misc.py (new file)
@@ -0,0 +1,55 @@
+import pandas as pd
+import hvplot.pandas
+import panel as pn
+
+# Load the CSV file
+file_path = r'D:\Users\ritvik\projects\GEOGLAM\Output\fao\regional_cei_slope.csv'
+data = pd.read_csv(file_path)
+
+# Extract unique values for dropdowns
+countries = data['Country'].unique().tolist()
+
+# Create dropdown widgets
+country_dropdown = pn.widgets.Select(name='Country', options=countries)
+region_dropdown = pn.widgets.Select(name='Region', options=[])
+crop_dropdown = pn.widgets.Select(name='Crop', options=[])
+season_dropdown = pn.widgets.Select(name='Season', options=data['Season'].unique().tolist())
+
+
+# Function to update region and crop options based on selected country
+@pn.depends(country_dropdown.param.value, watch=True)
+def update_region_and_crop_options(country):
+    filtered_data = data[data['Country'] == country]
+    regions = filtered_data['Region'].unique().tolist()
+    crops = filtered_data['Crop'].unique().tolist()
+
+    region_dropdown.options = regions
+    crop_dropdown.options = crops
+
+
+# Function to filter data based on dropdown selections
+@pn.depends(country_dropdown.param.value, region_dropdown.param.value, crop_dropdown.param.value,
+            season_dropdown.param.value)
+def update_plot(country, region, crop, season):
+    filtered_data = data[(data['Country'] == country) &
+                         (data['Region'] == region) &
+                         (data['Crop'] == crop) &
+                         (data['Season'] == season)]
+
+    if not filtered_data.empty:
+        plot = filtered_data.hvplot.scatter(x='Slope', y='Intercept',
+                                            hover_cols=['Growth Stage', 'p-value', 'Index', 'Description'])
+        return plot
+    else:
+        return pn.pane.Markdown("No data available for the selected combination.")
+
+
+# Create the dashboard
+dashboard = pn.Column(
+    pn.Row(country_dropdown, region_dropdown, crop_dropdown, season_dropdown),
+    update_plot
+)
+
+# Save as html page
+dashboard.save('dashboard.html', embed=True)
+
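Usage note on the new viz/misc.py script: dashboard.save('dashboard.html', embed=True) pre-renders a sampling of widget states into a static HTML file, so the saved page works without a running server; serving the same layout interactively would typically be done with `panel serve` on the script instead.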
geocif.egg-info/SOURCES.txt
@@ -32,6 +32,7 @@ geocif/cei/__init__.py
 geocif/cei/definitions.py
 geocif/cei/indices.py
 geocif/ml/__init__.py
+geocif/ml/aa.py
 geocif/ml/correlations.py
 geocif/ml/embedding.py
 geocif/ml/feature_engineering.py
@@ -50,5 +51,6 @@ geocif/playground/__init__.py
 geocif/playground/automl.py
 geocif/playground/misc.py
 geocif/viz/__init__.py
+geocif/viz/misc.py
 geocif/viz/plot.py
 tests/test_geocif.py
|