geocif 0.1.47__tar.gz → 0.1.49__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.1.47/geocif.egg-info → geocif-0.1.49}/PKG-INFO +1 -1
- {geocif-0.1.47 → geocif-0.1.49}/geocif/geocif.py +64 -20
- geocif-0.1.49/geocif/ml/aa.py +28 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/feature_selection.py +10 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/output.py +13 -13
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/stats.py +7 -1
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/trainers.py +37 -17
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/trend.py +32 -11
- geocif-0.1.49/geocif/viz/misc.py +55 -0
- {geocif-0.1.47 → geocif-0.1.49/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.1.47 → geocif-0.1.49}/geocif.egg-info/SOURCES.txt +2 -0
- {geocif-0.1.47 → geocif-0.1.49}/setup.py +1 -1
- {geocif-0.1.47 → geocif-0.1.49}/LICENSE +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/MANIFEST.in +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/README.md +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/__init__.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/agmet/__init__.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/agmet/plot.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/agmet/utils.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/analysis.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/__init__.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/constants.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/features.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/geo.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/geocif.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/metadata.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/backup/models.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/cei/__init__.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/cei/definitions.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/cei/indices.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/experiments.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/indices_runner.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/indices_runner_v2.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/logger.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/__init__.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/correlations.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/embedding.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/misc.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/outliers.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/outlook.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/stages.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/ml/xai.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/playground/__init__.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/playground/automl.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/playground/misc.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/utils.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/viz/__init__.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif/viz/plot.py +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/requirements.txt +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/setup.cfg +0 -0
- {geocif-0.1.47 → geocif-0.1.49}/tests/test_geocif.py +0 -0
@@ -11,7 +11,6 @@ import geopandas as gp
|
|
11
11
|
import matplotlib.pyplot as plt
|
12
12
|
import numpy as np
|
13
13
|
import pandas as pd
|
14
|
-
import sklearn
|
15
14
|
from tqdm import tqdm
|
16
15
|
|
17
16
|
from geocif import logger as log
|
@@ -28,7 +27,6 @@ from .ml import trend
|
|
28
27
|
from .ml import xai
|
29
28
|
|
30
29
|
plt.style.use("default")
|
31
|
-
sklearn.set_config(transform_output="pandas")
|
32
30
|
|
33
31
|
import warnings
|
34
32
|
|
@@ -200,9 +198,6 @@ class Geocif:
|
|
200
198
|
|
201
199
|
self.db_path = self.dir_db / self.db_forecasts
|
202
200
|
|
203
|
-
# Store config file in database
|
204
|
-
output.config_to_db(self.db_path, self.parser, self.today)
|
205
|
-
|
206
201
|
# self.pickle_file = self.base_dir / self.parser.get("outlook", "pickle_file")
|
207
202
|
# obj_pickle = outlook.Outlook(self.pickle_file)
|
208
203
|
# self.df_outlook = obj_pickle.read_outlook_file()
|
@@ -223,6 +218,9 @@ class Geocif:
|
|
223
218
|
f"Detrended {self.target}" if self.check_yield_trend else self.target
|
224
219
|
)
|
225
220
|
|
221
|
+
# Drop rows where target_col is NaN
|
222
|
+
df_region = df_region.dropna(subset=[target_col])
|
223
|
+
|
226
224
|
X_train = df_region[self.feature_names]
|
227
225
|
# Drop any columns with NaNs
|
228
226
|
X_train = X_train.dropna(axis=1, how="any")
|
@@ -282,7 +280,7 @@ class Geocif:
|
|
282
280
|
X_train_scaled,
|
283
281
|
y_train,
|
284
282
|
feature_names=self.selected_features,
|
285
|
-
target_col=
|
283
|
+
target_col=target_col,
|
286
284
|
optimize=self.optimize,
|
287
285
|
fraction_loocv=self.fraction_loocv,
|
288
286
|
cat_features=self.cat_features,
|
@@ -304,6 +302,13 @@ class Geocif:
|
|
304
302
|
verbose=False,
|
305
303
|
# callbacks=[TQDMCallback(self.best_hyperparams["iterations"])],
|
306
304
|
)
|
305
|
+
elif self.model_name in ["oblique"]:
|
306
|
+
self.model.fit(X_train, y_train)
|
307
|
+
elif self.model_name == "ydf":
|
308
|
+
# Combine X_train and y_train
|
309
|
+
df_train = pd.concat([X_train, y_train], axis=1)
|
310
|
+
|
311
|
+
self.model = self.model.train(df_train)
|
307
312
|
elif self.model_name == "geospaNN":
|
308
313
|
self.model.fit(
|
309
314
|
X_train,
|
@@ -335,9 +340,16 @@ class Geocif:
|
|
335
340
|
]:
|
336
341
|
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
337
342
|
|
343
|
+
if self.model_name == "cumulative_1":
|
344
|
+
num_columns = 1
|
345
|
+
elif self.model_name == "cumulative_2":
|
346
|
+
num_columns = 2
|
347
|
+
elif self.model_name == "cumulative_3":
|
348
|
+
num_columns = 3
|
349
|
+
|
338
350
|
# Standardize the numeric features
|
339
351
|
scaler = StandardScaler()
|
340
|
-
X_numeric = X_train.iloc[:, :
|
352
|
+
X_numeric = X_train.iloc[:, :num_columns]
|
341
353
|
X_scaled_numeric = pd.DataFrame(
|
342
354
|
scaler.fit_transform(X_numeric),
|
343
355
|
columns=X_numeric.columns,
|
@@ -409,9 +421,16 @@ class Geocif:
|
|
409
421
|
elif self.model_name in ["cumulative_1", "cumulative_2", "cumulative_3"]:
|
410
422
|
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
411
423
|
|
424
|
+
if self.model_name == "cumulative_1":
|
425
|
+
num_columns = 1
|
426
|
+
elif self.model_name == "cumulative_2":
|
427
|
+
num_columns = 2
|
428
|
+
elif self.model_name == "cumulative_3":
|
429
|
+
num_columns = 3
|
430
|
+
|
412
431
|
# Standardize the numeric features
|
413
432
|
scaler = StandardScaler()
|
414
|
-
X_numeric = X_test.iloc[:, :
|
433
|
+
X_numeric = X_test.iloc[:, :num_columns]
|
415
434
|
try:
|
416
435
|
X_scaled_numeric = pd.DataFrame(
|
417
436
|
scaler.fit_transform(X_numeric),
|
@@ -455,7 +474,9 @@ class Geocif:
|
|
455
474
|
self.selected_features + self.cat_features + [self.target]
|
456
475
|
]
|
457
476
|
w_train = data_train.y - self.estimate(data_train.x)
|
458
|
-
|
477
|
+
elif self.model_name == "ydf":
|
478
|
+
y_pred = self.model.evaluate(X_test)
|
479
|
+
best_hyperparameters = {}
|
459
480
|
else:
|
460
481
|
y_pred = self.model.predict(X_test)
|
461
482
|
best_hyperparameters = self.model.get_params().copy()
|
@@ -468,8 +489,8 @@ class Geocif:
|
|
468
489
|
|
469
490
|
obj_trend = trend.DetrendedData(
|
470
491
|
df_tmp[f"Detrended {self.target}"],
|
471
|
-
df_tmp["
|
472
|
-
df_tmp["
|
492
|
+
df_tmp["Detrended Model"],
|
493
|
+
df_tmp["Detrended Model Type"],
|
473
494
|
)
|
474
495
|
|
475
496
|
# Retrend the predicted yield
|
@@ -477,6 +498,8 @@ class Geocif:
|
|
477
498
|
obj_trend, df_region.iloc[idx][["Harvest Year"]]
|
478
499
|
)[0]
|
479
500
|
|
501
|
+
df_region.loc[idx, "Detrended Model Type"] = obj_trend.model_type.unique()[0]
|
502
|
+
|
480
503
|
# Create a dataframe with forecast results
|
481
504
|
shp = len(X_test)
|
482
505
|
experiment_id = f"{self.country}_{self.crop}"
|
@@ -530,7 +553,6 @@ class Geocif:
|
|
530
553
|
|
531
554
|
if self.check_yield_trend:
|
532
555
|
df.loc[:, "Detrended Model Type"] = df_region["Detrended Model Type"].values
|
533
|
-
df.loc[:, "Detrended Model"] = df_region["Detrended Model"].values
|
534
556
|
|
535
557
|
if self.last_year_yield_as_feature:
|
536
558
|
# Add last year yield to dataframe
|
@@ -729,7 +751,7 @@ class Geocif:
|
|
729
751
|
+ ["Region_ID"]
|
730
752
|
)
|
731
753
|
if self.check_yield_trend:
|
732
|
-
common_columns += ["Detrended Model Type", "Detrended Model"]
|
754
|
+
common_columns += [f"Detrended {self.target}", "Detrended Model Type", "Detrended Model"]
|
733
755
|
|
734
756
|
if self.last_year_yield_as_feature:
|
735
757
|
common_columns += [f"Last Year {self.target}"]
|
@@ -738,11 +760,15 @@ class Geocif:
|
|
738
760
|
# Filter dataframe based on region and self.feature_names
|
739
761
|
df_region_train = self.df_train[mask_train]
|
740
762
|
df_region_train = df_region_train[self.fixed_columns + common_columns]
|
763
|
+
df_region_train.reset_index(drop=True, inplace=True)
|
741
764
|
self.train(df_region_train, scaler)
|
742
765
|
|
743
766
|
""" Predict """
|
767
|
+
if self.check_yield_trend:
|
768
|
+
common_columns = common_columns[:-3]
|
744
769
|
df_region_test = self.df_test[mask_test]
|
745
770
|
df_region_test = df_region_test[self.fixed_columns + common_columns]
|
771
|
+
df_region_test.reset_index(drop=True, inplace=True)
|
746
772
|
experiment_id, df = self.predict(df_region_test, scaler)
|
747
773
|
# df.reset_index(inplace=True)
|
748
774
|
|
@@ -897,12 +923,12 @@ class Geocif:
|
|
897
923
|
|
898
924
|
if self.lag_yield_as_feature:
|
899
925
|
df = fe.compute_lag_yield(
|
900
|
-
df, self.all_seasons_with_yield, self.number_lag_years
|
926
|
+
df, self.all_seasons_with_yield, self.number_lag_years, self.target
|
901
927
|
)
|
902
928
|
|
903
929
|
if self.analogous_year_yield_as_feature:
|
904
930
|
df = fe.compute_analogous_yield(
|
905
|
-
df, self.all_seasons_with_yield, self.number_median_years
|
931
|
+
df, self.all_seasons_with_yield, self.number_median_years, self.target
|
906
932
|
)
|
907
933
|
|
908
934
|
# Create Region_ID column based on Region column category code
|
@@ -912,7 +938,7 @@ class Geocif:
|
|
912
938
|
elif self.cluster_strategy == "individual":
|
913
939
|
df["Region_ID"] = df["Region"].cat.codes
|
914
940
|
elif self.cluster_strategy == "auto_detect":
|
915
|
-
clusters_assigned = fe.detect_clusters(df)
|
941
|
+
clusters_assigned = fe.detect_clusters(df, self.target)
|
916
942
|
# Merge the cluster labels with the original DataFrame
|
917
943
|
df = df.merge(clusters_assigned, on="Region")
|
918
944
|
|
@@ -1036,8 +1062,8 @@ class Geocif:
|
|
1036
1062
|
|
1037
1063
|
""" Groupby Region column and compute detrended yield """
|
1038
1064
|
self.df_train[f"Detrended {self.target}"] = np.NaN
|
1039
|
-
self.df_train["
|
1040
|
-
self.df_train["
|
1065
|
+
self.df_train["Detrended Model"] = np.NaN
|
1066
|
+
self.df_train["Detrended Model Type"] = np.NaN
|
1041
1067
|
if self.check_yield_trend:
|
1042
1068
|
group_by = ["Region"]
|
1043
1069
|
groups = self.df_train.groupby(group_by)
|
@@ -1050,10 +1076,10 @@ class Geocif:
|
|
1050
1076
|
group.index, f"Detrended {self.target}"
|
1051
1077
|
] = detrended_data.detrended_series
|
1052
1078
|
self.df_train.loc[
|
1053
|
-
group.index, "
|
1079
|
+
group.index, "Detrended Model"
|
1054
1080
|
] = detrended_data.trend_model
|
1055
1081
|
self.df_train.loc[
|
1056
|
-
group.index, "
|
1082
|
+
group.index, "Detrended Model Type"
|
1057
1083
|
] = detrended_data.model_type
|
1058
1084
|
|
1059
1085
|
# 6. Exclude years without yields from df_train
|
@@ -1122,6 +1148,15 @@ class Geocif:
|
|
1122
1148
|
self.cluster_strategy = "single"
|
1123
1149
|
self.select_cei_by = "Index"
|
1124
1150
|
self.use_cumulative_features = True
|
1151
|
+
elif self.model_name in ["oblique", "ydf"]:
|
1152
|
+
self.do_xai = False
|
1153
|
+
self.estimate_ci = False
|
1154
|
+
# Remove Region from cat_features as it is object type
|
1155
|
+
self.cat_features = [col for col in self.cat_features if col != "Region"]
|
1156
|
+
# if self.model_name == "ydf":
|
1157
|
+
# # HACK, for ydf model, target_col is Yield
|
1158
|
+
# self.df_results.rename(columns={self.target: "Yield"}, inplace=True)
|
1159
|
+
# self.target = "Yield"
|
1125
1160
|
else:
|
1126
1161
|
self.do_xai = self.parser.getboolean("ML", "do_xai")
|
1127
1162
|
self.estimate_ci = self.parser.getboolean("ML", "estimate_ci")
|
@@ -1188,6 +1223,9 @@ class Geocif:
|
|
1188
1223
|
self.dg["Country Region"] = (
|
1189
1224
|
self.dg["ADM0_NAME"] + " " + self.dg["ADM1_NAME"]
|
1190
1225
|
)
|
1226
|
+
elif self.country == "illinois":
|
1227
|
+
self.dg["ADM0_NAME"] = "illinois"
|
1228
|
+
self.dg["Country Region"] = self.dg["ADM0_NAME"] + " " + self.dg["NAME"]
|
1191
1229
|
else:
|
1192
1230
|
self.dg["Country Region"] = (
|
1193
1231
|
self.dg["ADM0_NAME"] + " " + self.dg["ADM2_NAME"]
|
@@ -1240,6 +1278,9 @@ class Geocif:
|
|
1240
1278
|
# TODO ignore file with _2000 in its name
|
1241
1279
|
all_files = [f for f in all_files if "_2000" not in f.name]
|
1242
1280
|
|
1281
|
+
# Assert that all_files is not empty
|
1282
|
+
assert all_files, f"No files found in {_dir_country} with {file_name}"
|
1283
|
+
|
1243
1284
|
self.df_results = pd.concat(
|
1244
1285
|
(pd.read_csv(f) for f in all_files), ignore_index=True
|
1245
1286
|
)
|
@@ -1293,6 +1334,9 @@ def loop_execute(inputs):
|
|
1293
1334
|
obj = Geocif(logger=logger, parser=parser)
|
1294
1335
|
obj.read_data(country, crop, season)
|
1295
1336
|
|
1337
|
+
# Store config file in database
|
1338
|
+
output.config_to_db(obj.db_path, obj.parser, obj.today)
|
1339
|
+
|
1296
1340
|
# Setup metadata and run ML code
|
1297
1341
|
obj.setup(season, model)
|
1298
1342
|
if obj.simulation_stages:
|
@@ -0,0 +1,28 @@
|
|
1
|
+
import ydf
|
2
|
+
import pandas as pd
|
3
|
+
|
4
|
+
# Load dataset with Pandas
|
5
|
+
ds_path = "https://raw.githubusercontent.com/google/yggdrasil-decision-forests/main/yggdrasil_decision_forests/test_data/dataset/"
|
6
|
+
train_ds = pd.read_csv(ds_path + "adult_train.csv")
|
7
|
+
test_ds = pd.read_csv(ds_path + "adult_test.csv")
|
8
|
+
|
9
|
+
# Train a Gradient Boosted Trees model
|
10
|
+
model = ydf.GradientBoostedTreesLearner(label="income").train(train_ds)
|
11
|
+
|
12
|
+
# Look at a model (input features, training logs, structure, etc.)
|
13
|
+
model.describe()
|
14
|
+
|
15
|
+
# Evaluate a model (e.g. roc, accuracy, confusion matrix, confidence intervals)
|
16
|
+
model.evaluate(test_ds)
|
17
|
+
|
18
|
+
# Generate predictions
|
19
|
+
model.predict(test_ds)
|
20
|
+
|
21
|
+
# Analyse a model (e.g. partial dependence plot, variable importance)
|
22
|
+
model.analyze(test_ds)
|
23
|
+
|
24
|
+
# Benchmark the inference speed of a model
|
25
|
+
model.benchmark(test_ds)
|
26
|
+
|
27
|
+
# Save the model
|
28
|
+
model.save("/tmp/my_model")
|
@@ -131,6 +131,16 @@ def select_features(X, y, method="RFE", min_features_to_select=3):
|
|
131
131
|
|
132
132
|
X_filtered = selector.fit_transform(X, y)
|
133
133
|
selected_features = X_filtered.columns.tolist()
|
134
|
+
elif method == "mrmr":
|
135
|
+
from mrmr import mrmr_regression
|
136
|
+
|
137
|
+
try:
|
138
|
+
selected_features = mrmr_regression(X=X, y=y, K=10)
|
139
|
+
except:
|
140
|
+
breakpoint()
|
141
|
+
# combine X and y into a dataframe
|
142
|
+
# df = pd.concat([X, y], axis=1)
|
143
|
+
|
134
144
|
elif method == "RFECV":
|
135
145
|
from sklearn.feature_selection import RFECV
|
136
146
|
from sklearn.model_selection import KFold
|
@@ -109,21 +109,21 @@ def store(db_path, experiment_id, df, model, model_name):
|
|
109
109
|
except Exception as e:
|
110
110
|
print(f"Error: {e}")
|
111
111
|
|
112
|
-
index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
|
113
|
-
# Output model pickle as a blob to database
|
114
|
-
df_model = pd.DataFrame(
|
115
|
-
{
|
116
|
-
"Experiment_ID": [experiment_id],
|
117
|
-
"Model": [model_name],
|
118
|
-
"Model_Blob": [pickle.dumps(model)],
|
119
|
-
}
|
120
|
-
)
|
121
|
-
# df_model.index = df_model.apply(
|
122
|
-
# lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
|
123
|
-
# )
|
124
|
-
|
125
112
|
# name the index level
|
126
113
|
try:
|
114
|
+
index_columns = ["Country", "Region", "Crop", "Harvest Year", "Stages"]
|
115
|
+
# Output model pickle as a blob to database
|
116
|
+
df_model = pd.DataFrame(
|
117
|
+
{
|
118
|
+
"Experiment_ID": [experiment_id],
|
119
|
+
"Model": [model_name],
|
120
|
+
"Model_Blob": [pickle.dumps(model)],
|
121
|
+
}
|
122
|
+
)
|
123
|
+
# df_model.index = df_model.apply(
|
124
|
+
# lambda row: "_".join([str(row[col]) for col in index_columns]), axis=1
|
125
|
+
# )
|
126
|
+
|
127
127
|
df_model.index.set_names(["Index"], inplace=True)
|
128
128
|
utils.to_db(db_path, "models", df_model)
|
129
129
|
except Exception as e:
|
@@ -191,7 +191,12 @@ def add_statistics(
|
|
191
191
|
|
192
192
|
"""
|
193
193
|
# First check if country and crop are in the admin_crop_production.csv file
|
194
|
-
|
194
|
+
if country == "Afghanistan":
|
195
|
+
fn = "afghanistan.csv"
|
196
|
+
elif country == "Illinois":
|
197
|
+
fn = "illinois.csv"
|
198
|
+
else:
|
199
|
+
fn = "adm_crop_production.csv"
|
195
200
|
df_fewsnet = pd.read_csv(dir_stats / fn, low_memory=False)
|
196
201
|
|
197
202
|
# HACK
|
@@ -206,6 +211,7 @@ def add_statistics(
|
|
206
211
|
df = add_GEOGLAM_statistics(dir_stats, df, stats, method, admin_zone)
|
207
212
|
else:
|
208
213
|
group_by = ["Region", "Harvest Year"]
|
214
|
+
|
209
215
|
groups = df.groupby(group_by)
|
210
216
|
|
211
217
|
# Define processing for each group
|
@@ -2,10 +2,7 @@ import multiprocessing as mp
|
|
2
2
|
|
3
3
|
import numpy as np
|
4
4
|
import optuna
|
5
|
-
import pandas as pd
|
6
5
|
from catboost import CatBoostRegressor
|
7
|
-
from sklearn.metrics import root_mean_squared_error
|
8
|
-
from sklearn.model_selection import train_test_split
|
9
6
|
from tqdm import tqdm
|
10
7
|
|
11
8
|
|
@@ -30,6 +27,8 @@ def loocv(
|
|
30
27
|
:param cat_features: list, list of categorical feature names
|
31
28
|
:return: float, average RMSE
|
32
29
|
"""
|
30
|
+
from sklearn.metrics import root_mean_squared_error
|
31
|
+
|
33
32
|
rmse_values = []
|
34
33
|
|
35
34
|
X = df[feature_names + cat_features]
|
@@ -81,6 +80,9 @@ def optuna_objective(model, df, feature_names, target_col, cat_features=[]):
|
|
81
80
|
Returns:
|
82
81
|
|
83
82
|
"""
|
83
|
+
from sklearn.metrics import root_mean_squared_error
|
84
|
+
from sklearn.model_selection import train_test_split
|
85
|
+
|
84
86
|
X = df[feature_names + cat_features]
|
85
87
|
y = df[target_col]
|
86
88
|
|
@@ -262,8 +264,7 @@ def auto_train(
|
|
262
264
|
if model_name in ["catboost", "merf"]:
|
263
265
|
hyperparams = {
|
264
266
|
"depth": 6,
|
265
|
-
|
266
|
-
"iterations": 5000,
|
267
|
+
|
267
268
|
"subsample": 1.0,
|
268
269
|
"random_strength": 0.5,
|
269
270
|
"reg_lambda": 0.001,
|
@@ -280,6 +281,34 @@ def auto_train(
|
|
280
281
|
hyperparams["iterations"] = 1000
|
281
282
|
regr = CatBoostRegressor(**hyperparams, cat_features=cat_features)
|
282
283
|
model = MERF(regr, max_iterations=10)
|
284
|
+
elif model_name == "oblique":
|
285
|
+
from treeple import ExtraObliqueRandomForestRegressor
|
286
|
+
|
287
|
+
# https://docs.neurodata.io/treeple/dev/modules/supervised_tree.html#oblique-trees
|
288
|
+
n_features = X_train.shape[1]
|
289
|
+
|
290
|
+
model = ExtraObliqueRandomForestRegressor(
|
291
|
+
n_estimators=1500,
|
292
|
+
max_depth=20,
|
293
|
+
max_features=n_features**2,
|
294
|
+
feature_combinations=n_features,
|
295
|
+
n_jobs=-1,
|
296
|
+
random_state=42,
|
297
|
+
)
|
298
|
+
elif model_name == "ydf":
|
299
|
+
import ydf
|
300
|
+
templates = ydf.GradientBoostedTreesLearner.hyperparameter_templates()
|
301
|
+
|
302
|
+
model = ydf.GradientBoostedTreesLearner(
|
303
|
+
label=target_col,
|
304
|
+
task=ydf.Task.REGRESSION,
|
305
|
+
growing_strategy='BEST_FIRST_GLOBAL',
|
306
|
+
categorical_algorithm='RANDOM',
|
307
|
+
split_axis='SPARSE_OBLIQUE',
|
308
|
+
sparse_oblique_normalization='MIN_MAX',
|
309
|
+
sparse_oblique_num_projections_exponent=2.0)
|
310
|
+
|
311
|
+
hyperparams = templates["benchmark_rank1v1"]
|
283
312
|
elif model_name == "linear":
|
284
313
|
from sklearn.linear_model import LassoCV
|
285
314
|
|
@@ -293,24 +322,15 @@ def auto_train(
|
|
293
322
|
elif model_name == "cumulative_1":
|
294
323
|
from pygam import GAM, s, f, te
|
295
324
|
|
296
|
-
|
297
|
-
region_idx = X_train.columns.get_loc("Region")
|
298
|
-
|
299
|
-
model = GAM(s(0) + f(region_idx))
|
325
|
+
model = GAM(s(0) + f(1))
|
300
326
|
elif model_name == "cumulative_2":
|
301
327
|
from pygam import GAM, s, f, te
|
302
328
|
|
303
|
-
|
304
|
-
region_idx = X_train.columns.get_loc("Region")
|
305
|
-
|
306
|
-
model = GAM(s(0) + s(1) + te(0, 1) + f(region_idx))
|
329
|
+
model = GAM(s(0) + s(1) + te(0, 1) + f(2))
|
307
330
|
elif model_name == "cumulative_3":
|
308
331
|
from pygam import GAM, s, f, te
|
309
332
|
|
310
|
-
|
311
|
-
region_idx = X_train.columns.get_loc("Region")
|
312
|
-
|
313
|
-
model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(region_idx))
|
333
|
+
model = GAM(s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(3))
|
314
334
|
elif model_name == "geospaNN":
|
315
335
|
import torch
|
316
336
|
import geospaNN
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import numpy as np
|
2
|
+
import pandas as pd
|
2
3
|
from statsmodels.regression.linear_model import OLS
|
3
4
|
from statsmodels.tools.tools import add_constant
|
4
5
|
|
@@ -6,7 +7,7 @@ from statsmodels.tools.tools import add_constant
|
|
6
7
|
class DetrendedData:
|
7
8
|
"""
|
8
9
|
A class to store the detrended series, the model used for detrending,
|
9
|
-
and the type of model ('mean', 'linear', 'quadratic').
|
10
|
+
and the type of model ('mean', 'linear', 'quadratic', 'difference').
|
10
11
|
"""
|
11
12
|
|
12
13
|
def __init__(self, detrended_series, trend_model, model_type):
|
@@ -15,14 +16,16 @@ class DetrendedData:
|
|
15
16
|
self.model_type = model_type
|
16
17
|
|
17
18
|
|
18
|
-
def detrend_dataframe(df, column_name="y"):
|
19
|
+
def detrend_dataframe(df, column_name="y", model_type="best"):
|
19
20
|
"""
|
20
|
-
Removes the trend from the specified column of a DataFrame using the method
|
21
|
-
(mean, linear, quadratic) that results in the lowest AIC value.
|
21
|
+
Removes the trend from the specified column of a DataFrame using the specified method
|
22
|
+
(mean, linear, quadratic, difference) or the method that results in the lowest AIC value.
|
22
23
|
|
23
24
|
Parameters:
|
24
25
|
- df: pandas DataFrame containing the time series data.
|
25
26
|
- column_name: string name of the column to detrend.
|
27
|
+
- model_type: string specifying which model to use for detrending ('mean', 'linear',
|
28
|
+
'quadratic', 'difference', or 'best' for automatic selection based on AIC).
|
26
29
|
|
27
30
|
Returns:
|
28
31
|
- DetrendedData object containing the detrended series, the statistical model,
|
@@ -41,16 +44,32 @@ def detrend_dataframe(df, column_name="y"):
|
|
41
44
|
X_quad = add_constant(np.column_stack((df["t"], df["t"] ** 2)))
|
42
45
|
quad_model = OLS(df[column_name], X_quad).fit()
|
43
46
|
|
44
|
-
|
45
|
-
|
47
|
+
# Differencing method
|
48
|
+
diff_series = df[column_name].diff().dropna()
|
49
|
+
diff_model = OLS(diff_series, np.ones(len(diff_series))).fit()
|
50
|
+
|
51
|
+
models = {
|
52
|
+
"mean": mean_model,
|
53
|
+
"linear": linear_model,
|
54
|
+
"quadratic": quad_model,
|
55
|
+
"difference": diff_model
|
56
|
+
}
|
57
|
+
|
58
|
+
if model_type == "best":
|
59
|
+
best_model_type = min(models, key=lambda x: models[x].aic)
|
60
|
+
else:
|
61
|
+
best_model_type = model_type
|
62
|
+
|
46
63
|
best_model = models[best_model_type]
|
47
64
|
|
48
65
|
if best_model_type == "mean":
|
49
66
|
detrended = df[column_name] - mean_model.predict(np.ones(len(df)))
|
50
67
|
elif best_model_type == "linear":
|
51
68
|
detrended = df[column_name] - linear_model.predict(X_linear)
|
52
|
-
|
69
|
+
elif best_model_type == "quadratic":
|
53
70
|
detrended = df[column_name] - quad_model.predict(X_quad)
|
71
|
+
else: # difference
|
72
|
+
detrended = df[column_name].diff().dropna()
|
54
73
|
|
55
74
|
return DetrendedData(detrended, best_model, best_model_type)
|
56
75
|
|
@@ -67,11 +86,10 @@ def compute_trend(detrended_data, future_time_points=None):
|
|
67
86
|
Returns:
|
68
87
|
- The retrended series as a pandas Series.
|
69
88
|
"""
|
70
|
-
# if future_time_points is not of type pandas dataframe then convert it to one
|
71
89
|
future_time_points = np.array(future_time_points)
|
72
90
|
|
73
|
-
model_type = detrended_data.model_type[0]
|
74
|
-
model = detrended_data.trend_model[0]
|
91
|
+
model_type = detrended_data.model_type.unique()[0]
|
92
|
+
model = detrended_data.trend_model.unique()[0]
|
75
93
|
|
76
94
|
if model_type == "mean":
|
77
95
|
trend_component = model.predict(
|
@@ -80,11 +98,14 @@ def compute_trend(detrended_data, future_time_points=None):
|
|
80
98
|
elif model_type == "linear":
|
81
99
|
X_linear = add_constant(future_time_points, has_constant="add")
|
82
100
|
trend_component = model.predict(X_linear)
|
83
|
-
|
101
|
+
elif model_type == "quadratic":
|
84
102
|
X_quad = add_constant(
|
85
103
|
np.column_stack((future_time_points, future_time_points**2)),
|
86
104
|
has_constant="add",
|
87
105
|
)
|
88
106
|
trend_component = model.predict(X_quad)
|
107
|
+
else: # difference
|
108
|
+
trend_component = pd.Series(np.nan, index=future_time_points)
|
109
|
+
trend_component.iloc[0] = model.params[0] # Add mean of differenced series
|
89
110
|
|
90
111
|
return trend_component
|
@@ -0,0 +1,55 @@
|
|
1
|
+
import pandas as pd
|
2
|
+
import hvplot.pandas
|
3
|
+
import panel as pn
|
4
|
+
|
5
|
+
# Load the CSV file
|
6
|
+
file_path = r'D:\Users\ritvik\projects\GEOGLAM\Output\fao\regional_cei_slope.csv'
|
7
|
+
data = pd.read_csv(file_path)
|
8
|
+
|
9
|
+
# Extract unique values for dropdowns
|
10
|
+
countries = data['Country'].unique().tolist()
|
11
|
+
|
12
|
+
# Create dropdown widgets
|
13
|
+
country_dropdown = pn.widgets.Select(name='Country', options=countries)
|
14
|
+
region_dropdown = pn.widgets.Select(name='Region', options=[])
|
15
|
+
crop_dropdown = pn.widgets.Select(name='Crop', options=[])
|
16
|
+
season_dropdown = pn.widgets.Select(name='Season', options=data['Season'].unique().tolist())
|
17
|
+
|
18
|
+
|
19
|
+
# Function to update region and crop options based on selected country
|
20
|
+
@pn.depends(country_dropdown.param.value, watch=True)
|
21
|
+
def update_region_and_crop_options(country):
|
22
|
+
filtered_data = data[data['Country'] == country]
|
23
|
+
regions = filtered_data['Region'].unique().tolist()
|
24
|
+
crops = filtered_data['Crop'].unique().tolist()
|
25
|
+
|
26
|
+
region_dropdown.options = regions
|
27
|
+
crop_dropdown.options = crops
|
28
|
+
|
29
|
+
|
30
|
+
# Function to filter data based on dropdown selections
|
31
|
+
@pn.depends(country_dropdown.param.value, region_dropdown.param.value, crop_dropdown.param.value,
|
32
|
+
season_dropdown.param.value)
|
33
|
+
def update_plot(country, region, crop, season):
|
34
|
+
filtered_data = data[(data['Country'] == country) &
|
35
|
+
(data['Region'] == region) &
|
36
|
+
(data['Crop'] == crop) &
|
37
|
+
(data['Season'] == season)]
|
38
|
+
|
39
|
+
if not filtered_data.empty:
|
40
|
+
plot = filtered_data.hvplot.scatter(x='Slope', y='Intercept',
|
41
|
+
hover_cols=['Growth Stage', 'p-value', 'Index', 'Description'])
|
42
|
+
return plot
|
43
|
+
else:
|
44
|
+
return pn.pane.Markdown("No data available for the selected combination.")
|
45
|
+
|
46
|
+
|
47
|
+
# Create the dashboard
|
48
|
+
dashboard = pn.Column(
|
49
|
+
pn.Row(country_dropdown, region_dropdown, crop_dropdown, season_dropdown),
|
50
|
+
update_plot
|
51
|
+
)
|
52
|
+
|
53
|
+
# Save as html page
|
54
|
+
dashboard.save('dashboard.html', embed=True)
|
55
|
+
|
@@ -32,6 +32,7 @@ geocif/cei/__init__.py
|
|
32
32
|
geocif/cei/definitions.py
|
33
33
|
geocif/cei/indices.py
|
34
34
|
geocif/ml/__init__.py
|
35
|
+
geocif/ml/aa.py
|
35
36
|
geocif/ml/correlations.py
|
36
37
|
geocif/ml/embedding.py
|
37
38
|
geocif/ml/feature_engineering.py
|
@@ -50,5 +51,6 @@ geocif/playground/__init__.py
|
|
50
51
|
geocif/playground/automl.py
|
51
52
|
geocif/playground/misc.py
|
52
53
|
geocif/viz/__init__.py
|
54
|
+
geocif/viz/misc.py
|
53
55
|
geocif/viz/plot.py
|
54
56
|
tests/test_geocif.py
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|