geocif 0.2.43__tar.gz → 0.2.45__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {geocif-0.2.43/geocif.egg-info → geocif-0.2.45}/PKG-INFO +1 -1
- {geocif-0.2.43 → geocif-0.2.45}/geocif/cei/indices.py +1 -1
- {geocif-0.2.43 → geocif-0.2.45}/geocif/geocif.py +11 -2
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner.py +3 -3
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/correlations.py +0 -1
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/embedding.py +35 -44
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/output.py +17 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/trainers.py +1 -8
- {geocif-0.2.43 → geocif-0.2.45}/geocif/mm.py +16 -0
- {geocif-0.2.43 → geocif-0.2.45/geocif.egg-info}/PKG-INFO +1 -1
- {geocif-0.2.43 → geocif-0.2.45}/setup.py +1 -1
- {geocif-0.2.43 → geocif-0.2.45}/LICENSE +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/MANIFEST.in +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/README.md +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/agmet/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/agmet/geoagmet.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/agmet/plot.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/agmet/utils.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/analysis.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/constants.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/features.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/geo.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/geocif.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/metadata.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/models.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/cei/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/cei/definitions.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/experiments.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/geocif_runner.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_angola.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_madagascar.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_malawi.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_mozambique.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_south_africa.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_zambia.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_zimbabwe.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/logger.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/feature_engineering.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/feature_selection.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/outliers.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/outlook.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/spatial_autocorrelation.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/stages.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/stats.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/trend.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/xai.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/aa.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/area.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/automl.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/download_esi.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/enso.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/eval.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/gamtest.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/gee_access.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/misc.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/play_xagg.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/reg.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/sustain.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/test_catboost.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp2.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp3.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp4.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp5.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/wolayita.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/wolayita_maize_mask.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/wolayita_v2.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/wolayita_v3.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/risk/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/risk/impact_assessment.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/utils.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/__init__.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/gt.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/plot.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/tmp.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/viz_ml.py +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif.egg-info/SOURCES.txt +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif.egg-info/dependency_links.txt +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif.egg-info/not-zip-safe +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/geocif.egg-info/top_level.txt +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/requirements.txt +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/setup.cfg +0 -0
- {geocif-0.2.43 → geocif-0.2.45}/tests/test_geocif.py +0 -0
geocif/cei/indices.py
@@ -67,7 +67,7 @@ def standardize_dataframe(df: pd.DataFrame, vi_var: str) -> pd.DataFrame:
     if "time" not in df.columns:
         # Use year + day of year if no time column
         df["time"] = pd.to_datetime(
-            df["year"].astype(str) + df["
+            df["year"].astype(str) + df["Doy"].astype(str),
             format="%Y%j"
         )
     else:
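The fix above completes the date construction: the year column is concatenated with the day-of-year column and parsed with `format="%Y%j"`. A minimal sketch of that parsing on a hypothetical frame (the column names mirror `year`/`Doy`; the `zfill(3)` padding is added here for clarity and is not part of the diff):

```python
# Hypothetical illustration of parsing year + day-of-year with "%Y%j".
import pandas as pd

df = pd.DataFrame({"year": [2021, 2021, 2022], "Doy": [32, 150, 7]})

# Zero-pad the day of year so every string is exactly 7 characters (%Y%j).
df["time"] = pd.to_datetime(
    df["year"].astype(str) + df["Doy"].astype(str).str.zfill(3),
    format="%Y%j",
)
print(df["time"].tolist())  # Timestamps: 2021-02-01, 2021-05-30, 2022-01-07
```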
geocif/geocif.py
@@ -328,8 +328,7 @@ class Geocif:
         elif self.model_name == "linear":
             self.model.fit(X_train_scaled, self.y_train)
         elif self.model_name == "gam":
-
-            self.model.fit(X_train_scaled.values, self.y_train.values)
+            self.model.fit(X_train_scaled, self.y_train.values)
             self.best_hyperparams = {}
         elif self.model_name in ["cubist"]:
             self.model.fit(X_train, self.y_train)
geocif/geocif.py
@@ -842,6 +841,14 @@ class Geocif:
             .dropna(axis=1, how="any")  # drop cols with any NA left
             .join(self.X_train[lag_cols])  # add lag-yield cols back untouched
         )
+        # Some models cannot handle any NaN values, so gapfill them
+        if self.model_name in ["gam", "linear"]:
+            for col in self.X_train.columns:
+                if self.X_train[col].isnull().any():
+                    breakpoint()
+                    median = self.X_train[col].median()
+                    self.X_train[col].fillna(median, inplace=True)
+
         self.y_train = df_region_train[self.target_column]
 
         self.apply_feature_selector(region, dir_output)
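The new block imputes missing values column by column with the median for model types (`gam`, `linear`) that cannot handle NaNs. A vectorized sketch of the same idea on a hypothetical frame (an illustration of median imputation, not the package's exact routine, which loops per column as shown in the hunk):

```python
# Hedged sketch: fill NaNs with per-column medians in one call.
import numpy as np
import pandas as pd

X_train = pd.DataFrame({
    "ndvi": [0.41, np.nan, 0.55, 0.62],
    "precip": [10.0, 12.5, np.nan, 9.0],
})

# fillna with a Series of medians fills each column by matching index labels.
X_train = X_train.fillna(X_train.median(numeric_only=True))
assert not X_train.isnull().any().any()
```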
geocif/geocif.py
@@ -1088,6 +1095,8 @@ class Geocif:
 
             # Region_ID should be type category
             df["Region_ID"] = df["Region_ID"].astype("category")
+        else:
+            raise ValueError(f"Unsupported cluster strategy {self.cluster_strategy}")
 
         return df
 
geocif/indices_runner.py
@@ -165,7 +165,7 @@ class cei_runner(base.BaseGeo):
         combinations = [
             i
             for i in combinations
-            if "
+            if "ethiopia" in i[3]
             # or "lesotho_maize" in i[3] or
             # # "namibia_" in i[2] or
             # "united_republic_of_tanzania_maize" in i[3]
geocif/indices_runner.py
@@ -174,13 +174,13 @@ class cei_runner(base.BaseGeo):
             # or "south_africa_maize" in i[3]
             # or "mozambique_maize" in i[3]
             # or "united_states_of_america" in i[3]
-            or "russian_federation" in i[3]
+            #or "russian_federation" in i[3]
             # or "ukraine" in i[3]
         ]
         # "malawi" in i[2]]
 
         if self.do_parallel:
-            num_cpu = int(cpu_count() * 0.
+            num_cpu = int(cpu_count() * 0.75)
             with Pool(num_cpu) as p:
                 for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
                     pass
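The second change sets the worker pool to roughly 75% of the available cores. A generic, self-contained sketch of the same `Pool`/`imap_unordered` pattern (the `process` function and `combinations` list below are stand-ins, not geocif's):

```python
# Stand-in example of draining work through a pool sized to ~75% of CPUs.
from multiprocessing import Pool, cpu_count

def process(combo):
    country, crop = combo
    return f"{country}-{crop}"

if __name__ == "__main__":
    combinations = [("ethiopia", "maize"), ("malawi", "maize"), ("zambia", "maize")]
    num_cpu = max(1, int(cpu_count() * 0.75))  # leave a few cores free
    with Pool(num_cpu) as p:
        for result in p.imap_unordered(process, combinations):
            pass  # results arrive as workers finish; order is not guaranteed
```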
geocif/ml/correlations.py
@@ -200,7 +200,6 @@ def _all_correlated_feature_by_time(df, **kwargs):
 
     # Only select columns that have been observed till the current stage
     pbar = tqdm(stages_features, total=len(stages_features), leave=False)
-
     for stage in pbar:
         pbar.set_description(f"Calculating correlations")
         pbar.update()
geocif/ml/embedding.py
@@ -3,6 +3,7 @@ from collections import Counter
 import numpy as np
 import pandas as pd
 from scipy.stats import pearsonr as pearsonr
+from tqdm import tqdm
 
 
 def extract_regions(X, y, regions=[]):
geocif/ml/embedding.py
@@ -104,57 +105,47 @@ def get_top_correlated_features(inputs, targets):
     return feature_by_region, counter
 
 
-def get_all_features_correlation(inputs
+def get_all_features_correlation(inputs: pd.DataFrame,
+                                 targets: pd.Series,
+                                 method: str) -> pd.DataFrame:
     """
-
-
-    :param targets: pd.Series, target data
-    :param method: str, method to use to find the top correlated features
+    Fast version – identical output, no length-mismatch on regions whose
+    feature names contain no spaces.
     """
-
-    for region_id in inputs["Region"].unique():
-        X, y = extract_regions(inputs, targets, regions=[region_id])
+    numeric_cols = inputs.select_dtypes(include=[np.number]).columns.tolist()
 
-
+    df_all = inputs[numeric_cols + ["Region"]].copy()
+    df_all["__target__"] = targets.values
 
-
-        feature_correlations = {
-            k: v for k, v in feature_correlations.items() if not np.isnan(v)
-        }
+    frames: list[pd.DataFrame] = []
 
-
+    for region_id, g in tqdm(df_all.groupby("Region", sort=False), leave=False):
+        corr = g[numeric_cols].corrwith(g["__target__"]).round(3).dropna()
+        if corr.empty:
             continue
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        )
-
-
-        cols = df_pivoted.columns.tolist()
-        cols = cols[-1:] + cols[:-1]
-        df_pivoted = df_pivoted[cols]
+        # ---- safe split: always two columns --------------------------------
+        split = (
+            pd.Series(corr.index)  # guarantees a Series
+            .str.split(" ", n=1, expand=True)
+        )
+        if split.shape[1] == 1:  # no spaces in any feature name
+            split[1] = ""  # match legacy behaviour
+        split.columns = [0, 1]  # make column labels predictable
+
+        df_region = (
+            pd.DataFrame({
+                "Metric": split[0].values,
+                method: split[1].values,
+                "Value": corr.values  # same length as above
+            })
+            .pivot_table(index=method, columns="Metric",
+                         values="Value", aggfunc="first")
+            .reset_index()
+        )
+        df_region.insert(0, "Region", region_id)
+        frames.append(df_region)
 
-
+    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
 
-    if len(frames):
-        feature_by_region = pd.concat(frames)
-    else:
-        feature_by_region = pd.DataFrame()
 
-    return feature_by_region
geocif/ml/output.py
@@ -6,6 +6,20 @@ import pandas as pd
 from geocif import utils
 
 
+def make_serializable(hparams):
+    serializable = hparams.copy()
+
+    # Convert callbacks to strings
+    if 'callbacks' in serializable:
+        serializable['callbacks'] = [str(cb) for cb in serializable['callbacks']]
+
+    # Convert terms to string
+    if 'terms' in serializable:
+        serializable['terms'] = str(serializable['terms'])
+
+    return serializable
+
+
 def config_to_dict(parser):
     """
     Reads a configuration file and returns the configuration as a nested dictionary.
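`make_serializable` stringifies the entries of a hyperparameter dict that a plain text store cannot hold directly, such as callback objects or pygam term objects. A small usage sketch with a hypothetical hyperparameter dict:

```python
# Hypothetical usage of the new helper before writing results out.
import json

def make_serializable(hparams):
    serializable = hparams.copy()
    if "callbacks" in serializable:
        serializable["callbacks"] = [str(cb) for cb in serializable["callbacks"]]
    if "terms" in serializable:
        serializable["terms"] = str(serializable["terms"])
    return serializable

class EarlyStopping:  # stand-in for a non-serializable callback object
    def __repr__(self):
        return "EarlyStopping(rounds=50)"

hparams = {"learning_rate": 0.05, "callbacks": [EarlyStopping()], "terms": object()}
print(json.dumps(make_serializable(hparams)))  # json.dumps(hparams) would raise TypeError
```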
geocif/ml/output.py
@@ -103,6 +117,9 @@ def store(db_path, experiment_id, df, model, model_name):
     for col in df.select_dtypes(include=["category"]).columns:
         df[col] = df[col].astype(str)
 
+    # Convert all columns to string
+    df['Best Hyperparameters'] = df['Best Hyperparameters'].apply(make_serializable)
+
     # Output results to database
     try:
         utils.to_db(db_path, experiment_id, df)
geocif/ml/trainers.py
@@ -350,14 +350,7 @@ def auto_train(
             "cumulative_2": s(0) + s(1) + te(0, 1) + f(2),
             "cumulative_3": s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(3),
         }
-
-        # Fill nans with medians
-        for col in X_train.columns:
-            if X_train[col].isnull().any():
-                median = X_train[col].median()
-                X_train[col].fillna(median, inplace=True)
-    except:
-        breakpoint()
+        breakpoint()
         formula = gam_formulas.get(model_name, gam_cls(n_splines=25, spline_order=3))
         model = gam_cls(n_splines=25, spline_order=3).gridsearch(X_train, y_train.values, lam=np.logspace(-3, 3, 11)) if model_name.startswith("gam") else formula
     elif model_name == "geospaNN":
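The retained line selects the GAM's smoothing penalty by grid search over `lam = np.logspace(-3, 3, 11)`. A standalone sketch of that pygam call on synthetic data (illustrative only; geocif passes its own `X_train`/`y_train` and model class):

```python
# Standalone sketch of pygam's gridsearch over the smoothing penalty lam.
import numpy as np
from pygam import LinearGAM

rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=(300, 1))
y = np.sin(X[:, 0]) + rng.normal(scale=0.2, size=300)

# Try 11 log-spaced lam values and keep the best-scoring model.
gam = LinearGAM(n_splines=25, spline_order=3).gridsearch(
    X, y, lam=np.logspace(-3, 3, 11)
)
gam.summary()
```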
geocif/mm.py
@@ -15,6 +15,22 @@
 
 import os
 
+from pygam import LinearGAM, GammaGAM, s
+import numpy as np, pandas as pd
+
+X = np.random.uniform(0, 5, 500)[:, None]
+y_pos = 2 * np.exp(0.3*X.squeeze()) + np.random.gamma(shape=2, scale=1, size=500)
+
+# Bad idea – LinearGAM on skewed positive data
+lin = LinearGAM(s(0)).fit(X, y_pos)
+
+# Appropriate – GammaGAM with log link
+gam = GammaGAM(terms, fit_intercept=True)
+
+print("LinearGAM R2:", lin.statistics_['pseudo_r2']['explained_deviance'])
+print("GammaGAM R2:", gam.statistics_['pseudo_r2']['explained_deviance'])
+
+breakpoint()
 # Set R_HOME environment variable before importing rpy2
 os.environ["R_HOME"] = f"{os.environ['CONDA_PREFIX']}\Lib\R"
 
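The block added to `mm.py` sketches a comparison between `LinearGAM` and `GammaGAM` on skewed positive data, but as committed it references an undefined `terms` object and never fits the `GammaGAM`, so the second `statistics_` lookup would fail before the `breakpoint()`. A hedged, runnable variant of the same comparison (synthetic data; an `s(0)` spline term is assumed in place of `terms`):

```python
# Runnable variant of the mm.py comparison, with an assumed s(0) term
# and both models actually fitted before their statistics are read.
import numpy as np
from pygam import LinearGAM, GammaGAM, s

rng = np.random.default_rng(42)
X = rng.uniform(0, 5, 500)[:, None]
# Skewed, strictly positive response
y_pos = 2 * np.exp(0.3 * X.squeeze()) + rng.gamma(shape=2.0, scale=1.0, size=500)

lin = LinearGAM(s(0)).fit(X, y_pos)   # identity link, Gaussian errors
gam = GammaGAM(s(0)).fit(X, y_pos)    # log link, Gamma errors

print("LinearGAM explained deviance:", lin.statistics_["pseudo_r2"]["explained_deviance"])
print("GammaGAM  explained deviance:", gam.statistics_["pseudo_r2"]["explained_deviance"])
```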