geocif 0.2.43__tar.gz → 0.2.45__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. {geocif-0.2.43/geocif.egg-info → geocif-0.2.45}/PKG-INFO +1 -1
  2. {geocif-0.2.43 → geocif-0.2.45}/geocif/cei/indices.py +1 -1
  3. {geocif-0.2.43 → geocif-0.2.45}/geocif/geocif.py +11 -2
  4. {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner.py +3 -3
  5. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/correlations.py +0 -1
  6. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/embedding.py +35 -44
  7. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/output.py +17 -0
  8. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/trainers.py +1 -8
  9. {geocif-0.2.43 → geocif-0.2.45}/geocif/mm.py +16 -0
  10. {geocif-0.2.43 → geocif-0.2.45/geocif.egg-info}/PKG-INFO +1 -1
  11. {geocif-0.2.43 → geocif-0.2.45}/setup.py +1 -1
  12. {geocif-0.2.43 → geocif-0.2.45}/LICENSE +0 -0
  13. {geocif-0.2.43 → geocif-0.2.45}/MANIFEST.in +0 -0
  14. {geocif-0.2.43 → geocif-0.2.45}/README.md +0 -0
  15. {geocif-0.2.43 → geocif-0.2.45}/geocif/__init__.py +0 -0
  16. {geocif-0.2.43 → geocif-0.2.45}/geocif/agmet/__init__.py +0 -0
  17. {geocif-0.2.43 → geocif-0.2.45}/geocif/agmet/geoagmet.py +0 -0
  18. {geocif-0.2.43 → geocif-0.2.45}/geocif/agmet/plot.py +0 -0
  19. {geocif-0.2.43 → geocif-0.2.45}/geocif/agmet/utils.py +0 -0
  20. {geocif-0.2.43 → geocif-0.2.45}/geocif/analysis.py +0 -0
  21. {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/__init__.py +0 -0
  22. {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/constants.py +0 -0
  23. {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/features.py +0 -0
  24. {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/geo.py +0 -0
  25. {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/geocif.py +0 -0
  26. {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/metadata.py +0 -0
  27. {geocif-0.2.43 → geocif-0.2.45}/geocif/backup/models.py +0 -0
  28. {geocif-0.2.43 → geocif-0.2.45}/geocif/cei/__init__.py +0 -0
  29. {geocif-0.2.43 → geocif-0.2.45}/geocif/cei/definitions.py +0 -0
  30. {geocif-0.2.43 → geocif-0.2.45}/geocif/experiments.py +0 -0
  31. {geocif-0.2.43 → geocif-0.2.45}/geocif/geocif_runner.py +0 -0
  32. {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_angola.py +0 -0
  33. {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_madagascar.py +0 -0
  34. {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_malawi.py +0 -0
  35. {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_mozambique.py +0 -0
  36. {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_south_africa.py +0 -0
  37. {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_zambia.py +0 -0
  38. {geocif-0.2.43 → geocif-0.2.45}/geocif/indices_runner_zimbabwe.py +0 -0
  39. {geocif-0.2.43 → geocif-0.2.45}/geocif/logger.py +0 -0
  40. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/__init__.py +0 -0
  41. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/feature_engineering.py +0 -0
  42. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/feature_selection.py +0 -0
  43. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/outliers.py +0 -0
  44. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/outlook.py +0 -0
  45. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/spatial_autocorrelation.py +0 -0
  46. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/stages.py +0 -0
  47. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/stats.py +0 -0
  48. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/trend.py +0 -0
  49. {geocif-0.2.43 → geocif-0.2.45}/geocif/ml/xai.py +0 -0
  50. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/__init__.py +0 -0
  51. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/aa.py +0 -0
  52. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/area.py +0 -0
  53. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/automl.py +0 -0
  54. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/download_esi.py +0 -0
  55. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/enso.py +0 -0
  56. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/eval.py +0 -0
  57. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/gamtest.py +0 -0
  58. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/gee_access.py +0 -0
  59. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/misc.py +0 -0
  60. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/play_xagg.py +0 -0
  61. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/reg.py +0 -0
  62. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/sustain.py +0 -0
  63. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/test_catboost.py +0 -0
  64. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp.py +0 -0
  65. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp2.py +0 -0
  66. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp3.py +0 -0
  67. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp4.py +0 -0
  68. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/tmp5.py +0 -0
  69. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/wolayita.py +0 -0
  70. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/wolayita_maize_mask.py +0 -0
  71. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/wolayita_v2.py +0 -0
  72. {geocif-0.2.43 → geocif-0.2.45}/geocif/playground/wolayita_v3.py +0 -0
  73. {geocif-0.2.43 → geocif-0.2.45}/geocif/risk/__init__.py +0 -0
  74. {geocif-0.2.43 → geocif-0.2.45}/geocif/risk/impact_assessment.py +0 -0
  75. {geocif-0.2.43 → geocif-0.2.45}/geocif/utils.py +0 -0
  76. {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/__init__.py +0 -0
  77. {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/gt.py +0 -0
  78. {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/plot.py +0 -0
  79. {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/tmp.py +0 -0
  80. {geocif-0.2.43 → geocif-0.2.45}/geocif/viz/viz_ml.py +0 -0
  81. {geocif-0.2.43 → geocif-0.2.45}/geocif.egg-info/SOURCES.txt +0 -0
  82. {geocif-0.2.43 → geocif-0.2.45}/geocif.egg-info/dependency_links.txt +0 -0
  83. {geocif-0.2.43 → geocif-0.2.45}/geocif.egg-info/not-zip-safe +0 -0
  84. {geocif-0.2.43 → geocif-0.2.45}/geocif.egg-info/top_level.txt +0 -0
  85. {geocif-0.2.43 → geocif-0.2.45}/requirements.txt +0 -0
  86. {geocif-0.2.43 → geocif-0.2.45}/setup.cfg +0 -0
  87. {geocif-0.2.43 → geocif-0.2.45}/tests/test_geocif.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.2.43
3
+ Version: 0.2.45
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -67,7 +67,7 @@ def standardize_dataframe(df: pd.DataFrame, vi_var: str) -> pd.DataFrame:
67
67
  if "time" not in df.columns:
68
68
  # Use year + day of year if no time column
69
69
  df["time"] = pd.to_datetime(
70
- df["year"].astype(str) + df["JD"].astype(str),
70
+ df["year"].astype(str) + df["Doy"].astype(str),
71
71
  format="%Y%j"
72
72
  )
73
73
  else:
@@ -328,8 +328,7 @@ class Geocif:
328
328
  elif self.model_name == "linear":
329
329
  self.model.fit(X_train_scaled, self.y_train)
330
330
  elif self.model_name == "gam":
331
- breakpoint()
332
- self.model.fit(X_train_scaled.values, self.y_train.values)
331
+ self.model.fit(X_train_scaled, self.y_train.values)
333
332
  self.best_hyperparams = {}
334
333
  elif self.model_name in ["cubist"]:
335
334
  self.model.fit(X_train, self.y_train)
@@ -842,6 +841,14 @@ class Geocif:
842
841
  .dropna(axis=1, how="any") # drop cols with any NA left
843
842
  .join(self.X_train[lag_cols]) # add lag-yield cols back untouched
844
843
  )
844
+ # Some models cannot handle any NaN values, so gapfill them
845
+ if self.model_name in ["gam", "linear"]:
846
+ for col in self.X_train.columns:
847
+ if self.X_train[col].isnull().any():
848
+ breakpoint()
849
+ median = self.X_train[col].median()
850
+ self.X_train[col].fillna(median, inplace=True)
851
+
845
852
  self.y_train = df_region_train[self.target_column]
846
853
 
847
854
  self.apply_feature_selector(region, dir_output)
@@ -1088,6 +1095,8 @@ class Geocif:
1088
1095
 
1089
1096
  # Region_ID should be type category
1090
1097
  df["Region_ID"] = df["Region_ID"].astype("category")
1098
+ else:
1099
+ raise ValueError(f"Unsupported cluster strategy {self.cluster_strategy}")
1091
1100
 
1092
1101
  return df
1093
1102
 
@@ -165,7 +165,7 @@ class cei_runner(base.BaseGeo):
165
165
  combinations = [
166
166
  i
167
167
  for i in combinations
168
- if "ukraine" in i[3]
168
+ if "ethiopia" in i[3]
169
169
  # or "lesotho_maize" in i[3] or
170
170
  # # "namibia_" in i[2] or
171
171
  # "united_republic_of_tanzania_maize" in i[3]
@@ -174,13 +174,13 @@ class cei_runner(base.BaseGeo):
174
174
  # or "south_africa_maize" in i[3]
175
175
  # or "mozambique_maize" in i[3]
176
176
  # or "united_states_of_america" in i[3]
177
- or "russian_federation" in i[3]
177
+ #or "russian_federation" in i[3]
178
178
  # or "ukraine" in i[3]
179
179
  ]
180
180
  # "malawi" in i[2]]
181
181
 
182
182
  if self.do_parallel:
183
- num_cpu = int(cpu_count() * 0.6)
183
+ num_cpu = int(cpu_count() * 0.75)
184
184
  with Pool(num_cpu) as p:
185
185
  for i, _ in enumerate(p.imap_unordered(indices.process, combinations)):
186
186
  pass
@@ -200,7 +200,6 @@ def _all_correlated_feature_by_time(df, **kwargs):
200
200
 
201
201
  # Only select columns that have been observed till the current stage
202
202
  pbar = tqdm(stages_features, total=len(stages_features), leave=False)
203
-
204
203
  for stage in pbar:
205
204
  pbar.set_description(f"Calculating correlations")
206
205
  pbar.update()
@@ -3,6 +3,7 @@ from collections import Counter
3
3
  import numpy as np
4
4
  import pandas as pd
5
5
  from scipy.stats import pearsonr as pearsonr
6
+ from tqdm import tqdm
6
7
 
7
8
 
8
9
  def extract_regions(X, y, regions=[]):
@@ -104,57 +105,47 @@ def get_top_correlated_features(inputs, targets):
104
105
  return feature_by_region, counter
105
106
 
106
107
 
107
- def get_all_features_correlation(inputs, targets, method):
108
+ def get_all_features_correlation(inputs: pd.DataFrame,
109
+ targets: pd.Series,
110
+ method: str) -> pd.DataFrame:
108
111
  """
109
- Get the top correlated features for each region
110
- :param inputs: pd.DataFrame, input data
111
- :param targets: pd.Series, target data
112
- :param method: str, method to use to find the top correlated features
112
+ Fast version identical output, no length-mismatch on regions whose
113
+ feature names contain no spaces.
113
114
  """
114
- frames = []
115
- for region_id in inputs["Region"].unique():
116
- X, y = extract_regions(inputs, targets, regions=[region_id])
115
+ numeric_cols = inputs.select_dtypes(include=[np.number]).columns.tolist()
117
116
 
118
- feature_correlations = _compute_correlations(X, y)
117
+ df_all = inputs[numeric_cols + ["Region"]].copy()
118
+ df_all["__target__"] = targets.values
119
119
 
120
- # Exclude any nan values
121
- feature_correlations = {
122
- k: v for k, v in feature_correlations.items() if not np.isnan(v)
123
- }
120
+ frames: list[pd.DataFrame] = []
124
121
 
125
- if not feature_correlations:
122
+ for region_id, g in tqdm(df_all.groupby("Region", sort=False), leave=False):
123
+ corr = g[numeric_cols].corrwith(g["__target__"]).round(3).dropna()
124
+ if corr.empty:
126
125
  continue
127
126
 
128
- split_keys = []
129
- for key in feature_correlations.keys():
130
- parts = key.split(" ")
131
- cei = parts[0]
132
- time_period = " ".join(parts[1:])
133
-
134
- split_keys.append([cei, time_period])
135
-
136
- # split_keys = [key.rsplit("_", 1) for key in feature_correlations.keys()]
137
- values = list(feature_correlations.values())
138
-
139
- # Creating a DataFrame
140
- df = pd.DataFrame(split_keys, columns=["Metric", method])
141
- df["Value"] = values
142
-
143
- # Pivot the DataFrame so each metric becomes a column name and include the year as a separate column
144
- df_pivoted = df.pivot_table(
145
- index=method, columns="Metric", values="Value", aggfunc="first"
146
- ).reset_index()
147
- df_pivoted["Region"] = region_id
148
- # Move the 'Region' column to the front
149
- cols = df_pivoted.columns.tolist()
150
- cols = cols[-1:] + cols[:-1]
151
- df_pivoted = df_pivoted[cols]
127
+ # ---- safe split: always two columns --------------------------------
128
+ split = (
129
+ pd.Series(corr.index) # guarantees a Series
130
+ .str.split(" ", n=1, expand=True)
131
+ )
132
+ if split.shape[1] == 1: # no spaces in any feature name
133
+ split[1] = "" # match legacy behaviour
134
+ split.columns = [0, 1] # make column labels predictable
135
+
136
+ df_region = (
137
+ pd.DataFrame({
138
+ "Metric": split[0].values,
139
+ method: split[1].values,
140
+ "Value": corr.values # same length as above
141
+ })
142
+ .pivot_table(index=method, columns="Metric",
143
+ values="Value", aggfunc="first")
144
+ .reset_index()
145
+ )
146
+ df_region.insert(0, "Region", region_id)
147
+ frames.append(df_region)
152
148
 
153
- frames.append(df_pivoted)
149
+ return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
154
150
 
155
- if len(frames):
156
- feature_by_region = pd.concat(frames)
157
- else:
158
- feature_by_region = pd.DataFrame()
159
151
 
160
- return feature_by_region
@@ -6,6 +6,20 @@ import pandas as pd
6
6
  from geocif import utils
7
7
 
8
8
 
9
+ def make_serializable(hparams):
10
+ serializable = hparams.copy()
11
+
12
+ # Convert callbacks to strings
13
+ if 'callbacks' in serializable:
14
+ serializable['callbacks'] = [str(cb) for cb in serializable['callbacks']]
15
+
16
+ # Convert terms to string
17
+ if 'terms' in serializable:
18
+ serializable['terms'] = str(serializable['terms'])
19
+
20
+ return serializable
21
+
22
+
9
23
  def config_to_dict(parser):
10
24
  """
11
25
  Reads a configuration file and returns the configuration as a nested dictionary.
@@ -103,6 +117,9 @@ def store(db_path, experiment_id, df, model, model_name):
103
117
  for col in df.select_dtypes(include=["category"]).columns:
104
118
  df[col] = df[col].astype(str)
105
119
 
120
+ # Convert all columns to string
121
+ df['Best Hyperparameters'] = df['Best Hyperparameters'].apply(make_serializable)
122
+
106
123
  # Output results to database
107
124
  try:
108
125
  utils.to_db(db_path, experiment_id, df)
@@ -350,14 +350,7 @@ def auto_train(
350
350
  "cumulative_2": s(0) + s(1) + te(0, 1) + f(2),
351
351
  "cumulative_3": s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2) + f(3),
352
352
  }
353
- try:
354
- # Fill nans with medians
355
- for col in X_train.columns:
356
- if X_train[col].isnull().any():
357
- median = X_train[col].median()
358
- X_train[col].fillna(median, inplace=True)
359
- except:
360
- breakpoint()
353
+ breakpoint()
361
354
  formula = gam_formulas.get(model_name, gam_cls(n_splines=25, spline_order=3))
362
355
  model = gam_cls(n_splines=25, spline_order=3).gridsearch(X_train, y_train.values, lam=np.logspace(-3, 3, 11)) if model_name.startswith("gam") else formula
363
356
  elif model_name == "geospaNN":
@@ -15,6 +15,22 @@
15
15
 
16
16
  import os
17
17
 
18
+ from pygam import LinearGAM, GammaGAM, s
19
+ import numpy as np, pandas as pd
20
+
21
+ X = np.random.uniform(0, 5, 500)[:, None]
22
+ y_pos = 2 * np.exp(0.3*X.squeeze()) + np.random.gamma(shape=2, scale=1, size=500)
23
+
24
+ # Bad idea – LinearGAM on skewed positive data
25
+ lin = LinearGAM(s(0)).fit(X, y_pos)
26
+
27
+ # Appropriate – GammaGAM with log link
28
+ gam = GammaGAM(terms, fit_intercept=True)
29
+
30
+ print("LinearGAM R2:", lin.statistics_['pseudo_r2']['explained_deviance'])
31
+ print("GammaGAM R2:", gam.statistics_['pseudo_r2']['explained_deviance'])
32
+
33
+ breakpoint()
18
34
  # Set R_HOME environment variable before importing rpy2
19
35
  os.environ["R_HOME"] = f"{os.environ['CONDA_PREFIX']}\Lib\R"
20
36
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: geocif
3
- Version: 0.2.43
3
+ Version: 0.2.45
4
4
  Summary: Models to visualize and forecast crop conditions and yields
5
5
  Home-page: https://ritviksahajpal.github.io/yield_forecasting/
6
6
  Author: Ritvik Sahajpal
@@ -50,6 +50,6 @@ setup(
50
50
  test_suite="tests",
51
51
  tests_require=test_requirements,
52
52
  url="https://ritviksahajpal.github.io/yield_forecasting/",
53
- version="0.2.43",
53
+ version="0.2.45",
54
54
  zip_safe=False,
55
55
  )
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes